From 374954b9d648f503f6783325a1266457953a998d Mon Sep 17 00:00:00 2001 From: Tibor Frank Date: Wed, 3 May 2023 13:53:27 +0000 Subject: C-Docs: New structure Change-Id: I73d107f94b28b138f3350a9e1eedb0555583a9ca Signed-off-by: Tibor Frank --- docs/content/methodology/_index.md | 4 +- docs/content/methodology/access_control_lists.md | 70 ---- .../methodology/data_plane_throughput/_index.md | 6 - .../data_plane_throughput/data_plane_throughput.md | 129 ------ .../methodology/data_plane_throughput/mlrsearch.md | 88 ---- .../data_plane_throughput/mrr_throughput.md | 56 --- .../methodology/data_plane_throughput/plrsearch.md | 383 ------------------ .../methodology/dut_state_considerations.md | 148 ------- .../methodology/generic_segmentation_offload.md | 116 ------ docs/content/methodology/geneve.md | 66 --- .../methodology/hoststack_testing/_index.md | 6 - .../hoststack_testing/quicudpip_with_vppecho.md | 48 --- .../hoststack_testing/tcpip_with_iperf3.md | 52 --- .../hoststack_testing/udpip_with_iperf3.md | 44 -- .../hoststack_testing/vsap_ab_with_nginx.md | 39 -- .../internet_protocol_security_ipsec.md | 74 ---- docs/content/methodology/measurements/_index.md | 6 + .../measurements/data_plane_throughput/_index.md | 6 + .../data_plane_throughput/data_plane_throughput.md | 129 ++++++ .../data_plane_throughput/mlr_search.md | 88 ++++ .../measurements/data_plane_throughput/mrr.md | 56 +++ .../data_plane_throughput/plr_search.md | 383 ++++++++++++++++++ .../methodology/measurements/packet_latency.md | 52 +++ docs/content/methodology/measurements/telemetry.md | 158 ++++++++ docs/content/methodology/multi_core_speedup.md | 51 --- .../methodology/network_address_translation.md | 445 --------------------- docs/content/methodology/overview/_index.md | 6 + .../overview/dut_state_considerations.md | 148 +++++++ .../methodology/overview/multi_core_speedup.md | 51 +++ .../methodology/overview/per_thread_resources.md | 101 +++++ docs/content/methodology/overview/terminology.md | 97 +++++ .../methodology/overview/vpp_forwarding_modes.md | 104 +++++ docs/content/methodology/packet_flow_ordering.md | 42 -- docs/content/methodology/packet_latency.md | 45 --- docs/content/methodology/per_patch_testing.md | 230 +++++++++++ docs/content/methodology/per_thread_resources.md | 102 ----- docs/content/methodology/reconfiguration_tests.md | 68 ---- .../methodology/root_cause_analysis/_index.md | 6 - .../perpatch_performance_tests.md | 228 ----------- docs/content/methodology/suite_generation.md | 124 ------ docs/content/methodology/telemetry.md | 167 -------- docs/content/methodology/terminology.md | 82 ---- docs/content/methodology/test/_index.md | 6 + .../methodology/test/access_control_lists.md | 66 +++ .../test/generic_segmentation_offload.md | 117 ++++++ docs/content/methodology/test/hoststack/_index.md | 6 + .../test/hoststack/quicudpip_with_vppecho.md | 48 +++ .../test/hoststack/tcpip_with_iperf3.md | 52 +++ .../test/hoststack/udpip_with_iperf3.md | 44 ++ .../test/hoststack/vsap_ab_with_nginx.md | 39 ++ .../methodology/test/internet_protocol_security.md | 73 ++++ .../test/network_address_translation.md | 445 +++++++++++++++++++++ .../methodology/test/packet_flow_ordering.md | 42 ++ docs/content/methodology/test/reconfiguration.md | 68 ++++ .../methodology/test/tunnel_encapsulations.md | 87 ++++ docs/content/methodology/test/vpp_device.md | 15 + docs/content/methodology/trending/_index.md | 12 + docs/content/methodology/trending/analysis.md | 224 +++++++++++ docs/content/methodology/trending/presentation.md | 34 ++ 
.../methodology/trending_methodology/_index.md | 6 - .../methodology/trending_methodology/overview.md | 10 - .../trending_methodology/trend_analysis.md | 224 ----------- .../trending_methodology/trend_presentation.md | 36 -- docs/content/methodology/trex_traffic_generator.md | 195 --------- docs/content/methodology/tunnel_encapsulations.md | 41 -- docs/content/methodology/vpp_device_functional.md | 15 - docs/content/methodology/vpp_forwarding_modes.md | 104 ----- docs/content/methodology/vpp_startup_settings.md | 44 -- 68 files changed, 2995 insertions(+), 3362 deletions(-) delete mode 100644 docs/content/methodology/access_control_lists.md delete mode 100644 docs/content/methodology/data_plane_throughput/_index.md delete mode 100644 docs/content/methodology/data_plane_throughput/data_plane_throughput.md delete mode 100644 docs/content/methodology/data_plane_throughput/mlrsearch.md delete mode 100644 docs/content/methodology/data_plane_throughput/mrr_throughput.md delete mode 100644 docs/content/methodology/data_plane_throughput/plrsearch.md delete mode 100644 docs/content/methodology/dut_state_considerations.md delete mode 100644 docs/content/methodology/generic_segmentation_offload.md delete mode 100644 docs/content/methodology/geneve.md delete mode 100644 docs/content/methodology/hoststack_testing/_index.md delete mode 100644 docs/content/methodology/hoststack_testing/quicudpip_with_vppecho.md delete mode 100644 docs/content/methodology/hoststack_testing/tcpip_with_iperf3.md delete mode 100644 docs/content/methodology/hoststack_testing/udpip_with_iperf3.md delete mode 100644 docs/content/methodology/hoststack_testing/vsap_ab_with_nginx.md delete mode 100644 docs/content/methodology/internet_protocol_security_ipsec.md create mode 100644 docs/content/methodology/measurements/_index.md create mode 100644 docs/content/methodology/measurements/data_plane_throughput/_index.md create mode 100644 docs/content/methodology/measurements/data_plane_throughput/data_plane_throughput.md create mode 100644 docs/content/methodology/measurements/data_plane_throughput/mlr_search.md create mode 100644 docs/content/methodology/measurements/data_plane_throughput/mrr.md create mode 100644 docs/content/methodology/measurements/data_plane_throughput/plr_search.md create mode 100644 docs/content/methodology/measurements/packet_latency.md create mode 100644 docs/content/methodology/measurements/telemetry.md delete mode 100644 docs/content/methodology/multi_core_speedup.md delete mode 100644 docs/content/methodology/network_address_translation.md create mode 100644 docs/content/methodology/overview/_index.md create mode 100644 docs/content/methodology/overview/dut_state_considerations.md create mode 100644 docs/content/methodology/overview/multi_core_speedup.md create mode 100644 docs/content/methodology/overview/per_thread_resources.md create mode 100644 docs/content/methodology/overview/terminology.md create mode 100644 docs/content/methodology/overview/vpp_forwarding_modes.md delete mode 100644 docs/content/methodology/packet_flow_ordering.md delete mode 100644 docs/content/methodology/packet_latency.md create mode 100644 docs/content/methodology/per_patch_testing.md delete mode 100644 docs/content/methodology/per_thread_resources.md delete mode 100644 docs/content/methodology/reconfiguration_tests.md delete mode 100644 docs/content/methodology/root_cause_analysis/_index.md delete mode 100644 docs/content/methodology/root_cause_analysis/perpatch_performance_tests.md delete mode 100644 
docs/content/methodology/suite_generation.md delete mode 100644 docs/content/methodology/telemetry.md delete mode 100644 docs/content/methodology/terminology.md create mode 100644 docs/content/methodology/test/_index.md create mode 100644 docs/content/methodology/test/access_control_lists.md create mode 100644 docs/content/methodology/test/generic_segmentation_offload.md create mode 100644 docs/content/methodology/test/hoststack/_index.md create mode 100644 docs/content/methodology/test/hoststack/quicudpip_with_vppecho.md create mode 100644 docs/content/methodology/test/hoststack/tcpip_with_iperf3.md create mode 100644 docs/content/methodology/test/hoststack/udpip_with_iperf3.md create mode 100644 docs/content/methodology/test/hoststack/vsap_ab_with_nginx.md create mode 100644 docs/content/methodology/test/internet_protocol_security.md create mode 100644 docs/content/methodology/test/network_address_translation.md create mode 100644 docs/content/methodology/test/packet_flow_ordering.md create mode 100644 docs/content/methodology/test/reconfiguration.md create mode 100644 docs/content/methodology/test/tunnel_encapsulations.md create mode 100644 docs/content/methodology/test/vpp_device.md create mode 100644 docs/content/methodology/trending/_index.md create mode 100644 docs/content/methodology/trending/analysis.md create mode 100644 docs/content/methodology/trending/presentation.md delete mode 100644 docs/content/methodology/trending_methodology/_index.md delete mode 100644 docs/content/methodology/trending_methodology/overview.md delete mode 100644 docs/content/methodology/trending_methodology/trend_analysis.md delete mode 100644 docs/content/methodology/trending_methodology/trend_presentation.md delete mode 100644 docs/content/methodology/trex_traffic_generator.md delete mode 100644 docs/content/methodology/tunnel_encapsulations.md delete mode 100644 docs/content/methodology/vpp_device_functional.md delete mode 100644 docs/content/methodology/vpp_forwarding_modes.md delete mode 100644 docs/content/methodology/vpp_startup_settings.md (limited to 'docs/content/methodology') diff --git a/docs/content/methodology/_index.md b/docs/content/methodology/_index.md index 6f0dcae783..dbef64db94 100644 --- a/docs/content/methodology/_index.md +++ b/docs/content/methodology/_index.md @@ -1,6 +1,6 @@ --- -bookCollapseSection: true +bookCollapseSection: false bookFlatSection: true title: "Methodology" weight: 2 ---- \ No newline at end of file +--- diff --git a/docs/content/methodology/access_control_lists.md b/docs/content/methodology/access_control_lists.md deleted file mode 100644 index 9767d3f86a..0000000000 --- a/docs/content/methodology/access_control_lists.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Access Control Lists" -weight: 12 ---- - -# Access Control Lists - -VPP is tested in a number of data plane feature configurations across -different forwarding modes. Following sections list features tested. - -## ACL Security-Groups - -Both stateless and stateful access control lists (ACL), also known as -security-groups, are supported by VPP. - -Following ACL configurations are tested for MAC switching with L2 -bridge-domains: - -- *l2bdbasemaclrn-iacl{E}sl-{F}flows*: Input stateless ACL, with {E} - entries and {F} flows. -- *l2bdbasemaclrn-oacl{E}sl-{F}flows*: Output stateless ACL, with {E} - entries and {F} flows. -- *l2bdbasemaclrn-iacl{E}sf-{F}flows*: Input stateful ACL, with {E} - entries and {F} flows. 
-- *l2bdbasemaclrn-oacl{E}sf-{F}flows*: Output stateful ACL, with {E} - entries and {F} flows. - -Following ACL configurations are tested with IPv4 routing: - -- *ip4base-iacl{E}sl-{F}flows*: Input stateless ACL, with {E} entries - and {F} flows. -- *ip4base-oacl{E}sl-{F}flows*: Output stateless ACL, with {E} entries - and {F} flows. -- *ip4base-iacl{E}sf-{F}flows*: Input stateful ACL, with {E} entries and - {F} flows. -- *ip4base-oacl{E}sf-{F}flows*: Output stateful ACL, with {E} entries - and {F} flows. - -ACL tests are executed with the following combinations of ACL entries -and number of flows: - -- ACL entry definitions - - - flow non-matching deny entry: (src-ip4, dst-ip4, src-port, dst-port). - - flow matching permit ACL entry: (src-ip4, dst-ip4). - -- {E} - number of non-matching deny ACL entries, {E} = [1, 10, 50]. -- {F} - number of UDP flows with different tuple (src-ip4, dst-ip4, - src-port, dst-port), {F} = [100, 10k, 100k]. -- All {E}x{F} combinations are tested per ACL type, total of 9. - -## ACL MAC-IP - -MAC-IP binding ACLs are tested for MAC switching with L2 bridge-domains: - -- *l2bdbasemaclrn-macip-iacl{E}sl-{F}flows*: Input stateless ACL, with - {E} entries and {F} flows. - -MAC-IP ACL tests are executed with the following combinations of ACL -entries and number of flows: - -- ACL entry definitions - - - flow non-matching deny entry: (dst-ip4, dst-mac, bit-mask) - - flow matching permit ACL entry: (dst-ip4, dst-mac, bit-mask) - -- {E} - number of non-matching deny ACL entries, {E} = [1, 10, 50] -- {F} - number of UDP flows with different tuple (dst-ip4, dst-mac), - {F} = [100, 10k, 100k] -- All {E}x{F} combinations are tested per ACL type, total of 9. diff --git a/docs/content/methodology/data_plane_throughput/_index.md b/docs/content/methodology/data_plane_throughput/_index.md deleted file mode 100644 index 5791438b3b..0000000000 --- a/docs/content/methodology/data_plane_throughput/_index.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -bookCollapseSection: true -bookFlatSection: false -title: "Data Plane Throughput" -weight: 4 ---- \ No newline at end of file diff --git a/docs/content/methodology/data_plane_throughput/data_plane_throughput.md b/docs/content/methodology/data_plane_throughput/data_plane_throughput.md deleted file mode 100644 index 7ff1d38d17..0000000000 --- a/docs/content/methodology/data_plane_throughput/data_plane_throughput.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -title: "Data Plane Throughput" -weight: 1 ---- - -# Data Plane Throughput - -Network data plane throughput is measured using multiple test methods in -order to obtain representative and repeatable results across the large -set of performance test cases implemented and executed within CSIT. - -Following throughput test methods are used: - -- MLRsearch - Multiple Loss Ratio search -- MRR - Maximum Receive Rate -- PLRsearch - Probabilistic Loss Ratio search - -Description of each test method is followed by generic test properties -shared by all methods. - -## MLRsearch Tests - -### Description - -Multiple Loss Ratio search (MLRsearch) tests discover multiple packet -throughput rates in a single search, reducing the overall test execution -time compared to a binary search. Each rate is associated with a -distinct Packet Loss Ratio (PLR) criteria. In FD.io CSIT two throughput -rates are discovered: Non-Drop Rate (NDR, with zero packet loss, PLR=0) -and Partial Drop Rate (PDR, with PLR<0.5%). MLRsearch is compliant with -RFC2544. 
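To make the single-pass idea concrete, here is a simplified Python sketch
(an illustration only, not the CSIT implementation): NDR and PDR bounds are
narrowed from a shared cache of trial results, and `measure()`, `mlr_search()`
and the parameter values are hypothetical stand-ins.

    # Simplified single-pass NDR/PDR discovery (illustrative, not CSIT code).
    def measure(load_pps, duration_s):
        """Hypothetical stand-in: run one trial, return observed loss ratio."""
        raise NotImplementedError  # provided by the test framework

    def mlr_search(min_rate, max_rate, goals=(0.0, 0.005),
                   rel_width=0.005, duration=30.0):
        """Return {loss_ratio_goal: (lower_bound, upper_bound)}."""
        cache = {}  # load -> loss ratio; trial results are shared across goals

        def trial(load):
            if load not in cache:
                cache[load] = measure(load, duration)
            return cache[load]

        results = {}
        for goal in sorted(goals):  # narrow NDR (0.0) before PDR (0.005)
            lower, upper = min_rate, max_rate
            while (upper - lower) / upper > rel_width:
                load = (lower + upper) / 2.0  # even bisection for simplicity
                if trial(load) <= goal:
                    lower = load  # loss conforms to the goal: new lower bound
                else:
                    upper = load  # too much loss: new upper bound
            results[goal] = (lower, upper)
        return results

Because early midpoints coincide, the PDR pass reuses NDR trial results from
the cache; the production algorithm adds shorter intermediate trial durations
and uneven bisection on top of this basic scheme.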
-
-### Usage
-
-MLRsearch tests are run to discover NDR and PDR rates for each VPP and
-DPDK release covered by the CSIT report. Results for small frame sizes
-(64B/78B, IMIX) are presented in packet throughput graphs
-(Box-and-Whisker Plots) with NDR and PDR rates plotted against the test
-cases covering popular VPP packet paths.
-
-Each test is executed at least 10 times to verify measurement
-repeatability, and results are compared between releases and test
-environments. NDR and PDR packet and bandwidth throughput results for
-all frame sizes and for all tests are presented in detailed results
-tables.
-
-### Details
-
-See the [MLRSearch]({{< ref "mlrsearch/#MLRsearch" >}}) section for more
-detail. MLRsearch is being standardized in IETF in
-[draft-ietf-bmwg-mlrsearch](https://datatracker.ietf.org/doc/html/draft-ietf-bmwg-mlrsearch-01).
-
-## MRR Tests
-
-### Description
-
-Maximum Receive Rate (MRR) tests are complementary to MLRsearch tests,
-as they provide a maximum “raw” throughput benchmark for the development
-and testing community.
-
-MRR tests measure the packet forwarding rate under the maximum load
-offered by the traffic generator (dependent on link type and NIC model)
-over a set trial duration, regardless of packet loss. Maximum load for
-a specified Ethernet frame size is set to the bi-directional link rate.
-
-### Usage
-
-MRR tests are much faster than MLRsearch as they rely on a single trial
-or a small set of trials with very short duration. It is this property
-that makes them suitable for continuous execution in daily performance
-trending jobs, enabling detection of performance anomalies (regressions,
-progressions) resulting from data plane code changes.
-
-MRR tests are also used for VPP per-patch performance jobs verifying
-patch performance vs parent. CSIT reports include MRR throughput
-comparisons between releases and test environments. Small frame sizes
-only (64B/78B, IMIX).
-
-### Details
-
-See the [MRR Throughput]({{< ref "mrr_throughput/#MRR Throughput" >}})
-section for more detail about MRR test configuration.
-
-The FD.io CSIT performance dashboard includes a complete description of
-[daily performance trending tests](https://s3-docs.fd.io/csit/master/trending/methodology/performance_tests.html)
-and [VPP per patch tests](https://s3-docs.fd.io/csit/master/trending/methodology/perpatch_performance_tests.html).
-
-## PLRsearch Tests
-
-### Description
-
-Probabilistic Loss Ratio search (PLRsearch) tests discover a packet
-throughput rate associated with a configured Packet Loss Ratio (PLR)
-criterion for tests run over an extended period of time, a.k.a. soak
-testing. PLRsearch assumes that the system under test is probabilistic
-in nature, and not deterministic.
-
-### Usage
-
-PLRsearch tests are run to discover a sustained throughput for PLR=10^-7
-(close to NDR) for each VPP release covered by the CSIT report. Results
-for small frame sizes (64B/78B) are presented in packet throughput
-graphs (Box Plots) for a small subset of baseline tests.
-
-Each soak test lasts 30 minutes and is executed at least twice. Results
-are compared against NDR and PDR rates discovered with MLRsearch.
-
-### Details
-
-See the [PLRSearch]({{< ref "plrsearch/#PLRsearch" >}}) methodology
-section for more detail. PLRsearch is being standardized in IETF in
-[draft-vpolak-bmwg-plrsearch](https://tools.ietf.org/html/draft-vpolak-bmwg-plrsearch).
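As a sense check of the scale this loss-ratio goal implies: at a sustained
load of 10 Mpps, the PLR=10^-7 criterion corresponds to an average of just
one lost packet per second (10^7 pps × 10^-7 = 1 pps), which is why soak
tests need extended durations to accumulate statistically meaningful loss
counts.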
-
-## Generic Test Properties
-
-All data plane throughput test methodologies share the following
-generic properties:
-
-- Tested L2 frame sizes (untagged Ethernet):
-
-  - IPv4 payload: 64B, IMIX (28x64B, 16x570B, 4x1518B), 1518B, 9000B.
-  - IPv6 payload: 78B, IMIX (28x78B, 16x570B, 4x1518B), 1518B, 9000B.
-  - All quoted sizes include frame CRC, but exclude per frame
-    transmission overhead of 20B (preamble, inter frame gap).
-
-- Offered packet load is always bi-directional and symmetric.
-- All measured and reported packet and bandwidth rates are aggregate
-  bi-directional rates reported from external Traffic Generator
-  perspective.
\ No newline at end of file
diff --git a/docs/content/methodology/data_plane_throughput/mlrsearch.md b/docs/content/methodology/data_plane_throughput/mlrsearch.md
deleted file mode 100644
index 73039c9b02..0000000000
--- a/docs/content/methodology/data_plane_throughput/mlrsearch.md
+++ /dev/null
@@ -1,88 +0,0 @@
----
-title: "MLRsearch"
-weight: 2
----
-
-# MLRsearch
-
-## Overview
-
-Multiple Loss Ratio search (MLRsearch) tests use an optimized search
-algorithm implemented in the FD.io CSIT project. MLRsearch discovers
-any number of loss ratio loads in a single search.
-
-Two loss ratio goals are of interest in FD.io CSIT, leading to Non-Drop
-Rate (NDR, loss ratio goal is exactly zero) and Partial Drop Rate
-(PDR, non-zero loss ratio goal, currently 0.5%).
-
-MLRsearch discovers all the loads in a single pass, reducing the
-required time duration compared to separate binary searches[^1] for
-each rate. Overall search time is reduced even further by relying on
-shorter trial durations of intermediate steps, with only the final
-measurements conducted at the specified final trial duration. This
-results in a shorter overall execution time when compared to standard
-NDR/PDR binary search, while guaranteeing similar results.
-
-Note: All throughput rates are *always* bi-directional
-aggregates of two equal (symmetric) uni-directional packet rates
-received and reported by an external traffic generator,
-unless the test specifically requires unidirectional traffic.
-
-## Search Implementation
-
-Detailed description of the MLRsearch algorithm is included in the IETF
-draft
-[draft-ietf-bmwg-mlrsearch-02](https://datatracker.ietf.org/doc/html/draft-ietf-bmwg-mlrsearch-02)
-that is in the process of being standardized in the IETF Benchmarking
-Methodology Working Group (BMWG).
-(A newer version is published in IETF, describing improvements not yet
-used in CSIT production.)
-
-MLRsearch is also available as a
-[PyPI (Python Package Index) library](https://pypi.org/project/MLRsearch/).
-
-## Algorithm highlights
-
-MRR and the receive rate at MRR load are used as initial guesses for
-the search.
-
-All previously measured trials (except the very first one, which can
-act as a warm-up) are taken into consideration, unless superseded
-by a trial at the same load but higher duration.
-
-For every loss ratio goal, the tightest upper and lower bounds
-(from results of large enough trial duration) form an interval.
-The exit condition is given by that interval reaching a low enough
-relative width. A small enough width is achieved by bisecting the
-current interval. The bisection can be uneven, to save measurements
-based on information theory.
-
-Switching to a higher trial duration generally requires a re-measure
-at a load from the previous trial duration.
-When the re-measurement does not confirm the previous bound
-classification (e.g. the tightest lower bound at the shorter trial
-duration becomes the new tightest upper bound upon re-measurement),
-external search is used to find a close enough bound of the lost type.
-External search is a generalization of the first stage of
-`exponential search`[^2].
-
-Shorter trial durations use a doubled width goal,
-because one bisection is always safe before risking external search.
-
-Within an iteration for a specific trial duration, smaller loss ratios
-(NDR) are narrowed down first before the search continues with higher
-loss ratios (PDR).
-
-Other heuristics are in place, aimed at preventing unnecessarily narrow
-intervals, and at handling corner cases around min and max load.
-
-## Deviations from RFC 2544
-
-CSIT does not have any explicit wait times before and after trial
-traffic.
-
-Small differences between intended and offered load are tolerated,
-mainly due to various time overheads preventing precise measurement
-of the traffic duration (and TRex can sometimes suffer from duration
-stretching).
-
-The final trial duration is only 30s (10s for reconf tests).
-
-[^1]: [binary search](https://en.wikipedia.org/wiki/Binary_search)
-[^2]: [exponential search](https://en.wikipedia.org/wiki/Exponential_search)
diff --git a/docs/content/methodology/data_plane_throughput/mrr_throughput.md b/docs/content/methodology/data_plane_throughput/mrr_throughput.md
deleted file mode 100644
index 076946fb66..0000000000
--- a/docs/content/methodology/data_plane_throughput/mrr_throughput.md
+++ /dev/null
@@ -1,56 +0,0 @@
----
-title: "MRR Throughput"
-weight: 4
----
-
-# MRR Throughput
-
-Maximum Receive Rate (MRR) tests are complementary to MLRsearch tests,
-as they provide a maximum "raw" throughput benchmark for the development
-and testing community. MRR tests measure the packet forwarding rate
-under the maximum load offered by the traffic generator over a set
-trial duration, regardless of packet loss.
-
-MRR tests are currently used for the following test jobs:
-
-- Report performance comparison: 64B, IMIX for vhost, memif.
-- Daily performance trending: 64B, IMIX for vhost, memif.
-- Per-patch performance verification: 64B.
-- Initial iterations of MLRsearch and PLRsearch: 64B.
-
-Maximum offered load for a specific L2 Ethernet frame size is set to
-either the maximum bi-directional link rate or the tested NIC model
-capacity, as follows:
-
-- For 10GE NICs the maximum packet rate load is 2x14.88 Mpps for 64B, a
-  10GE bi-directional link rate.
-- For 25GE NICs the maximum packet rate load is 2x18.75 Mpps for 64B, a
-  25GE bi-directional link sub-rate limited by the 25GE NIC used on the
-  TRex TG, XXV710.
-- For 40GE NICs the maximum packet rate load is 2x18.75 Mpps for 64B, a
-  40GE bi-directional link sub-rate limited by the 40GE NIC used on the
-  TRex TG, XL710. Packet rate for other tested frame sizes is limited
-  by the PCIe Gen3 x8 bandwidth limitation of ~50Gbps.
-
-MRR test code implements multiple bursts of offered packet load and has
-two configurable burst parameters: individual trial duration and number
-of trials in a single burst. This enables more precise performance
-trending by providing more results data for analysis.
-
-Burst parameter settings vary between different tests using MRR:
-
-- MRR individual trial duration:
-
-  - Report performance comparison: 1 sec.
-  - Daily performance trending: 1 sec.
-  - Per-patch performance verification: 10 sec.
-  - Initial iteration for MLRsearch: 1 sec.
-  - Initial iteration for PLRsearch: 5.2 sec.
-
-- Number of MRR trials per burst:
-
-  - Report performance comparison: 10.
-  - Daily performance trending: 10.
-  - Per-patch performance verification: 5.
-  - Initial iteration for MLRsearch: 1.
-  - Initial iteration for PLRsearch: 1.
\ No newline at end of file
diff --git a/docs/content/methodology/data_plane_throughput/plrsearch.md b/docs/content/methodology/data_plane_throughput/plrsearch.md
deleted file mode 100644
index 1facccc63b..0000000000
--- a/docs/content/methodology/data_plane_throughput/plrsearch.md
+++ /dev/null
@@ -1,383 +0,0 @@
----
-title: "PLRsearch"
-weight: 3
----
-
-# PLRsearch
-
-## Motivation for PLRsearch
-
-Network providers are interested in the throughput a system can
-sustain.
-
-`RFC 2544`[^3] assumes the loss ratio is given by a deterministic
-function of offered load. But NFV software systems are not
-deterministic enough. This causes deterministic algorithms (such as
-`binary search`[^9] per RFC 2544 and MLRsearch with a single trial) to
-return results which, when repeated, show relatively high standard
-deviation, thus making it harder to tell what "the throughput"
-actually is.
-
-We need another algorithm, which takes this indeterminism into account.
-
-## Generic Algorithm
-
-A detailed description of the PLRsearch algorithm is included in the
-IETF draft `draft-vpolak-bmwg-plrsearch-02`[^1] that is in the process
-of being standardized in the IETF Benchmarking Methodology Working
-Group (BMWG).
-
-### Terms
-
-The rest of this page assumes the reader is familiar with the following
-terms defined in the IETF draft:
-
-+ Trial Order Independent System
-+ Duration Independent System
-+ Target Loss Ratio
-+ Critical Load
-+ Offered Load regions
-
-  + Zero Loss Region
-  + Non-Deterministic Region
-  + Guaranteed Loss Region
-
-+ Fitting Function
-
-  + Stretch Function
-  + Erf Function
-
-+ Bayesian Inference
-
-  + Prior distribution
-  + Posterior Distribution
-
-+ Numeric Integration
-
-  + Monte Carlo
-  + Importance Sampling
-
-## FD.io CSIT Implementation Specifics
-
-The search receives min_rate and max_rate values, to avoid measurements
-at offered loads not supported by the traffic generator.
-
-The implemented test cases use bidirectional traffic.
-The algorithm stores each rate as a bidirectional rate (internally,
-the algorithm is agnostic to flows and directions,
-it only cares about aggregate counts of packets sent and packets lost),
-but debug output from the traffic generator lists unidirectional
-values.
-
-### Measurement Delay
-
-In a sample implementation in the FD.io CSIT project, there is roughly
-a 0.5 second delay between trials due to restrictions imposed by the
-packet traffic generator in use (T-Rex).
-
-As measurement results come in, posterior distribution computation
-takes more time (per sample), although there is a considerable constant
-part (mostly for inverting the fitting functions).
-
-Also, the integrator needs a fair amount of samples to reach the region
-the posterior distribution is concentrated at.
-
-And of course, the speed of the integrator depends on the computing
-power of the CPU the algorithm is able to use.
-
-All those timing related effects are addressed by arithmetically
-increasing trial durations with configurable coefficients
-(currently 5.1 seconds for the first trial,
-each subsequent trial being 0.1 second longer).
-
-### Rounding Errors and Underflows
-
-In order to avoid them, the current implementation tracks the natural
-logarithm (instead of the original quantity) for any quantity which is
-never negative. Logarithm of zero is minus infinity (not supported by
-Python), so the special value "None" is used instead.
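For instance, a log-space addition helper following this None-as-log-zero
convention could look like the sketch below (illustrative only, not the CSIT
source):

    import math

    def log_plus(log_a, log_b):
        """Return log(exp(log_a) + exp(log_b)); None stands for log(0)."""
        if log_a is None:
            return log_b
        if log_b is None:
            return log_a
        hi, lo = max(log_a, log_b), min(log_a, log_b)
        # Stable log-sum-exp: hi + log(1 + exp(lo - hi)) cannot overflow.
        return hi + math.log1p(math.exp(lo - hi))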
-Specific functions for frequent operations (such as "logarithm
-of sum of exponentials") are defined to handle None correctly.
-
-### Fitting Functions
-
-The current implementation uses two fitting functions, called "stretch"
-and "erf". In general, their estimates for the critical rate differ,
-which adds a simple source of systematic error,
-on top of the randomness error reported by the integrator.
-Otherwise the reported stdev of the critical rate estimate
-would be unrealistically low.
-
-Both functions are not only increasing, but also convex
-(meaning the rate of increase is also increasing).
-
-Both fitting functions have several mathematically equivalent formulas,
-and each can lead to an arithmetic overflow or underflow in different
-sub-terms. Overflows can be eliminated by using different exact
-formulas for different argument ranges. Underflows can be avoided by
-using approximate formulas in affected argument ranges, each such range
-having its own formula to compute. In the end, both fitting function
-implementations contain multiple "if" branches, so discontinuities are
-a possibility at range boundaries.
-
-### Prior Distributions
-
-The numeric integrator expects all the parameters to be distributed
-(independently and) uniformly on an interval (-1, 1).
-
-As both "mrr" and "spread" parameters are positive and not
-dimensionless, a transformation is needed. Dimensionality is inherited
-from the max_rate value.
-
-The "mrr" parameter follows a `Lomax distribution`[^4]
-with alpha equal to one, but shifted so that mrr is always greater
-than 1 packet per second.
-
-The "stretch" parameter is generated simply as the "mrr" value
-raised to a random power between zero and one;
-thus it follows a `reciprocal distribution`[^5].
-
-### Integrator
-
-After a few measurements, the posterior distribution of the fitting
-function arguments gets quite concentrated in a small area.
-The integrator uses `Monte Carlo`[^6] with `importance sampling`[^7],
-where the biased distribution is a `bivariate Gaussian`[^8]
-distribution with deliberately larger variance.
-If the generated sample falls outside the (-1, 1) interval,
-another sample is generated.
-
-The center and the covariance matrix for the biased distribution
-are based on the first and second moments of the samples seen so far
-(within the computation). The center is used directly, while the
-covariance matrix is scaled up by a heuristic constant (8.0 by
-default). The following additional features are applied,
-designed to avoid hyper-focused distributions.
-
-Each computation starts with the biased distribution inherited
-from the previous computation (a zero point and unit covariance matrix
-is used in the first computation), but the overall weight of the data
-is set to the weight of the first sample of the computation.
-Also, the center is set to the first sample point.
-When additional samples come, their weight (including the importance
-correction) is compared to the sum of the weights of data seen so far
-(within the iteration). If the new sample is more than one e-fold more
-impactful, both weight values (for data so far and for the new sample)
-are set to the (geometric) average of the two weights.
-
-This combination showed the best behavior, as the integrator usually
-follows two phases. The first phase (where the inherited biased
-distribution or a single big sample is dominating) is mainly important
-for locating the new area the posterior distribution is concentrated
-at. The second phase (dominated by the whole sample population)
-is actually relevant for the critical rate estimation.
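A heavily condensed sketch of the self-normalized importance-sampling core
(the real integrator adapts a full covariance matrix and tracks data weights
as described above; `log_posterior`, the dimensions, the sample count, and
the scale are illustrative placeholders):

    import math
    import random

    def posterior_mean(log_posterior, dim=2, n_samples=10000, scale=2.0):
        """Estimate the posterior mean of parameters on (-1, 1)^dim by
        importance sampling from a wider, zero-centered Gaussian."""
        total_weight = 0.0
        mean = [0.0] * dim
        for _ in range(n_samples):
            # Draw from the biased distribution; redraw if outside (-1, 1).
            while True:
                x = [random.gauss(0.0, scale) for _ in range(dim)]
                if all(-1.0 < xi < 1.0 for xi in x):
                    break
            # Importance weight = posterior density / biased density;
            # normalizing constants cancel under self-normalization.
            log_q = sum(-0.5 * (xi / scale) ** 2 for xi in x)
            weight = math.exp(log_posterior(x) - log_q)
            total_weight += weight
            mean = [m + weight * xi for m, xi in zip(mean, x)]
        return [m / total_weight for m in mean]

Self-normalization is what makes it possible to work directly with
unnormalized log-posterior values; the production code additionally keeps the
weight arithmetic in log space, for the underflow reasons discussed above.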
-
-### Offered Load Selection
-
-The first two measurements are hardcoded to happen at the middle of the
-rate interval and at max_rate. The next two measurements follow
-MRR-like logic: the offered load is decreased so that it would reach
-the target loss ratio if the offered load decrease led to an equal
-decrease of the loss rate.
-
-The rest of the measurements start directly at the average of the
-erf and stretch estimates.
-There is one workaround implemented, aimed at reducing the number of
-consecutive zero loss measurements (per fitting function). The
-workaround first stores every measurement result whose loss ratio was
-the target loss ratio or higher. A sorted list (called lossy loads) of
-such results is maintained.
-
-When a sequence of one or more zero loss measurement results is
-encountered, the smallest of the lossy loads is drained from the list.
-If the estimate average is smaller than the drained value,
-a weighted average of this estimate and the drained value is used
-as the next offered load. The weight of the estimate decreases
-exponentially with the length of consecutive zero loss results.
-
-This behavior helps the algorithm with convergence speed,
-as it does not need so many zero loss results to get near the critical
-region. Using the smallest (not yet drained) of the lossy loads makes
-sure the new offered load is unlikely to result in a big loss region.
-Draining even if the estimate is large enough helps to discard
-early measurements when loss happened at too low an offered load.
-The current implementation adds 4 copies of lossy loads and drains 3 of
-them, which leads to fairly stable behavior even for somewhat
-inconsistent SUTs.
-
-### Caveats
-
-As high loss count measurements add many bits of information,
-they need a large amount of small loss count measurements to balance
-them, making the algorithm converge quite slowly. Typically, this
-happens when a few initial measurements suggest a spread way bigger
-than later measurements do. The workaround in offered load selection
-helps, but more intelligent workarounds could get faster convergence
-still.
-
-Some systems evidently do not follow the assumption of repeated
-measurements having the same average loss rate (when the offered load
-is the same). The idea of estimating the trend is not implemented at
-all, as the observed trends have varied characteristics.
-
-Probably, using more realistic fitting functions
-will give better estimates than trend analysis.
-
-## Bottom Line
-
-The notion of Throughput is easy to grasp, but it is harder to measure
-with any accuracy for non-deterministic systems.
-
-Even though the notion of critical rate is harder to grasp than the
-notion of throughput, it is easier to measure using probabilistic
-methods.
-
-In testing, the difference between throughput measurements and critical
-rate measurements is usually small.
-
-In practice, rules of thumb such as "send at max 95% of purported
-throughput" are common. The correct benchmarking analysis should ask
-"Which notion is 95% of throughput an approximation to?" before
-attempting to answer "Is 95% of critical rate safe enough?".
-
-## Algorithmic Analysis
-
-### Motivation
-
-While the estimation computation is based on hard probability science,
-the offered load selection part of the PLRsearch logic is pure
-heuristics, motivated by what a human would do based on measurement and
-computation results.
-
-The quality of any heuristic is not affected by the soundness of its
-motivation, just by its ability to achieve the intended goals.
-In the case of offered load selection, the goal is to help the search
-converge to the long duration estimates sooner.
-
-But even those long duration estimates could still be of poor quality.
-Even though the estimate computation is Bayesian (so it is the best it
-could be within the applied assumptions), it can still be of poor
-quality when compared to what a human would estimate.
-
-One possible source of poor quality is the randomness inherently
-present in Monte Carlo numeric integration, but that can be suppressed
-by tweaking the time related input parameters.
-
-The most likely source of poor quality, then, is the assumptions.
-Most importantly, the number and the shape of the fitting functions;
-but also others, such as trial order independence and duration
-independence.
-
-The result can have poor quality in basically two ways.
-One way is related to location. Both upper and lower bounds
-can be overestimates or underestimates, meaning the entire estimated
-interval between the lower bound and the upper bound lies above or
-below (respectively) the human-estimated interval.
-The other way is related to the estimation interval width.
-The interval can be too wide or too narrow, compared to human
-estimation.
-
-An estimate from a particular fitting function can be classified
-as an overestimate (or underestimate) just by looking at its time
-evolution (without a human examining measurement results).
-Overestimates decrease over time, underestimates increase over time
-(assuming the system performance stays constant).
-
-Quality of the width of the estimation interval needs human evaluation,
-and is unrelated to both the rate of narrowing (both good and bad
-estimate intervals get narrower at approximately the same relative
-rate) and the relative width (which depends heavily on the system being
-tested).
-
-### Graphical Examples
-
-The following pictures show the upper (red) and lower (blue) bound,
-as well as the average of the Stretch (pink) and Erf (light green)
-estimates, and the offered load chosen (grey), as computed by
-PLRsearch, after each trial measurement within the 30 minute duration
-of a test run.
-
-Both graphs focus on later estimates. Estimates computed from
-a few initial measurements are wildly off the y-axis range shown.
-
-The following analysis will rely on the frequency of zero loss
-measurements and the magnitude of the loss ratio if nonzero.
-
-The offered load selection strategy used implies zero loss measurements
-can be gleaned from the graph by looking at the offered load points.
-When the points move up farther from the lower estimate, it means
-the previous measurement had zero loss. After non-zero loss,
-the offered load starts again right between (the previous values of)
-the estimate curves.
-
-The very big loss ratio results are visible as noticeable jumps
-of both estimates downwards. Medium and small loss ratios are much
-harder to distinguish just by looking at the estimate curves;
-the analysis is based on raw loss ratio measurement results.
-
-The following descriptions should explain why the graphs seem to signal
-a low quality estimate at first sight, but a more detailed look
-reveals the quality is good (considering the measurement results).
-
-#### L2 patch
-
-Both fitting functions give similar estimates, the graph shows
-the "stochasticity" of measurements (estimates increase and decrease
-within small time regions), and an overall trend of decreasing
-estimates.
-
-At first look, the final interval looks fairly narrow,
-especially compared to the region the estimates have travelled
-during the search.
-But a look at the frequency of zero loss results shows
-this is not a case of overestimation. Measurements at around the same
-offered load have a higher probability of zero loss earlier
-(when performed farther from the upper bound), but a smaller
-probability later (when performed closer to the upper bound). That
-means it is the performance of the system under test that decreases
-(slightly) over time.
-
-With that in mind, the apparent narrowness of the interval
-is not a sign of low quality, just a consequence of PLRsearch assuming
-the performance stays constant.
-
-{{< figure src="/cdocs/PLR_patch.svg" >}}
-
-#### Vhost
-
-This test case shows what looks like a quite broad estimation interval,
-compared to other test cases with similarly looking zero loss
-frequencies. Notable features are infrequent high-loss measurement
-results causing big drops of estimates, and a lack of long-term
-convergence.
-
-Any convergence in medium-sized intervals (during zero loss results)
-is reverted by the big loss results, as they happen quite far
-from the critical load estimates, and the two fitting functions
-extrapolate differently.
-
-In other words, a human only seeing estimates from one fitting function
-would expect a narrower end interval, but a human seeing the measured
-loss ratios agrees that the interval should be wider than that.
-
-{{< figure src="/cdocs/PLR_vhost.svg" >}}
-
-#### Summary
-
-The two graphs show the behavior of the PLRsearch algorithm applied to
-a soak test when some of the PLRsearch assumptions do not hold:
-
-+ L2 patch measurement results violate the assumption
-  of performance not changing over time.
-+ Vhost measurement results violate the assumption
-  of a Poisson distribution matching the loss counts.
-
-The reported upper and lower bounds can be farther apart or closer
-together than a first look by a human would suggest, but a closer look
-reveals the quality is good, considering the circumstances.
-
-The usefulness of the critical load estimate is questionable
-when the assumptions are violated.
-
-Some improvements can be made via more specific workarounds,
-for example the long-term limit of L2 patch performance could be
-estimated by some heuristic.
-
-Other improvements can be achieved only by asking users
-whether loss patterns matter. Is it better to have single digit losses
-distributed fairly evenly over time (as a Poisson distribution would
-suggest), or is it better to have short periods of medium losses
-mixed with long periods of zero losses (as happens in the Vhost test)
-with the same overall loss ratio?
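A small worked example shows why the Poisson assumption matters here: with an
average of, say, 2 lost packets per trial, a Poisson model gives a zero-loss
trial probability of e^-2, roughly 13.5%. Long runs of zero-loss trials
interrupted by occasional medium-loss bursts (the Vhost pattern) are
therefore far more consistent with a bursty loss process than with Poisson
losses at the same overall loss ratio.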
-
-[^1]: [draft-vpolak-bmwg-plrsearch-02](https://tools.ietf.org/html/draft-vpolak-bmwg-plrsearch-02)
-[^2]: [plrsearch draft](https://tools.ietf.org/html/draft-vpolak-bmwg-plrsearch-00)
-[^3]: [RFC 2544](https://tools.ietf.org/html/rfc2544)
-[^4]: [Lomax distribution](https://en.wikipedia.org/wiki/Lomax_distribution)
-[^5]: [reciprocal distribution](https://en.wikipedia.org/wiki/Reciprocal_distribution)
-[^6]: [Monte Carlo](https://en.wikipedia.org/wiki/Monte_Carlo_integration)
-[^7]: [importance sampling](https://en.wikipedia.org/wiki/Importance_sampling)
-[^8]: [bivariate Gaussian](https://en.wikipedia.org/wiki/Multivariate_normal_distribution)
-[^9]: [binary search](https://en.wikipedia.org/wiki/Binary_search_algorithm)
\ No newline at end of file
diff --git a/docs/content/methodology/dut_state_considerations.md b/docs/content/methodology/dut_state_considerations.md
deleted file mode 100644
index 55e408f5f2..0000000000
--- a/docs/content/methodology/dut_state_considerations.md
+++ /dev/null
@@ -1,148 +0,0 @@
----
-title: "DUT state considerations"
-weight: 6
----
-
-# DUT state considerations
-
-This page discusses considerations for Device Under Test (DUT) state.
-DUTs such as VPP require configuration to be provided before the
-application starts (via config files) or just after it starts (via API
-or CLI access).
-
-During operation DUTs gather various telemetry data, depending on
-configuration. This internal state handling is part of normal
-operation, so any performance impact is included in the test results.
-Accessing telemetry data is an additional load on the DUT,
-so we are not doing that in the main trial measurements that affect
-results, but we include separate trials specifically for gathering
-runtime telemetry.
-
-But there is one kind of state that needs specific handling.
-This kind of DUT state is dynamically created based on incoming
-traffic, it affects how the DUT handles the traffic, and (unlike
-telemetry counters) it has an uneven impact on CPU load.
-A typical example is NAT, where detecting new sessions takes more CPU
-than forwarding packets on existing (open or recently closed) sessions.
-We call DUT configurations with this kind of state "stateful",
-and configurations without them "stateless".
-(Even though stateless configurations contain state described in the
-previous paragraphs, and some configuration items may have "stateful"
-in their name, such as stateful ACLs.)
-
-# Stateful DUT configurations
-
-Typically, the level of CPU impact of traffic depends on DUT state.
-The first packets causing DUT state to change have a higher impact,
-subsequent packets matching that state have a lower impact.
-
-From a performance point of view, this is similar to traffic phases
-for stateful protocols, see the
-[NGFW draft](https://tools.ietf.org/html/draft-ietf-bmwg-ngfw-performance-05#section-4.3.4).
-In CSIT we borrow the terminology (even if it does not fit perfectly,
-see the discussion below). Ramp-up traffic causes the state change,
-sustain traffic does not change the state.
-
-As the performance is different, each test has to choose which traffic
-it wants to test, and manipulate the DUT state to achieve the intended
-impact.
-
-## Ramp-up trial
-
-Tests aiming at sustain performance need to make sure the DUT state is
-created. We achieve this via a ramp-up trial, whose specific purpose
-is to create the state.
-
-Subsequent trials need no specific handling, as long as the state
-remains the same. But some state can time out, so additional ramp-up
-trials are inserted whenever the code detects the state can time out.
-Note that a trial with zero loss refreshes the state,
-so only the time since the last non-zero loss trial is tracked.
-
-For the state to be set completely, it is important that both DUT and
-TG do not lose any packets. We achieve this by setting the profile
-multiplier (TPS from now on) to a low enough value.
-
-It is also important that each state-affecting packet is sent.
-For size-limited traffic profiles it is guaranteed by the size limit.
-For continuous traffic, we set a long enough duration (based on TPS).
-
-At the end of the ramp-up trial, we check the DUT state to confirm
-it has been created as expected.
-The test fails if the state is not (completely) created.
-
-## State Reset
-
-Tests aiming at ramp-up performance do not use a ramp-up trial,
-and they need to reset the DUT state before each trial measurement.
-The way of resetting the state depends on the test;
-usually an API call is used to partially de-configure
-the part that holds the state, and then re-configure it back.
-
-In CSIT we control the DUT state behavior via a test variable
-"resetter". If it is not set, the DUT state is not reset.
-If it is set, each search algorithm (including MRR) will invoke it
-before all trial measurements (both main and telemetry ones).
-Any configuration keyword enabling a feature with DUT state
-will check whether a test variable for ramp-up rate is present.
-If it is present, the resetter is not set.
-If it is not present, the keyword sets the appropriate resetter value.
-This logic makes sure either ramp-up or state reset is used.
-
-Notes: If both ramp-up and state reset were used, the DUT behavior
-would be identical to just reset, while the test would take longer to
-execute. If neither were used, the DUT would show different performance
-in subsequent trials, violating the assumptions of search algorithms.
-
-## DUT versus protocol ramp-up
-
-There are at least three different causes for bandwidth possibly
-increasing within a single measurement trial.
-
-The first is the DUT switching from the state modification phase to the
-constant phase; it is the primary focus of this document.
-Using ramp-up traffic before main trials eliminates this cause
-for tests wishing to measure the performance of the next phase.
-Using size-limited profiles eliminates the next phase
-for tests wishing to measure the performance of this phase.
-
-The second is a protocol such as TCP ramping up its throughput to
-utilize the bandwidth available. This is the original meaning of
-"ramp up" in the NGFW draft (see above).
-In existing tests we are not using this meaning of TCP ramp-up.
-Instead we use only small transactions, and a large enough initial
-window so TCP acts as ramped-up already.
-
-The third is TCP increasing offered load due to retransmissions
-triggered by packet loss. In CSIT we again try to avoid this behavior
-by using small enough data to transfer, so overlap of multiple
-transactions (the primary cause of packet loss) is unlikely.
-But in MRR tests, packet loss and non-constant offered load are still
-expected.
-
-# Stateless DUT configurations
-
-These are simple configurations, which do not set any resetter value
-(even if ramp-up duration is not configured).
-The majority of existing tests are of this type, using continuous
-traffic profiles.
-
-In order to identify limits of TRex performance,
-we have added suites with a stateless DUT configuration (VPP ip4base)
-subjected to size-limited ASTF traffic.
-The discovered rates serve as a basis of comparison
-for evaluating the results for stateful DUT configurations (VPP
-NAT44ed) subjected to the same traffic profiles.
-
-# DUT versus TG state
-
-Traffic Generator profiles can be stateful (ASTF) or stateless (STL).
-DUT configuration can be stateful or stateless (with respect to packet
-traffic).
-
-In CSIT we currently use all four possible configurations:
-
-- Regular stateless VPP tests use stateless traffic profiles.
-
-- Stateless VPP configuration with a stateful profile is used as a base
-  for comparison.
-
-- Some stateful DUT configurations (NAT44DET, NAT44ED unidirectional)
-  are tested using stateless traffic profiles and continuous traffic.
-
-- The rest of the stateful DUT configurations (NAT44ED bidirectional)
-  are tested using stateful traffic profiles and size limited traffic.
diff --git a/docs/content/methodology/generic_segmentation_offload.md b/docs/content/methodology/generic_segmentation_offload.md
deleted file mode 100644
index ddb19ba826..0000000000
--- a/docs/content/methodology/generic_segmentation_offload.md
+++ /dev/null
@@ -1,116 +0,0 @@
----
-title: "Generic Segmentation Offload"
-weight: 15
----
-
-# Generic Segmentation Offload
-
-## Overview
-
-Generic Segmentation Offload (GSO) reduces per-packet processing
-overhead by enabling applications to pass a multi-packet buffer to the
-(v)NIC and process a smaller number of large packets (e.g. frame size
-of 64 KB), instead of processing higher numbers of small packets (e.g.
-frame size of 1500 B), thus reducing per-packet overhead.
-
-GSO is tested on VPP vhostuser and tapv2 interfaces. All test cases use
-iPerf3 client and server applications running TCP/IP as a traffic
-generator. For performance comparison the same tests are run without
-GSO enabled.
-
-## GSO Test Topologies
-
-Two VPP GSO test topologies are implemented:
-
-1. iPerfC_GSOvirtio_LinuxVM --- GSOvhost_VPP_GSOvhost --- iPerfS_GSOvirtio_LinuxVM
-
-   - Tests VPP GSO on vhostuser interfaces and interaction with Linux
-     virtio with GSO enabled.
-
-2. iPerfC_GSOtap_LinuxNspace --- GSOtapv2_VPP_GSOtapv2 --- iPerfS_GSOtap_LinuxNspace
-
-   - Tests VPP GSO on tapv2 interfaces and interaction with Linux tap
-     with GSO enabled.
-
-Common configuration:
-
-- iPerfC (client) and iPerfS (server) run in TCP/IP mode without an
-  upper bandwidth limit.
-- Trial duration is set to 30 sec.
-- iPerfC, iPerfS and VPP run on a single SUT node.
-
-
-## VPP GSOtap Topology
-
-### VPP Configuration
-
-VPP GSOtap tests are executed without using hyperthreading. The VPP
-worker runs on a single core. Multi-core tests are not executed. Each
-interface belongs to a separate namespace. The following core pinning
-scheme is used:
-
-- 1t1c (rxq=1, rx_qsz=4096, tx_qsz=4096)
-  - system isolated: 0,28,56,84
-  - vpp mt: 1
-  - vpp wt: 2
-  - vhost: 3-5
-  - iperf-s: 6
-  - iperf-c: 7
-
-### iPerf3 Server Configuration
-
-iPerf3 version used: 3.7
-
-    $ sudo -E -S ip netns exec tap1_namespace iperf3 \
-        --server --daemon --pidfile /tmp/iperf3_server.pid --logfile /tmp/iperf3.log --port 5201 --affinity
-
-For the full iPerf3 reference please see:
-[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
-
-
-### iPerf3 Client Configuration
-
-iPerf3 version used: 3.7
-
-    $ sudo -E -S ip netns exec tap1_namespace iperf3 \
-        --client 2.2.2.2 --bind 1.1.1.1 --port 5201 --parallel --time 30.0 --affinity --zerocopy
-
-For the full iPerf3 reference please see:
-[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
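To confirm that the Linux end of the tap actually negotiated GSO, the offload
flags can be inspected from inside the namespace. The command below is
illustrative (the interface name `tap1` is an assumption following the
topology above) and shows the typical `ethtool -k` output format:

    $ sudo ip netns exec tap1_namespace ethtool -k tap1 | grep segmentation-offload
    tcp-segmentation-offload: on
    generic-segmentation-offload: on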
-
-
-## VPP GSOvhost Topology
-
-### VPP Configuration
-
-VPP GSOvhost tests are executed without using hyperthreading. The VPP
-worker runs on a single core. Multi-core tests are not executed. The
-following core pinning scheme is used:
-
-- 1t1c (rxq=1, rx_qsz=1024, tx_qsz=1024)
-  - system isolated: 0,28,56,84
-  - vpp mt: 1
-  - vpp wt: 2
-  - vm-iperf-s: 3,4,5,6,7
-  - vm-iperf-c: 8,9,10,11,12
-  - iperf-s: 1
-  - iperf-c: 1
-
-### iPerf3 Server Configuration
-
-iPerf3 version used: 3.7
-
-    $ sudo iperf3 \
-        --server --daemon --pidfile /tmp/iperf3_server.pid --logfile /tmp/iperf3.log --port 5201 --affinity X
-
-For the full iPerf3 reference please see:
-[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
-
-
-### iPerf3 Client Configuration
-
-iPerf3 version used: 3.7
-
-    $ sudo iperf3 \
-        --client 2.2.2.2 --bind 1.1.1.1 --port 5201 --parallel --time 30.0 --affinity X --zerocopy
-
-For the full iPerf3 reference please see:
-[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
diff --git a/docs/content/methodology/geneve.md b/docs/content/methodology/geneve.md
deleted file mode 100644
index f4a0af92e7..0000000000
--- a/docs/content/methodology/geneve.md
+++ /dev/null
@@ -1,66 +0,0 @@
----
-title: "GENEVE"
-weight: 21
----
-
-# GENEVE
-
-## GENEVE Prefix Bindings
-
-GENEVE prefix bindings should be representative of target applications,
-where packet flows of a particular set of IPv4 addresses (L3 underlay
-network) are routed via a dedicated GENEVE interface by building an L2
-overlay.
-
-Private address ranges to be used in tests:
-
-- East hosts IP address range: 10.0.1.0 - 10.127.255.255 (10.0/9 prefix)
-
-  - A total of 2^23 - 256 (8 388 352) usable IPv4 addresses
-  - Usable in tests for up to 32 767 GENEVE tunnels (IPv4 underlay
-    networks)
-
-- West hosts IP address range: 10.128.1.0 - 10.255.255.255 (10.128/9 prefix)
-
-  - A total of 2^23 - 256 (8 388 352) usable IPv4 addresses
-  - Usable in tests for up to 32 767 GENEVE tunnels (IPv4 underlay
-    networks)
-
-## GENEVE Tunnel Scale
-
-If N is the number of GENEVE tunnels (and IPv4 underlay networks), then
-the TG sends 256 packet flows in each of N different sets:
-
-- i = 1,2,3, ... N - GENEVE tunnel index
-
-- East-West direction: GENEVE encapsulated packets
-
-  - Outer IP header:
-
-    - src ip: 1.1.1.1
-
-    - dst ip: 1.1.1.2
-
-  - GENEVE header:
-
-    - vni: i
-
-  - Inner IP header:
-
-    - src_ip_range(i) = 10.(0 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
-
-    - dst_ip_range(i) = 10.(128 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
-
-- West-East direction: non-encapsulated packets
-
-  - IP header:
-
-    - src_ip_range(i) = 10.(128 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
-
-    - dst_ip_range(i) = 10.(0 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
-
- **geneve-tunnels** | **total-flows**
--------------------:|----------------:
-                  1 | 256
-                  4 | 1 024
-                 16 | 4 096
-                 64 | 16 384
-                256 | 65 536
-              1 024 | 262 144
\ No newline at end of file
diff --git a/docs/content/methodology/hoststack_testing/_index.md b/docs/content/methodology/hoststack_testing/_index.md
deleted file mode 100644
index b658313040..0000000000
--- a/docs/content/methodology/hoststack_testing/_index.md
+++ /dev/null
@@ -1,6 +0,0 @@
----
-bookCollapseSection: true
-bookFlatSection: false
-title: "Hoststack Testing"
-weight: 14
----
\ No newline at end of file
diff --git a/docs/content/methodology/hoststack_testing/quicudpip_with_vppecho.md b/docs/content/methodology/hoststack_testing/quicudpip_with_vppecho.md
deleted file mode 100644
index c7d57a51b3..0000000000
--- a/docs/content/methodology/hoststack_testing/quicudpip_with_vppecho.md
+++ /dev/null
@@ -1,48 +0,0 @@
----
-title: "QUIC/UDP/IP with vpp_echo"
-weight: 1
----
-
-# QUIC/UDP/IP with vpp_echo
-
-[vpp_echo performance testing tool](https://wiki.fd.io/view/VPP/HostStack#External_Echo_Server.2FClient_.28vpp_echo.29)
-is a bespoke performance test application which utilizes the 'native
-HostStack APIs' to verify performance and correct handling of
-connection/stream events with uni-directional and bi-directional
-streams of data.
-
-Because iperf3 does not support the QUIC transport protocol, vpp_echo
-is used for measuring the maximum attainable goodput of the VPP Host
-Stack connection utilizing the QUIC transport protocol across two
-instances of VPP running on separate DUT nodes. The QUIC transport
-protocol supports multiple streams per connection and test cases
-utilize different combinations of QUIC connections and number of
-streams per connection.
-
-The test configuration is as follows:
-
-    DUT1               Network                DUT2
-    [ vpp_echo-client -> VPP1 ]=======[ VPP2 -> vpp_echo-server]
-              N-streams/connection
-
-where,
-
-1. vpp_echo server attaches to VPP2 and LISTENs on VPP2:TCP port 1234.
-2. vpp_echo client creates one or more connections to VPP1 and opens
-   one or more streams per connection to VPP2:TCP port 1234.
-3. vpp_echo client transmits a uni-directional stream as fast as the
-   VPP Host Stack allows to the vpp_echo server for the test duration.
-4. At the end of the test the vpp_echo client emits the goodput
-   measurements for all streams and the sum of all streams.
-
-Test cases include:
-
-1. 1 QUIC Connection with 1 Stream
-2. 1 QUIC connection with 10 Streams
-3. 10 QUIC connections with 1 Stream
-4. 10 QUIC connections with 10 Streams
-
-with stream sizes to provide reasonable test durations. The VPP Host
-Stack QUIC transport is configured to utilize the picotls encryption
-library. In the future, tests utilizing additional encryption
-algorithms will be added.
diff --git a/docs/content/methodology/hoststack_testing/tcpip_with_iperf3.md b/docs/content/methodology/hoststack_testing/tcpip_with_iperf3.md deleted file mode 100644 index 7baa88ab50..0000000000 --- a/docs/content/methodology/hoststack_testing/tcpip_with_iperf3.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "TCP/IP with iperf3" -weight: 2 ---- - -# TCP/IP with iperf3 - -[iperf3 goodput measurement tool](https://github.com/esnet/iperf) -is used for measuring the maximum attainable goodput of the VPP Host -Stack connection across two instances of VPP running on separate DUT -nodes. iperf3 is a popular open source tool for active measurements -of the maximum achievable goodput on IP networks. - -Because iperf3 utilizes the POSIX socket interface APIs, the current -test configuration utilizes the LD_PRELOAD mechanism in the linux -kernel to connect iperf3 to the VPP Host Stack using the VPP -Communications Library (VCL) LD_PRELOAD library (libvcl_ldpreload.so). - -In the future, a forked version of iperf3 which has been modified to -directly use the VCL application APIs may be added to determine the -difference in performance of 'VCL Native' applications versus utilizing -LD_PRELOAD which inherently has more overhead and other limitations. - -The test configuration is as follows: - - DUT1 Network DUT2 - [ iperf3-client -> VPP1 ]=======[ VPP2 -> iperf3-server] - -where, - -1. iperf3 server attaches to VPP2 and LISTENs on VPP2:TCP port 5201. -2. iperf3 client attaches to VPP1 and opens one or more stream - connections to VPP2:TCP port 5201. -3. iperf3 client transmits a uni-directional stream as fast as the - VPP Host Stack allows to the iperf3 server for the test duration. -4. At the end of the test the iperf3 client emits the goodput - measurements for all streams and the sum of all streams. - -Test cases include 1 and 10 Streams with a 20 second test duration -with the VPP Host Stack configured to utilize the Cubic TCP -congestion algorithm. - -Note: iperf3 is single threaded, so it is expected that the 10 stream -test shows little or no performance improvement due to -multi-thread/multi-core execution. - -There are also variations of these test cases which use the VPP Network -Simulator (NSIM) plugin to test the VPP Hoststack goodput with 1 percent -of the traffic being dropped at the output interface of VPP1 thereby -simulating a lossy network. The NSIM tests are experimental and the -test results are not currently representative of typical results in a -lossy network. diff --git a/docs/content/methodology/hoststack_testing/udpip_with_iperf3.md b/docs/content/methodology/hoststack_testing/udpip_with_iperf3.md deleted file mode 100644 index 01ddf61269..0000000000 --- a/docs/content/methodology/hoststack_testing/udpip_with_iperf3.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: "UDP/IP with iperf3" -weight: 3 ---- - -# UDP/IP with iperf3 - -[iperf3 goodput measurement tool](https://github.com/esnet/iperf) -is used for measuring the maximum attainable goodput of the VPP Host -Stack connection across two instances of VPP running on separate DUT -nodes. iperf3 is a popular open source tool for active measurements -of the maximum achievable goodput on IP networks. - -Because iperf3 utilizes the POSIX socket interface APIs, the current -test configuration utilizes the LD_PRELOAD mechanism in the linux -kernel to connect iperf3 to the VPP Host Stack using the VPP -Communications Library (VCL) LD_PRELOAD library (libvcl_ldpreload.so). 
-
-In the future, a forked version of iperf3 which has been modified to
-directly use the VCL application APIs may be added to determine the
-difference in performance of 'VCL Native' applications versus utilizing
-LD_PRELOAD, which inherently has more overhead and other limitations.
-
-The test configuration is as follows:
-
-          DUT1               Network               DUT2
-    [ iperf3-client -> VPP1 ]=======[ VPP2 -> iperf3-server]
-
-where,
-
-1. iperf3 server attaches to VPP2 and LISTENs on VPP2:UDP port 5201.
-2. iperf3 client attaches to VPP1 and transmits one or more streams
-   of packets to VPP2:UDP port 5201.
-3. iperf3 client transmits a uni-directional stream as fast as the
-   VPP Host Stack allows to the iperf3 server for the test duration.
-4. At the end of the test the iperf3 client emits the goodput
-   measurements for all streams and the sum of all streams.
-
-Test cases include 1 and 10 Streams with a 20 second test duration
-with the VPP Host Stack using the UDP transport layer.
-
-Note: iperf3 is single threaded, so it is expected that the 10 stream
-test shows little or no performance improvement due to
-multi-thread/multi-core execution.
diff --git a/docs/content/methodology/hoststack_testing/vsap_ab_with_nginx.md b/docs/content/methodology/hoststack_testing/vsap_ab_with_nginx.md
deleted file mode 100644
index 2dc4d2b7f9..0000000000
--- a/docs/content/methodology/hoststack_testing/vsap_ab_with_nginx.md
+++ /dev/null
@@ -1,39 +0,0 @@
----
-title: "VSAP ab with nginx"
-weight: 4
----
-
-# VSAP ab with nginx
-
-[VSAP (VPP Stack Acceleration Project)](https://wiki.fd.io/view/VSAP)
-aims to establish an industry user space application ecosystem based on
-the VPP hoststack. As a pre-requisite to adapting open source applications
-using the VPP Communications Library to accelerate performance, the VSAP team
-has introduced baseline tests utilizing the LD_PRELOAD mechanism to capture
-baseline performance data.
-
-[AB (Apache HTTP server benchmarking tool)](https://httpd.apache.org/docs/2.4/programs/ab.html)
-is used for measuring the maximum connections-per-second and requests-per-second.
-
-[NGINX](https://www.nginx.com) is a popular open source HTTP server
-application. Because NGINX utilizes the POSIX socket interface APIs, the test
-configuration uses the LD_PRELOAD mechanism to connect NGINX to the VPP
-Hoststack using the VPP Communications Library (VCL) LD_PRELOAD library
-(libvcl_ldpreload.so).
-
-In the future, a version of NGINX which has been modified to
-directly use the VCL application APIs will be added to determine the
-difference in performance of 'VCL Native' applications versus utilizing
-LD_PRELOAD, which inherently has more overhead and other limitations.
-
-The test configuration is as follows:
-
-      TG               Network               DUT
-    [ AB ]=============[ VPP -> nginx ]
-
-where,
-
-1. nginx attaches to VPP and listens on TCP port 80.
-2. ab runs CPS and RPS tests with packets flowing from the Test Generator
-   node, across 100G NICs, through VPP hoststack to NGINX.
-3. At the end of the tests, the results are reported by AB.
diff --git a/docs/content/methodology/internet_protocol_security_ipsec.md b/docs/content/methodology/internet_protocol_security_ipsec.md
deleted file mode 100644
index 711004f2c0..0000000000
--- a/docs/content/methodology/internet_protocol_security_ipsec.md
+++ /dev/null
@@ -1,74 +0,0 @@
----
-title: "Internet Protocol Security (IPsec)"
-weight: 11
----
-
-# Internet Protocol Security (IPsec)
-
-VPP IPsec performance tests are executed for the following crypto
-plugins:
-
-- `crypto_native`, used for software based crypto leveraging CPU
-  platform optimizations, e.g. Intel's AES-NI instruction set.
-- `crypto_ipsecmb`, used for hardware based crypto with Intel QAT PCIe
-  cards.
-
-## IPsec with VPP Native SW Crypto
-
-CSIT implements the following IPsec test cases relying on VPP native crypto
-(`crypto_native` plugin):
-
- **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
-----------------------:|-------------------:|------------------:|-----------------:
- crypto_native | AES[128\|256]-GCM | GCM | 1 to 60k tunnels
- crypto_native | AES128-CBC | SHA[256\|512] | 1 to 60k tunnels
-
-VPP IPsec tests with SW crypto are executed in both tunnel and policy modes,
-with tests running on 3-node testbeds: 3n-icx, 3n-tsh.
-
-## IPsec with Intel QAT HW
-
-CSIT implements the following IPsec test cases relying on the ipsecmb library
-(`crypto_ipsecmb` plugin) and Intel QAT 8950 (50G HW crypto card):
-
- **VPP Crypto Engine** | **VPP Crypto Workers** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
-----------------------:|-----------------------:|-------------------:|------------------:|-----------------:
- crypto_ipsecmb | sync/all workers | AES[128\|256]-GCM | GCM | 1, 1k tunnels
- crypto_ipsecmb | sync/all workers | AES[128]-CBC | SHA[256\|512] | 1, 1k tunnels
- crypto_ipsecmb | async/crypto worker | AES[128\|256]-GCM | GCM | 1, 4, 1k tunnels
- crypto_ipsecmb | async/crypto worker | AES[128]-CBC | SHA[256\|512] | 1, 4, 1k tunnels
-
-## IPsec with Async Crypto Feature Workers
-
-*TODO Description to be added*
-
-## IPsec Uni-Directional Tests with VPP Native SW Crypto
-
-CSIT implements the following IPsec uni-directional test cases relying on VPP
-native crypto (`crypto_native` plugin) in tunnel mode:
-
- **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
-----------------------:|-------------------:|------------------:|-------------------:
- crypto_native | AES[128\|256]-GCM | GCM | 4, 1k, 10k tunnels
- crypto_native | AES128-CBC | SHA[512] | 4, 1k, 10k tunnels
-
-In policy mode:
-
- **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
-----------------------:|-------------------:|------------------:|------------------:
- crypto_native | AES[256]-GCM | GCM | 1, 40, 1k tunnels
-
-The tests are running on 2-node testbeds: 2n-tx2. The uni-directional tests
-partially address a weakness in 2-node testbed setups with T-Rex as
-the traffic generator. With just one DUT node, we can either encrypt or decrypt
-traffic in each direction.
-
-The testcases are only doing encryption - packets are encrypted on the DUT and
-then arrive at TG where no additional packet processing is needed (just
-counting packets).
-
-Decryption would require that the traffic generator generated encrypted packets
-which the DUT would then decrypt. However, T-Rex does not have the capability
-to encrypt packets.
diff --git a/docs/content/methodology/measurements/_index.md b/docs/content/methodology/measurements/_index.md
new file mode 100644
index 0000000000..9e9232969e
--- /dev/null
+++ b/docs/content/methodology/measurements/_index.md
@@ -0,0 +1,6 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Measurements"
+weight: 2
+---
diff --git a/docs/content/methodology/measurements/data_plane_throughput/_index.md b/docs/content/methodology/measurements/data_plane_throughput/_index.md
new file mode 100644
index 0000000000..8fc7f66f3e
--- /dev/null
+++ b/docs/content/methodology/measurements/data_plane_throughput/_index.md
@@ -0,0 +1,6 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Data Plane Throughput"
+weight: 1
+---
\ No newline at end of file
diff --git a/docs/content/methodology/measurements/data_plane_throughput/data_plane_throughput.md b/docs/content/methodology/measurements/data_plane_throughput/data_plane_throughput.md
new file mode 100644
index 0000000000..865405ba2f
--- /dev/null
+++ b/docs/content/methodology/measurements/data_plane_throughput/data_plane_throughput.md
@@ -0,0 +1,129 @@
+---
+title: "Overview"
+weight: 1
+---
+
+# Data Plane Throughput
+
+Network data plane throughput is measured using multiple test methods in
+order to obtain representative and repeatable results across the large
+set of performance test cases implemented and executed within CSIT.
+
+The following throughput test methods are used:
+
+- MLRsearch - Multiple Loss Ratio search
+- PLRsearch - Probabilistic Loss Ratio search
+- MRR - Maximum Receive Rate
+
+The description of each test method is followed by generic test properties
+shared by all methods.
+
+## MLRsearch Tests
+
+### Description
+
+Multiple Loss Ratio search (MLRsearch) tests discover multiple packet
+throughput rates in a single search, reducing the overall test execution
+time compared to a binary search. Each rate is associated with a
+distinct Packet Loss Ratio (PLR) criterion. In FD.io CSIT two throughput
+rates are discovered: Non-Drop Rate (NDR, with zero packet loss, PLR=0)
+and Partial Drop Rate (PDR, with PLR<0.5%). MLRsearch is compliant with
+RFC2544.
+
+### Usage
+
+MLRsearch tests are run to discover NDR and PDR rates for each VPP and
+DPDK release covered by the CSIT report. Results for small frame sizes
+(64B/78B, IMIX) are presented in packet throughput graphs
+(Box-and-Whisker Plots) with NDR and PDR rates plotted against the test
+cases covering popular VPP packet paths.
+
+Each test is executed at least 10 times to verify measurement
+repeatability, and results are compared between releases and test
+environments. NDR and PDR packet and bandwidth throughput results for
+all frame sizes and for all tests are presented in detailed results
+tables.
+
+### Details
+
+See the [MLRSearch]({{< ref "mlr_search/#MLRsearch" >}}) section for more
+detail. MLRsearch is being standardized in the IETF in
+[draft-ietf-bmwg-mlrsearch](https://datatracker.ietf.org/doc/html/draft-ietf-bmwg-mlrsearch-01).
+
+## PLRsearch Tests
+
+### Description
+
+Probabilistic Loss Ratio search (PLRsearch) tests discover a packet
+throughput rate associated with a configured Packet Loss Ratio (PLR)
+criterion for tests run over an extended period of time, a.k.a. soak
+testing. PLRsearch assumes that the system under test is probabilistic in
+nature, and not deterministic.
+
+### Usage
+
+PLRsearch tests are run to discover a sustained throughput for PLR=10^-7^
+(close to NDR) for the VPP release covered by the CSIT report.
Results for small
+frame sizes (64B/78B) are presented in packet throughput graphs (Box
+Plots) for a small subset of baseline tests.
+
+Each soak test lasts 30 minutes and is executed at least twice. Results are
+compared against NDR and PDR rates discovered with MLRsearch.
+
+### Details
+
+See the [PLRSearch]({{< ref "plr_search/#PLRsearch" >}}) methodology section
+for more detail. PLRsearch is being standardized in the IETF in
+[draft-vpolak-bmwg-plrsearch](https://tools.ietf.org/html/draft-vpolak-bmwg-plrsearch).
+
+## MRR Tests
+
+### Description
+
+Maximum Receive Rate (MRR) tests are complementary to MLRsearch tests,
+as they provide a maximum “raw” throughput benchmark for the development and
+testing community.
+
+MRR tests measure the packet forwarding rate under the maximum load
+offered by the traffic generator (dependent on link type and NIC model) over
+a set trial duration, regardless of packet loss. The maximum load for a
+specified Ethernet frame size is set to the bi-directional link rate.
+
+### Usage
+
+MRR tests are much faster than MLRsearch as they rely on a single trial
+or a small set of trials with very short duration. It is this property
+that makes them suitable for continuous execution in daily performance
+trending jobs, enabling detection of performance anomalies (regressions,
+progressions) resulting from data plane code changes.
+
+MRR tests are also used for VPP per patch performance jobs verifying
+patch performance vs parent. CSIT reports include MRR throughput
+comparisons between releases and test environments. Only small frame sizes
+(64B/78B, IMIX) are used.
+
+### Details
+
+See the [MRR Throughput]({{< ref "mrr/#MRR" >}})
+section for more detail about MRR test configuration.
+
+The FD.io CSIT performance dashboard includes a complete description of
+[daily performance trending tests]({{< ref "../../trending/analysis" >}})
+and [VPP per patch tests]({{< ref "../../per_patch_testing.md" >}}).
+
+## Generic Test Properties
+
+All data plane throughput test methodologies share the following generic
+properties:
+
+- Tested L2 frame sizes (untagged Ethernet):
+
+  - IPv4 payload: 64B, IMIX (28x64B, 16x570B, 4x1518B), 1518B, 9000B.
+  - IPv6 payload: 78B, IMIX (28x78B, 16x570B, 4x1518B), 1518B, 9000B.
+  - All quoted sizes include frame CRC, but exclude per frame
+    transmission overhead of 20B (preamble, inter frame gap).
+
+- Offered packet load is always bi-directional and symmetric.
+- All measured and reported packet and bandwidth rates are aggregate
+  bi-directional rates reported from the external Traffic Generator
+  perspective.
diff --git a/docs/content/methodology/measurements/data_plane_throughput/mlr_search.md b/docs/content/methodology/measurements/data_plane_throughput/mlr_search.md
new file mode 100644
index 0000000000..93bdb51efe
--- /dev/null
+++ b/docs/content/methodology/measurements/data_plane_throughput/mlr_search.md
@@ -0,0 +1,88 @@
+---
+title: "MLR Search"
+weight: 2
+---
+
+# MLR Search
+
+## Overview
+
+Multiple Loss Ratio search (MLRsearch) tests use an optimized search algorithm
+implemented in the FD.io CSIT project. MLRsearch discovers any number of
+loss ratio loads in a single search.
+
+Two loss ratio goals are of interest in FD.io CSIT, leading to Non-Drop Rate
+(NDR, loss ratio goal is an exact zero) and Partial Drop Rate
+(PDR, non-zero loss ratio goal, currently 0.5%).
+
+MLRsearch discovers all the loads in a single pass, reducing the required
+time duration compared to separate `binary search`es[^1] for each rate.
Overall
+search time is reduced even further by relying on shorter trial
+durations for intermediate steps, with only the final measurements
+conducted at the specified final trial duration. This results in a
+shorter overall execution time when compared to a standard NDR/PDR binary
+search, while guaranteeing similar results.
+
+  Note: All throughput rates are *always* bi-directional aggregates of two
+  equal (symmetric) uni-directional packet rates received and reported by an
+  external traffic generator, unless the test specifically requires
+  unidirectional traffic.
+
+## Search Implementation
+
+A detailed description of the MLRsearch algorithm is included in the IETF
+draft
+[draft-ietf-bmwg-mlrsearch-02](https://datatracker.ietf.org/doc/html/draft-ietf-bmwg-mlrsearch-02)
+that is in the process of being standardized in the IETF Benchmarking
+Methodology Working Group (BMWG).
+(A newer version has been published in the IETF, describing improvements not
+yet used in CSIT production.)
+
+MLRsearch is also available as a
+[PyPI (Python Package Index) library](https://pypi.org/project/MLRsearch/).
+
+## Algorithm highlights
+
+MRR and the receive rate at the MRR load are used as initial guesses for the
+search.
+
+All previously measured trials (except the very first one, which can act
+as a warm-up) are taken into consideration, unless superseded
+by a trial at the same load but higher duration.
+
+For every loss ratio goal, the tightest upper and lower bounds
+(from results of large enough trial duration) form an interval.
+The exit condition is given by that interval reaching a low enough relative
+width. A small enough width is achieved by bisecting the current interval.
+The bisection can be uneven, to save measurements based on information theory.
+
+Switching to a higher trial duration generally requires a re-measurement
+at a load from the previous trial duration.
+When the re-measurement does not confirm the previous bound classification
+(e.g. the tightest lower bound at a shorter trial duration becomes
+the new tightest upper bound upon re-measurement),
+external search is used to find a close enough bound of the lost type.
+External search is a generalization of the first stage of
+`exponential search`[^2].
+
+Shorter trial durations use a double width goal,
+because one bisection is always safe before risking external search.
+
+Within an iteration for a specific trial duration, smaller loss ratios (NDR)
+are narrowed down first before the search continues with higher loss ratios
+(PDR).
+
+Other heuristics are in place, aimed at preventing unnecessarily narrow
+intervals, and at handling corner cases around min and max load.
+
+## Deviations from RFC 2544
+
+CSIT does not have any explicit wait times before and after trial traffic.
+
+Small differences between intended and offered load are tolerated,
+mainly due to various time overheads preventing precise measurement
+of the traffic duration (and TRex can sometimes suffer from duration
+stretching).
+
+The final trial duration is only 30s (10s for reconf tests).
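+
+To make the interval-narrowing idea concrete, the following toy sketch
+(illustrative only, not the production MLRsearch code; `measure` and the
+even bisection are simplifying assumptions) narrows the bounds for a single
+loss ratio goal on a deterministic SUT:
+
+    # Toy Python sketch: narrow [lower, upper] around the critical load
+    # for one loss ratio goal; measure(load) returns the trial loss ratio.
+    def toy_search(measure, min_load, max_load, goal, rel_width=0.005):
+        lower, upper = min_load, max_load
+        while (upper - lower) / upper > rel_width:
+            mid = (lower + upper) / 2.0
+            if measure(mid) > goal:
+                upper = mid  # trial failed the goal: new upper bound
+            else:
+                lower = mid  # trial met the goal: new lower bound
+        return lower, upper  # the conservative result is the lower bound
+
+    # Example: a SUT that starts losing packets above 7.3 Mpps.
+    sut = lambda load: max(0.0, (load - 7.3e6) / load)
+    print(toy_search(sut, 10e3, 14.88e6, goal=0.005))
+
+The real implementation additionally varies trial durations, splits
+intervals unevenly, and falls back to external search as described above.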
+
+[^1]: [binary search](https://en.wikipedia.org/wiki/Binary_search)
+[^2]: [exponential search](https://en.wikipedia.org/wiki/Exponential_search)
diff --git a/docs/content/methodology/measurements/data_plane_throughput/mrr.md b/docs/content/methodology/measurements/data_plane_throughput/mrr.md
new file mode 100644
index 0000000000..e8c3e62eb6
--- /dev/null
+++ b/docs/content/methodology/measurements/data_plane_throughput/mrr.md
@@ -0,0 +1,56 @@
+---
+title: "MRR"
+weight: 4
+---
+
+# MRR
+
+Maximum Receive Rate (MRR) tests are complementary to MLRsearch tests,
+as they provide a maximum "raw" throughput benchmark for the development and
+testing community. MRR tests measure the packet forwarding rate under
+the maximum load offered by the traffic generator over a set trial duration,
+regardless of packet loss.
+
+MRR tests are currently used for the following test jobs:
+
+- Report performance comparison: 64B, IMIX for vhost, memif.
+- Daily performance trending: 64B, IMIX for vhost, memif.
+- Per-patch performance verification: 64B.
+- Initial iterations of MLRsearch and PLRsearch: 64B.
+
+The maximum offered load for a specific L2 Ethernet frame size is set to
+either the maximum bi-directional link rate or the tested NIC model
+capacity, as follows:
+
+- For 10GE NICs the maximum packet rate load is 2x14.88 Mpps for 64B, a
+  10GE bi-directional link rate.
+- For 25GE NICs the maximum packet rate load is 2x18.75 Mpps for 64B, a
+  25GE bi-directional link sub-rate limited by the 25GE NIC used on the TRex
+  TG, XXV710.
+- For 40GE NICs the maximum packet rate load is 2x18.75 Mpps for 64B, a
+  40GE bi-directional link sub-rate limited by the 40GE NIC used on the TRex
+  TG, XL710. Packet rate for other tested frame sizes is limited by the
+  PCIeGen3 x8 bandwidth limitation of ~50Gbps.
+
+The MRR test code implements multiple bursts of offered packet load and has
+two configurable burst parameters: individual trial duration and the number
+of trials in a single burst. This enables more precise performance
+trending by providing more results data for analysis.
+
+Burst parameter settings vary between different tests using MRR:
+
+- MRR individual trial duration:
+
+  - Report performance comparison: 1 sec.
+  - Daily performance trending: 1 sec.
+  - Per-patch performance verification: 10 sec.
+  - Initial iteration for MLRsearch: 1 sec.
+  - Initial iteration for PLRsearch: 5.2 sec.
+
+- Number of MRR trials per burst:
+
+  - Report performance comparison: 10.
+  - Daily performance trending: 10.
+  - Per-patch performance verification: 5.
+  - Initial iteration for MLRsearch: 1.
+  - Initial iteration for PLRsearch: 1.
diff --git a/docs/content/methodology/measurements/data_plane_throughput/plr_search.md b/docs/content/methodology/measurements/data_plane_throughput/plr_search.md
new file mode 100644
index 0000000000..529bac1f7f
--- /dev/null
+++ b/docs/content/methodology/measurements/data_plane_throughput/plr_search.md
@@ -0,0 +1,383 @@
+---
+title: "PLR Search"
+weight: 3
+---
+
+# PLR Search
+
+## Motivation for PLRsearch
+
+Network providers are interested in the throughput a system can sustain.
+
+`RFC 2544`[^1] assumes the loss ratio is given by a deterministic function of
+offered load. But NFV software systems are not deterministic enough.
+This causes deterministic algorithms (such as `binary search`[^2] per RFC 2544
+and MLRsearch with a single trial) to return results which, when repeated,
+show a relatively high standard deviation,
+thus making it harder to tell what "the throughput" actually is.
+
+We need another algorithm, which takes this indeterminism into account.
+
+## Generic Algorithm
+
+A detailed description of the PLRsearch algorithm is included in the IETF
+draft `Probabilistic Loss Ratio Search for Packet Throughput`[^3] that is in
+the process of being standardized in the IETF Benchmarking Methodology Working
+Group (BMWG).
+
+### Terms
+
+The rest of this page assumes the reader is familiar with the following terms
+defined in the IETF draft:
+
++ Trial Order Independent System
++ Duration Independent System
++ Target Loss Ratio
++ Critical Load
++ Offered Load regions
+
+  + Zero Loss Region
+  + Non-Deterministic Region
+  + Guaranteed Loss Region
+
++ Fitting Function
+
+  + Stretch Function
+  + Erf Function
+
++ Bayesian Inference
+
+  + Prior distribution
+  + Posterior Distribution
+
++ Numeric Integration
+
+  + Monte Carlo
+  + Importance Sampling
+
+## FD.io CSIT Implementation Specifics
+
+The search receives min_rate and max_rate values, to avoid measurements
+at offered loads not supported by the traffic generator.
+
+The implemented test cases use bidirectional traffic.
+The algorithm stores each rate as a bidirectional rate (internally,
+the algorithm is agnostic to flows and directions,
+it only cares about aggregate counts of packets sent and packets lost),
+but debug output from the traffic generator lists unidirectional values.
+
+### Measurement Delay
+
+In the sample implementation in the FD.io CSIT project, there is roughly a
+0.5 second delay between trials due to restrictions imposed by the packet
+traffic generator in use (T-Rex).
+
+As measurement results come in, the posterior distribution computation takes
+more time (per sample), although there is a considerable constant part
+(mostly for inverting the fitting functions).
+
+Also, the integrator needs a fair amount of samples to reach the region
+the posterior distribution is concentrated at.
+
+And of course, the speed of the integrator depends on the computing power
+of the CPU the algorithm is able to use.
+
+All those timing related effects are addressed by arithmetically increasing
+trial durations with configurable coefficients
+(currently 5.1 seconds for the first trial,
+each subsequent trial being 0.1 second longer).
+
+### Rounding Errors and Underflows
+
+To avoid them, the current implementation tracks the natural logarithm
+(instead of the original quantity) for any quantity which is never negative.
+The logarithm of zero is minus infinity (not supported by Python),
+so the special value "None" is used instead.
+Specific functions for frequent operations (such as "logarithm
+of sum of exponentials") are defined to handle None correctly.
+
+### Fitting Functions
+
+The current implementation uses two fitting functions, called "stretch" and
+"erf". In general, their estimates for the critical rate differ,
+which adds a simple source of systematic error,
+on top of the randomness error reported by the integrator.
+Otherwise the reported stdev of the critical rate estimate
+would be unrealistically low.
+
+Both functions are not only increasing, but also convex
+(meaning the rate of increase is also increasing).
+
+Both fitting functions have several mathematically equivalent formulas,
+each of which can lead to an arithmetic overflow or underflow in different
+sub-terms. Overflows can be eliminated by using different exact formulas
+for different argument ranges.
+Underflows can be avoided by using approximate formulas
+in affected argument ranges; such ranges have their own formulas to compute.
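+
+As a minimal illustration of this technique (not the actual fitting-function
+code; `log1p_exp` and `log_plus` are example helpers in the spirit of the
+"logarithm of sum of exponentials" functions mentioned above):
+
+    import math
+
+    def log1p_exp(x):
+        """Safe log(1 + exp(x)); the naive formula overflows for large x."""
+        if x > 33:
+            return x  # log(1 + e^x) ~= x, exact to double precision here
+        return math.log1p(math.exp(x))
+
+    def log_plus(first, second):
+        """log(exp(first) + exp(second)), with None encoding log(0)."""
+        if first is None:
+            return second
+        if second is None:
+            return first
+        if second > first:
+            first, second = second, first
+        # Factor out the larger exponent so exp() never overflows.
+        return first + log1p_exp(second - first)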
+At the end, both fitting function implementations
+contain multiple "if" branches, so discontinuities are a possibility
+at range boundaries.
+
+### Prior Distributions
+
+The numeric integrator expects all the parameters to be distributed
+(independently and) uniformly on an interval (-1, 1).
+
+As both the "mrr" and "spread" parameters are positive and not dimensionless,
+a transformation is needed. Dimensionality is inherited from the max_rate
+value.
+
+The "mrr" parameter follows a `Lomax distribution`[^4]
+with alpha equal to one, but shifted so that mrr is always greater than 1
+packet per second.
+
+The "stretch" parameter is generated simply as the "mrr" value
+raised to a random power between zero and one;
+thus it follows a `reciprocal distribution`[^5].
+
+### Integrator
+
+After a few measurements, the posterior distribution of the fitting function
+arguments gets quite concentrated in a small area.
+The integrator is using `Monte Carlo`[^6] with `importance sampling`[^7],
+where the biased distribution is a `bivariate Gaussian`[^8] distribution
+with deliberately larger variance.
+If the generated sample falls outside the (-1, 1) interval,
+another sample is generated.
+
+The center and the covariance matrix for the biased distribution
+are based on the first and second moments of the samples seen so far
+(within the computation). The center is used directly, while the
+covariance matrix is scaled up by a heuristic constant (8.0 by default).
+The following additional features, designed to avoid hyper-focused
+distributions, are applied.
+
+Each computation starts with the biased distribution inherited
+from the previous computation (a zero point and unit covariance matrix
+are used in the first computation), but the overall weight of the data
+is set to the weight of the first sample of the computation.
+Also, the center is set to the first sample point.
+When additional samples come, their weight (including the importance
+correction) is compared to the sum of the weights of the data seen so far
+(within the iteration).
+If the new sample is more than one e-fold more impactful, both weight values
+(for the data so far and for the new sample) are set to the (geometric)
+average of the two weights.
+
+This combination showed the best behavior, as the integrator usually follows
+two phases. The first phase (where the inherited biased distribution
+or a single big sample is dominating) is mainly important
+for locating the new area the posterior distribution is concentrated at.
+The second phase (dominated by the whole sample population)
+is actually relevant for the critical rate estimation.
+
+### Offered Load Selection
+
+The first two measurements are hardcoded to happen at the middle of the rate
+interval and at max_rate. The next two measurements follow MRR-like logic:
+the offered load is decreased so that it would reach the target loss ratio
+if the offered load decrease led to an equal decrease of the loss rate.
+
+The rest of the measurements start directly at the average of the
+erf and stretch estimates.
+There is one workaround implemented, aimed at reducing the number of
+consecutive zero loss measurements (per fitting function). The workaround
+first stores every measurement result whose loss ratio was the target loss
+ratio or higher. A sorted list (called lossy loads) of such results is
+maintained.
+
+When a sequence of one or more zero loss measurement results is encountered,
+the smallest of the lossy loads is drained from the list.
+If the estimate average is smaller than the drained value,
+a weighted average of this estimate and the drained value is used
+as the next offered load. The weight of the estimate decreases exponentially
+with the length of consecutive zero loss results.
+
+This behavior helps the algorithm with convergence speed,
+as it does not need so many zero loss results to get near the critical region.
+Using the smallest (not yet drained) of the lossy loads makes it likely
+that the new offered load will not end up in the big loss region.
+Draining even if the estimate is large enough helps to discard
+early measurements where loss happened at too low an offered load.
+The current implementation adds 4 copies of lossy loads and drains 3 of them,
+which leads to fairly stable behavior even for somewhat inconsistent SUTs.
+
+### Caveats
+
+As high loss count measurements add many bits of information,
+they need a large amount of small loss count measurements to balance them,
+making the algorithm converge quite slowly. Typically, this happens
+when a few initial measurements suggest a spread way bigger than later
+measurements do. The workaround in offered load selection helps,
+but more intelligent workarounds could achieve faster convergence still.
+
+Some systems evidently do not follow the assumption of repeated measurements
+having the same average loss rate (when the offered load is the same).
+The idea of estimating the trend is not implemented at all,
+as the observed trends have varied characteristics.
+
+Probably, using more realistic fitting functions
+will give better estimates than trend analysis.
+
+## Bottom Line
+
+The notion of Throughput is easy to grasp, but it is harder to measure
+with any accuracy for non-deterministic systems.
+
+Even though the notion of critical rate is harder to grasp than the notion
+of throughput, it is easier to measure using probabilistic methods.
+
+In testing, the difference between throughput measurements and critical
+rate measurements is usually small.
+
+In practice, rules of thumb such as "send at max 95% of purported throughput"
+are common. The correct benchmarking analysis should ask "Which notion is
+95% of throughput an approximation to?" before attempting to answer
+"Is 95% of critical rate safe enough?".
+
+## Algorithmic Analysis
+
+### Motivation
+
+While the estimation computation is based on hard probability science,
+the offered load selection part of the PLRsearch logic is pure heuristics,
+motivated by what a human would do based on measurement and computation
+results.
+
+The quality of any heuristic is not affected by the soundness of its
+motivation, just by its ability to achieve the intended goals.
+In the case of offered load selection, the goal is to help the search
+converge to the long duration estimates sooner.
+
+But even those long duration estimates could still be of poor quality.
+Even though the estimate computation is Bayesian (so it is the best it could
+be within the applied assumptions), it can still be of poor quality when
+compared to what a human would estimate.
+
+One possible source of poor quality is the randomness inherently present
+in Monte Carlo numeric integration, but that can be suppressed
+by tweaking the time related input parameters.
+
+The most likely sources of poor quality are then the assumptions.
+Most importantly, the number and the shape of the fitting functions;
+but also others, such as trial order independence and duration independence.
+
+The result can have poor quality in basically two ways.
+One way is related to location.
Both upper and lower bounds
+can be overestimates or underestimates, meaning the entire estimated interval
+between the lower bound and the upper bound lies above or below (respectively)
+the human-estimated interval.
+The other way is related to the estimation interval width.
+The interval can be too wide or too narrow, compared to the human estimation.
+
+An estimate from a particular fitting function can be classified
+as an overestimate (or underestimate) just by looking at its time evolution
+(without a human examining measurement results). Overestimates
+decrease over time, underestimates increase over time (assuming
+the system performance stays constant).
+
+The quality of the width of the estimation interval needs human evaluation,
+and is unrelated to both the rate of narrowing (both good and bad estimate
+intervals get narrower at approximately the same relative rate) and the
+relative width (which depends heavily on the system being tested).
+
+### Graphical Examples
+
+The following pictures show the upper (red) and lower (blue) bound,
+as well as the averages of the Stretch (pink) and Erf (light green) estimates,
+and the offered load chosen (grey), as computed by PLRsearch,
+after each trial measurement within the 30 minute duration of a test run.
+
+Both graphs focus on later estimates. Estimates computed from
+the few initial measurements are wildly off the y-axis range shown.
+
+The following analysis will rely on the frequency of zero loss measurements
+and the magnitude of the loss ratio if nonzero.
+
+The offered load selection strategy used implies that zero loss measurements
+can be gleaned from the graph by looking at the offered load points.
+When the points move up farther from the lower estimate, it means
+the previous measurement had zero loss. After a non-zero loss,
+the offered load starts again right between (the previous values of)
+the estimate curves.
+
+The very big loss ratio results are visible as noticeable jumps
+of both estimates downwards. Medium and small loss ratios are much harder
+to distinguish just by looking at the estimate curves;
+the analysis is based on raw loss ratio measurement results.
+
+The following descriptions should explain why the graphs seem to signal
+a low quality estimate at first sight, but a more detailed look
+reveals the quality is good (considering the measurement results).
+
+#### L2 patch
+
+Both fitting functions give similar estimates, the graph shows
+the "stochasticity" of measurements (estimates increase and decrease
+within small time regions), and an overall trend of decreasing estimates.
+
+At first look, the final interval looks fairly narrow,
+especially compared to the region the estimates have travelled
+during the search. But a look at the frequency of zero loss results shows
+this is not a case of overestimation. Measurements at around the same
+offered load have a higher probability of zero loss earlier
+(when performed farther from the upper bound), but a smaller probability
+later (when performed closer to the upper bound). That means it is the
+performance of the system under test that decreases (slightly) over time.
+
+With that in mind, the apparent narrowness of the interval
+is not a sign of low quality, just a consequence of PLRsearch assuming
+the performance stays constant.
+
+{{< figure src="/cdocs/PLR_patch.svg" >}}
+
+#### Vhost
+
+This test case shows what looks like a quite broad estimation interval,
+compared to other test cases with similarly looking zero loss frequencies.
+Notable features are the infrequent high-loss measurement results
+causing big drops of estimates, and the lack of long-term convergence.
+
+Any convergence in medium-sized intervals (during zero loss results)
+is reverted by the big loss results, as they happen quite far
+from the critical load estimates, and the two fitting functions
+extrapolate differently.
+
+In other words, a human seeing only the estimates from one fitting function
+would expect a narrower end interval, but a human seeing the measured loss
+ratios agrees that the interval should be wider than that.
+
+{{< figure src="/cdocs/PLR_vhost.svg" >}}
+
+#### Summary
+
+The two graphs show the behavior of the PLRsearch algorithm applied to
+soaking tests when some of the PLRsearch assumptions do not hold:
+
++ L2 patch measurement results violate the assumption
+  of performance not changing over time.
++ Vhost measurement results violate the assumption
+  of the Poisson distribution matching the loss counts.
+
+The reported upper and lower bounds can have a distance larger or smaller
+than a human's first look would expect, but a closer look reveals
+the quality is good, considering the circumstances.
+
+The usefulness of the critical load estimate is questionable
+when the assumptions are violated.
+
+Some improvements can be made via more specific workarounds,
+for example the long term limit of the L2 patch performance could be
+estimated by some heuristic.
+
+Other improvements can be achieved only by asking users
+whether loss patterns matter. Is it better to have single digit losses
+distributed fairly evenly over time (as the Poisson distribution would
+suggest), or is it better to have short periods of medium losses
+mixed with long periods of zero losses (as happens in the Vhost test)
+with the same overall loss ratio?
+
+[^1]: [RFC 2544: Benchmarking Methodology for Network Interconnect Devices](https://tools.ietf.org/html/rfc2544)
+[^2]: [Binary search](https://en.wikipedia.org/wiki/Binary_search_algorithm)
+[^3]: [Probabilistic Loss Ratio Search for Packet Throughput](https://tools.ietf.org/html/draft-vpolak-bmwg-plrsearch-02)
+[^4]: [Lomax distribution](https://en.wikipedia.org/wiki/Lomax_distribution)
+[^5]: [Reciprocal distribution](https://en.wikipedia.org/wiki/Reciprocal_distribution)
+[^6]: [Monte Carlo](https://en.wikipedia.org/wiki/Monte_Carlo_integration)
+[^7]: [Importance sampling](https://en.wikipedia.org/wiki/Importance_sampling)
+[^8]: [Bivariate Gaussian](https://en.wikipedia.org/wiki/Multivariate_normal_distribution)
diff --git a/docs/content/methodology/measurements/packet_latency.md b/docs/content/methodology/measurements/packet_latency.md
new file mode 100644
index 0000000000..f3606b5ffb
--- /dev/null
+++ b/docs/content/methodology/measurements/packet_latency.md
@@ -0,0 +1,52 @@
+---
+title: "Packet Latency"
+weight: 2
+---
+
+# Packet Latency
+
+TRex Traffic Generator (TG) is used for measuring one-way latency in
+2-Node and 3-Node physical testbed topologies. TRex integrates
+[High Dynamic Range Histogram (HDRH)](http://hdrhistogram.org/)
+functionality and reports per-packet latency distribution for latency
+streams sent in parallel to the main load packet streams.
+
+The following methodology is used:
+
+- Only the NDRPDR test type measures latency, and only after the NDR and
+  PDR values are determined. Other test types do not involve latency
+  streams.
+
+- Latency is measured at different background load packet rates:
+
+  - No-Load: latency streams only.
+  - Low-Load: at 10% PDR.
+  - Mid-Load: at 50% PDR.
+  - High-Load: at 90% PDR.
+
+- Latency is measured for all tested packet sizes except IMIX, due to
+  a TRex TG restriction.
+
+- TG sends dedicated latency streams, one per direction, each at the
+  rate of 9 kpps at the prescribed packet size; these are sent in
+  addition to the main load streams.
+
+- TG reports Min/Avg/Max and HDRH latency values distribution per stream
+  direction, hence two sets of latency values are reported per test case
+  (marked as E-W and W-E).
+
+- +/- 1 usec is the measurement accuracy of TRex TG, and the data in the
+  HDRH latency values distribution is rounded to microseconds.
+
+- TRex TG introduces a (background) always-on Tx + Rx latency bias of 4
+  usec on average per direction, resulting from TRex software writing and
+  reading packet timestamps on CPU cores. Quoted values are based on TG
+  back-to-back latency measurements.
+
+- Latency graphs are not smoothed, each latency value has its own
+  horizontal line across corresponding packet percentiles.
+
+- Percentiles are shown on the X-axis using a logarithmic scale, so the
+  maximal latency value (ending at the 100% percentile) would be at
+  infinity. The graphs are cut at 99.9999% (hover information still
+  lists 100%).
diff --git a/docs/content/methodology/measurements/telemetry.md b/docs/content/methodology/measurements/telemetry.md
new file mode 100644
index 0000000000..aed32d9e17
--- /dev/null
+++ b/docs/content/methodology/measurements/telemetry.md
@@ -0,0 +1,158 @@
+---
+title: "Telemetry"
+weight: 3
+---
+
+# Telemetry
+
+OpenMetrics specifies the de-facto standard for transmitting cloud-native
+metrics at scale, with support for both text representation and Protocol
+Buffers.
+
+## RFC
+
+- RFC2119
+- RFC5234
+- RFC8174
+- draft-richih-opsawg-openmetrics-00
+
+## Reference
+
+[OpenMetrics](https://github.com/OpenObservability/OpenMetrics/blob/master/specification/OpenMetrics.md)
+
+## Metric Types
+
+- Gauge
+- Counter
+- StateSet
+- Info
+- Histogram
+- GaugeHistogram
+- Summary
+- Unknown
+
+The telemetry module in CSIT currently supports only Gauge, Counter and Info.
+
+## Anatomy of CSIT telemetry implementation
+
+The existing implementation consists of several measurement building blocks:
+the main measuring block running search algorithms (MLR, PLR, SOAK, MRR, ...),
+the latency measuring block, and several telemetry blocks with or without
+traffic running in the background.
+
+The main measuring block must not be interrupted by any read operation that
+can impact data plane traffic processing during the throughput search. Thus
+operational reads are done before (pre-stat) and after (post-stat) that block.
+
+Some operational reads must be done while traffic is running and usually
+consist of two reads (pre-run-stat, post-run-stat) with a defined delay
+between them.
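+
+For illustration, a minimal Python sketch of how a Gauge and a Counter could
+be rendered in the OpenMetrics text exposition format (the metric and label
+names below are made-up examples, not the exact metrics exported by CSIT):
+
+    def sample(name, labels, value):
+        """Render one OpenMetrics-style sample line."""
+        pairs = ",".join(f'{key}="{val}"' for key, val in labels.items())
+        return f"{name}{{{pairs}}} {value}"
+
+    print("# TYPE node_clocks gauge")
+    print(sample("node_clocks", {"node": "ip4-lookup", "thread": "1"}, 1.2e4))
+    print("# TYPE rx_packets_total counter")
+    print(sample("rx_packets_total", {"interface": "port0"}, 987654321))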
+ +## MRR measurement + + traffic_start(r=mrr) traffic_stop |< measure >| + | | | (r=mrr) | + | pre_run_stat post_run_stat | pre_stat | | post_stat + | | | | | | | | + o--------o---------------o-------o------o------+---------------+------o------> + t + Legend: + - pre_run_stat + - vpp-clear-runtime + - post_run_stat + - vpp-show-runtime + - bash-perf-stat // if extended_debug == True + - pre_stat + - vpp-clear-stats + - vpp-enable-packettrace // if extended_debug == True + - vpp-enable-elog + - post_stat + - vpp-show-stats + - vpp-show-packettrace // if extended_debug == True + - vpp-show-elog + + |< measure >| + | (r=mrr) | + | | + |< traffic_trial0 >|< traffic_trial1 >|< traffic_trialN >| + | (i=0,t=duration) | (i=1,t=duration) | (i=N,t=duration) | + | | | | + o-----------------------o------------------------o------------------------o---> + t + + +## MLR measurement + + |< measure >| traffic_start(r=pdr) traffic_stop traffic_start(r=ndr) traffic_stop |< [ latency ] >| + | (r=mlr) | | | | | | .9/.5/.1/.0 | + | | | pre_run_stat post_run_stat | | pre_run_stat post_run_stat | | | + | | | | | | | | | | | | + +-------------+---o-------o---------------o--------o-------------o-------o---------------o--------o------------[-------------------]---> + t + Legend: + - pre_run_stat + - vpp-clear-runtime + - post_run_stat + - vpp-show-runtime + - bash-perf-stat // if extended_debug == True + - pre_stat + - vpp-clear-stats + - vpp-enable-packettrace // if extended_debug == True + - vpp-enable-elog + - post_stat + - vpp-show-stats + - vpp-show-packettrace // if extended_debug == True + - vpp-show-elog + +## MRR measurement + + traffic_start(r=mrr) traffic_stop |< measure >| + | | | (r=mrr) | + | |< stat_runtime >| | stat_pre_trial | | stat_post_trial + | | | | | | | | + o---+------------------+---o------o------------+-------------+----o------------> + t + Legend: + - stat_runtime + - vpp-runtime + - stat_pre_trial + - vpp-clear-stats + - vpp-enable-packettrace // if extended_debug == True + - stat_post_trial + - vpp-show-stats + - vpp-show-packettrace // if extended_debug == True + + |< measure >| + | (r=mrr) | + | | + |< traffic_trial0 >|< traffic_trial1 >|< traffic_trialN >| + | (i=0,t=duration) | (i=1,t=duration) | (i=N,t=duration) | + | | | | + o------------------------o------------------------o------------------------o---> + t + + |< stat_runtime >| + | | + |< program0 >|< program1 >|< programN >| + | (@=params) | (@=params) | (@=params) | + | | | | + o------------------------o------------------------o------------------------o---> + t + +## MLR measurement + + |< measure >| traffic_start(r=pdr) traffic_stop traffic_start(r=ndr) traffic_stop |< [ latency ] >| + | (r=mlr) | | | | | | .9/.5/.1/.0 | + | | | |< stat_runtime >| | | |< stat_runtime >| | | | + | | | | | | | | | | | | + +-------------+---o---+------------------+---o--------------o---+------------------+---o-----------[-----------------]---> + t + Legend: + - stat_runtime + - vpp-runtime + - stat_pre_trial + - vpp-clear-stats + - vpp-enable-packettrace // if extended_debug == True + - stat_post_trial + - vpp-show-stats + - vpp-show-packettrace // if extended_debug == True diff --git a/docs/content/methodology/multi_core_speedup.md b/docs/content/methodology/multi_core_speedup.md deleted file mode 100644 index c0c9ae2570..0000000000 --- a/docs/content/methodology/multi_core_speedup.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Multi-Core Speedup" -weight: 13 ---- - -# Multi-Core Speedup - -All performance tests are executed with single 
physical core and with
-multiple core scenarios.
-
-## Intel Hyper-Threading (HT)
-
-Intel Xeon processors used in FD.io CSIT can operate either in HT
-Disabled mode (single logical core per each physical core) or in HT
-Enabled mode (two logical cores per each physical core). The HT setting is
-applied in BIOS and requires a server SUT reload for it to take effect,
-making it impractical for continuous changes of the HT mode of operation.
-
-Performance tests are executed with server SUTs' Intel Xeon processors
-configured with Intel Hyper-Threading Enabled for all Xeon
-Cascadelake and Xeon Icelake testbeds.
-
-## Multi-core Tests
-
-Multi-core tests are executed in the following VPP worker thread and physical
-core configurations:
-
-1. Intel Xeon Icelake and Cascadelake testbeds (2n-icx, 3n-icx, 2n-clx)
-   with Intel HT enabled (2 logical CPU cores per each physical core):
-
-   1. 2t1c - 2 VPP worker threads on 1 physical core.
-   2. 4t2c - 4 VPP worker threads on 2 physical cores.
-   3. 8t4c - 8 VPP worker threads on 4 physical cores.
-
-VPP worker threads are the data plane threads running on isolated
-logical cores. With Intel HT enabled, VPP workers are placed as sibling
-threads on each used physical core. VPP control threads (main, stats)
-are running on a separate non-isolated core together with other Linux
-processes.
-
-In all CSIT tests care is taken to ensure that each VPP worker handles
-the same amount of received packet load and does the same amount of
-packet processing work. This is achieved by evenly distributing per
-interface type (e.g. physical, virtual) receive queues over VPP workers
-using the default VPP round-robin mapping and by loading these queues with
-the same amount of packet flows.
-
-If the number of VPP workers is higher than the number of physical or virtual
-interfaces, multiple receive queues are configured on each interface.
-NIC Receive Side Scaling (RSS) for physical interfaces and multi-queue
-for virtual interfaces are used for this purpose.
\ No newline at end of file
diff --git a/docs/content/methodology/network_address_translation.md b/docs/content/methodology/network_address_translation.md
deleted file mode 100644
index ef341dc892..0000000000
--- a/docs/content/methodology/network_address_translation.md
+++ /dev/null
@@ -1,445 +0,0 @@
----
-title: "Network Address Translation"
-weight: 7
----
-
-# Network Address Translation
-
-## NAT44 Prefix Bindings
-
-NAT44 prefix bindings should be representative of target applications,
-where a number of private IPv4 addresses from the range defined by
-RFC1918 is mapped to a smaller set of public IPv4 addresses from the
-public range.
-
-The following quantities are used to describe inside to outside IP address
-and port binding scenarios:
-
-- Inside-addresses, number of inside source addresses
-  (representing inside hosts).
-- Ports-per-inside-address, number of TCP/UDP source
-  ports per inside source address.
-- Outside-addresses, number of outside (public) source addresses
-  allocated to NAT44.
-- Ports-per-outside-address, number of TCP/UDP source
-  ports per outside source address. The maximal number of
-  ports-per-outside-address usable for NAT is 64 512
-  (in the non-reserved port range 1024-65535, RFC4787).
-- Sharing-ratio, equal to inside-addresses divided by outside-addresses.
-
-CSIT NAT44 tests are designed to take into account the maximum number of
-ports (sessions) required per inside host (inside-address) and at the
-same time to maximize the use of the outside-address range by using all
-available outside ports.
With this in mind, the following scheme of
-NAT44 sharing ratios has been devised for use in CSIT:
-
- **ports-per-inside-address** | **sharing-ratio**
------------------------------:|------------------:
- 63 | 1024
- 126 | 512
- 252 | 256
- 504 | 128
-
-Initial CSIT NAT44 tests, including associated TG/TRex traffic profiles,
-are based on ports-per-inside-address set to 63 and the sharing ratio of
-1024. This approach is currently used for all NAT44 tests including
-NAT44det (NAT44 deterministic used for Carrier Grade NAT applications)
-and NAT44ed (Endpoint Dependent).
-
-Private address ranges to be used in tests:
-
-- 192.168.0.0 - 192.168.255.255 (192.168/16 prefix)
-
-  - Total of 2^16 (65 536) usable IPv4 addresses.
-  - Used in tests for up to 65 536 inside addresses (inside hosts).
-
-- 172.16.0.0 - 172.31.255.255 (172.16/12 prefix)
-
-  - Total of 2^20 (1 048 576) usable IPv4 addresses.
-  - Used in tests for up to 1 048 576 inside addresses (inside hosts).
-
-### NAT44 Session Scale
-
-The NAT44 session scale tested is governed by the following logic:
-
-- Number of inside-addresses (hosts) H[i] = (H[i-1] x 2^2) with H(0)=1 024,
-  i = 1,2,3, ...
-
-  - H[i] = 1 024, 4 096, 16 384, 65 536, 262 144, ...
-
-- Number of sessions S[i] = H[i] * ports-per-inside-address
-
-  - ports-per-inside-address = 63
-
- **i** | **hosts** | **sessions**
-------:|----------:|-------------:
- 0 | 1 024 | 64 512
- 1 | 4 096 | 258 048
- 2 | 16 384 | 1 032 192
- 3 | 65 536 | 4 128 768
- 4 | 262 144 | 16 515 072
-
-### NAT44 Deterministic
-
-NAT44det performance tests are using TRex STL (Stateless) API and traffic
-profiles, similar to all other stateless packet forwarding tests like
-ip4, ip6 and l2, sending UDP packets in both directions
-inside-to-outside and outside-to-inside.
-
-The inside-to-outside traffic uses a single destination address (20.0.0.0)
-and port (1024).
-The inside-to-outside traffic covers the whole inside address and port range,
-the outside-to-inside traffic covers the whole outside address and port
-range.
-
-NAT44det translation entries are created during the ramp-up phase,
-followed by verification that all entries are present,
-before proceeding to the main measurements of the test.
-This ensures session setup does not impact the forwarding performance test.
-
-Associated CSIT test cases use the following naming scheme to indicate
-the NAT44det scenario tested:
-
-- ethip4udp-nat44det-h{H}-p{P}-s{S}-[mrr|ndrpdr|soak]
-
-  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
-  - {P}, number of ports per inside host, P = 63.
-  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
-    16515072.
-  - [mrr|ndrpdr|soak], MRR, NDRPDR or SOAK test.
-
-### NAT44 Endpoint-Dependent
-
-In order to exercise the NAT44ed ability to translate based on both
-source and destination address and port, the inside-to-outside traffic
-varies also the destination address and port. The destination port is the
-same as the source port, the destination address has the same offset as the
-source address, but applied to a different subnet (starting with 20.0.0.0).
-
-As the mapping is not deterministic (for security reasons),
-we cannot easily use stateless bidirectional traffic profiles.
-The inside address and port range is fully covered,
-but we do not know which outside-to-inside source address and port to use
-to hit an open session.
-
-Therefore, NAT44ed is benchmarked using the following methodologies:
-
-- Unidirectional throughput using *stateless* traffic profile.
-- Connections-per-second (CPS) using *stateful* traffic profile.
-- Bidirectional throughput (TPUT, see below) using *stateful* traffic profile.
-
-Unidirectional NAT44ed throughput tests are using TRex STL (Stateless)
-APIs and traffic profiles, but with packets sent only in the
-inside-to-outside direction.
-Similarly to NAT44det, NAT44ed unidirectional throughput tests include
-a ramp-up phase to establish and verify the presence of the required NAT44ed
-binding entries. As the sessions have a finite duration, the test code
-keeps inserting ramp-up trials during the search, if it detects a risk
-of sessions timing out. Any zero loss trial visits all sessions,
-so it acts also as a ramp-up.
-
-Stateful NAT44ed tests are using TRex ASTF (Advanced Stateful) APIs and
-traffic profiles, with packets sent in both directions. Tests are run
-with both UDP and TCP sessions.
-As NAT44ed CPS (connections-per-second) stateful tests
-measure (also) session opening performance,
-they use state reset instead of a ramp-up trial.
-NAT44ed TPUT (bidirectional throughput) tests prepend ramp-up trials
-as in the unidirectional tests,
-so the test results describe performance without the translation entry
-creation overhead.
-
-Associated CSIT test cases use the following naming scheme to indicate
-the NAT44ed case tested:
-
-- Stateless: ethip4udp-nat44ed-h{H}-p{P}-s{S}-udir-[mrr|ndrpdr|soak]
-
-  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
-  - {P}, number of ports per inside host, P = 63.
-  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
-    16515072.
-  - udir-[mrr|ndrpdr|soak], unidirectional stateless tests MRR, NDRPDR
-    or SOAK.
-
-- Stateful: ethip4[udp|tcp]-nat44ed-h{H}-p{P}-s{S}-[cps|tput]-[mrr|ndrpdr|soak]
-
-  - [udp|tcp], UDP or TCP sessions
-  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
-  - {P}, number of ports per inside host, P = 63.
-  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
-    16515072.
-  - [cps|tput], connections-per-second session establishment rate or
-    packets-per-second average rate, or packets-per-second rate
-    without session establishment.
-  - [mrr|ndrpdr|soak], bidirectional stateful tests MRR, NDRPDR, or SOAK.
-
-## Stateful traffic profiles
-
-There are several important details which distinguish ASTF profiles
-from stateless profiles.
-
-### General considerations
-
-#### Protocols
-
-ASTF profiles are limited to either the UDP or the TCP protocol.
-
-#### Programs
-
-Each template in the profile defines two "programs", one for the client side
-and one for the server side.
-
-Each program specifies when that side has to wait until enough data is
-received (counted in packets for UDP and in bytes for TCP)
-and when to send additional data. Together, the two programs
-define a single transaction. Due to packet loss, a transaction may take
-longer, use more packets (retransmission) or never finish in its entirety.
-
-#### Instances
-
-A client instance is created according to the TPS parameter for the trial,
-and sends the first packet of the transaction (in some cases more packets).
-Each client instance uses a different source address (see sequencing below)
-and some source port. The destination address also comes from a range,
-but the destination port has to be constant for a given program.
-
-TRex uses an opaque way to choose source ports, but as session counting
-shows, the next client with the same source address uses a different source
-port.
-
-A server instance is created when the first packet arrives at the server
-side.
-Source address and port of the first packet are used as destination address -and port for the server responses. This is the ability we need -when outside surface is not predictable. - -When a program reaches its end, the instance is deleted. -This creates possible issues with server instances. If the server instance -does not read all the data client has sent, late data packets -can cause a second copy of server instance to be created, -which breaks assumptions on how many packet a transaction should have. - -The need for server instances to read all the data reduces the overall -bandwidth TRex is able to create in ASTF mode. - -Note that client instances are not created on packets, -so it is safe to end client program without reading all server data -(unless the definition of transaction success requires that). - -#### Sequencing - -ASTF profiles offer two modes for choosing source and destination IP addresses -for client programs: seqential and pseudorandom. -In current tests we are using sequential addressing only (if destination -address varies at all). - -For client destination UDP/TCP port, we use a single constant value. -(TRex can support multiple program pairs in the same traffic profile, -distinguished by the port number.) - -#### Transaction overlap - -If a transaction takes longer to finish, compared to period implied by TPS, -TRex will have multiple client or server instances active at a time. - -During calibration testing we have found this increases CPU utilization, -and for high TPS it can lead to TRex's Rx or Tx buffers becoming full. -This generally leads to duration stretching, and/or packet loss on TRex. - -Currently used transactions were chosen to be short, so risk of bad behavior -is decreased. But in MRR tests, where load is computed based on NIC ability, -not TRex ability, anomalous behavior is still possible -(e.g. MRR values being way lower than NDR). - -#### Delays - -TRex supports adding constant delays to ASTF programs. -This can be useful, for example if we want to separate connection establishment -from data transfer. - -But as TRex tracks delayed instances as active, this still results -in higher CPU utilization and reduced performance issues -(as other overlaping transactions). So the current tests do not use any delays. - -#### Keepalives - -Both UDP and TCP protocol implementations in TRex programs support keepalive -duration. That means there is a configurable period of keepalive time, -and TRex sends keepalive packets automatically (outside the program) -for the time the program is active (started, not ended yet) -but not sending any packets. - -For TCP this is generally not a big deal, as the other side usually -retransmits faster. But for UDP it means a packet loss may leave -the receiving program running. - -In order to avoid keepalive packets, keepalive value is set to a high number. -Here, "high number" means that even at maximum scale and minimum TPS, -there are still no keepalive packets sent within the corresponding -(computed) trial duration. This number is kept the same also for -smaller scale traffic profiles, to simplify maintenance. - -#### Transaction success - -The transaction is considered successful at Layer-7 (L7) level -when both program instances close. At this point, various L7 counters -(unofficial name) are updated on TRex. - -We found that proper close and L7 counter update can be CPU intensive, -whereas lower-level counters (ipackets, opackets) called L2 counters -can keep up with higher loads. 
- -For some tests, we do not need to confirm the whole transaction was successful. -CPS (connections per second) tests are a typical example. -We care only for NAT44ed creating a session (needs one packet -in inside-to-outside direction per session) and being able to use it -(needs one packet in outside-to-inside direction). - -Similarly in TPUT tests (packet throuput, counting both control -and data packets), we care about NAT44ed ability to forward packets, -we do not care whether aplications (TRex) can fully process them at that rate. - -Therefore each type of tests has its own formula (usually just one counter -already provided by TRex) to count "successful enough" transactions -and attempted transactions. Currently, all tests relying on L7 counters -use size-limited profiles, so they know what the count of attempted -transactions should be, but due to duration stretching -TRex might have been unable to send that many packets. -For search purposes, unattempted transactions are treated the same -as attempted but failed transactions. - -Sometimes even the number of transactions as tracked by search algorithm -does not match the transactions as defined by ASTF programs. -See TCP TPUT profile below. - -### UDP CPS - -This profile uses a minimalistic transaction to verify NAT44ed session has been -created and it allows outside-to-inside traffic. - -Client instance sends one packet and ends. -Server instance sends one packet upon creation and ends. - -In principle, packet size is configurable, -but currently used tests apply only one value (100 bytes frame). - -Transaction counts as attempted when opackets counter increases on client side. -Transaction counts as successful when ipackets counter increases on client side. - -### TCP CPS - -This profile uses a minimalistic transaction to verify NAT44ed session has been -created and it allows outside-to-inside traffic. - -Client initiates TCP connection. Client waits until connection is confirmed -(by reading zero data bytes). Client ends. -Server accepts the connection. Server waits for indirect confirmation -from client (by waiting for client to initiate close). Server ends. - -Without packet loss, the whole transaction takes 7 packets to finish -(4 and 3 per direction). -From NAT44ed point of view, only the first two are needed to verify -the session got created. - -Packet size is not configurable, but currently used tests report -frame size as 64 bytes. - -Transaction counts as attempted when tcps_connattempt counter increases -on client side. -Transaction counts as successful when tcps_connects counter increases -on client side. - -### UDP TPUT - -This profile uses a small transaction of "request-response" type, -with several packets simulating data payload. - -Client sends 5 packets and closes immediately. -Server reads all 5 packets (needed to avoid late packets creating new -server instances), then sends 5 packets and closes. -The value 5 was chosen to mirror what TCP TPUT (see below) choses. - -Packet size is configurable, currently we have tests for 100, -1518 and 9000 bytes frame (to match size of TCP TPUT data frames, see below). - -As this is a packet oriented test, we do not track the whole -10 packet transaction. Similarly to stateless tests, we treat each packet -as a "transaction" for search algorthm packet loss ratio purposes. -Therefore a "transaction" is attempted when opacket counter on client -or server side is increased. Transaction is successful if ipacket counter -on client or server side is increased. 
- -If one of 5 client packets is lost, server instance will get stuck -in the reading phase. This probably decreases TRex performance, -but it leads to more stable results then alternatives. - -### TCP TPUT - -This profile uses a small transaction of "request-response" type, -with some data amount to be transferred both ways. - -In CSIT release 22.06, TRex behavior changed, so we needed to edit -the traffic profile. Let us describe the pre-22.06 profile first. - -Client connects, sends 5 data packets worth of data, -receives 5 data packets worth of data and closes its side of the connection. -Server accepts connection, reads 5 data packets worth of data, -sends 5 data packets worth of data and closes its side of the connection. -As usual in TCP, sending side waits for ACK from the receiving side -before proceeding with next step of its program. - -Server read is needed to avoid premature close and second server instance. -Client read is not stricly needed, but ACKs allow TRex to close -the server instance quickly, thus saving CPU and improving performance. - -The number 5 of data packets was chosen so TRex is able to send them -in a single burst, even with 9000 byte frame size (TRex has a hard limit -on initial window size). -That leads to 16 packets (9 of them in c2s direction) to be exchanged -if no loss occurs. -The size of data packets is controlled by the traffic profile setting -the appropriate maximum segment size. Due to TRex restrictions, -the minimal size for IPv4 data frame achievable by this method is 70 bytes, -which is more than our usual minimum of 64 bytes. -For that reason, the data frame sizes available for testing are 100 bytes -(that allows room for eventually adding IPv6 ASTF tests), -1518 bytes and 9000 bytes. There is no control over control packet sizes. - -Exactly as in UDP TPUT, ipackets and opackets counters are used for counting -"transactions" (in fact packets). - -If packet loss occurs, there can be large transaction overlap, even if most -ASTF programs finish eventually. This can lead to big duration stretching -and somehow uneven rate of packets sent. This makes it hard to interpret -MRR results (frequently MRR is below NDR for this reason), -but NDR and PDR results tend to be stable enough. - -In 22.06, the "ACK from the receiving side" behavior changed, -the receiving side started sending ACK sometimes -also before receiving the full set of 5 data packets. -If the previous profile is understood as a "single challenge, single response" -where challenge (and also response) is sent as a burst of 5 data packets, -the new profile uses "bursts" of 1 packet instead, but issues -the challenge-response part 5 times sequentially -(waiting for receiving the response before sending next challenge). -This new profile happens to have the same overall packet count -(when no re-transmissions are needed). -Although it is possibly more taxing for TRex CPU, -the results are comparable to the old traffic profile. - -## Ip4base tests - -Contrary to stateless traffic profiles, we do not have a simple limit -that would guarantee TRex is able to send traffic at specified load. -For that reason, we have added tests where "nat44ed" is replaced by "ip4base". -Instead of NAT44ed processing, the tests set minimalistic IPv4 routes, -so that packets are forwarded in both inside-to-outside and outside-to-inside -directions. 
-
-The packets arrive to server end of TRex with different source address&port
-than in NAT44ed tests (no translation to outside values is done with ip4base),
-but those are not specified in the stateful traffic profiles.
-The server end (as always) uses the received address&port as destination
-for outside-to-inside traffic. Therefore the same stateful traffic profile
-works for both NAT44ed and ip4base test (of the same scale).
-
-The NAT44ed results are displayed together with corresponding ip4base results.
-If they are similar, TRex is probably the bottleneck.
-If NAT44ed result is visibly smaller, it describes the real VPP performance.
diff --git a/docs/content/methodology/overview/_index.md b/docs/content/methodology/overview/_index.md
new file mode 100644
index 0000000000..10f362013f
--- /dev/null
+++ b/docs/content/methodology/overview/_index.md
@@ -0,0 +1,6 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Overview"
+weight: 1
+---
diff --git a/docs/content/methodology/overview/dut_state_considerations.md b/docs/content/methodology/overview/dut_state_considerations.md
new file mode 100644
index 0000000000..eca10a22cd
--- /dev/null
+++ b/docs/content/methodology/overview/dut_state_considerations.md
@@ -0,0 +1,148 @@
+---
+title: "DUT State Considerations"
+weight: 5
+---
+
+# DUT State Considerations
+
+This page discusses considerations for Device Under Test (DUT) state.
+DUTs such as VPP require configuration to be provided before the application
+starts (via config files) or just after it starts (via API or CLI access).
+
+During operation DUTs gather various telemetry data, depending on configuration.
+This internal state handling is part of normal operation,
+so any performance impact is included in the test results.
+Accessing telemetry data is additional load on the DUT,
+so we are not doing that in main trial measurements that affect results,
+but we include separate trials specifically for gathering runtime telemetry.
+
+But there is one kind of state that needs specific handling.
+This kind of DUT state is dynamically created based on incoming traffic,
+it affects how the DUT handles the traffic, and (unlike telemetry counters)
+it has uneven impact on CPU load.
+A typical example is NAT, where detecting new sessions takes more CPU than
+forwarding packets on existing (open or recently closed) sessions.
+We call DUT configurations with this kind of state "stateful",
+and configurations without them "stateless".
+(Even though stateless configurations contain state described in previous
+paragraphs, and some configuration items may have "stateful" in their name,
+such as stateful ACLs.)
+
+# Stateful DUT configurations
+
+Typically, the level of CPU impact of traffic depends on DUT state.
+The first packets causing DUT state to change have higher impact,
+subsequent packets matching that state have lower impact.
+
+From a performance point of view, this is similar to traffic phases
+for stateful protocols, see the
+[NGFW draft](https://tools.ietf.org/html/draft-ietf-bmwg-ngfw-performance-05#section-4.3.4).
+In CSIT we borrow the terminology (even if it does not fit perfectly,
+see discussion below). Ramp-up traffic causes the state change,
+sustain traffic does not change the state.
+
+As the performance is different, each test has to choose which traffic
+it wants to test, and manipulate the DUT state to achieve the intended impact.
+
+## Ramp-up trial
+
+Tests aiming at sustain performance need to make sure DUT state is created.
+We achieve this via a ramp-up trial, whose specific purpose
+is to create the state.
+
+Subsequent trials need no specific handling, as long as the state
+remains the same. But some state can time-out, so additional ramp-up
+trials are inserted whenever the code detects the state can time-out.
+Note that a trial with zero loss refreshes the state,
+so only the time since the last non-zero loss trial is tracked.
+
+For the state to be set completely, it is important both DUT and TG
+do not lose any packets. We achieve this by setting the profile multiplier
+(TPS from now on) to a low enough value.
+
+It is also important each state-affecting packet is sent.
+For size-limited traffic profiles it is guaranteed by the size limit.
+For continuous traffic, we set a long enough duration (based on TPS).
+
+At the end of the ramp-up trial, we check DUT state to confirm
+it has been created as expected.
+The test fails if the state is not (completely) created.
+
+## State Reset
+
+Tests aiming at ramp-up performance do not use a ramp-up trial,
+and they need to reset the DUT state before each trial measurement.
+The way of resetting the state depends on the test,
+usually an API call is used to partially de-configure
+the part that holds the state, and then re-configure it back.
+
+In CSIT we control the DUT state behavior via a test variable "resetter".
+If it is not set, DUT state is not reset.
+If it is set, each search algorithm (including MRR) will invoke it
+before all trial measurements (both main and telemetry ones).
+Any configuration keyword enabling a feature with DUT state
+will check whether a test variable for ramp-up rate is present.
+If it is present, resetter is not set.
+If it is not present, the keyword sets the appropriate resetter value.
+This logic makes sure either ramp-up or state reset is used;
+see the sketch later on this page.
+
+Notes: If both ramp-up and state reset were used, the DUT behavior
+would be identical to just reset, while the test would take longer to execute.
+If neither were used, the DUT would show different performance in subsequent
+trials, violating assumptions of search algorithms.
+
+## DUT versus protocol ramp-up
+
+There are at least three different causes for bandwidth possibly increasing
+within a single measurement trial.
+
+The first is the DUT switching from the state modification phase to the
+constant phase; it is the primary focus of this document.
+Using ramp-up traffic before main trials eliminates this cause
+for tests wishing to measure the performance of the next phase.
+Using size-limited profiles eliminates the next phase
+for tests wishing to measure performance of this phase.
+
+The second is a protocol such as TCP ramping up its throughput to utilize
+the bandwidth available. This is the original meaning of "ramp up"
+in the NGFW draft (see above).
+In existing tests we are not using this meaning of TCP ramp-up.
+Instead we use only small transactions, and a large enough initial window
+so TCP acts as ramped-up already.
+
+The third is TCP increasing offered load due to retransmissions triggered by
+packet loss. In CSIT we again try to avoid this behavior
+by using small enough data to transfer, so overlap of multiple transactions
+(the primary cause of packet loss) is unlikely.
+But in MRR tests, packet loss and non-constant offered load are still expected.
+
+# Stateless DUT configurations
+
+These are simple configurations, which do not set any resetter value
+(even if ramp-up duration is not configured).
+The majority of existing tests are of this type, using continuous traffic profiles.
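+
+For stateful configurations, the ramp-up versus state-reset selection
+described in the State Reset section above can be summarized by a minimal
+sketch. This is an illustration only; the names (`select_state_handling`,
+`test_vars`) are hypothetical, not the actual CSIT keyword code:
+
+```python
+# Illustrative sketch of the ramp-up versus state-reset decision.
+# Exactly one of the two mechanisms ends up active per test.
+def select_state_handling(test_vars, reset_feature_state):
+    """Return trial-preparation settings for a stateful feature test."""
+    if test_vars.get("ramp_up_rate") is not None:
+        # Ramp-up trials create (and refresh) the DUT state,
+        # so no resetter is registered.
+        return {"resetter": None, "use_ramp_up": True}
+    # No ramp-up rate configured: search algorithms will invoke the
+    # resetter before every trial measurement (main and telemetry).
+    return {"resetter": reset_feature_state, "use_ramp_up": False}
+```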
+
+In order to identify limits of TRex performance,
+we have added suites with stateless DUT configuration (VPP ip4base)
+subjected to size-limited ASTF traffic.
+The discovered rates serve as a basis of comparison
+for evaluating the results for stateful DUT configurations (VPP NAT44ed)
+subjected to the same traffic profiles.
+
+# DUT versus TG state
+
+Traffic Generator profiles can be stateful (ASTF) or stateless (STL).
+DUT configuration can be stateful or stateless (with respect to packet traffic).
+
+In CSIT we currently use all four possible configurations:
+
+- Regular stateless VPP tests use stateless traffic profiles.
+
+- Stateless VPP configuration with stateful profile is used as a base for
+  comparison.
+
+- Some stateful DUT configurations (NAT44DET, NAT44ED unidirectional)
+  are tested using stateless traffic profiles and continuous traffic.
+
+- The rest of stateful DUT configurations (NAT44ED bidirectional)
+  are tested using stateful traffic profiles and size-limited traffic.
diff --git a/docs/content/methodology/overview/multi_core_speedup.md b/docs/content/methodology/overview/multi_core_speedup.md
new file mode 100644
index 0000000000..f438e8e996
--- /dev/null
+++ b/docs/content/methodology/overview/multi_core_speedup.md
@@ -0,0 +1,51 @@
+---
+title: "Multi-Core Speedup"
+weight: 3
+---
+
+# Multi-Core Speedup
+
+All performance tests are executed in single physical core and in
+multiple core scenarios.
+
+## Intel Hyper-Threading (HT)
+
+Intel Xeon processors used in FD.io CSIT can operate either in HT
+Disabled mode (single logical core per each physical core) or in HT
+Enabled mode (two logical cores per each physical core). The HT setting is
+applied in the BIOS and requires a server SUT reload for it to take effect,
+making it impractical for continuous changes of HT mode of operation.
+
+Performance tests are executed with server SUTs' Intel Xeon processors
+configured with Intel Hyper-Threading Enabled for all Xeon
+Cascadelake and Xeon Icelake testbeds.
+
+## Multi-core Tests
+
+Multi-core tests are executed in the following VPP worker thread and physical
+core configurations:
+
+1. Intel Xeon Icelake and Cascadelake testbeds (2n-icx, 3n-icx, 2n-clx)
+   with Intel HT enabled (2 logical CPU cores per each physical core):
+
+   1. 2t1c - 2 VPP worker threads on 1 physical core.
+   2. 4t2c - 4 VPP worker threads on 2 physical cores.
+   3. 8t4c - 8 VPP worker threads on 4 physical cores.
+
+VPP worker threads are the data plane threads running on isolated
+logical cores. With Intel HT enabled VPP workers are placed as sibling
+threads on each used physical core. VPP control threads (main, stats)
+are running on a separate non-isolated core together with other Linux
+processes.
+
+In all CSIT tests care is taken to ensure that each VPP worker handles
+the same amount of received packet load and does the same amount of
+packet processing work. This is achieved by evenly distributing per
+interface type (e.g. physical, virtual) receive queues over VPP workers
+using default VPP round-robin mapping and by loading these queues with
+the same amount of packet flows.
+
+If the number of VPP workers is higher than the number of physical or virtual
+interfaces, multiple receive queues are configured on each interface.
+NIC Receive Side Scaling (RSS) for physical interfaces and multi-queue
+for virtual interfaces are used for this purpose.
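+
+The round-robin queue placement described above can be illustrated with a
+small sketch. This is not CSIT code; the function name and the printed
+example are hypothetical:
+
+```python
+# Illustrative round-robin mapping of interface RxQs onto worker threads.
+def round_robin_rxq_mapping(num_workers, interfaces, rxqs_per_interface):
+    """Return {worker: [(interface, rxq_index), ...]} assignments."""
+    mapping = {w: [] for w in range(num_workers)}
+    worker = 0
+    for iface in interfaces:
+        for rxq in range(rxqs_per_interface):
+            mapping[worker].append((iface, rxq))
+            worker = (worker + 1) % num_workers
+    return mapping
+
+# 2 physical interfaces, 4 workers, 4 RxQs per interface:
+# each worker ends up polling exactly one RxQ of each interface.
+print(round_robin_rxq_mapping(4, ["port0", "port1"], 4))
+```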
diff --git a/docs/content/methodology/overview/per_thread_resources.md b/docs/content/methodology/overview/per_thread_resources.md
new file mode 100644
index 0000000000..c23efb50bd
--- /dev/null
+++ b/docs/content/methodology/overview/per_thread_resources.md
@@ -0,0 +1,101 @@
+---
+title: "Per Thread Resources"
+weight: 2
+---
+
+# Per Thread Resources
+
+The CSIT test framework manages mapping of the following resources per thread:
+
+1. Cores, physical cores (pcores) allocated as pairs of sibling logical cores
+   (lcores) if the server is in HyperThreading/SMT mode, or as single lcores
+   if the server is not in HyperThreading/SMT mode. Note that if the server's
+   processors are running in HyperThreading/SMT mode, sibling lcores are
+   always used.
+2. Receive Queues (RxQ), packet receive queues allocated on each
+   physical and logical interface tested.
+3. Transmit Queues (TxQ), packet transmit queues allocated on each
+   physical and logical interface tested.
+
+The approach to mapping per thread resources depends on the application/DUT
+tested (VPP or DPDK apps) and associated thread types, as follows:
+
+1. Data-plane workers, used for data-plane packet processing, when no
+   feature workers are present.
+
+   - Cores: data-plane workers are typically tested in 1, 2 and 4 pcore
+     configurations, running on single lcore per pcore or on sibling
+     lcores per pcore. The result is a set of {T}t{C}c thread-core
+     configurations, where {T} stands for a total number of threads
+     (lcores), and {C} for a total number of pcores. Tested
+     configurations are encoded in CSIT test case names,
+     e.g. "1c", "2c", "4c", and test tags "2T1C" (or "1T1C"), "4T2C"
+     (or "2T2C"), "8T4C" (or "4T4C").
+   - Interface Receive Queues (RxQ): as of the CSIT-2106 release, the number of
+     RxQs used on each physical or virtual interface is equal to the
+     number of data-plane workers. In other words each worker has a
+     dedicated RxQ on each interface tested. This ensures the packet
+     processing load is equal for each worker, subject to RSS flow
+     load balancing efficacy. Note: Before CSIT-2106 the total number of
+     RxQs across all interfaces of specific type was equal to the
+     number of data-plane workers.
+   - Interface Transmit Queues (TxQ): the number of TxQs used on each
+     physical or virtual interface is equal to the number of data-plane
+     workers. In other words each worker has a dedicated TxQ on each
+     interface tested.
+   - Applies to VPP and DPDK Testpmd and L3Fwd.
+
+2. Data-plane and feature workers (e.g. IPsec async crypto workers), the
+   latter dedicated to specific feature processing.
+
+   - Cores: data-plane and feature workers are tested in 2, 3 and 4
+     pcore configurations, running on single lcore per pcore or on
+     sibling lcores per pcore. This results in two sets of
+     thread-core combinations separated by "-", {T}t{C}c-{T}t{C}c, with
+     the leading set denoting total number of threads (lcores) and
+     pcores used for data-plane workers, and the trailing set denoting
+     total number of lcores and pcores used for feature workers.
+     Accordingly, tested configurations are encoded in CSIT test case
+     names, e.g. "1c-1c", "1c-2c", "1c-3c", and test tags "2T1C_2T1C"
+     (or "1T1C_1T1C"), "2T1C_4T2C" (or "1T1C_2T2C"), "2T1C_6T3C"
+     (or "1T1C_3T3C").
+   - RxQ and TxQ: no RxQs and no TxQs are used by feature workers.
+   - Applies to VPP only.
+
+3. Management/main worker, control plane and management.
+
+   - Cores: single lcore.
+   - RxQ: not used (VPP default behaviour).
+   - TxQ: single TxQ per interface, allocated but not used (VPP default
+     behaviour).
+   - Applies to VPP only.
+
+## VPP Thread Configuration
+
+Mapping of cores and RxQs to VPP data-plane worker threads is done in
+the VPP startup.conf during test suite setup:
+
+1. `corelist-workers <list-of-cores>`: List of logical cores to run VPP
+   data-plane workers and feature workers. The actual lcore
+   allocation depends on HyperThreading/SMT server configuration and
+   per test core configuration.
+
+   - For tests without feature workers, by default, all CPU cores
+     configured in startup.conf are used for data-plane workers.
+   - For tests with feature workers, CSIT code distributes lcores across
+     data-plane and feature workers.
+
+2. `num-rx-queues <value>`: Number of Rx queues used per interface.
+
+Mapping of TxQs to VPP data-plane worker threads uses the default VPP
+setting of one TxQ per interface per data-plane worker.
+
+## DPDK Thread Configuration
+
+Mapping of cores and RxQs to DPDK Testpmd/L3Fwd data-plane worker
+threads is done in the startup CLI:
+
+1. `-l <list-of-cores>` - List of logical cores to run the DPDK
+   application.
+2. `nb-cores=<n>` - Number of forwarding cores.
+3. `rxq=<n>` - Number of Rx queues used per interface.
diff --git a/docs/content/methodology/overview/terminology.md b/docs/content/methodology/overview/terminology.md
new file mode 100644
index 0000000000..c9115e9291
--- /dev/null
+++ b/docs/content/methodology/overview/terminology.md
@@ -0,0 +1,97 @@
+---
+title: "Terminology"
+weight: 1
+---
+
+# Terminology
+
+- **Frame size**: size of an Ethernet Layer-2 frame on the wire, including
+  any VLAN tags (dot1q, dot1ad) and Ethernet FCS, but excluding Ethernet
+  preamble and inter-frame gap. Measured in Bytes.
+
+- **Packet size**: same as frame size, both terms used interchangeably.
+
+- **Inner L2 size**: for tunneled L2 frames only, size of an encapsulated
+  Ethernet Layer-2 frame, preceded with tunnel header, and followed by
+  tunnel trailer. Measured in Bytes.
+
+- **Inner IP size**: for tunneled IP packets only, size of an encapsulated
+  IPv4 or IPv6 packet, preceded with tunnel header, and followed by
+  tunnel trailer. Measured in Bytes.
+
+- **Device Under Test (DUT)**: In software networking, "device" denotes a
+  specific piece of software tasked with packet processing. Such device
+  is surrounded with other software components (such as operating system
+  kernel). It is not possible to run devices without also running the
+  other components, and hardware resources are shared between both. For
+  purposes of testing, the whole set of hardware and software components
+  is called "System Under Test" (SUT). As the SUT is the part of the whole
+  test setup whose performance can be measured with RFC2544, this document
+  uses SUT instead of the RFC2544 DUT. Device under test
+  (DUT) can be re-introduced when analyzing test results using whitebox
+  techniques, but this document sticks to blackbox testing.
+
+- **System Under Test (SUT)**: System under test (SUT) is a part of the
+  whole test setup whose performance is to be benchmarked. The complete
+  methodology contains other parts, whose performance is either already
+  established, or not affecting the benchmarking result.
+
+- **Bi-directional throughput tests**: involve packets/frames flowing in
+  both east-west and west-east directions over every tested interface of
+  SUT/DUT. Packet flow metrics are measured per direction, and can be
+  reported as aggregate for both directions (i.e. throughput) and/or
+  separately for each measured direction (i.e. latency). In most cases
+  bi-directional tests use the same (symmetric) load in both directions.
+
+- **Uni-directional throughput tests**: involve packets/frames flowing in
+  only one direction, i.e. either east-west or west-east direction, over
+  every tested interface of SUT/DUT. Packet flow metrics are measured
+  and are reported for the measured direction.
+
+- **Packet Loss Ratio (PLR)**: ratio of packets received relative to packets
+  transmitted over the test trial duration, calculated using formula:
+  PLR = ( pkts_transmitted - pkts_received ) / pkts_transmitted.
+  For bi-directional throughput tests aggregate PLR is calculated based
+  on the aggregate number of packets transmitted and received.
+
+- **Packet Throughput Rate**: maximum packet offered load DUT/SUT forwards
+  within the specified Packet Loss Ratio (PLR). In many cases the rate
+  depends on the frame size processed by DUT/SUT. Hence packet
+  throughput rate MUST be quoted with specific frame size as received by
+  DUT/SUT during the measurement. For bi-directional tests, packet
+  throughput rate should be reported as aggregate for both directions.
+  Measured in packets-per-second (pps) or frames-per-second (fps),
+  equivalent metrics.
+
+- **Bandwidth Throughput Rate**: a secondary metric calculated from packet
+  throughput rate using formula: bw_rate = pkt_rate * (frame_size +
+  L1_overhead) * 8, where L1_overhead for Ethernet includes preamble (8
+  Bytes) and inter-frame gap (12 Bytes). For bi-directional tests,
+  bandwidth throughput rate should be reported as aggregate for both
+  directions. Expressed in bits-per-second (bps).
+
+- **Non Drop Rate (NDR)**: maximum packet/bandwidth throughput rate sustained
+  by DUT/SUT at PLR equal to zero (zero packet loss) specific to tested
+  frame size(s). MUST be quoted with specific packet size as received by
+  DUT/SUT during the measurement. Packet NDR measured in
+  packets-per-second (or fps), bandwidth NDR expressed in
+  bits-per-second (bps).
+
+- **Partial Drop Rate (PDR)**: maximum packet/bandwidth throughput rate
+  sustained by DUT/SUT at PLR greater than zero (non-zero packet loss)
+  specific to tested frame size(s). MUST be quoted with specific packet
+  size as received by DUT/SUT during the measurement. Packet PDR
+  measured in packets-per-second (or fps), bandwidth PDR expressed in
+  bits-per-second (bps).
+
+- **Maximum Receive Rate (MRR)**: packet/bandwidth rate regardless of PLR
+  sustained by DUT/SUT under specified Maximum Transmit Rate (MTR)
+  packet load offered by traffic generator. MUST be quoted with both
+  specific packet size and MTR as received by DUT/SUT during the
+  measurement. Packet MRR measured in packets-per-second (or fps),
+  bandwidth MRR expressed in bits-per-second (bps).
+
+- **Trial**: a single measurement step.
+
+- **Trial duration**: amount of time over which packets are transmitted and
+  received in a single measurement step.
diff --git a/docs/content/methodology/overview/vpp_forwarding_modes.md b/docs/content/methodology/overview/vpp_forwarding_modes.md
new file mode 100644
index 0000000000..b3c3bba984
--- /dev/null
+++ b/docs/content/methodology/overview/vpp_forwarding_modes.md
@@ -0,0 +1,104 @@
+---
+title: "VPP Forwarding Modes"
+weight: 4
+---
+
+# VPP Forwarding Modes
+
+VPP is tested in a number of L2, IPv4 and IPv6 packet lookup and forwarding
+modes. Within each mode baseline and scale tests are executed, the latter with
+a varying number of FIB entries.
+
+## L2 Ethernet Switching
+
+VPP is tested in three L2 forwarding modes:
+
+- *l2patch*: L2 patch, the fastest point-to-point L2 path that loops
+  packets between two interfaces without any Ethernet frame checks or
+  lookups.
+- *l2xc*: L2 cross-connect, point-to-point L2 path with all Ethernet
+  frame checks, but no MAC learning and no MAC lookup.
+- *l2bd*: L2 bridge-domain, multipoint-to-multipoint L2 path with all
+  Ethernet frame checks, with MAC learning (unless static MACs are used)
+  and MAC lookup.
+
+l2bd tests are executed in baseline and scale configurations:
+
+- *l2bdbase*: Two MAC FIB entries are learned by VPP to enable packet
+  switching between two interfaces in two directions. VPP L2 switching
+  is tested with 254 IPv4 unique flows per direction, varying IPv4
+  source address per flow in order to invoke RSS based packet
+  distribution across VPP workers. The same source and destination MAC
+  address is used for all flows per direction. IPv4 source address is
+  incremented for every packet.
+
+- *l2bdscale*: A high number of MAC FIB entries are learned by VPP to
+  enable packet switching between two interfaces in two directions.
+  Tested MAC FIB sizes include: i) 10k with 5k unique flows per
+  direction, ii) 100k with 2 x 50k flows and iii) 1M with 2 x 500k
+  flows. Unique flows are created by using distinct source and
+  destination MAC addresses that are changed for every packet using
+  incremental ordering, making VPP learn (or refresh) distinct src MAC
+  entries and look up distinct dst MAC entries for every packet. For
+  details, see
+  [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}).
+
+Ethernet wire encapsulations tested include: untagged, dot1q, dot1ad.
+
+## IPv4 Routing
+
+IPv4 routing tests are executed in baseline and scale configurations:
+
+- *ip4base*: Two /32 IPv4 FIB entries are configured in VPP to enable
+  packet routing between two interfaces in two directions. VPP routing
+  is tested with 253 IPv4 unique flows per direction, varying IPv4
+  source address per flow in order to invoke RSS based packet
+  distribution across VPP workers. IPv4 source address is incremented
+  for every packet.
+
+- *ip4scale*: A high number of /32 IPv4 FIB entries are configured in
+  VPP. Tested IPv4 FIB sizes include: i) 20k with 10k unique flows per
+  direction, ii) 200k with 2 * 100k flows and iii) 2M with 2 * 1M
+  flows. Unique flows are created by using distinct IPv4 destination
+  addresses that are changed for every packet, using incremental or
+  random ordering. For details, see
+  [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}).
+
+## IPv6 Routing
+
+Similarly to IPv4, IPv6 routing tests are executed in baseline and scale
+configurations:
+
+- *ip6base*: Two /128 IPv6 FIB entries are configured in VPP to enable
+  packet routing between two interfaces in two directions. VPP routing
+  is tested with 253 IPv6 unique flows per direction, varying IPv6
+  source address per flow in order to invoke RSS based packet
+  distribution across VPP workers. IPv6 source address is incremented
+  for every packet.
+
+- *ip6scale*: A high number of /128 IPv6 FIB entries are configured in
+  VPP. Tested IPv6 FIB sizes include: i) 20k with 10k unique flows per
+  direction, ii) 200k with 2 * 100k flows and iii) 2M with 2 * 1M
+  flows. Unique flows are created by using distinct IPv6 destination
+  addresses that are changed for every packet, using incremental or
+  random ordering.
For details, see
+  [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}).
+
+## SRv6 Routing
+
+SRv6 routing tests are executed in a number of baseline configurations;
+in each case SR policy and steering policy are configured for one
+direction and one (or two) SR behaviours (functions) in the other
+direction:
+
+- *srv6enc1sid*: One SID (no SRH present), one SR function - End.
+- *srv6enc2sids*: Two SIDs (SRH present), two SR functions - End and
+  End.DX6.
+- *srv6enc2sids-nodecaps*: Two SIDs (SRH present) without decapsulation,
+  one SR function - End.
+- *srv6proxy-dyn*: Dynamic SRv6 proxy, one SR function - End.AD.
+- *srv6proxy-masq*: Masquerading SRv6 proxy, one SR function - End.AM.
+- *srv6proxy-stat*: Static SRv6 proxy, one SR function - End.AS.
+
+In all listed cases a low number of IPv6 flows (253 per direction) is
+routed by VPP.
diff --git a/docs/content/methodology/packet_flow_ordering.md b/docs/content/methodology/packet_flow_ordering.md
deleted file mode 100644
index d2b3bfb90c..0000000000
--- a/docs/content/methodology/packet_flow_ordering.md
+++ /dev/null
@@ -1,42 +0,0 @@
----
-title: "Packet Flow Ordering"
-weight: 9
----
-
-# Packet Flow Ordering
-
-TRex Traffic Generator (TG) supports two main ways how to cover
-address space (on allowed ranges) in scale tests.
-
-In most cases only one field value (e.g. IPv4 destination address) is
-altered, in some cases two fields (e.g. IPv4 destination address and UDP
-destination port) are altered.
-
-## Incremental Ordering
-
-This case is simpler to implement and offers greater control.
-
-When changing two fields, they can be incremented synchronously, or one
-after another. In the latter case we can specify which one is
-incremented each iteration and which is incremented by "carrying over"
-only when the other "wraps around". This way also visits all
-combinations once before the "carry" field also wraps around.
-
-It is possible to use increments other than 1.
-
-## Randomized Ordering
-
-This case chooses each field value at random (from the allowed range).
-In case of two fields, they are treated independently.
-TRex allows to set random seed to get deterministic numbers.
-We use a different seed for each field and traffic direction.
-The seed has to be a non-zero number, we use 1, 2, 3, and so on.
-
-The seeded random mode in TRex requires a "limit" value,
-which acts as a cycle length limit (after this many iterations,
-the seed resets to its initial value).
-We use the maximal allowed limit value (computed as 2^24 - 1).
-
-Randomized profiles do not avoid duplicated values,
-and do not guarantee each possible value is visited,
-so it is not very useful for stateful tests.
diff --git a/docs/content/methodology/packet_latency.md b/docs/content/methodology/packet_latency.md
deleted file mode 100644
index fd7c0e00e8..0000000000
--- a/docs/content/methodology/packet_latency.md
+++ /dev/null
@@ -1,45 +0,0 @@
----
-title: "Packet Latency"
-weight: 8
----
-
-# Packet Latency
-
-TRex Traffic Generator (TG) is used for measuring one-way latency in
-2-Node and 3-Node physical testbed topologies. TRex integrates
-[High Dynamic Range Histogram (HDRH)](http://hdrhistogram.org/)
-functionality and reports per packet latency distribution for latency
-streams sent in parallel to the main load packet streams.
-
-Following methodology is used:
-
-- Only NDRPDR test type measures latency and only after NDR and PDR
-  values are determined. Other test types do not involve latency
-  streams.
-- Latency is measured at different background load packet rates:
-
-  - No-Load: latency streams only.
-  - Low-Load: at 10% PDR.
-  - Mid-Load: at 50% PDR.
-  - High-Load: at 90% PDR.
-
-- Latency is measured for all tested packet sizes except IMIX due to
-  TRex TG restriction.
-- TG sends dedicated latency streams, one per direction, each at the
-  rate of 9 kpps at the prescribed packet size; these are sent in
-  addition to the main load streams.
-- TG reports Min/Avg/Max and HDRH latency values distribution per stream
-  direction, hence two sets of latency values are reported per test case
-  (marked as E-W and W-E).
-- +/- 1 usec is the measurement accuracy of TRex TG and the data in HDRH
-  latency values distribution is rounded to microseconds.
-- TRex TG introduces a (background) always-on Tx + Rx latency bias of 4
-  usec on average per direction resulting from TRex software writing and
-  reading packet timestamps on CPU cores. Quoted values are based on TG
-  back-to-back latency measurements.
-- Latency graphs are not smoothed, each latency value has its own
-  horizontal line across corresponding packet percentiles.
-- Percentiles are shown on X-axis using a logarithmic scale, so the
-  maximal latency value (ending at 100% percentile) would be in
-  infinity. The graphs are cut at 99.9999% (hover information still
-  lists 100%).
\ No newline at end of file
diff --git a/docs/content/methodology/per_patch_testing.md b/docs/content/methodology/per_patch_testing.md
new file mode 100644
index 0000000000..a64a52caf6
--- /dev/null
+++ b/docs/content/methodology/per_patch_testing.md
@@ -0,0 +1,230 @@
+---
+title: "Per-patch Testing"
+weight: 5
+---
+
+# Per-patch Testing
+
+Updated for CSIT git commit id: 72b45cfe662107c8e1bb549df71ba51352a898ee.
+
+A methodology similar to trending analysis is used for comparing performance
+before a DUT code change is merged. This can act as a verify job to disallow
+changes which would decrease performance without a good reason.
+
+## Existing jobs
+
+VPP is the only project currently using such jobs.
+They are not started automatically; they must be triggered on demand.
+They allow full tag expressions, but some tags are enforced (such as MRR).
+
+There are jobs available for multiple types of testbeds,
+based on various processors.
+Their Gerrit trigger words are of the form "perftest-{node_arch}"
+where the node_arch combinations currently supported are:
+2n-clx, 2n-tx2, 2n-zn2, 3n-tsh.
+
+## Test selection
+
+A Gerrit trigger line without any additional arguments selects
+a small set of test cases to run.
+If additional arguments are added to the Gerrit trigger, they are treated
+as Robot tag expressions to select tests to run.
+While very flexible, this method of test selection also allows the user
+to accidentally select too high a number of tests, blocking the testbed for days.
+
+What follows is a list of explanations and recommendations
+to help users select the minimal set of test cases.
+
+### Verify cycles
+
+When Gerrit schedules multiple jobs to run for the same patch set,
+it waits until all runs are complete.
+While it is waiting, it is possible to trigger more jobs
+(adding runs to the set Gerrit is waiting for), but it is not possible
+to trigger more runs for the same job, until Gerrit is done waiting.
+After Gerrit is done waiting, it becomes possible to trigger
+the same job again.
+
+Example. A user triggers one set of tests on 2n-icx and immediately
+also triggers another set of tests on 3n-icx.
Then the user notices
+the 2n-icx run ended early because of a typo in the tag expression.
+When the user tries to re-trigger 2n-icx (with a fixed tag expression),
+that comment gets ignored by Jenkins.
+Only when the 3n-icx job finishes can the user trigger 2n-icx.
+
+### One comment many jobs
+
+In the past, the CSIT code which parses for perftest trigger comments
+was buggy, which led to bad behavior (such as selecting all performance tests,
+because "perftest" is also a robot tag) when a user included multiple
+perftest trigger words in the same comment.
+
+The worst bugs were fixed since then, but it is still recommended
+to use just one trigger word per Gerrit comment, just to be safe.
+
+### Multiple test cases in run
+
+While Robot supports the OR operator, it does not support parentheses,
+so the OR operator is not very useful. It is recommended
+to use a space instead of the OR operator.
+
+Example template:
+perftest-2n-icx {tag_expression_1} {tag_expression_2}
+
+See below for more concrete examples.
+
+### Suite tags
+
+Traditionally, CSIT maintains broad Robot tags that can be used to select tests.
+
+But it is not recommended to use them for test selection,
+as it is not that easy to determine how many test cases are selected.
+
+The recommended way is to look into the CSIT repository first,
+locate a specific suite the user is interested in,
+and use its suite tag. For example, "ethip4-ip4base" is a suite tag
+selecting just one suite in the CSIT git repository,
+avoiding all scale, container, and other similar variants.
+
+Note that CSIT uses the "autogen" code generator,
+so the robot running in Jenkins has access to more suites
+than visible just by looking into the CSIT git repository,
+so a suite tag is not enough to select even the intended suite,
+and the user still probably wants to narrow down
+to a single test case within a suite.
+
+### Fully specified tag expressions
+
+Here is one template to select a single test case:
+{test_type}AND{nic_model}AND{nic_driver}AND{cores}AND{frame_size}AND{suite_tag}
+where the variables are all lower case (so the AND operator stands out).
+
+Currently only one test type is supported by the performance comparison jobs:
+"mrr".
+The nic_driver options depend on nic_model. For Intel cards "drv_avf"
+(AVF plugin) and "drv_vfio_pci" (DPDK plugin) are popular, for Mellanox
+"drv_rdma_core". Currently, the performance using "drv_af_xdp" is not reliable
+enough, so do not use it unless you are specifically testing for AF_XDP.
+
+The most popular nic_model is "nic_intel-xxv710", but that is not available
+on all testbed types.
+It is safe to use "1c" for cores (unless you suspect multi-core
+performance is affected differently) and "64b" for frame size ("78b" for ip6
+and more for dot1q and other encapsulated traffic;
+"1518b" is popular for ipsec and other payload-bound tests).
+
+As there are more test cases than CSIT can periodically test,
+it is possible to encounter an old test case that currently fails.
+To avoid that, you can look at "job spec" files we use for periodic testing,
+for example
+[this one](https://github.com/FDio/csit/blob/master/resources/job_specs/report_iterative/2n-icx/vpp-mrr-00.md).
+
+### Shortening triggers
+
+Advanced users may use the following tricks to avoid writing long trigger
+comments.
+
+Robot supports glob matching, which can be used to select multiple suite tags at
+once.
+
+Not specifying one of the 6 parts of the recommended expression pattern
+will select all available options.
For example not specifying nic_driver
+for nic_intel-xxv710 will select all 3 applicable drivers.
+You can use the NOT operator to reject some options (e.g. NOTdrv_af_xdp),
+but beware, with NOT the order matters:
+tag1ANDtag2NOTtag3 is not the same as tag1NOTtag3ANDtag2,
+the latter is evaluated as tag1AND(NOT(tag3ANDtag2)).
+
+Beware when not specifying nic_model. As a precaution,
+CSIT code will insert the default NIC model for the testbed used.
+Example: Specifying drv_rdma_core without specifying nic_model
+will fail, as the default nic_model is nic_intel-xxv710
+which does not support the RDMA core driver.
+
+### Complete example
+
+A user wants to test a VPP change which may affect load balancing with bonding.
+Searching tag documentation for "bonding" finds the LBOND tag and its variants.
+Searching the CSIT git repository (directory tests/) finds 8 suite files,
+all suited only for 3-node testbeds.
+All suites are using vhost, but differ by the forwarding app inside the VM
+(DPDK or VPP), by the forwarding mode of VPP acting as host level vswitch
+(MAC learning or cross connect), and by the number of DUT1-DUT2 links
+available (1 or 2).
+
+As not all NICs and testbeds offer enough ports for 2 parallel DUT-DUT links,
+the user looks at
+[testbed specifications](https://github.com/FDio/csit/tree/master/topologies/available)
+and finds that only the xxv710 NIC on the 3n-icx testbed matches the requirements.
+A quick look into the suites confirms the smallest frame size is 64 bytes
+(despite the DOT1Q robot tag, as the encapsulation does not happen on TG-DUT links).
+It is ok to use just 1 physical core, as 3n-icx has hyperthreading enabled,
+so the VPP vswitch will use 2 worker threads.
+
+The user decides the vswitch forwarding mode is not important
+(so chooses cross connect as that has less CPU overhead),
+but wants to test both NIC drivers (not AF_XDP), both apps in VM,
+and both 1 and 2 parallel links.
+
+After shortening, this is the trigger comment finally used:
+perftest-3n-icx mrrANDnic_intel-x710AND1cAND64bAND?lbvpplacp-dot1q-l2xcbase-eth-2vhostvr1024-1vm*NOTdrv_af_xdp
+
+## Basic operation
+
+The job builds VPP .deb packages for both the patch under test
+(called "current") and its parent patch (called "parent").
+
+For each test (from a set defined by tag expression),
+both builds are subjected to several trial measurements (BMRR).
+Measured samples are grouped into the "parent" sequence,
+followed by the "current" sequence. The same Minimal Description Length
+algorithm as in trending is used to decide whether it is one big group,
+or two smaller groups. If it is one group, a "normal" result
+is declared for the test. If it is two groups, and the current average
+is less than the parent average, the test is declared a regression.
+If it is two groups and the current average is larger or equal,
+the test is declared a progression.
+
+The whole job fails (giving -1) if some trial measurement failed,
+or if any test was declared a regression.
+
+## Temporary specifics
+
+The Minimal Description Length analysis is performed by
+CSIT code equivalent to the jumpavg-0.1.3 library available on PyPI.
+
+In hopes of strengthening the signal (code performance) compared to noise
+(all other factors influencing the measured values), several workarounds
+are applied.
+
+In contrast to trending, trial duration is set to 10 seconds,
+and only 5 samples are measured for each build.
+Both parameters are set in ci-management.
+
+This decreases sensitivity to regressions, but also decreases
+the probability of false positives.
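+
+The verdict logic from the Basic operation section can be sketched as
+follows. This is a simplified stand-in, not the real jumpavg code: the
+`two_groups` flag represents the Minimal Description Length grouping
+decision, and the function name is hypothetical:
+
+```python
+# Simplified stand-in for the per-patch classification logic.
+def classify(parent_samples, current_samples, two_groups):
+    """Classify a test given the grouping decision of the MDL analysis."""
+    if not two_groups:
+        # One big group: parent and current are indistinguishable.
+        return "normal"
+    parent_avg = sum(parent_samples) / len(parent_samples)
+    current_avg = sum(current_samples) / len(current_samples)
+    return "regression" if current_avg < parent_avg else "progression"
+
+# 5 parent and 5 current MRR samples (Mpps), already split into two groups:
+print(classify([10.1, 9.9, 10.0, 9.8, 10.2], [8.0, 8.1, 7.9, 8.2, 8.0], True))
+```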
+
+## Console output
+
+The following information is visible towards the end of the Jenkins console
+output, repeated for each analyzed test.
+
+The original 5 values are visible in the order they were measured.
+The 5 values after processing are also visible in the output,
+this time sorted by value (so people can see minimum and maximum).
+
+The next output is the difference of averages. It is the current average
+minus the parent average, expressed as a percentage of the parent average.
+
+The next three outputs contain the jumpavg representation
+of the two groups and a combined group.
+Here, "bits" is the description length; for the "current" sequence
+it includes the effect of the "parent" average value
+(jumpavg-0.1.3 penalizes sequences with too close averages).
+
+Next, a sentence describing which grouping description is shorter,
+and by how many bits.
+Finally, the test result classification is visible.
+
+The algorithm does not track test case names,
+so test cases are indexed (from 0).
diff --git a/docs/content/methodology/per_thread_resources.md b/docs/content/methodology/per_thread_resources.md
deleted file mode 100644
index cd862fa824..0000000000
--- a/docs/content/methodology/per_thread_resources.md
+++ /dev/null
@@ -1,102 +0,0 @@
----
-title: "Per Thread Resources"
-weight: 2
----
-
-# Per Thread Resources
-
-CSIT test framework is managing mapping of the following resources per
-thread:
-
-1. Cores, physical cores (pcores) allocated as pairs of sibling logical cores
-   (lcores) if server in HyperThreading/SMT mode, or as single lcores
-   if server not in HyperThreading/SMT mode. Note that if server's
-   processors are running in HyperThreading/SMT mode sibling lcores are
-   always used.
-2. Receive Queues (RxQ), packet receive queues allocated on each
-   physical and logical interface tested.
-3. Transmit Queues(TxQ), packet transmit queues allocated on each
-   physical and logical interface tested.
-
-Approach to mapping per thread resources depends on the application/DUT
-tested (VPP or DPDK apps) and associated thread types, as follows:
-
-1. Data-plane workers, used for data-plane packet processing, when no
-   feature workers present.
-
-   - Cores: data-plane workers are typically tested in 1, 2 and 4 pcore
-     configurations, running on single lcore per pcore or on sibling
-     lcores per pcore. Result is a set of {T}t{C}c thread-core
-     configurations, where{T} stands for a total number of threads
-     (lcores), and {C} for a total number of pcores. Tested
-     configurations are encoded in CSIT test case names,
-     e.g. "1c", "2c", "4c", and test tags "2T1C"(or "1T1C"), "4T2C"
-     (or "2T2C"), "8T4C" (or "4T4C").
-   - Interface Receive Queues (RxQ): as of CSIT-2106 release, number of
-     RxQs used on each physical or virtual interface is equal to the
-     number of data-plane workers. In other words each worker has a
-     dedicated RxQ on each interface tested. This ensures packet
-     processing load to be equal for each worker, subject to RSS flow
-     load balancing efficacy. Note: Before CSIT-2106 total number of
-     RxQs across all interfaces of specific type was equal to the
-     number of data-plane workers.
-   - Interface Transmit Queues (TxQ): number of TxQs used on each
-     physical or virtual interface is equal to the number of data-plane
-     workers. In other words each worker has a dedicated TxQ on each
-     interface tested.
-   - Applies to VPP and DPDK Testpmd and L3Fwd.
-
-2. Data-plane and feature workers (e.g. IPsec async crypto workers), the
-   latter dedicated to specific feature processing.
- - - Cores: data-plane and feature workers are tested in 2, 3 and 4 - pcore configurations, running on single lcore per pcore or on - sibling lcores per pcore. This results in a two sets of - thread-core combinations separated by "-", {T}t{C}c-{T}t{C}c, with - the leading set denoting total number of threads (lcores) and - pcores used for data-plane workers, and the trailing set denoting - total number of lcores and pcores used for feature workers. - Accordingly, tested configurations are encoded in CSIT test case - names, e.g. "1c-1c", "1c-2c", "1c-3c", and test tags "2T1C_2T1C" - (or "1T1C_1T1C"), "2T1C_4T2C"(or "1T1C_2T2C"), "2T1C_6T3C" - (or "1T1C_3T3C"). - - RxQ and TxQ: no RxQs and no TxQs are used by feature workers. - - Applies to VPP only. - -3. Management/main worker, control plane and management. - - - Cores: single lcore. - - RxQ: not used (VPP default behaviour). - - TxQ: single TxQ per interface, allocated but not used - (VPP default behaviour). - - Applies to VPP only. - -## VPP Thread Configuration - -Mapping of cores and RxQs to VPP data-plane worker threads is done in -the VPP startup.conf during test suite setup: - -1. `corelist-workers `: List of logical cores to run VPP - data-plane workers and feature workers. The actual lcores' - allocations depends on HyperThreading/SMT server configuration and - per test core configuration. - - - For tests without feature workers, by default, all CPU cores - configured in startup.conf are used for data-plane workers. - - For tests with feature workers, CSIT code distributes lcores across - data-plane and feature workers. - -2. `num-rx-queues `: Number of Rx queues used per interface. - -Mapping of TxQs to VPP data-plane worker threads uses the default VPP -setting of one TxQ per interface per data-plane worker. - -## DPDK Thread Configuration - -Mapping of cores and RxQs to DPDK Testpmd/L3Fwd data-plane worker -threads is done in the startup CLI: - -1. `-l ` - List of logical cores to run DPDK - application. -2. `nb-cores=` - Number of forwarding cores. -3. `rxq=` - Number of Rx queues used per interface. diff --git a/docs/content/methodology/reconfiguration_tests.md b/docs/content/methodology/reconfiguration_tests.md deleted file mode 100644 index 837535526d..0000000000 --- a/docs/content/methodology/reconfiguration_tests.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: "Reconfiguration Tests" -weight: 16 ---- - -# Reconfiguration Tests - -## Overview - -Reconf tests are designed to measure the impact of VPP re-configuration -on data plane traffic. -While VPP takes some measures against the traffic being -entirely stopped for a prolonged time, -the immediate forwarding rate varies during the re-configuration, -as some configurations steps need the active dataplane worker threads -to be stopped temporarily. - -As the usual methods of measuring throughput need multiple trial measurements -with somewhat long durations, and the re-configuration process can also be long, -finding an offered load which would result in zero loss -during the re-configuration process would be time-consuming. - -Instead, reconf tests first find a througput value (lower bound for NDR) -without re-configuration, and then maintain that ofered load -during re-configuration. The measured loss count is then assumed to be caused -by the re-configuration process. The result published by reconf tests -is the effective blocked time, that is -the loss count divided by the offered load. 
- -## Current Implementation - -Each reconf suite is based on a similar MLRsearch performance suite. - -MLRsearch parameters are changed to speed up the throughput discovery. -For example, PDR is not searched for, and the final trial duration is shorter. - -The MLRsearch suite has to contain a configuration parameter -that can be scaled up, e.g. number of tunnels or number of service chains. -Currently, only increasing the scale is supported -as the re-configuration operation. In future, scale decrease -or other operations can be implemented. - -The traffic profile is not changed, so the traffic present is processed -only by the smaller scale configuration. The added tunnels / chains -are not targetted by the traffic. - -For the re-configuration, the same Robot Framework and Python libraries -are used, as were used in the initial configuration, with the exception -of the final calls that do not interact with VPP (e.g. starting -virtual machines) being skipped to reduce the test overall duration. - -## Discussion - -Robot Framework introduces a certain overhead, which may affect timing -of individual VPP API calls, which in turn may affect -the number of packets lost. - -The exact calls executed may contain unnecessary info dumps, repeated commands, -or commands which change a value that do not need to be changed (e.g. MTU). -Thus, implementation details are affecting the results, even if their effect -on the corresponding MLRsearch suite is negligible. - -The lower bound for NDR is the only value safe to be used when zero packets lost -are expected without re-configuration. But different suites show different -"jitter" in that value. For some suites, the lower bound is not tight, -allowing full NIC buffers to drain quickly between worker pauses. -For other suites, lower bound for NDR still has quite a large probability -of non-zero packet loss even without re-configuration. diff --git a/docs/content/methodology/root_cause_analysis/_index.md b/docs/content/methodology/root_cause_analysis/_index.md deleted file mode 100644 index 79cfe73769..0000000000 --- a/docs/content/methodology/root_cause_analysis/_index.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -bookCollapseSection: true -bookFlatSection: false -title: "Root Cause Analysis" -weight: 20 ---- \ No newline at end of file diff --git a/docs/content/methodology/root_cause_analysis/perpatch_performance_tests.md b/docs/content/methodology/root_cause_analysis/perpatch_performance_tests.md deleted file mode 100644 index 900ea0b874..0000000000 --- a/docs/content/methodology/root_cause_analysis/perpatch_performance_tests.md +++ /dev/null @@ -1,228 +0,0 @@ ---- -title: "Per-patch performance tests" -weight: 1 ---- - -# Per-patch performance tests - -Updated for CSIT git commit id: 72b45cfe662107c8e1bb549df71ba51352a898ee. - -A methodology similar to trending analysis is used for comparing performance -before a DUT code change is merged. This can act as a verify job to disallow -changes which would decrease performance without a good reason. - -## Existing jobs - -VPP is the only project currently using such jobs. -They are not started automatically, must be triggered on demand. -They allow full tag expressions, but some tags are enforced (such as MRR). - -There are jobs available for multiple types of testbeds, -based on various processors. -Their Gerrit triggers words are of the form "perftest-{node_arch}" -where the node_arch combinations currently supported are: -2n-clx, 2n-tx2, 2n-zn2, 3n-tsh. 
-
-## Test selection
-
-A Gerrit trigger line without any additional arguments selects
-a small set of test cases to run.
-If additional arguments are added to the Gerrit trigger, they are treated
-as Robot tag expressions to select tests to run.
-While very flexible, this method of test selection also allows the user
-to accidentally select too many tests, blocking the testbed for days.
-
-What follows is a list of explanations and recommendations
-to help users select the minimal set of test cases.
-
-### Verify cycles
-
-When Gerrit schedules multiple jobs to run for the same patch set,
-it waits until all runs are complete.
-While it is waiting, it is possible to trigger more jobs
-(adding runs to the set Gerrit is waiting for), but it is not possible
-to trigger more runs for the same job until Gerrit is done waiting.
-After Gerrit is done waiting, it becomes possible to trigger
-the same job again.
-
-Example: A user triggers one set of tests on 2n-icx and immediately
-also triggers another set of tests on 3n-icx. Then the user notices
-the 2n-icx run ended early because of a typo in the tag expression.
-When the user tries to re-trigger 2n-icx (with the fixed tag expression),
-that comment gets ignored by Jenkins.
-Only when the 3n-icx job finishes can the user trigger 2n-icx.
-
-### One comment many jobs
-
-In the past, the CSIT code which parses for perftest trigger comments
-was buggy, which led to bad behavior (such as selecting all performance tests,
-because "perftest" is also a Robot tag) when a user included multiple
-perftest trigger words in the same comment.
-
-The worst bugs were fixed since then, but it is still recommended
-to use just one trigger word per Gerrit comment, just to be safe.
-
-### Multiple test cases in run
-
-While Robot supports the OR operator, it does not support parentheses,
-so the OR operator is not very useful. It is recommended
-to use a space instead of the OR operator.
-
-Example template:
-perftest-2n-icx {tag_expression_1} {tag_expression_2}
-
-See below for more concrete examples.
-
-### Suite tags
-
-Traditionally, CSIT maintains broad Robot tags that can be used to select tests.
-
-But it is not recommended to use them for test selection,
-as it is not that easy to determine how many test cases are selected.
-
-The recommended way is to look into the CSIT repository first,
-and locate a specific suite the user is interested in,
-and use its suite tag. For example, "ethip4-ip4base" is a suite tag
-selecting just one suite in the CSIT git repository,
-avoiding all scale, container, and other similar variants.
-
-Note that CSIT uses the "autogen" code generator,
-so the robot running in Jenkins has access to more suites
-than are visible just by looking into the CSIT git repository,
-so a suite tag is not enough to select even the intended suite,
-and the user still probably wants to narrow down
-to a single test case within a suite.
-
-### Fully specified tag expressions
-
-Here is one template to select a single test case:
-{test_type}AND{nic_model}AND{nic_driver}AND{cores}AND{frame_size}AND{suite_tag}
-where the variables are all lower case (so the AND operator stands out).
-
-Currently only one test type is supported by the performance comparison jobs:
-"mrr".
-The nic_driver options depend on nic_model. For Intel cards "drv_avf" (AVF plugin)
-and "drv_vfio_pci" (DPDK plugin) are popular, for Mellanox "drv_rdma_core".
-Currently, the performance using "drv_af_xdp" is not reliable enough, so do not use it
-unless you are specifically testing for AF_XDP.
-
-The most popular nic_model is "nic_intel-xxv710", but that is not available
-on all testbed types.
-It is safe to use "1c" for cores (unless you suspect multi-core performance
-is affected differently) and "64b" for frame size ("78b" for ip6
-and more for dot1q and other encapsulated traffic;
-"1518b" is popular for ipsec and other payload-bound tests).
-
-As there are more test cases than CSIT can periodically test,
-it is possible to encounter an old test case that currently fails.
-To avoid that, you can look at "job spec" files we use for periodic testing,
-for example
-[this one](https://github.com/FDio/csit/blob/master/resources/job_specs/report_iterative/2n-icx/vpp-mrr-00.md).
-
-### Shortening triggers
-
-Advanced users may use the following tricks to avoid writing long trigger comments.
-
-Robot supports glob matching, which can be used to select multiple suite tags at once.
-
-Not specifying one of 6 parts of the recommended expression pattern
-will select all available options. For example not specifying nic_driver
-for nic_intel-xxv710 will select all 3 applicable drivers.
-You can use the NOT operator to reject some options (e.g. NOTdrv_af_xdp),
-but beware, with NOT the order matters:
-tag1ANDtag2NOTtag3 is not the same as tag1NOTtag3ANDtag2,
-the latter is evaluated as tag1AND(NOT(tag3ANDtag2)).
-
-Beware when not specifying nic_model. As a precaution,
-CSIT code will insert the default NIC model for the testbed used.
-Example: Specifying drv_rdma_core without specifying nic_model
-will fail, as the default nic_model is nic_intel-xxv710
-which does not support the RDMA core driver.
-
-### Complete example
-
-A user wants to test a VPP change which may affect load balancing with bonding.
-Searching tag documentation for "bonding" finds the LBOND tag and its variants.
-Searching the CSIT git repository (directory tests/) finds 8 suite files,
-all suited only for 3-node testbeds.
-All suites are using vhost, but differ by the forwarding app inside the VM
-(DPDK or VPP), by the forwarding mode of VPP acting as a host-level vswitch
-(MAC learning or cross connect), and by the number of DUT1-DUT2 links
-available (1 or 2).
-
-As not all NICs and testbeds offer enough ports for 2 parallel DUT-DUT links,
-the user looks at
-[testbed specifications](https://github.com/FDio/csit/tree/master/topologies/available)
-and finds that only the xxv710 NIC on the 3n-icx testbed matches the requirements.
-A quick look into the suites confirms the smallest frame size is 64 bytes
-(despite the DOT1Q robot tag, as the encapsulation does not happen on TG-DUT links).
-It is ok to use just 1 physical core, as 3n-icx has hyperthreading enabled,
-so the VPP vswitch will use 2 worker threads.
-
-The user decides the vswitch forwarding mode is not important
-(so chooses cross connect as that has less CPU overhead),
-but wants to test both NIC drivers (not AF_XDP), both apps in the VM,
-and both 1 and 2 parallel links.
-
-After shortening, this is the trigger comment finally used:
-perftest-3n-icx mrrANDnic_intel-x710AND1cAND64bAND?lbvpplacp-dot1q-l2xcbase-eth-2vhostvr1024-1vm*NOTdrv_af_xdp
-
-## Basic operation
-
-The job builds VPP .deb packages for both the patch under test
-(called "current") and its parent patch (called "parent").
-
-For each test (from a set defined by the tag expression),
-both builds are subjected to several trial measurements (BMRR).
-Measured samples are grouped into a "parent" sequence,
-followed by a "current" sequence. The same Minimal Description Length
-algorithm as in trending is used to decide whether it is one big group,
-or two smaller groups. If it is one group, a "normal" result
-is declared for the test. If it is two groups, and the current average
-is less than the parent average, the test is declared a regression.
-If it is two groups and the current average is larger or equal,
-the test is declared a progression.
-
-The whole job fails (giving -1) if some trial measurement failed,
-or if any test was declared a regression.
-
-## Temporary specifics
-
-The Minimal Description Length analysis is performed by
-CSIT code equivalent to the jumpavg-0.1.3 library available on PyPI.
-
-In hopes of strengthening the signal (code performance) compared to noise
-(all other factors influencing the measured values), several workarounds
-are applied.
-
-In contrast to trending, the trial duration is set to 10 seconds,
-and only 5 samples are measured for each build.
-Both parameters are set in ci-management.
-
-This decreases sensitivity to regressions, but also decreases
-the probability of false positives.
-
-## Console output
-
-The following information is visible towards the end of the Jenkins console
-output, repeated for each analyzed test.
-
-The original 5 values are visible in the order they were measured.
-The 5 values after processing are also visible in the output,
-this time sorted by value (so people can see minimum and maximum).
-
-The next output is the difference of averages. It is the current average
-minus the parent average, expressed as a percentage of the parent average.
-
-The next three outputs contain the jumpavg representation
-of the two groups and a combined group.
-Here, "bits" is the description length; for the "current" sequence
-it includes the effect of the "parent" average value
-(jumpavg-0.1.3 penalizes sequences with too close averages).
-
-Next, a sentence describing which grouping description is shorter,
-and by how many bits.
-Finally, the test result classification is visible.
-
-The algorithm does not track test case names,
-so test cases are indexed (from 0).
diff --git a/docs/content/methodology/suite_generation.md b/docs/content/methodology/suite_generation.md
deleted file mode 100644
index 4fa9dee0ce..0000000000
--- a/docs/content/methodology/suite_generation.md
+++ /dev/null
@@ -1,124 +0,0 @@
----
-title: "Suite Generation"
-weight: 19
----
-
-# Suite Generation
-
-CSIT uses robot suite files to define tests.
-However, not all suite files available for Jenkins jobs
-(or manually started bootstrap scripts) are present in the CSIT git repository.
-They are generated only when needed.
-
-## Autogen Library
-
-There is a code generation layer implemented as a Python library called "autogen",
-called by various bash scripts.
-
-It generates the full extent of CSIT suites, using the ones in git as templates.
-
-## Sources
-
-The generated suites (and their contents) are affected by multiple information
-sources, listed below.
-
-### Git Suites
-
-The suites present in the git repository act as templates for generating suites.
-One of autogen's design principles is that any template suite should also act
-as a full suite (no placeholders).
-
-In practice, autogen always re-creates the template suite with exactly
-the same content; this is one of the checks that autogen works correctly.
-
-### Regenerate Script
-
-Not all suites present in the CSIT git repository act as templates for autogen.
-The distinction is on a per-directory level. Directories with
-a regenerate_testcases.py script usually consider all suites as templates
-(unless possibly not included by the glob pattern in the script).
-
-The script also specifies the minimal frame size, indirectly, by specifying
-the protocol (protocol "ip4" is the default, leading to a 64B frame size).
-
-### Constants
-
-Values in Constants.py are taken into consideration when generating suites.
-The values are mostly related to different NIC models and NIC drivers.
-
-### Python Code
-
-Python code in resources/libraries/python/autogen contains several other
-information sources.
-
-#### Testcase Templates
-
-The test case part of a template suite is ignored; test case lines
-are created according to the text templates in the Testcase.py file.
-
-#### Testcase Argument Lists
-
-Each testcase template has a different number of "arguments", e.g. values
-to put into various placeholders. Different test types need different
-lists of the argument values; the lists are in the regenerate_glob method
-in the Regenerator.py file.
-
-#### Iteration Over Values
-
-Python code detects the test type (usually by substrings of the suite file name),
-then iterates over different quantities based on the type.
-For example, only ndrpdr suite templates generate other types (mrr and soak).
-
-#### Hardcoded Exclusions
-
-Some combinations of values are known not to work, so they are excluded.
-Examples: Density tests for too many CPUs; IMIX for ASTF.
-
-## Non-Sources
-
-Some information sources are available in CSIT repository,
-but do not affect the suites generated by autogen.
-
-### Testbeds
-
-Overall, no information visible in topology yaml files is taken into account
-by autogen.
-
-#### Testbed Architecture
-
-Historically, suite files are agnostic to testbed architecture, e.g. ICX or ALT.
-
-#### Testbed Size
-
-Historically, 2-node and 3-node suites have different names, and while
-most of the code is common, the differences are not always simple enough.
-Autogen treats 2-node and 3-node suites as independent templates.
-
-TRex suites are intended for a 1-node circuit of otherwise 2-node or 3-node
-testbeds, so they support all 3 robot tags.
-They are also detected and treated differently by autogen,
-mainly because they need different testcase arguments (no CPU count).
-Autogen does nothing specifically related to the fact they should run
-only in testbeds/NICs with a TG-TG line available.
-
-#### Other Topology Info
-
-Some bonding tests need two (parallel) links between DUTs.
-Autogen does not care, as suites are agnostic.
-A Robot tag marks the difference, but the link presence is not explicitly checked.
-
-### Job specs
-
-Information in job spec files depends on generated suites (not the other way).
-Autogen should generate more suites, as the job spec is limited by a time budget.
-More suites should be available for manually triggered verify jobs,
-so autogen covers that.
-
-### Bootstrap Scripts
-
-Historically, bootstrap scripts perform some logic,
-perhaps adding exclusion options to the Robot invocation
-(e.g. skipping testbed+NIC combinations for tests that need parallel links).
-
-Once again, the logic here relies on what autogen generates;
-autogen does not look into bootstrap scripts.
diff --git a/docs/content/methodology/telemetry.md b/docs/content/methodology/telemetry.md
deleted file mode 100644
index e7a2571573..0000000000
--- a/docs/content/methodology/telemetry.md
+++ /dev/null
@@ -1,167 +0,0 @@
----
-title: "Telemetry"
-weight: 20
----
-
-# Telemetry
-
-OpenMetrics specifies the de-facto standard for transmitting cloud-native
-metrics at scale, with support for both text representation and Protocol
-Buffers.
-
-## RFC
-
-- RFC2119
-- RFC5234
-- RFC8174
-- draft-richih-opsawg-openmetrics-00
-
-## Reference
-
-[OpenMetrics](https://github.com/OpenObservability/OpenMetrics/blob/master/specification/OpenMetrics.md)
-
-## Metric Types
-
-- Gauge
-- Counter
-- StateSet
-- Info
-- Histogram
-- GaugeHistogram
-- Summary
-- Unknown
-
-The telemetry module in CSIT currently supports only Gauge, Counter and Info.
-
-## Anatomy of CSIT telemetry implementation
-
-The existing implementation consists of several measurement building blocks:
-the main measuring block running search algorithms (MLR, PLR, SOAK, MRR, ...),
-the latency measuring block, and several telemetry blocks with or without
-traffic running in the background.
-
-The main measuring block must not be interrupted by any read operation that can
-impact data plane traffic processing during the throughput search algorithm. Thus
-operational reads are done before (pre-stat) and after (post-stat) that block.
-
-Some operational reads must be done while traffic is running and usually
-consist of two reads (pre-run-stat, post-run-stat) with a defined delay between
-them.
-
-## MRR measurement
-
- traffic_start(r=mrr) traffic_stop |< measure >|
- | | | (r=mrr) |
- | pre_run_stat post_run_stat | pre_stat | | post_stat
- | | | | | | | |
- --o--------o---------------o---------o-------o--------+-------------------+------o------------>
- t
-
- Legend:
- - pre_run_stat
- - vpp-clear-runtime
- - post_run_stat
- - vpp-show-runtime
- - bash-perf-stat // if extended_debug == True
- - pre_stat
- - vpp-clear-stats
- - vpp-enable-packettrace // if extended_debug == True
- - vpp-enable-elog
- - post_stat
- - vpp-show-stats
- - vpp-show-packettrace // if extended_debug == True
- - vpp-show-elog
-
-
- |< measure >|
- | (r=mrr) |
- | |
- |< traffic_trial0 >|< traffic_trial1 >|< traffic_trialN >|
- | (i=0,t=duration) | (i=1,t=duration) | (i=N,t=duration) |
- | | | |
- --o------------------------o------------------------o------------------------o--->
- t
-
-
-## MLR measurement
-
- |< measure >| traffic_start(r=pdr) traffic_stop traffic_start(r=ndr) traffic_stop |< [ latency ] >|
- | (r=mlr) | | | | | | .9/.5/.1/.0 |
- | | | pre_run_stat post_run_stat | | pre_run_stat post_run_stat | | |
- | | | | | | | | | | | |
- --+-------------------+----o--------o---------------o---------o--------------o--------o---------------o---------o------------[---------------------]--->
- t
-
- Legend:
- - pre_run_stat
- - vpp-clear-runtime
- - post_run_stat
- - vpp-show-runtime
- - bash-perf-stat // if extended_debug == True
- - pre_stat
- - vpp-clear-stats
- - vpp-enable-packettrace // if extended_debug == True
- - vpp-enable-elog
- - post_stat
- - vpp-show-stats
- - vpp-show-packettrace // if extended_debug == True
- - vpp-show-elog
-
-
-## MRR measurement
-
- traffic_start(r=mrr) traffic_stop |< measure >|
- | | | (r=mrr) |
- | |< stat_runtime >| | stat_pre_trial | | stat_post_trial
- | | | | | | | |
- ----o---+--------------------------+---o-------------o------------+-------------------+-----o------------->
- t
-
- Legend:
- - stat_runtime
- - vpp-runtime
- - stat_pre_trial
- - vpp-clear-stats
- - vpp-enable-packettrace // if extended_debug == True
- - stat_post_trial
- - vpp-show-stats
- - vpp-show-packettrace // if extended_debug == True
-
-
- |< measure >|
- | (r=mrr) |
- | |
- |< traffic_trial0 >|< traffic_trial1 >|< traffic_trialN >|
- | (i=0,t=duration) | (i=1,t=duration) | (i=N,t=duration) |
- | | | |
- --o------------------------o------------------------o------------------------o--->
- t
-
-
- |< stat_runtime >|
- | |
- |< program0 >|< program1 >|< programN >|
- | (@=params) | (@=params) | (@=params) |
- | | | |
- --o------------------------o------------------------o------------------------o--->
- t
-
-
-## MLR measurement
-
- |< measure >| traffic_start(r=pdr) traffic_stop traffic_start(r=ndr) traffic_stop |< [ latency ] >|
- | (r=mlr) | | | | | | .9/.5/.1/.0 |
- | | | |< stat_runtime >| | | |< stat_runtime >| | | |
- | | | | | | | | | | | |
- --+-------------------+-----o---+--------------------------+---o--------------o---+--------------------------+---o-----------[---------------------]--->
- t
-
- Legend:
- - stat_runtime
- - vpp-runtime
- - stat_pre_trial
- - vpp-clear-stats
- - vpp-enable-packettrace // if extended_debug == True
- - stat_post_trial
- - vpp-show-stats
- - vpp-show-packettrace // if extended_debug == True
diff --git a/docs/content/methodology/terminology.md b/docs/content/methodology/terminology.md
deleted file mode 100644
index 229db7d145..0000000000
--- a/docs/content/methodology/terminology.md
+++ /dev/null
@@ -1,82 +0,0 @@
----
-title: "Terminology"
-weight: 1
----
-
-# Terminology
-
-- **Frame size**: size of an Ethernet Layer-2 frame on the wire, including
- any VLAN tags (dot1q, dot1ad) and Ethernet FCS, but excluding Ethernet
- preamble and inter-frame gap. Measured in Bytes.
-- **Packet size**: same as frame size, both terms used interchangeably.
-- **Inner L2 size**: for tunneled L2 frames only, size of an encapsulated
- Ethernet Layer-2 frame, preceded with tunnel header, and followed by
- tunnel trailer. Measured in Bytes.
-- **Inner IP size**: for tunneled IP packets only, size of an encapsulated
- IPv4 or IPv6 packet, preceded with tunnel header, and followed by
- tunnel trailer. Measured in Bytes.
-- **Device Under Test (DUT)**: In software networking, "device" denotes a
- specific piece of software tasked with packet processing. Such a device
- is surrounded by other software components (such as the operating system
- kernel). It is not possible to run devices without also running the
- other components, and hardware resources are shared between both. For
- purposes of testing, the whole set of hardware and software components
- is called "System Under Test" (SUT). As the SUT is the part of the whole
- test setup whose performance can be measured with RFC2544, this document
- uses SUT instead of the RFC2544 DUT. The device under test
- (DUT) can be re-introduced when analyzing test results using whitebox
- techniques, but this document sticks to blackbox testing.
-- **System Under Test (SUT)**: System under test (SUT) is a part of the
- whole test setup whose performance is to be benchmarked. The complete
- methodology contains other parts, whose performance is either already
- established, or not affecting the benchmarking result.
-- **Bi-directional throughput tests**: involve packets/frames flowing in
- both east-west and west-east directions over every tested interface of
- SUT/DUT. Packet flow metrics are measured per direction, and can be
- reported as aggregate for both directions (i.e. throughput) and/or
- separately for each measured direction (i.e. latency). In most cases
- bi-directional tests use the same (symmetric) load in both directions.
-- **Uni-directional throughput tests**: involve packets/frames flowing in
- only one direction, i.e. either east-west or west-east direction, over
- every tested interface of SUT/DUT. Packet flow metrics are measured
- and are reported for measured direction.
-- **Packet Loss Ratio (PLR)**: ratio of packets lost relative to packets
- transmitted over the test trial duration, calculated using formula:
- PLR = ( pkts_transmitted - pkts_received ) / pkts_transmitted.
- For bi-directional throughput tests aggregate PLR is calculated based
- on the aggregate number of packets transmitted and received.
-- **Packet Throughput Rate**: maximum packet offered load DUT/SUT forwards
- within the specified Packet Loss Ratio (PLR). In many cases the rate
- depends on the frame size processed by DUT/SUT. Hence packet
- throughput rate MUST be quoted with specific frame size as received by
- DUT/SUT during the measurement. For bi-directional tests, packet
- throughput rate should be reported as aggregate for both directions.
- Measured in packets-per-second (pps) or frames-per-second (fps),
- equivalent metrics.
-- **Bandwidth Throughput Rate**: a secondary metric calculated from packet
- throughput rate using formula: bw_rate = pkt_rate * (frame_size +
- L1_overhead) * 8, where L1_overhead for Ethernet includes preamble (8
- Bytes) and inter-frame gap (12 Bytes). For bi-directional tests,
- bandwidth throughput rate should be reported as aggregate for both
- directions. Expressed in bits-per-second (bps).
-- **Non Drop Rate (NDR)**: maximum packet/bandwidth throughput rate sustained
- by DUT/SUT at a PLR equal to zero (zero packet loss) specific to tested
- frame size(s). MUST be quoted with specific packet size as received by
- DUT/SUT during the measurement. Packet NDR measured in
- packets-per-second (or fps), bandwidth NDR expressed in
- bits-per-second (bps).
-- **Partial Drop Rate (PDR)**: maximum packet/bandwidth throughput rate
- sustained by DUT/SUT at a PLR greater than zero (non-zero packet loss)
- specific to tested frame size(s). MUST be quoted with specific packet
- size as received by DUT/SUT during the measurement. Packet PDR
- measured in packets-per-second (or fps), bandwidth PDR expressed in
- bits-per-second (bps).
-- **Maximum Receive Rate (MRR)**: packet/bandwidth rate regardless of PLR
- sustained by DUT/SUT under specified Maximum Transmit Rate (MTR)
- packet load offered by traffic generator. MUST be quoted with both
- specific packet size and MTR as received by DUT/SUT during the
- measurement. Packet MRR measured in packets-per-second (or fps),
- bandwidth MRR expressed in bits-per-second (bps).
-- **Trial**: a single measurement step.
-- **Trial duration**: amount of time over which packets are transmitted and
- received in a single measurement step.
diff --git a/docs/content/methodology/test/_index.md b/docs/content/methodology/test/_index.md
new file mode 100644
index 0000000000..857cc7b168
--- /dev/null
+++ b/docs/content/methodology/test/_index.md
@@ -0,0 +1,6 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Test"
+weight: 3
+---
diff --git a/docs/content/methodology/test/access_control_lists.md b/docs/content/methodology/test/access_control_lists.md
new file mode 100644
index 0000000000..354e6b72bb
--- /dev/null
+++ b/docs/content/methodology/test/access_control_lists.md
@@ -0,0 +1,66 @@
+---
+title: "Access Control Lists"
+weight: 5
+---
+
+# Access Control Lists
+
+VPP is tested in a number of data plane feature configurations across
+different forwarding modes. The following sections list the tested features.
+
+## ACL Security-Groups
+
+Both stateless and stateful access control lists (ACL), also known as
+security-groups, are supported by VPP.
+
+The following ACL configurations are tested for MAC switching with L2
+bridge-domains:
+
+- *l2bdbasemaclrn-iacl{E}sl-{F}flows*: Input stateless ACL, with {E}
+  entries and {F} flows.
+- *l2bdbasemaclrn-oacl{E}sl-{F}flows*: Output stateless ACL, with {E}
+  entries and {F} flows.
+- *l2bdbasemaclrn-iacl{E}sf-{F}flows*: Input stateful ACL, with {E}
+  entries and {F} flows.
+- *l2bdbasemaclrn-oacl{E}sf-{F}flows*: Output stateful ACL, with {E}
+  entries and {F} flows.
+
+The following ACL configurations are tested with IPv4 routing:
+
+- *ip4base-iacl{E}sl-{F}flows*: Input stateless ACL, with {E} entries
+  and {F} flows.
+- *ip4base-oacl{E}sl-{F}flows*: Output stateless ACL, with {E} entries
+  and {F} flows.
+- *ip4base-iacl{E}sf-{F}flows*: Input stateful ACL, with {E} entries and
+  {F} flows.
+- *ip4base-oacl{E}sf-{F}flows*: Output stateful ACL, with {E} entries
+  and {F} flows.
+
+ACL tests are executed with the following combinations of ACL entries
+and number of flows:
+
+- ACL entry definitions
+  - flow non-matching deny entry: (src-ip4, dst-ip4, src-port, dst-port).
+  - flow matching permit ACL entry: (src-ip4, dst-ip4).
+- {E} - number of non-matching deny ACL entries, {E} = [1, 10, 50].
+- {F} - number of UDP flows with different tuple (src-ip4, dst-ip4,
+  src-port, dst-port), {F} = [100, 10k, 100k].
+- All {E}x{F} combinations are tested per ACL type, total of 9.
+
+## ACL MAC-IP
+
+MAC-IP binding ACLs are tested for MAC switching with L2 bridge-domains:
+
+- *l2bdbasemaclrn-macip-iacl{E}sl-{F}flows*: Input stateless ACL, with
+  {E} entries and {F} flows.
+
+MAC-IP ACL tests are executed with the following combinations of ACL
+entries and number of flows:
+
+- ACL entry definitions
+  - flow non-matching deny entry: (dst-ip4, dst-mac, bit-mask).
+  - flow matching permit ACL entry: (dst-ip4, dst-mac, bit-mask).
+- {E} - number of non-matching deny ACL entries, {E} = [1, 10, 50].
+- {F} - number of UDP flows with different tuple (dst-ip4, dst-mac),
+  {F} = [100, 10k, 100k].
+- All {E}x{F} combinations are tested per ACL type, a total of 9
+  (see the sketch below).
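+
+To make the {E}x{F} matrix concrete, the following Python sketch (a
+hypothetical helper, not part of CSIT code) expands the values above into
+CSIT-style test case name suffixes; the MAC-IP names follow the same pattern
+with a macip- infix:
+
+    # Hypothetical helper: enumerate the 9 {E}x{F} ACL combinations
+    # described above into CSIT-style test case name suffixes.
+    from itertools import product
+
+    ENTRIES = [1, 10, 50]           # {E}, non-matching deny ACL entries
+    FLOWS = [100, 10_000, 100_000]  # {F}, UDP flows
+
+    def acl_suffixes(base="l2bdbasemaclrn", direction="i", state="sl"):
+        """Yield suffixes like 'l2bdbasemaclrn-iacl50sl-100kflows'."""
+        for e, f in product(ENTRIES, FLOWS):
+            flows = f"{f // 1000}k" if f >= 1000 else str(f)
+            yield f"{base}-{direction}acl{e}{state}-{flows}flows"
+
+    print(list(acl_suffixes()))  # 9 combinations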
diff --git a/docs/content/methodology/test/generic_segmentation_offload.md b/docs/content/methodology/test/generic_segmentation_offload.md
new file mode 100644
index 0000000000..0032d203de
--- /dev/null
+++ b/docs/content/methodology/test/generic_segmentation_offload.md
@@ -0,0 +1,117 @@
+---
+title: "Generic Segmentation Offload"
+weight: 7
+---
+
+# Generic Segmentation Offload
+
+## Overview
+
+Generic Segmentation Offload (GSO) reduces per-packet processing
+overhead by enabling applications to pass a multi-packet buffer to
+the (v)NIC and process a smaller number of large packets (e.g. frame size of
+64 KB), instead of processing higher numbers of small packets (e.g.
+frame size of 1500 B).
+
+GSO is tested on VPP vhostuser and tapv2 interfaces. All test cases use the
+iPerf3 client and server applications running TCP/IP as a traffic generator.
+For performance comparison, the same tests are run without GSO enabled.
+
+## GSO Test Topologies
+
+Two VPP GSO test topologies are implemented:
+
+1. iPerfC_GSOvirtio_LinuxVM --- GSOvhost_VPP_GSOvhost --- iPerfS_GSOvirtio_LinuxVM
+   - Tests VPP GSO on vhostuser interfaces and interaction with Linux
+     virtio with GSO enabled.
+2. iPerfC_GSOtap_LinuxNspace --- GSOtapv2_VPP_GSOtapv2 --- iPerfS_GSOtap_LinuxNspace
+   - Tests VPP GSO on tapv2 interfaces and interaction with Linux tap
+     with GSO enabled.
+
+Common configuration:
+
+- iPerfC (client) and iPerfS (server) run in TCP/IP mode without an upper
+  bandwidth limit.
+- Trial duration is set to 30 sec.
+- iPerfC, iPerfS and VPP run in a single SUT node.
+
+
+## VPP GSOtap Topology
+
+### VPP Configuration
+
+VPP GSOtap tests are executed without using hyperthreading. The VPP worker
+runs on a single core. Multi-core tests are not executed. Each interface
+belongs to a separate namespace. The following core pinning scheme is used:
+
+- 1t1c (rxq=1, rx_qsz=4096, tx_qsz=4096)
+  - system isolated: 0,28,56,84
+  - vpp mt: 1
+  - vpp wt: 2
+  - vhost: 3-5
+  - iperf-s: 6
+  - iperf-c: 7
+
+### iPerf3 Server Configuration
+
+iPerf3 version used: 3.7
+
+    $ sudo -E -S ip netns exec tap1_namespace iperf3 \
+        --server --daemon --pidfile /tmp/iperf3_server.pid \
+        --logfile /tmp/iperf3.log --port 5201 --affinity X
+
+For the full iPerf3 reference please see
+[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
+
+
+### iPerf3 Client Configuration
+
+iPerf3 version used: 3.7
+
+    $ sudo -E -S ip netns exec tap1_namespace iperf3 \
+        --client 2.2.2.2 --bind 1.1.1.1 --port 5201 --parallel Y \
+        --time 30.0 --affinity X --zerocopy
+
+For the full iPerf3 reference please see
+[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
+
+
+## VPP GSOvhost Topology
+
+### VPP Configuration
+
+VPP GSOvhost tests are executed without using hyperthreading. The VPP worker
+runs on a single core. Multi-core tests are not executed. The following core
+pinning scheme is used:
+
+- 1t1c (rxq=1, rx_qsz=1024, tx_qsz=1024)
+  - system isolated: 0,28,56,84
+  - vpp mt: 1
+  - vpp wt: 2
+  - vm-iperf-s: 3,4,5,6,7
+  - vm-iperf-c: 8,9,10,11,12
+  - iperf-s: 1
+  - iperf-c: 1
+
+### iPerf3 Server Configuration
+
+iPerf3 version used: 3.7
+
+    $ sudo iperf3 \
+        --server --daemon --pidfile /tmp/iperf3_server.pid \
+        --logfile /tmp/iperf3.log --port 5201 --affinity X
+
+For the full iPerf3 reference please see
+[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
+
+
+### iPerf3 Client Configuration
+
+iPerf3 version used: 3.7
+
+    $ sudo iperf3 \
+        --client 2.2.2.2 --bind 1.1.1.1 --port 5201 --parallel Y \
+        --time 30.0 --affinity X --zerocopy
+
+For the full iPerf3 reference please see
+[iPerf3 docs](https://github.com/esnet/iperf/blob/master/docs/invoking.rst).
diff --git a/docs/content/methodology/test/hoststack/_index.md b/docs/content/methodology/test/hoststack/_index.md
new file mode 100644
index 0000000000..2ae872c54e
--- /dev/null
+++ b/docs/content/methodology/test/hoststack/_index.md
@@ -0,0 +1,6 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Hoststack"
+weight: 6
+---
diff --git a/docs/content/methodology/test/hoststack/quicudpip_with_vppecho.md b/docs/content/methodology/test/hoststack/quicudpip_with_vppecho.md
new file mode 100644
index 0000000000..c7d57a51b3
--- /dev/null
+++ b/docs/content/methodology/test/hoststack/quicudpip_with_vppecho.md
@@ -0,0 +1,48 @@
+---
+title: "QUIC/UDP/IP with vpp_echo"
+weight: 1
+---
+
+# QUIC/UDP/IP with vpp_echo
+
+[vpp_echo performance testing tool](https://wiki.fd.io/view/VPP/HostStack#External_Echo_Server.2FClient_.28vpp_echo.29)
+is a bespoke performance test application which utilizes the 'native
+HostStack APIs' to verify performance and correct handling of
+connection/stream events with uni-directional and bi-directional
+streams of data.
+
+Because iperf3 does not support the QUIC transport protocol, vpp_echo
+is used for measuring the maximum attainable goodput of the VPP Host
+Stack connection utilizing the QUIC transport protocol across two
+instances of VPP running on separate DUT nodes. The QUIC transport
+protocol supports multiple streams per connection and test cases
+utilize different combinations of QUIC connections and number of
+streams per connection.
+
+The test configuration is as follows:
+
+    DUT1               Network                DUT2
+    [ vpp_echo-client -> VPP1 ]=======[ VPP2 -> vpp_echo-server]
+        N-streams/connection
+
+where,
+
+1. vpp_echo server attaches to VPP2 and LISTENs on VPP2:TCP port 1234.
+2. vpp_echo client creates one or more connections to VPP1 and opens
+   one or more streams per connection to VPP2:TCP port 1234.
+3. vpp_echo client transmits a uni-directional stream as fast as the
+   VPP Host Stack allows to the vpp_echo server for the test duration.
+4. At the end of the test the vpp_echo client emits the goodput
+   measurements for all streams and the sum of all streams.
+
+Test cases include:
+
+1. 1 QUIC Connection with 1 Stream
+2. 1 QUIC connection with 10 Streams
+3. 10 QUIC connections with 1 Stream
+4. 10 QUIC connections with 10 Streams
+
+with stream sizes to provide reasonable test durations. The VPP Host
+Stack QUIC transport is configured to utilize the picotls encryption
+library. In the future, tests utilizing additional encryption
+algorithms will be added.
diff --git a/docs/content/methodology/test/hoststack/tcpip_with_iperf3.md b/docs/content/methodology/test/hoststack/tcpip_with_iperf3.md
new file mode 100644
index 0000000000..7baa88ab50
--- /dev/null
+++ b/docs/content/methodology/test/hoststack/tcpip_with_iperf3.md
@@ -0,0 +1,52 @@
+---
+title: "TCP/IP with iperf3"
+weight: 2
+---
+
+# TCP/IP with iperf3
+
+[iperf3 goodput measurement tool](https://github.com/esnet/iperf)
+is used for measuring the maximum attainable goodput of the VPP Host
+Stack connection across two instances of VPP running on separate DUT
+nodes. iperf3 is a popular open source tool for active measurements
+of the maximum achievable goodput on IP networks.
+
+Because iperf3 utilizes the POSIX socket interface APIs, the current
+test configuration utilizes the LD_PRELOAD mechanism of the Linux
+dynamic linker to connect iperf3 to the VPP Host Stack using the VPP
+Communications Library (VCL) LD_PRELOAD library (libvcl_ldpreload.so).
+
+In the future, a forked version of iperf3 which has been modified to
+directly use the VCL application APIs may be added to determine the
+difference in performance of 'VCL Native' applications versus utilizing
+LD_PRELOAD, which inherently has more overhead and other limitations.
+
+The test configuration is as follows:
+
+    DUT1               Network                DUT2
+    [ iperf3-client -> VPP1 ]=======[ VPP2 -> iperf3-server]
+
+where,
+
+1. iperf3 server attaches to VPP2 and LISTENs on VPP2:TCP port 5201.
+2. iperf3 client attaches to VPP1 and opens one or more stream
+   connections to VPP2:TCP port 5201.
+3. iperf3 client transmits a uni-directional stream as fast as the
+   VPP Host Stack allows to the iperf3 server for the test duration.
+4. At the end of the test the iperf3 client emits the goodput
+   measurements for all streams and the sum of all streams.
+
+Test cases include 1 and 10 Streams with a 20 second test duration
+with the VPP Host Stack configured to utilize the Cubic TCP
+congestion control algorithm.
+
+Note: iperf3 is single-threaded, so it is expected that the 10 stream
+test shows little or no performance improvement due to
+multi-thread/multi-core execution.
+
+There are also variations of these test cases which use the VPP Network
+Simulator (NSIM) plugin to test the VPP Hoststack goodput with 1 percent
+of the traffic being dropped at the output interface of VPP1, thereby
+simulating a lossy network. The NSIM tests are experimental and the
+test results are not currently representative of typical results in a
+lossy network.
diff --git a/docs/content/methodology/test/hoststack/udpip_with_iperf3.md b/docs/content/methodology/test/hoststack/udpip_with_iperf3.md
new file mode 100644
index 0000000000..01ddf61269
--- /dev/null
+++ b/docs/content/methodology/test/hoststack/udpip_with_iperf3.md
@@ -0,0 +1,44 @@
+---
+title: "UDP/IP with iperf3"
+weight: 3
+---
+
+# UDP/IP with iperf3
+
+[iperf3 goodput measurement tool](https://github.com/esnet/iperf)
+is used for measuring the maximum attainable goodput of the VPP Host
+Stack connection across two instances of VPP running on separate DUT
+nodes. iperf3 is a popular open source tool for active measurements
+of the maximum achievable goodput on IP networks.
+
+Because iperf3 utilizes the POSIX socket interface APIs, the current
+test configuration utilizes the LD_PRELOAD mechanism of the Linux
+dynamic linker to connect iperf3 to the VPP Host Stack using the VPP
+Communications Library (VCL) LD_PRELOAD library (libvcl_ldpreload.so).
+
+In the future, a forked version of iperf3 which has been modified to
+directly use the VCL application APIs may be added to determine the
+difference in performance of 'VCL Native' applications versus utilizing
+LD_PRELOAD, which inherently has more overhead and other limitations.
+
+The test configuration is as follows:
+
+    DUT1               Network                DUT2
+    [ iperf3-client -> VPP1 ]=======[ VPP2 -> iperf3-server]
+
+where,
+
+1. iperf3 server attaches to VPP2 and LISTENs on VPP2:UDP port 5201.
+2. iperf3 client attaches to VPP1 and transmits one or more streams
+   of packets to VPP2:UDP port 5201.
+3. iperf3 client transmits a uni-directional stream as fast as the
+   VPP Host Stack allows to the iperf3 server for the test duration.
+4. At the end of the test the iperf3 client emits the goodput
+   measurements for all streams and the sum of all streams.
+
+Test cases include 1 and 10 Streams with a 20 second test duration
+with the VPP Host Stack using the UDP transport layer.
+
+Note: iperf3 is single-threaded, so it is expected that the 10 stream
+test shows little or no performance improvement due to
+multi-thread/multi-core execution.
diff --git a/docs/content/methodology/test/hoststack/vsap_ab_with_nginx.md b/docs/content/methodology/test/hoststack/vsap_ab_with_nginx.md
new file mode 100644
index 0000000000..2dc4d2b7f9
--- /dev/null
+++ b/docs/content/methodology/test/hoststack/vsap_ab_with_nginx.md
@@ -0,0 +1,39 @@
+---
+title: "VSAP ab with nginx"
+weight: 4
+---
+
+# VSAP ab with nginx
+
+[VSAP (VPP Stack Acceleration Project)](https://wiki.fd.io/view/VSAP)
+aims to establish an industry user space application ecosystem based on
+the VPP hoststack. As a pre-requisite to adapting open source applications
+using the VPP Communications Library to accelerate performance, the VSAP team
+has introduced baseline tests utilizing the LD_PRELOAD mechanism to capture
+baseline performance data.
+
+[AB (Apache HTTP server benchmarking tool)](https://httpd.apache.org/docs/2.4/programs/ab.html)
+is used for measuring the maximum connections-per-second and requests-per-second.
+
+[NGINX](https://www.nginx.com) is a popular open source HTTP server
+application. Because NGINX utilizes the POSIX socket interface APIs, the test
+configuration uses the LD_PRELOAD mechanism to connect NGINX to the VPP
+Hoststack using the VPP Communications Library (VCL) LD_PRELOAD library
+(libvcl_ldpreload.so).
+
+In the future, a version of NGINX which has been modified to
+directly use the VCL application APIs will be added to determine the
+difference in performance of 'VCL Native' applications versus utilizing
+LD_PRELOAD, which inherently has more overhead and other limitations.
+
+The test configuration is as follows:
+
+    TG     Network      DUT
+    [ AB ]=============[ VPP -> nginx ]
+
+where,
+
+1. nginx attaches to VPP and listens on TCP port 80.
+2. ab runs CPS and RPS tests with packets flowing from the Test Generator node,
+   across 100G NICs, through VPP hoststack to NGINX.
+3. At the end of the tests, the results are reported by AB.
diff --git a/docs/content/methodology/test/internet_protocol_security.md b/docs/content/methodology/test/internet_protocol_security.md
new file mode 100644
index 0000000000..1a02c43a0a
--- /dev/null
+++ b/docs/content/methodology/test/internet_protocol_security.md
@@ -0,0 +1,73 @@
+---
+title: "Internet Protocol Security"
+weight: 4
+---
+
+# Internet Protocol Security
+
+VPP Internet Protocol Security (IPsec) performance tests are executed for the
+following crypto plugins:
+
+- `crypto_native`, used for software-based crypto leveraging CPU
+  platform optimizations, e.g. Intel's AES-NI instruction set.
+- `crypto_ipsecmb`, used for hardware-based crypto with Intel QAT PCIe cards.
+
+## IPsec with VPP Native SW Crypto
+
+CSIT implements the following IPsec test cases relying on VPP native crypto
+(`crypto_native` plugin):
+
+ **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
+----------------------:|-------------------:|------------------:|-----------------:
+ crypto_native | AES[128\|256]-GCM | GCM | 1 to 60k tunnels
+ crypto_native | AES128-CBC | SHA[256\|512] | 1 to 60k tunnels
+
+VPP IPsec tests with SW crypto are executed in both tunnel and policy modes,
+with tests running on 3-node testbeds: 3n-icx, 3n-tsh.
+
+## IPsec with Intel QAT HW
+
+CSIT implements the following IPsec test cases relying on the ipsecmb library
+(`crypto_ipsecmb` plugin) and Intel QAT 8950 (50G HW crypto card):
+
+ **VPP Crypto Engine** | **VPP Crypto Workers** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
+----------------------:|-----------------------:|-------------------:|------------------:|-----------------:
+ crypto_ipsecmb | sync/all workers | AES[128\|256]-GCM | GCM | 1, 1k tunnels
+ crypto_ipsecmb | sync/all workers | AES[128]-CBC | SHA[256\|512] | 1, 1k tunnels
+ crypto_ipsecmb | async/crypto worker | AES[128\|256]-GCM | GCM | 1, 4, 1k tunnels
+ crypto_ipsecmb | async/crypto worker | AES[128]-CBC | SHA[256\|512] | 1, 4, 1k tunnels
+
+## IPsec with Async Crypto Feature Workers
+
+*TODO Description to be added*
+
+## IPsec Uni-Directional Tests with VPP Native SW Crypto
+
+CSIT implements the following IPsec uni-directional test cases relying on VPP
+native crypto (`crypto_native` plugin) in tunnel mode:
+
+ **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
+----------------------:|-------------------:|------------------:|-------------------:
+ crypto_native | AES[128\|256]-GCM | GCM | 4, 1k, 10k tunnels
+ crypto_native | AES128-CBC | SHA[512] | 4, 1k, 10k tunnels
+
+In policy mode:
+
+ **VPP Crypto Engine** | **ESP Encryption** | **ESP Integrity** | **Scale Tested**
+----------------------:|-------------------:|------------------:|------------------:
+ crypto_native | AES[256]-GCM | GCM | 1, 40, 1k tunnels
+
+The tests run on 2-node testbeds: 2n-tx2. The uni-directional tests
+partially address a weakness in 2-node testbed setups with T-Rex as
+the traffic generator. With just one DUT node, we can either encrypt or decrypt
+traffic in each direction.
+
+The test cases only do encryption: packets are encrypted on the DUT and
+then arrive at the TG, where no additional packet processing is needed (just
+counting packets).
+
+Decryption would require that the traffic generator generated encrypted packets
+which the DUT would then decrypt. However, T-Rex does not have the capability
+to encrypt packets.
diff --git a/docs/content/methodology/test/network_address_translation.md b/docs/content/methodology/test/network_address_translation.md
new file mode 100644
index 0000000000..f443eabc5f
--- /dev/null
+++ b/docs/content/methodology/test/network_address_translation.md
@@ -0,0 +1,445 @@
+---
+title: "Network Address Translation"
+weight: 1
+---
+
+# Network Address Translation
+
+## NAT44 Prefix Bindings
+
+NAT44 prefix bindings should be representative of target applications,
+where a number of private IPv4 addresses from the range defined by
+RFC1918 are mapped to a smaller set of public IPv4 addresses from the
+public range.
+
+The following quantities are used to describe inside-to-outside IP address
+and port binding scenarios:
+
+- Inside-addresses, number of inside source addresses
+  (representing inside hosts).
+- Ports-per-inside-address, number of TCP/UDP source
+  ports per inside source address.
+- Outside-addresses, number of outside (public) source addresses
+  allocated to NAT44.
+- Ports-per-outside-address, number of TCP/UDP source
+  ports per outside source address. The maximal number of
+  ports-per-outside-address usable for NAT is 64 512
+  (in the non-reserved port range 1024-65535, RFC4787).
+- Sharing-ratio, equal to inside-addresses divided by outside-addresses.
+
+CSIT NAT44 tests are designed to take into account the maximum number of
+ports (sessions) required per inside host (inside-address) and at the
+same time to maximize the use of the outside-address range by using all
+available outside ports. With this in mind, the following scheme of
+NAT44 sharing ratios has been devised for use in CSIT:
+
+ **ports-per-inside-address** | **sharing-ratio**
+-----------------------------:|------------------:
+ 63 | 1024
+ 126 | 512
+ 252 | 256
+ 504 | 128
+
+Initial CSIT NAT44 tests, including associated TG/TRex traffic profiles,
+are based on ports-per-inside-address set to 63 and a sharing ratio of
+1024. This approach is currently used for all NAT44 tests, including
+NAT44det (NAT44 deterministic, used for Carrier Grade NAT applications)
+and NAT44ed (Endpoint Dependent).
+
+Private address ranges to be used in tests:
+
+- 192.168.0.0 - 192.168.255.255 (192.168/16 prefix)
+
+  - Total of 2^16 (65 536) usable IPv4 addresses.
+  - Used in tests for up to 65 536 inside addresses (inside hosts).
+
+- 172.16.0.0 - 172.31.255.255 (172.16/12 prefix)
+
+  - Total of 2^20 (1 048 576) usable IPv4 addresses.
+  - Used in tests for up to 1 048 576 inside addresses (inside hosts).
+
+### NAT44 Session Scale
+
+The NAT44 session scale tested is governed by the following logic:
+
+- Number of inside-addresses (hosts) H[i] = H[i-1] x 2^2, with H[0] = 1 024,
+  i = 1, 2, 3, ...
+
+  - H[i] = 1 024, 4 096, 16 384, 65 536, 262 144, ...
+
+- Number of sessions S[i] = H[i] * ports-per-inside-address
+
+  - ports-per-inside-address = 63
+
+ **i** | **hosts** | **sessions**
+------:|----------:|-------------:
+ 0 | 1 024 | 64 512
+ 1 | 4 096 | 258 048
+ 2 | 16 384 | 1 032 192
+ 3 | 65 536 | 4 128 768
+ 4 | 262 144 | 16 515 072
+
+### NAT44 Deterministic
+
+NAT44det performance tests use the TRex STL (Stateless) API and traffic
+profiles, similar to all other stateless packet forwarding tests like
+ip4, ip6 and l2, sending UDP packets in both directions,
+inside-to-outside and outside-to-inside.
+
+The inside-to-outside traffic uses a single destination address (20.0.0.0)
+and port (1024).
+The inside-to-outside traffic covers the whole inside address and port range;
+the outside-to-inside traffic covers the whole outside address and port range.
+
+NAT44det translation entries are created during the ramp-up phase,
+followed by verification that all entries are present,
+before proceeding to the main measurements of the test.
+This ensures session setup does not impact the forwarding performance test.
+
+Associated CSIT test cases use the following naming scheme to indicate
+the NAT44det scenario tested:
+
+- ethip4udp-nat44det-h{H}-p{P}-s{S}-[mrr|ndrpdr|soak]
+
+  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
+  - {P}, number of ports per inside host, P = 63.
+  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
+    16515072.
+  - [mrr|ndrpdr|soak], MRR, NDRPDR or SOAK test.
+
+### NAT44 Endpoint-Dependent
+
+In order to exercise NAT44ed's ability to translate based on both
+source and destination address and port, the inside-to-outside traffic
+also varies the destination address and port. The destination port is the same
+as the source port; the destination address has the same offset as the source
+address, but applied to a different subnet (starting with 20.0.0.0).
+
+As the mapping is not deterministic (for security reasons),
+we cannot easily use stateless bidirectional traffic profiles.
+The inside address and port range is fully covered,
+but we do not know which outside-to-inside source address and port to use
+to hit an open session.
+
+Therefore, NAT44ed is benchmarked using the following methodologies:
+
+- Unidirectional throughput using *stateless* traffic profile.
+- Connections-per-second (CPS) using *stateful* traffic profile.
+- Bidirectional throughput (TPUT, see below) using *stateful* traffic profile.
+
+Unidirectional NAT44ed throughput tests use TRex STL (Stateless)
+APIs and traffic profiles, but with packets sent only in the
+inside-to-outside direction.
+Similarly to NAT44det, NAT44ed unidirectional throughput tests include
+a ramp-up phase to establish and verify the presence of required NAT44ed
+binding entries. As the sessions have finite duration, the test code
+keeps inserting ramp-up trials during the search if it detects a risk
+of sessions timing out. Any zero loss trial visits all sessions,
+so it acts also as a ramp-up.
+
+Stateful NAT44ed tests use TRex ASTF (Advanced Stateful) APIs and
+traffic profiles, with packets sent in both directions. Tests are run
+with both UDP and TCP sessions.
+As NAT44ed CPS (connections-per-second) stateful tests
+also measure session opening performance,
+they use a state reset instead of a ramp-up trial.
+NAT44ed TPUT (bidirectional throughput) tests prepend ramp-up trials
+as in the unidirectional tests,
+so the test results describe performance without translation entry
+creation overhead.
+
+Associated CSIT test cases use the following naming scheme to indicate
+the NAT44ed cases tested:
+
+- Stateless: ethip4udp-nat44ed-h{H}-p{P}-s{S}-udir-[mrr|ndrpdr|soak]
+
+  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
+  - {P}, number of ports per inside host, P = 63.
+  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
+    16515072.
+  - udir-[mrr|ndrpdr|soak], unidirectional stateless tests MRR, NDRPDR
+    or SOAK.
+
+- Stateful: ethip4[udp|tcp]-nat44ed-h{H}-p{P}-s{S}-[cps|tput]-[mrr|ndrpdr|soak]
+
+  - [udp|tcp], UDP or TCP sessions.
+  - {H}, number of inside hosts, H = 1024, 4096, 16384, 65536, 262144.
+  - {P}, number of ports per inside host, P = 63.
+  - {S}, number of sessions, S = 64512, 258048, 1032192, 4128768,
+    16515072.
+  - [cps|tput], connections-per-second session establishment rate, or
+    packets-per-second average rate without session establishment
+    overhead.
+  - [mrr|ndrpdr|soak], bidirectional stateful tests MRR, NDRPDR, or SOAK.
+
+## Stateful traffic profiles
+
+There are several important details which distinguish ASTF profiles
+from stateless profiles.
+
+### General considerations
+
+#### Protocols
+
+ASTF profiles are limited to either the UDP or TCP protocol.
+
+#### Programs
+
+Each template in the profile defines two "programs", one for the client side
+and one for the server side.
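+
+For illustration, below is a minimal sketch of such a template pair in
+TRex's ASTF Python API, following the shape of TRex's published examples
+(class and method names are from the TRex API; the addresses, port and
+payload size are illustrative, not CSIT's actual profiles):
+
+    # Sketch: one UDP request-response transaction defined by a client
+    # program and a server program (values illustrative).
+    from trex.astf.api import (
+        ASTFProgram, ASTFIPGen, ASTFIPGenDist, ASTFIPGenGlobal,
+        ASTFTCPClientTemplate, ASTFTCPServerTemplate, ASTFAssociationRule,
+        ASTFTemplate, ASTFProfile)
+
+    prog_c = ASTFProgram(stream=False)  # stream=False selects UDP
+    prog_c.send_msg(100 * "x")          # client sends one 100 B message
+    prog_c.recv_msg(1)                  # waits for 1 packet (UDP counts packets)
+
+    prog_s = ASTFProgram(stream=False)  # server instance created on first packet
+    prog_s.recv_msg(1)
+    prog_s.send_msg(100 * "x")
+
+    ip_gen = ASTFIPGen(
+        glob=ASTFIPGenGlobal(ip_offset="1.0.0.0"),
+        dist_client=ASTFIPGenDist(
+            ip_range=["192.168.0.1", "192.168.3.255"], distribution="seq"),
+        dist_server=ASTFIPGenDist(
+            ip_range=["20.0.0.1", "20.0.3.255"], distribution="seq"))
+
+    template = ASTFTemplate(
+        client_template=ASTFTCPClientTemplate(
+            program=prog_c, ip_gen=ip_gen, port=8080),
+        server_template=ASTFTCPServerTemplate(
+            program=prog_s, assoc=ASTFAssociationRule(port=8080)))
+
+    profile = ASTFProfile(default_ip_gen=ip_gen, templates=template)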
+
+Each program specifies when that side has to wait until enough data is received
+(counted in packets for UDP and in bytes for TCP)
+and when to send additional data. Together, the two programs
+define a single transaction. Due to packet loss, a transaction may take longer,
+use more packets (retransmission) or never finish in its entirety.
+
+#### Instances
+
+A client instance is created according to the TPS parameter for the trial,
+and sends the first packet of the transaction (in some cases more packets).
+Each client instance uses a different source address (see sequencing below)
+and some source port. The destination address also comes from a range,
+but the destination port has to be constant for a given program.
+
+TRex uses an opaque way to choose source ports, but as session counting shows,
+the next client with the same source address uses a different source port.
+
+A server instance is created when the first packet arrives at the server side.
+The source address and port of the first packet are used as destination address
+and port for the server responses. This is the ability we need
+when the outside surface is not predictable.
+
+When a program reaches its end, the instance is deleted.
+This creates possible issues with server instances. If the server instance
+does not read all the data the client has sent, late data packets
+can cause a second copy of the server instance to be created,
+which breaks assumptions on how many packets a transaction should have.
+
+The need for server instances to read all the data reduces the overall
+bandwidth TRex is able to create in ASTF mode.
+
+Note that client instances are not created on packets,
+so it is safe to end the client program without reading all server data
+(unless the definition of transaction success requires that).
+
+#### Sequencing
+
+ASTF profiles offer two modes for choosing source and destination IP addresses
+for client programs: sequential and pseudorandom.
+In current tests we are using sequential addressing only (if destination
+address varies at all).
+
+For the client destination UDP/TCP port, we use a single constant value.
+(TRex can support multiple program pairs in the same traffic profile,
+distinguished by the port number.)
+
+#### Transaction overlap
+
+If a transaction takes longer to finish, compared to the period implied by TPS,
+TRex will have multiple client or server instances active at a time.
+
+During calibration testing we have found this increases CPU utilization,
+and for high TPS it can lead to TRex's Rx or Tx buffers becoming full.
+This generally leads to duration stretching, and/or packet loss on TRex.
+
+Currently used transactions were chosen to be short, so the risk of bad behavior
+is decreased. But in MRR tests, where load is computed based on NIC ability,
+not TRex's ability, anomalous behavior is still possible
+(e.g. MRR values being way lower than NDR).
+
+#### Delays
+
+TRex supports adding constant delays to ASTF programs.
+This can be useful, for example if we want to separate connection establishment
+from data transfer.
+
+But as TRex tracks delayed instances as active, this still results
+in higher CPU utilization and reduced performance
+(as with other overlapping transactions), so the current tests do not use
+any delays.
+
+#### Keepalives
+
+Both UDP and TCP protocol implementations in TRex programs support keepalive
+duration. That means there is a configurable period of keepalive time,
+and TRex sends keepalive packets automatically (outside the program)
+for the time the program is active (started, not ended yet)
+but not sending any packets.
+
+For TCP this is generally not a big deal, as the other side usually
+retransmits faster. But for UDP it means a packet loss may leave
+the receiving program running.
+
+In order to avoid keepalive packets, the keepalive value is set to a high
+number. Here, "high number" means that even at maximum scale and minimum TPS,
+there are still no keepalive packets sent within the corresponding
+(computed) trial duration. This number is kept the same also for
+smaller scale traffic profiles, to simplify maintenance.
+
+#### Transaction success
+
+The transaction is considered successful at the Layer-7 (L7) level
+when both program instances close. At this point, various L7 counters
+(unofficial name) are updated on TRex.
+
+We found that a proper close and L7 counter update can be CPU intensive,
+whereas lower-level counters (ipackets, opackets) called L2 counters
+can keep up with higher loads.
+
+For some tests, we do not need to confirm the whole transaction was successful.
+CPS (connections per second) tests are a typical example.
+We care only about NAT44ed creating a session (needs one packet
+in the inside-to-outside direction per session) and being able to use it
+(needs one packet in the outside-to-inside direction).
+
+Similarly in TPUT tests (packet throughput, counting both control
+and data packets), we care about NAT44ed's ability to forward packets;
+we do not care whether applications (TRex) can fully process them at that rate.
+
+Therefore each type of test has its own formula (usually just one counter
+already provided by TRex) to count "successful enough" transactions
+and attempted transactions. Currently, all tests relying on L7 counters
+use size-limited profiles, so they know what the count of attempted
+transactions should be, but due to duration stretching
+TRex might have been unable to send that many packets.
+For search purposes, unattempted transactions are treated the same
+as attempted but failed transactions.
+
+Sometimes even the number of transactions as tracked by the search algorithm
+does not match the transactions as defined by ASTF programs.
+See the TCP TPUT profile below.
+
+### UDP CPS
+
+This profile uses a minimalistic transaction to verify a NAT44ed session has
+been created and that it allows outside-to-inside traffic.
+
+The client instance sends one packet and ends.
+The server instance sends one packet upon creation and ends.
+
+In principle, packet size is configurable,
+but currently used tests apply only one value (100 byte frames).
+
+A transaction counts as attempted when the opackets counter increases on the
+client side.
+A transaction counts as successful when the ipackets counter increases on the
+client side.
+
+### TCP CPS
+
+This profile uses a minimalistic transaction to verify a NAT44ed session has
+been created and that it allows outside-to-inside traffic.
+
+The client initiates a TCP connection. The client waits until the connection
+is confirmed (by reading zero data bytes). The client ends.
+The server accepts the connection. The server waits for indirect confirmation
+from the client (by waiting for the client to initiate the close). The server
+ends.
+
+Without packet loss, the whole transaction takes 7 packets to finish
+(4 and 3 per direction).
+From the NAT44ed point of view, only the first two are needed to verify
+the session got created.
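+
+For this test type, a trial verdict could be derived from TRex ASTF
+client-side counters roughly as sketched below (the tcps_connattempt and
+tcps_connects counters are the same ones described in the next paragraphs;
+the exact nesting of the stats dictionary is an assumption):
+
+    # Sketch: attempted vs. successful CPS transactions from client counters.
+    def cps_transaction_counts(stats):
+        client = stats["traffic"]["client"]     # layout assumed
+        attempted = client["tcps_connattempt"]  # connection attempts (SYNs)
+        successful = client["tcps_connects"]    # completed handshakes
+        return attempted, successful, attempted - successful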
+
+### UDP CPS
+
+This profile uses a minimalistic transaction to verify that a NAT44ed session
+has been created and that it allows outside-to-inside traffic.
+
+The client instance sends one packet and ends.
+The server instance sends one packet upon creation and ends.
+
+In principle, packet size is configurable,
+but currently used tests apply only one value (100 bytes frame).
+
+A transaction counts as attempted when the opackets counter increases on the
+client side.
+A transaction counts as successful when the ipackets counter increases on the
+client side.
+
+### TCP CPS
+
+This profile uses a minimalistic transaction to verify that a NAT44ed session
+has been created and that it allows outside-to-inside traffic.
+
+The client initiates a TCP connection. The client waits until the connection
+is confirmed (by reading zero data bytes). The client ends.
+The server accepts the connection. The server waits for indirect confirmation
+from the client (by waiting for the client to initiate close). The server ends.
+
+Without packet loss, the whole transaction takes 7 packets to finish
+(4 and 3 per direction).
+From the NAT44ed point of view, only the first two are needed to verify
+the session got created.
+
+Packet size is not configurable, but currently used tests report
+frame size as 64 bytes.
+
+A transaction counts as attempted when the tcps_connattempt counter increases
+on the client side.
+A transaction counts as successful when the tcps_connects counter increases
+on the client side.
+
+### UDP TPUT
+
+This profile uses a small transaction of "request-response" type,
+with several packets simulating data payload.
+
+The client sends 5 packets and closes immediately.
+The server reads all 5 packets (needed to avoid late packets creating new
+server instances), then sends 5 packets and closes.
+The value 5 was chosen to mirror what TCP TPUT (see below) chooses.
+
+Packet size is configurable; currently we have tests for 100,
+1518 and 9000 bytes frames (to match the size of TCP TPUT data frames,
+see below).
+
+As this is a packet oriented test, we do not track the whole
+10 packet transaction. Similarly to stateless tests, we treat each packet
+as a "transaction" for search algorithm packet loss ratio purposes.
+Therefore a "transaction" is attempted when the opackets counter on the client
+or server side is increased. A transaction is successful if the ipackets
+counter on the client or server side is increased.
+
+If one of the 5 client packets is lost, the server instance will get stuck
+in the reading phase. This probably decreases TRex performance,
+but it leads to more stable results than the alternatives.
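+
+In ASTF program terms, the pair described above can be sketched as follows
+(import path and payload size are illustrative):
+
+```python
+# Sketch of the UDP TPUT program pair: client sends 5 packets and ends,
+# server reads all 5 before responding with 5 of its own.
+from trex.astf.api import ASTFProgram
+
+DATA = b"x" * 100  # one data packet worth of payload
+
+prog_c = ASTFProgram(stream=False)
+for _ in range(5):
+    prog_c.send_msg(DATA)
+# Client closes immediately: the program simply ends here.
+
+prog_s = ASTFProgram(stream=False)
+prog_s.recv_msg(5)  # read all 5 packets, so a late packet cannot
+                    # spawn a second server instance
+for _ in range(5):
+    prog_s.send_msg(DATA)
+```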
+
+### TCP TPUT
+
+This profile uses a small transaction of "request-response" type,
+with some amount of data to be transferred both ways.
+
+In CSIT release 22.06, TRex behavior changed, so we needed to edit
+the traffic profile. Let us describe the pre-22.06 profile first.
+
+The client connects, sends 5 data packets worth of data,
+receives 5 data packets worth of data and closes its side of the connection.
+The server accepts the connection, reads 5 data packets worth of data,
+sends 5 data packets worth of data and closes its side of the connection.
+As usual in TCP, the sending side waits for an ACK from the receiving side
+before proceeding with the next step of its program.
+
+The server read is needed to avoid premature close and a second server
+instance. The client read is not strictly needed, but the ACKs allow TRex
+to close the server instance quickly, thus saving CPU and improving
+performance.
+
+The number 5 of data packets was chosen so TRex is able to send them
+in a single burst, even with 9000 byte frame size (TRex has a hard limit
+on initial window size).
+That leads to 16 packets (9 of them in c2s direction) being exchanged
+if no loss occurs.
+The size of data packets is controlled by the traffic profile setting
+the appropriate maximum segment size. Due to TRex restrictions,
+the minimal IPv4 data frame size achievable by this method is 70 bytes,
+which is more than our usual minimum of 64 bytes.
+For that reason, the data frame sizes available for testing are 100 bytes
+(that allows room for eventually adding IPv6 ASTF tests),
+1518 bytes and 9000 bytes. There is no control over control packet sizes.
+
+Exactly as in UDP TPUT, the ipackets and opackets counters are used for
+counting "transactions" (in fact packets).
+
+If packet loss occurs, there can be large transaction overlap, even if most
+ASTF programs finish eventually. This can lead to big duration stretching
+and a somewhat uneven rate of packets sent. This makes it hard to interpret
+MRR results (frequently MRR is below NDR for this reason),
+but NDR and PDR results tend to be stable enough.
+
+In 22.06, the "ACK from the receiving side" behavior changed:
+the receiving side started sending an ACK sometimes
+also before receiving the full set of 5 data packets.
+If the previous profile is understood as a "single challenge, single response"
+exchange, where the challenge (and also the response) is sent as a burst of
+5 data packets, the new profile uses "bursts" of 1 packet instead, but issues
+the challenge-response part 5 times sequentially
+(waiting to receive the response before sending the next challenge).
+This new profile happens to have the same overall packet count
+(when no re-transmissions are needed).
+Although it is possibly more taxing for TRex CPU,
+the results are comparable to the old traffic profile.
+
+## Ip4base tests
+
+Contrary to stateless traffic profiles, we do not have a simple limit
+that would guarantee TRex is able to send traffic at the specified load.
+For that reason, we have added tests where "nat44ed" is replaced by "ip4base".
+Instead of NAT44ed processing, the tests set minimalistic IPv4 routes,
+so that packets are forwarded in both inside-to-outside and outside-to-inside
+directions.
+
+The packets arrive at the server end of TRex with a different source
+address&port than in NAT44ed tests (no translation to outside values is done
+with ip4base), but those are not specified in the stateful traffic profiles.
+The server end (as always) uses the received address&port as the destination
+for outside-to-inside traffic. Therefore the same stateful traffic profile
+works for both NAT44ed and ip4base tests (of the same scale).
+
+The NAT44ed results are displayed together with the corresponding ip4base
+results. If they are similar, TRex is probably the bottleneck.
+If the NAT44ed result is visibly smaller, it describes the real VPP
+performance.
diff --git a/docs/content/methodology/test/packet_flow_ordering.md b/docs/content/methodology/test/packet_flow_ordering.md
new file mode 100644
index 0000000000..c2c87038d4
--- /dev/null
+++ b/docs/content/methodology/test/packet_flow_ordering.md
@@ -0,0 +1,42 @@
+---
+title: "Packet Flow Ordering"
+weight: 2
+---
+
+# Packet Flow Ordering
+
+The TRex Traffic Generator (TG) supports two main ways to cover
+the address space (within the allowed ranges) in scale tests.
+
+In most cases only one field value (e.g. IPv4 destination address) is
+altered; in some cases two fields (e.g. IPv4 destination address and UDP
+destination port) are altered.
+
+## Incremental Ordering
+
+This case is simpler to implement and offers greater control.
+
+When changing two fields, they can be incremented synchronously, or one
+after another. In the latter case we can specify which one is
+incremented each iteration and which is incremented by "carrying over"
+only when the other "wraps around". This way, all combinations are
+visited once before the "carry" field itself wraps around.
+
+It is possible to use increments other than 1.
+
+## Randomized Ordering
+
+This case chooses each field value at random (from the allowed range).
+In case of two fields, they are treated independently.
+TRex allows setting a random seed to get deterministic numbers.
+We use a different seed for each field and traffic direction.
+The seed has to be a non-zero number; we use 1, 2, 3, and so on.
+
+The seeded random mode in TRex requires a "limit" value,
+which acts as a cycle length limit (after this many iterations,
+the seed resets to its initial value).
+We use the maximal allowed limit value (computed as 2^24 - 1).
+
+Randomized profiles do not avoid duplicated values,
+and do not guarantee each possible value is visited,
+so this ordering is not very useful for stateful tests.
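+
+A sketch of both orderings using the TRex STL field engine (the import path,
+ranges and seed value are illustrative):
+
+```python
+# Incremental vs. seeded-random coverage of an IPv4 destination range.
+from trex.stl.api import (
+    STLVmFlowVar, STLVmFlowVarRepeatableRandom, STLVmWrFlowVar,
+)
+
+# Incremental ordering: destination address incremented by 1 per packet.
+inc_var = STLVmFlowVar(
+    name="dst", min_value="10.0.0.1", max_value="10.0.0.254",
+    size=4, op="inc")
+
+# Randomized ordering: repeatable random with an explicit seed and the
+# cycle-length "limit" described above.
+rnd_var = STLVmFlowVarRepeatableRandom(
+    name="dst", min_value="10.0.0.1", max_value="10.0.0.254",
+    size=4, seed=1, limit=2**24 - 1)
+
+# Either variable is then written into the packet header field.
+write_dst = STLVmWrFlowVar(fv_name="dst", pkt_offset="IP.dst")
+```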
diff --git a/docs/content/methodology/test/reconfiguration.md b/docs/content/methodology/test/reconfiguration.md
new file mode 100644
index 0000000000..6dec4d918b
--- /dev/null
+++ b/docs/content/methodology/test/reconfiguration.md
@@ -0,0 +1,68 @@
+---
+title: "Reconfiguration"
+weight: 8
+---
+
+# Reconfiguration
+
+## Overview
+
+Reconf tests are designed to measure the impact of VPP re-configuration
+on data plane traffic.
+While VPP takes some measures against the traffic being
+entirely stopped for a prolonged time,
+the immediate forwarding rate varies during the re-configuration,
+as some configuration steps need the active data plane worker threads
+to be stopped temporarily.
+
+As the usual methods of measuring throughput need multiple trial measurements
+with somewhat long durations, and the re-configuration process can also be
+long, finding an offered load which would result in zero loss
+during the re-configuration process would be time-consuming.
+
+Instead, reconf tests first find a throughput value (the lower bound for NDR)
+without re-configuration, and then maintain that offered load
+during re-configuration. The measured loss count is then assumed to be caused
+by the re-configuration process. The result published by reconf tests
+is the effective blocked time, that is
+the loss count divided by the offered load.
+
+## Current Implementation
+
+Each reconf suite is based on a similar MLRsearch performance suite.
+
+MLRsearch parameters are changed to speed up the throughput discovery.
+For example, PDR is not searched for, and the final trial duration is shorter.
+
+The MLRsearch suite has to contain a configuration parameter
+that can be scaled up, e.g. the number of tunnels or number of service chains.
+Currently, only increasing the scale is supported
+as the re-configuration operation. In the future, scale decrease
+or other operations can be implemented.
+
+The traffic profile is not changed, so the traffic present is processed
+only by the smaller scale configuration. The added tunnels / chains
+are not targeted by the traffic.
+
+For the re-configuration, the same Robot Framework and Python libraries
+are used as were used in the initial configuration, with the exception
+of the final calls that do not interact with VPP (e.g. starting
+virtual machines) being skipped to reduce the overall test duration.
+
+## Discussion
+
+Robot Framework introduces a certain overhead, which may affect the timing
+of individual VPP API calls, which in turn may affect
+the number of packets lost.
+
+The exact calls executed may contain unnecessary info dumps, repeated
+commands, or commands which change a value that does not need to be changed
+(e.g. MTU). Thus, implementation details affect the results, even if their
+effect on the corresponding MLRsearch suite is negligible.
+
+The lower bound for NDR is the only value safe to use when zero packet loss
+is expected without re-configuration. But different suites show different
+"jitter" in that value. For some suites, the lower bound is not tight,
+allowing full NIC buffers to drain quickly between worker pauses.
+For other suites, the lower bound for NDR still has quite a large probability
+of non-zero packet loss even without re-configuration.
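+
+As a worked example of the published metric (the values are illustrative):
+
+```python
+def effective_blocked_time(loss_count, offered_load_pps):
+    """Loss count divided by offered load, in seconds."""
+    return loss_count / offered_load_pps
+
+# 2_000_000 packets lost at 4 Mpps offered load corresponds to
+# 0.5 s of effectively blocked forwarding.
+print(effective_blocked_time(2_000_000, 4_000_000.0))  # 0.5
+```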
diff --git a/docs/content/methodology/test/tunnel_encapsulations.md b/docs/content/methodology/test/tunnel_encapsulations.md
new file mode 100644
index 0000000000..c047c43dfa
--- /dev/null
+++ b/docs/content/methodology/test/tunnel_encapsulations.md
@@ -0,0 +1,87 @@
+---
+title: "Tunnel Encapsulations"
+weight: 3
+---
+
+# Tunnel Encapsulations
+
+Tunnel encapsulations testing is grouped based on the type of outer
+header: IPv4 or IPv6.
+
+## IPv4 Tunnels
+
+VPP is tested in the following IPv4 tunnel baseline configurations:
+
+- *ip4vxlan-l2bdbase*: VXLAN over IPv4 tunnels with L2 bridge-domain MAC
+  switching.
+- *ip4vxlan-l2xcbase*: VXLAN over IPv4 tunnels with L2 cross-connect.
+- *ip4lispip4-ip4base*: LISP over IPv4 tunnels with IPv4 routing.
+- *ip4lispip6-ip6base*: LISP over IPv4 tunnels with IPv6 routing.
+- *ip4gtpusw-ip4base*: GTPU over IPv4 tunnels with IPv4 routing.
+
+In all cases listed above, a low number of MAC, IPv4 or IPv6 flows (253 or 254
+per direction) is switched or routed by VPP.
+
+In addition, selected IPv4 tunnels are tested at scale:
+
+- *dot1q--ip4vxlanscale-l2bd*: VXLAN over IPv4 tunnels with L2 bridge-
+  domain MAC switching, with scaled up dot1q VLANs (10, 100, 1k),
+  mapped to scaled up L2 bridge-domains (10, 100, 1k), that are in turn
+  mapped to (10, 100, 1k) VXLAN tunnels. 64.5k flows are transmitted per
+  direction.
+
+## IPv6 Tunnels
+
+VPP is tested in the following IPv6 tunnel baseline configurations:
+
+- *ip6lispip4-ip4base*: LISP over IPv6 tunnels with IPv4 routing.
+- *ip6lispip6-ip6base*: LISP over IPv6 tunnels with IPv6 routing.
+
+In all cases listed above, a low number of IPv4 or IPv6 flows (253 or 254 per
+direction) is routed by VPP.
+
+## GENEVE
+
+### GENEVE Prefix Bindings
+
+GENEVE prefix bindings should be representative of target applications, where
+packet flows of a particular set of IPv4 addresses (L3 underlay network) are
+routed via a dedicated GENEVE interface by building an L2 overlay.
+
+Private address ranges to be used in tests:
+
+- East hosts IP address range: 10.0.1.0 - 10.127.255.255 (10.0/9 prefix)
+  - Total of 2^23 - 256 (8 388 352) usable IPv4 addresses
+  - Usable in tests for up to 32 767 GENEVE tunnels (IPv4 underlay networks)
+- West hosts IP address range: 10.128.1.0 - 10.255.255.255 (10.128/9 prefix)
+  - Total of 2^23 - 256 (8 388 352) usable IPv4 addresses
+  - Usable in tests for up to 32 767 GENEVE tunnels (IPv4 underlay networks)
+
+### GENEVE Tunnel Scale
+
+If N is the number of GENEVE tunnels (and IPv4 underlay networks), then TG
+sends 256 packet flows in each of N different sets (see the range computation
+sketch after the list):
+
+- i = 1,2,3, ... N - GENEVE tunnel index
+- East-West direction: GENEVE encapsulated packets
+  - Outer IP header:
+    - src ip: 1.1.1.1
+    - dst ip: 1.1.1.2
+  - GENEVE header:
+    - vni: i
+  - Inner IP header:
+    - src_ip_range(i) = 10.(0 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
+    - dst_ip_range(i) = 10.(128 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
+- West-East direction: non-encapsulated packets
+  - IP header:
+    - src_ip_range(i) = 10.(128 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
+    - dst_ip_range(i) = 10.(0 + rounddown(i/255)).(modulo(i/255)).(0-to-255)
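+
+The inner address ranges above can be computed per tunnel index as sketched
+below (interpreting rounddown and modulo as integer division and remainder
+by 255; this interpretation is an assumption):
+
+```python
+def inner_ranges(i):
+    """Inner IPv4 ranges for GENEVE tunnel index i (1-based)."""
+    second = i // 255  # rounddown(i/255)
+    third = i % 255    # modulo(i/255)
+    east = "10.{}.{}.0 - 10.{}.{}.255".format(0 + second, third,
+                                              0 + second, third)
+    west = "10.{}.{}.0 - 10.{}.{}.255".format(128 + second, third,
+                                              128 + second, third)
+    return east, west
+
+# Tunnel i=1: East hosts 10.0.1.0 - 10.0.1.255, West hosts
+# 10.128.1.0 - 10.128.1.255, i.e. 256 flows per tunnel per direction.
+print(inner_ranges(1))
+```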
+
+The resulting tunnel counts and total flow counts are:
+
+ **geneve-tunnels** | **total-flows**
+-------------------:|----------------:
+                  1 |             256
+                  4 |           1 024
+                 16 |           4 096
+                 64 |          16 384
+                256 |          65 536
+              1 024 |         262 144
diff --git a/docs/content/methodology/test/vpp_device.md b/docs/content/methodology/test/vpp_device.md
new file mode 100644
index 0000000000..0a5ee90308
--- /dev/null
+++ b/docs/content/methodology/test/vpp_device.md
@@ -0,0 +1,15 @@
+---
+title: "VPP Device"
+weight: 9
+---
+
+# VPP Device
+
+This covers the VPP_Device test environment for functional VPP
+device tests integrated into the LFN CI/CD infrastructure. VPP_Device tests
+run on 1-Node testbeds (1n-skx, 1n-arm) and rely on Linux SRIOV Virtual
+Function (VF), dot1q VLAN tagging and external loopback cables to
+facilitate packet passing over external physical links. The initial focus is
+on a few baseline tests. New device tests can be added by small edits
+to an existing CSIT Performance (2-node) test. RF test definition code
+stays unchanged, with the exception of traffic generator related L2 KWs.
diff --git a/docs/content/methodology/trending/_index.md b/docs/content/methodology/trending/_index.md
new file mode 100644
index 0000000000..4289e7ff96
--- /dev/null
+++ b/docs/content/methodology/trending/_index.md
@@ -0,0 +1,12 @@
+---
+bookCollapseSection: true
+bookFlatSection: false
+title: "Trending"
+weight: 4
+---
+
+# Trending
+
+This document describes a high-level design of a system for continuous
+performance measuring, trending and change detection for the FD.io VPP SW
+data plane (and other performance tests run within the CSIT sub-project).
diff --git a/docs/content/methodology/trending/analysis.md b/docs/content/methodology/trending/analysis.md
new file mode 100644
index 0000000000..fe952259ab
--- /dev/null
+++ b/docs/content/methodology/trending/analysis.md
@@ -0,0 +1,224 @@
+---
+title: "Analysis"
+weight: 1
+---
+
+# Trend Analysis
+
+All measured performance trend data is treated as time-series data,
+modeled as a concatenation of groups; within each group the samples come
+(independently) from the same normal distribution (with some center and
+standard deviation).
+
+The center of the normal distribution for the group (equal to the population
+average) is called the trend for the group.
+All the analysis is based on finding the right partition into groups
+and comparing their trends.
+
+## Anomalies in graphs
+
+In graphs, the start of the following group is marked as a regression (red
+circle) or progression (green circle), if the new trend is lower (or higher,
+respectively) than the previous group's.
+
+## Implementation details
+
+### Partitioning into groups
+
+While sometimes the samples within a group are far from being distributed
+normally, currently we do not have a better tractable model.
+
+Here, "sample" should be the result of a single trial measurement, with group
+boundaries set only at test run granularity.
+But in order to avoid detecting
+causes unrelated to VPP performance, the current presentation takes the
+average of all trials within the run as the sample. Effectively, this acts
+as a single trial with aggregate duration.
+
+Performance graphs show the run average as a dot (not all individual trial
+results).
+
+The group boundaries are selected based on `Minimum Description Length`[^1].
+
+### Minimum Description Length
+
+`Minimum Description Length`[^1] (MDL) is a particular formalization
+of `Occam's razor`[^2] principle.
+
+The general formulation mandates evaluating a large set of models,
+but for anomaly detection purposes, it is useful to consider
+a smaller set of models, so that scoring and comparing them is easier.
+
+For each candidate model, the data should be compressed losslessly,
+which includes model definitions, encoded model parameters,
+and the raw data encoded based on probabilities computed by the model.
+The model resulting in the shortest compressed message is "the" correct model.
+
+For our model set (groups of normally distributed samples),
+we need to encode the group length (which penalizes too many groups),
+the group average (more on that later), the group stdev,
+and then all the samples.
+
+Luckily, the "all the samples" part turns out to be quite easy to compute.
+If sample values are considered as coordinates in (multi-dimensional)
+Euclidean space, fixing the stdev means the point with allowed coordinates
+lies on a sphere. Fixing the average intersects the sphere with
+a (hyper)-plane, and the Gaussian probability density on the resulting
+sphere is constant. So the only contribution is the "area" of the sphere,
+which depends only on the number of samples and the stdev.
+
+A somewhat ambiguous part is choosing which encoding
+is used for group size, average and stdev.
+Different encodings cause different biases toward large or small values.
+In our implementation we have chosen the probability density
+corresponding to a uniform distribution (from zero to the maximal sample
+value) for the stdev and the average of the first group,
+but for averages of subsequent groups we have chosen a distribution
+which discourages delimiting groups with averages close together.
+
+Our implementation assumes that the measurement precision is 1.0 pps.
+Thus it is slightly wrong for trial durations other than 1.0 seconds.
+Also, all the calculations assume 1.0 pps is totally negligible,
+compared to the stdev value.
+
+The group selection algorithm currently has no parameters;
+all the aforementioned encodings and the handling of precision are
+hard-coded. In principle, every group selection is examined, and the one
+encodable with the least amount of bits is selected, as sketched below.
+As the bit amount for a selection is just the sum of bits for every group,
+finding the best selection takes a number of comparisons that increases
+quadratically with the size of the data, the overall time complexity
+probably being cubic.
+
+The resulting group distribution looks good
+if samples are distributed normally enough within a group.
+But for obviously different distributions (for example
+a `bimodal distribution`[^3]) the groups tend to focus on less relevant
+factors (such as "outlier" density).
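+
+A conceptual sketch of the selection follows. This is not the CSIT
+implementation: the parameter-encoding terms below are simplified stand-ins
+for the encodings described above.
+
+```python
+import math
+
+def group_bits(samples):
+    """Approximate description length of one group, in bits."""
+    n = len(samples)
+    avg = sum(samples) / n
+    var = sum((x - avg) ** 2 for x in samples) / n
+    var = max(var, 1.0)  # 1.0 pps measurement precision as the floor
+    # n samples from N(avg, var), plus a simplified cost for encoding
+    # group length, average and stdev.
+    return (0.5 * n * math.log2(2 * math.pi * math.e * var)
+            + 3 * math.log2(n + 1))
+
+def best_partition(samples):
+    """Try every boundary placement, minimizing the total bit count."""
+    n = len(samples)
+    best = [0.0] + [math.inf] * n  # best[i]: bits for samples[:i]
+    cut = [0] * (n + 1)
+    for i in range(1, n + 1):
+        for j in range(i):
+            cost = best[j] + group_bits(samples[j:i])
+            if cost < best[i]:
+                best[i], cut[i] = cost, j
+    bounds, i = [], n
+    while i > 0:
+        bounds.append((cut[i], i))
+        i = cut[i]
+    return list(reversed(bounds))  # list of (start, end) group index pairs
+```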
+
+## Common Patterns
+
+When an anomaly is detected, it frequently falls into a few known patterns,
+each having its typical behavior over time.
+
+We are going to describe the behaviors,
+as they motivate our choice of trend compliance metrics.
+
+### Sample time and analysis time
+
+But first we need to distinguish two roles time plays in the analysis,
+so it is clearer which role we are referring to.
+
+Sample time is the more obvious one.
+It is the time the sample is generated.
+It is the start time or the end time of the Jenkins job run;
+it does not really matter which (parallel runs are disabled,
+and the length of the gap between samples does not affect the metrics).
+
+Analysis time is the time the current analysis is computed.
+Again, the exact time does not usually matter;
+what matters is how many later (and how many earlier) samples
+were considered in the computation.
+
+For some patterns, it is usual for a previously reported
+anomaly to "vanish", or a previously unseen anomaly to "appear late",
+as later samples change which partition into groups is more probable.
+
+The dashboard and graphs always show the latest analysis time;
+the compliance metrics use an earlier sample time
+with the same latest analysis time.
+
+Alerting e-mails use the latest analysis time at the time of sending,
+so the values reported there are likely to differ
+from the later analysis time results shown in the dashboard and graphs.
+
+### Ordinary regression
+
+The real performance changes from a previously stable value
+to a new stable value.
+
+For a medium to high magnitude of the change, one run
+is enough for the anomaly detection to mark this regression.
+
+Ordinary progressions are detected in the same way.
+
+### Small regression
+
+The real performance changes from a previously stable value
+to a new stable value, but the difference is small.
+
+For the anomaly detection algorithm, this change is harder to detect,
+depending on the standard deviation of the previous group.
+
+If the new performance value stays stable, the detection algorithm is
+eventually able to detect this anomaly,
+when there are enough samples around the new value.
+
+If the difference is too small, it may remain undetected
+(as a new performance change happens, or the full history of samples
+is still not enough for the detection).
+
+Small progressions have the same behavior.
+
+### Reverted regression
+
+This pattern can have two different causes.
+We would like to distinguish them, but that is usually
+not possible to do just by looking at the measured values (and not telemetry).
+
+In one cause, the real DUT performance has changed,
+but got restored immediately.
+In the other cause, no real performance change happened;
+just some temporary infrastructure issue
+has caused a wrong low value to be measured.
+
+For small measured changes, this pattern may remain undetected.
+For medium and big measured changes, this is detected when the regression
+happens on just the last sample.
+
+For big changes, the revert is also immediately detected
+as a subsequent progression. The trend is usually different
+from the previously stable trend (as the two population averages
+are not likely to be exactly equal), but the difference
+between the two trends is relatively small.
+
+For medium changes, the detection algorithm may need several new samples
+to detect a progression (as it dislikes single-sample groups),
+in the meantime reporting regressions (with the difference decreasing
+with analysis time), until it stabilizes the same way as for big changes
+(regression followed by progression, small difference
+between the old stable trend and the last trend).
+
+As it is very hard for faulty code or an infrastructure issue
+to increase performance, the opposite (a temporary progression)
+almost never happens.
+
+### Summary
+
+There is a trade-off between detecting small regressions
+and not reporting the same old regressions for a long time.
+
+For people reading e-mails, a sudden regression with a big number of samples
+in the last group means this regression was hard for the algorithm to detect.
+
+If there is a big regression with just one run in the last group,
+we are not sure whether it is real or just a temporary issue.
+It is useful to wait some time before starting an investigation.
+
+With a decreasing (absolute value of) difference, the number of expected runs
+increases. If there are not enough runs, we still cannot distinguish
+a real regression from a temporary regression just from the current metrics
+(although humans frequently can tell by looking at the graph).
+
+When there is a regression or progression with just a small difference,
+it is probably an artifact of a temporary regression.
+It is not worth examining, unless temporary regressions happen somewhat
+frequently.
+
+It is not easy for the metrics to locate the previous stable value,
+especially if multiple anomalies happened in the last few weeks.
+It is good to compare the last trend with the long term trend maximum,
+as it highlights the difference between "now" and "what could be".
+It is good to exclude the last week from the trend maximum,
+as including the last week would hide all real progressions.
+
+[^1]: [Minimum Description Length](https://en.wikipedia.org/wiki/Minimum_description_length)
+[^2]: [Occam's Razor](https://en.wikipedia.org/wiki/Occam%27s_razor)
+[^3]: [Bimodal Distribution](https://en.wikipedia.org/wiki/Bimodal_distribution)
diff --git a/docs/content/methodology/trending/presentation.md b/docs/content/methodology/trending/presentation.md
new file mode 100644
index 0000000000..84925b46c8
--- /dev/null
+++ b/docs/content/methodology/trending/presentation.md
@@ -0,0 +1,34 @@
+---
+title: "Presentation"
+weight: 2
+---
+
+# Trend Presentation
+
+## Failed tests
+
+The Failed tests tables list the tests which failed during the last test run.
+Separate tables are generated for each testbed.
+
+## Regressions and progressions
+
+These tables list tests which encountered a regression or progression during
+the specified time period, which is currently set to the last 21 days.
+
+## Trendline Graphs
+
+Trendline graphs show measured per-run averages of MRR values, NDR or PDR
+values, group average values, and detected anomalies.
+The graphs are constructed as follows:
+
+- X-axis represents the date in the format MMDD.
+- Y-axis represents the run-average MRR value, or NDR or PDR values, in Mpps.
+  For PDR tests, a graph with average latency at 50% PDR [us] is also
+  generated.
+- Markers indicate anomaly classification:
+  - Regression - red circle.
+  - Progression - green circle.
+- The line shows the average MRR value of each group.
+
+In addition, the graphs show dynamic labels while hovering over graph data
+points, presenting the CSIT build date, measured value, VPP reference, trend
+job build ID and the LF testbed ID.
diff --git a/docs/content/methodology/trending_methodology/_index.md b/docs/content/methodology/trending_methodology/_index.md deleted file mode 100644 index 551d950cc7..0000000000 --- a/docs/content/methodology/trending_methodology/_index.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -bookCollapseSection: true -bookFlatSection: false -title: "Trending Methodology" -weight: 22 ---- \ No newline at end of file diff --git a/docs/content/methodology/trending_methodology/overview.md b/docs/content/methodology/trending_methodology/overview.md deleted file mode 100644 index 90d8a2507c..0000000000 --- a/docs/content/methodology/trending_methodology/overview.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: "Overview" -weight: 1 ---- - -# Overview - -This document describes a high-level design of a system for continuous -performance measuring, trending and change detection for FD.io VPP SW -data plane (and other performance tests run within CSIT sub-project). diff --git a/docs/content/methodology/trending_methodology/trend_analysis.md b/docs/content/methodology/trending_methodology/trend_analysis.md deleted file mode 100644 index 7f1870f577..0000000000 --- a/docs/content/methodology/trending_methodology/trend_analysis.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -title: "Trending Analysis" -weight: 2 ---- - -# Trend Analysis - -All measured performance trend data is treated as time-series data -that is modeled as a concatenation of groups, -within each group the samples come (independently) from -the same normal distribution (with some center and standard deviation). - -Center of the normal distribution for the group (equal to population average) -is called a trend for the group. -All the analysis is based on finding the right partition into groups -and comparing their trends. - -## Anomalies in graphs - -In graphs, the start of the following group is marked as a regression (red -circle) or progression (green circle), if the new trend is lower (or higher -respectively) then the previous group's. - -## Implementation details - -### Partitioning into groups - -While sometimes the samples within a group are far from being distributed -normally, currently we do not have a better tractable model. - -Here, "sample" should be the result of single trial measurement, with group -boundaries set only at test run granularity. But in order to avoid detecting -causes unrelated to VPP performance, the current presentation takes average of -all trials within the run as the sample. Effectively, this acts as a single -trial with aggregate duration. - -Performance graphs show the run average as a dot (not all individual trial -results). - -The group boundaries are selected based on `Minimum Description Length`[^1]. - -### Minimum Description Length - -`Minimum Description Length`[^1] (MDL) is a particular formalization -of `Occam's razor`[^2] principle. - -The general formulation mandates to evaluate a large set of models, -but for anomaly detection purposes, it is useful to consider -a smaller set of models, so that scoring and comparing them is easier. - -For each candidate model, the data should be compressed losslessly, -which includes model definitions, encoded model parameters, -and the raw data encoded based on probabilities computed by the model. -The model resulting in shortest compressed message is the "the" correct model. 
- -For our model set (groups of normally distributed samples), -we need to encode group length (which penalizes too many groups), -group average (more on that later), group stdev and then all the samples. - -Luckily, the "all the samples" part turns out to be quite easy to compute. -If sample values are considered as coordinates in (multi-dimensional) -Euclidean space, fixing stdev means the point with allowed coordinates -lays on a sphere. Fixing average intersects the sphere with a (hyper)-plane, -and Gaussian probability density on the resulting sphere is constant. -So the only contribution is the "area" of the sphere, which only depends -on the number of samples and stdev. - -A somehow ambiguous part is in choosing which encoding -is used for group size, average and stdev. -Different encodings cause different biases to large or small values. -In our implementation we have chosen probability density -corresponding to uniform distribution (from zero to maximal sample value) -for stdev and average of the first group, -but for averages of subsequent groups we have chosen a distribution -which discourages delimiting groups with averages close together. - -Our implementation assumes that measurement precision is 1.0 pps. -Thus it is slightly wrong for trial durations other than 1.0 seconds. -Also, all the calculations assume 1.0 pps is totally negligible, -compared to stdev value. - -The group selection algorithm currently has no parameters, -all the aforementioned encodings and handling of precision is hard-coded. -In principle, every group selection is examined, and the one encodable -with least amount of bits is selected. -As the bit amount for a selection is just sum of bits for every group, -finding the best selection takes number of comparisons -quadratically increasing with the size of data, -the overall time complexity being probably cubic. - -The resulting group distribution looks good -if samples are distributed normally enough within a group. -But for obviously different distributions (for example -`bimodal distribution`[^3]) the groups tend to focus on less relevant factors -(such as "outlier" density). - -## Common Patterns - -When an anomaly is detected, it frequently falls into few known patterns, -each having its typical behavior over time. - -We are going to describe the behaviors, -as they motivate our choice of trend compliance metrics. - -### Sample time and analysis time - -But first we need to distinguish two roles time plays in analysis, -so it is more clear which role we are referring to. - -Sample time is the more obvious one. -It is the time the sample is generated. -It is the start time or the end time of the Jenkins job run, -does not really matter which (parallel runs are disabled, -and length of gap between samples does not affect metrics). - -Analysis time is the time the current analysis is computed. -Again, the exact time does not usually matter, -what matters is how many later (and how fewer earlier) samples -were considered in the computation. - -For some patterns, it is usual for a previously reported -anomaly to "vanish", or previously unseen anomaly to "appear late", -as later samples change which partition into groups is more probable. - -Dashboard and graphs are always showing the latest analysis time, -the compliance metrics are using earlier sample time -with the same latest analysis time. 
- -Alerting e-mails use the latest analysis time at the time of sending, -so the values reported there are likely to be different -from the later analysis time results shown in dashboard and graphs. - -### Ordinary regression - -The real performance changes from previously stable value -into a new stable value. - -For medium to high magnitude of the change, one run -is enough for anomaly detection to mark this regression. - -Ordinary progressions are detected in the same way. - -### Small regression - -The real performance changes from previously stable value -into a new stable value, but the difference is small. - -For the anomaly detection algorithm, this change is harder to detect, -depending on the standard deviation of the previous group. - -If the new performance value stays stable, eventually -the detection algorithm is able to detect this anomaly -when there are enough samples around the new value. - -If the difference is too small, it may remain undetected -(as new performance change happens, or full history of samples -is still not enough for the detection). - -Small progressions have the same behavior. - -### Reverted regression - -This pattern can have two different causes. -We would like to distinguish them, but that is usually -not possible to do just by looking at the measured values (and not telemetry). - -In one cause, the real DUT performance has changed, -but got restored immediately. -In the other cause, no real performance change happened, -just some temporary infrastructure issue -has caused a wrong low value to be measured. - -For small measured changes, this pattern may remain undetected. -For medium and big measured changes, this is detected when the regression -happens on just the last sample. - -For big changes, the revert is also immediately detected -as a subsequent progression. The trend is usually different -from the previously stable trend (as the two population averages -are not likely to be exactly equal), but the difference -between the two trends is relatively small. - -For medium changes, the detection algorithm may need several new samples -to detect a progression (as it dislikes single sample groups), -in the meantime reporting regressions (difference decreasing -with analysis time), until it stabilizes the same way as for big changes -(regression followed by progression, small difference -between the old stable trend and last trend). - -As it is very hard for a fault code or an infrastructure issue -to increase performance, the opposite (temporary progression) -almost never happens. - -### Summary - -There is a trade-off between detecting small regressions -and not reporting the same old regressions for a long time. - -For people reading e-mails, a sudden regression with a big number of samples -in the last group means this regression was hard for the algorithm to detect. - -If there is a big regression with just one run in the last group, -we are not sure if it is real, or just a temporary issue. -It is useful to wait some time before starting an investigation. - -With decreasing (absolute value of) difference, the number of expected runs -increases. If there is not enough runs, we still cannot distinguish -real regression from temporary regression just from the current metrics -(although humans frequently can tell by looking at the graph). - -When there is a regression or progression with just a small difference, -it is probably an artifact of a temporary regression. -Not worth examining, unless temporary regressions happen somewhat frequently. 
- -It is not easy for the metrics to locate the previous stable value, -especially if multiple anomalies happened in the last few weeks. -It is good to compare last trend with long term trend maximum, -as it highlights the difference between "now" and "what could be". -It is good to exclude last week from the trend maximum, -as including the last week would hide all real progressions. - -[^1]: [Minimum Description Length](https://en.wikipedia.org/wiki/Minimum_description_length) -[^2]: [Occam's razor](https://en.wikipedia.org/wiki/Occam%27s_razor) -[^3]: [bimodal distribution](https://en.wikipedia.org/wiki/Bimodal_distribution) diff --git a/docs/content/methodology/trending_methodology/trend_presentation.md b/docs/content/methodology/trending_methodology/trend_presentation.md deleted file mode 100644 index 4c58589a0b..0000000000 --- a/docs/content/methodology/trending_methodology/trend_presentation.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: "Trending Presentation" -weight: 3 ---- - -# Trend Presentation - -## Failed tests - -The Failed tests tables list the tests which failed during the last test run. -Separate tables are generated for each testbed. - -## Regressions and progressions - -These tables list tests which encountered a regression or progression during the -specified time period, which is currently set to the last 21 days. - -## Trendline Graphs - -Trendline graphs show measured per run averages of MRR values, NDR or PDR -values, group average values, and detected anomalies. -The graphs are constructed as follows: - -- X-axis represents the date in the format MMDD. -- Y-axis represents run-average MRR value, NDR or PDR values in Mpps. For PDR - tests also a graph with average latency at 50% PDR [us] is generated. -- Markers to indicate anomaly classification: - - - Regression - red circle. - - Progression - green circle. - -- The line shows average MRR value of each group. - -In addition the graphs show dynamic labels while hovering over graph data -points, presenting the CSIT build date, measured value, VPP reference, trend job -build ID and the LF testbed ID. diff --git a/docs/content/methodology/trex_traffic_generator.md b/docs/content/methodology/trex_traffic_generator.md deleted file mode 100644 index 4f62d91c47..0000000000 --- a/docs/content/methodology/trex_traffic_generator.md +++ /dev/null @@ -1,195 +0,0 @@ ---- -title: "TRex Traffic Generator" -weight: 5 ---- - -# TRex Traffic Generator - -## Usage - -[TRex traffic generator](https://trex-tgn.cisco.com) is used for majority of -CSIT performance tests. TRex is used in multiple types of performance tests, -see [Data Plane Throughtput]({{< ref "data_plane_throughput/data_plane_throughput/#Data Plane Throughtput" >}}) -for more detail. - -## Traffic modes - -TRex is primarily used in two (mutually incompatible) modes. - -### Stateless mode - -Sometimes abbreviated as STL. -A mode with high performance, which is unable to react to incoming traffic. -We use this mode whenever it is possible. -Typical test where this mode is not applicable is NAT44ED, -as DUT does not assign deterministic outside address+port combinations, -so we are unable to create traffic that does not lose packets -in out2in direction. - -Measurement results are based on simple L2 counters -(opackets, ipackets) for each traffic direction. - -### Stateful mode - -A mode capable of reacting to incoming traffic. -Contrary to the stateless mode, only UDP and TCP is supported -(carried over IPv4 or IPv6 packets). 
-Performance is limited, as TRex needs to do more CPU processing. -TRex suports two subtypes of stateful traffic, -CSIT uses ASTF (Advanced STateFul mode). - -This mode is suitable for NAT44ED tests, as clients send packets from inside, -and servers react to it, so they see the outside address and port to respond to. -Also, they do not send traffic before NAT44ED has created the corresponding -translation entry. - -When possible, L2 counters (opackets, ipackets) are used. -Some tests need L7 counters, which track protocol state (e.g. TCP), -but those values are less than reliable on high loads. - -## Traffic Continuity - -Generated traffic is either continuous, or limited (by number of transactions). -Both modes support both continuities in principle. - -### Continuous traffic - -Traffic is started without any data size goal. -Traffic is ended based on time duration, as hinted by search algorithm. -This is useful when DUT behavior does not depend on the traffic duration. -The default for stateless mode. - -### Limited traffic - -Traffic has defined data size goal (given as number of transactions), -duration is computed based on this goal. -Traffic is ended when the size goal is reached, -or when the computed duration is reached. -This is useful when DUT behavior depends on traffic size, -e.g. target number of NAT translation entries, each to be hit exactly once -per direction. -This is used mainly for stateful mode. - -## Traffic synchronicity - -Traffic can be generated synchronously (test waits for duration) -or asynchronously (test operates during traffic and stops traffic explicitly). - -### Synchronous traffic - -Trial measurement is driven by given (or precomputed) duration, -no activity from test driver during the traffic. -Used for most trials. - -### Asynchronous traffic - -Traffic is started, but then the test driver is free to perform -other actions, before stopping the traffic explicitly. -This is used mainly by reconf tests, but also by some trials -used for runtime telemetry. - -## Trafic profiles - -TRex supports several ways to define the traffic. -CSIT uses small Python modules based on Scapy as definitions. -Details of traffic profiles depend on modes (STL or ASTF), -but some are common for both modes. - -Search algorithms are intentionally unaware of the traffic mode used, -so CSIT defines some terms to use instead of mode-specific TRex terms. - -### Transactions - -TRex traffic profile defines a small number of behaviors, -in CSIT called transaction templates. Traffic profiles also instruct -TRex how to create a large number of transactions based on the templates. - -Continuous traffic loops over the generated transactions. -Limited traffic usually executes each transaction once -(typically as constant number of loops over source addresses, -each loop with different source ports). - -Currently, ASTF profiles define one transaction template each. -Number of packets expected per one transaction varies based on profile details, -as does the criterion for when a transaction is considered successful. - -Stateless transactions are just one packet (sent from one TG port, -successful if received on the other TG port). -Thus unidirectional stateless profiles define one transaction template, -bidirectional stateless profiles define two transaction templates. - -### TPS multiplier - -TRex aims to open transaction specified by the profile at a steady rate. 
-While TRex allows the transaction template to define its intended "cps" value, -CSIT does not specify it, so the default value of 1 is applied, -meaning TRex will open one transaction per second (and transaction template) -by default. But CSIT invocation uses "multiplier" (mult) argument -when starting the traffic, that multiplies the cps value, -meaning it acts as TPS (transactions per second) input. - -With a slight abuse of nomenclature, bidirectional stateless tests -set "packets per transaction" value to 2, just to keep the TPS semantics -as a unidirectional input value. - -### Duration stretching - -TRex can be IO-bound, CPU-bound, or have any other reason -why it is not able to generate the traffic at the requested TPS. -Some conditions are detected, leading to TRex failure, -for example when the bandwidth does not fit into the line capacity. -But many reasons are not detected. - -Unfortunately, TRex frequently reacts by not honoring the duration -in synchronous mode, taking longer to send the traffic, -leading to lower then requested load offered to DUT. -This usualy breaks assumptions used in search algorithms, -so it has to be avoided. - -For stateless traffic, the behavior is quite deterministic, -so the workaround is to apply a fictional TPS limit (max_rate) -to search algorithms, usually depending only on the NIC used. - -For stateful traffic the behavior is not deterministic enough, -for example the limit for TCP traffic depends on DUT packet loss. -In CSIT we decided to use logic similar to asynchronous traffic. -The traffic driver sleeps for a time, then stops the traffic explicitly. -The library that parses counters into measurement results -than usually treats unsent packets/transactions as lost/failed. - -We have added a IP4base tests for every NAT44ED test, -so that users can compare results. -If the results are very similar, it is probable TRex was the bottleneck. - -### Startup delay - -By investigating TRex behavior, it was found that TRex does not start -the traffic in ASTF mode immediately. There is a delay of zero traffic, -after which the traffic rate ramps up to the defined TPS value. - -It is possible to poll for counters during the traffic -(fist nonzero means traffic has started), -but that was found to influence the NDR results. - -Thus "sleep and stop" stategy is used, which needs a correction -to the computed duration so traffic is stopped after the intended -duration of real traffic. Luckily, it turns out this correction -is not dependend on traffic profile nor CPU used by TRex, -so a fixed constant (0.112 seconds) works well. -Unfortunately, the constant may depend on TRex version, -or execution environment (e.g. TRex in AWS). - -The result computations need a precise enough duration of the real traffic, -luckily server side of TRex has precise enough counter for that. - -It is unknown whether stateless traffic profiles also exhibit a startup delay. -Unfortunately, stateless mode does not have similarly precise duration counter, -so some results (mostly MRR) are affected by less precise duration measurement -in Python part of CSIT code. - -## Measuring Latency - -If measurement of latency is requested, two more packet streams are -created (one for each direction) with TRex flow_stats parameter set to -STLFlowLatencyStats. In that case, returned statistics will also include -min/avg/max latency values and encoded HDRHistogram data. 
\ No newline at end of file diff --git a/docs/content/methodology/tunnel_encapsulations.md b/docs/content/methodology/tunnel_encapsulations.md deleted file mode 100644 index 52505b7efb..0000000000 --- a/docs/content/methodology/tunnel_encapsulations.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Tunnel Encapsulations" -weight: 10 ---- - -# Tunnel Encapsulations - -Tunnel encapsulations testing is grouped based on the type of outer -header: IPv4 or IPv6. - -## IPv4 Tunnels - -VPP is tested in the following IPv4 tunnel baseline configurations: - -- *ip4vxlan-l2bdbase*: VXLAN over IPv4 tunnels with L2 bridge-domain MAC - switching. -- *ip4vxlan-l2xcbase*: VXLAN over IPv4 tunnels with L2 cross-connect. -- *ip4lispip4-ip4base*: LISP over IPv4 tunnels with IPv4 routing. -- *ip4lispip6-ip6base*: LISP over IPv4 tunnels with IPv6 routing. -- *ip4gtpusw-ip4base*: GTPU over IPv4 tunnels with IPv4 routing. - -In all cases listed above low number of MAC, IPv4, IPv6 flows (253 or 254 per -direction) is switched or routed by VPP. - -In addition selected IPv4 tunnels are tested at scale: - -- *dot1q--ip4vxlanscale-l2bd*: VXLAN over IPv4 tunnels with L2 bridge- - domain MAC switching, with scaled up dot1q VLANs (10, 100, 1k), - mapped to scaled up L2 bridge-domains (10, 100, 1k), that are in turn - mapped to (10, 100, 1k) VXLAN tunnels. 64.5k flows are transmitted per - direction. - -## IPv6 Tunnels - -VPP is tested in the following IPv6 tunnel baseline configurations: - -- *ip6lispip4-ip4base*: LISP over IPv4 tunnels with IPv4 routing. -- *ip6lispip6-ip6base*: LISP over IPv4 tunnels with IPv6 routing. - -In all cases listed above low number of IPv4, IPv6 flows (253 or 254 per -direction) is routed by VPP. diff --git a/docs/content/methodology/vpp_device_functional.md b/docs/content/methodology/vpp_device_functional.md deleted file mode 100644 index 2bad5973b6..0000000000 --- a/docs/content/methodology/vpp_device_functional.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: "VPP_Device Functional" -weight: 18 ---- - -# VPP_Device Functional - -Includes VPP_Device test environment for functional VPP -device tests integrated into LFN CI/CD infrastructure. VPP_Device tests -run on 1-Node testbeds (1n-skx, 1n-arm) and rely on Linux SRIOV Virtual -Function (VF), dot1q VLAN tagging and external loopback cables to -facilitate packet passing over external physical links. Initial focus is -on few baseline tests. New device tests can be added by small edits -to existing CSIT Performance (2-node) test. RF test definition code -stays unchanged with the exception of traffic generator related L2 KWs. diff --git a/docs/content/methodology/vpp_forwarding_modes.md b/docs/content/methodology/vpp_forwarding_modes.md deleted file mode 100644 index 1cc199c607..0000000000 --- a/docs/content/methodology/vpp_forwarding_modes.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: "VPP Forwarding Modes" -weight: 3 ---- - -# VPP Forwarding Modes - -VPP is tested in a number of L2, IPv4 and IPv6 packet lookup and -forwarding modes. Within each mode baseline and scale tests are -executed, the latter with varying number of FIB entries. - -## L2 Ethernet Switching - -VPP is tested in three L2 forwarding modes: - -- *l2patch*: L2 patch, the fastest point-to-point L2 path that loops - packets between two interfaces without any Ethernet frame checks or - lookups. -- *l2xc*: L2 cross-connect, point-to-point L2 path with all Ethernet - frame checks, but no MAC learning and no MAC lookup. 
-- *l2bd*: L2 bridge-domain, multipoint-to-multipoint L2 path with all - Ethernet frame checks, with MAC learning (unless static MACs are used) - and MAC lookup. - -l2bd tests are executed in baseline and scale configurations: - -- *l2bdbase*: Two MAC FIB entries are learned by VPP to enable packet - switching between two interfaces in two directions. VPP L2 switching - is tested with 254 IPv4 unique flows per direction, varying IPv4 - source address per flow in order to invoke RSS based packet - distribution across VPP workers. The same source and destination MAC - address is used for all flows per direction. IPv4 source address is - incremented for every packet. - -- *l2bdscale*: A high number of MAC FIB entries are learned by VPP to - enable packet switching between two interfaces in two directions. - Tested MAC FIB sizes include: i) 10k with 5k unique flows per - direction, ii) 100k with 2 x 50k flows and iii) 1M with 2 x 500k - flows. Unique flows are created by using distinct source and - destination MAC addresses that are changed for every packet using - incremental ordering, making VPP learn (or refresh) distinct src MAC - entries and look up distinct dst MAC entries for every packet. For - details, see - [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}). - -Ethernet wire encapsulations tested include: untagged, dot1q, dot1ad. - -## IPv4 Routing - -IPv4 routing tests are executed in baseline and scale configurations: - -- *ip4base*: Two /32 IPv4 FIB entries are configured in VPP to enable - packet routing between two interfaces in two directions. VPP routing - is tested with 253 IPv4 unique flows per direction, varying IPv4 - source address per flow in order to invoke RSS based packet - distribution across VPP workers. IPv4 source address is incremented - for every packet. - -- *ip4scale*: A high number of /32 IPv4 FIB entries are configured in - VPP. Tested IPv4 FIB sizes include: i) 20k with 10k unique flows per - direction, ii) 200k with 2 * 100k flows and iii) 2M with 2 * 1M - flows. Unique flows are created by using distinct IPv4 destination - addresses that are changed for every packet, using incremental or - random ordering. For details, see - [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}). - -## IPv6 Routing - -Similarly to IPv4, IPv6 routing tests are executed in baseline and scale -configurations: - -- *ip6base*: Two /128 IPv4 FIB entries are configured in VPP to enable - packet routing between two interfaces in two directions. VPP routing - is tested with 253 IPv6 unique flows per direction, varying IPv6 - source address per flow in order to invoke RSS based packet - distribution across VPP workers. IPv6 source address is incremented - for every packet. - -- *ip4scale*: A high number of /128 IPv6 FIB entries are configured in - VPP. Tested IPv6 FIB sizes include: i) 20k with 10k unique flows per - direction, ii) 200k with 2 * 100k flows and iii) 2M with 2 * 1M - flows. Unique flows are created by using distinct IPv6 destination - addresses that are changed for every packet, using incremental or - random ordering. For details, see - [Packet Flow Ordering]({{< ref "packet_flow_ordering#Packet Flow Ordering" >}}). 
- -## SRv6 Routing - -SRv6 routing tests are executed in a number of baseline configurations, -in each case SR policy and steering policy are configured for one -direction and one (or two) SR behaviours (functions) in the other -directions: - -- *srv6enc1sid*: One SID (no SRH present), one SR function - End. -- *srv6enc2sids*: Two SIDs (SRH present), two SR functions - End and - End.DX6. -- *srv6enc2sids-nodecaps*: Two SIDs (SRH present) without decapsulation, - one SR function - End. -- *srv6proxy-dyn*: Dynamic SRv6 proxy, one SR function - End.AD. -- *srv6proxy-masq*: Masquerading SRv6 proxy, one SR function - End.AM. -- *srv6proxy-stat*: Static SRv6 proxy, one SR function - End.AS. - -In all listed cases low number of IPv6 flows (253 per direction) is -routed by VPP. diff --git a/docs/content/methodology/vpp_startup_settings.md b/docs/content/methodology/vpp_startup_settings.md deleted file mode 100644 index 6e40091a6c..0000000000 --- a/docs/content/methodology/vpp_startup_settings.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: "VPP Startup Settings" -weight: 17 ---- - -# VPP Startup Settings - -CSIT code manipulates a number of VPP settings in startup.conf for -optimized performance. List of common settings applied to all tests and -test dependent settings follows. - -## Common Settings - -List of VPP startup.conf settings applied to all tests: - -1. heap-size - set separately for ip4, ip6, stats, main - depending on scale tested. -2. no-tx-checksum-offload - disables UDP / TCP TX checksum offload in - DPDK. Typically needed for use faster vector PMDs (together with - no-multi-seg). -3. buffers-per-numa - sets a number of memory buffers allocated - to VPP per CPU socket. VPP default is 16384. Needs to be increased for - scenarios with large number of interfaces and worker threads. To - accommodate for scale tests, CSIT is setting it to the maximum possible - value corresponding to the limit of DPDK memory mappings (currently - 256). For Xeon Skylake platforms configured with 2MB hugepages and VPP - data-size and buffer-size defaults (2048B and 2496B respectively), this - results in value of 215040 (256 * 840 = 215040, 840 * 2496B buffers fit - in 2MB hugepage). - -## Per Test Settings - -List of vpp startup.conf settings applied dynamically per test: - -1. corelist-workers - list of logical cores to run VPP - worker data plane threads. Depends on HyperThreading and core per - test configuration. -2. num-rx-queues - depends on a number of VPP threads and NIC - interfaces. -3. no-multi-seg - disables multi-segment buffers in DPDK, improves - packet throughput, but disables Jumbo MTU support. Disabled for all - tests apart from the ones that require Jumbo 9000B frame support. -4. UIO driver - depends on topology file definition. -5. QAT VFs - depends on NRThreads, each thread = 1QAT VFs. -- cgit 1.2.3-korg