diff options
-rw-r--r-- | resources/libraries/bash/entry/bisect.sh | 188 | ||||
-rw-r--r-- | resources/libraries/bash/function/common.sh | 20 | ||||
-rw-r--r-- | resources/libraries/bash/function/per_patch.sh | 59 | ||||
-rw-r--r-- | resources/libraries/python/model/parse.py | 108 | ||||
-rw-r--r-- | resources/tools/integrated/compare_bisect.py | 132 | ||||
-rw-r--r-- | resources/tools/integrated/compare_perpatch.py | 58 |
6 files changed, 508 insertions, 57 deletions
diff --git a/resources/libraries/bash/entry/bisect.sh b/resources/libraries/bash/entry/bisect.sh new file mode 100644 index 0000000000..d5cb1d51ba --- /dev/null +++ b/resources/libraries/bash/entry/bisect.sh @@ -0,0 +1,188 @@ +#!/usr/bin/env bash + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -exuo pipefail + +# This entry script does not change which CSIT branch is used, +# use "with_oper_for_vpp.sh" wrapper for that. +# +# This script is to be used for locating performance regressions +# (or breakages, or progressions, or fixes). +# It uses "git bisect" commands on the VPP repository, +# between the triggered VPP patch and a commit specified in the first argument +# of the gerrit comment text. +# The other arguments are used as tag expressions for selecting tests as usual. +# Many different result types are supported. +# +# Logs are present in the archive directory, but usually the main output +# is the offending commit as identified by "git bisect", visible in console. +# +# While selecting just one testcase is the intended use, +# this script should be able to deal with multiple testcases as well, +# grouping all the values together. This usually inflates +# the standard deviation, but it is not clear how that affects the bisection. 
+# +# For the bisection decision, jumpavg library is used, +# deciding whether shorter description is achieved by forcefully grouping +# the middle results with the old, or with the new ones. +# If the shortest description is achieved with 3 separate groups, +# bisect interval focuses on biggest relative change +# (with respect to pairwise maximum). +# +# If a test fails, an artificial result is used to distinguish +# from normal results. Currently, the value 1.0, with the multiplicity of 4. +# +# Note that if there was a VPP API change that affects tests in the interval, +# there frequently is no good way for single CSIT commit to work there. +# You can try manually reverting the CSIT changes to make tests pass, +# possibly needing to search over multiple subintervals. +# Using an older CSIT commit (possibly cherry-picking the bisect Change +# if it was not present in CSIT compatible with old enough VPP builds) +# is the fastest solution; but beware of CSIT-induced performance effects +# (e.g. TRex settings). +# +# If a regression happens during a subinterval where the test fails +# due to a bug in VPP, you may try to create a new commit chain +# with the fix cherry-picked to the start of the interval. +# Do not do that as a chain in Gerrit, it would be long and Gerrit will refuse +# edits of already merged Changes. +# Instead, add a block of bash code to do the manipulation +# on local git history between checkout and bisect. +# +# At the start, the script executes first bisect iteration in an attempt +# to avoid work if the search interval has only one commit (or is invalid). +# Only when the work is needed, earliest and latest commits are built +# and tested. Branches "earliest", "middle" and "latest" are temporarily created +# as a way to remember which commits to check out. +# +# Test results are parsed from json files, +# symlinks are used to tell python script which results to compare. 
+# +# Assumptions: +# + There is a directory holding VPP repo with patch under test checked out. +# + It contains csit subdirectory with CSIT code to use (this script is there). +# + Everything needed to build VPP is already installed locally. +# Consequences: +# + Working directory is switched to the VPP repo root. +# + At the end, VPP repo has checked out and built some commit, +# as chosen by "git bisect". +# + Directories build_root, build and csit are reset during the run. +# + The following directories (relative to VPP repo) are (re)created: +# ++ csit_{earliest,middle,latest}, build_{earliest,latest}, +# ++ archive, csit/archive, csit/download_dir. +# + Symlinks csit_{early,late,mid} are also created. +# Arguments: +# - ${1} - If present, override JOB_NAME to simplify manual usage. + +# "set -eu" handles failures from the following two lines. +BASH_ENTRY_DIR="$(dirname $(readlink -e "${BASH_SOURCE[0]}"))" +BASH_FUNCTION_DIR="$(readlink -e "${BASH_ENTRY_DIR}/../function")" +source "${BASH_FUNCTION_DIR}/common.sh" || { + echo "Source failed." >&2 + exit 1 +} +source "${BASH_FUNCTION_DIR}/per_patch.sh" || die "Source failed." +# Cleanup needs ansible. +source "${BASH_FUNCTION_DIR}/ansible.sh" || die "Source failed." +common_dirs || die +check_prerequisites || die +set_perpatch_vpp_dir || die +get_test_code "${1-}" || die +get_test_tag_string || die +# Unfortunately, git bisect only works at the top of the repo. +cd "${VPP_DIR}" || die + +# Save the current commit. +git checkout -b "latest" +# Save the lower bound commit. +git checkout -b "earliest" +git reset --hard "${GIT_BISECT_FROM}" + +# This is the place for custom code manipulating local git history. + +#git checkout -b "alter" +#... +#git checkout "latest" +#git rebase "alter" || git rebase --skip +#git branch -D "alter" + +git bisect start || die +# TODO: Can we add a trap for "git bisect reset" or even "deactivate", +# without affecting the inner trap for unreserve and cleanup? 
+git checkout "latest" +git status || die +git describe || die +git bisect new || die +# Performing first iteration early to avoid testing or even building. +git checkout "earliest" || die "Failed to checkout earliest commit." +git status || die +git describe || die +# The first iteration. +git bisect old | tee "git.log" || die "Invalid bisect interval?" +git checkout -b "middle" || die "Failed to create branch: middle" +git status || die +git describe || die +if head -n 1 "git.log" | cut -b -11 | fgrep -q "Bisecting:"; then + echo "Building and testing initial bounds." +else + echo "Single commit, no work needed." + exit 0 +fi +# Building latest first, good for avoiding DPDK rebuilds. +git checkout "latest" || die "Failed to checkout latest commit." +build_vpp_ubuntu "LATEST" || die +set_aside_build_artifacts "latest" || die +git checkout "earliest" || die "Failed to checkout earliest commit." +git status || die +git describe || die +build_vpp_ubuntu "EARLIEST" || die +set_aside_build_artifacts "earliest" || die +git checkout "middle" || die "Failed to checkout middle commit." +git branch -D "earliest" "latest" || die "Failed to remove branches." +# Done with repo manipulation for now, testing commences. +initialize_csit_dirs "earliest" "middle" "latest" || die +set_perpatch_dut || die +select_topology || die +select_arch_os || die +activate_virtualenv "${VPP_DIR}" || die +generate_tests || die +archive_tests || die + +# TODO: Does it matter which build is tested first? + +select_build "build_earliest" || die +check_download_dir || die +reserve_and_cleanup_testbed || die +run_robot || die +move_test_results "csit_earliest" || die +ln -s -T "csit_earliest" "csit_early" || die + +# Explicit cleanup, in case the previous test left the testbed in a bad shape. 
+ansible_playbook "cleanup" + +select_build "build_latest" || die +check_download_dir || die +run_robot || die +move_test_results "csit_latest" || die +ln -s -T "csit_latest" "csit_late" || die +untrap_and_unreserve_testbed || die + +# See function documentation for the logic in the loop. +main_bisect_loop || die +# In worst case, the middle branch is still checked out. +# TODO: Is there a way to ensure "middle" branch is always deleted? +git branch -D "middle" || true +# Delete symlinks to prevent duplicate archiving. +rm -vrf "csit_early" "csit_late" "csit_mid" diff --git a/resources/libraries/bash/function/common.sh b/resources/libraries/bash/function/common.sh index 44149ca6e1..c2b169f550 100644 --- a/resources/libraries/bash/function/common.sh +++ b/resources/libraries/bash/function/common.sh @@ -290,7 +290,7 @@ function compose_robot_arguments () { *"device"*) ROBOT_ARGS+=("--suite" "tests.${DUT}.device") ;; - *"perf"*) + *"perf"* | *"bisect"*) ROBOT_ARGS+=("--suite" "tests.${DUT}.perf") ;; *) @@ -557,6 +557,8 @@ function get_test_tag_string () { # Variables set: # - TEST_TAG_STRING - The string following trigger word in gerrit comment. # May be empty, or even not set on event types not adding comment. + # - GIT_BISECT_FROM - If bisecttest, the commit hash to bisect from. + # Else not set. # Variables exported optionally: # - GRAPH_NODE_VARIANT - Node variant to test with, set if found in trigger. @@ -566,6 +568,10 @@ function get_test_tag_string () { if [[ "${GERRIT_EVENT_TYPE-}" == "comment-added" ]]; then case "${TEST_CODE}" in + # Order matters, bisect job contains "perf" in its name. + *"bisect"*) + trigger="bisecttest" + ;; *"device"*) trigger="devicetest" ;; @@ -591,6 +597,18 @@ function get_test_tag_string () { comment=$(fgrep "${trigger}" <<< "${comment}" || true) TEST_TAG_STRING=$("${cmd[@]}" <<< "${comment}" || true) fi + if [[ "${trigger}" == "bisecttest" ]]; then + # Intentionally without quotes, so spaces delimit elements. 
+ test_tag_array=(${TEST_TAG_STRING}) || die "How could this fail?" + # First "argument" of bisecttest is a commit hash. + GIT_BISECT_FROM="${test_tag_array[0]}" || { + die "Bisect job requires commit hash." + } + # Update the tag string (tag expressions only, no commit hash). + TEST_TAG_STRING="${test_tag_array[@]:1}" || { + die "Bisect job needs a single test, no default." + } + fi if [[ -n "${TEST_TAG_STRING-}" ]]; then test_tag_array=(${TEST_TAG_STRING}) if [[ "${test_tag_array[0]}" == "icl" ]]; then diff --git a/resources/libraries/bash/function/per_patch.sh b/resources/libraries/bash/function/per_patch.sh index b9680a1560..44bd57da80 100644 --- a/resources/libraries/bash/function/per_patch.sh +++ b/resources/libraries/bash/function/per_patch.sh @@ -110,6 +110,65 @@ function initialize_csit_dirs () { } +function main_bisect_loop () { + + # Perform the iterative part of bisect entry script. + # + # The logic is too complex to remain in the entry script. + # + # At the start, the loop assumes git bisect old/new has just been executed, + # and verified more iterations are needed. + # The iteration cleans the build directory and builds the new mid commit. + # Then, testbed is reserved, tests run, and testbed unreserved. + # Results are moved from default to archive location + # (indexed by iteration number) and analyzed. + # The new adjective ("old" or "new") is selected, + # and git bisect with the adjective is executed. + # The symlinks csit_early and csit_late are updated to tightest bounds. + # The git.log file is examined and if the bisect is finished, loop ends. 
+ + iteration=0 + while true + do + let iteration+=1 + git clean -dffx "build"/ "build-root"/ || die + build_vpp_ubuntu "MIDDLE" || die + select_build "build-root" || die + check_download_dir || die + reserve_and_cleanup_testbed || die + run_robot || die + move_test_results "csit_middle/${iteration}" || die + untrap_and_unreserve_testbed || die + rm -vf "csit_mid" || die + ln -s -T "csit_middle/${iteration}" "csit_mid" || die + set +e + python3 "${TOOLS_DIR}/integrated/compare_bisect.py" + bisect_rc="${?}" + set -e + if [[ "${bisect_rc}" == "3" ]]; then + adjective="new" + rm -v "csit_late" || die + ln -s -T "csit_middle/${iteration}" "csit_late" || die + elif [[ "${bisect_rc}" == "0" ]]; then + adjective="old" + rm -v "csit_early" || die + ln -s -T "csit_middle/${iteration}" "csit_early" || die + else + die "Unexpected return code: ${bisect_rc}" + fi + git bisect "${adjective}" | tee "git.log" || die + git describe || die + git status || die + if head -n 1 "git.log" | cut -b -11 | fgrep -q "Bisecting:"; then + echo "Still bisecting..." + else + echo "Bisecting done." + break + fi + done +} + + function move_test_results () { # Arguments: diff --git a/resources/libraries/python/model/parse.py b/resources/libraries/python/model/parse.py new file mode 100644 index 0000000000..b2e8da67ea --- /dev/null +++ b/resources/libraries/python/model/parse.py @@ -0,0 +1,108 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Library for parsing results from JSON back to python objects. + +This is useful for vpp-csit jobs like per-patch performance verify. +Such jobs invoke robot multiple times, each time on a different build. +Each robot invocation may execute several test cases. +How exactly the results are compared depends on the job type, +but extracting just the main results from jsons (file trees) is a common task, +so it is placed into this library. + +As such, the code in this file does not directly interact +with the code in other files in this directory +(result comparison is done outside robot invocation), +but all files share common assumptions about json structure. + +The function here expects a particular tree created on a filesystem by +a bootstrap script, including test results +exported as json files according to a current model schema. +This script extracts the results (according to result type) +and joins them into a mapping from test IDs to lists of floats. +Also, the result is cached into a results.json file, +so each tree is parsed only once. + +The cached result does not depend on tree placement, +so the bootstrap script may move and copy trees around +before or after parsing. +""" + +import json +import os +import pathlib + +from typing import Dict, List + + +def parse(dirpath: str, fake_value: float = 1.0) -> Dict[str, List[float]]: + """Look for test jsons, extract scalar results. + + Files other than .json are skipped, jsons without test_id are skipped. + If the test failed, four fake values are used as a fake result. + + Units are ignored, as both parent and current are tested + with the same CSIT code so the unit should be identical. + + The result is also cached as results.json file. + + :param dirpath: Path to the directory tree to examine. + :param fake_value: Fake value to use for test cases that failed. + :type dirpath: str + :type fake_value: float + :returns: Mapping from test IDs to list of measured values. 
+ :rtype: Dict[str, List[float]] + :raises RuntimeError: On duplicate test ID or unknown test type. + """ + if not pathlib.Path(dirpath).is_dir(): + # This happens when per-patch runs out of iterations. + return {} + resultpath = pathlib.Path(f"{dirpath}/results.json") + if resultpath.is_file(): + with open(resultpath, "rt", encoding="utf8") as file_in: + return json.load(file_in) + results = {} + for root, _, files in os.walk(dirpath): + for filename in files: + if not filename.endswith(".json"): + continue + filepath = os.path.join(root, filename) + with open(filepath, "rt", encoding="utf8") as file_in: + data = json.load(file_in) + if "test_id" not in data: + continue + name = data["test_id"] + if name in results: + raise RuntimeError(f"Duplicate: {name}") + if not data["passed"]: + results[name] = [fake_value] * 4 + continue + result_object = data["result"] + result_type = result_object["type"] + if result_type == "mrr": + results[name] = result_object["receive_rate"]["rate"]["values"] + elif result_type == "ndrpdr": + results[name] = [result_object["pdr"]["lower"]["rate"]["value"]] + elif result_type == "soak": + results[name] = [ + result_object["critical_rate"]["lower"]["rate"]["value"] + ] + elif result_type == "reconf": + results[name] = [result_object["loss"]["time"]["value"]] + elif result_type == "hoststack": + results[name] = [result_object["bandwidth"]["value"]] + else: + raise RuntimeError(f"Unknown result type: {result_type}") + with open(resultpath, "wt", encoding="utf8") as file_out: + json.dump(results, file_out, indent=1, separators=(", ", ": ")) + return results diff --git a/resources/tools/integrated/compare_bisect.py b/resources/tools/integrated/compare_bisect.py new file mode 100644 index 0000000000..247ba507ef --- /dev/null +++ b/resources/tools/integrated/compare_bisect.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script for analyzing 3 result sets for "git bisect" purposes. + +Jumpavg library is used for comparing description length of three groupings. +The mid result is grouped with early or late result, or as a separate group. +The jump we are looking for is between the mid and the smaller group +of the grouping with fewer bits. +Except when a grouping with all three sets as separate groups is the smallest. +In that case we choose the bigger difference in averages. +""" + +import sys + +from typing import List, Tuple + +from resources.libraries.python import jumpavg +from resources.libraries.python.model.parse import parse + + +def read_from_dir(dirname: str) -> Tuple[List[float], float]: + """Parse samples from dir, print them and stats, return them as list. + + In case there are more test cases, their results are concatenated. + + :param dirname: The directory name (maybe with path) to parse. + :type dirname: str + :returns: The samples, deserialized from json, and the average. + :rtype: Tuple[List[float], float] + :raises RuntimeError: On parsing error. + """ + results = parse(dirname) + samples = [] + for result in results.values(): + samples.extend(result) + print(f"Read {dirname}: {samples!r}") + stats = jumpavg.AvgStdevStats.for_runs(samples) + print(f"Stats: {stats!r}") + return samples, stats.avg + + +def main() -> int: + """Execute the main logic, return the return code. 
+ + :returns: The return code, 0 or 3 depending on the comparison result. + :rtype: int + """ + early_results, early_avg = read_from_dir("csit_early") + late_results, late_avg = read_from_dir("csit_late") + mid_results, mid_avg = read_from_dir("csit_mid") + rel_diff_to_early = abs(early_avg - mid_avg) / max(early_avg, mid_avg) + rel_diff_to_late = abs(late_avg - mid_avg) / max(late_avg, mid_avg) + max_value = max(early_results + mid_results + late_results) + # Create a common group list with just the early group. + common_group_list = jumpavg.BitCountingGroupList( + max_value=max_value + ).append_group_of_runs(early_results) + # Try grouping the mid with the early. + early_group_list = common_group_list.copy() + early_group_list.extend_runs_to_last_group(mid_results) + early_group_list.append_group_of_runs(late_results) + early_bits = early_group_list.bits + print(f"Early group list bits: {early_bits}") + # Now the same, but grouping the mid with the late. + late_group_list = common_group_list.copy() + late_group_list.append_group_of_runs(mid_results) + late_group_list.extend_runs_to_last_group(late_results) + late_bits = late_group_list.bits + print(f"Late group list bits: {late_bits}") + # Finally, group each separately, as if double anomaly happened. + double_group_list = common_group_list.copy() + double_group_list.append_group_of_runs(mid_results) + double_group_list.append_group_of_runs(late_results) + double_bits = double_group_list.bits + print(f"Double group list bits: {double_bits}") + single_bits = min(early_bits, late_bits) + if double_bits <= single_bits: + # In this case, comparing early_bits with late_bits is not the best, + # as that would probably select based on stdev, not based on diff. + # Example: mid (small stdev) is closer to early (small stdev), + # and farther from late (big stdev). + # As grouping mid with early would increase their combined stdev, + # it is not selected. 
This means a noisy late bound can affect + # what human perceives as the more interesting region. + # So we select only based on averages. + print("Perhaps two different anomalies. Selecting by averages only.") + diff = single_bits - double_bits + print(f"Saved {diff} ({100*diff/single_bits}%) bits.") + if rel_diff_to_early > rel_diff_to_late: + print("The mid results are considered late.") + print("Preferring relative difference of averages:") + print(f"{100*rel_diff_to_early}% to {100*rel_diff_to_late}%.") + # rc==1 is when command is not found. + # rc==2 is when python interpreter does not find the script. + exit_code = 3 + else: + print("The mid results are considered early.") + print("Preferring relative difference of averages:") + print(f"{100*rel_diff_to_late}% to {100*rel_diff_to_early}%.") + exit_code = 0 + else: + # When difference of averages is within stdev, + # we let jumpavg decide, as here difference in stdev + # can be the more interesting signal. + diff = early_bits - late_bits + if early_bits > late_bits: + print("The mid results are considered late.") + print(f"Saved {diff} ({100*diff/early_bits}%) bits.") + print(f"New relative difference is {100*rel_diff_to_early}%.") + exit_code = 3 + else: + print("The mid results are considered early.") + print(f"Saved {-diff} ({-100*diff/late_bits}%) bits.") + print(f"New relative difference is {100*rel_diff_to_late}%.") + exit_code = 0 + print(f"Exit code {exit_code}") + return exit_code + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/resources/tools/integrated/compare_perpatch.py b/resources/tools/integrated/compare_perpatch.py index 0adb6ae73e..9b04b7bdea 100644 --- a/resources/tools/integrated/compare_perpatch.py +++ b/resources/tools/integrated/compare_perpatch.py @@ -16,7 +16,7 @@ This script expects a particular tree created on a filesystem by per_patch_perf.sh bootstrap script, including test results exported as json files according to a current model schema. 
-This script extracts the results (according to tresult type) +This script extracts the results (according to result type) and joins them into one list of floats for parent and one for current. This script then uses jumpavg library to determine whether there was @@ -26,64 +26,10 @@ If the set of test names does not match, or there was a regression, this script votes -1 (by exiting with code 1), otherwise it votes +1 (exit 0). """ -import json -import os import sys -from typing import Dict, List - from resources.libraries.python import jumpavg - - -def parse(dirpath: str, fake_value: float) -> Dict[str, List[float]]: - """Looks for test jsons, extract scalar results. - - Files other than .json are skipped, jsons without test_id are skipped. - If the test failed, four fake values are used as a fake result. - - Units are ignored, as both parent and current are tested - with the same CSIT code so the unit should be identical. - - :param dirpath: Path to the directory tree to examine. - :param fail_value: Fake value to use for test cases that failed. - :type dirpath: str - :returns: Mapping from test IDs to list of measured values. - :rtype: Dict[str, List[float]] - :raises RuntimeError: On duplicate test ID or unknown test type. 
- """ - results = {} - for root, _, files in os.walk(dirpath): - for filename in files: - if not filename.endswith(".json"): - continue - filepath = os.path.join(root, filename) - with open(filepath, "rt", encoding="utf8") as file_in: - data = json.load(file_in) - if "test_id" not in data: - continue - name = data["test_id"] - if name in results: - raise RuntimeError(f"Duplicate: {name}") - if not data["passed"]: - results[name] = [fake_value] * 4 - continue - result_object = data["result"] - result_type = result_object["type"] - if result_type == "mrr": - results[name] = result_object["receive_rate"]["rate"]["values"] - elif result_type == "ndrpdr": - results[name] = [result_object["pdr"]["lower"]["rate"]["value"]] - elif result_type == "soak": - results[name] = [ - result_object["critical_rate"]["lower"]["rate"]["value"] - ] - elif result_type == "reconf": - results[name] = [result_object["loss"]["time"]["value"]] - elif result_type == "hoststack": - results[name] = [result_object["bandwidth"]["value"]] - else: - raise RuntimeError(f"Unknown result type: {result_type}") - return results +from resources.libraries.python.model.parse import parse def main() -> int: |