summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Andrew Yourtchenko <ayourtch@gmail.com> 2019-11-05 01:08:26 +0100
committer: Andrew Yourtchenko <ayourtch@gmail.com> 2019-11-18 13:17:27 +0000
commit: 7f9603d90a68a7905e3e8d6095806324b8a1e169 (patch)
tree: a2ce43d92e688815f31486fd078dc00747c52395
parent: 22dec9695162a0af2a4709762cdc767f08819895 (diff)
tests: avoid test runner hanging on child test process join
In a parallel test run, a single parent process spawns a number of child processes that run the tests, and communicates with them. When a child process signals that it has finished, the parent calls child.join(). Sometimes this join never returns. The result is a lot of defunct python processes, and the test run just hangs. I have seen this failure intermittently a fair bit in a busy containerized environment, and by chance consistently reproduced it on a Thinkpad X280 with 8G of RAM and Ubuntu 19.04, which allowed me to diagnose it. To avoid the hang, the join is now called with a timeout (test_finished_join_timeout, 15 seconds), and an error is logged if the timeout expires.

Type: test

Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
Change-Id: If0a3110fc2d23e73d77c310d61c3ea90a2b53610
(cherry picked from commit 42693521f6046997133c8f63bcfc9d615d96f69d)
-rw-r--r-- test/run_tests.py 14
1 files changed, 13 insertions, 1 deletions
diff --git a/test/run_tests.py b/test/run_tests.py
index e6a182c016b..c9c5bdb8a6e 100644
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -433,7 +433,17 @@ def run_forked(testcase_suites):
results) or stop_run
for finished_testcase in finished_testcase_suites:
- finished_testcase.child.join()
+ # Somewhat surprisingly, the join below may
+ # timeout, even if client signaled that
+ # it finished - so we note it just in case.
+ join_start = time.time()
+ finished_testcase.child.join(test_finished_join_timeout)
+ join_end = time.time()
+ if join_end - join_start >= test_finished_join_timeout:
+ finished_testcase.logger.error(
+ "Timeout joining finished test: %s (pid %d)" %
+ (finished_testcase.last_test,
+ finished_testcase.child.pid))
finished_testcase.close_pipes()
wrapped_testcase_suites.remove(finished_testcase)
finished_unread_testcases.add(finished_testcase)
@@ -726,6 +736,8 @@ if __name__ == '__main__':
test_timeout = parse_digit_env("TIMEOUT", 600) # default = 10 minutes
+ test_finished_join_timeout = 15
+
retries = parse_digit_env("RETRIES", 0)
debug = os.getenv("DEBUG", "n").lower() in ["gdb", "gdbserver"]