Reland "[test] Print hanging tests on linux on test-runner termination"

This is a reland of 3fc9663159

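The private method _on_event in TimeoutProc is now renamed to __on_event to
be truly private, so that it no longer clashes with the new _on_event(event)
hook added to the base TestProc.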

Original change's description:
> [test] Print hanging tests on linux on test-runner termination
>
> This will print the list of processes still running before and after
> joining workers during termination. This will help debugging hanging
> tests during flake-bisect or with num-fuzzer, which both terminate
> on total timeout and currently still sometimes hang without printing
> processes.
>
> Bug: v8:8292
> Change-Id: I124b65fa35b8d7a6aa198fcf50f2c20df94dc51a
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1735312
> Reviewed-by: Tamer Tas <tmrts@chromium.org>
> Commit-Queue: Michael Achenbach <machenbach@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#63065}

Bug: v8:8292
Change-Id: Ibad1172666d6f4d2c07884a54edfe9d6499b57fe
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1735318
Reviewed-by: Tamer Tas <tmrts@chromium.org>
Commit-Queue: Michael Achenbach <machenbach@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63072}
This commit is contained in:
Michael Achenbach 2019-08-05 12:58:34 +02:00 committed by Commit Bot
parent 8c3da74f18
commit cdfadf4a99
5 changed files with 34 additions and 7 deletions

View File

@ -115,7 +115,15 @@ class Pool():
# Necessary to not overflow the queue's pipe if a keyboard interrupt happens.
BUFFER_FACTOR = 4
def __init__(self, num_workers, heartbeat_timeout=1):
def __init__(self, num_workers, heartbeat_timeout=1, notify_fun=None):
"""
Args:
num_workers: Number of worker processes to run in parallel.
heartbeat_timeout: Timeout in seconds for waiting for results. Each time
the timeout is reached, a heartbeat is signalled and timeout is reset.
notify_fun: Callable called to signal some events like termination. The
event name is passed as string.
"""
self.num_workers = num_workers
self.processes = []
self.terminated = False
@ -130,6 +138,7 @@ class Pool():
# work_queue.
self.processing_count = 0
self.heartbeat_timeout = heartbeat_timeout
self.notify = notify_fun or (lambda x: x)
# Disable sigint and sigterm to prevent subprocesses from capturing the
# signals.
@ -261,11 +270,13 @@ class Pool():
for p in self.processes:
os.kill(p.pid, signal.SIGTERM)
self.notify("Joining workers")
for p in self.processes:
p.join()
# Drain the queues to prevent stderr chatter when queues are garbage
# collected.
self.notify("Draining queues")
try:
while True: self.work_queue.get(False)
except:

View File

@ -109,6 +109,19 @@ class TestProc(object):
### Communication
def notify_previous(self, event):
"""Handles the event on this processor and forwards it to the previous one.
The event is first passed to this processor's _on_event, then handed on to
_prev_proc.notify_previous if a previous processor is set.
Args:
event: The signalled event; passed through unchanged.
"""
self._on_event(event)
if self._prev_proc:
self._prev_proc.notify_previous(event)
def _on_event(self, event):
"""Called when processors to the right signal events, e.g. termination.
Invoked from notify_previous. This implementation is a no-op.
Args:
event: A text describing the signalled event.
"""
pass
def _send_test(self, test):
"""Helper method for sending test to the next processor.
Args:
test: The test to hand to the next processor.
Returns:
Whatever the next processor's next_test(test) returns.
"""
return self._next_proc.next_test(test)
@ -120,7 +133,6 @@ class TestProc(object):
self._prev_proc.result_for(test, result)
class TestProcObserver(TestProc):
"""Processor used for observing the data."""
def __init__(self):

View File

@ -45,7 +45,7 @@ class ExecutionProc(base.TestProc):
def __init__(self, jobs, outproc_factory=None):
super(ExecutionProc, self).__init__()
self._pool = pool.Pool(jobs)
self._pool = pool.Pool(jobs, notify_fun=self.notify_previous)
self._outproc_factory = outproc_factory or (lambda t: t.output_proc)
self._tests = {}

View File

@ -149,6 +149,10 @@ class VerboseProgressIndicator(SimpleProgressIndicator):
self._print('Still working...')
self._print_processes_linux()
def _on_event(self, event):
# Print the signalled event text, then call _print_processes_linux() (the
# same call that follows the 'Still working...' message above).
self._print(event)
self._print_processes_linux()
class DotsProgressIndicator(SimpleProgressIndicator):
def __init__(self):

View File

@ -14,15 +14,15 @@ class TimeoutProc(base.TestProcObserver):
self._start = time.time()
def _on_next_test(self, test):
self._on_event()
self.__on_event()
def _on_result_for(self, test, result):
self._on_event()
self.__on_event()
def _on_heartbeat(self):
self._on_event()
self.__on_event()
def _on_event(self):
def __on_event(self):
if not self.is_stopped:
if time.time() - self._start > self._duration_sec:
print('>>> Total timeout reached.')