[test] Add logging to test runner

This adds Python logging to several code locations that previously
used to just print. The locations aren't yet complete. The changed
code locations should help for investigating hanging test runs.

The default level is WARNING for running tests locally, and INFO
when called from bots that pass the --swarming parameter.

Bug: v8:13113
Change-Id: If3a336703e7c346a5c718f2359b1a80e37e1ca6d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3876183
Commit-Queue: Michael Achenbach <machenbach@chromium.org>
Reviewed-by: Alexander Schulze <alexschulze@chromium.org>
Cr-Commit-Position: refs/heads/main@{#83006}
This commit is contained in:
Michael Achenbach 2022-09-06 17:19:54 +02:00 committed by V8 LUCI CQ
parent 519c430b4c
commit b37760d2c9
7 changed files with 50 additions and 33 deletions

View File

@ -7,6 +7,7 @@ from functools import reduce
from os.path import dirname as up
import json
import logging
import multiprocessing
import optparse
import os
@ -116,6 +117,14 @@ TRY_RELEASE_MODE = ModeConfig(
status_mode="debug",
)
# Set up logging. No need to log a date in timestamps as we can get that from
# test run start times.
logging.basicConfig(
format='%(asctime)s %(message)s',
datefmt='%H:%M:%S',
level=logging.WARNING,
)
class TestRunnerError(Exception):
pass
@ -142,6 +151,8 @@ class BaseTestRunner(object):
self.options, args = self._parse_args(parser, sys_args)
self.infra_staging = self.options.infra_staging
if self.options.swarming:
logging.getLogger().setLevel(logging.INFO)
# Swarming doesn't print how isolated commands are called. Lets make
# this less cryptic by printing it ourselves.
print(' '.join(sys.argv))

View File

@ -3,6 +3,7 @@
# found in the LICENSE file.
from contextlib import contextmanager
import logging
import os
import re
import signal
@ -145,13 +146,10 @@ class BaseCommand(object):
started_as = self.to_string(relative=True)
process_text = 'process %d started as:\n %s\n' % (process.pid, started_as)
try:
print('Attempting to kill ' + process_text)
sys.stdout.flush()
logging.warning('Attempting to kill %s', process_text)
self._kill_process(process)
except OSError as e:
print(e)
print('Unruly ' + process_text)
sys.stdout.flush()
except OSError:
logging.exception('Unruly %s', process_text)
def __str__(self):
return self.to_string()
@ -216,11 +214,10 @@ def taskkill_windows(process, verbose=False, force=True):
)
stdout, stderr = tk.communicate()
if verbose:
print('Taskkill results for %d' % process.pid)
print(stdout)
print(stderr)
print('Return code: %d' % tk.returncode)
sys.stdout.flush()
logging.info('Taskkill results for %d', process.pid)
logging.info(stdout.decode('utf-8', errors='ignore'))
logging.info(stderr.decode('utf-8', errors='ignore'))
logging.info('Return code: %d', tk.returncode)
class WindowsCommand(BaseCommand):

View File

@ -4,6 +4,7 @@
# found in the LICENSE file.
import collections
import logging
import os
import signal
import traceback
@ -76,8 +77,7 @@ def Worker(fn, work_queue, done_queue,
# SIGINT, SIGTERM or internal hard timeout.
break
except Exception as e:
traceback.print_exc()
print(">>> EXCEPTION: %s" % e)
logging.exception('Unhandled error during worker execution.')
done_queue.put(ExceptionResult(e))
# When we reach here on normal tear down, all items have been pulled from
# the done_queue before and this should have no effect. On fast abort, it's
@ -245,9 +245,8 @@ class DefaultExecutionPool(ContextPool):
self.advance(gen)
except KeyboardInterrupt:
assert False, 'Unreachable'
except Exception as e:
traceback.print_exc()
print(">>> EXCEPTION: %s" % e)
except Exception:
logging.exception('Unhandled error during pool execution.')
finally:
self._terminate()
@ -321,12 +320,17 @@ class DefaultExecutionPool(ContextPool):
self.notify("Draining queues")
try:
while True: self.work_queue.get(False)
except:
except Empty:
pass
except:
logging.exception('Error draining work queue.')
try:
while True: self.done_queue.get(False)
except:
except Empty:
pass
except:
logging.exception('Error draining done queue.')
self.notify("Pool terminated")
def _get_result_from_queue(self):
"""Attempts to get the next result from the queue.

View File

@ -4,6 +4,7 @@
import datetime
import json
import logging
import platform
import sys
import time
@ -140,13 +141,13 @@ class VerboseProgressIndicator(SimpleProgressIndicator):
# TODO(machenbach): Remove this platform specific hack and implement a proper
# feedback channel from the workers, providing which tests are currently run.
def _print_processes(self):
def _log_processes(self):
procs = self.context.list_processes()
if procs:
self._print('List of processes:')
logging.info('List of processes:')
for pid, cmd in self.context.list_processes():
# Show command with pid, but other process info cut off.
self._print('pid: %d cmd: %s' % (pid, cmd))
logging.info('pid: %d cmd: %s', pid, cmd)
def _ensure_delay(self, delay):
return time.time() - self._last_printed_time > delay
@ -156,11 +157,11 @@ class VerboseProgressIndicator(SimpleProgressIndicator):
# Print something every 30 seconds to not get killed by an output
# timeout.
self._print('Still working...')
self._print_processes()
self._log_processes()
def on_event(self, event):
self._print(event)
self._print_processes()
logging.info(event)
self._log_processes()
class CIProgressIndicator(VerboseProgressIndicator):

View File

@ -2,6 +2,7 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import logging
import signal
from . import base
@ -16,12 +17,12 @@ class SignalProc(base.TestProcObserver):
signal.signal(signal.SIGTERM, self._on_sigterm)
def _on_ctrlc(self, _signum, _stack_frame):
print('>>> Ctrl-C detected, early abort...')
logging.warning('Ctrl-C detected, early abort...')
self.exit_code = utils.EXIT_CODE_INTERRUPTED
self.stop()
def _on_sigterm(self, _signum, _stack_frame):
print('>>> SIGTERM received, early abort...')
logging.warning('SIGTERM received, early abort...')
self.exit_code = utils.EXIT_CODE_TERMINATED
self.stop()

View File

@ -2,6 +2,7 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import logging
import time
from . import base
@ -31,5 +32,5 @@ class TimeoutProc(base.TestProcObserver):
def __on_event(self):
if not self.is_stopped:
if time.time() - self._start > self._duration_sec:
print('>>> Total timeout reached.')
logging.info('Total timeout reached.')
self.stop()

View File

@ -4,6 +4,7 @@
# found in the LICENSE file.
import heapq
import logging
import os
import platform
import signal
@ -29,10 +30,11 @@ def list_processes_linux():
]
# Filter strange process with name as out dir.
return [p for p in processes if p[1] != OUT_DIR]
except Exception as e:
# TODO(https://crbug.com/v8/13101): Remove after investigation.
print('Fetching process list failed.')
print(e)
except subprocess.CalledProcessError as e:
# Return code 1 means no processes found.
if e.returncode != 1:
# TODO(https://crbug.com/v8/13101): Remove after investigation.
logging.exception('Fetching process list failed.')
return []
@ -45,10 +47,10 @@ def kill_processes_linux():
return
for pid, cmd in list_processes_linux():
try:
print('Attempting to kill %d - %s' % (pid, cmd))
logging.warning('Attempting to kill %d - %s', pid, cmd)
os.kill(pid, signal.SIGKILL)
except:
pass
logging.exception('Failed to kill process')
class FixedSizeTopList():