#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into human readable json-format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Split json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<line number>, <bit_mask>], [<line number>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. Their index+1 is
the bit, e.g. executable1 = 1, executable3 = 4, etc. Hence, a line covered by
executable1 and executable3 will have bit_mask == 5 == 0b101. The number of
tests is restricted to 52 in version 1, to allow javascript JSON parsing of
the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.

The line-number-bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but only contains one file per
json file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom deps:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

# for py2/py3 compatibility
from __future__ import print_function
from functools import reduce

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early adds more speed.
# The contained cc files are already excluded from instrumentation, but inlined
# data is referenced through v8's object files.
EXCLUSIONS = [
    'buildtools',
    'src/third_party',
    'third_party',
    'test',
    'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_EXCLUSIONS = [
    'generate-bytecode-expectations',
    'hello-world',
    'mksnapshot',
    'parser-shell',
    'process',
    'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1. Both dots of the suffix are escaped so that only
# a literal ".result.sancov" ending matches.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')


def executables(build_dir):
    """Iterate over executable files in the build directory.

    Args:
      build_dir: Directory whose direct entries are inspected.

    Yields:
      build_dir joined with the name of each regular file that is executable
      by the current user and whose name is not listed in EXE_EXCLUSIONS.
    """
    for name in os.listdir(build_dir):
        if name in EXE_EXCLUSIONS:
            continue
        candidate = os.path.join(build_dir, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            yield candidate


def process_symbolizer_output(output, build_dir):
    """Post-process llvm symbolizer output.

    Excludes files outside the v8 checkout or given in exclusion list above
    from further processing. Drops the character index in each line.

    Args:
      output: Raw symbolizer output, either text or (as returned by
          subprocess pipes on Python 3) bytes.
      build_dir: Build directory of the symbolized executable; the symbolizer
          paths are expected to start with '<build_dir>/../../'.

    Returns: A mapping of file names to lists of line numbers. The file names
             have relative paths to the v8 base directory. The lists of line
             numbers don't contain duplicate lines and are sorted.
    """
    # Pipes deliver bytes on Python 3; all string handling below needs text.
    # On Python 2 bytes is str, so nothing is converted there.
    if isinstance(output, bytes) and not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    # Path prefix added by the llvm symbolizer including trailing slash.
    output_path_prefix = os.path.join(build_dir, '..', '..', '')
    prefix_length = len(output_path_prefix)

    # Map file names to sets of instrumented line numbers.
    file_map = {}
    for line in output.strip().splitlines():
        # Drop files outside the prefix path, e.g. generated files in the
        # build dir and absolute paths to c++ library headers.
        if not line.startswith(output_path_prefix):
            continue
        # The path prefix is redundant and takes too much space. Each
        # remaining line has the form:
        # <file name>:<line number>:<character number>.
        # Splitting from the right keeps file names that contain a colon
        # intact. The character number is dropped, only lines matter.
        file_name, number, _ = line[prefix_length:].rsplit(':', 2)
        file_map.setdefault(file_name, set()).add(int(number))

    # Remove exclusion patterns from file map. It's cheaper to do it after the
    # mapping, as there are few excluded files and we don't want to do this
    # check for numerous lines in ordinary files.
    excluded_prefixes = tuple(EXCLUSIONS)

    # Return in serializable form and filter.
    return {
        file_name: sorted(lines)
        for file_name, lines in file_map.items()
        if not file_name.startswith(excluded_prefixes)
    }


def _run_shell_pipeline(command):
    """Run a shell command line from BASE_DIR and return its captured stdout.

    Raises:
      AssertionError: If the shell reports a non-zero exit code. The message
          contains the command and whatever was written to stderr.
    """
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE,
        cwd=BASE_DIR,
        shell=True,
    )
    output, errors = process.communicate()
    assert process.returncode == 0, (
        'Command exited with code %s: %s\n%s' %
        (process.returncode, command, errors))
    return output


def get_instrumented_lines(executable):
    """Return the instrumented lines of an executable.

    Called through multiprocessing pool.

    Args:
      executable: Path to the executable to inspect.

    Returns: Post-processed llvm output as returned by
             process_symbolizer_output.
    """
    # The first two pipes are from llvm's tool sancov.py with 0x added to the
    # hex numbers. The results are piped into the llvm symbolizer, which
    # outputs for each PC:
    # <file name with abs path>:<line number>:<character number>.
    # We don't call the sancov tool to get more speed.
    # Raw strings keep the grep backslashes literal without relying on
    # Python's handling of unknown escape sequences.
    command = (
        r"objdump -d %s | "
        r"grep '^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ "
        r"<__sanitizer_cov\(_with_check\|\|_trace_pc_guard\)\(@plt\|\)>' | "
        r"grep '^\s\+[0-9a-f]\+' -o | "
        r"%s | "
        r"%s --obj %s -functions=none"
    ) % (executable, SANITIZE_PCS, SYMBOLIZER, executable)
    output = _run_shell_pipeline(command)
    return process_symbolizer_output(output, os.path.dirname(executable))


def merge_instrumented_line_results(exe_list, results):
    """Merge multiprocessing results for all instrumented lines.

    Args:
      exe_list: List of all executable names with absolute paths.
      results: List (or any iterable) of results as returned by
          get_instrumented_lines.

    Returns: Dict to be used as json data as specified on the top of this
             page. The dictionary contains all instrumented lines of all files
             referenced by all executables.
    """
    # Union of the instrumented lines per file over all executables.
    lines_by_file = {}
    for result in results:
        for file_name, lines in result.items():
            lines_by_file.setdefault(file_name, set()).update(lines)

    # Return data as file->lines mapping. The lines are saved as lists
    # with (line number, test bits (as int)). The test bits are initialized
    # with 0, meaning instrumented, but no coverage.
    # The order of the test bits is given with key 'tests'. For now, these are
    # the executable names. We use a _list_ with two items instead of a tuple
    # to ease merging by allowing mutation of the second item.
    # Real lists (not lazy map objects) are built so that the result can be
    # serialized with json on Python 3 as well.
    return {
        'version': 1,
        'tests': sorted(os.path.basename(exe) for exe in exe_list),
        'files': {
            file_name: [[line, 0] for line in sorted(lines)]
            for file_name, lines in lines_by_file.items()
        },
    }


def write_instrumented(options):
    """Implements the 'all' action of this tool.

    Args:
      options: Object providing `build_dir` (directory scanned for
          executables) and `json_output` (path of the json file to write).
    """
    exe_list = list(executables(options.build_dir))
    logging.info('Reading instrumented lines from %d executables.',
                 len(exe_list))
    pool = Pool(CPUS)
    try:
        # The returned iterator is consumed by the merge step below.
        results = pool.imap_unordered(get_instrumented_lines, exe_list)
    finally:
        pool.close()

    # Merge multiprocessing results and prepare output data.
    data = merge_instrumented_line_results(exe_list, results)

    logging.info('Read data from %d executables, which covers %d files.',
                 len(data['tests']), len(data['files']))
    logging.info('Writing results to %s', options.json_output)

    # Write json output.
    with open(options.json_output, 'w') as f:
        json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
    """Return the covered lines of an executable.

    Called through multiprocessing pool. The args are expected to unpack to:
      cov_dir: Folder with sancov files merged by sancov_merger.py.
      executable: Absolute path to the executable that was called to produce
                  the given coverage data.
      sancov_file: The merged sancov file with coverage data.

    Returns: A tuple of post-processed llvm output as returned by
             process_symbolizer_output and the executable name.
    """
    cov_dir, executable, sancov_file = args

    # Let the sancov tool print the covered PCs and pipe them through the llvm
    # symbolizer.
    command = (
        '%s print %s 2> /dev/null | '
        '%s --obj %s -functions=none'
    ) % (SANCOV_TOOL, os.path.join(cov_dir, sancov_file), SYMBOLIZER,
         executable)
    output = _run_shell_pipeline(command)
    return (
        process_symbolizer_output(output, os.path.dirname(executable)),
        os.path.basename(executable),
    )


# NOTE(review): The definition of merge_covered_line_results(data, results) is
# cut off in the middle of its third statement in the text this rewrite was
# made from, so it is neither rewritten nor guessed at here. The visible
# fragment is preserved verbatim below (commented out so that the module still
# parses); restore the complete function from the original file.
#
# def merge_covered_line_results(data, results):
#   """Merge multiprocessing results for covered lines.
#
#   The data is mutated, the results are merged into it in place.
#
#   Args:
#     data: Existing coverage data from json file containing all instrumented
#           lines.
#     results: List of results as returned by get_covered_lines.
#   """
#
#   # List of executables and mapping to the test bit mask. The number of
#   # tests is restricted to 52, to allow javascript JSON parsing of
#   # the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.
#   exe_list = data['tests']
#   assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
#   test_bit_masks = {exe:1<