v8/tools/locs.py

#!/usr/bin/env python3
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" locs.py - Count lines of code before and after preprocessor expansion
  Consult --help for more information.
"""

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path

ARGPARSE = argparse.ArgumentParser(
    description=("A script that computes LoC for a build dir or from a"
                 "compile_commands.json file"),
    epilog="""Examples:
 Count with default settings for build in out/Default:
   locs.py --build-dir out/Default
 Count with default settings according to given compile_commands file:
   locs.py --compile-commands compile_commands.json
 Count only a custom group of files settings for build in out/Default:
   tools/locs.py --build-dir out/Default
                 --group src-compiler '\.\./\.\./src/compiler'
                 --only src-compiler
 Report the 10 files with the worst expansion:
   tools/locs.py --build-dir out/Default --worst 10
 Report the 10 files with the worst expansion in src/compiler:
   tools/locs.py --build-dir out/Default --worst 10
                 --group src-compiler '\.\./\.\./src/compiler'
                 --only src-compiler
 Report the 10 largest files after preprocessing:
   tools/locs.py --build-dir out/Default --largest 10
 Report the 10 smallest input files:
   tools/locs.py --build-dir out/Default --smallest 10""",
    formatter_class=argparse.RawTextHelpFormatter
)

ARGPARSE.add_argument(
    '--json',
    action='store_true',
    default=False,
    help="output json instead of short summary")
ARGPARSE.add_argument(
    '--build-dir',
    type=str,
    default="",
    help="Use specified build dir and generate necessary files")
ARGPARSE.add_argument(
    '--echocmd',
    action='store_true',
    default=False,
    help="output command used to compute LoC")
ARGPARSE.add_argument(
    '--compile-commands',
    type=str,
    default='compile_commands.json',
    help="Use specified compile_commands.json file")
ARGPARSE.add_argument(
    '--only',
    action='append',
    default=[],
    help="Restrict counting to report group (can be passed multiple times)")
ARGPARSE.add_argument(
    '--not',
    action='append',
    default=[],
    help="Exclude specific group (can be passed multiple times)")
ARGPARSE.add_argument(
    '--list-groups',
    action='store_true',
    default=False,
    help="List groups and associated regular expressions")
ARGPARSE.add_argument(
    '--group',
    nargs=2,
    action='append',
    default=[],
    help="Add a report group (can be passed multiple times)")
ARGPARSE.add_argument(
    '--largest',
    type=int,
    nargs='?',
    default=0,
    const=3,
    help="Output the n largest files after preprocessing")
ARGPARSE.add_argument(
    '--worst',
    type=int,
    nargs='?',
    default=0,
    const=3,
    help="Output the n files with worst expansion by preprocessing")
ARGPARSE.add_argument(
    '--smallest',
    type=int,
    nargs='?',
    default=0,
    const=3,
    help="Output the n smallest input files")
ARGPARSE.add_argument(
    '--files',
    type=int,
    nargs='?',
    default=0,
    const=3,
    help="Output results for each file separately")

ARGS = vars(ARGPARSE.parse_args())


def MaxWidth(strings):
  max_width = 0
  for s in strings:
    max_width = max(max_width, len(s))
  return max_width


def GenerateCompileCommandsAndBuild(build_dir, compile_commands_file, out):
  if not os.path.isdir(build_dir):
    print("Error: Specified build dir {} is not a directory.".format(
        build_dir), file=sys.stderr)
    exit(1)
  compile_commands_file = "{}/compile_commands.json".format(build_dir)

  print("Generating compile commands in {}.".format(
      compile_commands_file), file=out)

  ninja = "ninja -C {} -t compdb cxx cc > {}".format(
      build_dir, compile_commands_file)
  if subprocess.call(ninja, shell=True, stdout=out) != 0:
    print("Error: Cound not generate {} for {}.".format(
      compile_commands_file, build_dir), file=sys.stderr)
    exit(1)

  autoninja = "autoninja -C {} v8_generated_cc_files".format(build_dir)
  if subprocess.call(autoninja, shell=True, stdout=out) != 0:
    print("Error: Building target 'v8_generated_cc_files'"
      " failed for {}.".format(build_dir), file=sys.stderr)
    exit(1)

  return compile_commands_file

def fmt_bytes(bytes):
  if bytes > 1024*1024*1024:
    return int(bytes / (1024*1024)), "MB"
  elif bytes > 1024*1024:
    return int(bytes / (1024)), "kB"
  return int(bytes), " B"


class CompilationData:
  def __init__(self, loc, in_bytes, expanded, expanded_bytes):
    self.loc = loc
    self.in_bytes = in_bytes
    self.expanded = expanded
    self.expanded_bytes = expanded_bytes

  def ratio(self):
    return self.expanded / (self.loc+1)

  def to_string(self):
    exp_bytes, exp_unit = fmt_bytes(self.expanded_bytes)
    in_bytes, in_unit = fmt_bytes(self.in_bytes)
    return "{:>9,} LoC ({:>7,} {}) to {:>12,} LoC ({:>7,} {}) ({:>5.0f}x)".format(
        self.loc, in_bytes, in_unit, self.expanded, exp_bytes, exp_unit, self.ratio())

class File(CompilationData):
  def __init__(self, file, loc, in_bytes, expanded, expanded_bytes):
    super().__init__(loc, in_bytes, expanded, expanded_bytes)
    self.file = file

  def to_string(self):
    return "{} {}".format(super().to_string(), self.file)


class Group(CompilationData):
  def __init__(self, name, regexp_string):
    super().__init__(0, 0, 0, 0)
    self.name = name
    self.count = 0
    self.regexp = re.compile(regexp_string)

  def account(self, unit):
    if (self.regexp.match(unit.file)):
      self.loc += unit.loc
      self.in_bytes += unit.in_bytes
      self.expanded += unit.expanded
      self.expanded_bytes += unit.expanded_bytes
      self.count += 1

  def to_string(self, name_width):
    return "{:<{}} ({:>5} files): {}".format(
        self.name, name_width, self.count, super().to_string())


def SetupReportGroups():
  default_report_groups = {"total": '.*',
                           "src": '\\.\\./\\.\\./src',
                           "test": '\\.\\./\\.\\./test',
                           "third_party": '\\.\\./\\.\\./third_party',
                           "gen": 'gen'}

  report_groups = {**default_report_groups, **dict(ARGS['group'])}

  if ARGS['only']:
    for only_arg in ARGS['only']:
      if not only_arg in report_groups.keys():
        print("Error: specified report group '{}' is not defined.".format(
            ARGS['only']))
        exit(1)
      else:
        report_groups = {
            k: v for (k, v) in report_groups.items() if k in ARGS['only']}

  if ARGS['not']:
    report_groups = {
        k: v for (k, v) in report_groups.items() if k not in ARGS['not']}

  if ARGS['list_groups']:
    print_cat_max_width = MaxWidth(list(report_groups.keys()) + ["Category"])
    print("  {:<{}}  {}".format("Category",
                                print_cat_max_width, "Regular expression"))
    for cat, regexp_string in report_groups.items():
      print("  {:<{}}: {}".format(
          cat, print_cat_max_width, regexp_string))

  report_groups = {k: Group(k, v) for (k, v) in report_groups.items()}

  return report_groups


class Results:
  def __init__(self):
    self.groups = SetupReportGroups()
    self.units = {}

  def track(self, filename):
    is_tracked = False
    for group in self.groups.values():
      if group.regexp.match(filename):
        is_tracked = True
    return is_tracked

  def recordFile(self, filename, loc, in_bytes, expanded, expanded_bytes):
    unit = File(filename, loc, in_bytes, expanded, expanded_bytes)
    self.units[filename] = unit
    for group in self.groups.values():
      group.account(unit)

  def maxGroupWidth(self):
    return MaxWidth([v.name for v in self.groups.values()])

  def printGroupResults(self, file):
    for key in sorted(self.groups.keys()):
      print(self.groups[key].to_string(self.maxGroupWidth()), file=file)

  def printSorted(self, key, count, reverse, out):
    for unit in sorted(list(self.units.values()), key=key, reverse=reverse)[:count]:
      print(unit.to_string(), file=out)


class LocsEncoder(json.JSONEncoder):
  def default(self, o):
    if isinstance(o, File):
      return {"file": o.file, "loc": o.loc, "in_bytes": o.in_bytes,
              "expanded": o.expanded, "expanded_bytes": o.expanded_bytes}
    if isinstance(o, Group):
      return {"name": o.name, "loc": o.loc, "in_bytes": o.in_bytes,
              "expanded": o.expanded, "expanded_bytes": o.expanded_bytes}
    if isinstance(o, Results):
      return {"groups": o.groups, "units": o.units}
    return json.JSONEncoder.default(self, o)


class StatusLine:
  def __init__(self):
    self.max_width = 0

  def print(self, statusline, end="\r", file=sys.stdout):
    self.max_width = max(self.max_width, len(statusline))
    print("{0:<{1}}".format(statusline, self.max_width), end=end, file=file, flush=True)


class CommandSplitter:
  def __init__(self):
    self.cmd_pattern = re.compile(
        "([^\\s]*\\s+)?(?P<clangcmd>[^\\s]*clang.*)"
        " -c (?P<infile>.*) -o (?P<outfile>.*)")

  def process(self, compilation_unit, temp_file_name):
    cmd = self.cmd_pattern.match(compilation_unit['command'])
    outfilename = cmd.group('outfile') + ".cc"
    infilename = cmd.group('infile')
    infile = Path(compilation_unit['directory']).joinpath(infilename)
    outfile = Path(str(temp_file_name)).joinpath(outfilename)
    return [cmd.group('clangcmd'), infilename, infile, outfile]


def Main():
  compile_commands_file = ARGS['compile_commands']
  out = sys.stdout
  if ARGS['json']:
    out = sys.stderr

  if ARGS['build_dir']:
    compile_commands_file = GenerateCompileCommandsAndBuild(
        ARGS['build_dir'], compile_commands_file, out)

  try:
    with open(compile_commands_file) as file:
      data = json.load(file)
  except FileNotFoundError:
    print("Error: Cannot read '{}'. Consult --help to get started.")
    exit(1)

  result = Results()
  status = StatusLine()

  with tempfile.TemporaryDirectory(dir='/tmp/', prefix="locs.") as temp:
    processes = []
    start = time.time()
    cmd_splitter = CommandSplitter()

    for i, key in enumerate(data):
      if not result.track(key['file']):
        continue
      status.print("[{}/{}] Counting LoCs of {}".format(i, len(data), key['file']),
        file=out)
      clangcmd, infilename, infile, outfile = cmd_splitter.process(key, temp)
      outfile.parent.mkdir(parents=True, exist_ok=True)
      if infile.is_file():
        clangcmd = clangcmd + " -E -P " + \
            str(infile) + " -o /dev/stdout | sed '/^\\s*$/d' | wc -lc"
        loccmd = ("cat {}  | sed '\\;^\\s*//;d' | sed '\\;^/\\*;d'"
                  " | sed '/^\\*/d' | sed '/^\\s*$/d' | wc -lc").format(
            infile)
        runcmd = " {} ; {}".format(clangcmd, loccmd)
        if ARGS['echocmd']:
          print(runcmd)
        p = subprocess.Popen(
            runcmd, shell=True, cwd=key['directory'], stdout=subprocess.PIPE)
        processes.append({'process': p, 'infile': infilename})

    for i, p in enumerate(processes):
      status.print("[{}/{}] Summing up {}".format(
          i, len(processes), p['infile']), file=out)
      output, err = p['process'].communicate()
      expanded, expanded_bytes, loc, in_bytes = list(map(int, output.split()))
      result.recordFile(p['infile'], loc, in_bytes, expanded, expanded_bytes)

    end = time.time()
    if ARGS['json']:
      print(json.dumps(result, ensure_ascii=False, cls=LocsEncoder))
    status.print("Processed {:,} files in {:,.2f} sec.".format(
        len(processes), end-start), end="\n", file=out)
    result.printGroupResults(file=out)

    if ARGS['largest']:
      print("Largest {} files after expansion:".format(ARGS['largest']))
      result.printSorted(
          lambda v: v.expanded, ARGS['largest'], reverse=True, out=out)

    if ARGS['worst']:
      print("Worst expansion ({} files):".format(ARGS['worst']))
      result.printSorted(
          lambda v: v.ratio(), ARGS['worst'], reverse=True, out=out)

    if ARGS['smallest']:
      print("Smallest {} input files:".format(ARGS['smallest']))
      result.printSorted(
          lambda v: v.loc, ARGS['smallest'], reverse=False, out=out)

    if ARGS['files']:
      print("List of input files:")
      result.printSorted(
          lambda v: v.file, ARGS['files'], reverse=False, out=out)

  return 0


if __name__ == '__main__':
  sys.exit(Main())