skia2/gm/rebaseline_server/imagediffdb.py

#!/usr/bin/python

"""
Copyright 2013 Google Inc.

Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file.

Calulate differences between image pairs, and store them in a database.
"""

# System-level imports
import contextlib
import json
import logging
import os
import re
import shutil
import tempfile
import urllib

# Must fix up PYTHONPATH before importing from within Skia
import fix_pythonpath  # pylint: disable=W0611

# Imports from within Skia
import find_run_binary

SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'

DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')

RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'


class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    TODO(epoger): Make this asynchronously download images, rather than blocking
    until the images have been downloaded and processed.

    Args:
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    # TODO(rmistry): Add a parameter that just tries to use already-present
    # image files rather than downloading them.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    try:
      _download_file(expected_image_file, expected_image_url)
    except Exception:
      logging.exception('unable to download expected_image_url %s to file %s' %
                        (expected_image_url, expected_image_file))
      raise
    try:
      _download_file(actual_image_file, actual_image_url)
    except Exception:
      logging.exception('unable to download actual_image_url %s to file %s' %
                        (actual_image_url, actual_image_file))
      raise

    # Get all diff images and values from skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(skpdiff_output_dir, 'rgbDiff')
      skpdiff_whitediff_dir = os.path.join(skpdiff_output_dir, 'whiteDiff')
      expected_img = os.path.join(storage_root, expected_images_subdir,
                                  str(expected_image_locator) + image_suffix)
      actual_img = os.path.join(storage_root, actual_images_subdir,
                                str(actual_image_locator) + image_suffix)

      # TODO: Call skpdiff ONCE for all image pairs, instead of calling it
      # repeatedly.  This will allow us to parallelize a lot more work.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_img, actual_img,
           '--jsonp', 'false',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      with contextlib.closing(open(skpdiff_summary_file)) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      rgb_diff_path = record['rgbDiffPath']
      white_diff_path = record['whiteDiffPath']
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)

      # Store the rgbdiff and whitediff images generated above.
      diff_image_locator = _get_difference_locator(
          expected_image_locator=expected_image_locator,
          actual_image_locator=actual_image_locator)
      basename = str(diff_image_locator) + image_suffix
      _mkdir_unless_exists(os.path.join(storage_root, RGBDIFFS_SUBDIR))
      _mkdir_unless_exists(os.path.join(storage_root, WHITEDIFFS_SUBDIR))
      # TODO: Modify skpdiff's behavior so we can tell it exactly where to
      # write the image files into, rather than having to move them around
      # after skpdiff writes them out.
      shutil.copyfile(rgb_diff_path,
                      os.path.join(storage_root, RGBDIFFS_SUBDIR, basename))
      shutil.copyfile(white_diff_path,
                      os.path.join(storage_root, WHITEDIFFS_SUBDIR, basename))

    finally:
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
    }


class ImageDiffDB(object):
  """ Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root):
    """
    Args:
      storage_root: string; root path within the DB will store all of its stuff
    """
    self._storage_root = storage_root

    # Dictionary of DiffRecords, keyed by (expected_image_locator,
    # actual_image_locator) tuples.
    self._diff_dict = {}

  @property
  def storage_root(self):
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    TODO(epoger): Make this asynchronously download images, rather than blocking
    until the images have been downloaded and processed.
    When we do that, we should probably add a new method that will block
    until all of the images have been downloaded and processed.  Otherwise,
    we won't know when it's safe to start calling get_diff_record().
    jcgregorio notes: maybe just make ImageDiffDB thread-safe and create a
    thread-pool/worker queue at a higher level that just uses ImageDiffDB?

    Args:
      expected_image_url: file or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)
    key = (expected_image_locator, actual_image_locator)
    if not key in self._diff_dict:
      try:
        new_diff_record = DiffRecord(
            self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=expected_image_locator,
            actual_image_url=actual_image_url,
            actual_image_locator=actual_image_locator)
      except Exception:
        # If we can't create a real DiffRecord for this (expected, actual) pair,
        # store None and the UI will show whatever information we DO have.
        # Fixes http://skbug.com/2368 .
        logging.exception(
            'got exception while creating a DiffRecord for '
            'expected_image_url=%s , actual_image_url=%s; returning None' % (
                expected_image_url, actual_image_url))
        new_diff_record = None
      self._diff_dict[key] = new_diff_record

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    Raises a KeyError if we don't have a DiffRecord for this image pair.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    return self._diff_dict[key]


# Utility functions

def _download_file(local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  Args:
    local_filepath: path on local disk where the image should be stored
    url: URL from which we can download the image if we don't have it yet
  """
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))
    with contextlib.closing(urllib.urlopen(url)) as url_handle:
      with open(local_filepath, 'wb') as file_handle:
        shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)


def _mkdir_unless_exists(path):
  """Unless path refers to an already-existing directory, create it.

  Args:
    path: path on local disk
  """
  if not os.path.isdir(path):
    os.makedirs(path)


def _sanitize_locator(locator):
  """Returns a sanitized version of a locator (one in which we know none of the
  characters will have special meaning in filenames).

  Args:
    locator: string, or something that can be represented as a string
  """
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))


def _get_difference_locator(expected_image_locator, actual_image_locator):
  """Returns the locator string used to look up the diffs between expected_image
  and actual_image.

  We must keep this function in sync with getImageDiffRelativeUrl() in
  static/loader.js

  Args:
    expected_image_locator: locator string pointing at expected image
    actual_image_locator: locator string pointing at actual image

  Returns: already-sanitized locator where the diffs between expected and
      actual images can be found
  """
  return "%s-vs-%s" % (_sanitize_locator(expected_image_locator),
                       _sanitize_locator(actual_image_locator))