Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Unified Diff: gm/rebaseline_server/imagediffdb.py

Issue 848073005: Revert "delete old things!" (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gm/rebaseline_server/download_actuals_test.py ('k') | gm/rebaseline_server/imagediffdb_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gm/rebaseline_server/imagediffdb.py
diff --git a/gm/rebaseline_server/imagediffdb.py b/gm/rebaseline_server/imagediffdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc75cfca4d9c9dce9a47759dddb8ec99aa6383a
--- /dev/null
+++ b/gm/rebaseline_server/imagediffdb.py
@@ -0,0 +1,477 @@
+#!/usr/bin/python
+
+"""
+Copyright 2013 Google Inc.
+
+Use of this source code is governed by a BSD-style license that can be
+found in the LICENSE file.
+
+Calculate differences between image pairs, and store them in a database.
+"""
+
+# System-level imports
+import contextlib
+import errno
+import json
+import logging
+import os
+import Queue
+import re
+import shutil
+import tempfile
+import threading
+import time
+import urllib
+
+# Must fix up PYTHONPATH before importing from within Skia
+import rs_fixpypath # pylint: disable=W0611
+
+# Imports from within Skia
+import find_run_binary
+from py.utils import gs_utils
+
+
+SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
+
+DEFAULT_IMAGE_SUFFIX = '.png'
+DEFAULT_IMAGES_SUBDIR = 'images'
+# TODO(epoger): Figure out a better default number of threads; for now,
+# using a conservative default value.
+DEFAULT_NUM_WORKER_THREADS = 1
+
+DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')
+
+RGBDIFFS_SUBDIR = 'diffs'
+WHITEDIFFS_SUBDIR = 'whitediffs'
+
+# Keys used within DiffRecord dictionary representations.
+# NOTE: Keep these in sync with static/constants.js
+KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
+KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
+KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
+KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
+KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
+KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'
+
+# Special values within ImageDiffDB._diff_dict
+_DIFFRECORD_FAILED = 'failed'
+_DIFFRECORD_PENDING = 'pending'
+
+# How often to report tasks_queue size
+QUEUE_LOGGING_GRANULARITY = 1000
+
+# Temporary variable to keep track of how many times we download
+# the same file in multiple threads.
+# TODO(epoger): Delete this, once we see that the number stays close to 0.
+global_file_collisions = 0
+
+
class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      Exception: re-raised from _download_file() if either image cannot be
          downloaded, or from find_run_binary.run_command() if skpdiff fails.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs: identical URLs, or
    # one of the images is missing (locator/url is None or '').
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      # BUGFIX: this previously created skpdiff_rgbdiff_dir a second time,
      # so the whitediffs output directory was never created.
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      # (open() is itself a context manager; contextlib.closing is not needed.)
      with open(skpdiff_summary_file) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Store just the basenames; callers resolve them against the
      # rgbdiff/whitediff storage directories.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      # Always clean up the temporary skpdiff output dir, even on failure.
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
+
+
+
class ImageDiffDB(object):
  """ Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within the DB will store all of its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads that download images and
          generate diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # Mechanism for reporting queue size periodically.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Dictionary of DiffRecords, keyed by (expected_image_locator,
    # actual_image_locator) tuples.
    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time. We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Set up the queue for asynchronously loading DiffRecords, and start the
    # worker threads reading from it.
    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
    # calls can return as soon as possible.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for i in range(num_worker_threads):
      worker = threading.Thread(target=self.worker, args=(i,))
      # Daemon threads will not keep the process alive at interpreter exit.
      worker.daemon = True
      worker.start()
      self._workers.append(worker)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last call.

    Reports the current queue size, using log.info(), unless the queue is the
    same size as the last time we reported it.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # The 'with' statement releases the lock even on the early returns below
    # (equivalent to the acquire/try/finally pattern, but less error-prone).
    with self._queue_size_report_lock:
      size = self._tasks_queue.qsize()
      if size == self._last_queue_size_reported:
        return
      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
        return
      logging.info('tasks_queue size is %d' % size)
      self._last_queue_size_reported = size

  def worker(self, worker_num):
    """Launch a worker thread that pulls tasks off self._tasks_queue.

    Loops forever: blocks until a task is available, builds the DiffRecord
    for it (or _DIFFRECORD_FAILED on any exception), and publishes the
    result into self._diff_dict under the write lock.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      params = self._tasks_queue.get()
      key, expected_image_url, actual_image_url = params
      try:
        diff_record = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        # Record the failure rather than killing the worker thread;
        # get_diff_record() will report it to the caller.
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        diff_record = _DIFFRECORD_FAILED
      with self._diff_dict_writelock:
        self._diff_dict[key] = diff_record

  @property
  def storage_root(self):
    """Root directory on local disk under which all images are stored."""
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If we already have a DiffRecord for this particular image pair, no work
    will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)
    key = (expected_image_locator, actual_image_locator)
    must_add_to_queue = False

    # Mark the pair as pending under the lock, but enqueue the task outside
    # the lock so we never block other writers on queue operations.
    with self._diff_dict_writelock:
      if key not in self._diff_dict:
        # If we have already requested a diff between these two images,
        # we don't need to request it again.
        must_add_to_queue = True
        self._diff_dict[key] = _DIFFRECORD_PENDING

    if must_add_to_queue:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.

    Raises:
      KeyError: if add_image_pair() was never called for this pair.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    diff_record = self._diff_dict[key]

    # If we have no results yet, block until we do.
    # (Simple polling; a worker thread will replace the PENDING sentinel.)
    while diff_record == _DIFFRECORD_PENDING:
      time.sleep(1)
      diff_record = self._diff_dict[key]

    # Once we have the result...
    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    else:
      return diff_record
+
+
+# Utility functions
+
def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet
  """
  global global_file_collisions
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))

    # First download the file contents into a unique filename, and
    # then rename that file. That way, if multiple threads are downloading
    # the same filename at the same time, they won't interfere with each
    # other (they will both download the file, and one will "win" in the end)
    temp_filename = '%s-%d' % (local_filepath,
                               threading.current_thread().ident)
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
      # BUGFIX: remove our redundant copy, instead of leaking one temp file
      # per collision on disk.
      os.remove(temp_filename)
    else:
      os.rename(temp_filename, local_filepath)
+
+
+def _mkdir_unless_exists(path):
+ """Unless path refers to an already-existing directory, create it.
+
+ Args:
+ path: path on local disk
+ """
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno == errno.EEXIST:
+ pass
+
+
def _sanitize_locator(locator):
  """Return a filesystem-safe version of a locator string.

  Any character that could have special meaning in a filename (everything
  outside [A-Za-z0-9_-]) is replaced with an underscore.

  Args:
    locator: string, or something that can be represented as a string.
        If None or '', it is returned without modification, because empty
        locators have a particular meaning ("there is no image for this")
  """
  if not locator:
    # Pass empty/None locators through untouched; they are meaningful.
    return locator
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
« no previous file with comments | « gm/rebaseline_server/download_actuals_test.py ('k') | gm/rebaseline_server/imagediffdb_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698