Index: gm/rebaseline_server/imagediffdb.py |
diff --git a/gm/rebaseline_server/imagediffdb.py b/gm/rebaseline_server/imagediffdb.py |
deleted file mode 100644 |
index 0bc75cfca4d9c9dce9a47759dddb8ec99aa6383a..0000000000000000000000000000000000000000 |
--- a/gm/rebaseline_server/imagediffdb.py |
+++ /dev/null |
@@ -1,477 +0,0 @@ |
-#!/usr/bin/python |
- |
-""" |
-Copyright 2013 Google Inc. |
- |
-Use of this source code is governed by a BSD-style license that can be |
-found in the LICENSE file. |
- |
-Calulate differences between image pairs, and store them in a database. |
-""" |
- |
-# System-level imports |
-import contextlib |
-import errno |
-import json |
-import logging |
-import os |
-import Queue |
-import re |
-import shutil |
-import tempfile |
-import threading |
-import time |
-import urllib |
- |
-# Must fix up PYTHONPATH before importing from within Skia |
-import rs_fixpypath # pylint: disable=W0611 |
- |
-# Imports from within Skia |
-import find_run_binary |
-from py.utils import gs_utils |
- |
- |
-SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff') |
- |
-DEFAULT_IMAGE_SUFFIX = '.png' |
-DEFAULT_IMAGES_SUBDIR = 'images' |
-# TODO(epoger): Figure out a better default number of threads; for now, |
-# using a conservative default value. |
-DEFAULT_NUM_WORKER_THREADS = 1 |
- |
-DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]') |
- |
-RGBDIFFS_SUBDIR = 'diffs' |
-WHITEDIFFS_SUBDIR = 'whitediffs' |
- |
-# Keys used within DiffRecord dictionary representations. |
-# NOTE: Keep these in sync with static/constants.js |
-KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel' |
-KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels' |
-KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels' |
-KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference' |
-KEY__DIFFERENCES__DIFF_URL = 'diffUrl' |
-KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl' |
- |
-# Special values within ImageDiffDB._diff_dict |
-_DIFFRECORD_FAILED = 'failed' |
-_DIFFRECORD_PENDING = 'pending' |
- |
-# How often to report tasks_queue size |
-QUEUE_LOGGING_GRANULARITY = 1000 |
- |
-# Temporary variable to keep track of how many times we download |
-# the same file in multiple threads. |
-# TODO(epoger): Delete this, once we see that the number stays close to 0. |
-global_file_collisions = 0 |
- |
- |
-class DiffRecord(object): |
- """ Record of differences between two images. """ |
- |
- def __init__(self, gs, storage_root, |
- expected_image_url, expected_image_locator, |
- actual_image_url, actual_image_locator, |
- expected_images_subdir=DEFAULT_IMAGES_SUBDIR, |
- actual_images_subdir=DEFAULT_IMAGES_SUBDIR, |
- image_suffix=DEFAULT_IMAGE_SUFFIX): |
- """Download this pair of images (unless we already have them on local disk), |
- and prepare a DiffRecord for them. |
- |
- Args: |
- gs: instance of GSUtils object we can use to download images |
- storage_root: root directory on local disk within which we store all |
- images |
- expected_image_url: file, GS, or HTTP url from which we will download the |
- expected image |
- expected_image_locator: a unique ID string under which we will store the |
- expected image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- actual_image_url: file, GS, or HTTP url from which we will download the |
- actual image |
- actual_image_locator: a unique ID string under which we will store the |
- actual image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- expected_images_subdir: the subdirectory expected images are stored in. |
- actual_images_subdir: the subdirectory actual images are stored in. |
- image_suffix: the suffix of images. |
- """ |
- expected_image_locator = _sanitize_locator(expected_image_locator) |
- actual_image_locator = _sanitize_locator(actual_image_locator) |
- |
- # Download the expected/actual images, if we don't have them already. |
- expected_image_file = os.path.join( |
- storage_root, expected_images_subdir, |
- str(expected_image_locator) + image_suffix) |
- actual_image_file = os.path.join( |
- storage_root, actual_images_subdir, |
- str(actual_image_locator) + image_suffix) |
- for image_file, image_url in [ |
- (expected_image_file, expected_image_url), |
- (actual_image_file, actual_image_url)]: |
- if image_file and image_url: |
- try: |
- _download_file(gs, image_file, image_url) |
- except Exception: |
- logging.exception('unable to download image_url %s to file %s' % |
- (image_url, image_file)) |
- raise |
- |
- # Return early if we do not need to generate diffs. |
- if (expected_image_url == actual_image_url or |
- not expected_image_url or not actual_image_url): |
- return |
- |
- # Get all diff images and values using the skpdiff binary. |
- skpdiff_output_dir = tempfile.mkdtemp() |
- try: |
- skpdiff_summary_file = os.path.join(skpdiff_output_dir, |
- 'skpdiff-output.json') |
- skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR) |
- skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR) |
- _mkdir_unless_exists(skpdiff_rgbdiff_dir) |
- _mkdir_unless_exists(skpdiff_rgbdiff_dir) |
- |
- # TODO(epoger): Consider calling skpdiff ONCE for all image pairs, |
- # instead of calling it separately for each image pair. |
- # Pro: we'll incur less overhead from making repeated system calls, |
- # spinning up the skpdiff binary, etc. |
- # Con: we would have to wait until all image pairs were loaded before |
- # generating any of the diffs? |
- # Note(stephana): '--longnames' was added to allow for this |
- # case (multiple files at once) versus specifying output diffs |
- # directly. |
- find_run_binary.run_command( |
- [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file, |
- '--jsonp', 'false', |
- '--longnames', 'true', |
- '--output', skpdiff_summary_file, |
- '--differs', 'perceptual', 'different_pixels', |
- '--rgbDiffDir', skpdiff_rgbdiff_dir, |
- '--whiteDiffDir', skpdiff_whitediff_dir, |
- ]) |
- |
- # Get information out of the skpdiff_summary_file. |
- with contextlib.closing(open(skpdiff_summary_file)) as fp: |
- data = json.load(fp) |
- |
- # For now, we can assume there is only one record in the output summary, |
- # since we passed skpdiff only one pair of images. |
- record = data['records'][0] |
- self._width = record['width'] |
- self._height = record['height'] |
- self._diffUrl = os.path.split(record['rgbDiffPath'])[1] |
- self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1] |
- |
- # TODO: make max_diff_per_channel a tuple instead of a list, because the |
- # structure is meaningful (first element is red, second is green, etc.) |
- # See http://stackoverflow.com/a/626871 |
- self._max_diff_per_channel = [ |
- record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']] |
- per_differ_stats = record['diffs'] |
- for stats in per_differ_stats: |
- differ_name = stats['differName'] |
- if differ_name == 'different_pixels': |
- self._num_pixels_differing = stats['pointsOfInterest'] |
- elif differ_name == 'perceptual': |
- perceptual_similarity = stats['result'] |
- |
- # skpdiff returns the perceptual similarity; convert it to get the |
- # perceptual difference percentage. |
- # skpdiff outputs -1 if the images are different sizes. Treat any |
- # output that does not lie in [0, 1] as having 0% perceptual |
- # similarity. |
- if not 0 <= perceptual_similarity <= 1: |
- perceptual_similarity = 0 |
- self._perceptual_difference = 100 - (perceptual_similarity * 100) |
- finally: |
- shutil.rmtree(skpdiff_output_dir) |
- |
- # TODO(epoger): Use properties instead of getters throughout. |
- # See http://stackoverflow.com/a/6618176 |
- def get_num_pixels_differing(self): |
- """Returns the absolute number of pixels that differ.""" |
- return self._num_pixels_differing |
- |
- def get_percent_pixels_differing(self): |
- """Returns the percentage of pixels that differ, as a float between |
- 0 and 100 (inclusive).""" |
- return ((float(self._num_pixels_differing) * 100) / |
- (self._width * self._height)) |
- |
- def get_perceptual_difference(self): |
- """Returns the perceptual difference percentage.""" |
- return self._perceptual_difference |
- |
- def get_max_diff_per_channel(self): |
- """Returns the maximum difference between the expected and actual images |
- for each R/G/B channel, as a list.""" |
- return self._max_diff_per_channel |
- |
- def as_dict(self): |
- """Returns a dictionary representation of this DiffRecord, as needed when |
- constructing the JSON representation.""" |
- return { |
- KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing, |
- KEY__DIFFERENCES__PERCENT_DIFF_PIXELS: |
- self.get_percent_pixels_differing(), |
- KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel, |
- KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference, |
- KEY__DIFFERENCES__DIFF_URL: self._diffUrl, |
- KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl, |
- } |
- |
- |
- |
-class ImageDiffDB(object): |
- """ Calculates differences between image pairs, maintaining a database of |
- them for download.""" |
- |
- def __init__(self, storage_root, gs=None, |
- num_worker_threads=DEFAULT_NUM_WORKER_THREADS): |
- """ |
- Args: |
- storage_root: string; root path within the DB will store all of its stuff |
- gs: instance of GSUtils object we can use to download images |
- num_worker_threads: how many threads that download images and |
- generate diffs simultaneously |
- """ |
- self._storage_root = storage_root |
- self._gs = gs |
- |
- # Mechanism for reporting queue size periodically. |
- self._last_queue_size_reported = None |
- self._queue_size_report_lock = threading.RLock() |
- |
- # Dictionary of DiffRecords, keyed by (expected_image_locator, |
- # actual_image_locator) tuples. |
- # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED. |
- # |
- # Any thread that modifies _diff_dict must first acquire |
- # _diff_dict_writelock! |
- # |
- # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably |
- # remove items from self._diff_dict if they haven't been accessed for a |
- # long time. We can always regenerate them by diffing the images we |
- # previously downloaded to local disk. |
- # I guess we should figure out how expensive it is to download vs diff the |
- # image pairs... if diffing them is expensive too, we can write these |
- # _diff_dict objects out to disk if there's too many to hold in RAM. |
- # Or we could use virtual memory to handle that automatically. |
- self._diff_dict = {} |
- self._diff_dict_writelock = threading.RLock() |
- |
- # Set up the queue for asynchronously loading DiffRecords, and start the |
- # worker threads reading from it. |
- # The queue maxsize must be 0 (infinite size queue), so that asynchronous |
- # calls can return as soon as possible. |
- self._tasks_queue = Queue.Queue(maxsize=0) |
- self._workers = [] |
- for i in range(num_worker_threads): |
- worker = threading.Thread(target=self.worker, args=(i,)) |
- worker.daemon = True |
- worker.start() |
- self._workers.append(worker) |
- |
- def log_queue_size_if_changed(self, limit_verbosity=True): |
- """Log the size of self._tasks_queue, if it has changed since the last call. |
- |
- Reports the current queue size, using log.info(), unless the queue is the |
- same size as the last time we reported it. |
- |
- Args: |
- limit_verbosity: if True, only log if the queue size is a multiple of |
- QUEUE_LOGGING_GRANULARITY |
- """ |
- # Acquire the lock, to synchronize access to self._last_queue_size_reported |
- self._queue_size_report_lock.acquire() |
- try: |
- size = self._tasks_queue.qsize() |
- if size == self._last_queue_size_reported: |
- return |
- if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0): |
- return |
- logging.info('tasks_queue size is %d' % size) |
- self._last_queue_size_reported = size |
- finally: |
- self._queue_size_report_lock.release() |
- |
- def worker(self, worker_num): |
- """Launch a worker thread that pulls tasks off self._tasks_queue. |
- |
- Args: |
- worker_num: (integer) which worker this is |
- """ |
- while True: |
- self.log_queue_size_if_changed() |
- params = self._tasks_queue.get() |
- key, expected_image_url, actual_image_url = params |
- try: |
- diff_record = DiffRecord( |
- self._gs, self._storage_root, |
- expected_image_url=expected_image_url, |
- expected_image_locator=key[0], |
- actual_image_url=actual_image_url, |
- actual_image_locator=key[1]) |
- except Exception: |
- logging.exception( |
- 'exception while creating DiffRecord for key %s' % str(key)) |
- diff_record = _DIFFRECORD_FAILED |
- self._diff_dict_writelock.acquire() |
- try: |
- self._diff_dict[key] = diff_record |
- finally: |
- self._diff_dict_writelock.release() |
- |
- @property |
- def storage_root(self): |
- return self._storage_root |
- |
- def add_image_pair(self, |
- expected_image_url, expected_image_locator, |
- actual_image_url, actual_image_locator): |
- """Asynchronously prepare a DiffRecord for a pair of images. |
- |
- This method will return quickly; calls to get_diff_record() will block |
- until the DiffRecord is available (or we have given up on creating it). |
- |
- If we already have a DiffRecord for this particular image pair, no work |
- will be done. |
- |
- If expected_image_url (or its locator) is None, just download actual_image. |
- If actual_image_url (or its locator) is None, just download expected_image. |
- |
- Args: |
- expected_image_url: file, GS, or HTTP url from which we will download the |
- expected image |
- expected_image_locator: a unique ID string under which we will store the |
- expected image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- actual_image_url: file, GS, or HTTP url from which we will download the |
- actual image |
- actual_image_locator: a unique ID string under which we will store the |
- actual image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- """ |
- expected_image_locator = _sanitize_locator(expected_image_locator) |
- actual_image_locator = _sanitize_locator(actual_image_locator) |
- key = (expected_image_locator, actual_image_locator) |
- must_add_to_queue = False |
- |
- self._diff_dict_writelock.acquire() |
- try: |
- if not key in self._diff_dict: |
- # If we have already requested a diff between these two images, |
- # we don't need to request it again. |
- must_add_to_queue = True |
- self._diff_dict[key] = _DIFFRECORD_PENDING |
- finally: |
- self._diff_dict_writelock.release() |
- |
- if must_add_to_queue: |
- self._tasks_queue.put((key, expected_image_url, actual_image_url)) |
- self.log_queue_size_if_changed() |
- |
- def get_diff_record(self, expected_image_locator, actual_image_locator): |
- """Returns the DiffRecord for this image pair. |
- |
- This call will block until the diff record is available, or we were unable |
- to generate it. |
- |
- Args: |
- expected_image_locator: a unique ID string under which we will store the |
- expected image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- actual_image_locator: a unique ID string under which we will store the |
- actual image within storage_root (probably including a checksum to |
- guarantee uniqueness) |
- |
- Returns the DiffRecord for this image pair, or None if we were unable to |
- generate one. |
- """ |
- key = (_sanitize_locator(expected_image_locator), |
- _sanitize_locator(actual_image_locator)) |
- diff_record = self._diff_dict[key] |
- |
- # If we have no results yet, block until we do. |
- while diff_record == _DIFFRECORD_PENDING: |
- time.sleep(1) |
- diff_record = self._diff_dict[key] |
- |
- # Once we have the result... |
- if diff_record == _DIFFRECORD_FAILED: |
- logging.error( |
- 'failed to create a DiffRecord for expected_image_locator=%s , ' |
- 'actual_image_locator=%s' % ( |
- expected_image_locator, actual_image_locator)) |
- return None |
- else: |
- return diff_record |
- |
- |
-# Utility functions |
- |
-def _download_file(gs, local_filepath, url): |
- """Download a file from url to local_filepath, unless it is already there. |
- |
- Args: |
- gs: instance of GSUtils object, in case the url points at Google Storage |
- local_filepath: path on local disk where the image should be stored |
- url: HTTP or GS URL from which we can download the image if we don't have |
- it yet |
- """ |
- global global_file_collisions |
- if not os.path.exists(local_filepath): |
- _mkdir_unless_exists(os.path.dirname(local_filepath)) |
- |
- # First download the file contents into a unique filename, and |
- # then rename that file. That way, if multiple threads are downloading |
- # the same filename at the same time, they won't interfere with each |
- # other (they will both download the file, and one will "win" in the end) |
- temp_filename = '%s-%d' % (local_filepath, |
- threading.current_thread().ident) |
- if gs_utils.GSUtils.is_gs_url(url): |
- (bucket, path) = gs_utils.GSUtils.split_gs_url(url) |
- gs.download_file(source_bucket=bucket, source_path=path, |
- dest_path=temp_filename) |
- else: |
- with contextlib.closing(urllib.urlopen(url)) as url_handle: |
- with open(temp_filename, 'wb') as file_handle: |
- shutil.copyfileobj(fsrc=url_handle, fdst=file_handle) |
- |
- # Rename the file to its real filename. |
- # Keep count of how many colliding downloads we encounter; |
- # if it's a large number, we may want to change our download strategy |
- # to minimize repeated downloads. |
- if os.path.exists(local_filepath): |
- global_file_collisions += 1 |
- else: |
- os.rename(temp_filename, local_filepath) |
- |
- |
-def _mkdir_unless_exists(path): |
- """Unless path refers to an already-existing directory, create it. |
- |
- Args: |
- path: path on local disk |
- """ |
- try: |
- os.makedirs(path) |
- except OSError as e: |
- if e.errno == errno.EEXIST: |
- pass |
- |
- |
-def _sanitize_locator(locator): |
- """Returns a sanitized version of a locator (one in which we know none of the |
- characters will have special meaning in filenames). |
- |
- Args: |
- locator: string, or something that can be represented as a string. |
- If None or '', it is returned without modification, because empty |
- locators have a particular meaning ("there is no image for this") |
- """ |
- if locator: |
- return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator)) |
- else: |
- return locator |