Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(309)

Unified Diff: gm/rebaseline_server/imagediffdb.py

Issue 856103002: Revert "Revert "delete old things!"" (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « gm/rebaseline_server/download_actuals_test.py ('k') | gm/rebaseline_server/imagediffdb_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: gm/rebaseline_server/imagediffdb.py
diff --git a/gm/rebaseline_server/imagediffdb.py b/gm/rebaseline_server/imagediffdb.py
deleted file mode 100644
index 0bc75cfca4d9c9dce9a47759dddb8ec99aa6383a..0000000000000000000000000000000000000000
--- a/gm/rebaseline_server/imagediffdb.py
+++ /dev/null
@@ -1,477 +0,0 @@
-#!/usr/bin/python
-
-"""
-Copyright 2013 Google Inc.
-
-Use of this source code is governed by a BSD-style license that can be
-found in the LICENSE file.
-
-Calculate differences between image pairs, and store them in a database.
-"""
-
-# System-level imports
-import contextlib
-import errno
-import json
-import logging
-import os
-import Queue
-import re
-import shutil
-import tempfile
-import threading
-import time
-import urllib
-
-# Must fix up PYTHONPATH before importing from within Skia
-import rs_fixpypath # pylint: disable=W0611
-
-# Imports from within Skia
-import find_run_binary
-from py.utils import gs_utils
-
-
-# Path to the skpdiff program, as located by find_run_binary at import time.
-SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
-
-# Defaults used when storing downloaded images under a storage root.
-DEFAULT_IMAGE_SUFFIX = '.png'
-DEFAULT_IMAGES_SUBDIR = 'images'
-# TODO(epoger): Figure out a better default number of threads; for now,
-# using a conservative default value.
-DEFAULT_NUM_WORKER_THREADS = 1
-
-# Matches any character that is neither a word character nor a hyphen.
-# _sanitize_locator() replaces every match with '_'.
-# (Raw string, so the backslash escapes are passed to the regex engine
-# untouched.)
-DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')
-
-# Subdirectories of the storage root that receive the diff images
-# written by skpdiff.
-RGBDIFFS_SUBDIR = 'diffs'
-WHITEDIFFS_SUBDIR = 'whitediffs'
-
-# Keys used within DiffRecord dictionary representations.
-# NOTE: Keep these in sync with static/constants.js
-KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
-KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
-KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
-KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
-KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
-KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'
-
-# Special values within ImageDiffDB._diff_dict, stored in place of a
-# DiffRecord while a diff is still queued or after it could not be created.
-_DIFFRECORD_FAILED = 'failed'
-_DIFFRECORD_PENDING = 'pending'
-
-# How often to report tasks_queue size: the size is only logged when it is
-# a multiple of this value (unless verbosity limiting is turned off).
-QUEUE_LOGGING_GRANULARITY = 1000
-
-# Temporary variable to keep track of how many times we download
-# the same file in multiple threads.
-# TODO(epoger): Delete this, once we see that the number stays close to 0.
-global_file_collisions = 0
-
-
-class DiffRecord(object):
-  """Record of differences between two images, as computed by skpdiff.
-
-  NOTE: If the constructor returns early (the two image URLs are equal, or
-  either one is empty), none of the diff attributes are set; calling the
-  getters or as_dict() on such an instance raises AttributeError.
-  """
-
-  def __init__(self, gs, storage_root,
-               expected_image_url, expected_image_locator,
-               actual_image_url, actual_image_locator,
-               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
-               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
-               image_suffix=DEFAULT_IMAGE_SUFFIX):
-    """Download this pair of images (unless we already have them on local disk),
-    and prepare a DiffRecord for them.
-
-    Args:
-      gs: instance of GSUtils object we can use to download images
-      storage_root: root directory on local disk within which we store all
-          images
-      expected_image_url: file, GS, or HTTP url from which we will download the
-          expected image
-      expected_image_locator: a unique ID string under which we will store the
-          expected image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-      actual_image_url: file, GS, or HTTP url from which we will download the
-          actual image
-      actual_image_locator: a unique ID string under which we will store the
-          actual image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-      expected_images_subdir: the subdirectory expected images are stored in.
-      actual_images_subdir: the subdirectory actual images are stored in.
-      image_suffix: the suffix of images.
-
-    Raises:
-      Exception: whatever the download, the skpdiff invocation, or the
-          parsing of the skpdiff summary raises; download failures are
-          logged before being re-raised.
-    """
-    expected_image_locator = _sanitize_locator(expected_image_locator)
-    actual_image_locator = _sanitize_locator(actual_image_locator)
-
-    # Download the expected/actual images, if we don't have them already.
-    expected_image_file = os.path.join(
-        storage_root, expected_images_subdir,
-        str(expected_image_locator) + image_suffix)
-    actual_image_file = os.path.join(
-        storage_root, actual_images_subdir,
-        str(actual_image_locator) + image_suffix)
-    for image_file, image_url in [
-        (expected_image_file, expected_image_url),
-        (actual_image_file, actual_image_url)]:
-      if image_file and image_url:
-        try:
-          _download_file(gs, image_file, image_url)
-        except Exception:
-          logging.exception('unable to download image_url %s to file %s' %
-                            (image_url, image_file))
-          raise
-
-    # Return early if we do not need to generate diffs.
-    if (expected_image_url == actual_image_url or
-        not expected_image_url or not actual_image_url):
-      return
-
-    # Get all diff images and values using the skpdiff binary.
-    # The JSON summary goes into a throwaway directory that is removed in
-    # the finally clause; the diff images go under storage_root.
-    skpdiff_output_dir = tempfile.mkdtemp()
-    try:
-      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
-                                          'skpdiff-output.json')
-      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
-      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
-      # Make sure BOTH output directories exist before skpdiff runs.
-      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
-      _mkdir_unless_exists(skpdiff_whitediff_dir)
-
-      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
-      # instead of calling it separately for each image pair.
-      # Pro: we'll incur less overhead from making repeated system calls,
-      # spinning up the skpdiff binary, etc.
-      # Con: we would have to wait until all image pairs were loaded before
-      # generating any of the diffs?
-      # Note(stephana): '--longnames' was added to allow for this
-      # case (multiple files at once) versus specifying output diffs
-      # directly.
-      find_run_binary.run_command(
-          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
-           '--jsonp', 'false',
-           '--longnames', 'true',
-           '--output', skpdiff_summary_file,
-           '--differs', 'perceptual', 'different_pixels',
-           '--rgbDiffDir', skpdiff_rgbdiff_dir,
-           '--whiteDiffDir', skpdiff_whitediff_dir,
-          ])
-
-      # Get information out of the skpdiff_summary_file.
-      with open(skpdiff_summary_file) as fp:
-        data = json.load(fp)
-
-      # For now, we can assume there is only one record in the output summary,
-      # since we passed skpdiff only one pair of images.
-      record = data['records'][0]
-      self._width = record['width']
-      self._height = record['height']
-      # Only the file names (not the full paths) of the diff images are kept.
-      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
-      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]
-
-      # TODO: make max_diff_per_channel a tuple instead of a list, because the
-      # structure is meaningful (first element is red, second is green, etc.)
-      # See http://stackoverflow.com/a/626871
-      self._max_diff_per_channel = [
-          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
-      per_differ_stats = record['diffs']
-      for stats in per_differ_stats:
-        differ_name = stats['differName']
-        if differ_name == 'different_pixels':
-          self._num_pixels_differing = stats['pointsOfInterest']
-        elif differ_name == 'perceptual':
-          perceptual_similarity = stats['result']
-
-      # skpdiff returns the perceptual similarity; convert it to get the
-      # perceptual difference percentage.
-      # skpdiff outputs -1 if the images are different sizes. Treat any
-      # output that does not lie in [0, 1] as having 0% perceptual
-      # similarity.
-      if not 0 <= perceptual_similarity <= 1:
-        perceptual_similarity = 0
-      self._perceptual_difference = 100 - (perceptual_similarity * 100)
-    finally:
-      shutil.rmtree(skpdiff_output_dir)
-
-  # TODO(epoger): Use properties instead of getters throughout.
-  # See http://stackoverflow.com/a/6618176
-  def get_num_pixels_differing(self):
-    """Returns the absolute number of pixels that differ."""
-    return self._num_pixels_differing
-
-  def get_percent_pixels_differing(self):
-    """Returns the percentage of pixels that differ, as a float between
-    0 and 100 (inclusive)."""
-    return ((float(self._num_pixels_differing) * 100) /
-            (self._width * self._height))
-
-  def get_perceptual_difference(self):
-    """Returns the perceptual difference percentage."""
-    return self._perceptual_difference
-
-  def get_max_diff_per_channel(self):
-    """Returns the maximum difference between the expected and actual images
-    for each R/G/B channel, as a list."""
-    return self._max_diff_per_channel
-
-  def as_dict(self):
-    """Returns a dictionary representation of this DiffRecord, as needed when
-    constructing the JSON representation."""
-    return {
-        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
-        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
-            self.get_percent_pixels_differing(),
-        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
-        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
-        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
-        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
-    }
-
-
-
-class ImageDiffDB(object):
-  """Calculates differences between image pairs, maintaining a database of
-  them for download.
-
-  Diffs are computed asynchronously: add_image_pair() queues the work,
-  daemon worker threads started by the constructor perform it, and
-  get_diff_record() blocks until the result is in.
-  """
-
-  def __init__(self, storage_root, gs=None,
-               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
-    """
-    Args:
-      storage_root: string; root path within which the DB will store all of
-          its stuff
-      gs: instance of GSUtils object we can use to download images
-      num_worker_threads: how many threads download images and generate
-          diffs simultaneously
-    """
-    self._storage_root = storage_root
-    self._gs = gs
-
-    # Mechanism for reporting queue size periodically.
-    self._last_queue_size_reported = None
-    self._queue_size_report_lock = threading.RLock()
-
-    # Dictionary of DiffRecords, keyed by (expected_image_locator,
-    # actual_image_locator) tuples.
-    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
-    #
-    # Any thread that modifies _diff_dict must first acquire
-    # _diff_dict_writelock!
-    #
-    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
-    # remove items from self._diff_dict if they haven't been accessed for a
-    # long time.  We can always regenerate them by diffing the images we
-    # previously downloaded to local disk.
-    # I guess we should figure out how expensive it is to download vs diff the
-    # image pairs... if diffing them is expensive too, we can write these
-    # _diff_dict objects out to disk if there's too many to hold in RAM.
-    # Or we could use virtual memory to handle that automatically.
-    self._diff_dict = {}
-    self._diff_dict_writelock = threading.RLock()
-
-    # Set up the queue for asynchronously loading DiffRecords, and start the
-    # worker threads reading from it.
-    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
-    # calls can return as soon as possible.
-    self._tasks_queue = Queue.Queue(maxsize=0)
-    self._workers = []
-    for i in range(num_worker_threads):
-      worker = threading.Thread(target=self.worker, args=(i,))
-      # Daemon threads, so the never-ending worker loops do not keep the
-      # process alive.
-      worker.daemon = True
-      worker.start()
-      self._workers.append(worker)
-
-  def log_queue_size_if_changed(self, limit_verbosity=True):
-    """Log the size of self._tasks_queue, if it has changed since the last call.
-
-    Reports the current queue size, using logging.info(), unless the queue is
-    the same size as the last time we reported it.
-
-    Args:
-      limit_verbosity: if True, only log if the queue size is a multiple of
-          QUEUE_LOGGING_GRANULARITY
-    """
-    # Hold the lock, to synchronize access to self._last_queue_size_reported
-    with self._queue_size_report_lock:
-      size = self._tasks_queue.qsize()
-      if size == self._last_queue_size_reported:
-        return
-      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
-        return
-      logging.info('tasks_queue size is %d' % size)
-      self._last_queue_size_reported = size
-
-  def worker(self, worker_num):
-    """Body of a worker thread: pull tasks off self._tasks_queue forever.
-
-    Each task is turned into a DiffRecord, which is stored in self._diff_dict
-    under the task's key; if creating it raises, the exception is logged and
-    _DIFFRECORD_FAILED is stored instead.
-
-    Args:
-      worker_num: (integer) which worker this is
-    """
-    while True:
-      self.log_queue_size_if_changed()
-      params = self._tasks_queue.get()
-      key, expected_image_url, actual_image_url = params
-      try:
-        diff_record = DiffRecord(
-            self._gs, self._storage_root,
-            expected_image_url=expected_image_url,
-            expected_image_locator=key[0],
-            actual_image_url=actual_image_url,
-            actual_image_locator=key[1])
-      except Exception:
-        # Keep the worker alive; record the failure so that callers blocked
-        # in get_diff_record() stop waiting.
-        logging.exception(
-            'exception while creating DiffRecord for key %s' % str(key))
-        diff_record = _DIFFRECORD_FAILED
-      with self._diff_dict_writelock:
-        self._diff_dict[key] = diff_record
-
-  @property
-  def storage_root(self):
-    """Root path within which this DB stores its files."""
-    return self._storage_root
-
-  def add_image_pair(self,
-                     expected_image_url, expected_image_locator,
-                     actual_image_url, actual_image_locator):
-    """Asynchronously prepare a DiffRecord for a pair of images.
-
-    This method will return quickly; calls to get_diff_record() will block
-    until the DiffRecord is available (or we have given up on creating it).
-
-    If we already have a DiffRecord for this particular image pair, no work
-    will be done.
-
-    If expected_image_url (or its locator) is None, just download actual_image.
-    If actual_image_url (or its locator) is None, just download expected_image.
-
-    Args:
-      expected_image_url: file, GS, or HTTP url from which we will download the
-          expected image
-      expected_image_locator: a unique ID string under which we will store the
-          expected image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-      actual_image_url: file, GS, or HTTP url from which we will download the
-          actual image
-      actual_image_locator: a unique ID string under which we will store the
-          actual image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-    """
-    expected_image_locator = _sanitize_locator(expected_image_locator)
-    actual_image_locator = _sanitize_locator(actual_image_locator)
-    key = (expected_image_locator, actual_image_locator)
-    must_add_to_queue = False
-
-    # Check-and-mark under the lock, so that only one caller ever queues a
-    # given key.  If we have already requested a diff between these two
-    # images, we don't need to request it again.
-    with self._diff_dict_writelock:
-      if key not in self._diff_dict:
-        must_add_to_queue = True
-        self._diff_dict[key] = _DIFFRECORD_PENDING
-
-    if must_add_to_queue:
-      self._tasks_queue.put((key, expected_image_url, actual_image_url))
-      self.log_queue_size_if_changed()
-
-  def get_diff_record(self, expected_image_locator, actual_image_locator):
-    """Returns the DiffRecord for this image pair.
-
-    This call will block until the diff record is available, or we were unable
-    to generate it.
-
-    Args:
-      expected_image_locator: a unique ID string under which we will store the
-          expected image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-      actual_image_locator: a unique ID string under which we will store the
-          actual image within storage_root (probably including a checksum to
-          guarantee uniqueness)
-
-    Returns the DiffRecord for this image pair, or None if we were unable to
-    generate one.
-
-    Raises:
-      KeyError: if there is no entry for this image pair in the DB (the
-          lookup in self._diff_dict fails).
-    """
-    key = (_sanitize_locator(expected_image_locator),
-           _sanitize_locator(actual_image_locator))
-    diff_record = self._diff_dict[key]
-
-    # If we have no results yet, block until we do (polling once a second).
-    while diff_record == _DIFFRECORD_PENDING:
-      time.sleep(1)
-      diff_record = self._diff_dict[key]
-
-    # Once we have the result...
-    if diff_record == _DIFFRECORD_FAILED:
-      logging.error(
-          'failed to create a DiffRecord for expected_image_locator=%s , '
-          'actual_image_locator=%s' % (
-              expected_image_locator, actual_image_locator))
-      return None
-    else:
-      return diff_record
-
-
-# Utility functions
-
-def _download_file(gs, local_filepath, url):
-  """Download a file from url to local_filepath, unless it is already there.
-
-  The contents are first written to a per-thread temporary file next to
-  local_filepath and then renamed into place.  The temporary file is always
-  cleaned up, whether the download fails or another thread delivers the
-  same file first.
-
-  Args:
-    gs: instance of GSUtils object, in case the url points at Google Storage
-    local_filepath: path on local disk where the image should be stored
-    url: HTTP or GS URL from which we can download the image if we don't have
-        it yet
-  """
-  global global_file_collisions
-  if os.path.exists(local_filepath):
-    return
-  _mkdir_unless_exists(os.path.dirname(local_filepath))
-
-  # First download the file contents into a unique filename, and
-  # then rename that file.  That way, if multiple threads are downloading
-  # the same filename at the same time, they won't interfere with each
-  # other (they will both download the file, and one will "win" in the end)
-  temp_filename = '%s-%d' % (local_filepath,
-                             threading.current_thread().ident)
-  try:
-    if gs_utils.GSUtils.is_gs_url(url):
-      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
-      gs.download_file(source_bucket=bucket, source_path=path,
-                       dest_path=temp_filename)
-    else:
-      with contextlib.closing(urllib.urlopen(url)) as url_handle:
-        with open(temp_filename, 'wb') as file_handle:
-          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
-
-    # Rename the file to its real filename.
-    # Keep count of how many colliding downloads we encounter;
-    # if it's a large number, we may want to change our download strategy
-    # to minimize repeated downloads.
-    if os.path.exists(local_filepath):
-      global_file_collisions += 1
-    else:
-      os.rename(temp_filename, local_filepath)
-  finally:
-    # After a successful rename the temporary file is gone.  It is still
-    # around if we lost the race to another thread or the download failed,
-    # so remove it rather than leaving it behind in the storage directory.
-    if os.path.exists(temp_filename):
-      try:
-        os.remove(temp_filename)
-      except OSError:
-        # Best effort: never let cleanup mask the original outcome.
-        logging.warning('unable to remove temporary file %s' % temp_filename)
-
-
-def _mkdir_unless_exists(path):
-  """Unless path refers to an already-existing directory, create it.
-
-  Args:
-    path: path on local disk; an empty path is treated as "nothing to
-        create"
-
-  Raises:
-    OSError: if the directory could not be created, or if path exists but is
-        not a directory.
-  """
-  if not path:
-    return
-  try:
-    os.makedirs(path)
-  except OSError as e:
-    # The only failure we tolerate is "already exists", and only when the
-    # existing path really is a directory; anything else (permissions, a
-    # regular file in the way, ...) must not be silently ignored.
-    if e.errno != errno.EEXIST or not os.path.isdir(path):
-      raise
-
-
-def _sanitize_locator(locator):
-  """Returns a filename-safe version of a locator.
-
-  Every character matched by DISALLOWED_FILEPATH_CHAR_REGEX is replaced with
-  an underscore, so that none of the remaining characters can have special
-  meaning in filenames.
-
-  Args:
-    locator: string, or something that can be represented as a string.
-        If None or '', it is returned without modification, because empty
-        locators have a particular meaning ("there is no image for this")
-  """
-  if not locator:
-    # Empty locators are passed through untouched.
-    return locator
-  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
« no previous file with comments | « gm/rebaseline_server/download_actuals_test.py ('k') | gm/rebaseline_server/imagediffdb_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698