OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python |
| 2 |
| 3 """ |
| 4 Copyright 2013 Google Inc. |
| 5 |
| 6 Use of this source code is governed by a BSD-style license that can be |
| 7 found in the LICENSE file. |
| 8 |
| 9 Calulate differences between image pairs, and store them in a database. |
| 10 """ |
| 11 |
| 12 # System-level imports |
| 13 import contextlib |
| 14 import errno |
| 15 import json |
| 16 import logging |
| 17 import os |
| 18 import Queue |
| 19 import re |
| 20 import shutil |
| 21 import tempfile |
| 22 import threading |
| 23 import time |
| 24 import urllib |
| 25 |
| 26 # Must fix up PYTHONPATH before importing from within Skia |
| 27 import rs_fixpypath # pylint: disable=W0611 |
| 28 |
| 29 # Imports from within Skia |
| 30 import find_run_binary |
| 31 from py.utils import gs_utils |
| 32 |
| 33 |
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'
# TODO(epoger): Figure out a better default number of threads; for now,
# using a conservative default value.
DEFAULT_NUM_WORKER_THREADS = 1

# Any character that is NOT alphanumeric, underscore, or hyphen is unsafe in
# a filename and gets replaced by _sanitize_locator().
# Raw string so that '\w' and '\-' reach the regex engine verbatim
# ('\w' is an invalid *string* escape and warns on modern Python).
DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

# Special values within ImageDiffDB._diff_dict
_DIFFRECORD_FAILED = 'failed'
_DIFFRECORD_PENDING = 'pending'

# How often to report tasks_queue size
QUEUE_LOGGING_GRANULARITY = 1000

# Temporary variable to keep track of how many times we download
# the same file in multiple threads.
# TODO(epoger): Delete this, once we see that the number stays close to 0.
global_file_collisions = 0
| 67 |
| 68 |
class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.

    Raises:
      Exception: if either image could not be downloaded, or if the skpdiff
          binary fails.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs: identical URLs, or
    # one side of the pair is missing (nothing to compare against).
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      # Bug fix: this second call used to re-create skpdiff_rgbdiff_dir,
      # leaving skpdiff_whitediff_dir possibly nonexistent when skpdiff
      # tried to write into it.
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      with contextlib.closing(open(skpdiff_summary_file)) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Strip the diff images down to their basenames; they are served
      # relative to the rgbdiff/whitediff storage dirs.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

      # skpdiff returns the perceptual similarity; convert it to get the
      # perceptual difference percentage.
      # skpdiff outputs -1 if the images are different sizes. Treat any
      # output that does not lie in [0, 1] as having 0% perceptual
      # similarity.
      if not 0 <= perceptual_similarity <= 1:
        perceptual_similarity = 0
      self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      # Always clean up the scratch dir, even if skpdiff failed.
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
| 223 |
| 224 |
| 225 |
class ImageDiffDB(object):
  """ Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within the DB will store all of its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads that download images and
          generate diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # Mechanism for reporting queue size periodically.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Dictionary of DiffRecords, keyed by (expected_image_locator,
    # actual_image_locator) tuples.
    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time. We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Set up the queue for asynchronously loading DiffRecords, and start the
    # worker threads reading from it.
    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
    # calls can return as soon as possible.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for i in range(num_worker_threads):
      worker = threading.Thread(target=self.worker, args=(i,))
      # Daemon threads, so a hung worker cannot keep the process alive.
      worker.daemon = True
      worker.start()
      self._workers.append(worker)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last call.

    Reports the current queue size, using log.info(), unless the queue is the
    same size as the last time we reported it.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # Hold the lock to synchronize access to self._last_queue_size_reported.
    with self._queue_size_report_lock:
      size = self._tasks_queue.qsize()
      if size == self._last_queue_size_reported:
        return
      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
        return
      logging.info('tasks_queue size is %d' % size)
      self._last_queue_size_reported = size

  def worker(self, worker_num):
    """Launch a worker thread that pulls tasks off self._tasks_queue.

    Runs forever; records either a DiffRecord or _DIFFRECORD_FAILED into
    self._diff_dict for each task, so get_diff_record() never blocks forever.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      params = self._tasks_queue.get()
      key, expected_image_url, actual_image_url = params
      try:
        diff_record = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        diff_record = _DIFFRECORD_FAILED
      with self._diff_dict_writelock:
        self._diff_dict[key] = diff_record

  @property
  def storage_root(self):
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If we already have a DiffRecord for this particular image pair, no work
    will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)
    key = (expected_image_locator, actual_image_locator)
    must_add_to_queue = False

    with self._diff_dict_writelock:
      if key not in self._diff_dict:
        # If we have already requested a diff between these two images,
        # we don't need to request it again.
        must_add_to_queue = True
        self._diff_dict[key] = _DIFFRECORD_PENDING

    # Enqueue outside the lock; the queue has its own synchronization.
    if must_add_to_queue:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    diff_record = self._diff_dict[key]

    # If we have no results yet, block until we do.
    # (Polling, rather than a condition variable, keeps the worker protocol
    # simple; the 1-second sleep bounds the busy-wait cost.)
    while diff_record == _DIFFRECORD_PENDING:
      time.sleep(1)
      diff_record = self._diff_dict[key]

    # Once we have the result...
    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    else:
      return diff_record
| 410 |
| 411 |
| 412 # Utility functions |
| 413 |
def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet
  """
  global global_file_collisions
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))

    # First download the file contents into a unique filename, and
    # then rename that file. That way, if multiple threads are downloading
    # the same filename at the same time, they won't interfere with each
    # other (they will both download the file, and one will "win" in the end)
    temp_filename = '%s-%d' % (local_filepath,
                               threading.current_thread().ident)
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
      # Bug fix: another thread won the race, so discard our redundant copy;
      # previously the temp file was left behind forever, leaking disk space.
      os.remove(temp_filename)
    else:
      os.rename(temp_filename, local_filepath)
| 450 |
| 451 |
| 452 def _mkdir_unless_exists(path): |
| 453 """Unless path refers to an already-existing directory, create it. |
| 454 |
| 455 Args: |
| 456 path: path on local disk |
| 457 """ |
| 458 try: |
| 459 os.makedirs(path) |
| 460 except OSError as e: |
| 461 if e.errno == errno.EEXIST: |
| 462 pass |
| 463 |
| 464 |
def _sanitize_locator(locator):
  """Returns a sanitized version of a locator (one in which we know none of the
  characters will have special meaning in filenames).

  Args:
    locator: string, or something that can be represented as a string.
        If None or '', it is returned without modification, because empty
        locators have a particular meaning ("there is no image for this")
  """
  # Guard clause: empty locators pass through untouched.
  if not locator:
    return locator
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
OLD | NEW |