Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: gm/rebaseline_server/imagediffdb.py

Issue 235923002: rebaseline_server: multithreaded loading/diffing of images (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « gm/rebaseline_server/compare_to_expectations.py ('k') | gm/rebaseline_server/imagepair.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/python 1 #!/usr/bin/python
2 2
3 """ 3 """
4 Copyright 2013 Google Inc. 4 Copyright 2013 Google Inc.
5 5
6 Use of this source code is governed by a BSD-style license that can be 6 Use of this source code is governed by a BSD-style license that can be
7 found in the LICENSE file. 7 found in the LICENSE file.
8 8
9 Calulate differences between image pairs, and store them in a database. 9 Calulate differences between image pairs, and store them in a database.
10 """ 10 """
11 11
12 import contextlib 12 import contextlib
13 import csv 13 import csv
14 import errno
14 import logging 15 import logging
16 import Queue
15 import os 17 import os
16 import re 18 import re
17 import shutil 19 import shutil
18 import sys 20 import sys
19 import tempfile 21 import tempfile
22 import time
23 import threading
20 import urllib 24 import urllib
21 try: 25 try:
22 from PIL import Image, ImageChops 26 from PIL import Image, ImageChops
23 except ImportError: 27 except ImportError:
24 raise ImportError('Requires PIL to be installed; see ' 28 raise ImportError('Requires PIL to be installed; see '
25 + 'http://www.pythonware.com/products/pil/') 29 + 'http://www.pythonware.com/products/pil/')
26 30
27 # Set the PYTHONPATH to include the tools directory. 31 # Set the PYTHONPATH to include the tools directory.
28 sys.path.append( 32 sys.path.append(
29 os.path.join( 33 os.path.join(
30 os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir, 34 os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir,
31 'tools')) 35 'tools'))
32 import find_run_binary 36 import find_run_binary
33 37
34 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff') 38 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
35 39
36 DEFAULT_IMAGE_SUFFIX = '.png' 40 DEFAULT_IMAGE_SUFFIX = '.png'
37 DEFAULT_IMAGES_SUBDIR = 'images' 41 DEFAULT_IMAGES_SUBDIR = 'images'
42 DEFAULT_NUM_WORKERS = 8
38 43
39 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]') 44 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')
40 45
41 DIFFS_SUBDIR = 'diffs' 46 DIFFS_SUBDIR = 'diffs'
42 WHITEDIFFS_SUBDIR = 'whitediffs' 47 WHITEDIFFS_SUBDIR = 'whitediffs'
43 48
44 VALUES_PER_BAND = 256 49 VALUES_PER_BAND = 256
45 50
46 # Keys used within DiffRecord dictionary representations. 51 # Keys used within DiffRecord dictionary representations.
47 # NOTE: Keep these in sync with static/constants.js 52 # NOTE: Keep these in sync with static/constants.js
48 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel' 53 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
49 KEY__DIFFERENCE_DATA__NUM_DIFF_PIXELS = 'numDifferingPixels' 54 KEY__DIFFERENCE_DATA__NUM_DIFF_PIXELS = 'numDifferingPixels'
50 KEY__DIFFERENCE_DATA__PERCENT_DIFF_PIXELS = 'percentDifferingPixels' 55 KEY__DIFFERENCE_DATA__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
51 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF = 'perceptualDifference' 56 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF = 'perceptualDifference'
52 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF = 'weightedDiffMeasure' 57 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF = 'weightedDiffMeasure'
53 58
59 # Special values within ImageDiffDB._diff_dict
60 DIFFRECORD_FAILED = 'failed'
61 DIFFRECORD_PENDING = 'pending'
62
63 # TODO(epoger): Temporary(?) list to keep track of how many times we download
64 # the same file in multiple threads.
65 global_file_collisions = 0
66
54 67
55 class DiffRecord(object): 68 class DiffRecord(object):
56 """ Record of differences between two images. """ 69 """ Record of differences between two images. """
57 70
58 def __init__(self, storage_root, 71 def __init__(self, storage_root,
59 expected_image_url, expected_image_locator, 72 expected_image_url, expected_image_locator,
60 actual_image_url, actual_image_locator, 73 actual_image_url, actual_image_locator,
61 expected_images_subdir=DEFAULT_IMAGES_SUBDIR, 74 expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
62 actual_images_subdir=DEFAULT_IMAGES_SUBDIR, 75 actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
63 image_suffix=DEFAULT_IMAGE_SUFFIX): 76 image_suffix=DEFAULT_IMAGE_SUFFIX):
64 """Download this pair of images (unless we already have them on local disk), 77 """Download this pair of images (unless we already have them on local disk),
65 and prepare a DiffRecord for them. 78 and prepare a DiffRecord for them.
66 79
67 TODO(epoger): Make this asynchronously download images, rather than blocking
68 until the images have been downloaded and processed.
69
70 Args: 80 Args:
71 storage_root: root directory on local disk within which we store all 81 storage_root: root directory on local disk within which we store all
72 images 82 images
73 expected_image_url: file or HTTP url from which we will download the 83 expected_image_url: file or HTTP url from which we will download the
74 expected image 84 expected image
75 expected_image_locator: a unique ID string under which we will store the 85 expected_image_locator: a unique ID string under which we will store the
76 expected image within storage_root (probably including a checksum to 86 expected image within storage_root (probably including a checksum to
77 guarantee uniqueness) 87 guarantee uniqueness)
78 actual_image_url: file or HTTP url from which we will download the 88 actual_image_url: file or HTTP url from which we will download the
79 actual image 89 actual image
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after
212 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF: self.get_weighted_diff_measure(), 222 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF: self.get_weighted_diff_measure(),
213 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel, 223 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
214 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF: self._perceptual_difference, 224 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF: self._perceptual_difference,
215 } 225 }
216 226
217 227
218 class ImageDiffDB(object): 228 class ImageDiffDB(object):
219 """ Calculates differences between image pairs, maintaining a database of 229 """ Calculates differences between image pairs, maintaining a database of
220 them for download.""" 230 them for download."""
221 231
222 def __init__(self, storage_root): 232 def __init__(self, storage_root, num_workers=DEFAULT_NUM_WORKERS):
223 """ 233 """
224 Args: 234 Args:
225 storage_root: string; root path within the DB will store all of its stuff 235 storage_root: string; root path within the DB will store all of its stuff
236 num_workers: integer; number of worker threads to spawn
226 """ 237 """
227 self._storage_root = storage_root 238 self._storage_root = storage_root
228 239
229 # Dictionary of DiffRecords, keyed by (expected_image_locator, 240 # Dictionary of DiffRecords, keyed by (expected_image_locator,
230 # actual_image_locator) tuples. 241 # actual_image_locator) tuples.
242 # Values can also be DIFFRECORD_PENDING, DIFFRECORD_FAILED.
231 self._diff_dict = {} 243 self._diff_dict = {}
232 244
245 # Set up the queue for asynchronously loading DiffRecords, and start the
246 # worker threads reading from it.
247 self._tasks_queue = Queue.Queue(maxsize=2*num_workers)
248 self._workers = []
249 for i in range(num_workers):
250 worker = threading.Thread(target=self.worker, args=(i,))
251 worker.daemon = True
252 worker.start()
253 self._workers.append(worker)
254
255 def worker(self, worker_num):
256 """Launch a worker thread that pulls tasks off self._tasks_queue.
257
258 Args:
259 worker_num: (integer) which worker this is
260 """
261 while True:
262 params = self._tasks_queue.get()
263 key, expected_image_url, actual_image_url = params
264 try:
265 diff_record = DiffRecord(
266 self._storage_root,
267 expected_image_url=expected_image_url,
268 expected_image_locator=key[0],
269 actual_image_url=actual_image_url,
270 actual_image_locator=key[1])
271 except Exception:
272 logging.exception(
273 'exception while creating DiffRecord for key %s' % str(key))
274 diff_record = DIFFRECORD_FAILED
275 self._diff_dict[key] = diff_record
276
233 def add_image_pair(self, 277 def add_image_pair(self,
234 expected_image_url, expected_image_locator, 278 expected_image_url, expected_image_locator,
235 actual_image_url, actual_image_locator): 279 actual_image_url, actual_image_locator):
236 """Download this pair of images (unless we already have them on local disk), 280 """Download this pair of images (unless we already have them on local disk),
237 and prepare a DiffRecord for them. 281 and prepare a DiffRecord for them.
238 282
239 TODO(epoger): Make this asynchronously download images, rather than blocking 283 This method will block until the images are downloaded and DiffRecord is
240 until the images have been downloaded and processed. 284 available by calling get_diff_record().
241 When we do that, we should probably add a new method that will block
242 until all of the images have been downloaded and processed. Otherwise,
243 we won't know when it's safe to start calling get_diff_record().
244 jcgregorio notes: maybe just make ImageDiffDB thread-safe and create a
245 thread-pool/worker queue at a higher level that just uses ImageDiffDB?
246 285
247 Args: 286 Args:
248 expected_image_url: file or HTTP url from which we will download the 287 expected_image_url: file or HTTP url from which we will download the
249 expected image 288 expected image
250 expected_image_locator: a unique ID string under which we will store the 289 expected_image_locator: a unique ID string under which we will store the
251 expected image within storage_root (probably including a checksum to 290 expected image within storage_root (probably including a checksum to
252 guarantee uniqueness) 291 guarantee uniqueness)
253 actual_image_url: file or HTTP url from which we will download the 292 actual_image_url: file or HTTP url from which we will download the
254 actual image 293 actual image
255 actual_image_locator: a unique ID string under which we will store the 294 actual_image_locator: a unique ID string under which we will store the
256 actual image within storage_root (probably including a checksum to 295 actual image within storage_root (probably including a checksum to
257 guarantee uniqueness) 296 guarantee uniqueness)
297
298 Raises:
299 Exception if we are unable to create a DiffRecord for this image pair.
258 """ 300 """
259 expected_image_locator = _sanitize_locator(expected_image_locator) 301 key = _generate_key(expected_image_locator, actual_image_locator)
260 actual_image_locator = _sanitize_locator(actual_image_locator)
261 key = (expected_image_locator, actual_image_locator)
262 if not key in self._diff_dict: 302 if not key in self._diff_dict:
263 try: 303 try:
264 new_diff_record = DiffRecord( 304 new_diff_record = DiffRecord(
265 self._storage_root, 305 self._storage_root,
266 expected_image_url=expected_image_url, 306 expected_image_url=expected_image_url,
267 expected_image_locator=expected_image_locator, 307 expected_image_locator=expected_image_locator,
268 actual_image_url=actual_image_url, 308 actual_image_url=actual_image_url,
269 actual_image_locator=actual_image_locator) 309 actual_image_locator=actual_image_locator)
270 except Exception: 310 except Exception:
271 # If we can't create a real DiffRecord for this (expected, actual) pair, 311 # If we can't create a real DiffRecord for this (expected, actual) pair,
272 # store None and the UI will show whatever information we DO have. 312 # store None and the UI will show whatever information we DO have.
273 # Fixes http://skbug.com/2368 . 313 # Fixes http://skbug.com/2368 .
274 logging.exception( 314 logging.exception(
275 'got exception while creating a DiffRecord for ' 315 'got exception while creating a DiffRecord for '
276 'expected_image_url=%s , actual_image_url=%s; returning None' % ( 316 'expected_image_url=%s , actual_image_url=%s; returning None' % (
277 expected_image_url, actual_image_url)) 317 expected_image_url, actual_image_url))
278 new_diff_record = None 318 new_diff_record = None
279 self._diff_dict[key] = new_diff_record 319 self._diff_dict[key] = new_diff_record
280 320
321 def add_image_pair_async(self,
322 expected_image_url, expected_image_locator,
323 actual_image_url, actual_image_locator):
324 """Download this pair of images (unless we already have them on local disk),
325 and prepare a DiffRecord for them.
326
327 This method will return quickly; calls to get_diff_record() will block
328 until the DiffRecord is available (or we have given up on creating it).
329
330 Args:
331 expected_image_url: file or HTTP url from which we will download the
332 expected image
333 expected_image_locator: a unique ID string under which we will store the
334 expected image within storage_root (probably including a checksum to
335 guarantee uniqueness)
336 actual_image_url: file or HTTP url from which we will download the
337 actual image
338 actual_image_locator: a unique ID string under which we will store the
339 actual image within storage_root (probably including a checksum to
340 guarantee uniqueness)
341 """
342 key = _generate_key(expected_image_locator, actual_image_locator)
343 if not key in self._diff_dict:
344 # If we have already requested a diff between these two images,
345 # we don't need to request it again.
346 #
347 # Threading note: If multiple threads called into this method with the
348 # same key at the same time, there will be multiple tasks on the queue
349 # with the same key. But that's OK; they will both complete successfully,
350 # and just waste a little time in the process. Nothing will break.
rmistry 2014/04/14 17:59:06 Can we use semaphores/locks here? Based on your te
epoger 2014/04/14 18:10:51 In my tests, I didn't see any global_file_collisio
351 self._diff_dict[key] = DIFFRECORD_PENDING
352 self._tasks_queue.put((key, expected_image_url, actual_image_url))
353
281 def get_diff_record(self, expected_image_locator, actual_image_locator): 354 def get_diff_record(self, expected_image_locator, actual_image_locator):
282 """Returns the DiffRecord for this image pair. 355 """Returns the DiffRecord for this image pair.
283 356
284 Raises a KeyError if we don't have a DiffRecord for this image pair. 357 Args:
358 expected_image_locator: a unique ID string under which we will store the
359 expected image within storage_root (probably including a checksum to
360 guarantee uniqueness)
361 actual_image_locator: a unique ID string under which we will store the
362 actual image within storage_root (probably including a checksum to
363 guarantee uniqueness)
364
365 Returns the DiffRecord for this image pair, or None if we were unable to
366 generate one.
285 """ 367 """
286 key = (_sanitize_locator(expected_image_locator), 368 key = _generate_key(expected_image_locator, actual_image_locator)
287 _sanitize_locator(actual_image_locator)) 369 diff_record = self._diff_dict[key]
288 return self._diff_dict[key] 370
371 # If we have no results yet, block until we do.
372 while diff_record == DIFFRECORD_PENDING:
373 time.sleep(1)
374 diff_record = self._diff_dict[key]
375
376 # Once we have the result...
377 if diff_record == DIFFRECORD_FAILED:
378 logging.error(
379 'failed to create a DiffRecord for expected_image_locator=%s , '
380 'actual_image_locator=%s' % (
381 expected_image_locator, actual_image_locator))
382 return None
383 else:
384 return diff_record
289 385
290 386
291 # Utility functions 387 # Utility functions
292 388
293 def _calculate_weighted_diff_metric(histogram, num_pixels): 389 def _calculate_weighted_diff_metric(histogram, num_pixels):
294 """Given the histogram of a diff image (per-channel diff at each 390 """Given the histogram of a diff image (per-channel diff at each
295 pixel between two images), calculate the weighted diff metric (a 391 pixel between two images), calculate the weighted diff metric (a
296 stab at how different the two images really are). 392 stab at how different the two images really are).
297 393
298 TODO(epoger): Delete this function, now that we have perceptual diff? 394 TODO(epoger): Delete this function, now that we have perceptual diff?
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
367 def _download_and_open_image(local_filepath, url): 463 def _download_and_open_image(local_filepath, url):
368 """Open the image at local_filepath; if there is no file at that path, 464 """Open the image at local_filepath; if there is no file at that path,
369 download it from url to that path and then open it. 465 download it from url to that path and then open it.
370 466
371 Args: 467 Args:
372 local_filepath: path on local disk where the image should be stored 468 local_filepath: path on local disk where the image should be stored
373 url: URL from which we can download the image if we don't have it yet 469 url: URL from which we can download the image if we don't have it yet
374 470
375 Returns: a PIL image object 471 Returns: a PIL image object
376 """ 472 """
473 global global_file_collisions
377 if not os.path.exists(local_filepath): 474 if not os.path.exists(local_filepath):
378 _mkdir_unless_exists(os.path.dirname(local_filepath)) 475 _mkdir_unless_exists(os.path.dirname(local_filepath))
379 with contextlib.closing(urllib.urlopen(url)) as url_handle: 476 with contextlib.closing(urllib.urlopen(url)) as url_handle:
380 with open(local_filepath, 'wb') as file_handle: 477
478 # First download the file contents into a unique filename, and
479 # then rename that file. That way, if multiple threads are downloading
480 # the same filename at the same time, they won't interfere with each
481 # other (they will both download the file, and one will "win" in the end)
482 temp_filename = '%s-%d' % (local_filepath,
483 threading.current_thread().ident)
484 with open(temp_filename, 'wb') as file_handle:
381 shutil.copyfileobj(fsrc=url_handle, fdst=file_handle) 485 shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
486
487 # Keep count of how many colliding downloads we encounter;
488 # if it's a large number, we may want to change our download strategy
489 # to minimize repeated downloads.
490 if os.path.exists(local_filepath):
491 global_file_collisions += 1
492 else:
493 os.rename(temp_filename, local_filepath)
494
382 return _open_image(local_filepath) 495 return _open_image(local_filepath)
383 496
384 497
385 def _open_image(filepath): 498 def _open_image(filepath):
386 """Wrapper for Image.open(filepath) that yields more useful error messages. 499 """Wrapper for Image.open(filepath) that yields more useful error messages.
387 500
388 Args: 501 Args:
389 filepath: path on local disk to load image from 502 filepath: path on local disk to load image from
390 503
391 Returns: a PIL image object 504 Returns: a PIL image object
(...skipping 20 matching lines...) Expand all
412 _mkdir_unless_exists(os.path.dirname(filepath)) 525 _mkdir_unless_exists(os.path.dirname(filepath))
413 image.save(filepath, format) 526 image.save(filepath, format)
414 527
415 528
416 def _mkdir_unless_exists(path): 529 def _mkdir_unless_exists(path):
417 """Unless path refers to an already-existing directory, create it. 530 """Unless path refers to an already-existing directory, create it.
418 531
419 Args: 532 Args:
420 path: path on local disk 533 path: path on local disk
421 """ 534 """
422 if not os.path.isdir(path): 535 try:
423 os.makedirs(path) 536 os.makedirs(path)
537 except OSError as e:
538 if e.errno == errno.EEXIST:
539 pass
424 540
425 541
426 def _sanitize_locator(locator): 542 def _sanitize_locator(locator):
427 """Returns a sanitized version of a locator (one in which we know none of the 543 """Returns a sanitized version of a locator (one in which we know none of the
428 characters will have special meaning in filenames). 544 characters will have special meaning in filenames).
429 545
430 Args: 546 Args:
431 locator: string, or something that can be represented as a string 547 locator: string, or something that can be represented as a string
432 """ 548 """
433 return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator)) 549 return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
434 550
435 551
552 def _generate_key(expected_image_locator, actual_image_locator):
553 """Returns a key suitable for looking up this image pair.
554
555 Args:
556 expected_image_locator: a unique ID string under which we will store the
557 expected image within storage_root (probably including a checksum to
558 guarantee uniqueness)
559 actual_image_locator: a unique ID string under which we will store the
560 actual image within storage_root (probably including a checksum to
561 guarantee uniqueness)
562 """
563 return (_sanitize_locator(expected_image_locator),
564 _sanitize_locator(actual_image_locator))
565
566
436 def _get_difference_locator(expected_image_locator, actual_image_locator): 567 def _get_difference_locator(expected_image_locator, actual_image_locator):
437 """Returns the locator string used to look up the diffs between expected_image 568 """Returns the locator string used to look up the diffs between expected_image
438 and actual_image. 569 and actual_image.
439 570
440 We must keep this function in sync with getImageDiffRelativeUrl() in 571 We must keep this function in sync with getImageDiffRelativeUrl() in
441 static/loader.js 572 static/loader.js
442 573
443 Args: 574 Args:
444 expected_image_locator: locator string pointing at expected image 575 expected_image_locator: locator string pointing at expected image
445 actual_image_locator: locator string pointing at actual image 576 actual_image_locator: locator string pointing at actual image
446 577
447 Returns: already-sanitized locator where the diffs between expected and 578 Returns: already-sanitized locator where the diffs between expected and
448 actual images can be found 579 actual images can be found
449 """ 580 """
450 return "%s-vs-%s" % (_sanitize_locator(expected_image_locator), 581 return "%s-vs-%s" % (_sanitize_locator(expected_image_locator),
451 _sanitize_locator(actual_image_locator)) 582 _sanitize_locator(actual_image_locator))
OLDNEW
« no previous file with comments | « gm/rebaseline_server/compare_to_expectations.py ('k') | gm/rebaseline_server/imagepair.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698