OLD | NEW |
| (Empty) |
1 #!/usr/bin/python | |
2 | |
3 """ | |
4 Copyright 2013 Google Inc. | |
5 | |
6 Use of this source code is governed by a BSD-style license that can be | |
7 found in the LICENSE file. | |
8 | |
Calculate differences between image pairs, and store them in a database.
10 """ | |
11 | |
12 # System-level imports | |
13 import contextlib | |
14 import errno | |
15 import json | |
16 import logging | |
17 import os | |
18 import Queue | |
19 import re | |
20 import shutil | |
21 import tempfile | |
22 import threading | |
23 import time | |
24 import urllib | |
25 | |
26 # Must fix up PYTHONPATH before importing from within Skia | |
27 import rs_fixpypath # pylint: disable=W0611 | |
28 | |
29 # Imports from within Skia | |
30 import find_run_binary | |
31 from py.utils import gs_utils | |
32 | |
33 | |
34 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff') | |
35 | |
36 DEFAULT_IMAGE_SUFFIX = '.png' | |
37 DEFAULT_IMAGES_SUBDIR = 'images' | |
38 # TODO(epoger): Figure out a better default number of threads; for now, | |
39 # using a conservative default value. | |
40 DEFAULT_NUM_WORKER_THREADS = 1 | |
41 | |
42 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]') | |
43 | |
44 RGBDIFFS_SUBDIR = 'diffs' | |
45 WHITEDIFFS_SUBDIR = 'whitediffs' | |
46 | |
47 # Keys used within DiffRecord dictionary representations. | |
48 # NOTE: Keep these in sync with static/constants.js | |
49 KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel' | |
50 KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels' | |
51 KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels' | |
52 KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference' | |
53 KEY__DIFFERENCES__DIFF_URL = 'diffUrl' | |
54 KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl' | |
55 | |
56 # Special values within ImageDiffDB._diff_dict | |
57 _DIFFRECORD_FAILED = 'failed' | |
58 _DIFFRECORD_PENDING = 'pending' | |
59 | |
60 # How often to report tasks_queue size | |
61 QUEUE_LOGGING_GRANULARITY = 1000 | |
62 | |
63 # Temporary variable to keep track of how many times we download | |
64 # the same file in multiple threads. | |
65 # TODO(epoger): Delete this, once we see that the number stays close to 0. | |
66 global_file_collisions = 0 | |
67 | |
68 | |
class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local disk),
    and prepare a DiffRecord for them.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          # Log which URL failed before propagating, since the raised
          # exception alone may not identify the image pair.
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs.
    # NOTE(review): in this case none of the result fields (_width,
    # _num_pixels_differing, etc.) are set, so the getters below would raise
    # AttributeError -- presumably callers never read diffs for such pairs;
    # confirm against callers.
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      # BUG FIX: this second call previously re-created skpdiff_rgbdiff_dir
      # (copy-paste error), so skpdiff_whitediff_dir was never created here.
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      with contextlib.closing(open(skpdiff_summary_file)) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      # Store only the basenames; the serving layer resolves them relative to
      # the rgbdiff/whitediff subdirectories.
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']

          # skpdiff returns the perceptual similarity; convert it to get the
          # perceptual difference percentage.
          # skpdiff outputs -1 if the images are different sizes. Treat any
          # output that does not lie in [0, 1] as having 0% perceptual
          # similarity.
          if not 0 <= perceptual_similarity <= 1:
            perceptual_similarity = 0
          self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      # Always clean up the temporary skpdiff output directory, even if the
      # skpdiff run or JSON parsing failed.
      shutil.rmtree(skpdiff_output_dir)

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }
223 | |
224 | |
225 | |
class ImageDiffDB(object):
  """ Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within the DB will store all of its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads that download images and
          generate diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # Mechanism for reporting queue size periodically.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Dictionary of DiffRecords, keyed by (expected_image_locator,
    # actual_image_locator) tuples.
    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time.  We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Set up the queue for asynchronously loading DiffRecords, and start the
    # worker threads reading from it.
    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
    # calls can return as soon as possible.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for i in range(num_worker_threads):
      worker = threading.Thread(target=self.worker, args=(i,))
      # Daemon threads: the endless worker loops must not keep the process
      # alive at shutdown.
      worker.daemon = True
      worker.start()
      self._workers.append(worker)

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last call.

    Reports the current queue size, using log.info(), unless the queue is the
    same size as the last time we reported it.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # Acquire the lock, to synchronize access to self._last_queue_size_reported
    self._queue_size_report_lock.acquire()
    try:
      # NOTE: qsize() is approximate while other threads are active; good
      # enough for progress reporting.
      size = self._tasks_queue.qsize()
      if size == self._last_queue_size_reported:
        return
      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
        return
      logging.info('tasks_queue size is %d' % size)
      self._last_queue_size_reported = size
    finally:
      self._queue_size_report_lock.release()

  def worker(self, worker_num):
    """Launch a worker thread that pulls tasks off self._tasks_queue.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      # Blocks until a task is available.
      params = self._tasks_queue.get()
      key, expected_image_url, actual_image_url = params
      try:
        diff_record = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        # Record the failure instead of letting the worker thread die, so
        # that get_diff_record() callers blocked on this key can proceed.
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        diff_record = _DIFFRECORD_FAILED
      self._diff_dict_writelock.acquire()
      try:
        # Overwrites the _DIFFRECORD_PENDING placeholder set by
        # add_image_pair(), which unblocks any waiting get_diff_record() call.
        self._diff_dict[key] = diff_record
      finally:
        self._diff_dict_writelock.release()

  @property
  def storage_root(self):
    # Root directory on local disk under which all images/diffs are stored.
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If we already have a DiffRecord for this particular image pair, no work
    will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)
    key = (expected_image_locator, actual_image_locator)
    must_add_to_queue = False

    # Insert the PENDING placeholder under the lock, so that two threads
    # adding the same pair concurrently cannot both enqueue it.
    self._diff_dict_writelock.acquire()
    try:
      if not key in self._diff_dict:
        # If we have already requested a diff between these two images,
        # we don't need to request it again.
        must_add_to_queue = True
        self._diff_dict[key] = _DIFFRECORD_PENDING
    finally:
      self._diff_dict_writelock.release()

    # Enqueue outside the lock; the infinite-size queue makes this
    # non-blocking.
    if must_add_to_queue:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    # NOTE(review): raises KeyError if add_image_pair() was never called for
    # this pair -- presumably callers always add before getting; confirm.
    diff_record = self._diff_dict[key]

    # If we have no results yet, block until we do.
    # Simple 1-second polling; a worker thread will replace the PENDING
    # placeholder when the diff completes (or fails).
    while diff_record == _DIFFRECORD_PENDING:
      time.sleep(1)
      diff_record = self._diff_dict[key]

    # Once we have the result...
    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    else:
      return diff_record
410 | |
411 | |
412 # Utility functions | |
413 | |
def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet
  """
  global global_file_collisions
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))

    # First download the file contents into a unique filename, and
    # then rename that file. That way, if multiple threads are downloading
    # the same filename at the same time, they won't interfere with each
    # other (they will both download the file, and one will "win" in the end)
    temp_filename = '%s-%d' % (local_filepath,
                               threading.current_thread().ident)
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
      # BUG FIX: another thread won the race, so discard our copy; previously
      # the orphaned '<filepath>-<thread-id>' temp file was left on disk.
      os.remove(temp_filename)
    else:
      os.rename(temp_filename, local_filepath)
450 | |
451 | |
452 def _mkdir_unless_exists(path): | |
453 """Unless path refers to an already-existing directory, create it. | |
454 | |
455 Args: | |
456 path: path on local disk | |
457 """ | |
458 try: | |
459 os.makedirs(path) | |
460 except OSError as e: | |
461 if e.errno == errno.EEXIST: | |
462 pass | |
463 | |
464 | |
def _sanitize_locator(locator):
  """Return a filename-safe version of the given locator.

  Any character that could carry special meaning in a filepath (anything
  other than word characters and '-') is replaced with an underscore.

  Args:
    locator: string, or something that can be represented as a string.
        If None or '', it is returned without modification, because empty
        locators have a particular meaning ("there is no image for this")
  """
  if not locator:
    return locator
  return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
OLD | NEW |