gm/rebaseline_server/imagediffdb.py - Issue 235923002: rebaseline_server: multithreaded loading/diffing of images

Side by Side Diff: gm/rebaseline_server/imagediffdb.py

Issue 235923002: rebaseline_server: multithreaded loading/diffing of images (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/python	1 #!/usr/bin/python

2	2

3 """	3 """

4 Copyright 2013 Google Inc.	4 Copyright 2013 Google Inc.

5	5

6 Use of this source code is governed by a BSD-style license that can be	6 Use of this source code is governed by a BSD-style license that can be

7 found in the LICENSE file.	7 found in the LICENSE file.

8	8

9 Calulate differences between image pairs, and store them in a database.	9 Calulate differences between image pairs, and store them in a database.

10 """	10 """

11	11

12 import contextlib	12 import contextlib

13 import csv	13 import csv

	14 import errno

14 import logging	15 import logging

	16 import Queue

15 import os	17 import os

16 import re	18 import re

17 import shutil	19 import shutil

18 import sys	20 import sys

19 import tempfile	21 import tempfile

	22 import time

	23 import threading

20 import urllib	24 import urllib

21 try:	25 try:

22 from PIL import Image, ImageChops	26 from PIL import Image, ImageChops

23 except ImportError:	27 except ImportError:

24 raise ImportError('Requires PIL to be installed; see '	28 raise ImportError('Requires PIL to be installed; see '

25 + 'http://www.pythonware.com/products/pil/')	29 + 'http://www.pythonware.com/products/pil/')

26	30

27 # Set the PYTHONPATH to include the tools directory.	31 # Set the PYTHONPATH to include the tools directory.

28 sys.path.append(	32 sys.path.append(

29 os.path.join(	33 os.path.join(

30 os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir,	34 os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir,

31 'tools'))	35 'tools'))

32 import find_run_binary	36 import find_run_binary

33	37

34 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')	38 SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

35	39

36 DEFAULT_IMAGE_SUFFIX = '.png'	40 DEFAULT_IMAGE_SUFFIX = '.png'

37 DEFAULT_IMAGES_SUBDIR = 'images'	41 DEFAULT_IMAGES_SUBDIR = 'images'

	42 DEFAULT_NUM_WORKERS = 8

38	43

39 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')	44 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')

40	45

41 DIFFS_SUBDIR = 'diffs'	46 DIFFS_SUBDIR = 'diffs'

42 WHITEDIFFS_SUBDIR = 'whitediffs'	47 WHITEDIFFS_SUBDIR = 'whitediffs'

43	48

44 VALUES_PER_BAND = 256	49 VALUES_PER_BAND = 256

45	50

46 # Keys used within DiffRecord dictionary representations.	51 # Keys used within DiffRecord dictionary representations.

47 # NOTE: Keep these in sync with static/constants.js	52 # NOTE: Keep these in sync with static/constants.js

48 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'	53 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'

49 KEY__DIFFERENCE_DATA__NUM_DIFF_PIXELS = 'numDifferingPixels'	54 KEY__DIFFERENCE_DATA__NUM_DIFF_PIXELS = 'numDifferingPixels'

50 KEY__DIFFERENCE_DATA__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'	55 KEY__DIFFERENCE_DATA__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'

51 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF = 'perceptualDifference'	56 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF = 'perceptualDifference'

52 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF = 'weightedDiffMeasure'	57 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF = 'weightedDiffMeasure'

53	58

	59 # Special values within ImageDiffDB._diff_dict

	60 DIFFRECORD_FAILED = 'failed'

	61 DIFFRECORD_PENDING = 'pending'

	62

	63 # TODO(epoger): Temporary(?) list to keep track of how many times we download

	64 # the same file in multiple threads.

	65 global_file_collisions = 0

	66

54	67

55 class DiffRecord(object):	68 class DiffRecord(object):

56 """ Record of differences between two images. """	69 """ Record of differences between two images. """

57	70

58 def __init__(self, storage_root,	71 def __init__(self, storage_root,

59 expected_image_url, expected_image_locator,	72 expected_image_url, expected_image_locator,

60 actual_image_url, actual_image_locator,	73 actual_image_url, actual_image_locator,

61 expected_images_subdir=DEFAULT_IMAGES_SUBDIR,	74 expected_images_subdir=DEFAULT_IMAGES_SUBDIR,

62 actual_images_subdir=DEFAULT_IMAGES_SUBDIR,	75 actual_images_subdir=DEFAULT_IMAGES_SUBDIR,

63 image_suffix=DEFAULT_IMAGE_SUFFIX):	76 image_suffix=DEFAULT_IMAGE_SUFFIX):

64 """Download this pair of images (unless we already have them on local disk),	77 """Download this pair of images (unless we already have them on local disk),

65 and prepare a DiffRecord for them.	78 and prepare a DiffRecord for them.

66	79

67 TODO(epoger): Make this asynchronously download images, rather than blocking

68 until the images have been downloaded and processed.

69

70 Args:	80 Args:

71 storage_root: root directory on local disk within which we store all	81 storage_root: root directory on local disk within which we store all

72 images	82 images

73 expected_image_url: file or HTTP url from which we will download the	83 expected_image_url: file or HTTP url from which we will download the

74 expected image	84 expected image

75 expected_image_locator: a unique ID string under which we will store the	85 expected_image_locator: a unique ID string under which we will store the

76 expected image within storage_root (probably including a checksum to	86 expected image within storage_root (probably including a checksum to

77 guarantee uniqueness)	87 guarantee uniqueness)

78 actual_image_url: file or HTTP url from which we will download the	88 actual_image_url: file or HTTP url from which we will download the

79 actual image	89 actual image

(...skipping 132 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
212 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF: self.get_weighted_diff_measure(),	222 KEY__DIFFERENCE_DATA__WEIGHTED_DIFF: self.get_weighted_diff_measure(),

213 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,	223 KEY__DIFFERENCE_DATA__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,

214 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF: self._perceptual_difference,	224 KEY__DIFFERENCE_DATA__PERCEPTUAL_DIFF: self._perceptual_difference,

215 }	225 }

216	226

217	227

218 class ImageDiffDB(object):	228 class ImageDiffDB(object):

219 """ Calculates differences between image pairs, maintaining a database of	229 """ Calculates differences between image pairs, maintaining a database of

220 them for download."""	230 them for download."""

221	231

222 def __init__(self, storage_root):	232 def __init__(self, storage_root, num_workers=DEFAULT_NUM_WORKERS):

223 """	233 """

224 Args:	234 Args:

225 storage_root: string; root path within the DB will store all of its stuff	235 storage_root: string; root path within the DB will store all of its stuff

	236 num_workers: integer; number of worker threads to spawn

226 """	237 """

227 self._storage_root = storage_root	238 self._storage_root = storage_root

228	239

229 # Dictionary of DiffRecords, keyed by (expected_image_locator,	240 # Dictionary of DiffRecords, keyed by (expected_image_locator,

230 # actual_image_locator) tuples.	241 # actual_image_locator) tuples.

	242 # Values can also be DIFFRECORD_PENDING, DIFFRECORD_FAILED.

231 self._diff_dict = {}	243 self._diff_dict = {}

232	244

	245 # Set up the queue for asynchronously loading DiffRecords, and start the

	246 # worker threads reading from it.

	247 self._tasks_queue = Queue.Queue(maxsize=2*num_workers)

	248 self._workers = []

	249 for i in range(num_workers):

	250 worker = threading.Thread(target=self.worker, args=(i,))

	251 worker.daemon = True

	252 worker.start()

	253 self._workers.append(worker)

	254

	255 def worker(self, worker_num):

	256 """Launch a worker thread that pulls tasks off self._tasks_queue.

	257

	258 Args:

	259 worker_num: (integer) which worker this is

	260 """

	261 while True:

	262 params = self._tasks_queue.get()

	263 key, expected_image_url, actual_image_url = params

	264 try:

	265 diff_record = DiffRecord(

	266 self._storage_root,

	267 expected_image_url=expected_image_url,

	268 expected_image_locator=key[0],

	269 actual_image_url=actual_image_url,

	270 actual_image_locator=key[1])

	271 except Exception:

	272 logging.exception(

	273 'exception while creating DiffRecord for key %s' % str(key))

	274 diff_record = DIFFRECORD_FAILED

	275 self._diff_dict[key] = diff_record

	276

233 def add_image_pair(self,	277 def add_image_pair(self,

234 expected_image_url, expected_image_locator,	278 expected_image_url, expected_image_locator,

235 actual_image_url, actual_image_locator):	279 actual_image_url, actual_image_locator):

236 """Download this pair of images (unless we already have them on local disk),	280 """Download this pair of images (unless we already have them on local disk),

237 and prepare a DiffRecord for them.	281 and prepare a DiffRecord for them.

238	282

239 TODO(epoger): Make this asynchronously download images, rather than blocking	283 This method will block until the images are downloaded and DiffRecord is

240 until the images have been downloaded and processed.	284 available by calling get_diff_record().

241 When we do that, we should probably add a new method that will block

242 until all of the images have been downloaded and processed. Otherwise,

243 we won't know when it's safe to start calling get_diff_record().

244 jcgregorio notes: maybe just make ImageDiffDB thread-safe and create a

245 thread-pool/worker queue at a higher level that just uses ImageDiffDB?

246	285

247 Args:	286 Args:

248 expected_image_url: file or HTTP url from which we will download the	287 expected_image_url: file or HTTP url from which we will download the

249 expected image	288 expected image

250 expected_image_locator: a unique ID string under which we will store the	289 expected_image_locator: a unique ID string under which we will store the

251 expected image within storage_root (probably including a checksum to	290 expected image within storage_root (probably including a checksum to

252 guarantee uniqueness)	291 guarantee uniqueness)

253 actual_image_url: file or HTTP url from which we will download the	292 actual_image_url: file or HTTP url from which we will download the

254 actual image	293 actual image

255 actual_image_locator: a unique ID string under which we will store the	294 actual_image_locator: a unique ID string under which we will store the

256 actual image within storage_root (probably including a checksum to	295 actual image within storage_root (probably including a checksum to

257 guarantee uniqueness)	296 guarantee uniqueness)

	297

	298 Raises:

	299 Exception if we are unable to create a DiffRecord for this image pair.

258 """	300 """

259 expected_image_locator = _sanitize_locator(expected_image_locator)	301 key = _generate_key(expected_image_locator, actual_image_locator)

260 actual_image_locator = _sanitize_locator(actual_image_locator)

261 key = (expected_image_locator, actual_image_locator)

262 if not key in self._diff_dict:	302 if not key in self._diff_dict:

263 try:	303 try:

264 new_diff_record = DiffRecord(	304 new_diff_record = DiffRecord(

265 self._storage_root,	305 self._storage_root,

266 expected_image_url=expected_image_url,	306 expected_image_url=expected_image_url,

267 expected_image_locator=expected_image_locator,	307 expected_image_locator=expected_image_locator,

268 actual_image_url=actual_image_url,	308 actual_image_url=actual_image_url,

269 actual_image_locator=actual_image_locator)	309 actual_image_locator=actual_image_locator)

270 except Exception:	310 except Exception:

271 # If we can't create a real DiffRecord for this (expected, actual) pair,	311 # If we can't create a real DiffRecord for this (expected, actual) pair,

272 # store None and the UI will show whatever information we DO have.	312 # store None and the UI will show whatever information we DO have.

273 # Fixes http://skbug.com/2368 .	313 # Fixes http://skbug.com/2368 .

274 logging.exception(	314 logging.exception(

275 'got exception while creating a DiffRecord for '	315 'got exception while creating a DiffRecord for '

276 'expected_image_url=%s , actual_image_url=%s; returning None' % (	316 'expected_image_url=%s , actual_image_url=%s; returning None' % (

277 expected_image_url, actual_image_url))	317 expected_image_url, actual_image_url))

278 new_diff_record = None	318 new_diff_record = None

279 self._diff_dict[key] = new_diff_record	319 self._diff_dict[key] = new_diff_record

280	320

	321 def add_image_pair_async(self,

	322 expected_image_url, expected_image_locator,

	323 actual_image_url, actual_image_locator):

	324 """Download this pair of images (unless we already have them on local disk),

	325 and prepare a DiffRecord for them.

	326

	327 This method will return quickly; calls to get_diff_record() will block

	328 until the DiffRecord is available (or we have given up on creating it).

	329

	330 Args:

	331 expected_image_url: file or HTTP url from which we will download the

	332 expected image

	333 expected_image_locator: a unique ID string under which we will store the

	334 expected image within storage_root (probably including a checksum to

	335 guarantee uniqueness)

	336 actual_image_url: file or HTTP url from which we will download the

	337 actual image

	338 actual_image_locator: a unique ID string under which we will store the

	339 actual image within storage_root (probably including a checksum to

	340 guarantee uniqueness)

	341 """

	342 key = _generate_key(expected_image_locator, actual_image_locator)

	343 if not key in self._diff_dict:

	344 # If we have already requested a diff between these two images,

	345 # we don't need to request it again.

	346 #

	347 # Threading note: If multiple threads called into this method with the

	348 # same key at the same time, there will be multiple tasks on the queue

	349 # with the same key. But that's OK; they will both complete successfully,

	350 # and just waste a little time in the process. Nothing will break.
	rmistry 2014/04/14 17:59:06 Can we use semaphores/locks here? Based on your te Can we use semaphores/locks here? Based on your tests how many global_file_collisions show up? Also, you said it runs about 60% faster, does that mean it now takes about 10-15 mins during startup (with gpu cpu comparison)? epoger 2014/04/14 18:10:51 In my tests, I didn't see any global_file_collisio Show quoted text On 2014/04/14 17:59:06, rmistry wrote: > Can we use semaphores/locks here? Based on your tests how many > global_file_collisions show up? In my tests, I didn't see any global_file_collisions yet. Thus, I concluded we were better off not incurring the cost of an additional lock. (Given that the downside is not disastrous, just takes some extra time.) Show quoted text > Also, you said it runs about 60% faster, does that mean it now takes about 10-15 > mins during startup (with gpu cpu comparison)? Well..... no. I haven't tried with the gpu/cpu comparison yet. This is the command I was running, on my Ubiquity instance: $ rm -rf gm/rebaseline_server/static/generated-images/ && gm/rebaseline_server/server.py --export --actuals-repo "" (Clear out disk cache to force download of actual/expected images for all with differing checksums, but don't update the actuals checksums from skia-autogen so we have consistent results.) It consistenly took ~139 seconds before this change, and ~58 seconds after. I'm sure there is plenty more speedup to be found. I will keep looking...
	351 self._diff_dict[key] = DIFFRECORD_PENDING

	352 self._tasks_queue.put((key, expected_image_url, actual_image_url))

	353

281 def get_diff_record(self, expected_image_locator, actual_image_locator):	354 def get_diff_record(self, expected_image_locator, actual_image_locator):

282 """Returns the DiffRecord for this image pair.	355 """Returns the DiffRecord for this image pair.

283	356

284 Raises a KeyError if we don't have a DiffRecord for this image pair.	357 Args:

	358 expected_image_locator: a unique ID string under which we will store the

	359 expected image within storage_root (probably including a checksum to

	360 guarantee uniqueness)

	361 actual_image_locator: a unique ID string under which we will store the

	362 actual image within storage_root (probably including a checksum to

	363 guarantee uniqueness)

	364

	365 Returns the DiffRecord for this image pair, or None if we were unable to

	366 generate one.

285 """	367 """

286 key = (_sanitize_locator(expected_image_locator),	368 key = _generate_key(expected_image_locator, actual_image_locator)

287 _sanitize_locator(actual_image_locator))	369 diff_record = self._diff_dict[key]

288 return self._diff_dict[key]	370

	371 # If we have no results yet, block until we do.

	372 while diff_record == DIFFRECORD_PENDING:

	373 time.sleep(1)

	374 diff_record = self._diff_dict[key]

	375

	376 # Once we have the result...

	377 if diff_record == DIFFRECORD_FAILED:

	378 logging.error(

	379 'failed to create a DiffRecord for expected_image_locator=%s , '

	380 'actual_image_locator=%s' % (

	381 expected_image_locator, actual_image_locator))

	382 return None

	383 else:

	384 return diff_record

289	385

290	386

291 # Utility functions	387 # Utility functions

292	388

293 def _calculate_weighted_diff_metric(histogram, num_pixels):	389 def _calculate_weighted_diff_metric(histogram, num_pixels):

294 """Given the histogram of a diff image (per-channel diff at each	390 """Given the histogram of a diff image (per-channel diff at each

295 pixel between two images), calculate the weighted diff metric (a	391 pixel between two images), calculate the weighted diff metric (a

296 stab at how different the two images really are).	392 stab at how different the two images really are).

297	393

298 TODO(epoger): Delete this function, now that we have perceptual diff?	394 TODO(epoger): Delete this function, now that we have perceptual diff?

(...skipping 68 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
367 def _download_and_open_image(local_filepath, url):	463 def _download_and_open_image(local_filepath, url):

368 """Open the image at local_filepath; if there is no file at that path,	464 """Open the image at local_filepath; if there is no file at that path,

369 download it from url to that path and then open it.	465 download it from url to that path and then open it.

370	466

371 Args:	467 Args:

372 local_filepath: path on local disk where the image should be stored	468 local_filepath: path on local disk where the image should be stored

373 url: URL from which we can download the image if we don't have it yet	469 url: URL from which we can download the image if we don't have it yet

374	470

375 Returns: a PIL image object	471 Returns: a PIL image object

376 """	472 """

	473 global global_file_collisions

377 if not os.path.exists(local_filepath):	474 if not os.path.exists(local_filepath):

378 _mkdir_unless_exists(os.path.dirname(local_filepath))	475 _mkdir_unless_exists(os.path.dirname(local_filepath))

379 with contextlib.closing(urllib.urlopen(url)) as url_handle:	476 with contextlib.closing(urllib.urlopen(url)) as url_handle:

380 with open(local_filepath, 'wb') as file_handle:	477

	478 # First download the file contents into a unique filename, and

	479 # then rename that file. That way, if multiple threads are downloading

	480 # the same filename at the same time, they won't interfere with each

	481 # other (they will both download the file, and one will "win" in the end)

	482 temp_filename = '%s-%d' % (local_filepath,

	483 threading.current_thread().ident)

	484 with open(temp_filename, 'wb') as file_handle:

381 shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)	485 shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

	486

	487 # Keep count of how many colliding downloads we encounter;

	488 # if it's a large number, we may want to change our download strategy

	489 # to minimize repeated downloads.

	490 if os.path.exists(local_filepath):

	491 global_file_collisions += 1

	492 else:

	493 os.rename(temp_filename, local_filepath)

	494

382 return _open_image(local_filepath)	495 return _open_image(local_filepath)

383	496

384	497

385 def _open_image(filepath):	498 def _open_image(filepath):

386 """Wrapper for Image.open(filepath) that yields more useful error messages.	499 """Wrapper for Image.open(filepath) that yields more useful error messages.

387	500

388 Args:	501 Args:

389 filepath: path on local disk to load image from	502 filepath: path on local disk to load image from

390	503

391 Returns: a PIL image object	504 Returns: a PIL image object

(...skipping 20 matching lines...) Expand all Loading...
412 _mkdir_unless_exists(os.path.dirname(filepath))	525 _mkdir_unless_exists(os.path.dirname(filepath))

413 image.save(filepath, format)	526 image.save(filepath, format)

414	527

415	528

416 def _mkdir_unless_exists(path):	529 def _mkdir_unless_exists(path):

417 """Unless path refers to an already-existing directory, create it.	530 """Unless path refers to an already-existing directory, create it.

418	531

419 Args:	532 Args:

420 path: path on local disk	533 path: path on local disk

421 """	534 """

422 if not os.path.isdir(path):	535 try:

423 os.makedirs(path)	536 os.makedirs(path)

	537 except OSError as e:

	538 if e.errno == errno.EEXIST:

	539 pass

424	540

425	541

426 def _sanitize_locator(locator):	542 def _sanitize_locator(locator):

427 """Returns a sanitized version of a locator (one in which we know none of the	543 """Returns a sanitized version of a locator (one in which we know none of the

428 characters will have special meaning in filenames).	544 characters will have special meaning in filenames).

429	545

430 Args:	546 Args:

431 locator: string, or something that can be represented as a string	547 locator: string, or something that can be represented as a string

432 """	548 """

433 return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))	549 return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))

434	550

435	551

	552 def _generate_key(expected_image_locator, actual_image_locator):

	553 """Returns a key suitable for looking up this image pair.

	554

	555 Args:

	556 expected_image_locator: a unique ID string under which we will store the

	557 expected image within storage_root (probably including a checksum to

	558 guarantee uniqueness)

	559 actual_image_locator: a unique ID string under which we will store the

	560 actual image within storage_root (probably including a checksum to

	561 guarantee uniqueness)

	562 """

	563 return (_sanitize_locator(expected_image_locator),

	564 _sanitize_locator(actual_image_locator))

	565

	566

436 def _get_difference_locator(expected_image_locator, actual_image_locator):	567 def _get_difference_locator(expected_image_locator, actual_image_locator):

437 """Returns the locator string used to look up the diffs between expected_image	568 """Returns the locator string used to look up the diffs between expected_image

438 and actual_image.	569 and actual_image.

439	570

440 We must keep this function in sync with getImageDiffRelativeUrl() in	571 We must keep this function in sync with getImageDiffRelativeUrl() in

441 static/loader.js	572 static/loader.js

442	573

443 Args:	574 Args:

444 expected_image_locator: locator string pointing at expected image	575 expected_image_locator: locator string pointing at expected image

445 actual_image_locator: locator string pointing at actual image	576 actual_image_locator: locator string pointing at actual image

446	577

447 Returns: already-sanitized locator where the diffs between expected and	578 Returns: already-sanitized locator where the diffs between expected and

448 actual images can be found	579 actual images can be found

449 """	580 """

450 return "%s-vs-%s" % (_sanitize_locator(expected_image_locator),	581 return "%s-vs-%s" % (_sanitize_locator(expected_image_locator),

451 _sanitize_locator(actual_image_locator))	582 _sanitize_locator(actual_image_locator))

OLD	NEW

« no previous file with comments | « gm/rebaseline_server/compare_to_expectations.py ('k') | gm/rebaseline_server/imagepair.py » ('j') | no next file with comments »