build/android/pylib/base_test_sharder.py - Issue 11275078: Android: improves test sharding reliability.

Side by Side Diff: build/android/pylib/base_test_sharder.py

Issue 11275078: Android: improves test sharding reliability. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5	5

6 import android_commands	6 import android_commands

7 import logging	7 import logging

8 import multiprocessing	8 import multiprocessing

9	9

	10 from android_commands import errors

10 from test_result import TestResults	11 from test_result import TestResults

11	12

12	13

13 def _ShardedTestRunnable(test):	14 def _ShardedTestRunnable(test):

14 """Standalone function needed by multiprocessing.Pool."""	15 """Standalone function needed by multiprocessing.Pool."""

15 log_format = '[' + test.device + '] # %(asctime)-15s: %(message)s'	16 log_format = '[' + test.device + '] # %(asctime)-15s: %(message)s'

16 if logging.getLogger().handlers:	17 if logging.getLogger().handlers:

17 logging.getLogger().handlers[0].setFormatter(logging.Formatter(log_format))	18 logging.getLogger().handlers[0].setFormatter(logging.Formatter(log_format))

18 else:	19 else:

19 logging.basicConfig(format=log_format)	20 logging.basicConfig(format=log_format)

(...skipping 16 matching lines...) Expand all Loading...
36 """Base class for sharding tests across multiple devices.	37 """Base class for sharding tests across multiple devices.

37	38

38 Args:	39 Args:

39 attached_devices: A list of attached devices.	40 attached_devices: A list of attached devices.

40 """	41 """

41 # See more in SetTestsContainer.	42 # See more in SetTestsContainer.

42 tests_container = None	43 tests_container = None

43	44

44 def __init__(self, attached_devices):	45 def __init__(self, attached_devices):

45 self.attached_devices = attached_devices	46 self.attached_devices = attached_devices

46 self.retries = 1	47 # Worst case scenario: a device will drop offline per run, so we need

	48 # to retry until we're out of devices.

	49 self.retries = len(self.attached_devices)

47 self.tests = []	50 self.tests = []

48	51

49 def CreateShardedTestRunner(self, device, index):	52 def CreateShardedTestRunner(self, device, index):

50 """Factory function to create a suite-specific test runner.	53 """Factory function to create a suite-specific test runner.

51	54

52 Args:	55 Args:

53 device: Device serial where this shard will run	56 device: Device serial where this shard will run

54 index: Index of this device in the pool.	57 index: Index of this device in the pool.

55	58

56 Returns:	59 Returns:

(...skipping 19 matching lines...) Expand all Loading...
76 logging.warning('Sharding in ' + str(len(self.attached_devices)) +	79 logging.warning('Sharding in ' + str(len(self.attached_devices)) +

77 ' devices.')	80 ' devices.')

78 logging.warning('Note that the output is not synchronized.')	81 logging.warning('Note that the output is not synchronized.')

79 logging.warning('Look for the "Final result" banner in the end.')	82 logging.warning('Look for the "Final result" banner in the end.')

80 logging.warning('*' * 80)	83 logging.warning('*' * 80)

81 final_results = TestResults()	84 final_results = TestResults()

82 for retry in xrange(self.retries):	85 for retry in xrange(self.retries):

83 logging.warning('Try %d of %d', retry + 1, self.retries)	86 logging.warning('Try %d of %d', retry + 1, self.retries)

84 self.SetupSharding(self.tests)	87 self.SetupSharding(self.tests)

85 test_runners = []	88 test_runners = []

86 for index, device in enumerate(self.attached_devices):	89

87 logging.warning('*' * 80)	90 # Try to create N shards, and retrying on failure.

88 logging.warning('Creating shard %d for %s', index, device)	91 try:
	Yaron 2012/10/31 17:18:51: Shouldn't this try be in the for loop? |device| is only defined in there.
	bulach 2012/10/31 18:10:35: Not really. We need to create all shards at once, or fail. Python allows |device| to be available outside the for scope. If we moved the try inside the for loop, we'd need to somehow abort the current test_runners creation and retry all over again.
89 logging.warning('*' * 80)	92 for index, device in enumerate(self.attached_devices):

90 test_runner = self.CreateShardedTestRunner(device, index)	93 logging.warning('*' * 80)

91 test_runners += [test_runner]	94 logging.warning('Creating shard %d for %s', index, device)

	95 logging.warning('*' * 80)

	96 test_runner = self.CreateShardedTestRunner(device, index)

	97 test_runners += [test_runner]

	98 except errors.DeviceUnresponsiveError as e:

	99 logging.critical('****Failed to create a shard: [%s]', e)

	100 self.attached_devices.remove(device)

	101 continue
	Yaron 2012/10/31 17:18:51: By continuing, we're not updating |final_results|. It looks like it might only be tabulating successful tests, in which case it's ok to skip?
	bulach 2012/10/31 18:10:35: This continue here would be for the loop on 85, so there's nothing to update yet here. The flow is basically:
	  - try to create N shards.
	    |-- if it fails, retry.
	  - dispatch and collect results for the N.
	    |-- if it fails, retry.
	  At the end, since we got N results for N devices, try to optimize what should run next. Does it make sense?
	Yaron 2012/10/31 18:37:23: Yes. My point was, though, that it looks like it's doing special accounting on the last run of the loop. If you "continue" here, it never gets a chance to do that and we can't report that any tests successfully ran. I guess if we end up with 0 devices it'll be a pretty big failure anyway, so it's ok. However, looking at OnTestsCompleted in build/android/run_tests.py, it assumes that test_runners will be non-empty, and that's not true.
	bulach 2012/10/31 19:18:17: Ah, yes, good point! I added an else: clause to raise an exception if we run out of retries. At that point there's no possible recovery anyway, but this at least clarifies which condition we hit.
	102

92 logging.warning('Starting...')	103 logging.warning('Starting...')

93 pool = multiprocessing.Pool(len(self.attached_devices),	104 pool = multiprocessing.Pool(len(self.attached_devices),

94 SetTestsContainer,	105 SetTestsContainer,

95 [BaseTestSharder.tests_container])	106 [BaseTestSharder.tests_container])

96 # map can't handle KeyboardInterrupt exception. It's a python bug.	107 # map can't handle KeyboardInterrupt exception. It's a python bug.

97 # So use map_async instead.	108 # So use map_async instead.

98 async_results = pool.map_async(_ShardedTestRunnable, test_runners)	109 async_results = pool.map_async(_ShardedTestRunnable, test_runners)

99 results_lists = async_results.get(999999)	110 try:

100	111 results_lists = async_results.get(999999)
	Yaron 2012/10/31 17:18:51: Will the exception get thrown while the other shards are still running? If so, does this mean that we'll whip around and retry shards of tests on devices that are still running their old tests?
	bulach 2012/10/31 18:10:35: It's not clear from the Python documentation whether the exception is raised as soon as the shard throws, or later when all the results are collected. Thankfully, it doesn't matter :) We'll create the shards again, and that re-installs the APKs, which kills the process.
	Yaron 2012/10/31 18:37:23: Ah, right. Thanks.
	112 except errors.DeviceUnresponsiveError as e:

	113 logging.critical('****Failed to run test: [%s]', e)

	114 self.attached_devices = android_commands.GetAttachedDevices()

	115 continue

101 test_results = TestResults.FromTestResults(results_lists)	116 test_results = TestResults.FromTestResults(results_lists)

102 # Re-check the attached devices for some devices may	117 # Re-check the attached devices for some devices may

103 # become offline	118 # become offline

104 retry_devices = set(android_commands.GetAttachedDevices())	119 retry_devices = set(android_commands.GetAttachedDevices())

105 # Remove devices that had exceptions.	120 # Remove devices that had exceptions.

106 retry_devices -= TestResults.DeviceExceptions(results_lists)	121 retry_devices -= TestResults.DeviceExceptions(results_lists)

107 # Retry on devices that didn't have any exception.	122 # Retry on devices that didn't have any exception.

108 self.attached_devices = list(retry_devices)	123 self.attached_devices = list(retry_devices)

109 if (retry == self.retries - 1 or	124 if (retry == self.retries - 1 or

110 len(self.attached_devices) == 0):	125 len(self.attached_devices) == 0):

111 all_passed = final_results.ok + test_results.ok	126 all_passed = final_results.ok + test_results.ok

112 final_results = test_results	127 final_results = test_results

113 final_results.ok = all_passed	128 final_results.ok = all_passed

114 break	129 break

115 else:	130 else:

116 final_results.ok += test_results.ok	131 final_results.ok += test_results.ok

117 self.tests = []	132 self.tests = []

118 for t in test_results.GetAllBroken():	133 for t in test_results.GetAllBroken():

119 self.tests += [t.name]	134 self.tests += [t.name]

120 if not self.tests:	135 if not self.tests:

121 break	136 break

122 self.OnTestsCompleted(test_runners, final_results)	137 self.OnTestsCompleted(test_runners, final_results)

123 return final_results	138 return final_results

OLD	NEW

« no previous file with comments | « no previous file | build/android/run_tests.py » ('j') | build/android/run_tests.py » ('J')