build/android/pylib/base/test_dispatcher.py - Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests

Side by Side Diff: build/android/pylib/base/test_dispatcher.py

Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Small fixes to formatting Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« build/android/pylib/base/base_test_result.py ('K') | « build/android/pylib/base/shard_unittest.py ('k') | build/android/pylib/base/test_dispatcher_unittest.py » ('j') | build/android/pylib/base/test_dispatcher_unittest.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Implements test sharding logic."""	5 """Dispatches tests, either sharding or replicating them."""
	frankf 2013/07/17 04:07:20 Expand on this a little. Include the fact that this dispatches tests by assigning pools of tests which are run by test runners. gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > Expand on this a little. Include the fact that this dispatches tests by assigning > pools of tests which are run by test runners. Done.
6	6

7 import logging	7 import logging

8 import threading	8 import threading

9	9

10 from pylib import android_commands	10 from pylib import android_commands

11 from pylib import constants	11 from pylib import constants

12 from pylib import forwarder	12 from pylib import forwarder

13 from pylib.utils import reraiser_thread	13 from pylib.utils import reraiser_thread

14 from pylib.utils import watchdog_timer	14 from pylib.utils import watchdog_timer

15	15

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
111 def __iter__(self):	111 def __iter__(self):

112 """Iterate through tests in the collection until all have been handled."""	112 """Iterate through tests in the collection until all have been handled."""

113 while True:	113 while True:

114 r = self._pop()	114 r = self._pop()

115 if r is None:	115 if r is None:

116 break	116 break

117 yield r	117 yield r

118	118

119	119

120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,	120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,

121 num_retries):	121 num_retries, tag_results_with_device=False):

122 """Runs tests from the test_collection until empty using the given runner.	122 """Runs tests from the test_collection until empty using the given runner.

123	123

124 Adds TestRunResults objects to the out_results list and may add tests to the	124 Adds TestRunResults objects to the out_results list and may add tests to the

125 out_retry list.	125 out_retry list.

126	126

127 Args:	127 Args:

128 runner: A TestRunner object used to run the tests.	128 runner: A TestRunner object used to run the tests.

129 test_collection: A _TestCollection from which to get _Test objects to run.	129 test_collection: A _TestCollection from which to get _Test objects to run.

130 out_results: A list to add TestRunResults to.	130 out_results: A list to add TestRunResults to.

131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.	131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.

132 num_retries: Number of retries for a test.	132 num_retries: Number of retries for a test.

	133 tag_results_with_device: If True, appends the name of the device on which

	134 the test was run to the test name. Used by ReplicateAndRunTests to

	135 identify which device ran each copy of the test, and to ensure each copy

	136 of the test is recorded separately.

133 """	137 """

	138

	139 # Used to tag all results to identify which device caused failing tests
	frankf 2013/07/17 04:07:20 This is misleading. This also tags passing tests, although it's not displayed. The main reason for this is to have unique test results, as we use a set. gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > This is misleading. This also tags passing tests, although it's not displayed. > The main reason for this is to have unique test results, as we use a set. Done.
	140 def TagTestRunResults(test_run_results):
	frankf 2013/07/17 04:07:20 Move comment here. gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > Move comment here. Done.
	141 new_test_run_results = base_test_result.TestRunResults()

	142 for test_result in test_run_results.GetAll():

	143 test_result.SetTag(runner.device[-4:])

	144 new_test_run_results.AddResult(test_result)

	145 return new_test_run_results

	146

134 for test in test_collection:	147 for test in test_collection:

135 watcher.Reset()	148 watcher.Reset()

136 try:	149 try:

137 if not android_commands.IsDeviceAttached(runner.device):	150 if not android_commands.IsDeviceAttached(runner.device):

138 # Device is unresponsive, stop handling tests on this device.	151 # Device is unresponsive, stop handling tests on this device.

139 msg = 'Device %s is unresponsive.' % runner.device	152 msg = 'Device %s is unresponsive.' % runner.device

140 logging.warning(msg)	153 logging.warning(msg)

141 raise android_commands.errors.DeviceUnresponsiveError(msg)	154 raise android_commands.errors.DeviceUnresponsiveError(msg)

142 result, retry = runner.RunTest(test.test)	155 result, retry = runner.RunTest(test.test)

	156 if tag_results_with_device:

	157 result = TagTestRunResults(result)

143 test.tries += 1	158 test.tries += 1

144 if retry and test.tries <= num_retries:	159 if retry and test.tries <= num_retries:

145 # Retry non-passing results, only record passing results.	160 # Retry non-passing results, only record passing results.

146 pass_results = base_test_result.TestRunResults()	161 pass_results = base_test_result.TestRunResults()

147 pass_results.AddResults(result.GetPass())	162 pass_results.AddResults(result.GetPass())

148 out_results.append(pass_results)	163 out_results.append(pass_results)

149 logging.warning('Will retry test, try #%s.' % test.tries)	164 logging.warning('Will retry test, try #%s.' % test.tries)

150 test_collection.add(_Test(test=retry, tries=test.tries))	165 test_collection.add(_Test(test=retry, tries=test.tries))

151 else:	166 else:

152 # All tests passed or retry limit reached. Either way, record results.	167 # All tests passed or retry limit reached. Either way, record results.

(...skipping 24 matching lines...) Expand all Loading...
177 try:	192 try:

178 index = threadsafe_counter.GetAndIncrement()	193 index = threadsafe_counter.GetAndIncrement()

179 logging.warning('Creating shard %s for device %s.', index, device)	194 logging.warning('Creating shard %s for device %s.', index, device)

180 runner = runner_factory(device, index)	195 runner = runner_factory(device, index)

181 runner.SetUp()	196 runner.SetUp()

182 out_runners.append(runner)	197 out_runners.append(runner)

183 except android_commands.errors.DeviceUnresponsiveError as e:	198 except android_commands.errors.DeviceUnresponsiveError as e:

184 logging.warning('Failed to create shard for %s: [%s]', device, e)	199 logging.warning('Failed to create shard for %s: [%s]', device, e)

185	200

186	201

187 def _RunAllTests(runners, tests, num_retries, timeout=None):	202 def _RunAllTests(runners, test_collection_factory, num_retries, timeout=None,

	203 tag_results_with_device=False):

188 """Run all tests using the given TestRunners.	204 """Run all tests using the given TestRunners.

189	205

190 Args:	206 Args:

191 runners: a list of TestRunner objects.	207 runners: a list of TestRunner objects.

192 tests: a list of Tests to run using the given TestRunners.	208 test_collection_factory: a callable to generate a _TestCollection object for

	209 each test runner.

193 num_retries: number of retries for a test.	210 num_retries: number of retries for a test.

194 timeout: watchdog timeout in seconds, defaults to the default timeout.	211 timeout: watchdog timeout in seconds, defaults to the default timeout.

	212 tag_results_with_device: If True, appends the name of the device on which

	213 the test was run to the test name. Used by ReplicateAndRunTests to
	frankf 2013/07/17 04:07:20 Please address all comments. This needs to be updated since there's no ReplicateAndRunTests. gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > Please address all comments. This needs to be updated since there's no > ReplicateAndRunTests. Done.
	214 identify which device ran each copy of the test, and to ensure each copy

	215 of the test is recorded separately.

195	216

196 Returns:	217 Returns:

197 A tuple of (TestRunResults object, exit code)	218 A tuple of (TestRunResults object, exit code)

198 """	219 """

199 logging.warning('Running %s tests with %s test runners.' %	220 logging.warning('Running tests with %s test runners.' % (len(runners)))

200 (len(tests), len(runners)))

201 tests_collection = _TestCollection([_Test(t) for t in tests])

202 results = []	221 results = []

203 exit_code = 0	222 exit_code = 0

204 watcher = watchdog_timer.WatchdogTimer(timeout)	223 watcher = watchdog_timer.WatchdogTimer(timeout)

	224

205 workers = reraiser_thread.ReraiserThreadGroup(	225 workers = reraiser_thread.ReraiserThreadGroup(

206 [reraiser_thread.ReraiserThread(	226 [reraiser_thread.ReraiserThread(

207 _RunTestsFromQueue,	227 _RunTestsFromQueue,

208 [r, tests_collection, results, watcher, num_retries],	228 [r, test_collection_factory(), results, watcher, num_retries,

	229 tag_results_with_device],

209 name=r.device[-4:])	230 name=r.device[-4:])

210 for r in runners])	231 for r in runners])

211 run_results = base_test_result.TestRunResults()	232 run_results = base_test_result.TestRunResults()

212 workers.StartAll()	233 workers.StartAll()

213	234

214 # Catch DeviceUnresponsiveErrors and set a warning exit code	235 # Catch DeviceUnresponsiveErrors and set a warning exit code

215 try:	236 try:

216 workers.JoinAll(watcher)	237 workers.JoinAll(watcher)

217 except android_commands.errors.DeviceUnresponsiveError as e:	238 except android_commands.errors.DeviceUnresponsiveError as e:

218 logging.error(e)	239 logging.error(e)

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
260 runners: a list of TestRunner objects.	281 runners: a list of TestRunner objects.

261 timeout: watchdog timeout in seconds, defaults to the default timeout.	282 timeout: watchdog timeout in seconds, defaults to the default timeout.

262 """	283 """

263 threads = reraiser_thread.ReraiserThreadGroup(	284 threads = reraiser_thread.ReraiserThreadGroup(

264 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])	285 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])

265 for r in runners])	286 for r in runners])

266 threads.StartAll()	287 threads.StartAll()

267 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))	288 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))

268	289

269	290

270 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug',	291

271 test_timeout=DEFAULT_TIMEOUT,	292 def _GetAttachedDevices(wait_for_debugger=False, test_device=None):

272 setup_timeout=DEFAULT_TIMEOUT,	293 """Get all attached devices.

273 num_retries=2):	294

	295 If we are using a debugger, limit to only one device.

	296

	297 Args:

	298 wait_for_debugger: True if this run will use a debugger.

	299 test_device: name of a specific device to use.

	300

	301 Returns:

	302 A list of attached devices.

	303 """

	304 attached_devices = []

	305

	306 attached_devices = android_commands.GetAttachedDevices()

	307 if test_device:

	308 assert (test_device in attached_devices,

	309 'Did not find device %s among attached device. Attached devices: %s'

	310 % (test_device, ', '.join(attached_devices)))

	311 attached_devices = [test_device]

	312

	313 if len(attached_devices) > 1 and wait_for_debugger:

	314 logging.warning('Debugger can not be sharded, using first available device')

	315 attached_devices = attached_devices[:1]

	316

	317 return attached_devices

	318

	319

	320 def RunTests(tests, runner_factory, wait_for_debugger, test_device, shard,

	321 build_type='Debug',

	322 test_timeout=DEFAULT_TIMEOUT,

	323 setup_timeout=DEFAULT_TIMEOUT,

	324 num_retries=2):

274 """Run all tests on attached devices, retrying tests that don't pass.	325 """Run all tests on attached devices, retrying tests that don't pass.

275	326

276 Args:	327 Args:

277 runner_factory: callable that takes a device and index and returns a	328 tests: List of tests to run.

278 TestRunner object.	329 runner_factory: Callable that takes a device and index and returns a

279 devices: list of attached device serial numbers as strings.	330 TestRunner object.
	frankf 2013/07/17 04:07:20 I think we use 2 spaces for indentation, or align it with 'Callable'. gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > I think we use 2 spaces for indentation, or align it with 'Callable'. According to go/pyguide it's 4 spaces: http://www.corp.google.com/eng/doc/pyguide.xml#Comments Is this different in the Chromium tree? frankf 2013/07/17 21:08:41 We diverge from the style guide for some things. Let's be consistent everywhere. We can always do a mass style fix later. On 2013/07/17 20:31:26, gkanwar wrote: > On 2013/07/17 04:07:20, frankf wrote: > > I think we use 2 spaces for indentation, or align it with 'Callable'. > > According to go/pyguide it's 4 spaces: > http://www.corp.google.com/eng/doc/pyguide.xml#Comments > > Is this different in the Chromium tree?
280 tests: list of tests to run.	331 wait_for_debugger: True if this test is using a debugger.

281 build_type: either 'Debug' or 'Release'.	332 test_device: A specific device to run tests on, or None.

282 test_timeout: watchdog timeout in seconds for running tests, defaults to the	333 shard: True if we should shard, False if we should replicate tests.
	frankf 2013/07/17 04:07:20 Expand this to define "replicate". gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > Expand this to define "replicate". Done.
283 default timeout.	334 build_type: Either 'Debug' or 'Release'.

284 setup_timeout: watchdog timeout in seconds for creating and cleaning up	335 test_timeout: Watchdog timeout in seconds for running tests, defaults to the
	frankf 2013/07/17 04:07:20 Remove "defaults to default timeout". gkanwar 2013/07/17 20:31:26 On 2013/07/17 04:07:20, frankf wrote: > Remove "defaults to default timeout". Done.
285 test runners, defaults to the default timeout.	336 default timeout.

286 num_retries: number of retries for a test.	337 setup_timeout: Watchdog timeout in seconds for creating and cleaning up

	338 test runners, defaults to the default timeout.

	339 num_retries: Number of retries for a test.

287	340

288 Returns:	341 Returns:

289 A tuple of (base_test_result.TestRunResults object, exit code).	342 A tuple of (base_test_result.TestRunResults object, exit code).

290 """	343 """

291 if not tests:	344 if not tests:

292 logging.error('No tests to run.')	345 logging.error('No tests to run.')

293 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)	346 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

294	347

	348 if shard:

	349 # Generate a shared _TestCollection object for all test runners, so they

	350 # draw from a common pool of tests.

	351 shared_test_collection = _TestCollection([_Test(t) for t in tests])

	352 test_collection_factory = lambda: shared_test_collection

	353 tag_results_with_device = False

	354 else:

	355 # Generate a unique _TestCollection object for each test runner, but use

	356 # the same set of tests.

	357 test_collection_factory = lambda: _TestCollection([_Test(t) for t in tests])

	358 tag_results_with_device = True

	359

	360 devices = _GetAttachedDevices(wait_for_debugger, test_device)

	361

295 logging.info('Will run %d tests: %s', len(tests), str(tests))	362 logging.info('Will run %d tests: %s', len(tests), str(tests))

	363

296 forwarder.Forwarder.KillHost(build_type)	364 forwarder.Forwarder.KillHost(build_type)

297 runners = _CreateRunners(runner_factory, devices, setup_timeout)	365 runners = _CreateRunners(runner_factory, devices, setup_timeout)

298 try:	366 try:

299 return _RunAllTests(runners, tests, num_retries, test_timeout)	367 return _RunAllTests(runners, test_collection_factory,

	368 num_retries, test_timeout, tag_results_with_device)

300 finally:	369 finally:

301 try:	370 try:

302 _TearDownRunners(runners, setup_timeout)	371 _TearDownRunners(runners, setup_timeout)

303 except android_commands.errors.DeviceUnresponsiveError as e:	372 except android_commands.errors.DeviceUnresponsiveError as e:

304 logging.warning('Device unresponsive during TearDown: [%s]', e)	373 logging.warning('Device unresponsive during TearDown: [%s]', e)

305 finally:	374 finally:

306 forwarder.Forwarder.KillHost(build_type)	375 forwarder.Forwarder.KillHost(build_type)

OLD	NEW