build/android/pylib/base/dispatch.py - Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests

Side by Side Diff: build/android/pylib/base/dispatch.py

Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Adds tagging of tests (for replication) Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Implements test sharding logic."""	5 """Implements test sharding logic."""
	frankf 2013/07/16 00:02:18: rename to test_dispatcher.py | gkanwar 2013/07/16 00:47:03 (reply): Done.
6	6

7 import logging	7 import logging

8 import threading	8 import threading

9	9

10 from pylib import android_commands	10 from pylib import android_commands

11 from pylib import constants	11 from pylib import constants

12 from pylib import forwarder	12 from pylib import forwarder

13 from pylib.utils import reraiser_thread	13 from pylib.utils import reraiser_thread

14 from pylib.utils import watchdog_timer	14 from pylib.utils import watchdog_timer

15	15

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
111 def __iter__(self):	111 def __iter__(self):

112 """Iterate through tests in the collection until all have been handled."""	112 """Iterate through tests in the collection until all have been handled."""

113 while True:	113 while True:

114 r = self._pop()	114 r = self._pop()

115 if r is None:	115 if r is None:

116 break	116 break

117 yield r	117 yield r

118	118

119	119

120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,	120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,

121 num_retries):	121 num_retries, tag_results_with_device=False):

122 """Runs tests from the test_collection until empty using the given runner.	122 """Runs tests from the test_collection until empty using the given runner.

123	123

124 Adds TestRunResults objects to the out_results list and may add tests to the	124 Adds TestRunResults objects to the out_results list and may add tests to the

125 out_retry list.	125 out_retry list.

126	126

127 Args:	127 Args:

128 runner: A TestRunner object used to run the tests.	128 runner: A TestRunner object used to run the tests.

129 test_collection: A _TestCollection from which to get _Test objects to run.	129 test_collection: A _TestCollection from which to get _Test objects to run.

130 out_results: A list to add TestRunResults to.	130 out_results: A list to add TestRunResults to.

131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.	131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.

132 num_retries: Number of retries for a test.	132 num_retries: Number of retries for a test.

	133 tag_results_with_device: If True, appends the name of the device on which

	134 the test was run to the test name. Used by ReplicateAndRunTests to

	135 identify which device ran each copy of the test, and to ensure each copy

	136 of the test is recorded separately.

133 """	137 """

	138

	139 # Used to tag all results to identify which device caused failing tests

	140 def TagTestRunResults(test_run_results):

	141 new_test_run_results = base_test_result.TestRunResults()

	142 for test_result in test_run_results.GetAll():

	143 new_result = base_test_result.BaseTestResult(

	144 '%s_%s' % (runner.device, test_result.GetName()),

	145 test_result.GetType(), test_result.GetLog())

	146 new_test_run_results.AddResult(new_result)

	147 return new_test_run_results

	148

134 for test in test_collection:	149 for test in test_collection:

135 watcher.Reset()	150 watcher.Reset()

136 try:	151 try:

137 if not android_commands.IsDeviceAttached(runner.device):	152 if not android_commands.IsDeviceAttached(runner.device):

138 # Device is unresponsive, stop handling tests on this device.	153 # Device is unresponsive, stop handling tests on this device.

139 msg = 'Device %s is unresponsive.' % runner.device	154 msg = 'Device %s is unresponsive.' % runner.device

140 logging.warning(msg)	155 logging.warning(msg)

141 raise android_commands.errors.DeviceUnresponsiveError(msg)	156 raise android_commands.errors.DeviceUnresponsiveError(msg)

142 result, retry = runner.RunTest(test.test)	157 result, retry = runner.RunTest(test.test)

	158 if tag_results_with_device:

	159 result = TagTestRunResults(result)

143 test.tries += 1	160 test.tries += 1

144 if retry and test.tries <= num_retries:	161 if retry and test.tries <= num_retries:

145 # Retry non-passing results, only record passing results.	162 # Retry non-passing results, only record passing results.

146 pass_results = base_test_result.TestRunResults()	163 pass_results = base_test_result.TestRunResults()

	164 # Tag all results with the device, so we can identify the failing device

	165 # for replicated tests.

147 pass_results.AddResults(result.GetPass())	166 pass_results.AddResults(result.GetPass())

148 out_results.append(pass_results)	167 out_results.append(pass_results)

149 logging.warning('Will retry test, try #%s.' % test.tries)	168 logging.warning('Will retry test, try #%s.' % test.tries)

150 test_collection.add(_Test(test=retry, tries=test.tries))	169 test_collection.add(_Test(test=retry, tries=test.tries))

151 else:	170 else:

152 # All tests passed or retry limit reached. Either way, record results.	171 # All tests passed or retry limit reached. Either way, record results.

153 out_results.append(result)	172 out_results.append(result)

154 except:	173 except:

155 # An unhandleable exception, ensure tests get run by another device and	174 # An unhandleable exception, ensure tests get run by another device and

156 # reraise this exception on the main thread.	175 # reraise this exception on the main thread.

(...skipping 20 matching lines...) Expand all Loading...
177 try:	196 try:

178 index = threadsafe_counter.GetAndIncrement()	197 index = threadsafe_counter.GetAndIncrement()

179 logging.warning('Creating shard %s for device %s.', index, device)	198 logging.warning('Creating shard %s for device %s.', index, device)

180 runner = runner_factory(device, index)	199 runner = runner_factory(device, index)

181 runner.SetUp()	200 runner.SetUp()

182 out_runners.append(runner)	201 out_runners.append(runner)

183 except android_commands.errors.DeviceUnresponsiveError as e:	202 except android_commands.errors.DeviceUnresponsiveError as e:

184 logging.warning('Failed to create shard for %s: [%s]', device, e)	203 logging.warning('Failed to create shard for %s: [%s]', device, e)

185	204

186	205

187 def _RunAllTests(runners, tests, num_retries, timeout=None):	206 def _RunAllTests(runners, test_collection_factory, num_retries, timeout=None,

	207 tag_results_with_device=False):

188 """Run all tests using the given TestRunners.	208 """Run all tests using the given TestRunners.

189	209

190 Args:	210 Args:

191 runners: a list of TestRunner objects.	211 runners: a list of TestRunner objects.

192 tests: a list of Tests to run using the given TestRunners.	212 test_collection_factory: a callable to generate a _TestCollection object for

	213 each test runner.

193 num_retries: number of retries for a test.	214 num_retries: number of retries for a test.

194 timeout: watchdog timeout in seconds, defaults to the default timeout.	215 timeout: watchdog timeout in seconds, defaults to the default timeout.

	216 tag_results_with_device: If True, appends the name of the device on which

	217 the test was run to the test name. Used by ReplicateAndRunTests to

	218 identify which device ran each copy of the test, and to ensure each copy

	219 of the test is recorded separately.

195	220

196 Returns:	221 Returns:

197 A tuple of (TestRunResults object, exit code)	222 A tuple of (TestRunResults object, exit code)

198 """	223 """

199 logging.warning('Running %s tests with %s test runners.' %	224 logging.warning('Running tests with %s test runners.' % (len(runners)))

200 (len(tests), len(runners)))

201 tests_collection = _TestCollection([_Test(t) for t in tests])

202 results = []	225 results = []

203 exit_code = 0	226 exit_code = 0

204 watcher = watchdog_timer.WatchdogTimer(timeout)	227 watcher = watchdog_timer.WatchdogTimer(timeout)

	228

205 workers = reraiser_thread.ReraiserThreadGroup(	229 workers = reraiser_thread.ReraiserThreadGroup(

206 [reraiser_thread.ReraiserThread(	230 [reraiser_thread.ReraiserThread(

207 _RunTestsFromQueue,	231 _RunTestsFromQueue,

208 [r, tests_collection, results, watcher, num_retries],	232 [r, test_collection_factory(), results, watcher, num_retries,

	233 tag_results_with_device],

209 name=r.device[-4:])	234 name=r.device[-4:])

210 for r in runners])	235 for r in runners])

211 run_results = base_test_result.TestRunResults()	236 run_results = base_test_result.TestRunResults()

212 workers.StartAll()	237 workers.StartAll()

213	238

214 # Catch DeviceUnresponsiveErrors and set a warning exit code	239 # Catch DeviceUnresponsiveErrors and set a warning exit code

215 try:	240 try:

216 workers.JoinAll(watcher)	241 workers.JoinAll(watcher)

217 except android_commands.errors.DeviceUnresponsiveError as e:	242 except android_commands.errors.DeviceUnresponsiveError as e:

218 logging.error(e)	243 logging.error(e)

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
260 runners: a list of TestRunner objects.	285 runners: a list of TestRunner objects.

261 timeout: watchdog timeout in seconds, defaults to the default timeout.	286 timeout: watchdog timeout in seconds, defaults to the default timeout.

262 """	287 """

263 threads = reraiser_thread.ReraiserThreadGroup(	288 threads = reraiser_thread.ReraiserThreadGroup(

264 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])	289 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])

265 for r in runners])	290 for r in runners])

266 threads.StartAll()	291 threads.StartAll()

267 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))	292 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))

268	293

269	294

270 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug',	295

271 test_timeout=DEFAULT_TIMEOUT,	296 def _GetAttachedDevices(wait_for_debugger=False, test_device=None):

272 setup_timeout=DEFAULT_TIMEOUT,	297 """Get all attached devices.

273 num_retries=2):	298

274 """Run all tests on attached devices, retrying tests that don't pass.	299 If we are using a debugger, limit to only one device.

275	300

276 Args:	301 Args:

277 runner_factory: callable that takes a device and index and returns a	302 wait_for_debugger: True if this run will use a debugger.

278 TestRunner object.	303 test_device: name of a specific device to use.

279 devices: list of attached device serial numbers as strings.	304

280 tests: list of tests to run.	305 Returns:

281 build_type: either 'Debug' or 'Release'.	306 A list of attached devices.

282 test_timeout: watchdog timeout in seconds for running tests, defaults to the	307 """

283 default timeout.	308 attached_devices = []

284 setup_timeout: watchdog timeout in seconds for creating and cleaning up	309

285 test runners, defaults to the default timeout.	310 attached_devices = android_commands.GetAttachedDevices()

286 num_retries: number of retries for a test.	311 if test_device:

	312 assert test_device in attached_devices

	313 attached_devices = [test_device]

	314

	315 if len(attached_devices) > 1 and wait_for_debugger:

	316 logging.warning('Debugger can not be sharded, using first available device')

	317 attached_devices = attached_devices[:1]

	318

	319 return attached_devices

	320

	321

	322 def ReplicateAndRunTests(tests, wait_for_debugger, test_device,

	323 *args, **kwargs):

	324 """Replicates the tests for each device, so all devices run every test.

	325

	326 Args:

	327 tests: A list of tests to run.

	328 wait_for_debugger: True if this test is using a debugger.

	329 test_device: A specific device to run tests on, or None.

	330 *args, **kwargs: Args and kwargs to RunTests which we pass through.

287	331

288 Returns:	332 Returns:

289 A tuple of (base_test_result.TestRunResults object, exit code).	333 A tuple of (base_test_result.TestRunResults object, exit code).

290 """	334 """

	335

291 if not tests:	336 if not tests:

292 logging.error('No tests to run.')	337 logging.error('No tests to run.')

293 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)	338 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

294	339

295 logging.info('Will run %d tests: %s', len(tests), str(tests))	340 logging.info('Will run %d tests: %s', len(tests), str(tests))

	341

	342 # Genereate a unique _TestCollection object for each test runner, but use

	343 # the same set of tests.

	344 TestCollectionFactory = lambda: _TestCollection([_Test(t) for t in tests])

	345

	346 devices = _GetAttachedDevices(wait_for_debugger, test_device)

	347 return _RunTests(TestCollectionFactory, devices, *args,

	348 tag_results_with_device=True, **kwargs)

	349

	350

	351 def ShardAndRunTests(tests, wait_for_debugger, test_device, *args, **kwargs):

	352 """Distrbutes all tests over devices through a shared pool of tests.
	frankf 2013/07/16 00:02:18: It's sufficient to say "Shards tests over devices" | gkanwar 2013/07/16 00:47:03 (reply): Done.
	353

	354 Args:

	355 tests: A list of tests to run.

	356 wait_for_debugger: True if this test is using a debugger.

	357 test_device: A specific device to run tests on, or None.

	358 *args, **kwargs: Args and kwargs to _RunTests which we pass through.

	359

	360 Returns:

	361 A tuple of (base_test_result.TestRunResults object, exit code).

	362 """

	363

	364 if not tests:
	frankf 2013/07/16 00:02:18: There's a lot duplication between these two methods. You can delegate everything to a private method and add ifs for distribution mode. Or just expose the parameter like we discussed earlier. | gkanwar 2013/07/16 00:47:03 (reply): I ended up combining the two methods back together, since they actually only diverge slightly.
	365 logging.error('No tests to run.')

	366 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

	367

	368 logging.info('Will run %d tests: %s', len(tests), str(tests))

	369

	370 # Genereate a shared _TestCollection object for all test runners, so they draw

	371 # from a common pool of tests.

	372 shared_test_collection = _TestCollection([_Test(t) for t in tests])

	373 TestCollectionFactory = lambda: shared_test_collection

	374

	375 devices = _GetAttachedDevices(wait_for_debugger, test_device)

	376 return _RunTests(TestCollectionFactory, devices, *args,

	377 tag_results_with_device=False, **kwargs)

	378

	379

	380 def _RunTests(test_collection_factory, devices, runner_factory,
	frankf 2013/07/16 00:02:18: It's convention to move callee above caller | gkanwar 2013/07/16 00:47:03 (reply): Done.
	381 build_type='Debug',

	382 test_timeout=DEFAULT_TIMEOUT,

	383 setup_timeout=DEFAULT_TIMEOUT,

	384 num_retries=2,

	385 tag_results_with_device=False):

	386 """Run all tests on attached devices, retrying tests that don't pass.

	387

	388 Args:

	389 test_collection_factory: callable that is used to generate a _TestCollection

	390 object for each test runner.

	391 devices: list of attached device serial numbers as strings.

	392 build_type: either 'Debug' or 'Release'.

	393 runner_factory: callable that takes a device and index and returns a

	394 TestRunner object.

	395 test_timeout: watchdog timeout in seconds for running tests, defaults to the

	396 default timeout.

	397 setup_timeout: watchdog timeout in seconds for creating and cleaning up

	398 test runners, defaults to the default timeout.

	399 num_retries: number of retries for a test.

	400 tag_results_with_device: If True, appends the name of the device on which

	401 the test was run to the test name. Used by ReplicateAndRunTests to

	402 identify which device ran each copy of the test, and to ensure each copy

	403 of the test is recorded separately.

	404

	405 Returns:

	406 A tuple of (base_test_result.TestRunResults object, exit code).

	407 """

296 forwarder.Forwarder.KillHost(build_type)	408 forwarder.Forwarder.KillHost(build_type)

297 runners = _CreateRunners(runner_factory, devices, setup_timeout)	409 runners = _CreateRunners(runner_factory, devices, setup_timeout)

298 try:	410 try:

299 return _RunAllTests(runners, tests, num_retries, test_timeout)	411 return _RunAllTests(runners, test_collection_factory,

	412 num_retries, test_timeout, tag_results_with_device)

300 finally:	413 finally:

301 try:	414 try:

302 _TearDownRunners(runners, setup_timeout)	415 _TearDownRunners(runners, setup_timeout)

303 except android_commands.errors.DeviceUnresponsiveError as e:	416 except android_commands.errors.DeviceUnresponsiveError as e:

304 logging.warning('Device unresponsive during TearDown: [%s]', e)	417 logging.warning('Device unresponsive during TearDown: [%s]', e)

305 finally:	418 finally:

306 forwarder.Forwarder.KillHost(build_type)	419 forwarder.Forwarder.KillHost(build_type)

OLD	NEW

« no previous file with comments | « no previous file | build/android/pylib/base/dispatch_unittest.py » ('j') | build/android/pylib/browsertests/setup.py » ('J')