build/android/pylib/base/test_dispatcher.py - Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests

Side by Side Diff: build/android/pylib/base/test_dispatcher.py

Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fixes running multiple gtest suites Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Implements test sharding logic."""	5 """Implements test sharding logic."""

6	6

7 import logging	7 import logging

8 import threading	8 import threading

9	9

10 from pylib import android_commands	10 from pylib import android_commands

11 from pylib import constants	11 from pylib import constants

12 from pylib import forwarder	12 from pylib import forwarder

13 from pylib.utils import reraiser_thread	13 from pylib.utils import reraiser_thread

14 from pylib.utils import watchdog_timer	14 from pylib.utils import watchdog_timer

15	15

16 import base_test_result	16 import base_test_result

17	17

18	18

19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes	19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes

	20 VALID_TEST_ALLOCATION = ['shard', 'replicate']

20	21

21	22

22 class _ThreadSafeCounter(object):	23 class _ThreadSafeCounter(object):

23 """A threadsafe counter."""	24 """A threadsafe counter."""

24	25

25 def __init__(self):	26 def __init__(self):

26 self._lock = threading.Lock()	27 self._lock = threading.Lock()

27 self._value = 0	28 self._value = 0

28	29

29 def GetAndIncrement(self):	30 def GetAndIncrement(self):

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
111 def __iter__(self):	112 def __iter__(self):

112 """Iterate through tests in the collection until all have been handled."""	113 """Iterate through tests in the collection until all have been handled."""

113 while True:	114 while True:

114 r = self._pop()	115 r = self._pop()

115 if r is None:	116 if r is None:

116 break	117 break

117 yield r	118 yield r

118	119

119	120

120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,	121 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,

121 num_retries):	122 num_retries, tag_results_with_device=False):

122 """Runs tests from the test_collection until empty using the given runner.	123 """Runs tests from the test_collection until empty using the given runner.

123	124

124 Adds TestRunResults objects to the out_results list and may add tests to the	125 Adds TestRunResults objects to the out_results list and may add tests to the

125 out_retry list.	126 out_retry list.

126	127

127 Args:	128 Args:

128 runner: A TestRunner object used to run the tests.	129 runner: A TestRunner object used to run the tests.

129 test_collection: A _TestCollection from which to get _Test objects to run.	130 test_collection: A _TestCollection from which to get _Test objects to run.

130 out_results: A list to add TestRunResults to.	131 out_results: A list to add TestRunResults to.

131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.	132 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.

132 num_retries: Number of retries for a test.	133 num_retries: Number of retries for a test.

	134 tag_results_with_device: If True, appends the name of the device on which

	135 the test was run to the test name. Used by ReplicateAndRunTests to

	136 identify which device ran each copy of the test, and to ensure each copy

	137 of the test is recorded separately.

133 """	138 """

	139

	140 # Used to tag all results to identify which device caused failing tests

	141 def TagTestRunResults(test_run_results):

	142 new_test_run_results = base_test_result.TestRunResults()

	143 for test_result in test_run_results.GetAll():

	144 new_result = base_test_result.BaseTestResult(

	145 '%s_%s' % (runner.device, test_result.GetName()),

	146 test_result.GetType(), test_result.GetLog())

	147 new_test_run_results.AddResult(new_result)

	148 return new_test_run_results

	149

134 for test in test_collection:	150 for test in test_collection:

135 watcher.Reset()	151 watcher.Reset()

136 try:	152 try:

137 if not android_commands.IsDeviceAttached(runner.device):	153 if not android_commands.IsDeviceAttached(runner.device):

138 # Device is unresponsive, stop handling tests on this device.	154 # Device is unresponsive, stop handling tests on this device.

139 msg = 'Device %s is unresponsive.' % runner.device	155 msg = 'Device %s is unresponsive.' % runner.device

140 logging.warning(msg)	156 logging.warning(msg)

141 raise android_commands.errors.DeviceUnresponsiveError(msg)	157 raise android_commands.errors.DeviceUnresponsiveError(msg)

142 result, retry = runner.RunTest(test.test)	158 result, retry = runner.RunTest(test.test)

	159 if tag_results_with_device:

	160 result = TagTestRunResults(result)

143 test.tries += 1	161 test.tries += 1

144 if retry and test.tries <= num_retries:	162 if retry and test.tries <= num_retries:

145 # Retry non-passing results, only record passing results.	163 # Retry non-passing results, only record passing results.

146 pass_results = base_test_result.TestRunResults()	164 pass_results = base_test_result.TestRunResults()

	165 # Tag all results with the device, so we can identify the failing device

	166 # for replicated tests.

147 pass_results.AddResults(result.GetPass())	167 pass_results.AddResults(result.GetPass())

148 out_results.append(pass_results)	168 out_results.append(pass_results)

149 logging.warning('Will retry test, try #%s.' % test.tries)	169 logging.warning('Will retry test, try #%s.' % test.tries)

150 test_collection.add(_Test(test=retry, tries=test.tries))	170 test_collection.add(_Test(test=retry, tries=test.tries))

151 else:	171 else:

152 # All tests passed or retry limit reached. Either way, record results.	172 # All tests passed or retry limit reached. Either way, record results.

153 out_results.append(result)	173 out_results.append(result)

154 except:	174 except:

155 # An unhandleable exception, ensure tests get run by another device and	175 # An unhandleable exception, ensure tests get run by another device and

156 # reraise this exception on the main thread.	176 # reraise this exception on the main thread.

(...skipping 20 matching lines...) Expand all
177 try:	197 try:

178 index = threadsafe_counter.GetAndIncrement()	198 index = threadsafe_counter.GetAndIncrement()

179 logging.warning('Creating shard %s for device %s.', index, device)	199 logging.warning('Creating shard %s for device %s.', index, device)

180 runner = runner_factory(device, index)	200 runner = runner_factory(device, index)

181 runner.SetUp()	201 runner.SetUp()

182 out_runners.append(runner)	202 out_runners.append(runner)

183 except android_commands.errors.DeviceUnresponsiveError as e:	203 except android_commands.errors.DeviceUnresponsiveError as e:

184 logging.warning('Failed to create shard for %s: [%s]', device, e)	204 logging.warning('Failed to create shard for %s: [%s]', device, e)

185	205

186	206

187 def _RunAllTests(runners, tests, num_retries, timeout=None):	207 def _RunAllTests(runners, test_collection_factory, num_retries, timeout=None,

	208 tag_results_with_device=False):

188 """Run all tests using the given TestRunners.	209 """Run all tests using the given TestRunners.

189	210

190 Args:	211 Args:

191 runners: a list of TestRunner objects.	212 runners: a list of TestRunner objects.

192 tests: a list of Tests to run using the given TestRunners.	213 test_collection_factory: a callable to generate a _TestCollection object for

	214 each test runner.

193 num_retries: number of retries for a test.	215 num_retries: number of retries for a test.

194 timeout: watchdog timeout in seconds, defaults to the default timeout.	216 timeout: watchdog timeout in seconds, defaults to the default timeout.

	217 tag_results_with_device: If True, appends the name of the device on which

	218 the test was run to the test name. Used by ReplicateAndRunTests to

	219 identify which device ran each copy of the test, and to ensure each copy

	220 of the test is recorded separately.

195	221

196 Returns:	222 Returns:

197 A tuple of (TestRunResults object, exit code)	223 A tuple of (TestRunResults object, exit code)

198 """	224 """

199 logging.warning('Running %s tests with %s test runners.' %	225 logging.warning('Running tests with %s test runners.' % (len(runners)))

200 (len(tests), len(runners)))

201 tests_collection = _TestCollection([_Test(t) for t in tests])

202 results = []	226 results = []

203 exit_code = 0	227 exit_code = 0

204 watcher = watchdog_timer.WatchdogTimer(timeout)	228 watcher = watchdog_timer.WatchdogTimer(timeout)

	229

205 workers = reraiser_thread.ReraiserThreadGroup(	230 workers = reraiser_thread.ReraiserThreadGroup(

206 [reraiser_thread.ReraiserThread(	231 [reraiser_thread.ReraiserThread(

207 _RunTestsFromQueue,	232 _RunTestsFromQueue,

208 [r, tests_collection, results, watcher, num_retries],	233 [r, test_collection_factory(), results, watcher, num_retries,

	234 tag_results_with_device],

209 name=r.device[-4:])	235 name=r.device[-4:])

210 for r in runners])	236 for r in runners])

211 run_results = base_test_result.TestRunResults()	237 run_results = base_test_result.TestRunResults()

212 workers.StartAll()	238 workers.StartAll()

213	239

214 # Catch DeviceUnresponsiveErrors and set a warning exit code	240 # Catch DeviceUnresponsiveErrors and set a warning exit code

215 try:	241 try:

216 workers.JoinAll(watcher)	242 workers.JoinAll(watcher)

217 except android_commands.errors.DeviceUnresponsiveError as e:	243 except android_commands.errors.DeviceUnresponsiveError as e:

218 logging.error(e)	244 logging.error(e)

(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
260 runners: a list of TestRunner objects.	286 runners: a list of TestRunner objects.

261 timeout: watchdog timeout in seconds, defaults to the default timeout.	287 timeout: watchdog timeout in seconds, defaults to the default timeout.

262 """	288 """

263 threads = reraiser_thread.ReraiserThreadGroup(	289 threads = reraiser_thread.ReraiserThreadGroup(

264 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])	290 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])

265 for r in runners])	291 for r in runners])

266 threads.StartAll()	292 threads.StartAll()

267 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))	293 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))

268	294

269	295

270 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug',	296

271 test_timeout=DEFAULT_TIMEOUT,	297 def _GetAttachedDevices(wait_for_debugger=False, test_device=None):

272 setup_timeout=DEFAULT_TIMEOUT,	298 """Get all attached devices.

273 num_retries=2):	299

	300 If we are using a debugger, limit to only one device.

	301

	302 Args:

	303 wait_for_debugger: True if this run will use a debugger.

	304 test_device: name of a specific device to use.

	305

	306 Returns:

	307 A list of attached devices.

	308 """

	309 attached_devices = []

	310

	311 attached_devices = android_commands.GetAttachedDevices()

	312 if test_device:

	313 assert test_device in attached_devices

	314 attached_devices = [test_device]

	315

	316 if len(attached_devices) > 1 and wait_for_debugger:

	317 logging.warning('Debugger can not be sharded, using first available device')

	318 attached_devices = attached_devices[:1]

	319

	320 return attached_devices

	321

	322

	323 def RunTests(tests, runner_factory,

	324 wait_for_debugger, test_device,

	325 test_allocation='shard',

	326 build_type='Debug',

	327 test_timeout=DEFAULT_TIMEOUT,

	328 setup_timeout=DEFAULT_TIMEOUT,

	329 num_retries=2):

274 """Run all tests on attached devices, retrying tests that don't pass.	330 """Run all tests on attached devices, retrying tests that don't pass.

275	331

276 Args:	332 Args:

	333 tests: list of tests to run.

277 runner_factory: callable that takes a device and index and returns a	334 runner_factory: callable that takes a device and index and returns a

278 TestRunner object.	335 TestRunner object.

279 devices: list of attached device serial numbers as strings.	336 wait_for_debugger: True if this test is using a debugger.

280 tests: list of tests to run.	337 test_device: A specific device to run tests on, or None.

	338 test_allocation: 'shard' or 'replicate'.

281 build_type: either 'Debug' or 'Release'.	339 build_type: either 'Debug' or 'Release'.

282 test_timeout: watchdog timeout in seconds for running tests, defaults to the	340 test_timeout: watchdog timeout in seconds for running tests, defaults to the

283 default timeout.	341 default timeout.

284 setup_timeout: watchdog timeout in seconds for creating and cleaning up	342 setup_timeout: watchdog timeout in seconds for creating and cleaning up

285 test runners, defaults to the default timeout.	343 test runners, defaults to the default timeout.

286 num_retries: number of retries for a test.	344 num_retries: number of retries for a test.

	345 tag_results_with_device: If True, appends the name of the device on which

	346 the test was run to the test name. Used by ReplicateAndRunTests to

	347 identify which device ran each copy of the test, and to ensure each copy

	348 of the test is recorded separately.

287	349

288 Returns:	350 Returns:

289 A tuple of (base_test_result.TestRunResults object, exit code).	351 A tuple of (base_test_result.TestRunResults object, exit code).

290 """	352 """

	353 # Validation

291 if not tests:	354 if not tests:

292 logging.error('No tests to run.')	355 logging.error('No tests to run.')

293 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)	356 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

294	357

	358 if not test_allocation in VALID_TEST_ALLOCATION:

	359 logging.error('Unknown test allocation string %s. Options are: %s'

	360 % (test_allocation, ', '.join(VALID_TEST_ALLOCATION)))

	361 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

	362

	363 if test_allocation == 'shard':

	364 # Generate a shared _TestCollection object for all test runners, so they

	365 # draw from a common pool of tests.

	366 shared_test_collection = _TestCollection([_Test(t) for t in tests])

	367 test_collection_factory = lambda: shared_test_collection

	368 tag_results_with_device = False

	369 else:

	370 # Generate a unique _TestCollection object for each test runner, but use

	371 # the same set of tests.

	372 test_collection_factory = lambda: _TestCollection([_Test(t) for t in tests])

	373 tag_results_with_device = True

	374

	375 devices = _GetAttachedDevices(wait_for_debugger, test_device)

	376

295 logging.info('Will run %d tests: %s', len(tests), str(tests))	377 logging.info('Will run %d tests: %s', len(tests), str(tests))

	378

296 forwarder.Forwarder.KillHost(build_type)	379 forwarder.Forwarder.KillHost(build_type)

297 runners = _CreateRunners(runner_factory, devices, setup_timeout)	380 runners = _CreateRunners(runner_factory, devices, setup_timeout)

298 try:	381 try:

299 return _RunAllTests(runners, tests, num_retries, test_timeout)	382 return _RunAllTests(runners, test_collection_factory,

	383 num_retries, test_timeout, tag_results_with_device)

300 finally:	384 finally:

301 try:	385 try:

302 _TearDownRunners(runners, setup_timeout)	386 _TearDownRunners(runners, setup_timeout)

303 except android_commands.errors.DeviceUnresponsiveError as e:	387 except android_commands.errors.DeviceUnresponsiveError as e:

304 logging.warning('Device unresponsive during TearDown: [%s]', e)	388 logging.warning('Device unresponsive during TearDown: [%s]', e)

305 finally:	389 finally:

306 forwarder.Forwarder.KillHost(build_type)	390 forwarder.Forwarder.KillHost(build_type)

OLD	NEW