Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(419)

Side by Side Diff: build/android/pylib/base/test_dispatcher.py

Issue 18770008: [Android] Redesigns the sharder to allow replicated vs distributed tests (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Combines ShardAndRunTests and ReplicateAndRunTests Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright (c) 2013 The Chromium Authors. All rights reserved. 1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 """Implements test sharding logic.""" 5 """Implements test sharding logic."""
frankf 2013/07/16 18:42:32 Update
gkanwar 2013/07/16 20:27:36 Done.
6 6
7 import logging 7 import logging
8 import threading 8 import threading
9 9
10 from pylib import android_commands 10 from pylib import android_commands
11 from pylib import constants 11 from pylib import constants
12 from pylib import forwarder 12 from pylib import forwarder
13 from pylib.utils import reraiser_thread 13 from pylib.utils import reraiser_thread
14 from pylib.utils import watchdog_timer 14 from pylib.utils import watchdog_timer
15 15
16 import base_test_result 16 import base_test_result
17 17
18 18
19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes 19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes
20 VALID_TEST_ALLOCATION = ['shard', 'replicate']
20 21
21 22
22 class _ThreadSafeCounter(object): 23 class _ThreadSafeCounter(object):
23 """A threadsafe counter.""" 24 """A threadsafe counter."""
24 25
25 def __init__(self): 26 def __init__(self):
26 self._lock = threading.Lock() 27 self._lock = threading.Lock()
27 self._value = 0 28 self._value = 0
28 29
29 def GetAndIncrement(self): 30 def GetAndIncrement(self):
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
111 def __iter__(self): 112 def __iter__(self):
112 """Iterate through tests in the collection until all have been handled.""" 113 """Iterate through tests in the collection until all have been handled."""
113 while True: 114 while True:
114 r = self._pop() 115 r = self._pop()
115 if r is None: 116 if r is None:
116 break 117 break
117 yield r 118 yield r
118 119
119 120
120 def _RunTestsFromQueue(runner, test_collection, out_results, watcher, 121 def _RunTestsFromQueue(runner, test_collection, out_results, watcher,
121 num_retries): 122 num_retries, tag_results_with_device=False):
122 """Runs tests from the test_collection until empty using the given runner. 123 """Runs tests from the test_collection until empty using the given runner.
123 124
124 Adds TestRunResults objects to the out_results list and may add tests to the 125 Adds TestRunResults objects to the out_results list and may add tests to the
125 out_retry list. 126 out_retry list.
126 127
127 Args: 128 Args:
128 runner: A TestRunner object used to run the tests. 129 runner: A TestRunner object used to run the tests.
129 test_collection: A _TestCollection from which to get _Test objects to run. 130 test_collection: A _TestCollection from which to get _Test objects to run.
130 out_results: A list to add TestRunResults to. 131 out_results: A list to add TestRunResults to.
131 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout. 132 watcher: A watchdog_timer.WatchdogTimer object, used as a shared timeout.
132 num_retries: Number of retries for a test. 133 num_retries: Number of retries for a test.
134 tag_results_with_device: If True, appends the name of the device on which
135 the test was run to the test name. Used by ReplicateAndRunTests to
136 identify which device ran each copy of the test, and to ensure each copy
137 of the test is recorded separately.
133 """ 138 """
139
140 # Used to tag all results to identify which device caused failing tests
141 def TagTestRunResults(test_run_results):
142 new_test_run_results = base_test_result.TestRunResults()
143 for test_result in test_run_results.GetAll():
144 new_result = base_test_result.BaseTestResult(
145 '%s_%s' % (runner.device, test_result.GetName()),
frankf 2013/07/16 18:42:32 Use last 4 digits of device serial.
gkanwar 2013/07/16 20:27:36 Done.
146 test_result.GetType(), test_result.GetLog())
frankf 2013/07/16 18:42:32 As the name suggests, BaseTestResult can be derived
gkanwar 2013/07/16 20:27:36 Updated to move tag into the BaseTestResult class
147 new_test_run_results.AddResult(new_result)
148 return new_test_run_results
149
134 for test in test_collection: 150 for test in test_collection:
135 watcher.Reset() 151 watcher.Reset()
136 try: 152 try:
137 if not android_commands.IsDeviceAttached(runner.device): 153 if not android_commands.IsDeviceAttached(runner.device):
138 # Device is unresponsive, stop handling tests on this device. 154 # Device is unresponsive, stop handling tests on this device.
139 msg = 'Device %s is unresponsive.' % runner.device 155 msg = 'Device %s is unresponsive.' % runner.device
140 logging.warning(msg) 156 logging.warning(msg)
141 raise android_commands.errors.DeviceUnresponsiveError(msg) 157 raise android_commands.errors.DeviceUnresponsiveError(msg)
142 result, retry = runner.RunTest(test.test) 158 result, retry = runner.RunTest(test.test)
159 if tag_results_with_device:
160 result = TagTestRunResults(result)
143 test.tries += 1 161 test.tries += 1
144 if retry and test.tries <= num_retries: 162 if retry and test.tries <= num_retries:
145 # Retry non-passing results, only record passing results. 163 # Retry non-passing results, only record passing results.
146 pass_results = base_test_result.TestRunResults() 164 pass_results = base_test_result.TestRunResults()
165 # Tag all results with the device, so we can identify the failing device
166 # for replicated tests.
frankf 2013/07/16 18:42:32 Is this comment misplaced?
gkanwar 2013/07/16 20:27:36 Oops, removed.
147 pass_results.AddResults(result.GetPass()) 167 pass_results.AddResults(result.GetPass())
148 out_results.append(pass_results) 168 out_results.append(pass_results)
149 logging.warning('Will retry test, try #%s.' % test.tries) 169 logging.warning('Will retry test, try #%s.' % test.tries)
150 test_collection.add(_Test(test=retry, tries=test.tries)) 170 test_collection.add(_Test(test=retry, tries=test.tries))
151 else: 171 else:
152 # All tests passed or retry limit reached. Either way, record results. 172 # All tests passed or retry limit reached. Either way, record results.
153 out_results.append(result) 173 out_results.append(result)
154 except: 174 except:
155 # An unhandleable exception, ensure tests get run by another device and 175 # An unhandleable exception, ensure tests get run by another device and
156 # reraise this exception on the main thread. 176 # reraise this exception on the main thread.
(...skipping 20 matching lines...) Expand all
177 try: 197 try:
178 index = threadsafe_counter.GetAndIncrement() 198 index = threadsafe_counter.GetAndIncrement()
179 logging.warning('Creating shard %s for device %s.', index, device) 199 logging.warning('Creating shard %s for device %s.', index, device)
180 runner = runner_factory(device, index) 200 runner = runner_factory(device, index)
181 runner.SetUp() 201 runner.SetUp()
182 out_runners.append(runner) 202 out_runners.append(runner)
183 except android_commands.errors.DeviceUnresponsiveError as e: 203 except android_commands.errors.DeviceUnresponsiveError as e:
184 logging.warning('Failed to create shard for %s: [%s]', device, e) 204 logging.warning('Failed to create shard for %s: [%s]', device, e)
185 205
186 206
187 def _RunAllTests(runners, tests, num_retries, timeout=None): 207 def _RunAllTests(runners, test_collection_factory, num_retries, timeout=None,
208 tag_results_with_device=False):
188 """Run all tests using the given TestRunners. 209 """Run all tests using the given TestRunners.
189 210
190 Args: 211 Args:
191 runners: a list of TestRunner objects. 212 runners: a list of TestRunner objects.
192 tests: a list of Tests to run using the given TestRunners. 213 test_collection_factory: a callable to generate a _TestCollection object for
214 each test runner.
193 num_retries: number of retries for a test. 215 num_retries: number of retries for a test.
194 timeout: watchdog timeout in seconds, defaults to the default timeout. 216 timeout: watchdog timeout in seconds, defaults to the default timeout.
217 tag_results_with_device: If True, appends the name of the device on which
218 the test was run to the test name. Used by ReplicateAndRunTests to
219 identify which device ran each copy of the test, and to ensure each copy
220 of the test is recorded separately.
195 221
196 Returns: 222 Returns:
197 A tuple of (TestRunResults object, exit code) 223 A tuple of (TestRunResults object, exit code)
198 """ 224 """
199 logging.warning('Running %s tests with %s test runners.' % 225 logging.warning('Running tests with %s test runners.' % (len(runners)))
200 (len(tests), len(runners)))
201 tests_collection = _TestCollection([_Test(t) for t in tests])
202 results = [] 226 results = []
203 exit_code = 0 227 exit_code = 0
204 watcher = watchdog_timer.WatchdogTimer(timeout) 228 watcher = watchdog_timer.WatchdogTimer(timeout)
229
205 workers = reraiser_thread.ReraiserThreadGroup( 230 workers = reraiser_thread.ReraiserThreadGroup(
206 [reraiser_thread.ReraiserThread( 231 [reraiser_thread.ReraiserThread(
207 _RunTestsFromQueue, 232 _RunTestsFromQueue,
208 [r, tests_collection, results, watcher, num_retries], 233 [r, test_collection_factory(), results, watcher, num_retries,
234 tag_results_with_device],
209 name=r.device[-4:]) 235 name=r.device[-4:])
210 for r in runners]) 236 for r in runners])
211 run_results = base_test_result.TestRunResults() 237 run_results = base_test_result.TestRunResults()
212 workers.StartAll() 238 workers.StartAll()
213 239
214 # Catch DeviceUnresponsiveErrors and set a warning exit code 240 # Catch DeviceUnresponsiveErrors and set a warning exit code
215 try: 241 try:
216 workers.JoinAll(watcher) 242 workers.JoinAll(watcher)
217 except android_commands.errors.DeviceUnresponsiveError as e: 243 except android_commands.errors.DeviceUnresponsiveError as e:
218 logging.error(e) 244 logging.error(e)
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
260 runners: a list of TestRunner objects. 286 runners: a list of TestRunner objects.
261 timeout: watchdog timeout in seconds, defaults to the default timeout. 287 timeout: watchdog timeout in seconds, defaults to the default timeout.
262 """ 288 """
263 threads = reraiser_thread.ReraiserThreadGroup( 289 threads = reraiser_thread.ReraiserThreadGroup(
264 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:]) 290 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])
265 for r in runners]) 291 for r in runners])
266 threads.StartAll() 292 threads.StartAll()
267 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout)) 293 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))
268 294
269 295
270 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug', 296
271 test_timeout=DEFAULT_TIMEOUT, 297 def _GetAttachedDevices(wait_for_debugger=False, test_device=None):
272 setup_timeout=DEFAULT_TIMEOUT, 298 """Get all attached devices.
273 num_retries=2): 299
300 If we are using a debugger, limit to only one device.
301
302 Args:
303 wait_for_debugger: True if this run will use a debugger.
304 test_device: name of a specific device to use.
305
306 Returns:
307 A list of attached devices.
308 """
309 attached_devices = []
310
311 attached_devices = android_commands.GetAttachedDevices()
312 if test_device:
313 assert test_device in attached_devices
frankf 2013/07/16 18:42:32 add a message to assert
gkanwar 2013/07/16 20:27:36 Done.
314 attached_devices = [test_device]
315
316 if len(attached_devices) > 1 and wait_for_debugger:
317 logging.warning('Debugger can not be sharded, using first available device')
318 attached_devices = attached_devices[:1]
319
320 return attached_devices
321
322
323 def RunTests(tests, runner_factory,
324 wait_for_debugger, test_device,
325 test_allocation='shard',
frankf 2013/07/16 18:42:32 I think a boolean called 'shard' makes more sense,
gkanwar 2013/07/16 20:27:36 Done.
326 build_type='Debug',
327 test_timeout=DEFAULT_TIMEOUT,
328 setup_timeout=DEFAULT_TIMEOUT,
329 num_retries=2):
274 """Run all tests on attached devices, retrying tests that don't pass. 330 """Run all tests on attached devices, retrying tests that don't pass.
275 331
276 Args: 332 Args:
333 tests: list of tests to run.
frankf 2013/07/16 18:42:32 Capital first letter.
gkanwar 2013/07/16 20:27:36 Done.
277 runner_factory: callable that takes a device and index and returns a 334 runner_factory: callable that takes a device and index and returns a
278 TestRunner object. 335 TestRunner object.
279 devices: list of attached device serial numbers as strings. 336 wait_for_debugger: True if this test is using a debugger.
280 tests: list of tests to run. 337 test_device: A specific device to run tests on, or None.
338 test_allocation: 'shard' or 'replicate'.
281 build_type: either 'Debug' or 'Release'. 339 build_type: either 'Debug' or 'Release'.
282 test_timeout: watchdog timeout in seconds for running tests, defaults to the 340 test_timeout: watchdog timeout in seconds for running tests, defaults to the
283 default timeout. 341 default timeout.
284 setup_timeout: watchdog timeout in seconds for creating and cleaning up 342 setup_timeout: watchdog timeout in seconds for creating and cleaning up
285 test runners, defaults to the default timeout. 343 test runners, defaults to the default timeout.
286 num_retries: number of retries for a test. 344 num_retries: number of retries for a test.
345 tag_results_with_device: If True, appends the name of the device on which
346 the test was run to the test name. Used by ReplicateAndRunTests to
347 identify which device ran each copy of the test, and to ensure each copy
frankf 2013/07/16 18:42:32 Update this
gkanwar 2013/07/16 20:27:36 Done.
348 of the test is recorded separately.
287 349
288 Returns: 350 Returns:
289 A tuple of (base_test_result.TestRunResults object, exit code). 351 A tuple of (base_test_result.TestRunResults object, exit code).
290 """ 352 """
353 # Validation
frankf 2013/07/16 18:42:32 this is obvious. remove comment.
gkanwar 2013/07/16 20:27:36 Done.
291 if not tests: 354 if not tests:
292 logging.error('No tests to run.') 355 logging.error('No tests to run.')
293 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE) 356 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)
294 357
358 if not test_allocation in VALID_TEST_ALLOCATION:
359 logging.error('Unknown test allocation string %s. Options are: %s'
360 % (test_allocation, ', '.join(VALID_TEST_ALLOCATION)))
361 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)
362
363 if test_allocation == 'shard':
364 # Generate a shared _TestCollection object for all test runners, so they
365 # draw from a common pool of tests.
366 shared_test_collection = _TestCollection([_Test(t) for t in tests])
367 test_collection_factory = lambda: shared_test_collection
368 tag_results_with_device = False
369 else:
370 # Generate a unique _TestCollection object for each test runner, but use
371 # the same set of tests.
372 test_collection_factory = lambda: _TestCollection([_Test(t) for t in tests])
373 tag_results_with_device = True
374
375 devices = _GetAttachedDevices(wait_for_debugger, test_device)
376
295 logging.info('Will run %d tests: %s', len(tests), str(tests)) 377 logging.info('Will run %d tests: %s', len(tests), str(tests))
378
296 forwarder.Forwarder.KillHost(build_type) 379 forwarder.Forwarder.KillHost(build_type)
297 runners = _CreateRunners(runner_factory, devices, setup_timeout) 380 runners = _CreateRunners(runner_factory, devices, setup_timeout)
298 try: 381 try:
299 return _RunAllTests(runners, tests, num_retries, test_timeout) 382 return _RunAllTests(runners, test_collection_factory,
383 num_retries, test_timeout, tag_results_with_device)
300 finally: 384 finally:
301 try: 385 try:
302 _TearDownRunners(runners, setup_timeout) 386 _TearDownRunners(runners, setup_timeout)
303 except android_commands.errors.DeviceUnresponsiveError as e: 387 except android_commands.errors.DeviceUnresponsiveError as e:
304 logging.warning('Device unresponsive during TearDown: [%s]', e) 388 logging.warning('Device unresponsive during TearDown: [%s]', e)
305 finally: 389 finally:
306 forwarder.Forwarder.KillHost(build_type) 390 forwarder.Forwarder.KillHost(build_type)
OLDNEW
« no previous file with comments | « build/android/pylib/base/shard_unittest.py ('k') | build/android/pylib/base/test_dispatcher_unittest.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698