build/android/pylib/base/shard.py - Issue 18444004: Makes host driven tests use the common sharder.

Side by Side Diff: build/android/pylib/base/shard.py

Issue 18444004: Makes host driven tests use the common sharder. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Implements test sharding logic."""	5 """Implements test sharding logic."""

6	6

7 import logging	7 import logging

8 import threading	8 import threading

9	9

10 from pylib import android_commands	10 from pylib import android_commands

11 from pylib import constants	11 from pylib import constants

12 from pylib import forwarder	12 from pylib import forwarder

13 from pylib.utils import reraiser_thread	13 from pylib.utils import reraiser_thread

14 from pylib.utils import watchdog_timer	14 from pylib.utils import watchdog_timer

15	15

16 import base_test_result	16 import base_test_result

17	17

18	18

19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes	19 DEFAULT_TIMEOUT = 7 * 60 # seven minutes

	20 VALID_SHARDING_OPTIONS = ['distribute', 'duplicate']

20	21

21	22

22 class _ThreadSafeCounter(object):	23 class _ThreadSafeCounter(object):

23 """A threadsafe counter."""	24 """A threadsafe counter."""

24	25

25 def __init__(self):	26 def __init__(self):

26 self._lock = threading.Lock()	27 self._lock = threading.Lock()

27 self._value = 0	28 self._value = 0

28	29

29 def GetAndIncrement(self):	30 def GetAndIncrement(self):

(...skipping 147 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
177 try:	178 try:

178 index = threadsafe_counter.GetAndIncrement()	179 index = threadsafe_counter.GetAndIncrement()

179 logging.warning('Creating shard %s for device %s.', index, device)	180 logging.warning('Creating shard %s for device %s.', index, device)

180 runner = runner_factory(device, index)	181 runner = runner_factory(device, index)

181 runner.SetUp()	182 runner.SetUp()

182 out_runners.append(runner)	183 out_runners.append(runner)

183 except android_commands.errors.DeviceUnresponsiveError as e:	184 except android_commands.errors.DeviceUnresponsiveError as e:

184 logging.warning('Failed to create shard for %s: [%s]', device, e)	185 logging.warning('Failed to create shard for %s: [%s]', device, e)

185	186

186	187

187 def _RunAllTests(runners, tests, num_retries, timeout=None):	188 def _RunAllTests(runners, tests, sharding, num_retries, timeout=None):

188 """Run all tests using the given TestRunners.	189 """Run all tests using the given TestRunners.

189	190

190 Args:	191 Args:

191 runners: a list of TestRunner objects.	192 runners: a list of TestRunner objects.

192 tests: a list of Tests to run using the given TestRunners.	193 tests: a list of Tests to run using the given TestRunners.

193 num_retries: number of retries for a test.	194 num_retries: number of retries for a test.

194 timeout: watchdog timeout in seconds, defaults to the default timeout.	195 timeout: watchdog timeout in seconds, defaults to the default timeout.

	196 sharding: a string to indicate whether we should distribute all tests as a

	197 common pool to draw from, or duplicate all tests onto every test runner.

	198 Must be either 'distribute' or 'duplicate'

195	199

196 Returns:	200 Returns:

197 A tuple of (TestRunResults object, exit code)	201 A tuple of (TestRunResults object, exit code)

198 """	202 """

199 logging.warning('Running %s tests with %s test runners.' %	203 logging.warning('Running %s tests with %s test runners.' %

200 (len(tests), len(runners)))	204 (len(tests), len(runners)))

201 tests_collection = _TestCollection([_Test(t) for t in tests])

202 results = []	205 results = []

203 exit_code = 0	206 exit_code = 0

204 watcher = watchdog_timer.WatchdogTimer(timeout)	207 watcher = watchdog_timer.WatchdogTimer(timeout)

	208

	209 if sharding == 'distribute':

	210 shared_test_collection = _TestCollection([_Test(t) for t in tests])

	211 GenerateTestCollection = lambda: shared_test_collection

	212 elif sharding == 'duplicate':

	213 GenerateTestCollection = lambda: _TestCollection([_Test(t) for t in tests])

	214 else:

	215 raise ValueError('Unknown sharding option %s. Options are: %s'

	216 % (sharding, VALID_SHARDING_OPTIONS))

	217

	218 # Use a lambda to get the test collection for each runner for cleanliness

205 workers = reraiser_thread.ReraiserThreadGroup(	219 workers = reraiser_thread.ReraiserThreadGroup(

206 [reraiser_thread.ReraiserThread(	220 [reraiser_thread.ReraiserThread(

207 _RunTestsFromQueue,	221 _RunTestsFromQueue,

208 [r, tests_collection, results, watcher, num_retries],	222 [r, GenerateTestCollection(), results, watcher, num_retries],

209 name=r.device[-4:])	223 name=r.device[-4:])

210 for r in runners])	224 for r in runners])

211 run_results = base_test_result.TestRunResults()	225 run_results = base_test_result.TestRunResults()

212 workers.StartAll()	226 workers.StartAll()

213	227

214 # Catch DeviceUnresponsiveErrors and set a warning exit code	228 # Catch DeviceUnresponsiveErrors and set a warning exit code

215 try:	229 try:

216 workers.JoinAll(watcher)	230 workers.JoinAll(watcher)

217 except android_commands.errors.DeviceUnresponsiveError as e:	231 except android_commands.errors.DeviceUnresponsiveError as e:

218 logging.error(e)	232 logging.error(e)

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
263 threads = reraiser_thread.ReraiserThreadGroup(	277 threads = reraiser_thread.ReraiserThreadGroup(

264 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])	278 [reraiser_thread.ReraiserThread(r.TearDown, name=r.device[-4:])

265 for r in runners])	279 for r in runners])

266 threads.StartAll()	280 threads.StartAll()

267 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))	281 threads.JoinAll(watchdog_timer.WatchdogTimer(timeout))

268	282

269	283

270 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug',	284 def ShardAndRunTests(runner_factory, devices, tests, build_type='Debug',

271 test_timeout=DEFAULT_TIMEOUT,	285 test_timeout=DEFAULT_TIMEOUT,

272 setup_timeout=DEFAULT_TIMEOUT,	286 setup_timeout=DEFAULT_TIMEOUT,

273 num_retries=2):	287 num_retries=2,

	288 sharding='distribute'):
	craigdh 2013/07/12 18:14:15 The behavior is quite different, I think it would The behavior is quite different, I think it would be better to expose two public functions, something like: RunTestsDistributed() and RunTestsDuplicated(). Then you can put the divergent logic from lines 208-218 in those functions and pass the lambda to a common implementation similar to the ShardAndRunTests you have here, except with the lambda instead of the sharding string. gkanwar 2013/07/12 18:22:41 I was discussing this with Frank in the gdoc, and Show quoted text On 2013/07/12 18:14:15, craigdh wrote: > The behavior is quite different, I think it would be better to expose two public > functions, something like: RunTestsDistributed() and RunTestsDuplicated(). > > Then you can put the divergent logic from lines 208-218 in those functions and > pass the lambda to a common implementation similar to the ShardAndRunTests you > have here, except with the lambda instead of the sharding string. I was discussing this with Frank in the gdoc, and I think we decided it was overcomplicating it to create the additional two functions. I would be happy to go either way though. Frank? frankf 2013/07/12 18:49:19 I still think the semantics is confusing. Sharding I still think the semantics is confusing. Sharding is synonymous with distributing. How about we call this class test_allocator.py and have two functions: ShardAndRunTests() ReplicateAndRunTests() On 2013/07/12 18:22:41, gkanwar wrote: Show quoted text > On 2013/07/12 18:14:15, craigdh wrote: > > The behavior is quite different, I think it would be better to expose two > public > > functions, something like: RunTestsDistributed() and RunTestsDuplicated(). > > > > Then you can put the divergent logic from lines 208-218 in those functions and > > pass the lambda to a common implementation similar to the ShardAndRunTests you > > have here, except with the lambda instead of the sharding string. > > I was discussing this with Frank in the gdoc, and I think we decided it was > overcomplicating it to create the additional two functions. I would be happy to > go either way though. Frank? gkanwar 2013/07/12 18:52:04 That sounds reasonable to me. Craig, does that sou Show quoted text On 2013/07/12 18:49:19, frankf wrote: > I still think the semantics is confusing. Sharding is synonymous with > distributing. How about we call this class test_allocator.py and have two > functions: > > ShardAndRunTests() > ReplicateAndRunTests() > > On 2013/07/12 18:22:41, gkanwar wrote: > > On 2013/07/12 18:14:15, craigdh wrote: > > > The behavior is quite different, I think it would be better to expose two > > public > > > functions, something like: RunTestsDistributed() and RunTestsDuplicated(). > > > > > > Then you can put the divergent logic from lines 208-218 in those functions > and > > > pass the lambda to a common implementation similar to the ShardAndRunTests > you > > > have here, except with the lambda instead of the sharding string. > > > > I was discussing this with Frank in the gdoc, and I think we decided it was > > overcomplicating it to create the additional two functions. I would be happy > to > > go either way though. Frank? > That sounds reasonable to me. Craig, does that sound good to you? craigdh 2013/07/12 19:03:36 I like the method names, but test_allocator.py is Show quoted text On 2013/07/12 18:49:19, frankf wrote: > I still think the semantics is confusing. Sharding is synonymous with > distributing. How about we call this class test_allocator.py and have two > functions: > > ShardAndRunTests() > ReplicateAndRunTests() > > On 2013/07/12 18:22:41, gkanwar wrote: > > On 2013/07/12 18:14:15, craigdh wrote: > > > The behavior is quite different, I think it would be better to expose two > > public > > > functions, something like: RunTestsDistributed() and RunTestsDuplicated(). > > > > > > Then you can put the divergent logic from lines 208-218 in those functions > and > > > pass the lambda to a common implementation similar to the ShardAndRunTests > you > > > have here, except with the lambda instead of the sharding string. > > > > I was discussing this with Frank in the gdoc, and I think we decided it was > > overcomplicating it to create the additional two functions. I would be happy > to > > go either way though. Frank? > I like the method names, but test_allocator.py is not as accurate as it could be. How about test_dispatcher.py? gkanwar 2013/07/12 19:22:32 We already have a 'dispatch.py' under pylib now (t Show quoted text On 2013/07/12 19:03:36, craigdh wrote: > On 2013/07/12 18:49:19, frankf wrote: > > I still think the semantics is confusing. Sharding is synonymous with > > distributing. How about we call this class test_allocator.py and have two > > functions: > > > > ShardAndRunTests() > > ReplicateAndRunTests() > > > > On 2013/07/12 18:22:41, gkanwar wrote: > > > On 2013/07/12 18:14:15, craigdh wrote: > > > > The behavior is quite different, I think it would be better to expose two > > > public > > > > functions, something like: RunTestsDistributed() and RunTestsDuplicated(). > > > > > > > > Then you can put the divergent logic from lines 208-218 in those functions > > and > > > > pass the lambda to a common implementation similar to the ShardAndRunTests > > you > > > > have here, except with the lambda instead of the sharding string. > > > > > > I was discussing this with Frank in the gdoc, and I think we decided it was > > > overcomplicating it to create the additional two functions. I would be happy > > to > > > go either way though. Frank? > > > > I like the method names, but test_allocator.py is not as accurate as it could > be. How about test_dispatcher.py? We already have a 'dispatch.py' under pylib now (this is where we get the devices and call this method to do the actual distribution and running). Since we're not using the terms 'distrbuted' vs. 'duplicated' now, perhaps we could name this test_distribution.py or distribute.py? I would consider both sharding and replication forms of test distribution.
274 """Run all tests on attached devices, retrying tests that don't pass.	289 """Run all tests on attached devices, retrying tests that don't pass.

275	290

276 Args:	291 Args:

277 runner_factory: callable that takes a device and index and returns a	292 runner_factory: callable that takes a device and index and returns a

278 TestRunner object.	293 TestRunner object.

279 devices: list of attached device serial numbers as strings.	294 devices: list of attached device serial numbers as strings.

280 tests: list of tests to run.	295 tests: list of tests to run.

281 build_type: either 'Debug' or 'Release'.	296 build_type: either 'Debug' or 'Release'.

282 test_timeout: watchdog timeout in seconds for running tests, defaults to the	297 test_timeout: watchdog timeout in seconds for running tests, defaults to the

283 default timeout.	298 default timeout.

284 setup_timeout: watchdog timeout in seconds for creating and cleaning up	299 setup_timeout: watchdog timeout in seconds for creating and cleaning up

285 test runners, defaults to the default timeout.	300 test runners, defaults to the default timeout.

286 num_retries: number of retries for a test.	301 num_retries: number of retries for a test.

	302 sharding: a string to indicate whether we should distribute all tests as a

	303 common pool to draw from, or duplicate all tests onto every test runner.

	304 Must be either 'distribute' or 'duplicate'

287	305

288 Returns:	306 Returns:

289 A tuple of (base_test_result.TestRunResults object, exit code).	307 A tuple of (base_test_result.TestRunResults object, exit code).

290 """	308 """

	309
	craigdh 2013/07/12 18:14:15 remove this new blank line remove this new blank line gkanwar 2013/07/12 18:22:41 Will do. Show quoted text On 2013/07/12 18:14:15, craigdh wrote: > remove this new blank line Will do.
291 if not tests:	310 if not tests:

292 logging.error('No tests to run.')	311 logging.error('No tests to run.')

293 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)	312 return (base_test_result.TestRunResults(), constants.ERROR_EXIT_CODE)

294	313

295 logging.info('Will run %d tests: %s', len(tests), str(tests))	314 logging.info('Will run %d tests: %s', len(tests), str(tests))

296 forwarder.Forwarder.KillHost(build_type)	315 forwarder.Forwarder.KillHost(build_type)

297 runners = _CreateRunners(runner_factory, devices, setup_timeout)	316 runners = _CreateRunners(runner_factory, devices, setup_timeout)

298 try:	317 try:

299 return _RunAllTests(runners, tests, num_retries, test_timeout)	318 return _RunAllTests(runners, tests, sharding, num_retries, test_timeout)

300 finally:	319 finally:

301 try:	320 try:

302 _TearDownRunners(runners, setup_timeout)	321 _TearDownRunners(runners, setup_timeout)

303 except android_commands.errors.DeviceUnresponsiveError as e:	322 except android_commands.errors.DeviceUnresponsiveError as e:

304 logging.warning('Device unresponsive during TearDown: [%s]', e)	323 logging.warning('Device unresponsive during TearDown: [%s]', e)

305 finally:	324 finally:

306 forwarder.Forwarder.KillHost(build_type)	325 forwarder.Forwarder.KillHost(build_type)

OLD	NEW

« no previous file with comments | « no previous file | build/android/pylib/browsertests/dispatch.py » ('j') | build/android/pylib/gtest/dispatch.py » ('J')