OLD | NEW |
---|---|
1 # Copyright 2013 The Chromium Authors. All rights reserved. | 1 # Copyright 2013 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """Runs perf tests. | 5 """Runs perf tests. |
6 | 6 |
7 Our buildbot infrastructure requires each slave to run steps serially. | 7 Our buildbot infrastructure requires each slave to run steps serially. |
8 This is sub-optimal for Android, where these steps can run independently on | 8 This is sub-optimal for Android, where these steps can run independently on |
9 multiple connected devices. | 9 multiple connected devices. |
10 | 10 |
11 The buildbots will run this script multiple times per cycle: | 11 The buildbots will run this script multiple times per cycle: |
12 - First: all steps listed in --steps will be executed in parallel using all | 12 - First: all steps listed in --steps will be executed in parallel using all |
13 connected devices. Step results will be pickled to disk. Each step has a unique | 13 connected devices. Step results will be pickled to disk. Each step has a unique |
14 name. The result code will be ignored if the step name is listed in | 14 name. The result code will be ignored if the step name is listed in |
15 --flaky-steps. | 15 --flaky-steps. |
16 The buildbot will treat this step as a regular step, and will not process any | 16 The buildbot will treat this step as a regular step, and will not process any |
17 graph data. | 17 graph data. |
18 | 18 |
19 - Then, with --print-step STEP_NAME: at this stage, we'll simply print the file | 19 - Then, with --print-step STEP_NAME: at this stage, we'll simply print the file |
20 with the step results previously saved. The buildbot will then process the graph | 20 with the step results previously saved. The buildbot will then process the graph |
21 data accordingly. | 21 data accordingly. |
22 | 22 |
23 The JSON steps file contains a dictionary in the format: | |
24 { "version": int, | |
25 "steps": { | |
26 "foo": { | |
27 "device_affinity": int, | |
28 "cmd": "script_to_execute foo" | |
29 }, | |
30 "bar": { | |
31 "device_affinity": int, | |
32 "cmd": "script_to_execute bar" | |
33 } | |
34 } | |
35 } | |
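For illustration, a minimal sketch of how a consumer of the new-format steps file might load it. The helper name is an assumption, not part of this CL:

import json

def LoadSteps(path):
  # Parse the new-format steps file described above and return the
  # {step_name: {'device_affinity': int, 'cmd': str}} mapping.
  with open(path) as f:
    data = json.load(f)
  # 'version' is an int per the format above; the CL does not say which
  # values are valid, so no version check is attempted here.
  return data['steps']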
23 | 36 |
24 The JSON steps file contains a list in the format: | 37 # TODO(bulach): remove once it rolls downstream, crbug.com/378862. |
38 The OLD JSON steps file contains a list in the format: | |
25 [ | 39 [ |
26 ["step_name_foo", "script_to_execute foo"], | 40 ["step_name_foo", "script_to_execute foo"], |
27 ["step_name_bar", "script_to_execute bar"] | 41 ["step_name_bar", "script_to_execute bar"] |
28 ] | 42 ] |
29 | 43 |
30 This preserves the order in which the steps are executed. | 44 This preserves the order in which the steps are executed. |
31 | 45 |
32 The JSON flaky steps file contains a list of step names whose results should | 46 The JSON flaky steps file contains a list of step names whose results should |
33 be ignored: | 47 be ignored: |
34 [ | 48 [ |
35 "step_name_foo", | 49 "step_name_foo", |
36 "step_name_bar" | 50 "step_name_bar" |
37 ] | 51 ] |
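A sketch of how the flaky-steps file might be read; the helper name is hypothetical:

import json

def LoadFlakySteps(path):
  # The flaky-steps file is a plain JSON list of step names, so a set
  # gives O(1) membership tests for the masking done in _LaunchPerfTest.
  with open(path) as f:
    return set(json.load(f))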
38 | 52 |
39 Note that script_to_execute must accept at least the following | 53 Note that script_to_execute must accept at least the following |
40 option: | 54 option: |
41 --device: the serial number to be passed to all adb commands. | 55 --device: the serial number to be passed to all adb commands. |
42 """ | 56 """ |
43 | 57 |
58 import collections | |
44 import datetime | 59 import datetime |
45 import logging | 60 import logging |
46 import os | 61 import os |
47 import pickle | 62 import pickle |
48 import sys | 63 import sys |
49 import threading | 64 import threading |
50 import time | 65 import time |
51 | 66 |
52 from pylib import cmd_helper | 67 from pylib import cmd_helper |
53 from pylib import constants | 68 from pylib import constants |
(...skipping 24 matching lines...) | |
78 logging.info('*' * 80) | 93 logging.info('*' * 80) |
79 print persisted_result['output'] | 94 print persisted_result['output'] |
80 | 95 |
81 return persisted_result['exit_code'] | 96 return persisted_result['exit_code'] |
82 | 97 |
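The lines above are apparently the tail of the step-printing stage: it unpickles a previously saved result, replays its output, and returns its exit code. A standalone sketch of that stage, assuming results were pickled as the dicts built in _LaunchPerfTest (the helper name is illustrative):

import os
import pickle

def PrintStep(step_name, output_dir):
  # Replay a previously persisted step result.
  with open(os.path.join(output_dir, step_name), 'rb') as f:
    persisted_result = pickle.load(f)
  print(persisted_result['output'])
  return persisted_result['exit_code']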
83 | 98 |
84 def PrintSummary(test_names): | 99 def PrintSummary(test_names): |
85 logging.info('*' * 80) | 100 logging.info('*' * 80) |
86 logging.info('Sharding summary') | 101 logging.info('Sharding summary') |
87 total_time = 0 | 102 total_time = 0 |
103 device_total_time = collections.defaultdict(int) | |
88 for test_name in test_names: | 104 for test_name in test_names: |
89 file_name = os.path.join(constants.PERF_OUTPUT_DIR, test_name) | 105 file_name = os.path.join(constants.PERF_OUTPUT_DIR, test_name) |
90 if not os.path.exists(file_name): | 106 if not os.path.exists(file_name): |
91 logging.info('%s : No status file found', test_name) | 107 logging.info('%s : No status file found', test_name) |
92 continue | 108 continue |
93 with file(file_name, 'r') as f: | 109 with file(file_name, 'r') as f: |
94 result = pickle.loads(f.read()) | 110 result = pickle.loads(f.read()) |
95 logging.info('%s : exit_code=%d in %d secs at %s', | 111 logging.info('%s : exit_code=%d in %d secs at %s', |
96 result['name'], result['exit_code'], result['total_time'], | 112 result['name'], result['exit_code'], result['total_time'], |
97 result['device']) | 113 result['device']) |
98 total_time += result['total_time'] | 114 total_time += result['total_time'] |
jbudorick 2014/06/03 13:52:52: nit: you don't really need a separate total_time a
bulach 2014/06/03 14:25:52: Done.
| |
115 device_total_time[result['device']] += result['total_time'] | |
116 for device, device_time in device_total_time.iteritems(): | |
117 logging.info('Total for device %s : %d secs', device, device_time) | |
99 logging.info('Total steps time: %d secs', total_time) | 118 logging.info('Total steps time: %d secs', total_time) |
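The new per-device totals rely on collections.defaultdict so unseen serials start at 0. The same pattern in isolation, with made-up sample data:

import collections

results = [
    {'device': '01499B32', 'total_time': 120},  # sample values, not real runs
    {'device': '01499B32', 'total_time': 45},
    {'device': '0288504A', 'total_time': 90},
]
device_total_time = collections.defaultdict(int)
for result in results:
  device_total_time[result['device']] += result['total_time']
for device, device_time in sorted(device_total_time.items()):
  print('Total for device %s : %d secs' % (device, device_time))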
100 | 119 |
101 | 120 |
102 class _HeartBeatLogger(object): | 121 class _HeartBeatLogger(object): |
103 # How often to print the heartbeat on flush(). | 122 # How often to print the heartbeat on flush(). |
104 _PRINT_INTERVAL = 30.0 | 123 _PRINT_INTERVAL = 30.0 |
105 | 124 |
106 def __init__(self): | 125 def __init__(self): |
107 """A file-like class for keeping the buildbot alive.""" | 126 """A file-like class for keeping the buildbot alive.""" |
108 self._len = 0 | 127 self._len = 0 |
(...skipping 15 matching lines...) | |
124 if now - self._tick >= _HeartBeatLogger._PRINT_INTERVAL: | 143 if now - self._tick >= _HeartBeatLogger._PRINT_INTERVAL: |
125 self._tick = now | 144 self._tick = now |
126 print '--single-step output length %d' % self._len | 145 print '--single-step output length %d' % self._len |
127 sys.stdout.flush() | 146 sys.stdout.flush() |
128 | 147 |
129 def stop(self): | 148 def stop(self): |
130 self._stopped.set() | 149 self._stopped.set() |
131 | 150 |
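Part of _HeartBeatLogger is elided above ("skipping 15 matching lines"). A self-contained sketch of the same idea, a file-like object that keeps a quiet buildbot step alive; the elided code is assumed to run a background thread that calls flush() periodically:

import sys
import threading
import time

class HeartBeatLogger(object):
  """File-like sink that prints a heartbeat at most every interval secs."""

  def __init__(self, interval=30.0):
    self._interval = interval
    self._len = 0            # bytes swallowed so far
    self._tick = time.time()
    self._stopped = threading.Event()  # assumed to be polled by a thread

  def write(self, data):
    self._len += len(data)

  def flush(self):
    now = time.time()
    if now - self._tick >= self._interval:
      self._tick = now
      print('--single-step output length %d' % self._len)
      sys.stdout.flush()

  def stop(self):
    self._stopped.set()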
132 | 151 |
133 class TestRunner(base_test_runner.BaseTestRunner): | 152 class TestRunner(base_test_runner.BaseTestRunner): |
134 def __init__(self, test_options, device, tests, flaky_tests): | 153 def __init__(self, test_options, device, shard_index, max_shard, tests, |
154 flaky_tests): | |
135 """A TestRunner instance runs a perf test on a single device. | 155 """A TestRunner instance runs a perf test on a single device. |
136 | 156 |
137 Args: | 157 Args: |
138 test_options: A PerfOptions object. | 158 test_options: A PerfOptions object. |
139 device: Device to run the tests. | 159 device: Device to run the tests. |
160 shard_index: the index of this device. | |
161 max_shard: the total number of shards. | |
140 tests: a dict mapping test_name to command. | 162 tests: a dict mapping test_name to command. |
141 flaky_tests: a list of flaky test_name. | 163 flaky_tests: a list of flaky test_name. |
142 """ | 164 """ |
143 super(TestRunner, self).__init__(device, None, 'Release') | 165 super(TestRunner, self).__init__(device, None, 'Release') |
144 self._options = test_options | 166 self._options = test_options |
167 self._shard_index = shard_index | |
168 self._max_shard = max_shard | |
145 self._tests = tests | 169 self._tests = tests |
146 self._flaky_tests = flaky_tests | 170 self._flaky_tests = flaky_tests |
147 | 171 |
148 @staticmethod | 172 @staticmethod |
149 def _IsBetter(result): | 173 def _IsBetter(result): |
150 if result['actual_exit_code'] == 0: | 174 if result['actual_exit_code'] == 0: |
151 return True | 175 return True |
152 pickled = os.path.join(constants.PERF_OUTPUT_DIR, | 176 pickled = os.path.join(constants.PERF_OUTPUT_DIR, |
153 result['name']) | 177 result['name']) |
154 if not os.path.exists(pickled): | 178 if not os.path.exists(pickled): |
155 return True | 179 return True |
156 with file(pickled, 'r') as f: | 180 with file(pickled, 'r') as f: |
157 previous = pickle.loads(f.read()) | 181 previous = pickle.loads(f.read()) |
158 return result['actual_exit_code'] < previous['actual_exit_code'] | 182 return result['actual_exit_code'] < previous['actual_exit_code'] |
159 | 183 |
160 @staticmethod | 184 @staticmethod |
161 def _SaveResult(result): | 185 def _SaveResult(result): |
162 if TestRunner._IsBetter(result): | 186 if TestRunner._IsBetter(result): |
163 with file(os.path.join(constants.PERF_OUTPUT_DIR, | 187 with file(os.path.join(constants.PERF_OUTPUT_DIR, |
164 result['name']), 'w') as f: | 188 result['name']), 'w') as f: |
165 f.write(pickle.dumps(result)) | 189 f.write(pickle.dumps(result)) |
166 | 190 |
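_IsBetter and _SaveResult together implement keep-the-best-run persistence: a step's pickled result is only overwritten when the new run's exit code is an improvement, so retries cannot clobber a good result with a worse one. The policy in isolation (path and helper name are illustrative):

import os
import pickle

def SaveIfBetter(result, output_dir):
  path = os.path.join(output_dir, result['name'])
  if os.path.exists(path):
    with open(path, 'rb') as f:
      previous = pickle.load(f)
    # Keep the previous result unless the new run succeeded or
    # failed with a strictly smaller exit code.
    if (result['actual_exit_code'] != 0 and
        result['actual_exit_code'] >= previous['actual_exit_code']):
      return
  with open(path, 'wb') as f:
    pickle.dump(result, f)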
191 def _CheckDeviceAffinity(self, test_name): | |
192 """Returns True if test_name has affinity for this shard.""" | |
193 affinity = (self._tests['steps'][test_name]['device_affinity'] % | |
jbudorick 2014/06/03 13:52:52: My point with the shard index vs the serial was th
bulach 2014/06/03 14:25:52: let's split this: 1) this is using the persistent
| |
194 self._max_shard) | |
195 if self._shard_index == affinity: | |
196 return True | |
197 logging.info('Skipping %s on %s (affinity is %s, shard index is %s)', | |
198 test_name, self.device_serial, affinity, self._shard_index) | |
199 return False | |
200 | |
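The affinity check is just a modulo: a step runs only on the shard whose index equals device_affinity % max_shard. A worked example with three shards:

max_shard = 3  # e.g. three connected devices
for device_affinity in range(5):
  shard = device_affinity % max_shard
  print('affinity %d -> shard %d' % (device_affinity, shard))
# Prints: affinity 0 -> shard 0, 1 -> 1, 2 -> 2, 3 -> 0, 4 -> 1.
# Every step lands on exactly one shard, and the mapping is stable
# across runs as long as the number of shards stays the same.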
167 def _LaunchPerfTest(self, test_name): | 201 def _LaunchPerfTest(self, test_name): |
168 """Runs a perf test. | 202 """Runs a perf test. |
169 | 203 |
170 Args: | 204 Args: |
171 test_name: the name of the test to be executed. | 205 test_name: the name of the test to be executed. |
172 | 206 |
173 Returns: | 207 Returns: |
174 A tuple containing (Output, base_test_result.ResultType) | 208 A tuple containing (Output, base_test_result.ResultType) |
175 """ | 209 """ |
210 if not self._CheckDeviceAffinity(test_name): | |
211 return '', base_test_result.ResultType.PASS | |
212 | |
176 try: | 213 try: |
177 logging.warning('Unmapping device ports') | 214 logging.warning('Unmapping device ports') |
178 forwarder.Forwarder.UnmapAllDevicePorts(self.device) | 215 forwarder.Forwarder.UnmapAllDevicePorts(self.device) |
179 self.device.old_interface.RestartAdbdOnDevice() | 216 self.device.old_interface.RestartAdbdOnDevice() |
180 except Exception as e: | 217 except Exception as e: |
181 logging.error('Exception when tearing down device %s', e) | 218 logging.error('Exception when tearing down device %s', e) |
182 | 219 |
183 cmd = ('%s --device %s' % | 220 cmd = ('%s --device %s' % |
184 (self._tests[test_name], self.device.old_interface.GetDevice())) | 221 (self._tests['steps'][test_name]['cmd'], |
222 self.device_serial)) | |
185 logging.info('%s : %s', test_name, cmd) | 223 logging.info('%s : %s', test_name, cmd) |
186 start_time = datetime.datetime.now() | 224 start_time = datetime.datetime.now() |
187 | 225 |
188 timeout = 5400 | 226 timeout = 5400 |
189 if self._options.no_timeout: | 227 if self._options.no_timeout: |
190 timeout = None | 228 timeout = None |
191 full_cmd = cmd | 229 full_cmd = cmd |
192 if self._options.dry_run: | 230 if self._options.dry_run: |
193 full_cmd = 'echo %s' % cmd | 231 full_cmd = 'echo %s' % cmd |
194 | 232 |
195 logfile = sys.stdout | 233 logfile = sys.stdout |
196 if self._options.single_step: | 234 if self._options.single_step: |
197 # Just print a heart-beat so that the outer buildbot scripts won't time out | 235 # Just print a heart-beat so that the outer buildbot scripts won't time out |
198 # without response. | 236 # without response. |
199 logfile = _HeartBeatLogger() | 237 logfile = _HeartBeatLogger() |
200 cwd = os.path.abspath(constants.DIR_SOURCE_ROOT) | 238 cwd = os.path.abspath(constants.DIR_SOURCE_ROOT) |
201 if full_cmd.startswith('src/'): | 239 if full_cmd.startswith('src/'): |
202 cwd = os.path.abspath(os.path.join(constants.DIR_SOURCE_ROOT, os.pardir)) | 240 cwd = os.path.abspath(os.path.join(constants.DIR_SOURCE_ROOT, os.pardir)) |
203 try: | 241 try: |
204 exit_code, output = cmd_helper.GetCmdStatusAndOutputWithTimeout( | 242 exit_code, output = cmd_helper.GetCmdStatusAndOutputWithTimeout( |
205 full_cmd, timeout, cwd=cwd, shell=True, logfile=logfile) | 243 full_cmd, timeout, cwd=cwd, shell=True, logfile=logfile) |
206 finally: | 244 finally: |
207 if self._options.single_step: | 245 if self._options.single_step: |
208 logfile.stop() | 246 logfile.stop() |
209 end_time = datetime.datetime.now() | 247 end_time = datetime.datetime.now() |
210 if exit_code is None: | 248 if exit_code is None: |
211 exit_code = -1 | 249 exit_code = -1 |
212 logging.info('%s : exit_code=%d in %d secs at %s', | 250 logging.info('%s : exit_code=%d in %d secs at %s', |
213 test_name, exit_code, (end_time - start_time).seconds, | 251 test_name, exit_code, (end_time - start_time).seconds, |
214 self.device.old_interface.GetDevice()) | 252 self.device_serial) |
jbudorick 2014/06/03 13:52:52: Sneaking in ahead of me here, I see. (In the futu
bulach 2014/06/03 14:25:52: want me to keep the old way? happy either way :)
| |
215 result_type = base_test_result.ResultType.FAIL | 253 result_type = base_test_result.ResultType.FAIL |
216 if exit_code == 0: | 254 if exit_code == 0: |
217 result_type = base_test_result.ResultType.PASS | 255 result_type = base_test_result.ResultType.PASS |
218 actual_exit_code = exit_code | 256 actual_exit_code = exit_code |
219 if test_name in self._flaky_tests: | 257 if test_name in self._flaky_tests: |
220 # The exit_code is used at the second stage when printing the | 258 # The exit_code is used at the second stage when printing the |
221 # test output. If the test is flaky, force it to "0" to get that step green | 259 # test output. If the test is flaky, force it to "0" to get that step green |
222 # whilst still gathering data for the perf dashboards. | 260 # whilst still gathering data for the perf dashboards. |
223 # The result_type is used by the test_dispatcher to retry the test. | 261 # The result_type is used by the test_dispatcher to retry the test. |
224 exit_code = 0 | 262 exit_code = 0 |
225 | 263 |
226 persisted_result = { | 264 persisted_result = { |
227 'name': test_name, | 265 'name': test_name, |
228 'output': output, | 266 'output': output, |
229 'exit_code': exit_code, | 267 'exit_code': exit_code, |
230 'actual_exit_code': actual_exit_code, | 268 'actual_exit_code': actual_exit_code, |
231 'result_type': result_type, | 269 'result_type': result_type, |
232 'total_time': (end_time - start_time).seconds, | 270 'total_time': (end_time - start_time).seconds, |
233 'device': self.device.old_interface.GetDevice(), | 271 'device': self.device_serial, |
234 'cmd': cmd, | 272 'cmd': cmd, |
235 } | 273 } |
236 self._SaveResult(persisted_result) | 274 self._SaveResult(persisted_result) |
237 | 275 |
238 return (output, result_type) | 276 return (output, result_type) |
239 | 277 |
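To summarize the exit-code plumbing in _LaunchPerfTest: a timed-out command yields exit_code None, which is normalized to -1; flaky steps then have exit_code forced to 0 so the buildbot step stays green, while result_type and actual_exit_code still record the real outcome. The policy compressed into one function (names are illustrative):

def ClassifyExitCode(exit_code, test_name, flaky_tests):
  # None means the command hit the timeout.
  if exit_code is None:
    exit_code = -1
  result_type = 'PASS' if exit_code == 0 else 'FAIL'
  actual_exit_code = exit_code
  if test_name in flaky_tests:
    # Green on the waterfall, but the dispatcher still sees FAIL and retries.
    exit_code = 0
  return exit_code, actual_exit_code, result_type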
240 def RunTest(self, test_name): | 278 def RunTest(self, test_name): |
241 """Run a perf test on the device. | 279 """Run a perf test on the device. |
242 | 280 |
243 Args: | 281 Args: |
244 test_name: String to use for logging the test result. | 282 test_name: String to use for logging the test result. |
245 | 283 |
246 Returns: | 284 Returns: |
247 A tuple of (TestRunResults, retry). | 285 A tuple of (TestRunResults, retry). |
248 """ | 286 """ |
249 _, result_type = self._LaunchPerfTest(test_name) | 287 _, result_type = self._LaunchPerfTest(test_name) |
250 results = base_test_result.TestRunResults() | 288 results = base_test_result.TestRunResults() |
251 results.AddResult(base_test_result.BaseTestResult(test_name, result_type)) | 289 results.AddResult(base_test_result.BaseTestResult(test_name, result_type)) |
252 retry = None | 290 retry = None |
253 if not results.DidRunPass(): | 291 if not results.DidRunPass(): |
254 retry = test_name | 292 retry = test_name |
255 return results, retry | 293 return results, retry |