Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(310)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1702013004: Revert of Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
7 import tempfile 6 import tempfile
8 import os 7 import os
9 import uuid 8 import uuid
10 9
11 from . import revision_state 10 from . import revision_state
12 11
13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
14 from . import test_results_cache 13 from . import test_results_cache
15 14
16 # These relate to how to increase the number of repetitions during re-test
17 MINIMUM_SAMPLE_SIZE = 5
18 INCREASE_FACTOR = 1.5
19 15
20 class PerfRevisionState(revision_state.RevisionState): 16 class PerfRevisionState(revision_state.RevisionState):
21 """Contains the state and results for one revision in a perf bisect job.""" 17 """Contains the state and results for one revision in a perf bisect job."""
22 18
23 def __init__(self, *args, **kwargs): 19 def __init__(self, *args, **kwargs):
24 super(PerfRevisionState, self).__init__(*args, **kwargs) 20 super(PerfRevisionState, self).__init__(*args, **kwargs)
25 self.values = [] 21 self.values = []
26 self.mean_value = None 22 self.mean_value = None
27 self.std_dev = None 23 self.std_dev = None
28 self.repeat_count = MINIMUM_SAMPLE_SIZE
29 self._test_config = None 24 self._test_config = None
30 25
31 def _read_test_results(self, check_revision_goodness=True): 26 def _read_test_results(self):
32 """Gets the test results from GS and checks if the rev is good or bad.""" 27 """Gets the test results from GS and checks if the rev is good or bad."""
33 test_results = self._get_test_results() 28 test_results = self._get_test_results()
34 # Results will contain the keys 'results' and 'output' where output is the 29 # Results will contain the keys 'results' and 'output' where output is the
35 # stdout of the command, and 'results' is itself a dict with the key 30 # stdout of the command, and 'results' is itself a dict with the key
36 # 'values' unless the test failed, in which case 'results' will contain 31 # 'values' unless the test failed, in which case 'results' will contain
37 # the 'error' key explaining the type of error. 32 # the 'error' key explaining the type of error.
38 results = test_results['results'] 33 results = test_results['results']
39 if results.get('errors'): 34 if results.get('errors'):
40 self.status = PerfRevisionState.FAILED 35 self.status = PerfRevisionState.FAILED
41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
42 self.bisector.surface_result('MISSING_METRIC') 37 self.bisector.surface_result('MISSING_METRIC')
43 return 38 return
44 self.values += results['values'] 39 self.values = results['values']
45 if self.bisector.is_return_code_mode(): 40 if self.bisector.is_return_code_mode():
46 retcodes = test_results['retcodes'] 41 retcodes = test_results['retcodes']
47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
48 self.mean_value = overall_return_code 43 self.mean_value = overall_return_code
49 elif self.values: 44 elif self.values:
50 api = self.bisector.api 45 api = self.bisector.api
51 self.mean_value = api.m.math_utils.mean(self.values) 46 self.mean_value = api.m.math_utils.mean(self.values)
52 self.std_dev = api.m.math_utils.standard_deviation(self.values) 47 self.std_dev = api.m.math_utils.standard_deviation(self.values)
53 # Values were not found, but the test did not otherwise fail. 48 # Values were not found, but the test did not otherwise fail.
54 else: 49 else:
55 self.status = PerfRevisionState.FAILED 50 self.status = PerfRevisionState.FAILED
56 self.bisector.surface_result('MISSING_METRIC') 51 self.bisector.surface_result('MISSING_METRIC')
57 return 52 return
58 # If we have already decided on the goodness of this revision, we shouldn't
59 # recheck it.
60 if self.good or self.bad:
61 check_revision_goodness = False
62 # We cannot test the goodness of the initial rev range. 53 # We cannot test the goodness of the initial rev range.
63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and 54 if self.bisector.good_rev != self and self.bisector.bad_rev != self:
64 check_revision_goodness):
65 if self._check_revision_good(): 55 if self._check_revision_good():
66 self.good = True 56 self.good = True
67 else: 57 else:
68 self.bad = True 58 self.bad = True
69 59
70 def _write_deps_patch_file(self, build_name): 60 def _write_deps_patch_file(self, build_name):
71 """Saves the DEPS patch in a temp location and returns the file path.""" 61 """Saves the DEPS patch in a temp location and returns the file path."""
72 api = self.bisector.api 62 api = self.bisector.api
73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
74 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 64 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
116 api.m.file.remove('cleaning up patch', self.patch_file) 106 api.m.file.remove('cleaning up patch', self.patch_file)
117 except api.m.step.StepFailure: # pragma: no cover 107 except api.m.step.StepFailure: # pragma: no cover
118 print 'Could not clean up ' + self.patch_file 108 print 'Could not clean up ' + self.patch_file
119 109
120 def _get_bisect_config_for_tester(self): 110 def _get_bisect_config_for_tester(self):
121 """Copies the key-value pairs required by a tester bot to a new dict.""" 111 """Copies the key-value pairs required by a tester bot to a new dict."""
122 result = {} 112 result = {}
123 required_test_properties = { 113 required_test_properties = {
124 'truncate_percent', 114 'truncate_percent',
125 'metric', 115 'metric',
116 'max_time_minutes',
126 'command', 117 'command',
118 'repeat_count',
127 'test_type' 119 'test_type'
128 } 120 }
129 for k, v in self.bisector.bisect_config.iteritems(): 121 for k, v in self.bisector.bisect_config.iteritems():
130 if k in required_test_properties: 122 if k in required_test_properties:
131 result[k] = v 123 result[k] = v
132 result['repeat_count'] = self.repeat_count
133 self._test_config = result 124 self._test_config = result
134 return result 125 return result
135 126
136 def _do_test(self): 127 def _do_test(self):
137 """Triggers tests for a revision, either locally or via try job. 128 """Triggers tests for a revision, either locally or via try job.
138 129
139 If local testing is enabled (i.e. director/tester merged) then 130 If local testing is enabled (i.e. director/tester merged) then
140 the test will be run on the same machine. Otherwise, this posts 131 the test will be run on the same machine. Otherwise, this posts
141 a request to buildbot to download and perf-test this build. 132 a request to buildbot to download and perf-test this build.
142 """ 133 """
(...skipping 14 matching lines...) Expand all
157 'bisect_config': self._get_bisect_config_for_tester(), 148 'bisect_config': self._get_bisect_config_for_tester(),
158 'job_name': self.job_name, 149 'job_name': self.job_name,
159 }, 150 },
160 } 151 }
161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( 152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results(
162 self.job_name): # pragma: no cover 153 self.job_name): # pragma: no cover
163 return 154 return
164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + 155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL +
165 self.job_name + '.results') 156 self.job_name + '.results')
166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover 157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover
167 skip_download = self.bisector.last_tested_revision == self
168 self.bisector.last_tested_revision = self
169 overrides = perf_test_properties['properties'] 158 overrides = perf_test_properties['properties']
170 api.run_local_test_run(api.m, overrides, skip_download=skip_download) 159 api.run_local_test_run(api.m, overrides)
171 else: 160 else:
172 step_name = 'Triggering test job for ' + str(self.revision_string) 161 step_name = 'Triggering test job for ' + str(self.revision_string)
173 api.m.trigger(perf_test_properties, name=step_name) 162 api.m.trigger(perf_test_properties, name=step_name)
174 163
175 def get_next_url(self): 164 def get_next_url(self):
176 """Returns a GS URL for checking progress of a build or test.""" 165 """Returns a GS URL for checking progress of a build or test."""
177 if self.status == PerfRevisionState.BUILDING: 166 if self.status == PerfRevisionState.BUILDING:
178 return self.build_url 167 return self.build_url
179 if self.status == PerfRevisionState.TESTING: 168 if self.status == PerfRevisionState.TESTING:
180 return self.test_results_url 169 return self.test_results_url
(...skipping 14 matching lines...) Expand all
195 builder = self.bisector.get_builder_bot_for_this_platform() 184 builder = self.bisector.get_builder_bot_for_this_platform()
196 if self.status == PerfRevisionState.TESTING: 185 if self.status == PerfRevisionState.TESTING:
197 builder = self.bisector.get_perf_tester_name() 186 builder = self.bisector.get_perf_tester_name()
198 return { 187 return {
199 'type': 'buildbot', 188 'type': 'buildbot',
200 'master': master, 189 'master': master,
201 'builder': builder, 190 'builder': builder,
202 'job_name': self.job_name, 191 'job_name': self.job_name,
203 } 192 }
204 193
205 def retest(self): # pragma: no cover
206 # We need at least 5 samples for applying Mann-Whitney U test
207 # with P < 0.01, two-tailed.
208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))
209 self.status = PerfRevisionState.NEED_MORE_DATA
210 self.repeat_count = target_sample_size - len(self.values)
211 self.start_job()
212 self.bisector.wait_for_any([self])
213
214 def _get_test_results(self): 194 def _get_test_results(self):
215 """Tries to get the results of a test job from cloud storage.""" 195 """Tries to get the results of a test job from cloud storage."""
216 api = self.bisector.api 196 api = self.bisector.api
217 try: 197 try:
218 stdout = api.m.raw_io.output() 198 stdout = api.m.raw_io.output()
219 name = 'Get test results for build ' + self.commit_hash 199 name = 'Get test results for build ' + self.commit_hash
220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
221 name=name) 201 name=name)
222 except api.m.step.StepFailure: # pragma: no cover 202 except api.m.step.StepFailure: # pragma: no cover
223 self.bisector.surface_result('TEST_FAILURE') 203 self.bisector.surface_result('TEST_FAILURE')
224 return None 204 return None
225 else: 205 else:
226 return json.loads(step_result.stdout) 206 return json.loads(step_result.stdout)
227 207
228 def _check_revision_good(self): 208 def _check_revision_good(self):
229 """Determines if a revision is good or bad. 209 """Determines if a revision is good or bad.
230 210
231 Iteratively increment the sample size of the revision being tested, the last 211 Note that our current approach is to determine whether it is closer to
232 known good revision, and the first known bad revision until a relationship 212 either the 'good' and 'bad' revisions given for the bisect job.
233 of significant difference can be established between the results of the
234 revision being tested and one of the other two.
235
236 If the results do not converge towards finding a significant difference in
237 either direction, this is expected to timeout eventually. This scenario
238 should be rather rare, since it is expected that the fkbr and lkgr are
239 significantly different as a precondition.
240 213
241 Returns: 214 Returns:
242 True if the results of testing this revision are significantly different 215 True if this revision is closer to the initial good revision's value than
243 from those of testing the earliest known bad revision. 216 to the initial bad revision's value. False otherwise.
244 False if they are instead significantly different from those of testing
245 the latest known good revision.
246 """ 217 """
247 218 # TODO: Reevaluate this approach
248 while True: 219 bisector = self.bisector
249 diff_from_good = self.bisector.significantly_different( 220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)
250 self.bisector.lkgr.values, self.values) 221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)
251 diff_from_bad = self.bisector.significantly_different( 222 if distance_to_good < distance_to_bad:
252 self.bisector.fkbr.values, self.values) 223 return True
253 224 return False
254 if diff_from_good and diff_from_bad:
255 # Multiple regressions.
256 # For now, proceed bisecting the biggest difference of the means.
257 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
258 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
259 if dist_from_good > dist_from_bad:
260 # TODO(robertocn): Add way to handle the secondary regression
261 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
262 return False
263 else:
264 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
265 return True
266
267 if diff_from_good or diff_from_bad: # pragma: no cover
268 return diff_from_bad
269
270 self._next_retest() # pragma: no cover
271
272
273 def _next_retest(self): # pragma: no cover
274 """Chooses one of current, lkgr, fkbr to retest.
275
276 Look for the smallest sample and retest that. If the last tested revision
277 is tied for the smallest sample, use that to take advantage of the fact
278 that it is already downloaded and unzipped.
279 """
280 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr,
281 key=lambda x: len(x.values))
282 if (len(self.bisector.last_tested_revision.values) ==
283 next_revision_to_test.values):
284 self.bisector.last_tested_revision.retest()
285 else:
286 next_revision_to_test.retest()
287 225
288 def __repr__(self): 226 def __repr__(self):
289 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
290 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 228 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698