scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py - Issue 1610203003: Iteratively increase sample size for good/bad classification.

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Rebasing Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/example.expected/windows_x64_bisector.json ('k') | scripts/slave/recipe_modules/auto_bisect/resources/significantly_different.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import json	5 import json

	6 import math

6 import tempfile	7 import tempfile

7 import os	8 import os

8 import uuid	9 import uuid

9	10

10 from . import revision_state	11 from . import revision_state

11	12

12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover	13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover

13 from . import test_results_cache	14 from . import test_results_cache

14	15

	16 # These relate to how to increase the number of repetitions during re-test

	17 MINIMUM_SAMPLE_SIZE = 5

	18 INCREASE_FACTOR = 1.5

15	19

16 class PerfRevisionState(revision_state.RevisionState):	20 class PerfRevisionState(revision_state.RevisionState):

17 """Contains the state and results for one revision in a perf bisect job."""	21 """Contains the state and results for one revision in a perf bisect job."""

18	22

19 def __init__(self, *args, **kwargs):	23 def __init__(self, *args, **kwargs):

20 super(PerfRevisionState, self).__init__(*args, **kwargs)	24 super(PerfRevisionState, self).__init__(*args, **kwargs)

21 self.values = []	25 self.values = []

22 self.mean_value = None	26 self.mean_value = None

23 self.std_dev = None	27 self.std_dev = None

	28 self.repeat_count = MINIMUM_SAMPLE_SIZE

24 self._test_config = None	29 self._test_config = None

25	30

26 def _read_test_results(self):	31 def _read_test_results(self, check_revision_goodness=True):

27 """Gets the test results from GS and checks if the rev is good or bad."""	32 """Gets the test results from GS and checks if the rev is good or bad."""

28 test_results = self._get_test_results()	33 test_results = self._get_test_results()

29 # Results will contain the keys 'results' and 'output' where output is the	34 # Results will contain the keys 'results' and 'output' where output is the

30 # stdout of the command, and 'results' is itself a dict with the key	35 # stdout of the command, and 'results' is itself a dict with the key

31 # 'values' unless the test failed, in which case 'results' will contain	36 # 'values' unless the test failed, in which case 'results' will contain

32 # the 'error' key explaining the type of error.	37 # the 'error' key explaining the type of error.

33 results = test_results['results']	38 results = test_results['results']

34 if results.get('errors'):	39 if results.get('errors'):

35 self.status = PerfRevisionState.FAILED	40 self.status = PerfRevisionState.FAILED

36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover	41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover

37 self.bisector.surface_result('MISSING_METRIC')	42 self.bisector.surface_result('MISSING_METRIC')

38 return	43 return

39 self.values = results['values']	44 self.values += results['values']

40 if self.bisector.is_return_code_mode():	45 if self.bisector.is_return_code_mode():

41 retcodes = test_results['retcodes']	46 retcodes = test_results['retcodes']

42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1	47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1

43 self.mean_value = overall_return_code	48 self.mean_value = overall_return_code

44 elif self.values:	49 elif self.values:

45 api = self.bisector.api	50 api = self.bisector.api

46 self.mean_value = api.m.math_utils.mean(self.values)	51 self.mean_value = api.m.math_utils.mean(self.values)

47 self.std_dev = api.m.math_utils.standard_deviation(self.values)	52 self.std_dev = api.m.math_utils.standard_deviation(self.values)

48 # Values were not found, but the test did not otherwise fail.	53 # Values were not found, but the test did not otherwise fail.

49 else:	54 else:

50 self.status = PerfRevisionState.FAILED	55 self.status = PerfRevisionState.FAILED

51 self.bisector.surface_result('MISSING_METRIC')	56 self.bisector.surface_result('MISSING_METRIC')

52 return	57 return

	58 # If we have already decided on the goodness of this revision, we shouldn't

	59 # recheck it.

	60 if self.good or self.bad:

	61 check_revision_goodness = False

53 # We cannot test the goodness of the initial rev range.	62 # We cannot test the goodness of the initial rev range.

54 if self.bisector.good_rev != self and self.bisector.bad_rev != self:	63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and

	64 check_revision_goodness):

55 if self._check_revision_good():	65 if self._check_revision_good():

56 self.good = True	66 self.good = True

57 else:	67 else:

58 self.bad = True	68 self.bad = True

59	69

60 def _write_deps_patch_file(self, build_name):	70 def _write_deps_patch_file(self, build_name):

61 """Saves the DEPS patch in a temp location and returns the file path."""	71 """Saves the DEPS patch in a temp location and returns the file path."""

62 api = self.bisector.api	72 api = self.bisector.api

63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))	73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))

64 api.m.file.write('Saving diff patch for ' + str(self.revision_string),	74 api.m.file.write('Saving diff patch for ' + str(self.revision_string),

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
106 api.m.file.remove('cleaning up patch', self.patch_file)	116 api.m.file.remove('cleaning up patch', self.patch_file)

107 except api.m.step.StepFailure: # pragma: no cover	117 except api.m.step.StepFailure: # pragma: no cover

108 print 'Could not clean up ' + self.patch_file	118 print 'Could not clean up ' + self.patch_file

109	119

110 def _get_bisect_config_for_tester(self):	120 def _get_bisect_config_for_tester(self):

111 """Copies the key-value pairs required by a tester bot to a new dict."""	121 """Copies the key-value pairs required by a tester bot to a new dict."""

112 result = {}	122 result = {}

113 required_test_properties = {	123 required_test_properties = {

114 'truncate_percent',	124 'truncate_percent',

115 'metric',	125 'metric',

116 'max_time_minutes',

117 'command',	126 'command',

118 'repeat_count',

119 'test_type'	127 'test_type'

120 }	128 }

121 for k, v in self.bisector.bisect_config.iteritems():	129 for k, v in self.bisector.bisect_config.iteritems():

122 if k in required_test_properties:	130 if k in required_test_properties:

123 result[k] = v	131 result[k] = v

	132 result['repeat_count'] = self.repeat_count

124 self._test_config = result	133 self._test_config = result

125 return result	134 return result

126	135

127 def _do_test(self):	136 def _do_test(self):

128 """Triggers tests for a revision, either locally or via try job.	137 """Triggers tests for a revision, either locally or via try job.

129	138

130 If local testing is enabled (i.e. director/tester merged) then	139 If local testing is enabled (i.e. director/tester merged) then

131 the test will be run on the same machine. Otherwise, this posts	140 the test will be run on the same machine. Otherwise, this posts

132 a request to buildbot to download and perf-test this build.	141 a request to buildbot to download and perf-test this build.

133 """	142 """

(...skipping 14 matching lines...) Expand all Loading...
148 'bisect_config': self._get_bisect_config_for_tester(),	157 'bisect_config': self._get_bisect_config_for_tester(),

149 'job_name': self.job_name,	158 'job_name': self.job_name,

150 },	159 },

151 }	160 }

152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results(	161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results(

153 self.job_name): # pragma: no cover	162 self.job_name): # pragma: no cover

154 return	163 return

155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL +	164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL +

156 self.job_name + '.results')	165 self.job_name + '.results')

157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover	166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover

	167 skip_download = self.bisector.last_tested_revision == self

	168 self.bisector.last_tested_revision = self

158 overrides = perf_test_properties['properties']	169 overrides = perf_test_properties['properties']

159 api.run_local_test_run(api.m, overrides)	170 api.run_local_test_run(api.m, overrides, skip_download=skip_download)

160 else:	171 else:

161 step_name = 'Triggering test job for ' + str(self.revision_string)	172 step_name = 'Triggering test job for ' + str(self.revision_string)

162 api.m.trigger(perf_test_properties, name=step_name)	173 api.m.trigger(perf_test_properties, name=step_name)

163	174

164 def get_next_url(self):	175 def get_next_url(self):

165 """Returns a GS URL for checking progress of a build or test."""	176 """Returns a GS URL for checking progress of a build or test."""

166 if self.status == PerfRevisionState.BUILDING:	177 if self.status == PerfRevisionState.BUILDING:

167 return self.build_url	178 return self.build_url

168 if self.status == PerfRevisionState.TESTING:	179 if self.status == PerfRevisionState.TESTING:

169 return self.test_results_url	180 return self.test_results_url

(...skipping 14 matching lines...) Expand all Loading...
184 builder = self.bisector.get_builder_bot_for_this_platform()	195 builder = self.bisector.get_builder_bot_for_this_platform()

185 if self.status == PerfRevisionState.TESTING:	196 if self.status == PerfRevisionState.TESTING:

186 builder = self.bisector.get_perf_tester_name()	197 builder = self.bisector.get_perf_tester_name()

187 return {	198 return {

188 'type': 'buildbot',	199 'type': 'buildbot',

189 'master': master,	200 'master': master,

190 'builder': builder,	201 'builder': builder,

191 'job_name': self.job_name,	202 'job_name': self.job_name,

192 }	203 }

193	204

	205 def retest(self): # pragma: no cover

	206 # We need at least 5 samples for applying Mann-Whitney U test

	207 # with P < 0.01, two-tailed.

	208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))

	209 self.status = PerfRevisionState.NEED_MORE_DATA

	210 self.repeat_count = target_sample_size - len(self.values)

	211 self.start_job()

	212 self.bisector.wait_for_any([self])

	213

194 def _get_test_results(self):	214 def _get_test_results(self):

195 """Tries to get the results of a test job from cloud storage."""	215 """Tries to get the results of a test job from cloud storage."""

196 api = self.bisector.api	216 api = self.bisector.api

197 try:	217 try:

198 stdout = api.m.raw_io.output()	218 stdout = api.m.raw_io.output()

199 name = 'Get test results for build ' + self.commit_hash	219 name = 'Get test results for build ' + self.commit_hash

200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,	220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,

201 name=name)	221 name=name)

202 except api.m.step.StepFailure: # pragma: no cover	222 except api.m.step.StepFailure: # pragma: no cover

203 self.bisector.surface_result('TEST_FAILURE')	223 self.bisector.surface_result('TEST_FAILURE')

204 return None	224 return None

205 else:	225 else:

206 return json.loads(step_result.stdout)	226 return json.loads(step_result.stdout)

207	227

208 def _check_revision_good(self):	228 def _check_revision_good(self):

209 """Determines if a revision is good or bad.	229 """Determines if a revision is good or bad.

210	230

211 Note that our current approach is to determine whether it is closer to	231 Iteratively increment the sample size of the revision being tested, the last

212 either the 'good' and 'bad' revisions given for the bisect job.	232 known good revision, and the first known bad revision until a relationship

	233 of significant difference can be established between the results of the

	234 revision being tested and one of the other two.

	235

	236 If the results do not converge towards finding a significant difference in

	237 either direction, this is expected to timeout eventually. This scenario

	238 should be rather rare, since it is expected that the fkbr and lkgr are

	239 significantly different as a precondition.

213	240

214 Returns:	241 Returns:

215 True if this revision is closer to the initial good revision's value than	242 True if the results of testing this revision are significantly different

216 to the initial bad revision's value. False otherwise.	243 from those of testing the earliest known bad revision.

	244 False if they are instead significantly different from those of testing

	245 the latest known good revision.

217 """	246 """

218 # TODO: Reevaluate this approach	247 if self.bisector.is_return_code_mode():

219 bisector = self.bisector	248 return self.mean_value == 0

220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)	249

221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)	250 while True:

222 if distance_to_good < distance_to_bad:	251 diff_from_good = self.bisector.significantly_different(

223 return True	252 self.bisector.lkgr.values, self.values)

224 return False	253 diff_from_bad = self.bisector.significantly_different(

	254 self.bisector.fkbr.values, self.values)

	255

	256 if diff_from_good and diff_from_bad:

	257 # Multiple regressions.

	258 # For now, proceed bisecting the biggest difference of the means.

	259 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)

	260 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)

	261 if dist_from_good > dist_from_bad:

	262 # TODO(robertocn): Add way to handle the secondary regression

	263 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)

	264 return False

	265 else:

	266 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)

	267 return True

	268

	269 if diff_from_good or diff_from_bad: # pragma: no cover

	270 return diff_from_bad

	271

	272 self._next_retest() # pragma: no cover

	273

	274

	275 def _next_retest(self): # pragma: no cover

	276 """Chooses one of current, lkgr, fkbr to retest.

	277

	278 Look for the smallest sample and retest that. If the last tested revision

	279 is tied for the smallest sample, use that to take advantage of the fact

	280 that it is already downloaded and unzipped.

	281 """

	282 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr,

	283 key=lambda x: len(x.values))

	284 if (len(self.bisector.last_tested_revision.values) ==

	285 next_revision_to_test.values):

	286 self.bisector.last_tested_revision.retest()

	287 else:

	288 next_revision_to_test.retest()

225	289

226 def __repr__(self):	290 def __repr__(self):

227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %	291 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %

228 (self.commit_pos, self.values, self.mean_value, self.std_dev))	292 (self.commit_pos, self.values, self.mean_value, self.std_dev))

OLD	NEW