scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py - Issue 1610203003: Iteratively increase sample size for good/bad classification.

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Making initial regression check have a timeout. Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« scripts/slave/recipe_modules/auto_bisect/bisector.py ('K') | « scripts/slave/recipe_modules/auto_bisect/example.expected/windows_x64_bisector.json ('k') | scripts/slave/recipe_modules/auto_bisect/resources/significantly_different.py » ('j') | scripts/slave/recipe_modules/auto_bisect/resources/significantly_different.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2015 The Chromium Authors. All rights reserved.	1 # Copyright 2015 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import json	5 import json

	6 import math

6 import tempfile	7 import tempfile

7 import os	8 import os

8 import uuid	9 import uuid

9	10

10 from . import revision_state	11 from . import revision_state

11	12

12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover	13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover

13 from . import test_results_cache	14 from . import test_results_cache

14	15

	16 # These relate to how to increase the number of repetitions during re-test

	17 MINIMUM_SAMPLE_SIZE = 5

	18 INCREASE_FACTOR = 1.5

15	19

16 class PerfRevisionState(revision_state.RevisionState):	20 class PerfRevisionState(revision_state.RevisionState):

17 """Contains the state and results for one revision in a perf bisect job."""	21 """Contains the state and results for one revision in a perf bisect job."""

18	22

19 def __init__(self, args, *kwargs):	23 def __init__(self, args, *kwargs):

20 super(PerfRevisionState, self).__init__(args, *kwargs)	24 super(PerfRevisionState, self).__init__(args, *kwargs)

21 self.values = []	25 self.values = []

22 self.mean_value = None	26 self.mean_value = None

23 self.std_dev = None	27 self.std_dev = None

	28 self.repeat_count = MINIMUM_SAMPLE_SIZE

24 self._test_config = None	29 self._test_config = None

25	30

26 def _read_test_results(self):	31 def _read_test_results(self, check_revision_goodness=True):

27 """Gets the test results from GS and checks if the rev is good or bad."""	32 """Gets the test results from GS and checks if the rev is good or bad."""

28 test_results = self._get_test_results()	33 test_results = self._get_test_results()

29 # Results will contain the keys 'results' and 'output' where output is the	34 # Results will contain the keys 'results' and 'output' where output is the

30 # stdout of the command, and 'results' is itself a dict with the key	35 # stdout of the command, and 'results' is itself a dict with the key

31 # 'values' unless the test failed, in which case 'results' will contain	36 # 'values' unless the test failed, in which case 'results' will contain

32 # the 'error' key explaining the type of error.	37 # the 'error' key explaining the type of error.

33 results = test_results['results']	38 results = test_results['results']

34 if results.get('errors'):	39 if results.get('errors'):

35 self.status = PerfRevisionState.FAILED	40 self.status = PerfRevisionState.FAILED

36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover	41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover

37 self.bisector.surface_result('MISSING_METRIC')	42 self.bisector.surface_result('MISSING_METRIC')

38 return	43 return

39 self.values = results['values']	44 self.values = (self.values or []) + results['values']
	dtu 2016/02/05 00:18:02 Why do you need this? Isn't self.values already in Why do you need this? Isn't self.values already initialized to []?
40 if self.bisector.is_return_code_mode():	45 if self.bisector.is_return_code_mode():

41 retcodes = test_results['retcodes']	46 retcodes = test_results['retcodes']

42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1	47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1

43 self.mean_value = overall_return_code	48 self.mean_value = overall_return_code

44 elif self.values:	49 elif self.values:

45 api = self.bisector.api	50 api = self.bisector.api

46 self.mean_value = api.m.math_utils.mean(self.values)	51 self.mean_value = api.m.math_utils.mean(self.values)

47 self.std_dev = api.m.math_utils.standard_deviation(self.values)	52 self.std_dev = api.m.math_utils.standard_deviation(self.values)

48 # Values were not found, but the test did not otherwise fail.	53 # Values were not found, but the test did not otherwise fail.

49 else:	54 else:

50 self.status = PerfRevisionState.FAILED	55 self.status = PerfRevisionState.FAILED

51 self.bisector.surface_result('MISSING_METRIC')	56 self.bisector.surface_result('MISSING_METRIC')

52 return	57 return

53 # We cannot test the goodness of the initial rev range.	58 # We cannot test the goodness of the initial rev range.

54 if self.bisector.good_rev != self and self.bisector.bad_rev != self:	59 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and

	60 check_revision_goodness):

55 if self._check_revision_good():	61 if self._check_revision_good():

56 self.good = True	62 self.good = True

57 else:	63 else:

58 self.bad = True	64 self.bad = True

59	65

60 def _write_deps_patch_file(self, build_name):	66 def _write_deps_patch_file(self, build_name):

61 """Saves the DEPS patch in a temp location and returns the file path."""	67 """Saves the DEPS patch in a temp location and returns the file path."""

62 api = self.bisector.api	68 api = self.bisector.api

63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))	69 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))

64 api.m.file.write('Saving diff patch for ' + str(self.revision_string),	70 api.m.file.write('Saving diff patch for ' + str(self.revision_string),

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
106 api.m.file.remove('cleaning up patch', self.patch_file)	112 api.m.file.remove('cleaning up patch', self.patch_file)

107 except api.m.step.StepFailure: # pragma: no cover	113 except api.m.step.StepFailure: # pragma: no cover

108 print 'Could not clean up ' + self.patch_file	114 print 'Could not clean up ' + self.patch_file

109	115

110 def _get_bisect_config_for_tester(self):	116 def _get_bisect_config_for_tester(self):

111 """Copies the key-value pairs required by a tester bot to a new dict."""	117 """Copies the key-value pairs required by a tester bot to a new dict."""

112 result = {}	118 result = {}

113 required_test_properties = {	119 required_test_properties = {

114 'truncate_percent',	120 'truncate_percent',

115 'metric',	121 'metric',

116 'max_time_minutes',

117 'command',	122 'command',

118 'repeat_count',

119 'test_type'	123 'test_type'

120 }	124 }

121 for k, v in self.bisector.bisect_config.iteritems():	125 for k, v in self.bisector.bisect_config.iteritems():

122 if k in required_test_properties:	126 if k in required_test_properties:

123 result[k] = v	127 result[k] = v

	128 result['repeat_count'] = self.repeat_count

124 self._test_config = result	129 self._test_config = result

125 return result	130 return result

126	131

127 def _do_test(self):	132 def _do_test(self):

128 """Triggers tests for a revision, either locally or via try job.	133 """Triggers tests for a revision, either locally or via try job.

129	134

130 If local testing is enabled (i.e. director/tester merged) then	135 If local testing is enabled (i.e. director/tester merged) then

131 the test will be run on the same machine. Otherwise, this posts	136 the test will be run on the same machine. Otherwise, this posts

132 a request to buildbot to download and perf-test this build.	137 a request to buildbot to download and perf-test this build.

133 """	138 """

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
184 builder = self.bisector.get_builder_bot_for_this_platform()	189 builder = self.bisector.get_builder_bot_for_this_platform()

185 if self.status == PerfRevisionState.TESTING:	190 if self.status == PerfRevisionState.TESTING:

186 builder = self.bisector.get_perf_tester_name()	191 builder = self.bisector.get_perf_tester_name()

187 return {	192 return {

188 'type': 'buildbot',	193 'type': 'buildbot',

189 'master': master,	194 'master': master,

190 'builder': builder,	195 'builder': builder,

191 'job_name': self.job_name,	196 'job_name': self.job_name,

192 }	197 }

193	198

	199 def retest(self): # pragma: no cover

	200 # We need at least 5 samples for the hypothesis testing to work properly.
	dtu 2016/02/05 00:18:02 Sorry, not strictly true as written. Only true wit Sorry, not strictly true as written. Only true with these assumptions: - Mann-Whitney U - 2-sided - Significance level of 0.01
	201 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))

	202 self.status = PerfRevisionState.NEED_MORE_DATA

	203 self.repeat_count = target_sample_size - len(self.values)

	204 self.start_job()

	205 self.bisector.wait_for_any([self])

	206

194 def _get_test_results(self):	207 def _get_test_results(self):

195 """Tries to get the results of a test job from cloud storage."""	208 """Tries to get the results of a test job from cloud storage."""

196 api = self.bisector.api	209 api = self.bisector.api

197 try:	210 try:

198 stdout = api.m.raw_io.output()	211 stdout = api.m.raw_io.output()

199 name = 'Get test results for build ' + self.commit_hash	212 name = 'Get test results for build ' + self.commit_hash

200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,	213 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,

201 name=name)	214 name=name)

202 except api.m.step.StepFailure: # pragma: no cover	215 except api.m.step.StepFailure: # pragma: no cover

203 self.bisector.surface_result('TEST_FAILURE')	216 self.bisector.surface_result('TEST_FAILURE')

204 return None	217 return None

205 else:	218 else:

206 return json.loads(step_result.stdout)	219 return json.loads(step_result.stdout)

207	220

208 def _check_revision_good(self):	221 def _check_revision_good(self):

209 """Determines if a revision is good or bad.	222 """Determines if a revision is good or bad.

210	223

211 Note that our current approach is to determine whether it is closer to	224 Iteratively increment the sample size of the revision being tested, the last

212 either the 'good' and 'bad' revisions given for the bisect job.	225 known good revision, and the first known bad revision until a relationship

	226 of significant difference can be established betweeb the results of the

	227 revision being tested and one of the other two.

	228

	229 If the results do not converge towards finding a significant difference in

	230 either direction, this is expected to timeout eventually. This scenario

	231 should be rather rare, since it is expected that the fkbr and lkgr are

	232 significantly different as a precondition.

213	233

214 Returns:	234 Returns:

215 True if this revision is closer to the initial good revision's value than	235 True if the results of testing this revision are significantly different

216 to the initial bad revision's value. False otherwise.	236 from those of testing the earliest known bad revision.

	237 False if they are instead significantly different form those of testing

	238 the latest knwon good revision.

217 """	239 """

218 # TODO: Reevaluate this approach	240 diff_from_good = self.bisector.significantly_different(
	dtu 2016/02/05 00:18:02 If you reorganize this function, you can avoid the If you reorganize this function, you can avoid the code duplication. while True: diff_from_good = abc diff_from_bad = xyz if diff_from_good and diff_from_bad: # Calculate stuff and return. elif diff_from_good or diff_from_bad: # Return. else: # Retest.
219 bisector = self.bisector	241 self.bisector.lkgr.values, self.values)

220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)	242 diff_from_bad = self.bisector.significantly_different(

221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)	243 self.bisector.fkbr.values, self.values)

222 if distance_to_good < distance_to_bad:	244

223 return True	245 while not (diff_from_good or diff_from_bad): # pragma: no cover

224 return False	246 min(self.bisector.lkgr, self, self.bisector.fkbr,

	247 key=lambda x: len(x.values)).retest()

	248 diff_from_good = self.bisector.significantly_different(

	249 self.bisector.lkgr.values, self.values)

	250 diff_from_bad = self.bisector.significantly_different(

	251 self.bisector.fkbr.values, self.values)

	252

	253 if diff_from_good and diff_from_bad:

	254 # Multiple regressions.

	255 # For now, proceed bisecting the biggest difference of the means.

	256 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)

	257 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)

	258 if dist_from_good > dist_from_bad:

	259 # TODO(robertocn): Add way to handle the secondary regression

	260 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)

	261 return False

	262 else:

	263 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)

	264 return True

	265 return diff_from_bad # pragma: no cover

225	266

226 def __repr__(self):	267 def __repr__(self):

227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %	268 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %

228 (self.commit_pos, self.values, self.mean_value, self.std_dev))	269 (self.commit_pos, self.values, self.mean_value, self.std_dev))

OLD	NEW