Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1420)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Fixing multiple problems Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
6 import tempfile 7 import tempfile
7 import os 8 import os
8 import uuid 9 import uuid
9 10
10 from . import revision_state 11 from . import revision_state
11 12
12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
13 from . import test_results_cache 14 from . import test_results_cache
14 15
15 16
16 class PerfRevisionState(revision_state.RevisionState): 17 class PerfRevisionState(revision_state.RevisionState):
17 18
18 """Contains the state and results for one revision in a perf bisect job.""" 19 """Contains the state and results for one revision in a perf bisect job."""
19 20
20 def __init__(self, *args, **kwargs): 21 def __init__(self, *args, **kwargs):
21 super(PerfRevisionState, self).__init__(*args, **kwargs) 22 super(PerfRevisionState, self).__init__(*args, **kwargs)
22 self.values = [] 23 self.values = []
23 self.mean_value = None 24 self.mean_value = None
24 self.std_dev = None 25 self.std_dev = None
26 self.custom_reps = None
25 self._test_config = None 27 self._test_config = None
26 28
27 def _read_test_results(self): 29 def _read_test_results(self, check_revision_goodness=True):
28 """Gets the test results from GS and checks if the rev is good or bad.""" 30 """Gets the test results from GS and checks if the rev is good or bad."""
29 test_results = self._get_test_results() 31 test_results = self._get_test_results()
30 # Results will contain the keys 'results' and 'output' where output is the 32 # Results will contain the keys 'results' and 'output' where output is the
31 # stdout of the command, and 'results' is itself a dict with the key 33 # stdout of the command, and 'results' is itself a dict with the key
32 # 'values' unless the test failed, in which case 'results' will contain 34 # 'values' unless the test failed, in which case 'results' will contain
33 # the 'error' key explaining the type of error. 35 # the 'error' key explaining the type of error.
34 results = test_results['results'] 36 results = test_results['results']
35 if results.get('errors'): 37 if results.get('errors'):
36 self.status = PerfRevisionState.FAILED 38 self.status = PerfRevisionState.FAILED
37 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 39 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
38 self.bisector.surface_result('MISSING_METRIC') 40 self.bisector.surface_result('MISSING_METRIC')
39 return 41 return
40 self.values = results['values'] 42 self.values = self.values or [] + results['values']
qyearsley 2016/01/26 19:26:00 `or` is less tightly binding than `+`, so this is
RobertoCN 2016/02/01 17:29:50 Done.
41 if self.bisector.is_return_code_mode(): 43 if self.bisector.is_return_code_mode():
42 retcodes = test_results['retcodes'] 44 retcodes = test_results['retcodes']
43 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 45 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
44 self.mean_value = overall_return_code 46 self.mean_value = overall_return_code
45 elif self.values: 47 elif self.values:
46 api = self.bisector.api 48 api = self.bisector.api
47 self.mean_value = api.m.math_utils.mean(self.values) 49 self.mean_value = api.m.math_utils.mean(self.values)
48 self.std_dev = api.m.math_utils.standard_deviation(self.values) 50 self.std_dev = api.m.math_utils.standard_deviation(self.values)
49 # Values were not found, but the test did not otherwise fail. 51 # Values were not found, but the test did not otherwise fail.
50 else: 52 else:
51 self.status = PerfRevisionState.FAILED 53 self.status = PerfRevisionState.FAILED
52 self.bisector.surface_result('MISSING_METRIC') 54 self.bisector.surface_result('MISSING_METRIC')
53 return 55 return
54 # We cannot test the goodness of the initial rev range. 56 # We cannot test the goodness of the initial rev range.
55 if self.bisector.good_rev != self and self.bisector.bad_rev != self: 57 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
58 check_revision_goodness):
56 if self._check_revision_good(): 59 if self._check_revision_good():
57 self.good = True 60 self.good = True
58 else: 61 else:
59 self.bad = True 62 self.bad = True
60 63
61 def _write_deps_patch_file(self, build_name): 64 def _write_deps_patch_file(self, build_name):
62 api = self.bisector.api 65 api = self.bisector.api
63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 66 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 67 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
65 file_name, self.deps_patch + self.deps_sha_patch) 68 file_name, self.deps_patch + self.deps_sha_patch)
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
115 'metric', 118 'metric',
116 'max_time_minutes', 119 'max_time_minutes',
117 'command', 120 'command',
118 'repeat_count', 121 'repeat_count',
119 'test_type' 122 'test_type'
120 } 123 }
121 for k, v in self.bisector.bisect_config.iteritems(): 124 for k, v in self.bisector.bisect_config.iteritems():
122 if k in required_test_properties: 125 if k in required_test_properties:
123 result[k] = v 126 result[k] = v
124 self._test_config = result 127 self._test_config = result
128 if self.custom_reps: # pragma: no cover
qyearsley 2016/01/26 19:26:00 Two spaces before #.
RobertoCN 2016/02/01 17:29:50 Done.
129 result['repeat_count'] = self.custom_reps
130 else:
131 # We need at the very least 3 data points. (e.g. that's the minimum for
132 # shapiro's test for normality [even though we don't really use it])
qyearsley 2016/01/26 19:26:00 The parenthetical part of this comment could probably be removed.
RobertoCN 2016/02/01 17:29:50 Done.
133 result['repeat_count'] = max(int(result['repeat_count']), 3)
125 return result 134 return result
126 135
136 # It seems it should be okay to call this again to test the revision some
137 # more.
qyearsley 2016/01/26 19:26:00 I'm not sure whether this comment is helpful; maybe it should be removed.
RobertoCN 2016/02/01 17:29:50 Yep, I was supposed to remove it and forgot. Done
127 def _do_test(self): 138 def _do_test(self):
128 """Posts a request to buildbot to download and perf-test this build.""" 139 """Posts a request to buildbot to download and perf-test this build."""
129 if self.bisector.dummy_builds: 140 if self.bisector.dummy_builds:
130 self.job_name = self.commit_hash + '-test' 141 self.job_name = self.commit_hash + '-test'
131 elif 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 142 elif 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
132 self.job_name = test_results_cache.make_id( 143 self.job_name = test_results_cache.make_id(
133 self.revision_string, self._get_bisect_config_for_tester()) 144 self.revision_string, self._get_bisect_config_for_tester())
134 else: # pragma: no cover 145 else: # pragma: no cover
135 self.job_name = uuid.uuid4().hex 146 self.job_name = uuid.uuid4().hex
136 api = self.bisector.api 147 api = self.bisector.api
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
172 builder = self.bisector.get_builder_bot_for_this_platform() 183 builder = self.bisector.get_builder_bot_for_this_platform()
173 if self.status == PerfRevisionState.TESTING: 184 if self.status == PerfRevisionState.TESTING:
174 builder = self.bisector.get_perf_tester_name() 185 builder = self.bisector.get_perf_tester_name()
175 return { 186 return {
176 'type': 'buildbot', 187 'type': 'buildbot',
177 'master': master, 188 'master': master,
178 'builder': builder, 189 'builder': builder,
179 'job_name': self.job_name, 190 'job_name': self.job_name,
180 } 191 }
181 192
193 def retest(self, target_sample_size): # pragma: no cover
194 self.status = PerfRevisionState.NEED_MORE_DATA
195 self.custom_reps = target_sample_size - len(self.values)
196 self.start_job()
197 self.bisector.wait_for_any([self])
qyearsley 2016/01/26 19:26:00 This triggers the bisect to run another |self.cust
RobertoCN 2016/02/01 17:29:50 Yeah.
198
182 def _get_test_results(self): 199 def _get_test_results(self):
183 """Tries to get the results of a test job from cloud storage.""" 200 """Tries to get the results of a test job from cloud storage."""
184 api = self.bisector.api 201 api = self.bisector.api
185 try: 202 try:
186 stdout = api.m.raw_io.output() 203 stdout = api.m.raw_io.output()
187 name = 'Get test results for build ' + self.commit_hash 204 name = 'Get test results for build ' + self.commit_hash
188 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 205 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
189 name=name) 206 name=name)
190 except api.m.step.StepFailure: # pragma: no cover 207 except api.m.step.StepFailure: # pragma: no cover
191 self.bisector.surface_result('TEST_FAILURE') 208 self.bisector.surface_result('TEST_FAILURE')
192 return None 209 return None
193 else: 210 else:
194 return json.loads(step_result.stdout) 211 return json.loads(step_result.stdout)
195 212
213 def _calculate_target_sample_size(self): # pragma: no cover
214 # Make the smallest bigger than the largest, by increasing it at least 33%
215 sizes = map(lambda x: len(x.values), [self.bisector.lkgr, self,
qyearsley 2016/01/26 19:26:00 Extra space after the comma.
RobertoCN 2016/02/01 17:29:50 Done.
216 self.bisector.fkbr])
217 smallest_sample_size = min(sizes)
218 target_sample_size = max(sizes) + 1
219 delta = target_sample_size - smallest_sample_size
dtu 2016/02/05 00:18:01 delta is unused.
220 min_increase = math.ceil(smallest_sample_size/3.0)
qyearsley 2016/01/26 19:26:00 Optional: could add spaces around the / operator.
RobertoCN 2016/02/01 17:29:50 Done.
221 return max(smallest_sample_size + min_increase, target_sample_size)
222
196 def _check_revision_good(self): 223 def _check_revision_good(self):
qyearsley 2016/01/26 19:26:00 Parts of the previous docstring could be kept here
RobertoCN 2016/02/01 17:29:51 Done.
197 """Determines if a revision is good or bad. 224 diff_from_good = self.bisector.significantly_different(
198 225 self.bisector.lkgr.values, self.values)
199 Note that our current approach is to determine whether it is closer to 226 diff_from_bad = self.bisector.significantly_different(
200 either the 'good' and 'bad' revisions given for the bisect job. 227 self.bisector.fkbr.values, self.values)
201 228 while not (diff_from_good or diff_from_bad): # pragma: no cover
202 Returns: 229 target_sample_size = self._calculate_target_sample_size()
203 True if this revision is closer to the initial good revision's value than 230 min(self.bisector.lkgr, self, self.bisector.fkbr,
204 to the initial bad revision's value. False otherwise. 231 key=lambda x: len(x.values)).retest(target_sample_size)
205 """ 232 diff_from_good = self.bisector.significantly_different(
206 # TODO: Reevaluate this approach 233 self.bisector.lkgr.values, self.values)
207 bisector = self.bisector 234 diff_from_bad = self.bisector.significantly_different(
208 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) 235 self.bisector.fkbr.values, self.values)
209 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) 236 if diff_from_good and diff_from_bad:
qyearsley 2016/01/26 19:26:00 A blank line above this if block might slightly he
RobertoCN 2016/02/01 17:29:50 Done.
210 if distance_to_good < distance_to_bad: 237 # Multiple regressions.
211 return True 238 # For now, proceed bisecting the biggest difference of the means, and
212 return False 239 # handle the secondary regression by spinning a second job.
qyearsley 2016/01/26 19:26:00 No second job is started as of this CL, right?
RobertoCN 2016/02/01 17:29:50 Correct, for now we'll go with the larger regressi
240 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
241 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
242 if dist_from_good > dist_from_bad:
243 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
244 return False
245 else:
246 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
247 return True
248 return diff_from_bad # pragma: no cover
213 249
214 def __repr__(self): 250 def __repr__(self):
215 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 251 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
216 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 252 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698