Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1420)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Fixing multiple problems Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
6 import tempfile 7 import tempfile
7 import os 8 import os
8 import uuid 9 import uuid
9 10
10 from . import revision_state 11 from . import revision_state
11 12
12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
13 from . import test_results_cache 14 from . import test_results_cache
14 15
15 16
16 class PerfRevisionState(revision_state.RevisionState): 17 class PerfRevisionState(revision_state.RevisionState):
17 18
18 """Contains the state and results for one revision in a perf bisect job.""" 19 """Contains the state and results for one revision in a perf bisect job."""
19 20
20 def __init__(self, *args, **kwargs): 21 def __init__(self, *args, **kwargs):
21 super(PerfRevisionState, self).__init__(*args, **kwargs) 22 super(PerfRevisionState, self).__init__(*args, **kwargs)
22 self.values = [] 23 self.values = []
23 self.mean_value = None 24 self.mean_value = None
24 self.std_dev = None 25 self.std_dev = None
26 self.custom_reps = None
25 self._test_config = None 27 self._test_config = None
26 28
27 def _read_test_results(self): 29 def _read_test_results(self, check_revision_goodness=True):
28 """Gets the test results from GS and checks if the rev is good or bad.""" 30 """Gets the test results from GS and checks if the rev is good or bad."""
29 test_results = self._get_test_results() 31 test_results = self._get_test_results()
30 # Results will contain the keys 'results' and 'output' where output is the 32 # Results will contain the keys 'results' and 'output' where output is the
31 # stdout of the command, and 'results' is itself a dict with the key 33 # stdout of the command, and 'results' is itself a dict with the key
32 # 'values' unless the test failed, in which case 'results' will contain 34 # 'values' unless the test failed, in which case 'results' will contain
33 # the 'error' key explaining the type of error. 35 # the 'error' key explaining the type of error.
34 results = test_results['results'] 36 results = test_results['results']
35 if results.get('errors'): 37 if results.get('errors'):
36 self.status = PerfRevisionState.FAILED 38 self.status = PerfRevisionState.FAILED
37 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 39 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
38 self.bisector.surface_result('MISSING_METRIC') 40 self.bisector.surface_result('MISSING_METRIC')
39 return 41 return
40 self.values = results['values'] 42 self.values = self.values or [] + results['values']
qyearsley 2016/01/26 19:26:00 `or` is less tightly binding than `+`, so this is
RobertoCN 2016/02/01 17:29:50 Done.
41 if self.bisector.is_return_code_mode(): 43 if self.bisector.is_return_code_mode():
42 retcodes = test_results['retcodes'] 44 retcodes = test_results['retcodes']
43 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 45 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
44 self.mean_value = overall_return_code 46 self.mean_value = overall_return_code
45 elif self.values: 47 elif self.values:
46 api = self.bisector.api 48 api = self.bisector.api
47 self.mean_value = api.m.math_utils.mean(self.values) 49 self.mean_value = api.m.math_utils.mean(self.values)
48 self.std_dev = api.m.math_utils.standard_deviation(self.values) 50 self.std_dev = api.m.math_utils.standard_deviation(self.values)
49 # Values were not found, but the test did not otherwise fail. 51 # Values were not found, but the test did not otherwise fail.
50 else: 52 else:
51 self.status = PerfRevisionState.FAILED 53 self.status = PerfRevisionState.FAILED
52 self.bisector.surface_result('MISSING_METRIC') 54 self.bisector.surface_result('MISSING_METRIC')
53 return 55 return
54 # We cannot test the goodness of the initial rev range. 56 # We cannot test the goodness of the initial rev range.
55 if self.bisector.good_rev != self and self.bisector.bad_rev != self: 57 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
58 check_revision_goodness):
56 if self._check_revision_good(): 59 if self._check_revision_good():
57 self.good = True 60 self.good = True
58 else: 61 else:
59 self.bad = True 62 self.bad = True
60 63
61 def _write_deps_patch_file(self, build_name): 64 def _write_deps_patch_file(self, build_name):
62 api = self.bisector.api 65 api = self.bisector.api
63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 66 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 67 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
65 file_name, self.deps_patch + self.deps_sha_patch) 68 file_name, self.deps_patch + self.deps_sha_patch)
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
115 'metric', 118 'metric',
116 'max_time_minutes', 119 'max_time_minutes',
117 'command', 120 'command',
118 'repeat_count', 121 'repeat_count',
119 'test_type' 122 'test_type'
120 } 123 }
121 for k, v in self.bisector.bisect_config.iteritems(): 124 for k, v in self.bisector.bisect_config.iteritems():
122 if k in required_test_properties: 125 if k in required_test_properties:
123 result[k] = v 126 result[k] = v
124 self._test_config = result 127 self._test_config = result
128 if self.custom_reps: # pragma: no cover
qyearsley 2016/01/26 19:26:00 Two spaces before #.
RobertoCN 2016/02/01 17:29:50 Done.
129 result['repeat_count'] = self.custom_reps
130 else:
131 # We need at the very least 3 data points. (e.g. that's the minimum for
132 # shapiro's test for normality [even though we don't really use it])
qyearsley 2016/01/26 19:26:00 The parenthetical part of this comment could probably be removed.
RobertoCN 2016/02/01 17:29:50 Done.
133 result['repeat_count'] = max(int(result['repeat_count']), 3)
125 return result 134 return result
126 135
136 # It seems it should be okay to call this again to test the revision some
137 # more.
qyearsley 2016/01/26 19:26:00 I'm not sure whether this comment is helpful; maybe it should be removed.
RobertoCN 2016/02/01 17:29:50 Yep, I was supposed to remove it and forgot. Done
127 def _do_test(self): 138 def _do_test(self):
128 """Posts a request to buildbot to download and perf-test this build.""" 139 """Posts a request to buildbot to download and perf-test this build."""
129 if self.bisector.dummy_builds: 140 if self.bisector.dummy_builds:
130 self.job_name = self.commit_hash + '-test' 141 self.job_name = self.commit_hash + '-test'
131 elif 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 142 elif 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
132 self.job_name = test_results_cache.make_id( 143 self.job_name = test_results_cache.make_id(
133 self.revision_string, self._get_bisect_config_for_tester()) 144 self.revision_string, self._get_bisect_config_for_tester())
134 else: # pragma: no cover 145 else: # pragma: no cover
135 self.job_name = uuid.uuid4().hex 146 self.job_name = uuid.uuid4().hex
136 api = self.bisector.api 147 api = self.bisector.api
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
172 builder = self.bisector.get_builder_bot_for_this_platform() 183 builder = self.bisector.get_builder_bot_for_this_platform()
173 if self.status == PerfRevisionState.TESTING: 184 if self.status == PerfRevisionState.TESTING:
174 builder = self.bisector.get_perf_tester_name() 185 builder = self.bisector.get_perf_tester_name()
175 return { 186 return {
176 'type': 'buildbot', 187 'type': 'buildbot',
177 'master': master, 188 'master': master,
178 'builder': builder, 189 'builder': builder,
179 'job_name': self.job_name, 190 'job_name': self.job_name,
180 } 191 }
181 192
193 def retest(self, target_sample_size): # pragma: no cover
194 self.status = PerfRevisionState.NEED_MORE_DATA
195 self.custom_reps = target_sample_size - len(self.values)
196 self.start_job()
197 self.bisector.wait_for_any([self])
qyearsley 2016/01/26 19:26:00 This triggers the bisect to run another |self.cust
RobertoCN 2016/02/01 17:29:50 Yeah.
198
182 def _get_test_results(self): 199 def _get_test_results(self):
183 """Tries to get the results of a test job from cloud storage.""" 200 """Tries to get the results of a test job from cloud storage."""
184 api = self.bisector.api 201 api = self.bisector.api
185 try: 202 try:
186 stdout = api.m.raw_io.output() 203 stdout = api.m.raw_io.output()
187 name = 'Get test results for build ' + self.commit_hash 204 name = 'Get test results for build ' + self.commit_hash
188 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 205 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
189 name=name) 206 name=name)
190 except api.m.step.StepFailure: # pragma: no cover 207 except api.m.step.StepFailure: # pragma: no cover
191 self.bisector.surface_result('TEST_FAILURE') 208 self.bisector.surface_result('TEST_FAILURE')
192 return None 209 return None
193 else: 210 else:
194 return json.loads(step_result.stdout) 211 return json.loads(step_result.stdout)
195 212
213 def _calculate_target_sample_size(self): # pragma: no cover
214 # Make the smallest bigger than the largest, by increasing it at least 33%
215 sizes = map(lambda x: len(x.values), [self.bisector.lkgr, self,
qyearsley 2016/01/26 19:26:00 Extra space after the comma.
RobertoCN 2016/02/01 17:29:50 Done.
216 self.bisector.fkbr])
217 smallest_sample_size = min(sizes)
218 target_sample_size = max(sizes) + 1
219 delta = target_sample_size - smallest_sample_size
dtu 2016/02/05 00:18:01 delta is unused.
220 min_increase = math.ceil(smallest_sample_size/3.0)
qyearsley 2016/01/26 19:26:00 Optional: could add spaces around the / operator.
RobertoCN 2016/02/01 17:29:50 Done.
221 return max(smallest_sample_size + min_increase, target_sample_size)
222
196 def _check_revision_good(self): 223 def _check_revision_good(self):
qyearsley 2016/01/26 19:26:00 Parts of the previous docstring could be kept here
RobertoCN 2016/02/01 17:29:51 Done.
197 """Determines if a revision is good or bad. 224 diff_from_good = self.bisector.significantly_different(
198 225 self.bisector.lkgr.values, self.values)
199 Note that our current approach is to determine whether it is closer to 226 diff_from_bad = self.bisector.significantly_different(
200 either the 'good' and 'bad' revisions given for the bisect job. 227 self.bisector.fkbr.values, self.values)
201 228 while not (diff_from_good or diff_from_bad): # pragma: no cover
202 Returns: 229 target_sample_size = self._calculate_target_sample_size()
203 True if this revision is closer to the initial good revision's value than 230 min(self.bisector.lkgr, self, self.bisector.fkbr,
204 to the initial bad revision's value. False otherwise. 231 key=lambda x: len(x.values)).retest(target_sample_size)
205 """ 232 diff_from_good = self.bisector.significantly_different(
206 # TODO: Reevaluate this approach 233 self.bisector.lkgr.values, self.values)
207 bisector = self.bisector 234 diff_from_bad = self.bisector.significantly_different(
208 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) 235 self.bisector.fkbr.values, self.values)
209 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) 236 if diff_from_good and diff_from_bad:
qyearsley 2016/01/26 19:26:00 A blank line above this if block might slightly he
RobertoCN 2016/02/01 17:29:50 Done.
210 if distance_to_good < distance_to_bad: 237 # Multiple regressions.
211 return True 238 # For now, proceed bisecting the biggest difference of the means, and
212 return False 239 # handle the secondary regression by spinning a second job.
qyearsley 2016/01/26 19:26:00 No second job is started as of this CL, right?
RobertoCN 2016/02/01 17:29:50 Correct, for now we'll go with the larger regressi
240 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
241 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
242 if dist_from_good > dist_from_bad:
243 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
244 return False
245 else:
246 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
247 return True
248 return diff_from_bad # pragma: no cover
213 249
214 def __repr__(self): 250 def __repr__(self):
215 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 251 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
216 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 252 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698