Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(246)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Making initial regression check have a timeout. Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
6 import tempfile 7 import tempfile
7 import os 8 import os
8 import uuid 9 import uuid
9 10
10 from . import revision_state 11 from . import revision_state
11 12
12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
13 from . import test_results_cache 14 from . import test_results_cache
14 15
16 # These relate to how to increase the number of repetitions during re-test
17 MINIMUM_SAMPLE_SIZE = 5
18 INCREASE_FACTOR = 1.5
15 19
16 class PerfRevisionState(revision_state.RevisionState): 20 class PerfRevisionState(revision_state.RevisionState):
17 """Contains the state and results for one revision in a perf bisect job.""" 21 """Contains the state and results for one revision in a perf bisect job."""
18 22
19 def __init__(self, *args, **kwargs): 23 def __init__(self, *args, **kwargs):
20 super(PerfRevisionState, self).__init__(*args, **kwargs) 24 super(PerfRevisionState, self).__init__(*args, **kwargs)
21 self.values = [] 25 self.values = []
22 self.mean_value = None 26 self.mean_value = None
23 self.std_dev = None 27 self.std_dev = None
28 self.repeat_count = MINIMUM_SAMPLE_SIZE
24 self._test_config = None 29 self._test_config = None
25 30
26 def _read_test_results(self): 31 def _read_test_results(self, check_revision_goodness=True):
27 """Gets the test results from GS and checks if the rev is good or bad.""" 32 """Gets the test results from GS and checks if the rev is good or bad."""
28 test_results = self._get_test_results() 33 test_results = self._get_test_results()
29 # Results will contain the keys 'results' and 'output' where output is the 34 # Results will contain the keys 'results' and 'output' where output is the
30 # stdout of the command, and 'results' is itself a dict with the key 35 # stdout of the command, and 'results' is itself a dict with the key
31 # 'values' unless the test failed, in which case 'results' will contain 36 # 'values' unless the test failed, in which case 'results' will contain
32 # the 'error' key explaining the type of error. 37 # the 'error' key explaining the type of error.
33 results = test_results['results'] 38 results = test_results['results']
34 if results.get('errors'): 39 if results.get('errors'):
35 self.status = PerfRevisionState.FAILED 40 self.status = PerfRevisionState.FAILED
36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
37 self.bisector.surface_result('MISSING_METRIC') 42 self.bisector.surface_result('MISSING_METRIC')
38 return 43 return
39 self.values = results['values'] 44 self.values = (self.values or []) + results['values']
dtu 2016/02/05 00:18:02 Why do you need this? Isn't self.values already in
40 if self.bisector.is_return_code_mode(): 45 if self.bisector.is_return_code_mode():
41 retcodes = test_results['retcodes'] 46 retcodes = test_results['retcodes']
42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
43 self.mean_value = overall_return_code 48 self.mean_value = overall_return_code
44 elif self.values: 49 elif self.values:
45 api = self.bisector.api 50 api = self.bisector.api
46 self.mean_value = api.m.math_utils.mean(self.values) 51 self.mean_value = api.m.math_utils.mean(self.values)
47 self.std_dev = api.m.math_utils.standard_deviation(self.values) 52 self.std_dev = api.m.math_utils.standard_deviation(self.values)
48 # Values were not found, but the test did not otherwise fail. 53 # Values were not found, but the test did not otherwise fail.
49 else: 54 else:
50 self.status = PerfRevisionState.FAILED 55 self.status = PerfRevisionState.FAILED
51 self.bisector.surface_result('MISSING_METRIC') 56 self.bisector.surface_result('MISSING_METRIC')
52 return 57 return
53 # We cannot test the goodness of the initial rev range. 58 # We cannot test the goodness of the initial rev range.
54 if self.bisector.good_rev != self and self.bisector.bad_rev != self: 59 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
60 check_revision_goodness):
55 if self._check_revision_good(): 61 if self._check_revision_good():
56 self.good = True 62 self.good = True
57 else: 63 else:
58 self.bad = True 64 self.bad = True
59 65
60 def _write_deps_patch_file(self, build_name): 66 def _write_deps_patch_file(self, build_name):
61 """Saves the DEPS patch in a temp location and returns the file path.""" 67 """Saves the DEPS patch in a temp location and returns the file path."""
62 api = self.bisector.api 68 api = self.bisector.api
63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 69 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 70 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
106 api.m.file.remove('cleaning up patch', self.patch_file) 112 api.m.file.remove('cleaning up patch', self.patch_file)
107 except api.m.step.StepFailure: # pragma: no cover 113 except api.m.step.StepFailure: # pragma: no cover
108 print 'Could not clean up ' + self.patch_file 114 print 'Could not clean up ' + self.patch_file
109 115
110 def _get_bisect_config_for_tester(self): 116 def _get_bisect_config_for_tester(self):
111 """Copies the key-value pairs required by a tester bot to a new dict.""" 117 """Copies the key-value pairs required by a tester bot to a new dict."""
112 result = {} 118 result = {}
113 required_test_properties = { 119 required_test_properties = {
114 'truncate_percent', 120 'truncate_percent',
115 'metric', 121 'metric',
116 'max_time_minutes',
117 'command', 122 'command',
118 'repeat_count',
119 'test_type' 123 'test_type'
120 } 124 }
121 for k, v in self.bisector.bisect_config.iteritems(): 125 for k, v in self.bisector.bisect_config.iteritems():
122 if k in required_test_properties: 126 if k in required_test_properties:
123 result[k] = v 127 result[k] = v
128 result['repeat_count'] = self.repeat_count
124 self._test_config = result 129 self._test_config = result
125 return result 130 return result
126 131
127 def _do_test(self): 132 def _do_test(self):
128 """Triggers tests for a revision, either locally or via try job. 133 """Triggers tests for a revision, either locally or via try job.
129 134
130 If local testing is enabled (i.e. director/tester merged) then 135 If local testing is enabled (i.e. director/tester merged) then
131 the test will be run on the same machine. Otherwise, this posts 136 the test will be run on the same machine. Otherwise, this posts
132 a request to buildbot to download and perf-test this build. 137 a request to buildbot to download and perf-test this build.
133 """ 138 """
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
184 builder = self.bisector.get_builder_bot_for_this_platform() 189 builder = self.bisector.get_builder_bot_for_this_platform()
185 if self.status == PerfRevisionState.TESTING: 190 if self.status == PerfRevisionState.TESTING:
186 builder = self.bisector.get_perf_tester_name() 191 builder = self.bisector.get_perf_tester_name()
187 return { 192 return {
188 'type': 'buildbot', 193 'type': 'buildbot',
189 'master': master, 194 'master': master,
190 'builder': builder, 195 'builder': builder,
191 'job_name': self.job_name, 196 'job_name': self.job_name,
192 } 197 }
193 198
199 def retest(self): # pragma: no cover
200 # We need at least 5 samples for the hypothesis testing to work properly.
dtu 2016/02/05 00:18:02 Sorry, not strictly true as written. Only true wit
201 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))
202 self.status = PerfRevisionState.NEED_MORE_DATA
203 self.repeat_count = target_sample_size - len(self.values)
204 self.start_job()
205 self.bisector.wait_for_any([self])
206
194 def _get_test_results(self): 207 def _get_test_results(self):
195 """Tries to get the results of a test job from cloud storage.""" 208 """Tries to get the results of a test job from cloud storage."""
196 api = self.bisector.api 209 api = self.bisector.api
197 try: 210 try:
198 stdout = api.m.raw_io.output() 211 stdout = api.m.raw_io.output()
199 name = 'Get test results for build ' + self.commit_hash 212 name = 'Get test results for build ' + self.commit_hash
200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 213 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
201 name=name) 214 name=name)
202 except api.m.step.StepFailure: # pragma: no cover 215 except api.m.step.StepFailure: # pragma: no cover
203 self.bisector.surface_result('TEST_FAILURE') 216 self.bisector.surface_result('TEST_FAILURE')
204 return None 217 return None
205 else: 218 else:
206 return json.loads(step_result.stdout) 219 return json.loads(step_result.stdout)
207 220
208 def _check_revision_good(self): 221 def _check_revision_good(self):
209 """Determines if a revision is good or bad. 222 """Determines if a revision is good or bad.
210 223
211 Note that our current approach is to determine whether it is closer to 224 Iteratively increment the sample size of the revision being tested, the last
212 either the 'good' and 'bad' revisions given for the bisect job. 225 known good revision, and the first known bad revision until a relationship
226 of significant difference can be established betweeb the results of the
227 revision being tested and one of the other two.
228
229 If the results do not converge towards finding a significant difference in
230 either direction, this is expected to timeout eventually. This scenario
231 should be rather rare, since it is expected that the fkbr and lkgr are
232 significantly different as a precondition.
213 233
214 Returns: 234 Returns:
215 True if this revision is closer to the initial good revision's value than 235 True if the results of testing this revision are significantly different
216 to the initial bad revision's value. False otherwise. 236 from those of testing the earliest known bad revision.
237 False if they are instead significantly different form those of testing
238 the latest knwon good revision.
217 """ 239 """
218 # TODO: Reevaluate this approach 240 diff_from_good = self.bisector.significantly_different(
dtu 2016/02/05 00:18:02 If you reorganize this function, you can avoid the
219 bisector = self.bisector 241 self.bisector.lkgr.values, self.values)
220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) 242 diff_from_bad = self.bisector.significantly_different(
221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) 243 self.bisector.fkbr.values, self.values)
222 if distance_to_good < distance_to_bad: 244
223 return True 245 while not (diff_from_good or diff_from_bad): # pragma: no cover
224 return False 246 min(self.bisector.lkgr, self, self.bisector.fkbr,
247 key=lambda x: len(x.values)).retest()
248 diff_from_good = self.bisector.significantly_different(
249 self.bisector.lkgr.values, self.values)
250 diff_from_bad = self.bisector.significantly_different(
251 self.bisector.fkbr.values, self.values)
252
253 if diff_from_good and diff_from_bad:
254 # Multiple regressions.
255 # For now, proceed bisecting the biggest difference of the means.
256 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
257 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
258 if dist_from_good > dist_from_bad:
259 # TODO(robertocn): Add way to handle the secondary regression
260 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
261 return False
262 else:
263 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
264 return True
265 return diff_from_bad # pragma: no cover
225 266
226 def __repr__(self): 267 def __repr__(self):
227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 268 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
228 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 269 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698