Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(82)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Rebasing Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
6 import tempfile 7 import tempfile
7 import os 8 import os
8 import uuid 9 import uuid
9 10
10 from . import revision_state 11 from . import revision_state
11 12
12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
13 from . import test_results_cache 14 from . import test_results_cache
14 15
16 # These relate to how to increase the number of repetitions during re-test
17 MINIMUM_SAMPLE_SIZE = 5
18 INCREASE_FACTOR = 1.5
15 19
16 class PerfRevisionState(revision_state.RevisionState): 20 class PerfRevisionState(revision_state.RevisionState):
17 """Contains the state and results for one revision in a perf bisect job.""" 21 """Contains the state and results for one revision in a perf bisect job."""
18 22
19 def __init__(self, *args, **kwargs): 23 def __init__(self, *args, **kwargs):
20 super(PerfRevisionState, self).__init__(*args, **kwargs) 24 super(PerfRevisionState, self).__init__(*args, **kwargs)
21 self.values = [] 25 self.values = []
22 self.mean_value = None 26 self.mean_value = None
23 self.std_dev = None 27 self.std_dev = None
28 self.repeat_count = MINIMUM_SAMPLE_SIZE
24 self._test_config = None 29 self._test_config = None
25 30
26 def _read_test_results(self): 31 def _read_test_results(self, check_revision_goodness=True):
27 """Gets the test results from GS and checks if the rev is good or bad.""" 32 """Gets the test results from GS and checks if the rev is good or bad."""
28 test_results = self._get_test_results() 33 test_results = self._get_test_results()
29 # Results will contain the keys 'results' and 'output' where output is the 34 # Results will contain the keys 'results' and 'output' where output is the
30 # stdout of the command, and 'results' is itself a dict with the key 35 # stdout of the command, and 'results' is itself a dict with the key
31 # 'values' unless the test failed, in which case 'results' will contain 36 # 'values' unless the test failed, in which case 'results' will contain
32 # the 'errors' key explaining the type of error. 37 # the 'errors' key explaining the type of error.
33 results = test_results['results'] 38 results = test_results['results']
34 if results.get('errors'): 39 if results.get('errors'):
35 self.status = PerfRevisionState.FAILED 40 self.status = PerfRevisionState.FAILED
36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
37 self.bisector.surface_result('MISSING_METRIC') 42 self.bisector.surface_result('MISSING_METRIC')
38 return 43 return
39 self.values = results['values'] 44 self.values += results['values']
40 if self.bisector.is_return_code_mode(): 45 if self.bisector.is_return_code_mode():
41 retcodes = test_results['retcodes'] 46 retcodes = test_results['retcodes']
42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
43 self.mean_value = overall_return_code 48 self.mean_value = overall_return_code
44 elif self.values: 49 elif self.values:
45 api = self.bisector.api 50 api = self.bisector.api
46 self.mean_value = api.m.math_utils.mean(self.values) 51 self.mean_value = api.m.math_utils.mean(self.values)
47 self.std_dev = api.m.math_utils.standard_deviation(self.values) 52 self.std_dev = api.m.math_utils.standard_deviation(self.values)
48 # Values were not found, but the test did not otherwise fail. 53 # Values were not found, but the test did not otherwise fail.
49 else: 54 else:
50 self.status = PerfRevisionState.FAILED 55 self.status = PerfRevisionState.FAILED
51 self.bisector.surface_result('MISSING_METRIC') 56 self.bisector.surface_result('MISSING_METRIC')
52 return 57 return
58 # If we have already decided on the goodness of this revision, we shouldn't
59 # recheck it.
60 if self.good or self.bad:
61 check_revision_goodness = False
53 # We cannot test the goodness of the initial rev range. 62 # We cannot test the goodness of the initial rev range.
54 if self.bisector.good_rev != self and self.bisector.bad_rev != self: 63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
64 check_revision_goodness):
55 if self._check_revision_good(): 65 if self._check_revision_good():
56 self.good = True 66 self.good = True
57 else: 67 else:
58 self.bad = True 68 self.bad = True
59 69
60 def _write_deps_patch_file(self, build_name): 70 def _write_deps_patch_file(self, build_name):
61 """Saves the DEPS patch in a temp location and returns the file path.""" 71 """Saves the DEPS patch in a temp location and returns the file path."""
62 api = self.bisector.api 72 api = self.bisector.api
63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 74 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
106 api.m.file.remove('cleaning up patch', self.patch_file) 116 api.m.file.remove('cleaning up patch', self.patch_file)
107 except api.m.step.StepFailure: # pragma: no cover 117 except api.m.step.StepFailure: # pragma: no cover
108 print 'Could not clean up ' + self.patch_file 118 print 'Could not clean up ' + self.patch_file
109 119
110 def _get_bisect_config_for_tester(self): 120 def _get_bisect_config_for_tester(self):
111 """Copies the key-value pairs required by a tester bot to a new dict.""" 121 """Copies the key-value pairs required by a tester bot to a new dict."""
112 result = {} 122 result = {}
113 required_test_properties = { 123 required_test_properties = {
114 'truncate_percent', 124 'truncate_percent',
115 'metric', 125 'metric',
116 'max_time_minutes',
117 'command', 126 'command',
118 'repeat_count',
119 'test_type' 127 'test_type'
120 } 128 }
121 for k, v in self.bisector.bisect_config.iteritems(): 129 for k, v in self.bisector.bisect_config.iteritems():
122 if k in required_test_properties: 130 if k in required_test_properties:
123 result[k] = v 131 result[k] = v
132 result['repeat_count'] = self.repeat_count
124 self._test_config = result 133 self._test_config = result
125 return result 134 return result
126 135
127 def _do_test(self): 136 def _do_test(self):
128 """Triggers tests for a revision, either locally or via try job. 137 """Triggers tests for a revision, either locally or via try job.
129 138
130 If local testing is enabled (i.e. director/tester merged) then 139 If local testing is enabled (i.e. director/tester merged) then
131 the test will be run on the same machine. Otherwise, this posts 140 the test will be run on the same machine. Otherwise, this posts
132 a request to buildbot to download and perf-test this build. 141 a request to buildbot to download and perf-test this build.
133 """ 142 """
(...skipping 14 matching lines...) Expand all
148 'bisect_config': self._get_bisect_config_for_tester(), 157 'bisect_config': self._get_bisect_config_for_tester(),
149 'job_name': self.job_name, 158 'job_name': self.job_name,
150 }, 159 },
151 } 160 }
152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( 161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results(
153 self.job_name): # pragma: no cover 162 self.job_name): # pragma: no cover
154 return 163 return
155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + 164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL +
156 self.job_name + '.results') 165 self.job_name + '.results')
157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover 166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover
167 skip_download = self.bisector.last_tested_revision == self
168 self.bisector.last_tested_revision = self
158 overrides = perf_test_properties['properties'] 169 overrides = perf_test_properties['properties']
159 api.run_local_test_run(api.m, overrides) 170 api.run_local_test_run(api.m, overrides, skip_download=skip_download)
160 else: 171 else:
161 step_name = 'Triggering test job for ' + str(self.revision_string) 172 step_name = 'Triggering test job for ' + str(self.revision_string)
162 api.m.trigger(perf_test_properties, name=step_name) 173 api.m.trigger(perf_test_properties, name=step_name)
163 174
164 def get_next_url(self): 175 def get_next_url(self):
165 """Returns a GS URL for checking progress of a build or test.""" 176 """Returns a GS URL for checking progress of a build or test."""
166 if self.status == PerfRevisionState.BUILDING: 177 if self.status == PerfRevisionState.BUILDING:
167 return self.build_url 178 return self.build_url
168 if self.status == PerfRevisionState.TESTING: 179 if self.status == PerfRevisionState.TESTING:
169 return self.test_results_url 180 return self.test_results_url
(...skipping 14 matching lines...) Expand all
184 builder = self.bisector.get_builder_bot_for_this_platform() 195 builder = self.bisector.get_builder_bot_for_this_platform()
185 if self.status == PerfRevisionState.TESTING: 196 if self.status == PerfRevisionState.TESTING:
186 builder = self.bisector.get_perf_tester_name() 197 builder = self.bisector.get_perf_tester_name()
187 return { 198 return {
188 'type': 'buildbot', 199 'type': 'buildbot',
189 'master': master, 200 'master': master,
190 'builder': builder, 201 'builder': builder,
191 'job_name': self.job_name, 202 'job_name': self.job_name,
192 } 203 }
193 204
205 def retest(self): # pragma: no cover
206 # We need at least 5 samples for applying Mann-Whitney U test
207 # with P < 0.01, two-tailed.
208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))
209 self.status = PerfRevisionState.NEED_MORE_DATA
210 self.repeat_count = target_sample_size - len(self.values)
211 self.start_job()
212 self.bisector.wait_for_any([self])
213
194 def _get_test_results(self): 214 def _get_test_results(self):
195 """Tries to get the results of a test job from cloud storage.""" 215 """Tries to get the results of a test job from cloud storage."""
196 api = self.bisector.api 216 api = self.bisector.api
197 try: 217 try:
198 stdout = api.m.raw_io.output() 218 stdout = api.m.raw_io.output()
199 name = 'Get test results for build ' + self.commit_hash 219 name = 'Get test results for build ' + self.commit_hash
200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
201 name=name) 221 name=name)
202 except api.m.step.StepFailure: # pragma: no cover 222 except api.m.step.StepFailure: # pragma: no cover
203 self.bisector.surface_result('TEST_FAILURE') 223 self.bisector.surface_result('TEST_FAILURE')
204 return None 224 return None
205 else: 225 else:
206 return json.loads(step_result.stdout) 226 return json.loads(step_result.stdout)
207 227
208 def _check_revision_good(self): 228 def _check_revision_good(self):
209 """Determines if a revision is good or bad. 229 """Determines if a revision is good or bad.
210 230
211 Note that our current approach is to determine whether it is closer to 231 Iteratively increment the sample size of the revision being tested, the last
212 either the 'good' and 'bad' revisions given for the bisect job. 232 known good revision, and the first known bad revision until a relationship
233 of significant difference can be established between the results of the
234 revision being tested and one of the other two.
235
236 If the results do not converge towards finding a significant difference in
237 either direction, this is expected to timeout eventually. This scenario
238 should be rather rare, since it is expected that the fkbr and lkgr are
239 significantly different as a precondition.
213 240
214 Returns: 241 Returns:
215 True if this revision is closer to the initial good revision's value than 242 True if the results of testing this revision are significantly different
216 to the initial bad revision's value. False otherwise. 243 from those of testing the earliest known bad revision.
244 False if they are instead significantly different from those of testing
245 the latest known good revision.
217 """ 246 """
218 # TODO: Reevaluate this approach 247 if self.bisector.is_return_code_mode():
219 bisector = self.bisector 248 return self.mean_value == 0
220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) 249
221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) 250 while True:
222 if distance_to_good < distance_to_bad: 251 diff_from_good = self.bisector.significantly_different(
223 return True 252 self.bisector.lkgr.values, self.values)
224 return False 253 diff_from_bad = self.bisector.significantly_different(
254 self.bisector.fkbr.values, self.values)
255
256 if diff_from_good and diff_from_bad:
257 # Multiple regressions.
258 # For now, proceed bisecting the biggest difference of the means.
259 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
260 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
261 if dist_from_good > dist_from_bad:
262 # TODO(robertocn): Add way to handle the secondary regression
263 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
264 return False
265 else:
266 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
267 return True
268
269 if diff_from_good or diff_from_bad: # pragma: no cover
270 return diff_from_bad
271
272 self._next_retest() # pragma: no cover
273
274
275 def _next_retest(self): # pragma: no cover
276 """Chooses one of current, lkgr, fkbr to retest.
277
278 Look for the smallest sample and retest that. If the last tested revision
279 is tied for the smallest sample, use that to take advantage of the fact
280 that it is already downloaded and unzipped.
281 """
282 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr,
283 key=lambda x: len(x.values))
284 if (len(self.bisector.last_tested_revision.values) ==
285 next_revision_to_test.values):
286 self.bisector.last_tested_revision.retest()
287 else:
288 next_revision_to_test.retest()
225 289
226 def __repr__(self): 290 def __repr__(self):
227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 291 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
228 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 292 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698