Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(310)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1702013004: Revert of Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import math
7 import tempfile 6 import tempfile
8 import os 7 import os
9 import uuid 8 import uuid
10 9
11 from . import revision_state 10 from . import revision_state
12 11
13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover 12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover
14 from . import test_results_cache 13 from . import test_results_cache
15 14
16 # These relate to how to increase the number of repetitions during re-test
17 MINIMUM_SAMPLE_SIZE = 5
18 INCREASE_FACTOR = 1.5
19 15
20 class PerfRevisionState(revision_state.RevisionState): 16 class PerfRevisionState(revision_state.RevisionState):
21 """Contains the state and results for one revision in a perf bisect job.""" 17 """Contains the state and results for one revision in a perf bisect job."""
22 18
23 def __init__(self, *args, **kwargs): 19 def __init__(self, *args, **kwargs):
24 super(PerfRevisionState, self).__init__(*args, **kwargs) 20 super(PerfRevisionState, self).__init__(*args, **kwargs)
25 self.values = [] 21 self.values = []
26 self.mean_value = None 22 self.mean_value = None
27 self.std_dev = None 23 self.std_dev = None
28 self.repeat_count = MINIMUM_SAMPLE_SIZE
29 self._test_config = None 24 self._test_config = None
30 25
31 def _read_test_results(self, check_revision_goodness=True): 26 def _read_test_results(self):
32 """Gets the test results from GS and checks if the rev is good or bad.""" 27 """Gets the test results from GS and checks if the rev is good or bad."""
33 test_results = self._get_test_results() 28 test_results = self._get_test_results()
34 # Results will contain the keys 'results' and 'output' where output is the 29 # Results will contain the keys 'results' and 'output' where output is the
35 # stdout of the command, and 'results' is itself a dict with the key 30 # stdout of the command, and 'results' is itself a dict with the key
36 # 'values' unless the test failed, in which case 'results' will contain 31 # 'values' unless the test failed, in which case 'results' will contain
37 # the 'error' key explaining the type of error. 32 # the 'error' key explaining the type of error.
38 results = test_results['results'] 33 results = test_results['results']
39 if results.get('errors'): 34 if results.get('errors'):
40 self.status = PerfRevisionState.FAILED 35 self.status = PerfRevisionState.FAILED
41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover 36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
42 self.bisector.surface_result('MISSING_METRIC') 37 self.bisector.surface_result('MISSING_METRIC')
43 return 38 return
44 self.values += results['values'] 39 self.values = results['values']
45 if self.bisector.is_return_code_mode(): 40 if self.bisector.is_return_code_mode():
46 retcodes = test_results['retcodes'] 41 retcodes = test_results['retcodes']
47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
48 self.mean_value = overall_return_code 43 self.mean_value = overall_return_code
49 elif self.values: 44 elif self.values:
50 api = self.bisector.api 45 api = self.bisector.api
51 self.mean_value = api.m.math_utils.mean(self.values) 46 self.mean_value = api.m.math_utils.mean(self.values)
52 self.std_dev = api.m.math_utils.standard_deviation(self.values) 47 self.std_dev = api.m.math_utils.standard_deviation(self.values)
53 # Values were not found, but the test did not otherwise fail. 48 # Values were not found, but the test did not otherwise fail.
54 else: 49 else:
55 self.status = PerfRevisionState.FAILED 50 self.status = PerfRevisionState.FAILED
56 self.bisector.surface_result('MISSING_METRIC') 51 self.bisector.surface_result('MISSING_METRIC')
57 return 52 return
58 # If we have already decided on the goodness of this revision, we shouldn't
59 # recheck it.
60 if self.good or self.bad:
61 check_revision_goodness = False
62 # We cannot test the goodness of the initial rev range. 53 # We cannot test the goodness of the initial rev range.
63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and 54 if self.bisector.good_rev != self and self.bisector.bad_rev != self:
64 check_revision_goodness):
65 if self._check_revision_good(): 55 if self._check_revision_good():
66 self.good = True 56 self.good = True
67 else: 57 else:
68 self.bad = True 58 self.bad = True
69 59
70 def _write_deps_patch_file(self, build_name): 60 def _write_deps_patch_file(self, build_name):
71 """Saves the DEPS patch in a temp location and returns the file path.""" 61 """Saves the DEPS patch in a temp location and returns the file path."""
72 api = self.bisector.api 62 api = self.bisector.api
73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) 63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff'))
74 api.m.file.write('Saving diff patch for ' + str(self.revision_string), 64 api.m.file.write('Saving diff patch for ' + str(self.revision_string),
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
116 api.m.file.remove('cleaning up patch', self.patch_file) 106 api.m.file.remove('cleaning up patch', self.patch_file)
117 except api.m.step.StepFailure: # pragma: no cover 107 except api.m.step.StepFailure: # pragma: no cover
118 print 'Could not clean up ' + self.patch_file 108 print 'Could not clean up ' + self.patch_file
119 109
120 def _get_bisect_config_for_tester(self): 110 def _get_bisect_config_for_tester(self):
121 """Copies the key-value pairs required by a tester bot to a new dict.""" 111 """Copies the key-value pairs required by a tester bot to a new dict."""
122 result = {} 112 result = {}
123 required_test_properties = { 113 required_test_properties = {
124 'truncate_percent', 114 'truncate_percent',
125 'metric', 115 'metric',
116 'max_time_minutes',
126 'command', 117 'command',
118 'repeat_count',
127 'test_type' 119 'test_type'
128 } 120 }
129 for k, v in self.bisector.bisect_config.iteritems(): 121 for k, v in self.bisector.bisect_config.iteritems():
130 if k in required_test_properties: 122 if k in required_test_properties:
131 result[k] = v 123 result[k] = v
132 result['repeat_count'] = self.repeat_count
133 self._test_config = result 124 self._test_config = result
134 return result 125 return result
135 126
136 def _do_test(self): 127 def _do_test(self):
137 """Triggers tests for a revision, either locally or via try job. 128 """Triggers tests for a revision, either locally or via try job.
138 129
139 If local testing is enabled (i.e. director/tester merged) then 130 If local testing is enabled (i.e. director/tester merged) then
140 the test will be run on the same machine. Otherwise, this posts 131 the test will be run on the same machine. Otherwise, this posts
141 a request to buildbot to download and perf-test this build. 132 a request to buildbot to download and perf-test this build.
142 """ 133 """
(...skipping 14 matching lines...) Expand all
157 'bisect_config': self._get_bisect_config_for_tester(), 148 'bisect_config': self._get_bisect_config_for_tester(),
158 'job_name': self.job_name, 149 'job_name': self.job_name,
159 }, 150 },
160 } 151 }
161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( 152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results(
162 self.job_name): # pragma: no cover 153 self.job_name): # pragma: no cover
163 return 154 return
164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + 155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL +
165 self.job_name + '.results') 156 self.job_name + '.results')
166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover 157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover
167 skip_download = self.bisector.last_tested_revision == self
168 self.bisector.last_tested_revision = self
169 overrides = perf_test_properties['properties'] 158 overrides = perf_test_properties['properties']
170 api.run_local_test_run(api.m, overrides, skip_download=skip_download) 159 api.run_local_test_run(api.m, overrides)
171 else: 160 else:
172 step_name = 'Triggering test job for ' + str(self.revision_string) 161 step_name = 'Triggering test job for ' + str(self.revision_string)
173 api.m.trigger(perf_test_properties, name=step_name) 162 api.m.trigger(perf_test_properties, name=step_name)
174 163
175 def get_next_url(self): 164 def get_next_url(self):
176 """Returns a GS URL for checking progress of a build or test.""" 165 """Returns a GS URL for checking progress of a build or test."""
177 if self.status == PerfRevisionState.BUILDING: 166 if self.status == PerfRevisionState.BUILDING:
178 return self.build_url 167 return self.build_url
179 if self.status == PerfRevisionState.TESTING: 168 if self.status == PerfRevisionState.TESTING:
180 return self.test_results_url 169 return self.test_results_url
(...skipping 14 matching lines...) Expand all
195 builder = self.bisector.get_builder_bot_for_this_platform() 184 builder = self.bisector.get_builder_bot_for_this_platform()
196 if self.status == PerfRevisionState.TESTING: 185 if self.status == PerfRevisionState.TESTING:
197 builder = self.bisector.get_perf_tester_name() 186 builder = self.bisector.get_perf_tester_name()
198 return { 187 return {
199 'type': 'buildbot', 188 'type': 'buildbot',
200 'master': master, 189 'master': master,
201 'builder': builder, 190 'builder': builder,
202 'job_name': self.job_name, 191 'job_name': self.job_name,
203 } 192 }
204 193
205 def retest(self): # pragma: no cover
206 # We need at least 5 samples for applying Mann-Whitney U test
207 # with P < 0.01, two-tailed.
208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5))
209 self.status = PerfRevisionState.NEED_MORE_DATA
210 self.repeat_count = target_sample_size - len(self.values)
211 self.start_job()
212 self.bisector.wait_for_any([self])
213
214 def _get_test_results(self): 194 def _get_test_results(self):
215 """Tries to get the results of a test job from cloud storage.""" 195 """Tries to get the results of a test job from cloud storage."""
216 api = self.bisector.api 196 api = self.bisector.api
217 try: 197 try:
218 stdout = api.m.raw_io.output() 198 stdout = api.m.raw_io.output()
219 name = 'Get test results for build ' + self.commit_hash 199 name = 'Get test results for build ' + self.commit_hash
220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, 200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
221 name=name) 201 name=name)
222 except api.m.step.StepFailure: # pragma: no cover 202 except api.m.step.StepFailure: # pragma: no cover
223 self.bisector.surface_result('TEST_FAILURE') 203 self.bisector.surface_result('TEST_FAILURE')
224 return None 204 return None
225 else: 205 else:
226 return json.loads(step_result.stdout) 206 return json.loads(step_result.stdout)
227 207
228 def _check_revision_good(self): 208 def _check_revision_good(self):
229 """Determines if a revision is good or bad. 209 """Determines if a revision is good or bad.
230 210
231 Iteratively increment the sample size of the revision being tested, the last 211 Note that our current approach is to determine whether it is closer to
232 known good revision, and the first known bad revision until a relationship 212 either the 'good' and 'bad' revisions given for the bisect job.
233 of significant difference can be established between the results of the
234 revision being tested and one of the other two.
235
236 If the results do not converge towards finding a significant difference in
237 either direction, this is expected to timeout eventually. This scenario
238 should be rather rare, since it is expected that the fkbr and lkgr are
239 significantly different as a precondition.
240 213
241 Returns: 214 Returns:
242 True if the results of testing this revision are significantly different 215 True if this revision is closer to the initial good revision's value than
243 from those of testing the earliest known bad revision. 216 to the initial bad revision's value. False otherwise.
244 False if they are instead significantly different from those of testing
245 the latest known good revision.
246 """ 217 """
247 218 # TODO: Reevaluate this approach
248 while True: 219 bisector = self.bisector
249 diff_from_good = self.bisector.significantly_different( 220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)
250 self.bisector.lkgr.values, self.values) 221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)
251 diff_from_bad = self.bisector.significantly_different( 222 if distance_to_good < distance_to_bad:
252 self.bisector.fkbr.values, self.values) 223 return True
253 224 return False
254 if diff_from_good and diff_from_bad:
255 # Multiple regressions.
256 # For now, proceed bisecting the biggest difference of the means.
257 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
258 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
259 if dist_from_good > dist_from_bad:
260 # TODO(robertocn): Add way to handle the secondary regression
261 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
262 return False
263 else:
264 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
265 return True
266
267 if diff_from_good or diff_from_bad: # pragma: no cover
268 return diff_from_bad
269
270 self._next_retest() # pragma: no cover
271
272
273 def _next_retest(self): # pragma: no cover
274 """Chooses one of current, lkgr, fkbr to retest.
275
276 Look for the smallest sample and retest that. If the last tested revision
277 is tied for the smallest sample, use that to take advantage of the fact
278 that it is already downloaded and unzipped.
279 """
280 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr,
281 key=lambda x: len(x.values))
282 if (len(self.bisector.last_tested_revision.values) ==
283 next_revision_to_test.values):
284 self.bisector.last_tested_revision.retest()
285 else:
286 next_revision_to_test.retest()
287 225
288 def __repr__(self): 226 def __repr__(self):
289 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % 227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %
290 (self.commit_pos, self.values, self.mean_value, self.std_dev)) 228 (self.commit_pos, self.values, self.mean_value, self.std_dev))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698