Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import math | |
| 6 import tempfile | 7 import tempfile |
| 7 import os | 8 import os |
| 8 import uuid | 9 import uuid |
| 9 | 10 |
| 10 from . import revision_state | 11 from . import revision_state |
| 11 | 12 |
| 12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover | 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover |
| 13 from . import test_results_cache | 14 from . import test_results_cache |
| 14 | 15 |
| 16 # These relate to how to increase the number of repetitions during re-test | |
| 17 MINIMUM_SAMPLE_SIZE = 5 | |
| 18 INCREASE_FACTOR = 1.5 | |
| 15 | 19 |
| 16 class PerfRevisionState(revision_state.RevisionState): | 20 class PerfRevisionState(revision_state.RevisionState): |
| 17 """Contains the state and results for one revision in a perf bisect job.""" | 21 """Contains the state and results for one revision in a perf bisect job.""" |
| 18 | 22 |
| 19 def __init__(self, *args, **kwargs): | 23 def __init__(self, *args, **kwargs): |
| 20 super(PerfRevisionState, self).__init__(*args, **kwargs) | 24 super(PerfRevisionState, self).__init__(*args, **kwargs) |
| 21 self.values = [] | 25 self.values = [] |
| 22 self.mean_value = None | 26 self.mean_value = None |
| 23 self.std_dev = None | 27 self.std_dev = None |
| 28 self.repeat_count = MINIMUM_SAMPLE_SIZE | |
| 24 self._test_config = None | 29 self._test_config = None |
| 25 | 30 |
| 26 def _read_test_results(self): | 31 def _read_test_results(self, check_revision_goodness=True): |
| 27 """Gets the test results from GS and checks if the rev is good or bad.""" | 32 """Gets the test results from GS and checks if the rev is good or bad.""" |
| 28 test_results = self._get_test_results() | 33 test_results = self._get_test_results() |
| 29 # Results will contain the keys 'results' and 'output' where output is the | 34 # Results will contain the keys 'results' and 'output' where output is the |
| 30 # stdout of the command, and 'results' is itself a dict with the key | 35 # stdout of the command, and 'results' is itself a dict with the key |
| 31 # 'values' unless the test failed, in which case 'results' will contain | 36 # 'values' unless the test failed, in which case 'results' will contain |
| 32 # the 'error' key explaining the type of error. | 37 # the 'error' key explaining the type of error. |
| 33 results = test_results['results'] | 38 results = test_results['results'] |
| 34 if results.get('errors'): | 39 if results.get('errors'): |
| 35 self.status = PerfRevisionState.FAILED | 40 self.status = PerfRevisionState.FAILED |
| 36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover | 41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover |
| 37 self.bisector.surface_result('MISSING_METRIC') | 42 self.bisector.surface_result('MISSING_METRIC') |
| 38 return | 43 return |
| 39 self.values = results['values'] | 44 self.values = (self.values or []) + results['values'] |
|
dtu
2016/02/05 00:18:02
Why do you need this? Isn't self.values already in
| |
| 40 if self.bisector.is_return_code_mode(): | 45 if self.bisector.is_return_code_mode(): |
| 41 retcodes = test_results['retcodes'] | 46 retcodes = test_results['retcodes'] |
| 42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 | 47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 |
| 43 self.mean_value = overall_return_code | 48 self.mean_value = overall_return_code |
| 44 elif self.values: | 49 elif self.values: |
| 45 api = self.bisector.api | 50 api = self.bisector.api |
| 46 self.mean_value = api.m.math_utils.mean(self.values) | 51 self.mean_value = api.m.math_utils.mean(self.values) |
| 47 self.std_dev = api.m.math_utils.standard_deviation(self.values) | 52 self.std_dev = api.m.math_utils.standard_deviation(self.values) |
| 48 # Values were not found, but the test did not otherwise fail. | 53 # Values were not found, but the test did not otherwise fail. |
| 49 else: | 54 else: |
| 50 self.status = PerfRevisionState.FAILED | 55 self.status = PerfRevisionState.FAILED |
| 51 self.bisector.surface_result('MISSING_METRIC') | 56 self.bisector.surface_result('MISSING_METRIC') |
| 52 return | 57 return |
| 53 # We cannot test the goodness of the initial rev range. | 58 # We cannot test the goodness of the initial rev range. |
| 54 if self.bisector.good_rev != self and self.bisector.bad_rev != self: | 59 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and |
| 60 check_revision_goodness): | |
| 55 if self._check_revision_good(): | 61 if self._check_revision_good(): |
| 56 self.good = True | 62 self.good = True |
| 57 else: | 63 else: |
| 58 self.bad = True | 64 self.bad = True |
| 59 | 65 |
| 60 def _write_deps_patch_file(self, build_name): | 66 def _write_deps_patch_file(self, build_name): |
| 61 """Saves the DEPS patch in a temp location and returns the file path.""" | 67 """Saves the DEPS patch in a temp location and returns the file path.""" |
| 62 api = self.bisector.api | 68 api = self.bisector.api |
| 63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) | 69 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) |
| 64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), | 70 api.m.file.write('Saving diff patch for ' + str(self.revision_string), |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 106 api.m.file.remove('cleaning up patch', self.patch_file) | 112 api.m.file.remove('cleaning up patch', self.patch_file) |
| 107 except api.m.step.StepFailure: # pragma: no cover | 113 except api.m.step.StepFailure: # pragma: no cover |
| 108 print 'Could not clean up ' + self.patch_file | 114 print 'Could not clean up ' + self.patch_file |
| 109 | 115 |
| 110 def _get_bisect_config_for_tester(self): | 116 def _get_bisect_config_for_tester(self): |
| 111 """Copies the key-value pairs required by a tester bot to a new dict.""" | 117 """Copies the key-value pairs required by a tester bot to a new dict.""" |
| 112 result = {} | 118 result = {} |
| 113 required_test_properties = { | 119 required_test_properties = { |
| 114 'truncate_percent', | 120 'truncate_percent', |
| 115 'metric', | 121 'metric', |
| 116 'max_time_minutes', | |
| 117 'command', | 122 'command', |
| 118 'repeat_count', | |
| 119 'test_type' | 123 'test_type' |
| 120 } | 124 } |
| 121 for k, v in self.bisector.bisect_config.iteritems(): | 125 for k, v in self.bisector.bisect_config.iteritems(): |
| 122 if k in required_test_properties: | 126 if k in required_test_properties: |
| 123 result[k] = v | 127 result[k] = v |
| 128 result['repeat_count'] = self.repeat_count | |
| 124 self._test_config = result | 129 self._test_config = result |
| 125 return result | 130 return result |
| 126 | 131 |
| 127 def _do_test(self): | 132 def _do_test(self): |
| 128 """Triggers tests for a revision, either locally or via try job. | 133 """Triggers tests for a revision, either locally or via try job. |
| 129 | 134 |
| 130 If local testing is enabled (i.e. director/tester merged) then | 135 If local testing is enabled (i.e. director/tester merged) then |
| 131 the test will be run on the same machine. Otherwise, this posts | 136 the test will be run on the same machine. Otherwise, this posts |
| 132 a request to buildbot to download and perf-test this build. | 137 a request to buildbot to download and perf-test this build. |
| 133 """ | 138 """ |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 184 builder = self.bisector.get_builder_bot_for_this_platform() | 189 builder = self.bisector.get_builder_bot_for_this_platform() |
| 185 if self.status == PerfRevisionState.TESTING: | 190 if self.status == PerfRevisionState.TESTING: |
| 186 builder = self.bisector.get_perf_tester_name() | 191 builder = self.bisector.get_perf_tester_name() |
| 187 return { | 192 return { |
| 188 'type': 'buildbot', | 193 'type': 'buildbot', |
| 189 'master': master, | 194 'master': master, |
| 190 'builder': builder, | 195 'builder': builder, |
| 191 'job_name': self.job_name, | 196 'job_name': self.job_name, |
| 192 } | 197 } |
| 193 | 198 |
| 199 def retest(self): # pragma: no cover | |
| 200 # We need at least 5 samples for the hypothesis testing to work properly. | |
|
dtu
2016/02/05 00:18:02
Sorry, not strictly true as written. Only true wit
| |
| 201 target_sample_size = max(5, math.ceil(len(self.values) * 1.5)) | |
| 202 self.status = PerfRevisionState.NEED_MORE_DATA | |
| 203 self.repeat_count = target_sample_size - len(self.values) | |
| 204 self.start_job() | |
| 205 self.bisector.wait_for_any([self]) | |
| 206 | |
| 194 def _get_test_results(self): | 207 def _get_test_results(self): |
| 195 """Tries to get the results of a test job from cloud storage.""" | 208 """Tries to get the results of a test job from cloud storage.""" |
| 196 api = self.bisector.api | 209 api = self.bisector.api |
| 197 try: | 210 try: |
| 198 stdout = api.m.raw_io.output() | 211 stdout = api.m.raw_io.output() |
| 199 name = 'Get test results for build ' + self.commit_hash | 212 name = 'Get test results for build ' + self.commit_hash |
| 200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, | 213 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, |
| 201 name=name) | 214 name=name) |
| 202 except api.m.step.StepFailure: # pragma: no cover | 215 except api.m.step.StepFailure: # pragma: no cover |
| 203 self.bisector.surface_result('TEST_FAILURE') | 216 self.bisector.surface_result('TEST_FAILURE') |
| 204 return None | 217 return None |
| 205 else: | 218 else: |
| 206 return json.loads(step_result.stdout) | 219 return json.loads(step_result.stdout) |
| 207 | 220 |
| 208 def _check_revision_good(self): | 221 def _check_revision_good(self): |
| 209 """Determines if a revision is good or bad. | 222 """Determines if a revision is good or bad. |
| 210 | 223 |
| 211 Note that our current approach is to determine whether it is closer to | 224 Iteratively increment the sample size of the revision being tested, the last |
| 212 either the 'good' and 'bad' revisions given for the bisect job. | 225 known good revision, and the first known bad revision until a relationship |
| 226 of significant difference can be established betweeb the results of the | |
| 227 revision being tested and one of the other two. | |
| 228 | |
| 229 If the results do not converge towards finding a significant difference in | |
| 230 either direction, this is expected to timeout eventually. This scenario | |
| 231 should be rather rare, since it is expected that the fkbr and lkgr are | |
| 232 significantly different as a precondition. | |
| 213 | 233 |
| 214 Returns: | 234 Returns: |
| 215 True if this revision is closer to the initial good revision's value than | 235 True if the results of testing this revision are significantly different |
| 216 to the initial bad revision's value. False otherwise. | 236 from those of testing the earliest known bad revision. |
| 237 False if they are instead significantly different form those of testing | |
| 238 the latest knwon good revision. | |
| 217 """ | 239 """ |
| 218 # TODO: Reevaluate this approach | 240 diff_from_good = self.bisector.significantly_different( |
|
dtu
2016/02/05 00:18:02
If you reorganize this function, you can avoid the
| |
| 219 bisector = self.bisector | 241 self.bisector.lkgr.values, self.values) |
| 220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) | 242 diff_from_bad = self.bisector.significantly_different( |
| 221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) | 243 self.bisector.fkbr.values, self.values) |
| 222 if distance_to_good < distance_to_bad: | 244 |
| 223 return True | 245 while not (diff_from_good or diff_from_bad): # pragma: no cover |
| 224 return False | 246 min(self.bisector.lkgr, self, self.bisector.fkbr, |
| 247 key=lambda x: len(x.values)).retest() | |
| 248 diff_from_good = self.bisector.significantly_different( | |
| 249 self.bisector.lkgr.values, self.values) | |
| 250 diff_from_bad = self.bisector.significantly_different( | |
| 251 self.bisector.fkbr.values, self.values) | |
| 252 | |
| 253 if diff_from_good and diff_from_bad: | |
| 254 # Multiple regressions. | |
| 255 # For now, proceed bisecting the biggest difference of the means. | |
| 256 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value) | |
| 257 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value) | |
| 258 if dist_from_good > dist_from_bad: | |
| 259 # TODO(robertocn): Add way to handle the secondary regression | |
| 260 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr) | |
| 261 return False | |
| 262 else: | |
| 263 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self) | |
| 264 return True | |
| 265 return diff_from_bad # pragma: no cover | |
| 225 | 266 |
| 226 def __repr__(self): | 267 def __repr__(self): |
| 227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % | 268 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % |
| 228 (self.commit_pos, self.values, self.mean_value, self.std_dev)) | 269 (self.commit_pos, self.values, self.mean_value, self.std_dev)) |
| OLD | NEW |