| OLD | NEW |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import math |
| 6 import tempfile | 7 import tempfile |
| 7 import os | 8 import os |
| 8 import uuid | 9 import uuid |
| 9 | 10 |
| 10 from . import revision_state | 11 from . import revision_state |
| 11 | 12 |
| 12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover | 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover |
| 13 from . import test_results_cache | 14 from . import test_results_cache |
| 14 | 15 |
| 16 # These relate to how to increase the number of repetitions during re-test |
| 17 MINIMUM_SAMPLE_SIZE = 5 |
| 18 INCREASE_FACTOR = 1.5 |
| 15 | 19 |
| 16 class PerfRevisionState(revision_state.RevisionState): | 20 class PerfRevisionState(revision_state.RevisionState): |
| 17 """Contains the state and results for one revision in a perf bisect job.""" | 21 """Contains the state and results for one revision in a perf bisect job.""" |
| 18 | 22 |
| 19 def __init__(self, *args, **kwargs): | 23 def __init__(self, *args, **kwargs): |
| 20 super(PerfRevisionState, self).__init__(*args, **kwargs) | 24 super(PerfRevisionState, self).__init__(*args, **kwargs) |
| 21 self.values = [] | 25 self.values = [] |
| 22 self.mean_value = None | 26 self.mean_value = None |
| 23 self.std_dev = None | 27 self.std_dev = None |
| 28 self.repeat_count = MINIMUM_SAMPLE_SIZE |
| 24 self._test_config = None | 29 self._test_config = None |
| 25 | 30 |
| 26 def _read_test_results(self): | 31 def _read_test_results(self, check_revision_goodness=True): |
| 27 """Gets the test results from GS and checks if the rev is good or bad.""" | 32 """Gets the test results from GS and checks if the rev is good or bad.""" |
| 28 test_results = self._get_test_results() | 33 test_results = self._get_test_results() |
| 29 # Results will contain the keys 'results' and 'output' where output is the | 34 # Results will contain the keys 'results' and 'output' where output is the |
| 30 # stdout of the command, and 'results' is itself a dict with the key | 35 # stdout of the command, and 'results' is itself a dict with the key |
| 31 # 'values' unless the test failed, in which case 'results' will contain | 36 # 'values' unless the test failed, in which case 'results' will contain |
| 32 # the 'error' key explaining the type of error. | 37 # the 'error' key explaining the type of error. |
| 33 results = test_results['results'] | 38 results = test_results['results'] |
| 34 if results.get('errors'): | 39 if results.get('errors'): |
| 35 self.status = PerfRevisionState.FAILED | 40 self.status = PerfRevisionState.FAILED |
| 36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover | 41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover |
| 37 self.bisector.surface_result('MISSING_METRIC') | 42 self.bisector.surface_result('MISSING_METRIC') |
| 38 return | 43 return |
| 39 self.values = results['values'] | 44 self.values += results['values'] |
| 40 if self.bisector.is_return_code_mode(): | 45 if self.bisector.is_return_code_mode(): |
| 41 retcodes = test_results['retcodes'] | 46 retcodes = test_results['retcodes'] |
| 42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 | 47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 |
| 43 self.mean_value = overall_return_code | 48 self.mean_value = overall_return_code |
| 44 elif self.values: | 49 elif self.values: |
| 45 api = self.bisector.api | 50 api = self.bisector.api |
| 46 self.mean_value = api.m.math_utils.mean(self.values) | 51 self.mean_value = api.m.math_utils.mean(self.values) |
| 47 self.std_dev = api.m.math_utils.standard_deviation(self.values) | 52 self.std_dev = api.m.math_utils.standard_deviation(self.values) |
| 48 # Values were not found, but the test did not otherwise fail. | 53 # Values were not found, but the test did not otherwise fail. |
| 49 else: | 54 else: |
| 50 self.status = PerfRevisionState.FAILED | 55 self.status = PerfRevisionState.FAILED |
| 51 self.bisector.surface_result('MISSING_METRIC') | 56 self.bisector.surface_result('MISSING_METRIC') |
| 52 return | 57 return |
| 58 # If we have already decided on the goodness of this revision, we shouldn't |
| 59 # recheck it. |
| 60 if self.good or self.bad: |
| 61 check_revision_goodness = False |
| 53 # We cannot test the goodness of the initial rev range. | 62 # We cannot test the goodness of the initial rev range. |
| 54 if self.bisector.good_rev != self and self.bisector.bad_rev != self: | 63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and |
| 64 check_revision_goodness): |
| 55 if self._check_revision_good(): | 65 if self._check_revision_good(): |
| 56 self.good = True | 66 self.good = True |
| 57 else: | 67 else: |
| 58 self.bad = True | 68 self.bad = True |
| 59 | 69 |
| 60 def _write_deps_patch_file(self, build_name): | 70 def _write_deps_patch_file(self, build_name): |
| 61 """Saves the DEPS patch in a temp location and returns the file path.""" | 71 """Saves the DEPS patch in a temp location and returns the file path.""" |
| 62 api = self.bisector.api | 72 api = self.bisector.api |
| 63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) | 73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) |
| 64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), | 74 api.m.file.write('Saving diff patch for ' + str(self.revision_string), |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 106 api.m.file.remove('cleaning up patch', self.patch_file) | 116 api.m.file.remove('cleaning up patch', self.patch_file) |
| 107 except api.m.step.StepFailure: # pragma: no cover | 117 except api.m.step.StepFailure: # pragma: no cover |
| 108 print 'Could not clean up ' + self.patch_file | 118 print 'Could not clean up ' + self.patch_file |
| 109 | 119 |
| 110 def _get_bisect_config_for_tester(self): | 120 def _get_bisect_config_for_tester(self): |
| 111 """Copies the key-value pairs required by a tester bot to a new dict.""" | 121 """Copies the key-value pairs required by a tester bot to a new dict.""" |
| 112 result = {} | 122 result = {} |
| 113 required_test_properties = { | 123 required_test_properties = { |
| 114 'truncate_percent', | 124 'truncate_percent', |
| 115 'metric', | 125 'metric', |
| 116 'max_time_minutes', | |
| 117 'command', | 126 'command', |
| 118 'repeat_count', | |
| 119 'test_type' | 127 'test_type' |
| 120 } | 128 } |
| 121 for k, v in self.bisector.bisect_config.iteritems(): | 129 for k, v in self.bisector.bisect_config.iteritems(): |
| 122 if k in required_test_properties: | 130 if k in required_test_properties: |
| 123 result[k] = v | 131 result[k] = v |
| 132 result['repeat_count'] = self.repeat_count |
| 124 self._test_config = result | 133 self._test_config = result |
| 125 return result | 134 return result |
| 126 | 135 |
| 127 def _do_test(self): | 136 def _do_test(self): |
| 128 """Triggers tests for a revision, either locally or via try job. | 137 """Triggers tests for a revision, either locally or via try job. |
| 129 | 138 |
| 130 If local testing is enabled (i.e. director/tester merged) then | 139 If local testing is enabled (i.e. director/tester merged) then |
| 131 the test will be run on the same machine. Otherwise, this posts | 140 the test will be run on the same machine. Otherwise, this posts |
| 132 a request to buildbot to download and perf-test this build. | 141 a request to buildbot to download and perf-test this build. |
| 133 """ | 142 """ |
| (...skipping 14 matching lines...) Expand all Loading... |
| 148 'bisect_config': self._get_bisect_config_for_tester(), | 157 'bisect_config': self._get_bisect_config_for_tester(), |
| 149 'job_name': self.job_name, | 158 'job_name': self.job_name, |
| 150 }, | 159 }, |
| 151 } | 160 } |
| 152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( | 161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( |
| 153 self.job_name): # pragma: no cover | 162 self.job_name): # pragma: no cover |
| 154 return | 163 return |
| 155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + | 164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + |
| 156 self.job_name + '.results') | 165 self.job_name + '.results') |
| 157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover | 166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover |
| 167 skip_download = self.bisector.last_tested_revision == self |
| 168 self.bisector.last_tested_revision = self |
| 158 overrides = perf_test_properties['properties'] | 169 overrides = perf_test_properties['properties'] |
| 159 api.run_local_test_run(api.m, overrides) | 170 api.run_local_test_run(api.m, overrides, skip_download=skip_download) |
| 160 else: | 171 else: |
| 161 step_name = 'Triggering test job for ' + str(self.revision_string) | 172 step_name = 'Triggering test job for ' + str(self.revision_string) |
| 162 api.m.trigger(perf_test_properties, name=step_name) | 173 api.m.trigger(perf_test_properties, name=step_name) |
| 163 | 174 |
| 164 def get_next_url(self): | 175 def get_next_url(self): |
| 165 """Returns a GS URL for checking progress of a build or test.""" | 176 """Returns a GS URL for checking progress of a build or test.""" |
| 166 if self.status == PerfRevisionState.BUILDING: | 177 if self.status == PerfRevisionState.BUILDING: |
| 167 return self.build_url | 178 return self.build_url |
| 168 if self.status == PerfRevisionState.TESTING: | 179 if self.status == PerfRevisionState.TESTING: |
| 169 return self.test_results_url | 180 return self.test_results_url |
| (...skipping 14 matching lines...) Expand all Loading... |
| 184 builder = self.bisector.get_builder_bot_for_this_platform() | 195 builder = self.bisector.get_builder_bot_for_this_platform() |
| 185 if self.status == PerfRevisionState.TESTING: | 196 if self.status == PerfRevisionState.TESTING: |
| 186 builder = self.bisector.get_perf_tester_name() | 197 builder = self.bisector.get_perf_tester_name() |
| 187 return { | 198 return { |
| 188 'type': 'buildbot', | 199 'type': 'buildbot', |
| 189 'master': master, | 200 'master': master, |
| 190 'builder': builder, | 201 'builder': builder, |
| 191 'job_name': self.job_name, | 202 'job_name': self.job_name, |
| 192 } | 203 } |
| 193 | 204 |
| 205 def retest(self): # pragma: no cover |
| 206 # We need at least 5 samples for applying Mann-Whitney U test |
| 207 # with P < 0.01, two-tailed. |
| 208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5)) |
| 209 self.status = PerfRevisionState.NEED_MORE_DATA |
| 210 self.repeat_count = target_sample_size - len(self.values) |
| 211 self.start_job() |
| 212 self.bisector.wait_for_any([self]) |
| 213 |
| 194 def _get_test_results(self): | 214 def _get_test_results(self): |
| 195 """Tries to get the results of a test job from cloud storage.""" | 215 """Tries to get the results of a test job from cloud storage.""" |
| 196 api = self.bisector.api | 216 api = self.bisector.api |
| 197 try: | 217 try: |
| 198 stdout = api.m.raw_io.output() | 218 stdout = api.m.raw_io.output() |
| 199 name = 'Get test results for build ' + self.commit_hash | 219 name = 'Get test results for build ' + self.commit_hash |
| 200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, | 220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, |
| 201 name=name) | 221 name=name) |
| 202 except api.m.step.StepFailure: # pragma: no cover | 222 except api.m.step.StepFailure: # pragma: no cover |
| 203 self.bisector.surface_result('TEST_FAILURE') | 223 self.bisector.surface_result('TEST_FAILURE') |
| 204 return None | 224 return None |
| 205 else: | 225 else: |
| 206 return json.loads(step_result.stdout) | 226 return json.loads(step_result.stdout) |
| 207 | 227 |
| 208 def _check_revision_good(self): | 228 def _check_revision_good(self): |
| 209 """Determines if a revision is good or bad. | 229 """Determines if a revision is good or bad. |
| 210 | 230 |
| 211 Note that our current approach is to determine whether it is closer to | 231 Iteratively increment the sample size of the revision being tested, the last |
| 212 either the 'good' and 'bad' revisions given for the bisect job. | 232 known good revision, and the first known bad revision until a relationship |
| 233 # of significant difference can be established between the results of the |
| 234 revision being tested and one of the other two. |
| 235 |
| 236 If the results do not converge towards finding a significant difference in |
| 237 either direction, this is expected to timeout eventually. This scenario |
| 238 should be rather rare, since it is expected that the fkbr and lkgr are |
| 239 significantly different as a precondition. |
| 213 | 240 |
| 214 Returns: | 241 Returns: |
| 215 True if this revision is closer to the initial good revision's value than | 242 True if the results of testing this revision are significantly different |
| 216 to the initial bad revision's value. False otherwise. | 243 from those of testing the earliest known bad revision. |
| 244 False if they are instead significantly different from those of testing |
| 245 the latest known good revision. |
| 217 """ | 246 """ |
| 218 # TODO: Reevaluate this approach | 247 if self.bisector.is_return_code_mode(): |
| 219 bisector = self.bisector | 248 return self.mean_value == 0 |
| 220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) | 249 |
| 221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) | 250 while True: |
| 222 if distance_to_good < distance_to_bad: | 251 diff_from_good = self.bisector.significantly_different( |
| 223 return True | 252 self.bisector.lkgr.values, self.values) |
| 224 return False | 253 diff_from_bad = self.bisector.significantly_different( |
| 254 self.bisector.fkbr.values, self.values) |
| 255 |
| 256 if diff_from_good and diff_from_bad: |
| 257 # Multiple regressions. |
| 258 # For now, proceed bisecting the biggest difference of the means. |
| 259 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value) |
| 260 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value) |
| 261 if dist_from_good > dist_from_bad: |
| 262 # TODO(robertocn): Add way to handle the secondary regression |
| 263 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr) |
| 264 return False |
| 265 else: |
| 266 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self) |
| 267 return True |
| 268 |
| 269 if diff_from_good or diff_from_bad: # pragma: no cover |
| 270 return diff_from_bad |
| 271 |
| 272 self._next_retest() # pragma: no cover |
| 273 |
| 274 |
| 275 def _next_retest(self): # pragma: no cover |
| 276 """Chooses one of current, lkgr, fkbr to retest. |
| 277 |
| 278 Look for the smallest sample and retest that. If the last tested revision |
| 279 is tied for the smallest sample, use that to take advantage of the fact |
| 280 that it is already downloaded and unzipped. |
| 281 """ |
| 282 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr, |
| 283 key=lambda x: len(x.values)) |
| 284 if (len(self.bisector.last_tested_revision.values) == |
| 285 next_revision_to_test.values): |
| 286 self.bisector.last_tested_revision.retest() |
| 287 else: |
| 288 next_revision_to_test.retest() |
| 225 | 289 |
| 226 def __repr__(self): | 290 def __repr__(self): |
| 227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % | 291 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % |
| 228 (self.commit_pos, self.values, self.mean_value, self.std_dev)) | 292 (self.commit_pos, self.values, self.mean_value, self.std_dev)) |
| OLD | NEW |