| OLD | NEW |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import math | |
| 7 import tempfile | 6 import tempfile |
| 8 import os | 7 import os |
| 9 import uuid | 8 import uuid |
| 10 | 9 |
| 11 from . import revision_state | 10 from . import revision_state |
| 12 | 11 |
| 13 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover | 12 if 'CACHE_TEST_RESULTS' in os.environ: # pragma: no cover |
| 14 from . import test_results_cache | 13 from . import test_results_cache |
| 15 | 14 |
| 16 # These constants control how the number of repetitions grows during re-test. | |
| 17 MINIMUM_SAMPLE_SIZE = 5 | |
| 18 INCREASE_FACTOR = 1.5 | |
| 19 | 15 |
| 20 class PerfRevisionState(revision_state.RevisionState): | 16 class PerfRevisionState(revision_state.RevisionState): |
| 21 """Contains the state and results for one revision in a perf bisect job.""" | 17 """Contains the state and results for one revision in a perf bisect job.""" |
| 22 | 18 |
| 23 def __init__(self, *args, **kwargs): | 19 def __init__(self, *args, **kwargs): |
| 24 super(PerfRevisionState, self).__init__(*args, **kwargs) | 20 super(PerfRevisionState, self).__init__(*args, **kwargs) |
| 25 self.values = [] | 21 self.values = [] |
| 26 self.mean_value = None | 22 self.mean_value = None |
| 27 self.std_dev = None | 23 self.std_dev = None |
| 28 self.repeat_count = MINIMUM_SAMPLE_SIZE | |
| 29 self._test_config = None | 24 self._test_config = None |
| 30 | 25 |
| 31 def _read_test_results(self, check_revision_goodness=True): | 26 def _read_test_results(self): |
| 32 """Gets the test results from GS and checks if the rev is good or bad.""" | 27 """Gets the test results from GS and checks if the rev is good or bad.""" |
| 33 test_results = self._get_test_results() | 28 test_results = self._get_test_results() |
| 34 # Results will contain the keys 'results' and 'output' where output is the | 29 # Results will contain the keys 'results' and 'output' where output is the |
| 35 # stdout of the command, and 'results' is itself a dict with the key | 30 # stdout of the command, and 'results' is itself a dict with the key |
| 36 # 'values' unless the test failed, in which case 'results' will contain | 31 # 'values' unless the test failed, in which case 'results' will contain |
| 37 # the 'error' key explaining the type of error. | 32 # the 'error' key explaining the type of error. |
| 38 results = test_results['results'] | 33 results = test_results['results'] |
| 39 if results.get('errors'): | 34 if results.get('errors'): |
| 40 self.status = PerfRevisionState.FAILED | 35 self.status = PerfRevisionState.FAILED |
| 41 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover | 36 if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover |
| 42 self.bisector.surface_result('MISSING_METRIC') | 37 self.bisector.surface_result('MISSING_METRIC') |
| 43 return | 38 return |
| 44 self.values += results['values'] | 39 self.values = results['values'] |
| 45 if self.bisector.is_return_code_mode(): | 40 if self.bisector.is_return_code_mode(): |
| 46 retcodes = test_results['retcodes'] | 41 retcodes = test_results['retcodes'] |
| 47 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 | 42 overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 |
| 48 self.mean_value = overall_return_code | 43 self.mean_value = overall_return_code |
| 49 elif self.values: | 44 elif self.values: |
| 50 api = self.bisector.api | 45 api = self.bisector.api |
| 51 self.mean_value = api.m.math_utils.mean(self.values) | 46 self.mean_value = api.m.math_utils.mean(self.values) |
| 52 self.std_dev = api.m.math_utils.standard_deviation(self.values) | 47 self.std_dev = api.m.math_utils.standard_deviation(self.values) |
| 53 # Values were not found, but the test did not otherwise fail. | 48 # Values were not found, but the test did not otherwise fail. |
| 54 else: | 49 else: |
| 55 self.status = PerfRevisionState.FAILED | 50 self.status = PerfRevisionState.FAILED |
| 56 self.bisector.surface_result('MISSING_METRIC') | 51 self.bisector.surface_result('MISSING_METRIC') |
| 57 return | 52 return |
| 58 # If we have already decided on the goodness of this revision, we shouldn't | |
| 59 # recheck it. | |
| 60 if self.good or self.bad: | |
| 61 check_revision_goodness = False | |
| 62 # We cannot test the goodness of the initial rev range. | 53 # We cannot test the goodness of the initial rev range. |
| 63 if (self.bisector.good_rev != self and self.bisector.bad_rev != self and | 54 if self.bisector.good_rev != self and self.bisector.bad_rev != self: |
| 64 check_revision_goodness): | |
| 65 if self._check_revision_good(): | 55 if self._check_revision_good(): |
| 66 self.good = True | 56 self.good = True |
| 67 else: | 57 else: |
| 68 self.bad = True | 58 self.bad = True |
| 69 | 59 |
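The payload parsed here is a JSON blob fetched from cloud storage. Its exact schema is not shown in this diff, but the keys the method reads ('results', 'output', 'values', 'errors', 'retcodes') imply roughly the shape below; this is a minimal stand-alone sketch with made-up numbers, not the recipe's actual format.

```python
import json

# Hypothetical payload, inferred from the keys _read_test_results reads.
raw = json.dumps({
    'results': {'values': [7.1, 7.3, 6.9]},   # on failure: {'errors': [...]}
    'output': '...benchmark stdout...',
    'retcodes': [0, 0, 0],
})

def summarize(raw_json):
    """Sketch of the parsing logic above, without the recipe API."""
    test_results = json.loads(raw_json)
    results = test_results['results']
    if results.get('errors'):
        return {'status': 'FAILED', 'errors': results['errors']}
    values = results['values']
    mean = sum(values) / len(values)
    # Population standard deviation; math_utils may use the sample form.
    std_dev = (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5
    return {'values': values, 'mean': mean, 'std_dev': std_dev}

print(summarize(raw))
```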
| 70 def _write_deps_patch_file(self, build_name): | 60 def _write_deps_patch_file(self, build_name): |
| 71 """Saves the DEPS patch in a temp location and returns the file path.""" | 61 """Saves the DEPS patch in a temp location and returns the file path.""" |
| 72 api = self.bisector.api | 62 api = self.bisector.api |
| 73 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) | 63 file_name = str(api.m.path['tmp_base'].join(build_name + '.diff')) |
| 74 api.m.file.write('Saving diff patch for ' + str(self.revision_string), | 64 api.m.file.write('Saving diff patch for ' + str(self.revision_string), |
| (...skipping 41 matching lines...) | |
| 116 api.m.file.remove('cleaning up patch', self.patch_file) | 106 api.m.file.remove('cleaning up patch', self.patch_file) |
| 117 except api.m.step.StepFailure: # pragma: no cover | 107 except api.m.step.StepFailure: # pragma: no cover |
| 118 print 'Could not clean up ' + self.patch_file | 108 print 'Could not clean up ' + self.patch_file |
| 119 | 109 |
| 120 def _get_bisect_config_for_tester(self): | 110 def _get_bisect_config_for_tester(self): |
| 121 """Copies the key-value pairs required by a tester bot to a new dict.""" | 111 """Copies the key-value pairs required by a tester bot to a new dict.""" |
| 122 result = {} | 112 result = {} |
| 123 required_test_properties = { | 113 required_test_properties = { |
| 124 'truncate_percent', | 114 'truncate_percent', |
| 125 'metric', | 115 'metric', |
| 116 'max_time_minutes', |
| 126 'command', | 117 'command', |
| 118 'repeat_count', |
| 127 'test_type' | 119 'test_type' |
| 128 } | 120 } |
| 129 for k, v in self.bisector.bisect_config.iteritems(): | 121 for k, v in self.bisector.bisect_config.iteritems(): |
| 130 if k in required_test_properties: | 122 if k in required_test_properties: |
| 131 result[k] = v | 123 result[k] = v |
| 132 result['repeat_count'] = self.repeat_count | |
| 133 self._test_config = result | 124 self._test_config = result |
| 134 return result | 125 return result |
| 135 | 126 |
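The whitelist copy is plain dict filtering, sketched stand-alone below with the same property names; the sample config values are illustrative only.

```python
REQUIRED_TEST_PROPERTIES = {
    'truncate_percent', 'metric', 'max_time_minutes',
    'command', 'repeat_count', 'test_type',
}

def config_for_tester(bisect_config):
    # Copy only the keys a tester bot needs; drop everything else.
    return {k: v for k, v in bisect_config.items()
            if k in REQUIRED_TEST_PROPERTIES}

print(config_for_tester({
    'metric': 'page_load_time/load',              # kept
    'command': './tools/perf/run_benchmark foo',  # kept
    'repeat_count': 20,                           # kept
    'good_revision': 'abc123',                    # dropped: not a tester property
}))
```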
| 136 def _do_test(self): | 127 def _do_test(self): |
| 137 """Triggers tests for a revision, either locally or via try job. | 128 """Triggers tests for a revision, either locally or via try job. |
| 138 | 129 |
| 139 If local testing is enabled (i.e. director/tester merged) then | 130 If local testing is enabled (i.e. director/tester merged) then |
| 140 the test will be run on the same machine. Otherwise, this posts | 131 the test will be run on the same machine. Otherwise, this posts |
| 141 a request to buildbot to download and perf-test this build. | 132 a request to buildbot to download and perf-test this build. |
| 142 """ | 133 """ |
| (...skipping 14 matching lines...) | |
| 157 'bisect_config': self._get_bisect_config_for_tester(), | 148 'bisect_config': self._get_bisect_config_for_tester(), |
| 158 'job_name': self.job_name, | 149 'job_name': self.job_name, |
| 159 }, | 150 }, |
| 160 } | 151 } |
| 161 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( | 152 if 'CACHE_TEST_RESULTS' in os.environ and test_results_cache.has_results( |
| 162 self.job_name): # pragma: no cover | 153 self.job_name): # pragma: no cover |
| 163 return | 154 return |
| 164 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + | 155 self.test_results_url = (self.bisector.api.GS_RESULTS_URL + |
| 165 self.job_name + '.results') | 156 self.job_name + '.results') |
| 166 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover | 157 if api.m.bisect_tester.local_test_enabled(): # pragma: no cover |
| 167 skip_download = self.bisector.last_tested_revision == self | |
| 168 self.bisector.last_tested_revision = self | |
| 169 overrides = perf_test_properties['properties'] | 158 overrides = perf_test_properties['properties'] |
| 170 api.run_local_test_run(api.m, overrides, skip_download=skip_download) | 159 api.run_local_test_run(api.m, overrides) |
| 171 else: | 160 else: |
| 172 step_name = 'Triggering test job for ' + str(self.revision_string) | 161 step_name = 'Triggering test job for ' + str(self.revision_string) |
| 173 api.m.trigger(perf_test_properties, name=step_name) | 162 api.m.trigger(perf_test_properties, name=step_name) |
| 174 | 163 |
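The trigger payload assembled in this method (its first half is collapsed above) is an ordinary properties dict. A hedged sketch of its plausible shape follows; the builder name, bucket, and hash are placeholder assumptions, not values taken from this recipe.

```python
import uuid

job_name = uuid.uuid4().hex  # unique id linking the triggered job to its results

perf_test_properties = {
    'builder_name': 'linux_perf_bisect_tester',  # assumed; really from
                                                 # get_perf_tester_name()
    'properties': {
        'revision': '0' * 40,                    # placeholder commit hash
        'bisect_config': {'metric': 'page_load_time/load', 'repeat_count': 20},
        'job_name': job_name,
    },
}

# The tester writes results where the director later polls, e.g.:
test_results_url = 'gs://example-bucket/' + job_name + '.results'  # assumed bucket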
| 175 def get_next_url(self): | 164 def get_next_url(self): |
| 176 """Returns a GS URL for checking progress of a build or test.""" | 165 """Returns a GS URL for checking progress of a build or test.""" |
| 177 if self.status == PerfRevisionState.BUILDING: | 166 if self.status == PerfRevisionState.BUILDING: |
| 178 return self.build_url | 167 return self.build_url |
| 179 if self.status == PerfRevisionState.TESTING: | 168 if self.status == PerfRevisionState.TESTING: |
| 180 return self.test_results_url | 169 return self.test_results_url |
| (...skipping 14 matching lines...) | |
| 195 builder = self.bisector.get_builder_bot_for_this_platform() | 184 builder = self.bisector.get_builder_bot_for_this_platform() |
| 196 if self.status == PerfRevisionState.TESTING: | 185 if self.status == PerfRevisionState.TESTING: |
| 197 builder = self.bisector.get_perf_tester_name() | 186 builder = self.bisector.get_perf_tester_name() |
| 198 return { | 187 return { |
| 199 'type': 'buildbot', | 188 'type': 'buildbot', |
| 200 'master': master, | 189 'master': master, |
| 201 'builder': builder, | 190 'builder': builder, |
| 202 'job_name': self.job_name, | 191 'job_name': self.job_name, |
| 203 } | 192 } |
| 204 | 193 |
| 205 def retest(self): # pragma: no cover | |
| 206 # We need at least 5 samples for applying Mann-Whitney U test | |
| 207 # with P < 0.01, two-tailed. | |
| 208 target_sample_size = max(5, math.ceil(len(self.values) * 1.5)) | |
| 209 self.status = PerfRevisionState.NEED_MORE_DATA | |
| 210 self.repeat_count = target_sample_size - len(self.values) | |
| 211 self.start_job() | |
| 212 self.bisector.wait_for_any([self]) | |
| 213 | |
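The arithmetic in the deleted retest() ties back to the MINIMUM_SAMPLE_SIZE and INCREASE_FACTOR constants removed at the top of the file: grow the pooled sample by 50% per pass, never below five values, so a two-tailed Mann-Whitney U test at p < 0.01 has enough data. A standalone sketch:

```python
import math

MINIMUM_SAMPLE_SIZE = 5  # Mann-Whitney U at p < 0.01, two-tailed, needs >= 5
INCREASE_FACTOR = 1.5    # grow the sample by 50% on each retest pass

def additional_repeats_needed(current_values):
    target = max(MINIMUM_SAMPLE_SIZE,
                 int(math.ceil(len(current_values) * INCREASE_FACTOR)))
    return target - len(current_values)

assert additional_repeats_needed([]) == 5          # bootstrap to the minimum
assert additional_repeats_needed([1.0] * 10) == 5  # 10 -> ceil(15): 5 more runs
```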
| 214 def _get_test_results(self): | 194 def _get_test_results(self): |
| 215 """Tries to get the results of a test job from cloud storage.""" | 195 """Tries to get the results of a test job from cloud storage.""" |
| 216 api = self.bisector.api | 196 api = self.bisector.api |
| 217 try: | 197 try: |
| 218 stdout = api.m.raw_io.output() | 198 stdout = api.m.raw_io.output() |
| 219 name = 'Get test results for build ' + self.commit_hash | 199 name = 'Get test results for build ' + self.commit_hash |
| 220 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, | 200 step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, |
| 221 name=name) | 201 name=name) |
| 222 except api.m.step.StepFailure: # pragma: no cover | 202 except api.m.step.StepFailure: # pragma: no cover |
| 223 self.bisector.surface_result('TEST_FAILURE') | 203 self.bisector.surface_result('TEST_FAILURE') |
| 224 return None | 204 return None |
| 225 else: | 205 else: |
| 226 return json.loads(step_result.stdout) | 206 return json.loads(step_result.stdout) |
| 227 | 207 |
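Outside the recipe framework, the same fetch-and-parse amounts to a single gsutil call. A plain-Python sketch, assuming the gsutil CLI is on PATH:

```python
import json
import subprocess

def fetch_test_results(results_url):
    """Sketch of gsutil.cat + json.loads without the recipe modules."""
    try:
        raw = subprocess.check_output(['gsutil', 'cat', results_url])
    except subprocess.CalledProcessError:
        return None  # mirrors surfacing TEST_FAILURE and returning None
    return json.loads(raw)
```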
| 228 def _check_revision_good(self): | 208 def _check_revision_good(self): |
| 229 """Determines if a revision is good or bad. | 209 """Determines if a revision is good or bad. |
| 230 | 210 |
| 231 Iteratively increment the sample size of the revision being tested, the last | 211 Note that our current approach is to determine whether it is closer to |
| 232 known good revision, and the first known bad revision until a relationship | 212 either the 'good' or the 'bad' revision given for the bisect job. |
| 233 of significant difference can be established between the results of the | |
| 234 revision being tested and one of the other two. | |
| 235 | |
| 236 If the results do not converge towards finding a significant difference in | |
| 237 either direction, this is expected to time out eventually. This scenario | |
| 238 should be rather rare, since it is expected that the fkbr and lkgr are | |
| 239 significantly different as a precondition. | |
| 240 | 213 |
| 241 Returns: | 214 Returns: |
| 242 True if the results of testing this revision are significantly different | 215 True if this revision is closer to the initial good revision's value than |
| 243 from those of testing the earliest known bad revision. | 216 to the initial bad revision's value. False otherwise. |
| 244 False if they are instead significantly different from those of testing | |
| 245 the latest known good revision. | |
| 246 """ | 217 """ |
| 247 | 218 # TODO: Reevaluate this approach |
| 248 while True: | 219 bisector = self.bisector |
| 249 diff_from_good = self.bisector.significantly_different( | 220 distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value) |
| 250 self.bisector.lkgr.values, self.values) | 221 distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value) |
| 251 diff_from_bad = self.bisector.significantly_different( | 222 if distance_to_good < distance_to_bad: |
| 252 self.bisector.fkbr.values, self.values) | 223 return True |
| 253 | 224 return False |
| 254 if diff_from_good and diff_from_bad: | |
| 255 # Multiple regressions. | |
| 256 # For now, proceed bisecting the biggest difference of the means. | |
| 257 dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value) | |
| 258 dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value) | |
| 259 if dist_from_good > dist_from_bad: | |
| 260 # TODO(robertocn): Add way to handle the secondary regression | |
| 261 #self.bisector.handle_secondary_regression(self, self.bisector.fkbr) | |
| 262 return False | |
| 263 else: | |
| 264 #self.bisector.handle_secondary_regression(self.bisector.lkgr, self) | |
| 265 return True | |
| 266 | |
| 267 if diff_from_good or diff_from_bad: # pragma: no cover | |
| 268 return diff_from_bad | |
| 269 | |
| 270 self._next_retest() # pragma: no cover | |
| 271 | |
| 272 | |
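This change swaps a statistical test for a nearest-mean rule. Both are sketched below, with scipy.stats.mannwhitneyu standing in for the bisector's significantly_different helper; that substitution is an assumption, since the helper's implementation is not shown in this diff.

```python
from scipy import stats

def closer_to_good(mean, good_mean, bad_mean):
    # NEW behaviour: classify by whichever endpoint mean is nearer.
    return abs(mean - good_mean) < abs(mean - bad_mean)

def significantly_different(sample_a, sample_b, alpha=0.01):
    # Stand-in for the OLD behaviour's helper (assumed implementation).
    _, p_value = stats.mannwhitneyu(sample_a, sample_b,
                                    alternative='two-sided')
    return p_value < alpha

print(closer_to_good(7.2, good_mean=7.0, bad_mean=9.5))  # True
```

The nearest-mean rule is cheaper and cannot loop, at the cost of sometimes misclassifying a noisy revision that a significance test would have sent back for more samples.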
| 273 def _next_retest(self): # pragma: no cover | |
| 274 """Chooses one of current, lkgr, fkbr to retest. | |
| 275 | |
| 276 Look for the smallest sample and retest that. If the last tested revision | |
| 277 is tied for the smallest sample, use that to take advantage of the fact | |
| 278 that it is already downloaded and unzipped. | |
| 279 """ | |
| 280 next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr, | |
| 281 key=lambda x: len(x.values)) | |
| 282 if (len(self.bisector.last_tested_revision.values) == | |
| 283 len(next_revision_to_test.values)): | |
| 284 self.bisector.last_tested_revision.retest() | |
| 285 else: | |
| 286 next_revision_to_test.retest() | |
| 287 | 225 |
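The deleted selection logic is a min-by-sample-size pick with a tie-break toward the already-downloaded build (note the comparison fixed above; it originally compared a length against the list itself). A self-contained sketch with a hypothetical Rev stand-in:

```python
def pick_next_retest(lkgr, current, fkbr, last_tested):
    """Retest whichever revision has the fewest samples; on a tie,
    prefer the last-tested one, whose build is already unpacked."""
    smallest = min(lkgr, current, fkbr, key=lambda r: len(r.values))
    if len(last_tested.values) == len(smallest.values):
        return last_tested
    return smallest

class Rev(object):  # hypothetical stand-in for PerfRevisionState
    def __init__(self, values):
        self.values = values

lkgr, cur, fkbr = Rev([1.0, 2.0, 3.0]), Rev([2.0]), Rev([4.0, 5.0])
assert pick_next_retest(lkgr, cur, fkbr, last_tested=cur) is cur
```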
| 288 def __repr__(self): | 226 def __repr__(self): |
| 289 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % | 227 return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' % |
| 290 (self.commit_pos, self.values, self.mean_value, self.std_dev)) | 228 (self.commit_pos, self.values, self.mean_value, self.std_dev)) |