| OLD | NEW |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import re | 6 import re |
| 7 import time |
| 7 | 8 |
| 8 from . import bisect_results | 9 from . import bisect_results |
| 9 from . import depot_config | 10 from . import depot_config |
| 10 from . import revision_state | 11 from . import revision_state |
| 11 | 12 |
| 12 _DEPS_SHA_PATCH = """ | 13 _DEPS_SHA_PATCH = """ |
| 13 diff --git DEPS.sha DEPS.sha | 14 diff --git DEPS.sha DEPS.sha |
| 14 new file mode 100644 | 15 new file mode 100644 |
| 15 --- /dev/null | 16 --- /dev/null |
| 16 +++ DEPS.sha | 17 +++ DEPS.sha |
| (...skipping 18 matching lines...) Expand all Loading... |
| 35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. | 36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. |
| 36 'MISSING_METRIC', # The metric was not found in the test text/json output. | 37 'MISSING_METRIC', # The metric was not found in the test text/json output. |
| 37 'LO_FINAL_CONF', # The bisect completed without a culprit. | 38 'LO_FINAL_CONF', # The bisect completed without a culprit. |
| 38 ) | 39 ) |
| 39 | 40 |
| 40 # When we look for the next revision to build, we search nearby revisions | 41 # When we look for the next revision to build, we search nearby revisions |
| 41 # looking for a revision that's already been archived. Since we don't want | 42 # looking for a revision that's already been archived. Since we don't want |
| 42 # to move *too* far from the original revision, we'll cap the search at 25%. | 43 # to move *too* far from the original revision, we'll cap the search at 25%. |
| 43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 | 44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| 44 | 45 |
| 46 # How long (in seconds) to keep re-testing the initial good-bad range until a |
| 47 # significant difference is established. |
| 48 REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60 |
| 49 # If we reach this number of samples on the reference range and have not |
| 50 # achieved statistical significance, bail. |
| 51 MAX_REQUIRED_SAMPLES = 50 |
| 52 |
| 53 # Significance level to use for determining difference between revisions via |
| 54 # hypothesis testing. |
| 55 SIGNIFICANCE_LEVEL = 0.01 |
| 56 |
| 45 | 57 |
| 46 class Bisector(object): | 58 class Bisector(object): |
| 47 """This class abstracts an ongoing bisect (or n-sect) job.""" | 59 """This class abstracts an ongoing bisect (or n-sect) job.""" |
| 48 | 60 |
| 49 def __init__(self, api, bisect_config, revision_class, init_revisions=True): | 61 def __init__(self, api, bisect_config, revision_class, init_revisions=True): |
| 50 """Initializes the state of a new bisect job from a dictionary. | 62 """Initializes the state of a new bisect job from a dictionary. |
| 51 | 63 |
| 52 Note that the initial good_rev and bad_rev MUST resolve to a commit position | 64 Note that the initial good_rev and bad_rev MUST resolve to a commit position |
| 53 in the chromium repo. | 65 in the chromium repo. |
| 54 """ | 66 """ |
| 55 super(Bisector, self).__init__() | 67 super(Bisector, self).__init__() |
| 56 self._api = api | 68 self._api = api |
| 57 self.ensure_sync_master_branch() | 69 self.ensure_sync_master_branch() |
| 58 self.bisect_config = bisect_config | 70 self.bisect_config = bisect_config |
| 59 self.config_step() | 71 self.config_step() |
| 60 self.revision_class = revision_class | 72 self.revision_class = revision_class |
| 61 self.result_codes = set() | 73 self.result_codes = set() |
| 74 self.last_tested_revision = None |
| 62 | 75 |
| 63 # Test-only properties. | 76 # Test-only properties. |
| 64 # TODO: Replace these with proper mod_test_data. | 77 # TODO: Replace these with proper mod_test_data. |
| 65 self.dummy_initial_confidence = bisect_config.get( | 78 self.dummy_initial_confidence = bisect_config.get( |
| 66 'dummy_initial_confidence') | 79 'dummy_initial_confidence') |
| 67 self.dummy_builds = bisect_config.get('dummy_builds', False) | 80 self.dummy_builds = bisect_config.get('dummy_builds', False) |
| 68 | 81 |
| 69 # Load configuration items. | 82 # Load configuration items. |
| 70 self.test_type = bisect_config.get('test_type', 'perf') | 83 self.test_type = bisect_config.get('test_type', 'perf') |
| 71 self.improvement_direction = int(bisect_config.get( | 84 self.improvement_direction = int(bisect_config.get( |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 107 self.good_rev.good = True | 120 self.good_rev.good = True |
| 108 self.good_rev.read_deps(self.get_perf_tester_name()) | 121 self.good_rev.read_deps(self.get_perf_tester_name()) |
| 109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ | 122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ |
| 110 '%s: %s' % (key, value) for key, value in | 123 '%s: %s' % (key, value) for key, value in |
| 111 self.good_rev.deps.iteritems()] | 124 self.good_rev.deps.iteritems()] |
| 112 self.good_rev.deps = {} | 125 self.good_rev.deps = {} |
| 113 self.lkgr = self.good_rev | 126 self.lkgr = self.good_rev |
| 114 if init_revisions: | 127 if init_revisions: |
| 115 self._expand_chromium_revision_range() | 128 self._expand_chromium_revision_range() |
| 116 | 129 |
| 130 def significantly_different( |
| 131 self, list_a, list_b, |
| 132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover |
| 133 """Uses an external script to run hypothesis testing with scipy. |
| 134 |
| 135 We need an external script because scipy is not available to the default |
| 136 Python installation on every platform. We instead rely on an Anaconda |
| 137 environment to provide those packages. |
| 138 |
| 139 Args: |
| 140 list_a, list_b: Two lists representing samples to be compared. |
| 141 significance_level: Significance level for the test, as a decimal fraction. |
| 142 |
| 143 Returns: |
| 144 A boolean indicating whether the null hypothesis (that the lists are |
| 145 samples from the same population) can be rejected at the specified |
| 146 significance level. |
| 147 """ |
| 148 step_result = self.api.m.python( |
| 149 'Checking sample difference', |
| 150 self.api.resource('significantly_different.py'), |
| 151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)], |
| 152 stdout=self.api.m.json.output()) |
| 153 results = step_result.stdout |
| 154 if results is None: |
| 155 assert self.dummy_builds |
| 156 return True |
| 157 significantly_different = results['significantly_different'] |
| 158 step_result.presentation.logs[str(significantly_different)] = [ |
| 159 'See json.output for details'] |
| 160 return significantly_different |
| 161 |
| 117 def config_step(self): | 162 def config_step(self): |
| 118 """Yields a simple echo step that outputs the bisect config.""" | 163 """Yields a simple echo step that outputs the bisect config.""" |
| 119 api = self.api | 164 api = self.api |
| 120 # bisect_config may come as a FrozenDict (which is not serializable). | 165 # bisect_config may come as a FrozenDict (which is not serializable). |
| 121 bisect_config = dict(self.bisect_config) | 166 bisect_config = dict(self.bisect_config) |
| 122 | 167 |
| 123 def fix_windows_backslashes(s): | 168 def fix_windows_backslashes(s): |
| 124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') | 169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') |
| 125 return backslash_regex.sub(r'\\', s) | 170 return backslash_regex.sub(r'\\', s) |
| 126 | 171 |
| (...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 405 self.failed_direction = True | 450 self.failed_direction = True |
| 406 self.warnings.append('The initial regression range for return code ' | 451 self.warnings.append('The initial regression range for return code ' |
| 407 'appears to show NO sign of a regression.') | 452 'appears to show NO sign of a regression.') |
| 408 | 453 |
| 409 def _set_failed_direction_results(self): # pragma: no cover | 454 def _set_failed_direction_results(self): # pragma: no cover |
| 410 self.failed_direction = True | 455 self.failed_direction = True |
| 411 self.warnings.append('The initial regression range appears to represent ' | 456 self.warnings.append('The initial regression range appears to represent ' |
| 412 'an improvement rather than a regression, given the ' | 457 'an improvement rather than a regression, given the ' |
| 413 'expected direction of improvement.') | 458 'expected direction of improvement.') |
| 414 | 459 |
| 415 def check_initial_confidence(self): | 460 def check_initial_confidence(self): # pragma: no cover |
| 416 """Checks that the initial range presents a clear enough regression. | 461 """Checks that the initial range presents a clear enough regression. |
| 417 | 462 |
| 418 We calculate the confidence of the results of the given 'good' | 463 We calculate the confidence of the results of the given 'good' |
| 419 and 'bad' revisions and compare it against the required confidence | 464 and 'bad' revisions and compare it against the required confidence |
| 420 set for the bisector. | 465 set for the bisector. |
| 421 | 466 |
| 422 Note that when a dummy regression confidence value has been set, that | 467 Note that when a dummy regression confidence value has been set, that |
| 423 is used instead. | 468 is used instead. |
| 424 """ | 469 """ |
| 425 if self.test_type != 'perf': | 470 if self.test_type != 'perf': |
| 426 return True | 471 return True |
| 427 | 472 |
| 428 if self.required_initial_confidence is None: | 473 if self.required_initial_confidence is None: |
| 429 return True # pragma: no cover | 474 return True # pragma: no cover |
| 430 | 475 |
| 476 # TODO(robertocn): Remove all uses of "confidence". |
| 431 if self.dummy_initial_confidence is not None: | 477 if self.dummy_initial_confidence is not None: |
| 432 self.initial_confidence = float( | 478 self.initial_confidence = float( |
| 433 self.dummy_initial_confidence) | 479 self.dummy_initial_confidence) |
| 480 if (float(self.initial_confidence) < |
| 481 float(self.required_initial_confidence)): |
| 482 self._set_insufficient_confidence_warning() |
| 483 return False |
| 484 return True |
| 434 | 485 |
| 435 else: # pragma: no cover | 486 if self.dummy_builds: |
| 436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: | 487 dummy_result = self.good_rev.values != self.bad_rev.values |
| 437 # If there are too few values, the confidence score is not a good way to | 488 if not dummy_result: |
| 438 # determine whether the regression is reproducible. | 489 self._set_insufficient_confidence_warning() |
| 439 # TODO(robertocn): Investigate a straightforward approach to deal with | 490 return dummy_result |
| 440 # these cases. Such as the mean of one group lying within the range of | 491 |
| 441 # the other. | 492 with self.api.m.step.nest('Re-testing reference range'): |
| 442 return True | 493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| 443 self.initial_confidence = ( | 494 while time.time() < expiration_time: |
| 444 self.api.m.math_utils.confidence_score( | 495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| 445 self.good_rev.values, | 496 if self.significantly_different(self.good_rev.values, |
| 446 self.bad_rev.values)) | 497 self.bad_rev.values): |
| 447 if (self.initial_confidence < | 498 return True |
| 448 self.required_initial_confidence): # pragma: no cover | 499 if len(self.good_rev.values) == len(self.bad_rev.values): |
| 449 self._set_insufficient_confidence_warning(self.initial_confidence) | 500 revision_to_retest = self.last_tested_revision |
| 501 else: |
| 502 revision_to_retest = min(self.good_rev, self.bad_rev, |
| 503 key=lambda x: len(x.values)) |
| 504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES: |
| 505 revision_to_retest.retest() |
| 506 else: |
| 507 break |
| 508 self._set_insufficient_confidence_warning() |
| 450 return False | 509 return False |
| 451 return True | 510 |
| 452 | 511 |
| 453 def get_exception(self): | 512 def get_exception(self): |
| 454 raise NotImplementedError() # pragma: no cover | 513 raise NotImplementedError() # pragma: no cover |
| 455 # TODO: should return an exception with the details of the failure. | 514 # TODO: should return an exception with the details of the failure. |
| 456 | 515 |
| 457 def _set_insufficient_confidence_warning( | 516 def _set_insufficient_confidence_warning( |
| 458 self, actual_confidence): # pragma: no cover | 517 self): # pragma: no cover |
| 459 """Adds a warning about the lack of initial regression confidence.""" | 518 """Adds a warning about the lack of initial regression confidence.""" |
| 460 self.failed_initial_confidence = True | 519 self.failed_initial_confidence = True |
| 461 self.surface_result('LO_INIT_CONF') | 520 self.surface_result('LO_INIT_CONF') |
| 462 self.warnings.append( | 521 self.warnings.append( |
| 463 ('Bisect failed to reproduce the regression with enough confidence. ' | 522 'Bisect failed to reproduce the regression with enough confidence.') |
| 464 'Needed {:.2f}%, got {:.2f}%.').format( | |
| 465 self.required_initial_confidence, actual_confidence)) | |
| 466 | 523 |
| 467 def _results_debug_message(self): | 524 def _results_debug_message(self): |
| 468 """Returns a string with values used to debug a bisect result.""" | 525 """Returns a string with values used to debug a bisect result.""" |
| 469 result = 'bisector.lkgr: %r\n' % self.lkgr | 526 result = 'bisector.lkgr: %r\n' % self.lkgr |
| 470 result += 'bisector.fkbr: %r\n\n' % self.fkbr | 527 result += 'bisector.fkbr: %r\n\n' % self.fkbr |
| 471 result += self._revision_value_table() | 528 result += self._revision_value_table() |
| 472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): | 529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): |
| 473 result += '\n' + self._t_test_results() | 530 result += '\n' + self._t_test_results() |
| 474 return result | 531 return result |
| 475 | 532 |
| (...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 816 | 873 |
| 817 def surface_result(self, result_string): | 874 def surface_result(self, result_string): |
| 818 assert result_string in VALID_RESULT_CODES | 875 assert result_string in VALID_RESULT_CODES |
| 819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). | 876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). |
| 820 result_code = prefix + result_string | 877 result_code = prefix + result_string |
| 821 assert len(result_code) <= 20 | 878 assert len(result_code) <= 20 |
| 822 if result_code not in self.result_codes: | 879 if result_code not in self.result_codes: |
| 823 self.result_codes.add(result_code) | 880 self.result_codes.add(result_code) |
| 824 properties = self.api.m.step.active_result.presentation.properties | 881 properties = self.api.m.step.active_result.presentation.properties |
| 825 properties['extra_result_code'] = sorted(self.result_codes) | 882 properties['extra_result_code'] = sorted(self.result_codes) |
| OLD | NEW |