| OLD | NEW |
| --- | --- |
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import re | 6 import re |
| 7 import time | |
| 8 | 7 |
| 9 from . import bisect_results | 8 from . import bisect_results |
| 10 from . import depot_config | 9 from . import depot_config |
| 11 from . import revision_state | 10 from . import revision_state |
| 12 | 11 |
| 13 _DEPS_SHA_PATCH = """ | 12 _DEPS_SHA_PATCH = """ |
| 14 diff --git DEPS.sha DEPS.sha | 13 diff --git DEPS.sha DEPS.sha |
| 15 new file mode 100644 | 14 new file mode 100644 |
| 16 --- /dev/null | 15 --- /dev/null |
| 17 +++ DEPS.sha | 16 +++ DEPS.sha |
| (...skipping 18 matching lines: OLD 18-35...) | (...skipping 18 matching lines: NEW 17-34...) |
| 36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. | 35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. |
| 37 'MISSING_METRIC', # The metric was not found in the test text/json output. | 36 'MISSING_METRIC', # The metric was not found in the test text/json output. |
| 38 'LO_FINAL_CONF', # The bisect completed without a culprit. | 37 'LO_FINAL_CONF', # The bisect completed without a culprit. |
| 39 ) | 38 ) |
| 40 | 39 |
| 41 # When we look for the next revision to build, we search nearby revisions | 40 # When we look for the next revision to build, we search nearby revisions |
| 42 # looking for a revision that's already been archived. Since we don't want | 41 # looking for a revision that's already been archived. Since we don't want |
| 43 # to move *too* far from the original revision, we'll cap the search at 25%. | 42 # to move *too* far from the original revision, we'll cap the search at 25%. |
| 44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 | 43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| 45 | 44 |
| 46 # How long to re-test the initial good-bad range for until significant | |
| 47 # difference is established. | |
| 48 REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60 | |
| 49 # If we reach this number of samples on the reference range and have not | |
| 50 # achieved statistical significance, bail. | |
| 51 MAX_REQUIRED_SAMPLES = 50 | |
| 52 | |
| 53 # Significance level to use for determining difference between revisions via | |
| 54 # hypothesis testing. | |
| 55 SIGNIFICANCE_LEVEL = 0.01 | |
| 56 | |
| 57 | 45 |
| 58 class Bisector(object): | 46 class Bisector(object): |
| 59 """This class abstracts an ongoing bisect (or n-sect) job.""" | 47 """This class abstracts an ongoing bisect (or n-sect) job.""" |
| 60 | 48 |
| 61 def __init__(self, api, bisect_config, revision_class, init_revisions=True): | 49 def __init__(self, api, bisect_config, revision_class, init_revisions=True): |
| 62 """Initializes the state of a new bisect job from a dictionary. | 50 """Initializes the state of a new bisect job from a dictionary. |
| 63 | 51 |
| 64 Note that the initial good_rev and bad_rev MUST resolve to a commit position | 52 Note that the initial good_rev and bad_rev MUST resolve to a commit position |
| 65 in the chromium repo. | 53 in the chromium repo. |
| 66 """ | 54 """ |
| 67 super(Bisector, self).__init__() | 55 super(Bisector, self).__init__() |
| 68 self._api = api | 56 self._api = api |
| 69 self.ensure_sync_master_branch() | 57 self.ensure_sync_master_branch() |
| 70 self.bisect_config = bisect_config | 58 self.bisect_config = bisect_config |
| 71 self.config_step() | 59 self.config_step() |
| 72 self.revision_class = revision_class | 60 self.revision_class = revision_class |
| 73 self.result_codes = set() | 61 self.result_codes = set() |
| 74 self.last_tested_revision = None | |
| 75 | 62 |
| 76 # Test-only properties. | 63 # Test-only properties. |
| 77 # TODO: Replace these with proper mod_test_data. | 64 # TODO: Replace these with proper mod_test_data. |
| 78 self.dummy_initial_confidence = bisect_config.get( | 65 self.dummy_initial_confidence = bisect_config.get( |
| 79 'dummy_initial_confidence') | 66 'dummy_initial_confidence') |
| 80 self.dummy_builds = bisect_config.get('dummy_builds', False) | 67 self.dummy_builds = bisect_config.get('dummy_builds', False) |
| 81 | 68 |
| 82 # Load configuration items. | 69 # Load configuration items. |
| 83 self.test_type = bisect_config.get('test_type', 'perf') | 70 self.test_type = bisect_config.get('test_type', 'perf') |
| 84 self.improvement_direction = int(bisect_config.get( | 71 self.improvement_direction = int(bisect_config.get( |
| (...skipping 35 matching lines: OLD 85-119...) | (...skipping 35 matching lines: NEW 72-106...) |
| 120 self.good_rev.good = True | 107 self.good_rev.good = True |
| 121 self.good_rev.read_deps(self.get_perf_tester_name()) | 108 self.good_rev.read_deps(self.get_perf_tester_name()) |
| 122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ | 109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ |
| 123 '%s: %s' % (key, value) for key, value in | 110 '%s: %s' % (key, value) for key, value in |
| 124 self.good_rev.deps.iteritems()] | 111 self.good_rev.deps.iteritems()] |
| 125 self.good_rev.deps = {} | 112 self.good_rev.deps = {} |
| 126 self.lkgr = self.good_rev | 113 self.lkgr = self.good_rev |
| 127 if init_revisions: | 114 if init_revisions: |
| 128 self._expand_chromium_revision_range() | 115 self._expand_chromium_revision_range() |
| 129 | 116 |
| 130 def significantly_different( | |
| 131 self, list_a, list_b, | |
| 132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover | |
| 133 """Uses an external script to run hypothesis testing with scipy. | |
| 134 | |
| 135 The reason why we need an external script is that scipy is not available to | |
| 136 the default python installed in all platforms. We instead rely on an | |
| 137 anaconda environment to provide those packages. | |
| 138 | |
| 139 Args: | |
| 140 list_a, list_b: Two lists representing samples to be compared. | |
| 141 significance_level: Self-describing. As a decimal fraction. | |
| 142 | |
| 143 Returns: | |
| 144 A boolean indicating whether the null hypothesis ~(that the lists are | |
| 145 samples from the same population) can be rejected at the specified | |
| 146 significance level. | |
| 147 """ | |
| 148 step_result = self.api.m.python( | |
| 149 'Checking sample difference', | |
| 150 self.api.resource('significantly_different.py'), | |
| 151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)], | |
| 152 stdout=self.api.m.json.output()) | |
| 153 results = step_result.stdout | |
| 154 if results is None: | |
| 155 assert self.dummy_builds | |
| 156 return True | |
| 157 significantly_different = results['significantly_different'] | |
| 158 step_result.presentation.logs[str(significantly_different)] = [ | |
| 159 'See json.output for details'] | |
| 160 return significantly_different | |
| 161 | |
| 162 def config_step(self): | 117 def config_step(self): |
| 163 """Yields a simple echo step that outputs the bisect config.""" | 118 """Yields a simple echo step that outputs the bisect config.""" |
| 164 api = self.api | 119 api = self.api |
| 165 # bisect_config may come as a FrozenDict (which is not serializable). | 120 # bisect_config may come as a FrozenDict (which is not serializable). |
| 166 bisect_config = dict(self.bisect_config) | 121 bisect_config = dict(self.bisect_config) |
| 167 | 122 |
| 168 def fix_windows_backslashes(s): | 123 def fix_windows_backslashes(s): |
| 169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') | 124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') |
| 170 return backslash_regex.sub(r'\\', s) | 125 return backslash_regex.sub(r'\\', s) |
| 171 | 126 |
| (...skipping 278 matching lines: OLD 172-449...) | (...skipping 278 matching lines: NEW 127-404...) |
| 450 self.failed_direction = True | 405 self.failed_direction = True |
| 451 self.warnings.append('The initial regression range for return code ' | 406 self.warnings.append('The initial regression range for return code ' |
| 452 'appears to show NO sign of a regression.') | 407 'appears to show NO sign of a regression.') |
| 453 | 408 |
| 454 def _set_failed_direction_results(self): # pragma: no cover | 409 def _set_failed_direction_results(self): # pragma: no cover |
| 455 self.failed_direction = True | 410 self.failed_direction = True |
| 456 self.warnings.append('The initial regression range appears to represent ' | 411 self.warnings.append('The initial regression range appears to represent ' |
| 457 'an improvement rather than a regression, given the ' | 412 'an improvement rather than a regression, given the ' |
| 458 'expected direction of improvement.') | 413 'expected direction of improvement.') |
| 459 | 414 |
| 460 def check_initial_confidence(self): # pragma: no cover | 415 def check_initial_confidence(self): |
| 461 """Checks that the initial range presents a clear enough regression. | 416 """Checks that the initial range presents a clear enough regression. |
| 462 | 417 |
| 463 We calculate the confidence of the results of the given 'good' | 418 We calculate the confidence of the results of the given 'good' |
| 464 and 'bad' revisions and compare it against the required confidence | 419 and 'bad' revisions and compare it against the required confidence |
| 465 set for the bisector. | 420 set for the bisector. |
| 466 | 421 |
| 467 Note that when a dummy regression confidence value has been set, that | 422 Note that when a dummy regression confidence value has been set, that |
| 468 is used instead. | 423 is used instead. |
| 469 """ | 424 """ |
| 470 if self.test_type != 'perf': | 425 if self.test_type != 'perf': |
| 471 return True | 426 return True |
| 472 | 427 |
| 473 if self.required_initial_confidence is None: | 428 if self.required_initial_confidence is None: |
| 474 return True # pragma: no cover | 429 return True # pragma: no cover |
| 475 | 430 |
| 476 # TODO(robertocn): Remove all uses of "confidence". | |
| 477 if self.dummy_initial_confidence is not None: | 431 if self.dummy_initial_confidence is not None: |
| 478 self.initial_confidence = float( | 432 self.initial_confidence = float( |
| 479 self.dummy_initial_confidence) | 433 self.dummy_initial_confidence) |
| 480 if (float(self.initial_confidence) < | |
| 481 float(self.required_initial_confidence)): | |
| 482 self._set_insufficient_confidence_warning() | |
| 483 return False | |
| 484 return True | |
| 485 | 434 |
| 486 if self.dummy_builds: | 435 else: # pragma: no cover |
| 487 dummy_result = self.good_rev.values != self.bad_rev.values | 436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: |
| 488 if not dummy_result: | 437 # If there are too few values, the confidence score is not a good way to |
| 489 self._set_insufficient_confidence_warning() | 438 # determine whether the regression is reproducible. |
| 490 return dummy_result | 439 # TODO(robertocn): Investigate a straightforward approach to deal with |
| 491 | 440 # these cases. Such as the mean of one group lying within the range of |
| 492 with self.api.m.step.nest('Re-testing reference range'): | 441 # the other. |
| 493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT | 442 return True |
| 494 while time.time() < expiration_time: | 443 self.initial_confidence = ( |
| 495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: | 444 self.api.m.math_utils.confidence_score( |
| 496 if self.significantly_different(self.good_rev.values, | 445 self.good_rev.values, |
| 497 self.bad_rev.values): | 446 self.bad_rev.values)) |
| 498 return True | 447 if (self.initial_confidence < |
| 499 if len(self.good_rev.values) == len(self.bad_rev.values): | 448 self.required_initial_confidence): # pragma: no cover |
| 500 revision_to_retest = self.last_tested_revision | 449 self._set_insufficient_confidence_warning(self.initial_confidence) |
| 501 else: | |
| 502 revision_to_retest = min(self.good_rev, self.bad_rev, | |
| 503 key=lambda x: len(x.values)) | |
| 504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES: | |
| 505 revision_to_retest.retest() | |
| 506 else: | |
| 507 break | |
| 508 self._set_insufficient_confidence_warning() | |
| 509 return False | 450 return False |
| 510 | 451 return True |
| 511 | 452 |
| 512 def get_exception(self): | 453 def get_exception(self): |
| 513 raise NotImplementedError() # pragma: no cover | 454 raise NotImplementedError() # pragma: no cover |
| 514 # TODO: should return an exception with the details of the failure. | 455 # TODO: should return an exception with the details of the failure. |
| 515 | 456 |
| 516 def _set_insufficient_confidence_warning( | 457 def _set_insufficient_confidence_warning( |
| 517 self): # pragma: no cover | 458 self, actual_confidence): # pragma: no cover |
| 518 """Adds a warning about the lack of initial regression confidence.""" | 459 """Adds a warning about the lack of initial regression confidence.""" |
| 519 self.failed_initial_confidence = True | 460 self.failed_initial_confidence = True |
| 520 self.surface_result('LO_INIT_CONF') | 461 self.surface_result('LO_INIT_CONF') |
| 521 self.warnings.append( | 462 self.warnings.append( |
| 522 'Bisect failed to reproduce the regression with enough confidence.') | 463 ('Bisect failed to reproduce the regression with enough confidence. ' |
| 464 'Needed {:.2f}%, got {:.2f}%.').format( |
| 465 self.required_initial_confidence, actual_confidence)) |
| 523 | 466 |
| 524 def _results_debug_message(self): | 467 def _results_debug_message(self): |
| 525 """Returns a string with values used to debug a bisect result.""" | 468 """Returns a string with values used to debug a bisect result.""" |
| 526 result = 'bisector.lkgr: %r\n' % self.lkgr | 469 result = 'bisector.lkgr: %r\n' % self.lkgr |
| 527 result += 'bisector.fkbr: %r\n\n' % self.fkbr | 470 result += 'bisector.fkbr: %r\n\n' % self.fkbr |
| 528 result += self._revision_value_table() | 471 result += self._revision_value_table() |
| 529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): | 472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): |
| 530 result += '\n' + self._t_test_results() | 473 result += '\n' + self._t_test_results() |
| 531 return result | 474 return result |
| 532 | 475 |
| (...skipping 340 matching lines: OLD 533-872...) | (...skipping 340 matching lines: NEW 476-815...) |
| 873 | 816 |
| 874 def surface_result(self, result_string): | 817 def surface_result(self, result_string): |
| 875 assert result_string in VALID_RESULT_CODES | 818 assert result_string in VALID_RESULT_CODES |
| 876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). | 819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). |
| 877 result_code = prefix + result_string | 820 result_code = prefix + result_string |
| 878 assert len(result_code) <= 20 | 821 assert len(result_code) <= 20 |
| 879 if result_code not in self.result_codes: | 822 if result_code not in self.result_codes: |
| 880 self.result_codes.add(result_code) | 823 self.result_codes.add(result_code) |
| 881 properties = self.api.m.step.active_result.presentation.properties | 824 properties = self.api.m.step.active_result.presentation.properties |
| 882 properties['extra_result_code'] = sorted(self.result_codes) | 825 properties['extra_result_code'] = sorted(self.result_codes) |
| OLD | NEW |