Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import json | 5 import json |
| 6 import re | 6 import re |
| 7 import time | |
| 7 | 8 |
| 8 from . import bisect_results | 9 from . import bisect_results |
| 9 from . import depot_config | 10 from . import depot_config |
| 10 from . import revision_state | 11 from . import revision_state |
| 11 | 12 |
| 12 _DEPS_SHA_PATCH = """ | 13 _DEPS_SHA_PATCH = """ |
| 13 diff --git DEPS.sha DEPS.sha | 14 diff --git DEPS.sha DEPS.sha |
| 14 new file mode 100644 | 15 new file mode 100644 |
| 15 --- /dev/null | 16 --- /dev/null |
| 16 +++ DEPS.sha | 17 +++ DEPS.sha |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. | 36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. |
| 36 'MISSING_METRIC', # The metric was not found in the test text/json output. | 37 'MISSING_METRIC', # The metric was not found in the test text/json output. |
| 37 'LO_FINAL_CONF', # The bisect completed without a culprit. | 38 'LO_FINAL_CONF', # The bisect completed without a culprit. |
| 38 ) | 39 ) |
| 39 | 40 |
| 40 # When we look for the next revision to build, we search nearby revisions | 41 # When we look for the next revision to build, we search nearby revisions |
| 41 # looking for a revision that's already been archived. Since we don't want | 42 # looking for a revision that's already been archived. Since we don't want |
| 42 # to move *too* far from the original revision, we'll cap the search at 25%. | 43 # to move *too* far from the original revision, we'll cap the search at 25%. |
| 43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 | 44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| 44 | 45 |
| 46 # How long to re-test the initial good-bad range for until significant | |
| 47 # difference is established. | |
| 48 REGRESSION_CHECK_TIMEOUT = 6 * 60 * 60 | |
|
dtu
2016/02/05 00:18:01
Is that six hours? Do we really have that much time?
| |
| 49 | |
| 50 # Significance level to use for determining difference between revisions via | |
| 51 # hypothesis testing. | |
| 52 SIGNIFICANCE_LEVEL = 0.05 | |
| 53 | |
| 45 | 54 |
| 46 class Bisector(object): | 55 class Bisector(object): |
| 47 """This class abstracts an ongoing bisect (or n-sect) job.""" | 56 """This class abstracts an ongoing bisect (or n-sect) job.""" |
| 48 | 57 |
| 49 def __init__(self, api, bisect_config, revision_class, init_revisions=True): | 58 def __init__(self, api, bisect_config, revision_class, init_revisions=True): |
| 50 """Initializes the state of a new bisect job from a dictionary. | 59 """Initializes the state of a new bisect job from a dictionary. |
| 51 | 60 |
| 52 Note that the initial good_rev and bad_rev MUST resolve to a commit position | 61 Note that the initial good_rev and bad_rev MUST resolve to a commit position |
| 53 in the chromium repo. | 62 in the chromium repo. |
| 54 """ | 63 """ |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 70 self.test_type = bisect_config.get('test_type', 'perf') | 79 self.test_type = bisect_config.get('test_type', 'perf') |
| 71 self.improvement_direction = int(bisect_config.get( | 80 self.improvement_direction = int(bisect_config.get( |
| 72 'improvement_direction', 0)) or None | 81 'improvement_direction', 0)) or None |
| 73 # Required initial confidence might be explicitly set to None, | 82 # Required initial confidence might be explicitly set to None, |
| 74 # which means that no initial confidence check should be done. | 83 # which means that no initial confidence check should be done. |
| 75 if ('required_initial_confidence' in bisect_config and | 84 if ('required_initial_confidence' in bisect_config and |
| 76 bisect_config['required_initial_confidence'] is None): | 85 bisect_config['required_initial_confidence'] is None): |
| 77 self.required_initial_confidence = None # pragma: no cover | 86 self.required_initial_confidence = None # pragma: no cover |
| 78 else: | 87 else: |
| 79 self.required_initial_confidence = float(bisect_config.get( | 88 self.required_initial_confidence = float(bisect_config.get( |
| 80 'required_initial_confidence', 80)) | 89 'required_initial_confidence', 80)) |
|
dtu
2016/02/05 00:18:02
Should we rethink what this value is? Also make it
| |
| 81 | 90 |
| 82 self.warnings = [] | 91 self.warnings = [] |
| 83 | 92 |
| 84 # Status flags | 93 # Status flags |
| 85 self.initial_confidence = None | 94 self.initial_confidence = None |
| 86 self.failed_initial_confidence = False | 95 self.failed_initial_confidence = False |
| 87 self.failed = False | 96 self.failed = False |
| 88 self.failed_direction = False | 97 self.failed_direction = False |
| 89 self.lkgr = None # Last known good revision | 98 self.lkgr = None # Last known good revision |
| 90 self.fkbr = None # First known bad revision | 99 self.fkbr = None # First known bad revision |
| (...skipping 16 matching lines...) Expand all Loading... | |
| 107 self.good_rev.good = True | 116 self.good_rev.good = True |
| 108 self.good_rev.read_deps(self.get_perf_tester_name()) | 117 self.good_rev.read_deps(self.get_perf_tester_name()) |
| 109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ | 118 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ |
| 110 '%s: %s' % (key, value) for key, value in | 119 '%s: %s' % (key, value) for key, value in |
| 111 self.good_rev.deps.iteritems()] | 120 self.good_rev.deps.iteritems()] |
| 112 self.good_rev.deps = {} | 121 self.good_rev.deps = {} |
| 113 self.lkgr = self.good_rev | 122 self.lkgr = self.good_rev |
| 114 if init_revisions: | 123 if init_revisions: |
| 115 self._expand_chromium_revision_range() | 124 self._expand_chromium_revision_range() |
| 116 | 125 |
| 126 def significantly_different( | |
| 127 self, list_a, list_b, | |
| 128 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover | |
| 129 """Uses an external script to run hypothesis testing with scipy. | |
| 130 | |
| 131 The reason why we need an external script is that scipy is not available to | |
| 132 the default python installed in all platforms. We instead rely on an | |
| 133 anaconda environment to provide those packages. | |
| 134 | |
| 135 Args: | |
| 136 list_a, list_b: Two lists representing samples to be compared. | |
| 137 significance_level: Self-describing. As a decimal fraction. | |
| 138 | |
| 139 Returns: | |
| 140 A boolean indicating whether the null hypothesis ~(that the lists are | |
| 141 samples from the same population) can be rejected at the specified | |
| 142 significance level. | |
| 143 """ | |
| 144 step_result = self.api.m.python( | |
| 145 'Checking sample difference', | |
| 146 self.api.resource('significantly_different.py'), | |
| 147 [json.dumps(list_a), json.dumps(list_b), str(significance_level)], | |
| 148 stdout=self.api.m.json.output()) | |
| 149 results = step_result.stdout | |
| 150 if results is None: | |
| 151 assert self.dummy_builds | |
| 152 return True | |
| 153 significantly_different = results['significantly_different'] | |
| 154 step_result.presentation.logs[str(significantly_different)] = [ | |
| 155 'See json.output for details'] | |
| 156 return significantly_different | |
| 157 | |
| 117 def config_step(self): | 158 def config_step(self): |
| 118 """Yields a simple echo step that outputs the bisect config.""" | 159 """Yields a simple echo step that outputs the bisect config.""" |
| 119 api = self.api | 160 api = self.api |
| 120 # bisect_config may come as a FrozenDict (which is not serializable). | 161 # bisect_config may come as a FrozenDict (which is not serializable). |
| 121 bisect_config = dict(self.bisect_config) | 162 bisect_config = dict(self.bisect_config) |
| 122 | 163 |
| 123 def fix_windows_backslashes(s): | 164 def fix_windows_backslashes(s): |
| 124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') | 165 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') |
| 125 return backslash_regex.sub(r'\\', s) | 166 return backslash_regex.sub(r'\\', s) |
| 126 | 167 |
| (...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 405 self.failed_direction = True | 446 self.failed_direction = True |
| 406 self.warnings.append('The initial regression range for return code ' | 447 self.warnings.append('The initial regression range for return code ' |
| 407 'appears to show NO sign of a regression.') | 448 'appears to show NO sign of a regression.') |
| 408 | 449 |
| 409 def _set_failed_direction_results(self): # pragma: no cover | 450 def _set_failed_direction_results(self): # pragma: no cover |
| 410 self.failed_direction = True | 451 self.failed_direction = True |
| 411 self.warnings.append('The initial regression range appears to represent ' | 452 self.warnings.append('The initial regression range appears to represent ' |
| 412 'an improvement rather than a regression, given the ' | 453 'an improvement rather than a regression, given the ' |
| 413 'expected direction of improvement.') | 454 'expected direction of improvement.') |
| 414 | 455 |
| 415 def check_initial_confidence(self): | 456 def check_initial_confidence(self): # pragma: no cover |
| 416 """Checks that the initial range presents a clear enough regression. | 457 """Checks that the initial range presents a clear enough regression. |
| 417 | 458 |
| 418 We calculate the confidence of the results of the given 'good' | 459 We calculate the confidence of the results of the given 'good' |
| 419 and 'bad' revisions and compare it against the required confidence | 460 and 'bad' revisions and compare it against the required confidence |
| 420 set for the bisector. | 461 set for the bisector. |
| 421 | 462 |
| 422 Note that when a dummy regression confidence value has been set, that | 463 Note that when a dummy regression confidence value has been set, that |
| 423 is used instead. | 464 is used instead. |
| 424 """ | 465 """ |
| 425 if self.test_type != 'perf': | 466 if self.test_type != 'perf': |
| 426 return True | 467 return True |
| 427 | 468 |
| 428 if self.required_initial_confidence is None: | 469 if self.required_initial_confidence is None: |
| 429 return True # pragma: no cover | 470 return True # pragma: no cover |
| 430 | 471 |
| 472 # TODO(robertocn): Remove all uses of "confidence". | |
| 431 if self.dummy_initial_confidence is not None: | 473 if self.dummy_initial_confidence is not None: |
| 432 self.initial_confidence = float( | 474 self.initial_confidence = float( |
| 433 self.dummy_initial_confidence) | 475 self.dummy_initial_confidence) |
| 476 if (float(self.initial_confidence) < | |
| 477 float(self.required_initial_confidence)): | |
| 478 self._set_insufficient_confidence_warning() | |
| 479 return False | |
| 480 return True | |
| 434 | 481 |
| 435 else: # pragma: no cover | 482 if self.dummy_builds: |
| 436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: | 483 dummy_result = self.good_rev.values != self.bad_rev.values |
| 437 # If there are too few values, the confidence score is not a good way to | 484 if not dummy_result: |
| 438 # determine whether the regression is reproducible. | 485 self._set_insufficient_confidence_warning() |
| 439 # TODO(robertocn): Investigate a straightforward approach to deal with | 486 return dummy_result |
|
dtu
2016/02/05 00:18:01
Seems like there's a bunch of technical debt related [comment truncated]
| |
| 440 # these cases. Such as the mean of one group lying within the range of | 487 |
| 441 # the other. | 488 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| 442 return True | 489 while time.time() < expiration_time: |
| 443 self.initial_confidence = ( | 490 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| 444 self.api.m.math_utils.confidence_score( | 491 if self.significantly_different(self.good_rev.values, |
| 445 self.good_rev.values, | 492 self.bad_rev.values): |
| 446 self.bad_rev.values)) | 493 return True |
| 447 if (self.initial_confidence < | 494 min(self.good_rev, self.bad_rev, key=lambda x: len(x.values)).retest() |
| 448 self.required_initial_confidence): # pragma: no cover | 495 self._set_insufficient_confidence_warning() |
| 449 self._set_insufficient_confidence_warning(self.initial_confidence) | 496 return False |
| 450 return False | 497 |
| 451 return True | |
| 452 | 498 |
| 453 def get_exception(self): | 499 def get_exception(self): |
| 454 raise NotImplementedError() # pragma: no cover | 500 raise NotImplementedError() # pragma: no cover |
| 455 # TODO: should return an exception with the details of the failure. | 501 # TODO: should return an exception with the details of the failure. |
| 456 | 502 |
| 457 def _set_insufficient_confidence_warning( | 503 def _set_insufficient_confidence_warning( |
| 458 self, actual_confidence): # pragma: no cover | 504 self): # pragma: no cover |
| 459 """Adds a warning about the lack of initial regression confidence.""" | 505 """Adds a warning about the lack of initial regression confidence.""" |
| 460 self.failed_initial_confidence = True | 506 self.failed_initial_confidence = True |
| 461 self.surface_result('LO_INIT_CONF') | 507 self.surface_result('LO_INIT_CONF') |
| 462 self.warnings.append( | 508 self.warnings.append( |
| 463 ('Bisect failed to reproduce the regression with enough confidence. ' | 509 'Bisect failed to reproduce the regression with enough confidence.') |
| 464 'Needed {:.2f}%, got {:.2f}%.').format( | |
| 465 self.required_initial_confidence, actual_confidence)) | |
| 466 | 510 |
| 467 def _results_debug_message(self): | 511 def _results_debug_message(self): |
| 468 """Returns a string with values used to debug a bisect result.""" | 512 """Returns a string with values used to debug a bisect result.""" |
| 469 result = 'bisector.lkgr: %r\n' % self.lkgr | 513 result = 'bisector.lkgr: %r\n' % self.lkgr |
| 470 result += 'bisector.fkbr: %r\n\n' % self.fkbr | 514 result += 'bisector.fkbr: %r\n\n' % self.fkbr |
| 471 result += self._revision_value_table() | 515 result += self._revision_value_table() |
| 472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): | 516 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): |
| 473 result += '\n' + self._t_test_results() | 517 result += '\n' + self._t_test_results() |
| 474 return result | 518 return result |
| 475 | 519 |
| (...skipping 334 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 810 | 854 |
| 811 def surface_result(self, result_string): | 855 def surface_result(self, result_string): |
| 812 assert result_string in VALID_RESULT_CODES | 856 assert result_string in VALID_RESULT_CODES |
| 813 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). | 857 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). |
| 814 result_code = prefix + result_string | 858 result_code = prefix + result_string |
| 815 assert len(result_code) <= 20 | 859 assert len(result_code) <= 20 |
| 816 if result_code not in self.result_codes: | 860 if result_code not in self.result_codes: |
| 817 self.result_codes.add(result_code) | 861 self.result_codes.add(result_code) |
| 818 properties = self.api.m.step.active_result.presentation.properties | 862 properties = self.api.m.step.active_result.presentation.properties |
| 819 properties['extra_result_code'] = sorted(self.result_codes) | 863 properties['extra_result_code'] = sorted(self.result_codes) |
| OLD | NEW |