Chromium Code Reviews| Index: scripts/slave/recipe_modules/auto_bisect/bisector.py |
| diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| index e7677c13d0e15a503dbf7444676fb88bcd116f82..5e2ebe9acc10e093a6a0fc97eae0818bdc116e86 100644 |
| --- a/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| +++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| @@ -4,6 +4,7 @@ |
| import json |
| import re |
| +import time |
| from . import bisect_results |
| from . import depot_config |
| @@ -42,6 +43,14 @@ VALID_RESULT_CODES = ( |
| # to move *too* far from the original revision, we'll cap the search at 25%. |
| DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| +# How long to keep re-testing the initial good-bad range until a significant |
| +# difference is established. |
| +REGRESSION_CHECK_TIMEOUT = 6 * 60 * 60 |
|
dtu
2016/02/05 00:18:01
Is that six hours? Do we really have that much time?
|
| + |
| +# Significance level to use for determining difference between revisions via |
| +# hypothesis testing. |
| +SIGNIFICANCE_LEVEL = 0.05 |
| + |
| class Bisector(object): |
| """This class abstracts an ongoing bisect (or n-sect) job.""" |
| @@ -114,6 +123,38 @@ class Bisector(object): |
| if init_revisions: |
| self._expand_chromium_revision_range() |
| + def significantly_different( |
| + self, list_a, list_b, |
| + significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover |
| + """Uses an external script to run hypothesis testing with scipy. |
| + |
| + The reason why we need an external script is that scipy is not available to |
| + the default python installed in all platforms. We instead rely on an |
| + anaconda environment to provide those packages. |
| + |
| + Args: |
| + list_a, list_b: Two lists representing samples to be compared. |
| + significance_level: Self-describing. As a decimal fraction. |
| + |
| + Returns: |
| + A boolean indicating whether the null hypothesis (that the lists are |
| + samples from the same population) can be rejected at the specified |
| + significance level. |
| + """ |
| + step_result = self.api.m.python( |
| + 'Checking sample difference', |
| + self.api.resource('significantly_different.py'), |
| + [json.dumps(list_a), json.dumps(list_b), str(significance_level)], |
| + stdout=self.api.m.json.output()) |
| + results = step_result.stdout |
| + if results is None: |
| + assert self.dummy_builds |
| + return True |
| + significantly_different = results['significantly_different'] |
| + step_result.presentation.logs[str(significantly_different)] = [ |
| + 'See json.output for details'] |
| + return significantly_different |
| + |
| def config_step(self): |
| """Yields a simple echo step that outputs the bisect config.""" |
| api = self.api |
| @@ -412,7 +453,7 @@ class Bisector(object): |
| 'an improvement rather than a regression, given the ' |
| 'expected direction of improvement.') |
| - def check_initial_confidence(self): |
| + def check_initial_confidence(self): # pragma: no cover |
| """Checks that the initial range presents a clear enough regression. |
| We calculate the confidence of the results of the given 'good' |
| @@ -428,41 +469,44 @@ class Bisector(object): |
| if self.required_initial_confidence is None: |
| return True # pragma: no cover |
| + # TODO(robertocn): Remove all uses of "confidence". |
| if self.dummy_initial_confidence is not None: |
| self.initial_confidence = float( |
| self.dummy_initial_confidence) |
| + if (float(self.initial_confidence) < |
| + float(self.required_initial_confidence)): |
| + self._set_insufficient_confidence_warning() |
| + return False |
| + return True |
| + |
| + if self.dummy_builds: |
| + dummy_result = self.good_rev.values != self.bad_rev.values |
| + if not dummy_result: |
| + self._set_insufficient_confidence_warning() |
| + return dummy_result |
|
dtu
2016/02/05 00:18:01
Seems like there's a bunch of technical debt related to this.
|
| + |
| + expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| + while time.time() < expiration_time: |
| + if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| + if self.significantly_different(self.good_rev.values, |
| + self.bad_rev.values): |
| + return True |
| + min(self.good_rev, self.bad_rev, key=lambda x: len(x.values)).retest() |
| + self._set_insufficient_confidence_warning() |
| + return False |
| - else: # pragma: no cover |
| - if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: |
| - # If there are too few values, the confidence score is not a good way to |
| - # determine whether the regression is reproducible. |
| - # TODO(robertocn): Investigate a straightforward approach to deal with |
| - # these cases. Such as the mean of one group lying within the range of |
| - # the other. |
| - return True |
| - self.initial_confidence = ( |
| - self.api.m.math_utils.confidence_score( |
| - self.good_rev.values, |
| - self.bad_rev.values)) |
| - if (self.initial_confidence < |
| - self.required_initial_confidence): # pragma: no cover |
| - self._set_insufficient_confidence_warning(self.initial_confidence) |
| - return False |
| - return True |
| def get_exception(self): |
| raise NotImplementedError() # pragma: no cover |
| # TODO: should return an exception with the details of the failure. |
| def _set_insufficient_confidence_warning( |
| - self, actual_confidence): # pragma: no cover |
| + self): # pragma: no cover |
| """Adds a warning about the lack of initial regression confidence.""" |
| self.failed_initial_confidence = True |
| self.surface_result('LO_INIT_CONF') |
| self.warnings.append( |
| - ('Bisect failed to reproduce the regression with enough confidence. ' |
| - 'Needed {:.2f}%, got {:.2f}%.').format( |
| - self.required_initial_confidence, actual_confidence)) |
| + 'Bisect failed to reproduce the regression with enough confidence.') |
| def _results_debug_message(self): |
| """Returns a string with values used to debug a bisect result.""" |