Chromium Code Reviews

Index: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py b/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
index 28b149303858a44f28daa847fb8f43f8fcbcf05e..8a67f66bd7112de3958b1f627e1494f689df7c7d 100644
--- a/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
+++ b/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
@@ -3,6 +3,7 @@
 # found in the LICENSE file.
 import json
+import math
 import tempfile
 import os
 import uuid
@@ -12,6 +13,9 @@ from . import revision_state
 if 'CACHE_TEST_RESULTS' in os.environ:  # pragma: no cover
   from . import test_results_cache
+# These constants control how the number of repetitions is increased
+# during re-test.
+MINIMUM_SAMPLE_SIZE = 5
+INCREASE_FACTOR = 1.5
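For intuition, a minimal standalone sketch of the growth rule these constants drive (the helper below is hypothetical, not part of the patch): the pooled sample grows roughly geometrically, starting at the minimum.

import math

def grown_size(n, minimum=5, factor=1.5):
  # Next target sample size under MINIMUM_SAMPLE_SIZE / INCREASE_FACTOR.
  return max(minimum, int(math.ceil(n * factor)))

sizes = [0]
for _ in range(4):
  sizes.append(grown_size(sizes[-1]))
assert sizes == [0, 5, 8, 12, 18]  # each re-test grows the sample ~1.5x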
 class PerfRevisionState(revision_state.RevisionState):
   """Contains the state and results for one revision in a perf bisect job."""
@@ -21,9 +25,10 @@ class PerfRevisionState(revision_state.RevisionState):
     self.values = []
     self.mean_value = None
     self.std_dev = None
+    self.repeat_count = MINIMUM_SAMPLE_SIZE
     self._test_config = None

-  def _read_test_results(self):
+  def _read_test_results(self, check_revision_goodness=True):
     """Gets the test results from GS and checks if the rev is good or bad."""
     test_results = self._get_test_results()
     # Results will contain the keys 'results' and 'output' where output is the
@@ -36,7 +41,7 @@ class PerfRevisionState(revision_state.RevisionState):
     if 'MISSING_METRIC' in results.get('errors'):  # pragma: no cover
       self.bisector.surface_result('MISSING_METRIC')
       return
-    self.values = results['values']
+    self.values = (self.values or []) + results['values']
|
dtu  2016/02/05 00:18:02
Why do you need this? Isn't self.values already in…
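The pooling on the right-hand side of that line is what makes the re-test flow below work: previously collected samples are kept, and each new run only appends its batch. A toy, self-contained illustration with made-up numbers:

values = [10.1, 10.3]                    # samples kept from an earlier run
results = {'values': [10.2, 10.4, 9.9]}  # payload of a later re-test run
values = (values or []) + results['values']
assert values == [10.1, 10.3, 10.2, 10.4, 9.9]  # pooled sample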
     if self.bisector.is_return_code_mode():
       retcodes = test_results['retcodes']
       overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
@@ -51,7 +56,8 @@ class PerfRevisionState(revision_state.RevisionState):
         self.bisector.surface_result('MISSING_METRIC')
         return
     # We cannot test the goodness of the initial rev range.
-    if self.bisector.good_rev != self and self.bisector.bad_rev != self:
+    if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
+        check_revision_goodness):
       if self._check_revision_good():
         self.good = True
       else:
| @@ -113,14 +119,13 @@ class PerfRevisionState(revision_state.RevisionState): |
| required_test_properties = { |
| 'truncate_percent', |
| 'metric', |
| - 'max_time_minutes', |
| 'command', |
| - 'repeat_count', |
| 'test_type' |
| } |
| for k, v in self.bisector.bisect_config.iteritems(): |
| if k in required_test_properties: |
| result[k] = v |
| + result['repeat_count'] = self.repeat_count |
| self._test_config = result |
| return result |
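Because the repeat count now changes between re-test rounds, it is injected from the revision state instead of being copied from the static bisect config. A small sketch of the resulting dict construction (the keys mirror the patch; the values are made up):

bisect_config = {
    'truncate_percent': 25,
    'metric': 'page_load_time',
    'command': './run_benchmark foo',
    'test_type': 'perf',
    'max_time_minutes': 20,  # no longer copied into the test config
}
required = {'truncate_percent', 'metric', 'command', 'test_type'}
test_config = {k: v for k, v in bisect_config.items() if k in required}
test_config['repeat_count'] = 8  # e.g. self.repeat_count after one re-test
assert 'max_time_minutes' not in test_config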
@@ -191,6 +196,14 @@ class PerfRevisionState(revision_state.RevisionState):
         'job_name': self.job_name,
     }

+  def retest(self):  # pragma: no cover
+    # We need at least 5 samples for the hypothesis testing to work properly.
dtu  2016/02/05 00:18:02
Sorry, not strictly true as written. Only true wit…
+    target_sample_size = max(MINIMUM_SAMPLE_SIZE,
+                             int(math.ceil(len(self.values) * INCREASE_FACTOR)))
+    self.status = PerfRevisionState.NEED_MORE_DATA
+    self.repeat_count = target_sample_size - len(self.values)
+    self.start_job()
+    self.bisector.wait_for_any([self])
+
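Note that retest() requests only the difference between the target size and the samples already pooled, since _read_test_results keeps earlier values. A worked example under the constants above:

import math

existing = 8                                     # samples pooled so far
target = max(5, int(math.ceil(existing * 1.5)))  # -> 12
repeat_count = target - existing                 # request only 4 new runs
assert (target, repeat_count) == (12, 4)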
   def _get_test_results(self):
     """Tries to get the results of a test job from cloud storage."""
     api = self.bisector.api
@@ -208,20 +221,48 @@ class PerfRevisionState(revision_state.RevisionState):
   def _check_revision_good(self):
     """Determines if a revision is good or bad.
-    Note that our current approach is to determine whether it is closer to
-    either the 'good' and 'bad' revisions given for the bisect job.
+    Iteratively increases the sample size of the revision being tested, of
+    the last known good revision, and of the first known bad revision until
+    a statistically significant difference can be established between the
+    results of the revision being tested and those of one of the other two.
+
+    If the results do not converge towards a significant difference in
+    either direction, this is expected to time out eventually. This scenario
+    should be rather rare, since it is expected that the fkbr and lkgr are
+    significantly different as a precondition.
     Returns:
-      True if this revision is closer to the initial good revision's value than
-      to the initial bad revision's value. False otherwise.
+      True if the results of testing this revision are significantly
+      different from those of testing the earliest known bad revision.
+      False if they are instead significantly different from those of
+      testing the latest known good revision.
     """
-    # TODO: Reevaluate this approach
-    bisector = self.bisector
-    distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)
-    distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)
-    if distance_to_good < distance_to_bad:
-      return True
-    return False
+    diff_from_good = self.bisector.significantly_different(
dtu  2016/02/05 00:18:02
If you reorganize this function, you can avoid the…
+        self.bisector.lkgr.values, self.values)
+    diff_from_bad = self.bisector.significantly_different(
+        self.bisector.fkbr.values, self.values)
+
+    while not (diff_from_good or diff_from_bad):  # pragma: no cover
+      min(self.bisector.lkgr, self, self.bisector.fkbr,
+          key=lambda x: len(x.values)).retest()
+      diff_from_good = self.bisector.significantly_different(
+          self.bisector.lkgr.values, self.values)
+      diff_from_bad = self.bisector.significantly_different(
+          self.bisector.fkbr.values, self.values)
+
+    if diff_from_good and diff_from_bad:
+      # Multiple regressions.
+      # For now, proceed by bisecting on the larger difference of the means.
+      dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
+      dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
+      if dist_from_good > dist_from_bad:
+        # TODO(robertocn): Add a way to handle the secondary regression.
+        # self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
+        return False
+      else:
+        # self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
+        return True
+    return diff_from_bad  # pragma: no cover
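significantly_different is defined elsewhere in the bisector and is not shown in this diff; the sketch below approximates the convergence logic with Welch's t-test from scipy, which is an assumption about the statistic, not necessarily what the recipe uses:

from scipy import stats

ALPHA = 0.01  # hypothetical significance threshold

def significantly_different(sample_a, sample_b, alpha=ALPHA):
  # Stand-in for bisector.significantly_different (assumed behavior):
  # two-sided Welch's t-test, not assuming equal variances.
  _, p_value = stats.ttest_ind(sample_a, sample_b, equal_var=False)
  return p_value < alpha

lkgr_values = [10.0, 10.1, 9.9, 10.2, 10.0]   # last known good revision
fkbr_values = [12.0, 12.2, 11.9, 12.1, 12.0]  # first known bad revision
test_values = [12.1, 11.8, 12.0, 12.2, 12.1]  # revision under test

diff_from_good = significantly_different(lkgr_values, test_values)  # True
diff_from_bad = significantly_different(fkbr_values, test_values)   # False
# The while loop exits immediately; _check_revision_good returns
# diff_from_bad, i.e. False: this revision behaves like fkbr, so it is bad.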
   def __repr__(self):
     return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %