| Index: scripts/slave/recipe_modules/auto_bisect/bisector.py |
| diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| index 861ebbdc174bef0ecb9c06f02dd3be49413c7f1d..feede7d02880ddee9e9ebbb54da5607436fd13bd 100644 |
| --- a/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| +++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| @@ -47,13 +47,6 @@ DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| # How long to re-test the initial good-bad range for until significant |
| # difference is established. |
| REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60 |
| -# If we reach this number of samples on the reference range and have not |
| -# achieved statistical significance, bail. |
| -MAX_REQUIRED_SAMPLES = 15 |
| - |
| -# Significance level to use for determining difference between revisions via |
| -# hypothesis testing. |
| -SIGNIFICANCE_LEVEL = 0.01 |
| _FAILED_INITIAL_CONFIDENCE_ABORT_REASON = ( |
| 'The metric values for the initial "good" and "bad" revisions ' |
| @@ -76,6 +69,7 @@ class Bisector(object): |
| in the chromium repo. |
| """ |
| super(Bisector, self).__init__() |
| + self.loopCHECK = {} |
| self.flags = flags |
| self._api = api |
| self.result_codes = set() |
| @@ -155,7 +149,9 @@ class Bisector(object): |
| commit_position = self._api.m.commit_position.construct( |
| branch='refs/heads/master', value=rev) |
| try: |
| - return self._api.m.crrev.to_commit_hash(commit_position) |
| + return self._api.m.crrev.to_commit_hash( |
| + commit_position, |
| + step_test_data=lambda: self.api._test_data['hash_cp_map'][rev]) |
| except self.api.m.step.StepFailure: # pragma: no cover |
| self.surface_result('BAD_REV') |
| raise |
| @@ -167,37 +163,44 @@ class Bisector(object): |
| def _is_sha1(s): |
| return bool(re.match('^[0-9A-Fa-f]{40}$', s)) |
| - def significantly_different( |
| - self, list_a, list_b, |
| - significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover |
| - """Uses an external script to run hypothesis testing with scipy. |
| - |
| - The reason why we need an external script is that scipy is not available to |
| - the default python installed in all platforms. We instead rely on an |
| - anaconda environment to provide those packages. |
| - |
| - Args: |
| - list_a, list_b: Two lists representing samples to be compared. |
| - significance_level: Self-describing. As a decimal fraction. |
| - |
| + def compare_revisions(self, revision_a, revision_b): |
| + """ |
| Returns: |
| - A boolean indicating whether the null hypothesis ~(that the lists are |
| - samples from the same population) can be rejected at the specified |
| - significance level. |
| + True if the samples are significantly different. |
| + None if there is not enough data to tell. |
| + False if there's enough data but still can't tell the samples apart. |
| """ |
| - step_result = self.api.m.python( |
| - 'Checking sample difference', |
| - self.api.resource('significantly_different.py'), |
| - [json.dumps(list_a), json.dumps(list_b), str(significance_level)], |
| - stdout=self.api.m.json.output()) |
| - results = step_result.stdout |
| - if results is None: |
| - assert self.dummy_builds |
| - return True |
| - significantly_different = results['significantly_different'] |
| - step_result.presentation.logs[str(significantly_different)] = [ |
| - 'See json.output for details'] |
| - return significantly_different |
| + output_format = 'chartjson' |
| + values_a = revision_a.chartjson_paths |
| + values_b = revision_b.chartjson_paths |
| + if revision_a.valueset_paths and revision_b.valueset_paths: |
| + output_format = 'valueset' |
| + |
| + result = self.api.stat_compare( |
| + values_a, |
| + values_b, |
| + self.bisect_config['metric'], |
| + output_format=output_format, |
| + step_test_data=lambda: self.api.test_api.compare_samples_data( |
| + self.api._test_data.get('revision_data'), revision_a, revision_b)) |
| + |
| + revision_a.debug_values = result['sample_a']['debug_values'] |
| + revision_b.debug_values = result['sample_b']['debug_values'] |
| + revision_a.mean = result['sample_a']['mean'] |
| + revision_b.mean = result['sample_b']['mean'] |
| + revision_a.std_dev = result['sample_a']['std_dev'] |
| + revision_b.std_dev = result['sample_b']['std_dev'] |
| + |
| + if result['result'] == 'needMoreData': |
| + key = tuple(values_a), tuple(values_b) |
| + self.loopCHECK.setdefault(key, 0) |
| + self.loopCHECK[key] += 1 |
RobertoCN 2016/08/23 00:26:23: Remove loop check and debug prints
RobertoCN 2016/09/07 00:33:24: Done.
| + if self.loopCHECK[key] > 10: |
| + raise Exception('loopCHECK!@') |
| + print result['result'], revision_a.debug_values, revision_b.debug_values |
| + print revision_a.bisector.revisions |
| + return None |
| + return bool(result['result']) |
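A stand-alone sketch of the True/None/False contract that the compare_revisions method above describes (not part of this patch, and not the recipe's actual compare_samples tooling): it assumes scipy is available and uses a Mann-Whitney U test as a stand-in for self.api.stat_compare, with the minimum-sample threshold taken from the >= 5 test_run_count check later in this file and the significance level from the SIGNIFICANCE_LEVEL constant this patch removes.

# Hypothetical illustration only. The recipe itself maps the stat_compare
# result to True ("significantly different"), None ("needMoreData") or False.
from scipy import stats

MIN_SAMPLES = 5            # mirrors the >= 5 test_run_count check below
SIGNIFICANCE_LEVEL = 0.01  # value of the constant removed by this patch

def tri_state_compare(sample_a, sample_b):
  """Returns True/None/False with the same meaning as compare_revisions."""
  if len(sample_a) < MIN_SAMPLES or len(sample_b) < MIN_SAMPLES:
    return None  # not enough data to tell
  _, p_value = stats.mannwhitneyu(sample_a, sample_b, alternative='two-sided')
  if p_value < SIGNIFICANCE_LEVEL:
    return True  # the samples are significantly different
  return False   # enough data, but the samples cannot be told apart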
| def config_step(self): |
| """Yields a step that prints the bisect config.""" |
| @@ -235,8 +238,8 @@ class Bisector(object): |
| return self._api |
| def compute_relative_change(self): |
| - old_value = float(self.good_rev.mean_value) |
| - new_value = float(self.bad_rev.mean_value) |
| + old_value = float(self.good_rev.mean) |
| + new_value = float(self.bad_rev.mean) |
| if new_value and not old_value: # pragma: no cover |
| self.relative_change = ZERO_TO_NON_ZERO |
| @@ -273,8 +276,10 @@ class Bisector(object): |
| stdin = self.api.m.raw_io.input(file_contents) |
| stdout = self.api.m.raw_io.output() |
| step_name = 'Hashing modified DEPS file with revision ' + commit_hash |
| - step_result = self.api.m.git(*cmd, cwd=cwd, stdin=stdin, stdout=stdout, |
| - name=step_name) |
| + step_result = self.api.m.git( |
| + *cmd, cwd=cwd, stdin=stdin, stdout=stdout, name=step_name, |
| + step_test_data=lambda: |
| + self.api.m.raw_io.test_api.stream_output(commit_hash)) |
| hash_string = step_result.stdout.splitlines()[0] |
| try: |
| if hash_string: |
| @@ -309,7 +314,9 @@ class Bisector(object): |
| cmd = cmd.split(' ') |
| stdout = self.api.m.raw_io.output() |
| step_name = 'Generating patch for %s to %s' % (git_object_a, deps_rev) |
| - step_result = self.api.m.git(*cmd, cwd=cwd, stdout=stdout, name=step_name) |
| + step_result = self.api.m.git( |
| + *cmd, cwd=cwd, stdout=stdout, name=step_name, |
| + step_test_data=lambda: self.api._test_data['diff_patch']) |
| patch_text = step_result.stdout |
| src_string = 'IAMSRC:' + git_object_a |
| dst_string = 'IAMDST:' + git_object_b |
| @@ -372,17 +379,20 @@ class Bisector(object): |
| with self.api.m.step.nest('Expanding revision range'): |
| good_hash = self.good_rev.commit_hash |
| bad_hash = self.bad_rev.commit_hash |
| + depot = self.good_rev.depot_name |
| step_name = 'for revisions %s:%s' % (good_hash, bad_hash) |
| revisions = self._revision_range( |
| start=good_hash, |
| end=bad_hash, |
| depot_name=self.base_depot, |
| - step_name=step_name) |
| + step_name=step_name, |
| + step_test_data=lambda: self.api._test_data['revision_list'][depot] |
| + ) |
| self.revisions = [self.good_rev] + revisions + [self.bad_rev] |
| self._update_revision_list_indexes() |
| def _revision_range(self, start, end, depot_name, base_revision=None, |
| - step_name=None): |
| + step_name=None, **kwargs): |
| """Returns a list of RevisionState objects between |start| and |end|. |
| Args: |
| @@ -403,7 +413,7 @@ class Bisector(object): |
| step_name, |
| self.api.resource('fetch_intervening_revisions.py'), |
| [start, end, depot_config.DEPOT_DEPS_NAME[depot_name]['url']], |
| - stdout=self.api.m.json.output()) |
| + stdout=self.api.m.json.output(), **kwargs) |
| except self.api.m.step.StepFailure: # pragma: no cover |
| self.surface_result('BAD_REV') |
| raise |
| @@ -509,7 +519,9 @@ class Bisector(object): |
| end=dep_revision_max, |
| depot_name=depot_name, |
| base_revision=min_revision, |
| - step_name=step_name) |
| + step_name=step_name, |
| + step_test_data=lambda: |
| + self.api._test_data['revision_list'][depot_name]) |
| new_revisions = self.revisions[:max_revision.list_index] |
| new_revisions += rev_list |
| new_revisions += self.revisions[max_revision.list_index:] |
| @@ -546,8 +558,8 @@ class Bisector(object): |
| True if the check passes (i.e. no problem), False if the change is not |
| a regression according to the improvement direction. |
| """ |
| - good = self.good_rev.mean_value |
| - bad = self.bad_rev.mean_value |
| + good = self.good_rev.mean |
| + bad = self.bad_rev.mean |
| if self.is_return_code_mode(): |
| return True |
| @@ -587,27 +599,25 @@ class Bisector(object): |
| self.bad_rev.overall_return_code) |
| if self.bypass_stats_check: |
| - dummy_result = self.good_rev.values != self.bad_rev.values |
| + self.compare_revisions(self.good_rev, self.bad_rev) |
| + dummy_result = self.good_rev.mean != self.bad_rev.mean |
| if not dummy_result: |
| self._set_insufficient_confidence_warning() |
| return dummy_result |
| + # TODO(robertocn): This step should not be necessary in some cases. |
| with self.api.m.step.nest('Re-testing reference range'): |
| expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| while time.time() < expiration_time: |
| - if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| - if self.significantly_different(self.good_rev.values, |
| - self.bad_rev.values): |
| + if (self.good_rev.test_run_count >= 5 |
| + and self.bad_rev.test_run_count >= 5): |
| + if self.compare_revisions(self.good_rev, self.bad_rev): |
| return True |
| - if len(self.good_rev.values) == len(self.bad_rev.values): |
| + if self.good_rev.test_run_count == self.bad_rev.test_run_count: |
| revision_to_retest = self.last_tested_revision |
| else: |
| revision_to_retest = min(self.good_rev, self.bad_rev, |
| - key=lambda x: len(x.values)) |
| - if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES: |
| - revision_to_retest.retest() |
| - else: |
| - break |
| + key=lambda x: x.test_run_count) |
| self._set_insufficient_confidence_warning() |
| return False |
| @@ -629,14 +639,18 @@ class Bisector(object): |
| result = 'bisector.lkgr: %r\n' % self.lkgr |
| result += 'bisector.fkbr: %r\n\n' % self.fkbr |
| result += self._revision_value_table() |
| - if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): |
| - result += '\n' + self._t_test_results() |
| + if (self.lkgr and self.lkgr.test_run_count and self.fkbr and |
| + self.fkbr.test_run_count): |
| + result += '\n' + '\n'.join([ |
| + 'LKGR values: %r' % list(self.lkgr.debug_values), |
| + 'FKBR values: %r' % list(self.fkbr.debug_values), |
| + ]) |
| return result |
| def _revision_value_table(self): |
| """Returns a string table showing revisions and their values.""" |
| header = [['Revision', 'Values']] |
| - rows = [[r.revision_string(), str(r.values)] for r in self.revisions] |
| + rows = [[r.revision_string(), str(r.debug_values)] for r in self.revisions] |
| return self._pretty_table(header + rows) |
| def _pretty_table(self, data): |
| @@ -645,20 +659,6 @@ class Bisector(object): |
| results.append('%-15s' * len(row) % tuple(row)) |
| return '\n'.join(results) |
| - def _t_test_results(self): |
| - """Returns a string showing t-test results for lkgr and fkbr.""" |
| - t, df, p = self.api.m.math_utils.welchs_t_test( |
| - self.lkgr.values, self.fkbr.values) |
| - lines = [ |
| - 'LKGR values: %r' % self.lkgr.values, |
| - 'FKBR values: %r' % self.fkbr.values, |
| - 't-statistic: %r' % t, |
| - 'deg. of freedom: %r' % df, |
| - 'p-value: %r' % p, |
| - 'Confidence score: %r' % (100 * (1 - p)) |
| - ] |
| - return '\n'.join(lines) |
| - |
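For reference, the quantities the removed _t_test_results helper reported can be reproduced with scipy. The sketch below is an assumed stand-in for math_utils.welchs_t_test, whose implementation is not shown in this change, and it does not recover the degrees of freedom the original helper printed.

# Assumed stand-in for math_utils.welchs_t_test; illustration only.
from scipy import stats

def welch_summary(lkgr_values, fkbr_values):
  # Welch's (unequal-variance) two-sample t-test.
  t, p = stats.ttest_ind(lkgr_values, fkbr_values, equal_var=False)
  return {
      't-statistic': t,
      'p-value': p,
      # Same derivation the removed code used for its confidence score.
      'Confidence score': 100 * (1 - p),
  }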
| def print_result_debug_info(self): |
| """Prints extra debug info at the end of the bisect process.""" |
| lines = self._results_debug_message().splitlines() |
| @@ -683,7 +683,7 @@ class Bisector(object): |
| candidate_range = [revision for revision in |
| self.revisions[self.lkgr.list_index + 1: |
| self.fkbr.list_index] |
| - if not revision.tested and not revision.failed] |
| + if not revision.failed] |
| if len(candidate_range) == 1: |
| return candidate_range[0] |
| if len(candidate_range) == 0: |
| @@ -740,38 +740,13 @@ class Bisector(object): |
| return True |
| return False |
| - def wait_for_all(self, revision_list): |
| - """Waits for all revisions in list to finish.""" |
| - for r in revision_list: |
| - self.wait_for(r) |
| - |
| - def wait_for(self, revision, nest_check=True): |
| - """Waits for the revision to finish its job.""" |
| - if nest_check and not self.flags.get( |
| - 'do_not_nest_wait_for_revision'): # pragma: no cover |
| - with self.api.m.step.nest('Waiting for ' + revision.revision_string()): |
| - return self.wait_for(revision, nest_check=False) |
| - while True: |
| - revision.update_status() |
| - if revision.in_progress: |
| - self.api.m.python.inline( |
| - 'sleeping', |
| - """ |
| - import sys |
| - import time |
| - time.sleep(20*60) |
| - sys.exit(0) |
| - """) |
| - else: |
| - break |
| - |
| def _update_candidate_range(self): |
| """Updates lkgr and fkbr (last known good/first known bad) revisions. |
| lkgr and fkbr are 'pointers' to the appropriate RevisionState objects in |
| bisectors.revisions.""" |
| for r in self.revisions: |
| - if r.tested: |
| + if r.test_run_count: |
| if r.good: |
| self.lkgr = r |
| elif r.bad: |
| @@ -876,9 +851,6 @@ class Bisector(object): |
| """Returns the results as a jsonable object.""" |
| config = self.bisect_config |
| results_confidence = 0 |
| - if self.culprit: |
| - results_confidence = self.api.m.math_utils.confidence_score( |
| - self.lkgr.values, self.fkbr.values) |
| if self.failed: |
| status = 'failed' |
| @@ -902,7 +874,6 @@ class Bisector(object): |
| 'test_type': config['test_type'], |
| 'metric': config['metric'], |
| 'change': self.relative_change, |
| - 'score': results_confidence, |
| 'good_revision': self.good_rev.commit_hash, |
| 'bad_revision': self.bad_rev.commit_hash, |
| 'warnings': self.warnings, |
| @@ -934,14 +905,14 @@ class Bisector(object): |
| def _revision_data(self): |
| revision_rows = [] |
| for r in self.revisions: |
| - if r.tested or r.aborted: |
| + if r.test_run_count: |
| revision_rows.append({ |
| 'depot_name': r.depot_name, |
| 'commit_hash': r.commit_hash, |
| 'revision_string': r.revision_string(), |
| - 'mean_value': r.mean_value, |
| + 'mean_value': r.mean, |
| 'std_dev': r.std_dev, |
| - 'values': r.values, |
| + 'values': r.debug_values, |
| 'result': 'good' if r.good else 'bad' if r.bad else 'unknown', |
| }) |
| return revision_rows |