Chromium Code Reviews| Index: scripts/slave/recipe_modules/auto_bisect/bisector.py |
| diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| index cd46eb187fbc2b818e67509b6c6a9350f9519313..86ed2ebded4722421de0d532b77d932041d6fc22 100644 |
| --- a/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| +++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py |
| @@ -46,14 +46,7 @@ DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 |
| # How long to re-test the initial good-bad range for until significant |
| # difference is established. |
| -REGRESSION_CHECK_TIMEOUT = 20 * 60 * 60 # 20 hours. A build times out after 24. |
| -# If we reach this number of samples on the reference range and have not |
| -# achieved statistical significance, bail. |
| -MAX_REQUIRED_SAMPLES = 15 |
| - |
| -# Significance level to use for determining difference between revisions via |
| -# hypothesis testing. |
| -SIGNIFICANCE_LEVEL = 0.01 |
| +REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60 |
|
eakuefner
2016/09/09 20:35:42
Why? Comment?
RobertoCN
2016/09/13 22:11:40
I don't know. I'll set it back to what it was. A l
|
| _FAILED_INITIAL_CONFIDENCE_ABORT_REASON = ( |
| 'The metric values for the initial "good" and "bad" revisions ' |
| @@ -68,6 +61,10 @@ _DIRECTION_OF_IMPROVEMENT_ABORT_REASON = ( |
| class Bisector(object): |
| """This class abstracts an ongoing bisect (or n-sect) job.""" |
| + SIGNIFICANTLY_DIFFERENT = True |
|
prasadv
2016/09/09 18:52:30
I think declaring enum here would be better.
RobertoCN
2016/09/13 22:11:40
Done.
|
| + NOT_SIGNIFICANTLY_DIFFERENT = False |
| + NEED_MORE_DATA = 'needMoreData' |
| + |
| def __init__(self, api, bisect_config, revision_class, init_revisions=True, |
| **flags): |
| """Initializes the state of a new bisect job from a dictionary. |
| @@ -155,7 +152,9 @@ class Bisector(object): |
| commit_position = self._api.m.commit_position.construct( |
| branch='refs/heads/master', value=rev) |
| try: |
| - return self._api.m.crrev.to_commit_hash(commit_position) |
| + return self._api.m.crrev.to_commit_hash( |
| + commit_position, |
| + step_test_data=lambda: self.api._test_data['hash_cp_map'][rev]) |
| except self.api.m.step.StepFailure: # pragma: no cover |
| self.surface_result('BAD_REV') |
| raise |
| @@ -167,37 +166,38 @@ class Bisector(object): |
| def _is_sha1(s): |
| return bool(re.match('^[0-9A-Fa-f]{40}$', s)) |
| - def significantly_different( |
| - self, list_a, list_b, |
| - significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover |
| - """Uses an external script to run hypothesis testing with scipy. |
| - |
| - The reason why we need an external script is that scipy is not available to |
| - the default python installed in all platforms. We instead rely on an |
| - anaconda environment to provide those packages. |
| - |
| - Args: |
| - list_a, list_b: Two lists representing samples to be compared. |
| - significance_level: Self-describing. As a decimal fraction. |
| - |
| + def compare_revisions(self, revision_a, revision_b): |
| + """ |
| Returns: |
| - A boolean indicating whether the null hypothesis ~(that the lists are |
| - samples from the same population) can be rejected at the specified |
| - significance level. |
| + SIGNIFICANTLY_DIFFERENT if the samples are significantly different. |
| + NEED_MORE_DATA if there is not enough data to tell. |
| +      NOT_SIGNIFICANTLY_DIFFERENT if there's enough data but still can't |
| + tell the samples apart. |
| """ |
| - step_result = self.api.m.python( |
| - 'Checking sample difference', |
| - self.api.resource('significantly_different.py'), |
| - [json.dumps(list_a), json.dumps(list_b), str(significance_level)], |
| - stdout=self.api.m.json.output()) |
| - results = step_result.stdout |
| - if results is None: |
| - assert self.dummy_builds |
| - return True |
| - significantly_different = results['significantly_different'] |
| - step_result.presentation.logs[str(significantly_different)] = [ |
| - 'See json.output for details'] |
| - return significantly_different |
| + output_format = 'chartjson' |
| + values_a = revision_a.chartjson_paths |
| + values_b = revision_b.chartjson_paths |
| + if revision_a.valueset_paths and revision_b.valueset_paths: |
| + output_format = 'valueset' |
|
prasadv
2016/09/09 18:52:30
shouldn't we re-assign values_a and values_b here?
RobertoCN
2016/09/13 22:11:40
Nice catch!
Done.
|
| + |
| + result = self.api.stat_compare( |
| + values_a, |
| + values_b, |
| + self.bisect_config['metric'], |
| + output_format=output_format, |
| + step_test_data=lambda: self.api.test_api.compare_samples_data( |
| + self.api._test_data.get('revision_data'), revision_a, revision_b)) |
| + |
| + revision_a.debug_values = result['sample_a']['debug_values'] |
| + revision_b.debug_values = result['sample_b']['debug_values'] |
| + |
| + if result['result'] == self.NEED_MORE_DATA: |
| + return self.NEED_MORE_DATA |
| + elif result['result']: |
| + return self.SIGNIFICANTLY_DIFFERENT |
| + # TODO(robertocn): Remove this pragma and cover this line. |
| + else: # pragma: no cover |
| + return self.NOT_SIGNIFICANTLY_DIFFERENT |
| def config_step(self): |
| """Yields a step that prints the bisect config.""" |
| @@ -235,8 +235,8 @@ class Bisector(object): |
| return self._api |
| def compute_relative_change(self): |
| - old_value = float(self.good_rev.mean_value) |
| - new_value = float(self.bad_rev.mean_value) |
| + old_value = float(self.good_rev.mean or 0) |
| + new_value = float(self.bad_rev.mean or 0) |
| if new_value and not old_value: # pragma: no cover |
| self.relative_change = ZERO_TO_NON_ZERO |
| @@ -273,8 +273,10 @@ class Bisector(object): |
| stdin = self.api.m.raw_io.input(file_contents) |
| stdout = self.api.m.raw_io.output() |
| step_name = 'Hashing modified DEPS file with revision ' + commit_hash |
| - step_result = self.api.m.git(*cmd, cwd=cwd, stdin=stdin, stdout=stdout, |
| - name=step_name) |
| + step_result = self.api.m.git( |
| + *cmd, cwd=cwd, stdin=stdin, stdout=stdout, name=step_name, |
| + step_test_data=lambda: |
| + self.api.m.raw_io.test_api.stream_output(commit_hash)) |
| hash_string = step_result.stdout.splitlines()[0] |
| try: |
| if hash_string: |
| @@ -309,7 +311,9 @@ class Bisector(object): |
| cmd = cmd.split(' ') |
| stdout = self.api.m.raw_io.output() |
| step_name = 'Generating patch for %s to %s' % (git_object_a, deps_rev) |
| - step_result = self.api.m.git(*cmd, cwd=cwd, stdout=stdout, name=step_name) |
| + step_result = self.api.m.git( |
| + *cmd, cwd=cwd, stdout=stdout, name=step_name, |
| + step_test_data=lambda: self.api._test_data['diff_patch']) |
| patch_text = step_result.stdout |
| src_string = 'IAMSRC:' + git_object_a |
| dst_string = 'IAMDST:' + git_object_b |
| @@ -372,18 +376,21 @@ class Bisector(object): |
| with self.api.m.step.nest('Expanding revision range'): |
| good_hash = self.good_rev.commit_hash |
| bad_hash = self.bad_rev.commit_hash |
| + depot = self.good_rev.depot_name |
| step_name = 'for revisions %s:%s' % (good_hash, bad_hash) |
| revisions = self._revision_range( |
| start=good_hash, |
| end=bad_hash, |
| depot_name=self.base_depot, |
| step_name=step_name, |
| - exclude_end=True) |
| + exclude_end=True, |
| + step_test_data=lambda: self.api._test_data['revision_list'][depot] |
| + ) |
| self.revisions = [self.good_rev] + revisions + [self.bad_rev] |
| self._update_revision_list_indexes() |
| def _revision_range(self, start, end, depot_name, base_revision=None, |
| - step_name=None, exclude_end=False): |
| + step_name=None, exclude_end=False, **kwargs): |
| """Returns a list of RevisionState objects between |start| and |end|. |
| When expanding the initial revision range we want to exclude the last |
| @@ -412,7 +419,7 @@ class Bisector(object): |
| step_name, |
| self.api.resource('fetch_intervening_revisions.py'), |
| [start, end, depot_config.DEPOT_DEPS_NAME[depot_name]['url']], |
| - stdout=self.api.m.json.output()) |
| + stdout=self.api.m.json.output(), **kwargs) |
| except self.api.m.step.StepFailure: # pragma: no cover |
| self.surface_result('BAD_REV') |
| raise |
| @@ -521,7 +528,9 @@ class Bisector(object): |
| end=dep_revision_max, |
| depot_name=depot_name, |
| base_revision=min_revision, |
| - step_name=step_name) |
| + step_name=step_name, |
| + step_test_data=lambda: |
| + self.api._test_data['revision_list'][depot_name]) |
| new_revisions = self.revisions[:max_revision.list_index] |
| new_revisions += rev_list |
| new_revisions += self.revisions[max_revision.list_index:] |
| @@ -558,8 +567,8 @@ class Bisector(object): |
| True if the check passes (i.e. no problem), False if the change is not |
| a regression according to the improvement direction. |
| """ |
| - good = self.good_rev.mean_value |
| - bad = self.bad_rev.mean_value |
| + good = self.good_rev.mean |
| + bad = self.bad_rev.mean |
| if self.is_return_code_mode(): |
| return True |
| @@ -594,32 +603,32 @@ class Bisector(object): |
| other in a statistically significant manner. False if such difference could |
| not be established in the time or sample size allowed. |
| """ |
| - if self.test_type == 'return_code': |
| + if self.is_return_code_mode(): |
| return (self.good_rev.overall_return_code != |
| self.bad_rev.overall_return_code) |
| if self.bypass_stats_check: |
| - dummy_result = self.good_rev.values != self.bad_rev.values |
| + self.compare_revisions(self.good_rev, self.bad_rev) |
| + dummy_result = self.good_rev.mean != self.bad_rev.mean |
| if not dummy_result: |
| self._set_insufficient_confidence_warning() |
| return dummy_result |
| + # TODO(robertocn): This step should not be necessary in some cases. |
| with self.api.m.step.nest('Re-testing reference range'): |
| expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT |
| while time.time() < expiration_time: |
| - if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: |
| - if self.significantly_different(self.good_rev.values, |
| - self.bad_rev.values): |
| + if (self.good_rev.test_run_count >= 5 |
| + and self.bad_rev.test_run_count >= 5): |
| + if self.compare_revisions(self.good_rev, self.bad_rev): |
| return True |
| - if len(self.good_rev.values) == len(self.bad_rev.values): |
| + if self.good_rev.test_run_count == self.bad_rev.test_run_count: |
| revision_to_retest = self.last_tested_revision |
| else: |
| revision_to_retest = min(self.good_rev, self.bad_rev, |
| - key=lambda x: len(x.values)) |
| - if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES: |
| - revision_to_retest.retest() |
| - else: |
| - break |
| + key=lambda x: x.test_run_count) |
| + revision_to_retest._do_test() |
| + |
| self._set_insufficient_confidence_warning() |
| return False |
| @@ -641,14 +650,18 @@ class Bisector(object): |
| result = 'bisector.lkgr: %r\n' % self.lkgr |
| result += 'bisector.fkbr: %r\n\n' % self.fkbr |
| result += self._revision_value_table() |
| - if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): |
| - result += '\n' + self._t_test_results() |
| + if (self.lkgr and self.lkgr.test_run_count and self.fkbr and |
| + self.fkbr.test_run_count): |
| + result += '\n' + '\n'.join([ |
| + 'LKGR values: %r' % list(self.lkgr.debug_values), |
| + 'FKBR values: %r' % list(self.fkbr.debug_values), |
| + ]) |
| return result |
| def _revision_value_table(self): |
| """Returns a string table showing revisions and their values.""" |
| header = [['Revision', 'Values']] |
| - rows = [[r.revision_string(), str(r.values)] for r in self.revisions] |
| + rows = [[r.revision_string(), str(r.debug_values)] for r in self.revisions] |
| return self._pretty_table(header + rows) |
| def _pretty_table(self, data): |
| @@ -657,20 +670,6 @@ class Bisector(object): |
| results.append('%-15s' * len(row) % tuple(row)) |
| return '\n'.join(results) |
| - def _t_test_results(self): |
| - """Returns a string showing t-test results for lkgr and fkbr.""" |
| - t, df, p = self.api.m.math_utils.welchs_t_test( |
| - self.lkgr.values, self.fkbr.values) |
| - lines = [ |
| - 'LKGR values: %r' % self.lkgr.values, |
| - 'FKBR values: %r' % self.fkbr.values, |
| - 't-statistic: %r' % t, |
| - 'deg. of freedom: %r' % df, |
| - 'p-value: %r' % p, |
| - 'Confidence score: %r' % (100 * (1 - p)) |
| - ] |
| - return '\n'.join(lines) |
| - |
| def print_result_debug_info(self): |
| """Prints extra debug info at the end of the bisect process.""" |
| lines = self._results_debug_message().splitlines() |
| @@ -695,7 +694,7 @@ class Bisector(object): |
| candidate_range = [revision for revision in |
| self.revisions[self.lkgr.list_index + 1: |
| self.fkbr.list_index] |
| - if not revision.tested and not revision.failed] |
| + if not revision.failed] |
| if len(candidate_range) == 1: |
| return candidate_range[0] |
| if len(candidate_range) == 0: |
| @@ -738,7 +737,7 @@ class Bisector(object): |
| does not contain a DEPS change. |
| """ |
| if (revision.bad and revision.previous_revision and |
| - revision.previous_revision.good): # pragma: no cover |
| + revision.previous_revision.good): |
| if revision.deps_change() and self._expand_deps_revisions(revision): |
| return False |
| self.culprit = revision |
| @@ -750,32 +749,9 @@ class Bisector(object): |
| return False |
| self.culprit = revision.next_revision |
| return True |
| - return False |
| - |
| - def wait_for_all(self, revision_list): |
| - """Waits for all revisions in list to finish.""" |
| - for r in revision_list: |
| - self.wait_for(r) |
| - |
| - def wait_for(self, revision, nest_check=True): |
| - """Waits for the revision to finish its job.""" |
| - if nest_check and not self.flags.get( |
| - 'do_not_nest_wait_for_revision'): # pragma: no cover |
| - with self.api.m.step.nest('Waiting for ' + revision.revision_string()): |
| - return self.wait_for(revision, nest_check=False) |
| - while True: |
| - revision.update_status() |
| - if revision.in_progress: |
| - self.api.m.python.inline( |
| - 'sleeping', |
| - """ |
| - import sys |
| - import time |
| - time.sleep(20*60) |
| - sys.exit(0) |
| - """) |
| - else: |
| - break |
| + # We'll never get here because revision adjacency is checked before this |
| + # function is called. |
| + assert False # pragma: no cover |
| def _update_candidate_range(self): |
| """Updates lkgr and fkbr (last known good/first known bad) revisions. |
| @@ -783,7 +759,7 @@ class Bisector(object): |
| lkgr and fkbr are 'pointers' to the appropriate RevisionState objects in |
| bisectors.revisions.""" |
| for r in self.revisions: |
| - if r.tested: |
| + if r.test_run_count: |
| if r.good: |
| self.lkgr = r |
| elif r.bad: |
| @@ -821,7 +797,7 @@ class Bisector(object): |
| # TODO(prasadv): Refactor this code to remove hard coded values and use |
| # target_bit from the bot config. crbug.com/640287 |
| - if 'android' in bot_name: |
| + if 'android' in bot_name: # pragma: no cover |
| if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']): |
| return 'android_arm64_perf_bisect_builder' |
| return 'android_perf_bisect_builder' |
| @@ -851,7 +827,7 @@ class Bisector(object): |
| # TODO(prasadv): Refactor this code to remove hard coded values and use |
| # target_bit from the bot config. crbug.com/640287 |
| - if 'android' in bot_name: |
| +    if 'android' in bot_name:  # pragma: no cover |
| if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']): |
| return 'gs://chrome-perf/Android arm64 Builder/full-build-linux_' |
| return 'gs://chrome-perf/Android Builder/full-build-linux_' |
| @@ -876,7 +852,7 @@ class Bisector(object): |
| def is_return_code_mode(self): |
| """Checks whether this is a bisect on the test's exit code.""" |
| - return self.bisect_config.get('test_type') == 'return_code' |
| + return self.test_type == 'return_code' |
| def surface_result(self, result_string): |
| assert result_string in VALID_RESULT_CODES |
| @@ -891,10 +867,6 @@ class Bisector(object): |
| def get_result(self): |
| """Returns the results as a jsonable object.""" |
| config = self.bisect_config |
| - results_confidence = 0 |
| - if self.culprit: |
| - results_confidence = self.api.m.math_utils.confidence_score( |
| - self.lkgr.values, self.fkbr.values) |
| if self.failed: |
| status = 'failed' |
| @@ -916,9 +888,8 @@ class Bisector(object): |
| 'bisect_bot': self.get_perf_tester_name(), |
| 'command': config['command'], |
| 'test_type': config['test_type'], |
| - 'metric': config['metric'], |
| + 'metric': config.get('metric'), |
| 'change': self.relative_change, |
| - 'score': results_confidence, |
| 'good_revision': self.good_rev.commit_hash, |
| 'bad_revision': self.bad_rev.commit_hash, |
| 'warnings': self.warnings, |
| @@ -950,14 +921,14 @@ class Bisector(object): |
| def _revision_data(self): |
| revision_rows = [] |
| for r in self.revisions: |
| - if r.tested or r.aborted: |
| + if r.test_run_count: |
| revision_rows.append({ |
| 'depot_name': r.depot_name, |
| 'commit_hash': r.commit_hash, |
| 'revision_string': r.revision_string(), |
| - 'mean_value': r.mean_value, |
| + 'mean_value': r.mean, |
| 'std_dev': r.std_dev, |
| - 'values': r.values, |
| + 'values': r.debug_values, |
| 'result': 'good' if r.good else 'bad' if r.bad else 'unknown', |
| }) |
| return revision_rows |