Index: scripts/slave/recipe_modules/auto_bisect/bisector.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py
index cd46eb187fbc2b818e67509b6c6a9350f9519313..1abaaa0b57599a965125f61c95eb1281db5b5676 100644
--- a/scripts/slave/recipe_modules/auto_bisect/bisector.py
+++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py
@@ -46,14 +46,7 @@ DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25
 # How long to re-test the initial good-bad range for until significant
 # difference is established.
-REGRESSION_CHECK_TIMEOUT = 20 * 60 * 60 # 20 hours. A build times out after 24.
-# If we reach this number of samples on the reference range and have not
-# achieved statistical significance, bail.
-MAX_REQUIRED_SAMPLES = 15
-
-# Significance level to use for determining difference between revisions via
-# hypothesis testing.
-SIGNIFICANCE_LEVEL = 0.01
+REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60  # 2 hours.
 
 _FAILED_INITIAL_CONFIDENCE_ABORT_REASON = (
     'The metric values for the initial "good" and "bad" revisions '
@@ -155,7 +148,9 @@ class Bisector(object):
     commit_position = self._api.m.commit_position.construct(
         branch='refs/heads/master', value=rev)
     try:
-      return self._api.m.crrev.to_commit_hash(commit_position)
+      return self._api.m.crrev.to_commit_hash(
+          commit_position,
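+          # Canned output so recipe simulation tests can exercise this call.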
+          step_test_data=lambda: self.api._test_data['hash_cp_map'][rev])
     except self.api.m.step.StepFailure:  # pragma: no cover
       self.surface_result('BAD_REV')
       raise
@@ -167,37 +162,37 @@ class Bisector(object):
   def _is_sha1(s):
     return bool(re.match('^[0-9A-Fa-f]{40}$', s))
 
-  def significantly_different(
-      self, list_a, list_b,
-      significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover
-    """Uses an external script to run hypothesis testing with scipy.
-
-    The reason why we need an external script is that scipy is not available to
-    the default python installed in all platforms. We instead rely on an
-    anaconda environment to provide those packages.
-
-    Args:
-      list_a, list_b: Two lists representing samples to be compared.
-      significance_level: Self-describing. As a decimal fraction.
-
+  def compare_revisions(self, revision_a, revision_b):
+    """Compares the sample values of two revisions for significance.
+
     Returns:
-      A boolean indicating whether the null hypothesis ~(that the lists are
-      samples from the same population) can be rejected at the specified
-      significance level.
+      True if the samples are significantly different.
+      None if there is not enough data to tell.
+      False if there's enough data but still can't tell the samples apart.
     """
-    step_result = self.api.m.python(
-        'Checking sample difference',
-        self.api.resource('significantly_different.py'),
-        [json.dumps(list_a), json.dumps(list_b), str(significance_level)],
-        stdout=self.api.m.json.output())
-    results = step_result.stdout
-    if results is None:
-      assert self.dummy_builds
-      return True
-    significantly_different = results['significantly_different']
-    step_result.presentation.logs[str(significantly_different)] = [
-        'See json.output for details']
-    return significantly_different
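+    # Use chartjson output unless both revisions also produced valuesets.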
+    output_format = 'chartjson'
+    values_a = revision_a.chartjson_paths
+    values_b = revision_b.chartjson_paths
+    if revision_a.valueset_paths and revision_b.valueset_paths:
+      output_format = 'valueset'
+      values_a = revision_a.valueset_paths
+      values_b = revision_b.valueset_paths
+
+    result = self.api.stat_compare(
+        values_a,
+        values_b,
+        self.bisect_config['metric'],
+        output_format=output_format,
+        step_test_data=lambda: self.api.test_api.compare_samples_data(
+            self.api._test_data.get('revision_data'), revision_a, revision_b))
+
+    revision_a.debug_values = result['sample_a']['debug_values']
+    revision_b.debug_values = result['sample_b']['debug_values']
+    revision_a.mean = result['sample_a']['mean']
+    revision_b.mean = result['sample_b']['mean']
+    revision_a.std_dev = result['sample_a']['std_dev']
+    revision_b.std_dev = result['sample_b']['std_dev']
+
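+    # The comparison step answers 'needMoreData' when it cannot yet reach a
+    # statistically significant verdict either way.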
+    if result['result'] == 'needMoreData':
+      return None
+    return bool(result['result'])
 
   def config_step(self):
     """Yields a step that prints the bisect config."""
@@ -235,8 +230,8 @@ class Bisector(object):
     return self._api
 
   def compute_relative_change(self):
-    old_value = float(self.good_rev.mean_value)
-    new_value = float(self.bad_rev.mean_value)
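+    # A revision's mean is None until samples exist; treat missing as 0.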
+    old_value = float(self.good_rev.mean or 0)
+    new_value = float(self.bad_rev.mean or 0)
 
     if new_value and not old_value:  # pragma: no cover
       self.relative_change = ZERO_TO_NON_ZERO
@@ -273,8 +268,10 @@ class Bisector(object):
     stdin = self.api.m.raw_io.input(file_contents)
     stdout = self.api.m.raw_io.output()
     step_name = 'Hashing modified DEPS file with revision ' + commit_hash
-    step_result = self.api.m.git(*cmd, cwd=cwd, stdin=stdin, stdout=stdout,
-                                 name=step_name)
+    step_result = self.api.m.git(
+        *cmd, cwd=cwd, stdin=stdin, stdout=stdout, name=step_name,
+        step_test_data=lambda:
+            self.api.m.raw_io.test_api.stream_output(commit_hash))
     hash_string = step_result.stdout.splitlines()[0]
     try:
       if hash_string:
@@ -309,7 +306,9 @@ class Bisector(object):
       cmd = cmd.split(' ')
     stdout = self.api.m.raw_io.output()
     step_name = 'Generating patch for %s to %s' % (git_object_a, deps_rev)
-    step_result = self.api.m.git(*cmd, cwd=cwd, stdout=stdout, name=step_name)
+    step_result = self.api.m.git(
+        *cmd, cwd=cwd, stdout=stdout, name=step_name,
+        step_test_data=lambda: self.api._test_data['diff_patch'])
     patch_text = step_result.stdout
     src_string = 'IAMSRC:' + git_object_a
     dst_string = 'IAMDST:' + git_object_b
@@ -372,18 +371,21 @@ class Bisector(object):
     with self.api.m.step.nest('Expanding revision range'):
       good_hash = self.good_rev.commit_hash
       bad_hash = self.bad_rev.commit_hash
+      depot = self.good_rev.depot_name
       step_name = 'for revisions %s:%s' % (good_hash, bad_hash)
       revisions = self._revision_range(
           start=good_hash,
           end=bad_hash,
           depot_name=self.base_depot,
           step_name=step_name,
-          exclude_end=True)
+          exclude_end=True,
+          step_test_data=lambda: self.api._test_data['revision_list'][depot])
       self.revisions = [self.good_rev] + revisions + [self.bad_rev]
       self._update_revision_list_indexes()
 
   def _revision_range(self, start, end, depot_name, base_revision=None,
-                      step_name=None, exclude_end=False):
+                      step_name=None, exclude_end=False, **kwargs):
     """Returns a list of RevisionState objects between |start| and |end|.
 
     When expanding the initial revision range we want to exclude the last
@@ -412,7 +414,7 @@ class Bisector(object):
           step_name,
           self.api.resource('fetch_intervening_revisions.py'),
           [start, end, depot_config.DEPOT_DEPS_NAME[depot_name]['url']],
-          stdout=self.api.m.json.output())
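+          # Forward overrides such as step_test_data to the step call.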
+          stdout=self.api.m.json.output(), **kwargs)
     except self.api.m.step.StepFailure:  # pragma: no cover
       self.surface_result('BAD_REV')
       raise
@@ -521,7 +523,9 @@ class Bisector(object):
           end=dep_revision_max,
           depot_name=depot_name,
           base_revision=min_revision,
-          step_name=step_name)
+          step_name=step_name,
+          step_test_data=lambda:
+              self.api._test_data['revision_list'][depot_name])
       new_revisions = self.revisions[:max_revision.list_index]
       new_revisions += rev_list
       new_revisions += self.revisions[max_revision.list_index:]
@@ -558,8 +562,8 @@ class Bisector(object):
       True if the check passes (i.e. no problem), False if the change is not
       a regression according to the improvement direction.
     """
-    good = self.good_rev.mean_value
-    bad = self.bad_rev.mean_value
+    good = self.good_rev.mean
+    bad = self.bad_rev.mean
 
     if self.is_return_code_mode():
       return True
@@ -594,32 +598,32 @@ class Bisector(object):
     other in a statistically significant manner. False if such difference could
     not be established in the time or sample size allowed.
     """
-    if self.test_type == 'return_code':
+    if self.is_return_code_mode():
       return (self.good_rev.overall_return_code !=
               self.bad_rev.overall_return_code)
     if self.bypass_stats_check:
-      dummy_result = self.good_rev.values != self.bad_rev.values
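+      # compare_revisions is run here for its side effects (it populates the
+      # revisions' means); comparing the means then stands in for the full
+      # significance check when stats are bypassed.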
+      self.compare_revisions(self.good_rev, self.bad_rev)
+      dummy_result = self.good_rev.mean != self.bad_rev.mean
       if not dummy_result:
         self._set_insufficient_confidence_warning()
       return dummy_result
+    # TODO(robertocn): This step should not be necessary in some cases.
     with self.api.m.step.nest('Re-testing reference range'):
       expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT
       while time.time() < expiration_time:
-        if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:
-          if self.significantly_different(self.good_rev.values,
-                                          self.bad_rev.values):
+        if (self.good_rev.test_run_count >= 5
+            and self.bad_rev.test_run_count >= 5):
+          if self.compare_revisions(self.good_rev, self.bad_rev):
             return True
-        if len(self.good_rev.values) == len(self.bad_rev.values):
+        if self.good_rev.test_run_count == self.bad_rev.test_run_count:
          revision_to_retest = self.last_tested_revision
        else:
          revision_to_retest = min(self.good_rev, self.bad_rev,
-                                   key=lambda x: len(x.values))
-        if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:
-          revision_to_retest.retest()
-        else:
-          break
+                                   key=lambda x: x.test_run_count)
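+        # Run one more test iteration on the chosen revision.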
+        revision_to_retest._do_test()
+
     self._set_insufficient_confidence_warning()
     return False
 
@@ -641,14 +645,18 @@ class Bisector(object):
     result = 'bisector.lkgr: %r\n' % self.lkgr
     result += 'bisector.fkbr: %r\n\n' % self.fkbr
     result += self._revision_value_table()
-    if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):
-      result += '\n' + self._t_test_results()
+    if (self.lkgr and self.lkgr.test_run_count and self.fkbr and
+        self.fkbr.test_run_count):
+      result += '\n' + '\n'.join([
+          'LKGR values: %r' % list(self.lkgr.debug_values),
+          'FKBR values: %r' % list(self.fkbr.debug_values),
+      ])
     return result
 
   def _revision_value_table(self):
     """Returns a string table showing revisions and their values."""
     header = [['Revision', 'Values']]
-    rows = [[r.revision_string(), str(r.values)] for r in self.revisions]
+    rows = [[r.revision_string(), str(r.debug_values)] for r in self.revisions]
     return self._pretty_table(header + rows)
 
   def _pretty_table(self, data):
@@ -657,20 +665,6 @@ class Bisector(object):
       results.append('%-15s' * len(row) % tuple(row))
     return '\n'.join(results)
 
-  def _t_test_results(self):
-    """Returns a string showing t-test results for lkgr and fkbr."""
-    t, df, p = self.api.m.math_utils.welchs_t_test(
-        self.lkgr.values, self.fkbr.values)
-    lines = [
-        'LKGR values: %r' % self.lkgr.values,
-        'FKBR values: %r' % self.fkbr.values,
-        't-statistic: %r' % t,
-        'deg. of freedom: %r' % df,
-        'p-value: %r' % p,
-        'Confidence score: %r' % (100 * (1 - p))
-    ]
-    return '\n'.join(lines)
-
   def print_result_debug_info(self):
     """Prints extra debug info at the end of the bisect process."""
     lines = self._results_debug_message().splitlines()
@@ -695,7 +689,7 @@ class Bisector(object):
     candidate_range = [revision for revision in
                        self.revisions[self.lkgr.list_index + 1:
                                       self.fkbr.list_index]
-                       if not revision.tested and not revision.failed]
+                       if not revision.failed]
     if len(candidate_range) == 1:
       return candidate_range[0]
     if len(candidate_range) == 0:
@@ -738,7 +732,7 @@ class Bisector(object):
       does not contain a DEPS change.
     """
     if (revision.bad and revision.previous_revision and
-        revision.previous_revision.good): # pragma: no cover
+        revision.previous_revision.good):
       if revision.deps_change() and self._expand_deps_revisions(revision):
         return False
       self.culprit = revision
@@ -750,32 +744,9 @@ class Bisector(object):
         return False
       self.culprit = revision.next_revision
       return True
-    return False
-
-  def wait_for_all(self, revision_list):
-    """Waits for all revisions in list to finish."""
-    for r in revision_list:
-      self.wait_for(r)
-
-  def wait_for(self, revision, nest_check=True):
-    """Waits for the revision to finish its job."""
-    if nest_check and not self.flags.get(
-        'do_not_nest_wait_for_revision'): # pragma: no cover
-      with self.api.m.step.nest('Waiting for ' + revision.revision_string()):
-        return self.wait_for(revision, nest_check=False)
-    while True:
-      revision.update_status()
-      if revision.in_progress:
-        self.api.m.python.inline(
-            'sleeping',
-            """
-            import sys
-            import time
-            time.sleep(20*60)
-            sys.exit(0)
-            """)
-      else:
-        break
+    # We'll never get here because revision adjacency is checked before this
+    # function is called.
+    assert False  # pragma: no cover
 
   def _update_candidate_range(self):
     """Updates lkgr and fkbr (last known good/first known bad) revisions.
@@ -783,7 +754,7 @@ class Bisector(object):
     lkgr and fkbr are 'pointers' to the appropriate RevisionState objects in
     bisectors.revisions."""
     for r in self.revisions:
-      if r.tested:
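+      # A nonzero test_run_count marks a revision as tested.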
+      if r.test_run_count:
         if r.good:
           self.lkgr = r
         elif r.bad:
@@ -821,7 +792,7 @@ class Bisector(object):
 
     # TODO(prasadv): Refactor this code to remove hard coded values and use
     # target_bit from the bot config. crbug.com/640287
-    if 'android' in bot_name:
+    if 'android' in bot_name:  # pragma: no cover
       if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):
         return 'android_arm64_perf_bisect_builder'
       return 'android_perf_bisect_builder'
@@ -851,7 +822,7 @@ class Bisector(object):
 
     # TODO(prasadv): Refactor this code to remove hard coded values and use
     # target_bit from the bot config. crbug.com/640287
-    if 'android' in bot_name:
+    if 'android' in bot_name:  # pragma: no cover
       if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):
         return 'gs://chrome-perf/Android arm64 Builder/full-build-linux_'
       return 'gs://chrome-perf/Android Builder/full-build-linux_'
@@ -876,7 +847,7 @@ class Bisector(object):
 
   def is_return_code_mode(self):
     """Checks whether this is a bisect on the test's exit code."""
-    return self.bisect_config.get('test_type') == 'return_code'
+    return self.test_type == 'return_code'
 
   def surface_result(self, result_string):
     assert result_string in VALID_RESULT_CODES
@@ -891,10 +862,6 @@ class Bisector(object):
   def get_result(self):
     """Returns the results as a jsonable object."""
     config = self.bisect_config
-    results_confidence = 0
-    if self.culprit:
-      results_confidence = self.api.m.math_utils.confidence_score(
-          self.lkgr.values, self.fkbr.values)
 
     if self.failed:
       status = 'failed'
@@ -916,9 +883,8 @@ class Bisector(object):
         'bisect_bot': self.get_perf_tester_name(),
         'command': config['command'],
         'test_type': config['test_type'],
-        'metric': config['metric'],
+        'metric': config.get('metric'),
         'change': self.relative_change,
-        'score': results_confidence,
         'good_revision': self.good_rev.commit_hash,
         'bad_revision': self.bad_rev.commit_hash,
         'warnings': self.warnings,
@@ -950,14 +916,14 @@ class Bisector(object):
   def _revision_data(self):
     revision_rows = []
     for r in self.revisions:
-      if r.tested or r.aborted:
+      if r.test_run_count:
         revision_rows.append({
             'depot_name': r.depot_name,
            'commit_hash': r.commit_hash,
            'revision_string': r.revision_string(),
-            'mean_value': r.mean_value,
+            'mean_value': r.mean,
            'std_dev': r.std_dev,
-            'values': r.values,
+            'values': r.debug_values,
            'result': 'good' if r.good else 'bad' if r.bad else 'unknown',
        })
     return revision_rows