Chromium Code Reviews

Unified Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 2247373002: Refactor stages 1, 2 and test_api overhaul. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Got full coverage for new and changed code. Created 4 years, 3 months ago
Index: scripts/slave/recipe_modules/auto_bisect/bisector.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py
index cd46eb187fbc2b818e67509b6c6a9350f9519313..1abaaa0b57599a965125f61c95eb1281db5b5676 100644
--- a/scripts/slave/recipe_modules/auto_bisect/bisector.py
+++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py
@@ -46,14 +46,7 @@ DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25
# How long to keep re-testing the initial good-bad range until a
# significant difference is established.
-REGRESSION_CHECK_TIMEOUT = 20 * 60 * 60 # 20 hours. A build times out after 24.
-# If we reach this number of samples on the reference range and have not
-# achieved statistical significance, bail.
-MAX_REQUIRED_SAMPLES = 15
-
-# Significance level to use for determining difference between revisions via
-# hypothesis testing.
-SIGNIFICANCE_LEVEL = 0.01
+REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60  # 2 hours.
_FAILED_INITIAL_CONFIDENCE_ABORT_REASON = (
'The metric values for the initial "good" and "bad" revisions '
@@ -155,7 +148,9 @@ class Bisector(object):
commit_position = self._api.m.commit_position.construct(
branch='refs/heads/master', value=rev)
try:
- return self._api.m.crrev.to_commit_hash(commit_position)
+ return self._api.m.crrev.to_commit_hash(
+ commit_position,
+ step_test_data=lambda: self.api._test_data['hash_cp_map'][rev])
except self.api.m.step.StepFailure: # pragma: no cover
self.surface_result('BAD_REV')
raise
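
A note on the pattern introduced here: each real step now carries a zero-argument step_test_data lambda that supplies a default simulated output keyed off the module's injected _test_data, which is what lets the test_api overhaul reach full coverage without per-test step overrides. A minimal self-contained sketch of the pattern (all names below are hypothetical stand-ins for the recipe engine's machinery, not code from this CL):

SIMULATION = True
_test_data = {'hash_cp_map': {123456: 'a' * 40}}

def run_step(name, real_fn, step_test_data=None):
  # Under simulation the lambda supplies the step's output, so
  # _test_data is only consulted in tests, never in production.
  if SIMULATION and step_test_data is not None:
    return step_test_data()
  return real_fn()

rev = 123456
commit_hash = run_step(
    'crrev get commit hash',
    real_fn=lambda: 'would query crrev.com here',
    step_test_data=lambda: _test_data['hash_cp_map'][rev])
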
@@ -167,37 +162,37 @@ class Bisector(object):
def _is_sha1(s):
return bool(re.match('^[0-9A-Fa-f]{40}$', s))
- def significantly_different(
- self, list_a, list_b,
- significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover
- """Uses an external script to run hypothesis testing with scipy.
-
- The reason why we need an external script is that scipy is not available to
- the default python installed in all platforms. We instead rely on an
- anaconda environment to provide those packages.
-
- Args:
- list_a, list_b: Two lists representing samples to be compared.
- significance_level: Self-describing. As a decimal fraction.
-
+ def compare_revisions(self, revision_a, revision_b):
+    """Compares the sample values of two revisions.
Returns:
prasadv 2016/09/09 18:52:30 I think it is better to have a consistent datatype
RobertoCN 2016/09/13 22:11:40 Done.
- A boolean indicating whether the null hypothesis ~(that the lists are
- samples from the same population) can be rejected at the specified
- significance level.
+ True if the samples are significantly different.
+ None if there is not enough data to tell.
+ False if there's enough data but still can't tell the samples apart.
"""
- step_result = self.api.m.python(
- 'Checking sample difference',
- self.api.resource('significantly_different.py'),
- [json.dumps(list_a), json.dumps(list_b), str(significance_level)],
- stdout=self.api.m.json.output())
- results = step_result.stdout
- if results is None:
- assert self.dummy_builds
- return True
- significantly_different = results['significantly_different']
- step_result.presentation.logs[str(significantly_different)] = [
- 'See json.output for details']
- return significantly_different
+ output_format = 'chartjson'
+ values_a = revision_a.chartjson_paths
+ values_b = revision_b.chartjson_paths
+ if revision_a.valueset_paths and revision_b.valueset_paths:
+ output_format = 'valueset'
+
+ result = self.api.stat_compare(
+ values_a,
+ values_b,
+ self.bisect_config['metric'],
+ output_format=output_format,
+ step_test_data=lambda: self.api.test_api.compare_samples_data(
+ self.api._test_data.get('revision_data'), revision_a, revision_b))
+
+ revision_a.debug_values = result['sample_a']['debug_values']
+ revision_b.debug_values = result['sample_b']['debug_values']
+ revision_a.mean = result['sample_a']['mean']
+ revision_b.mean = result['sample_b']['mean']
+ revision_a.std_dev = result['sample_a']['std_dev']
+ revision_b.std_dev = result['sample_b']['std_dev']
+
+ if result['result'] == 'needMoreData':
+ return None
+ return bool(result['result'])
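
Because compare_revisions now folds "need more data" into its return value, callers have to use an identity check; a bare truthiness test would conflate None (not enough data) with False (enough data, no difference). A hypothetical caller illustrating the tri-state contract discussed in the comments above:

def handle_comparison(bisector, rev_a, rev_b):
  difference = bisector.compare_revisions(rev_a, rev_b)
  if difference is None:
    return 'need more data'  # run more tests; see the re-test loop below
  if difference:
    return 'significantly different'
  return 'enough data, but statistically indistinguishable'
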
def config_step(self):
"""Yields a step that prints the bisect config."""
@@ -235,8 +230,8 @@ class Bisector(object):
return self._api
def compute_relative_change(self):
- old_value = float(self.good_rev.mean_value)
- new_value = float(self.bad_rev.mean_value)
+ old_value = float(self.good_rev.mean or 0)
+ new_value = float(self.bad_rev.mean or 0)
if new_value and not old_value: # pragma: no cover
self.relative_change = ZERO_TO_NON_ZERO
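
The `or 0` guards are needed because the new `mean` attribute, unlike the old `mean_value`, can be None before any samples have been aggregated. The remainder of compute_relative_change falls outside this hunk; a hedged sketch of the computation the visible lines imply (the final formula is a conventional fractional change, assumed rather than quoted from the CL):

def relative_change(old_mean, new_mean):
  old_value = float(old_mean or 0)
  new_value = float(new_mean or 0)
  if new_value and not old_value:
    return 'zero-to-non-zero'  # stands in for the ZERO_TO_NON_ZERO constant
  if not old_value:
    return 0.0  # both means are zero: no change
  return abs(new_value - old_value) / old_value  # assumed formula
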
@@ -273,8 +268,10 @@ class Bisector(object):
stdin = self.api.m.raw_io.input(file_contents)
stdout = self.api.m.raw_io.output()
step_name = 'Hashing modified DEPS file with revision ' + commit_hash
- step_result = self.api.m.git(*cmd, cwd=cwd, stdin=stdin, stdout=stdout,
- name=step_name)
+ step_result = self.api.m.git(
+ *cmd, cwd=cwd, stdin=stdin, stdout=stdout, name=step_name,
+ step_test_data=lambda:
+ self.api.m.raw_io.test_api.stream_output(commit_hash))
hash_string = step_result.stdout.splitlines()[0]
try:
if hash_string:
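
The git invocation itself is assembled above this hunk; a step that pipes the modified DEPS text in on stdin and emits a hash on stdout is consistent with `git hash-object --stdin`, so a standalone equivalent might look like the following (the exact command and flags are an assumption, not quoted from the CL):

import subprocess

def hash_modified_deps(file_contents, cwd):
  # Assumed equivalent of the step above: pipe the modified DEPS text to
  # `git hash-object --stdin` and read the object hash from stdout.
  out = subprocess.check_output(
      ['git', 'hash-object', '--stdin'],
      input=file_contents.encode(), cwd=cwd)
  return out.decode().splitlines()[0]
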
@@ -309,7 +306,9 @@ class Bisector(object):
cmd = cmd.split(' ')
stdout = self.api.m.raw_io.output()
step_name = 'Generating patch for %s to %s' % (git_object_a, deps_rev)
- step_result = self.api.m.git(*cmd, cwd=cwd, stdout=stdout, name=step_name)
+ step_result = self.api.m.git(
+ *cmd, cwd=cwd, stdout=stdout, name=step_name,
+ step_test_data=lambda: self.api._test_data['diff_patch'])
patch_text = step_result.stdout
src_string = 'IAMSRC:' + git_object_a
dst_string = 'IAMDST:' + git_object_b
@@ -372,18 +371,21 @@ class Bisector(object):
with self.api.m.step.nest('Expanding revision range'):
good_hash = self.good_rev.commit_hash
bad_hash = self.bad_rev.commit_hash
+ depot = self.good_rev.depot_name
step_name = 'for revisions %s:%s' % (good_hash, bad_hash)
revisions = self._revision_range(
start=good_hash,
end=bad_hash,
depot_name=self.base_depot,
step_name=step_name,
- exclude_end=True)
+ exclude_end=True,
+ step_test_data=lambda: self.api._test_data['revision_list'][depot]
+ )
self.revisions = [self.good_rev] + revisions + [self.bad_rev]
self._update_revision_list_indexes()
def _revision_range(self, start, end, depot_name, base_revision=None,
- step_name=None, exclude_end=False):
+ step_name=None, exclude_end=False, **kwargs):
"""Returns a list of RevisionState objects between |start| and |end|.
When expanding the initial revision range we want to exclude the last
@@ -412,7 +414,7 @@ class Bisector(object):
step_name,
self.api.resource('fetch_intervening_revisions.py'),
[start, end, depot_config.DEPOT_DEPS_NAME[depot_name]['url']],
- stdout=self.api.m.json.output())
+ stdout=self.api.m.json.output(), **kwargs)
except self.api.m.step.StepFailure: # pragma: no cover
self.surface_result('BAD_REV')
raise
@@ -521,7 +523,9 @@ class Bisector(object):
end=dep_revision_max,
depot_name=depot_name,
base_revision=min_revision,
- step_name=step_name)
+ step_name=step_name,
+ step_test_data=lambda:
+ self.api._test_data['revision_list'][depot_name])
new_revisions = self.revisions[:max_revision.list_index]
new_revisions += rev_list
new_revisions += self.revisions[max_revision.list_index:]
@@ -558,8 +562,8 @@ class Bisector(object):
True if the check passes (i.e. no problem), False if the change is not
a regression according to the improvement direction.
"""
- good = self.good_rev.mean_value
- bad = self.bad_rev.mean_value
+ good = self.good_rev.mean
+ bad = self.bad_rev.mean
if self.is_return_code_mode():
return True
@@ -594,32 +598,32 @@ class Bisector(object):
other in a statistically significant manner. False if such difference could
not be established in the time or sample size allowed.
"""
- if self.test_type == 'return_code':
+ if self.is_return_code_mode():
return (self.good_rev.overall_return_code !=
self.bad_rev.overall_return_code)
if self.bypass_stats_check:
- dummy_result = self.good_rev.values != self.bad_rev.values
+ self.compare_revisions(self.good_rev, self.bad_rev)
+ dummy_result = self.good_rev.mean != self.bad_rev.mean
if not dummy_result:
self._set_insufficient_confidence_warning()
return dummy_result
+ # TODO(robertocn): This step should not be necessary in some cases.
with self.api.m.step.nest('Re-testing reference range'):
expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT
while time.time() < expiration_time:
- if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:
- if self.significantly_different(self.good_rev.values,
- self.bad_rev.values):
+ if (self.good_rev.test_run_count >= 5
+ and self.bad_rev.test_run_count >= 5):
+ if self.compare_revisions(self.good_rev, self.bad_rev):
return True
- if len(self.good_rev.values) == len(self.bad_rev.values):
+ if self.good_rev.test_run_count == self.bad_rev.test_run_count:
revision_to_retest = self.last_tested_revision
else:
revision_to_retest = min(self.good_rev, self.bad_rev,
- key=lambda x: len(x.values))
- if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:
- revision_to_retest.retest()
- else:
- break
+ key=lambda x: x.test_run_count)
+ revision_to_retest._do_test()
+
self._set_insufficient_confidence_warning()
return False
@@ -641,14 +645,18 @@ class Bisector(object):
result = 'bisector.lkgr: %r\n' % self.lkgr
result += 'bisector.fkbr: %r\n\n' % self.fkbr
result += self._revision_value_table()
- if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):
- result += '\n' + self._t_test_results()
+ if (self.lkgr and self.lkgr.test_run_count and self.fkbr and
+ self.fkbr.test_run_count):
+ result += '\n' + '\n'.join([
+ 'LKGR values: %r' % list(self.lkgr.debug_values),
+ 'FKBR values: %r' % list(self.fkbr.debug_values),
+ ])
return result
def _revision_value_table(self):
"""Returns a string table showing revisions and their values."""
header = [['Revision', 'Values']]
- rows = [[r.revision_string(), str(r.values)] for r in self.revisions]
+ rows = [[r.revision_string(), str(r.debug_values)] for r in self.revisions]
return self._pretty_table(header + rows)
def _pretty_table(self, data):
@@ -657,20 +665,6 @@ class Bisector(object):
results.append('%-15s' * len(row) % tuple(row))
return '\n'.join(results)
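
For readers skimming the table code: the context line above uses a compact format-string idiom, where multiplying '%-15s' by the column count builds a row template that left-justifies every cell in a 15-character field. For example:

row = ('Revision', 'Values')
print('%-15s' * len(row) % tuple(row))
# prints: Revision       Values          (each cell padded to 15 chars)
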
- def _t_test_results(self):
- """Returns a string showing t-test results for lkgr and fkbr."""
- t, df, p = self.api.m.math_utils.welchs_t_test(
- self.lkgr.values, self.fkbr.values)
- lines = [
- 'LKGR values: %r' % self.lkgr.values,
- 'FKBR values: %r' % self.fkbr.values,
- 't-statistic: %r' % t,
- 'deg. of freedom: %r' % df,
- 'p-value: %r' % p,
- 'Confidence score: %r' % (100 * (1 - p))
- ]
- return '\n'.join(lines)
-
def print_result_debug_info(self):
"""Prints extra debug info at the end of the bisect process."""
lines = self._results_debug_message().splitlines()
@@ -695,7 +689,7 @@ class Bisector(object):
candidate_range = [revision for revision in
self.revisions[self.lkgr.list_index + 1:
self.fkbr.list_index]
- if not revision.tested and not revision.failed]
+ if not revision.failed]
if len(candidate_range) == 1:
return candidate_range[0]
if len(candidate_range) == 0:
@@ -738,7 +732,7 @@ class Bisector(object):
does not contain a DEPS change.
"""
if (revision.bad and revision.previous_revision and
- revision.previous_revision.good): # pragma: no cover
+ revision.previous_revision.good):
if revision.deps_change() and self._expand_deps_revisions(revision):
return False
self.culprit = revision
@@ -750,32 +744,9 @@ class Bisector(object):
return False
self.culprit = revision.next_revision
return True
- return False
-
- def wait_for_all(self, revision_list):
- """Waits for all revisions in list to finish."""
- for r in revision_list:
- self.wait_for(r)
-
- def wait_for(self, revision, nest_check=True):
- """Waits for the revision to finish its job."""
- if nest_check and not self.flags.get(
- 'do_not_nest_wait_for_revision'): # pragma: no cover
- with self.api.m.step.nest('Waiting for ' + revision.revision_string()):
- return self.wait_for(revision, nest_check=False)
- while True:
- revision.update_status()
- if revision.in_progress:
- self.api.m.python.inline(
- 'sleeping',
- """
- import sys
- import time
- time.sleep(20*60)
- sys.exit(0)
- """)
- else:
- break
+ # We'll never get here because revision adjacency is checked before this
+ # function is called.
+ assert False # pragma: no cover
def _update_candidate_range(self):
"""Updates lkgr and fkbr (last known good/first known bad) revisions.
@@ -783,7 +754,7 @@ class Bisector(object):
lkgr and fkbr are 'pointers' to the appropriate RevisionState objects in
bisectors.revisions."""
for r in self.revisions:
- if r.tested:
+ if r.test_run_count:
if r.good:
self.lkgr = r
elif r.bad:
@@ -821,7 +792,7 @@ class Bisector(object):
# TODO(prasadv): Refactor this code to remove hard coded values and use
# target_bit from the bot config. crbug.com/640287
- if 'android' in bot_name:
+ if 'android' in bot_name: # pragma: no cover
if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):
return 'android_arm64_perf_bisect_builder'
return 'android_perf_bisect_builder'
@@ -851,7 +822,7 @@ class Bisector(object):
# TODO(prasadv): Refactor this code to remove hard coded values and use
# target_bit from the bot config. crbug.com/640287
- if 'android' in bot_name:
+    if 'android' in bot_name:  # pragma: no cover
if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):
return 'gs://chrome-perf/Android arm64 Builder/full-build-linux_'
return 'gs://chrome-perf/Android Builder/full-build-linux_'
@@ -876,7 +847,7 @@ class Bisector(object):
def is_return_code_mode(self):
"""Checks whether this is a bisect on the test's exit code."""
- return self.bisect_config.get('test_type') == 'return_code'
+ return self.test_type == 'return_code'
def surface_result(self, result_string):
assert result_string in VALID_RESULT_CODES
@@ -891,10 +862,6 @@ class Bisector(object):
def get_result(self):
"""Returns the results as a jsonable object."""
config = self.bisect_config
- results_confidence = 0
- if self.culprit:
- results_confidence = self.api.m.math_utils.confidence_score(
- self.lkgr.values, self.fkbr.values)
if self.failed:
status = 'failed'
@@ -916,9 +883,8 @@ class Bisector(object):
'bisect_bot': self.get_perf_tester_name(),
'command': config['command'],
'test_type': config['test_type'],
- 'metric': config['metric'],
+ 'metric': config.get('metric'),
'change': self.relative_change,
- 'score': results_confidence,
'good_revision': self.good_rev.commit_hash,
'bad_revision': self.bad_rev.commit_hash,
'warnings': self.warnings,
@@ -950,14 +916,14 @@ class Bisector(object):
def _revision_data(self):
revision_rows = []
for r in self.revisions:
- if r.tested or r.aborted:
+ if r.test_run_count:
revision_rows.append({
'depot_name': r.depot_name,
'commit_hash': r.commit_hash,
'revision_string': r.revision_string(),
- 'mean_value': r.mean_value,
+ 'mean_value': r.mean,
'std_dev': r.std_dev,
- 'values': r.values,
+ 'values': r.debug_values,
'result': 'good' if r.good else 'bad' if r.bad else 'unknown',
})
return revision_rows
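
For reference, one entry of the list built above would be shaped like this (all values invented for illustration):

{
  'depot_name': 'chromium',
  'commit_hash': 'c001c0de' * 5,  # a 40-hex-char hash
  'revision_string': 'chromium@445566',
  'mean_value': 42.1,             # from r.mean
  'std_dev': 0.37,
  'values': [41.8, 42.0, 42.5],   # from r.debug_values
  'result': 'good',
}
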
