Index: scripts/slave/recipe_modules/auto_bisect/revision_state.py |
diff --git a/scripts/slave/recipe_modules/auto_bisect/revision_state.py b/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
index e396742a0e429f93f7957e338668d8df0e281125..b7bfdfb96dce524457d60771d7cab2bd715e510a 100644 |
--- a/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
+++ b/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
@@ -19,6 +19,7 @@ import re |
import uuid |
from . import depot_config |
+from exceptions import * |
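+# Assumed to provide UntestableRevisionException and InconclusiveBisectException
+# used below (i.e. a module local to this package rather than the Python 2
+# built-in `exceptions` module, which defines neither).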
# These relate to how to increase the number of repetitions during re-test |
MINIMUM_SAMPLE_SIZE = 5 |
@@ -30,22 +31,6 @@ SUCCESS, WARNINGS, FAILURE, SKIPPED, EXCEPTION = range(5) |
class RevisionState(object): |
"""Abstracts the state of a single revision on a bisect job.""" |
- # Possible values for the status attribute of RevisionState: |
- ( |
- NEW, # A revision_state object that has just been initialized. |
- BUILDING, # Requested a build for this revision, waiting for it. |
- TESTING, # A test job for this revision was triggered, waiting for it. |
- TESTED, # The test job completed with non-failing results. |
- FAILED, # Either the build or the test jobs failed or timed out. |
- ABORTED, # The build or test job was aborted. (For use in multi-secting). |
- SKIPPED, # A revision that was not built or tested for a special reason, |
- # such as those ranges that we know are broken, or when nudging |
- # revisions. |
- NEED_MORE_DATA, # Current number of test values is too small to establish |
- # a statistically significant difference between this |
- # revision and the revisions known to be good and bad. |
- ) = xrange(8) |
- |
def __init__(self, bisector, commit_hash, depot_name=None, |
base_revision=None): |
"""Creates a new instance to track the state of a revision. |
@@ -61,10 +46,10 @@ class RevisionState(object): |
super(RevisionState, self).__init__() |
self.bisector = bisector |
self._good = None |
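+    # Set to True when the build or a test run for this revision fails.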
+ self.failed = False |
self.deps = None |
self.test_results_url = None |
self.build_archived = False |
- self.status = RevisionState.NEW |
self.next_revision = None |
self.previous_revision = None |
self.job_name = None |
@@ -90,9 +75,11 @@ class RevisionState(object): |
else: |
self.needs_patch = False |
self.build_url = self.bisector.get_platform_gs_prefix() + self._gs_suffix() |
- self.values = [] |
- self.mean_value = None |
- self.overall_return_code = None |
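+    # Raw outputs collected from each test run: result file paths, debug
+    # values and return codes.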
+ self.valueset_paths = [] |
+ self.chartjson_paths = [] |
+ self.debug_values = [] |
+ self.return_codes = [] |
+ self.mean = None |
self.std_dev = None |
self._test_config = None |
self.build_number = None |
@@ -104,23 +91,6 @@ class RevisionState(object): |
'repeat_count', MINIMUM_SAMPLE_SIZE) |
@property |
- def tested(self): |
- return self.status in (RevisionState.TESTED,) |
- |
- @property |
- def in_progress(self): |
- return self.status in (RevisionState.BUILDING, RevisionState.TESTING, |
- RevisionState.NEED_MORE_DATA) |
- |
- @property |
- def failed(self): |
- return self.status == RevisionState.FAILED |
- |
- @property |
- def aborted(self): |
- return self.status == RevisionState.ABORTED |
- |
- @property |
def good(self): |
return self._good == True |
@@ -136,18 +106,40 @@ class RevisionState(object): |
def bad(self, value): |
self._good = not value |
+ @property |
+ def test_run_count(self): |
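+    """Number of test runs recorded for this revision so far."""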
+ return max( |
+ len(self.valueset_paths), |
+ len(self.chartjson_paths), |
+ len(self.return_codes)) |
+ |
def start_job(self): |
- """Starts a build, or a test job if the build is available.""" |
- if self.status == RevisionState.NEW and not self._is_build_archived(): |
- self._request_build() |
- self.status = RevisionState.BUILDING |
- return |
+ try: |
+ if not self._is_build_archived(): |
+ self._request_build() |
+ with self.bisector.api.m.step.nest('Waiting for build'): |
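+          # Poll until the build archive shows up in cloud storage, sleeping
+          # 20 minutes between checks.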
+ while not self._is_build_archived(): |
+            self.bisector.api.m.python.inline(
+                'sleeping',
+                """
+                import sys
+                import time
+                time.sleep(20*60)
+                sys.exit(0)
+                """)
+ if self._is_build_failed(): |
+ self.failed = True |
+ return |
- if self._is_build_archived() and self.status in ( |
- RevisionState.NEW, RevisionState.BUILDING, |
- RevisionState.NEED_MORE_DATA): |
self._do_test() |
- self.status = RevisionState.TESTING |
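+      # Keep adding samples to whichever of this revision, lkgr or fkbr has
+      # the fewest test runs until this revision can be classified.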
+ while not self._check_revision_good(): |
+ min(self, self.bisector.lkgr, self.bisector.fkbr, |
+            key=lambda x: x.test_run_count)._do_test()
+ |
+ except UntestableRevisionException: |
+ if self.reference_range: |
+ # TODO(robertocn): Consider nudging the revision here. |
+ raise InconclusiveBisectException() |
def deps_change(self): |
"""Uses `git show` to see if a given commit contains a DEPS change.""" |
@@ -162,9 +154,9 @@ class RevisionState(object): |
name = 'Checking DEPS for ' + self.commit_hash |
step_result = api.m.git( |
'show', '--name-only', '--pretty=format:', |
- self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name) |
- if self.bisector.dummy_builds and not self.commit_hash.startswith('dcdc'): |
- return False |
+ self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name, |
+ step_test_data=lambda: api._test_data['deps_change'][self.commit_hash] |
+ ) |
if 'DEPS' in step_result.stdout.splitlines(): # pragma: no cover |
return True |
return False # pragma: no cover |
@@ -211,11 +203,13 @@ class RevisionState(object): |
'fetch file %s:%s' % (self.commit_hash, depot_config.DEPS_FILENAME), |
api.resource('fetch_file.py'), |
[depot_config.DEPS_FILENAME, '--commit', self.commit_hash], |
- stdout=api.m.raw_io.output()) |
+ stdout=api.m.raw_io.output(), |
+ step_test_data=lambda: api._test_data['deps'][self.commit_hash] |
+ ) |
self.deps_file_contents = step_result.stdout |
try: |
deps_data = self._gen_deps_local_scope() |
- exec(self.deps_file_contents or 'deps = {}', {}, deps_data) |
+ exec (self.deps_file_contents or 'deps = {}') in {}, deps_data |
deps_data = deps_data['deps'] |
except ImportError: # pragma: no cover |
# TODO(robertocn): Implement manual parsing of DEPS when exec fails. |
@@ -246,40 +240,17 @@ class RevisionState(object): |
self.deps = results |
return |
- def update_status(self): |
- """Checks on the pending jobs and updates status accordingly. |
- |
- This method will check for the build to complete and then trigger the test, |
- or will wait for the test as appropriate. |
- |
- To wait for the test we try to get the buildbot job url from GS, and if |
- available, we query the status of such job. |
- """ |
- if self.status == RevisionState.BUILDING: |
- if self._is_build_archived(): |
- self.start_job() |
- elif self._is_build_failed(): |
- self.status = RevisionState.FAILED |
- elif (self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA) |
- and self._results_available()): |
- # If we have already decided whether the revision is good or bad we |
- # shouldn't check again |
- check_revision_goodness = not(self.good or self.bad) |
- self._read_test_results( |
- check_revision_goodness=check_revision_goodness) |
- # We assume _read_test_results may have changed the status to a broken |
- # state such as FAILED or ABORTED. |
- if self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA): |
- self.status = RevisionState.TESTED |
- |
def _is_build_archived(self): |
"""Checks if the revision is already built and archived.""" |
if not self.build_archived: |
api = self.bisector.api |
self.build_archived = api.gsutil_file_exists(self.build_url) |
- if self.bisector.dummy_builds: |
- self.build_archived = self.in_progress |
+    # With dummy builds, this is meant to return False the first time it is
+    # called and True on later calls, so that test expectations cover both
+    # cases.
+ if self.bisector.dummy_builds and not self.build_archived: |
+ self.build_archived = True |
+ return False |
return self.build_archived |
@@ -289,6 +260,7 @@ class RevisionState(object): |
fetch_result = api.m.url.fetch( build_url, step_name='fetch build details') |
return json.loads(fetch_result or '{}') |
+ # TODO(robertocn): Make this use buildbucket instead. |
def _is_build_failed(self): |
api = self.bisector.api |
current_build = None |
@@ -321,14 +293,6 @@ class RevisionState(object): |
current_build = self._fetch_build_info(base_url, self.build_number) |
return current_build.get('results') in [FAILURE, SKIPPED, EXCEPTION] |
- def _results_available(self): |
- """Checks if the results for the test job have been uploaded.""" |
- api = self.bisector.api |
- result = api.gsutil_file_exists(self.test_results_url) |
- if self.bisector.dummy_builds: |
- return self.in_progress |
- return result # pragma: no cover |
- |
def _gs_suffix(self): |
"""Provides the expected right half of the build filename. |
@@ -342,58 +306,20 @@ class RevisionState(object): |
name_parts.append(self.deps_sha) |
return '%s.zip' % '_'.join(name_parts) |
- def _read_test_results(self, check_revision_goodness=True): |
- """Gets the test results from GS and checks if the rev is good or bad.""" |
- test_results = self._get_test_results() |
- # Results will contain the keys 'results' and 'output' where output is the |
- # stdout of the command, and 'results' is itself a dict with the key |
- # 'values' unless the test failed, in which case 'results' will contain |
- # the 'error' key explaining the type of error. |
- results = test_results['results'] |
+ def _read_test_results(self, results): |
+    # `results` is a dict with paths to the chartjson files, paths to the
+    # valueset files and a list of return codes, plus an optional 'errors'
+    # entry.
if results.get('errors'): |
- self.status = RevisionState.FAILED |
+ self.failed = True |
if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover |
self.bisector.surface_result('MISSING_METRIC') |
- return |
- self.values += results['values'] |
- api = self.bisector.api |
- if test_results.get('retcodes') and test_results['retcodes'][-1] != 0 and ( |
- api.m.chromium.c.TARGET_PLATFORM == 'android'): #pragma: no cover |
- api.m.chromium_android.device_status() |
- current_connected_devices = api.m.chromium_android.devices |
- current_device = api.m.bisect_tester.device_to_test |
- if current_device not in current_connected_devices: |
- # We need to manually raise step failure here because we are catching |
- # them further down the line to enable return_code bisects and bisecting |
- # on benchmarks that are a little flaky. |
- raise api.m.step.StepFailure('Test device disconnected.') |
- if self.bisector.is_return_code_mode(): |
- retcodes = test_results['retcodes'] |
- self.overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 |
- # Keeping mean_value for compatibility with dashboard. |
- # TODO(robertocn): refactor mean_value, specially when uploading results |
- # to dashboard. |
- self.mean_value = self.overall_return_code |
- elif self.values: |
- api = self.bisector.api |
- self.mean_value = api.m.math_utils.mean(self.values) |
- self.std_dev = api.m.math_utils.standard_deviation(self.values) |
- # Values were not found, but the test did not otherwise fail. |
+ raise UntestableRevisionException(results['errors']) |
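+    # In return_code mode only the exit codes are kept; otherwise keep the
+    # paths to the result files produced by the run.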
+ elif self.bisector.is_return_code_mode(): |
+ assert len(results['retcodes']) |
+ self.return_codes.extend(results['retcodes']) |
else: |
- self.status = RevisionState.FAILED |
- self.bisector.surface_result('MISSING_METRIC') |
- return |
- # If we have already decided on the goodness of this revision, we shouldn't |
- # recheck it. |
- if self.good or self.bad: |
- check_revision_goodness = False |
- # We cannot test the goodness of the initial rev range. |
- if (self.bisector.good_rev != self and self.bisector.bad_rev != self and |
- check_revision_goodness): |
- if self._check_revision_good(): |
- self.good = True |
- else: |
- self.bad = True |
+      self.valueset_paths.extend(results.get('valueset_paths', []))
+      self.chartjson_paths.extend(results.get('chartjson_paths', []))
def _request_build(self): |
"""Posts a request to buildbot to build this revision and archive it.""" |
@@ -457,10 +383,10 @@ class RevisionState(object): |
the test will be run on the same machine. Otherwise, this posts |
a request to buildbot to download and perf-test this build. |
""" |
- if self.bisector.bisect_config.get('dummy_job_names'): |
- self.job_name = self.commit_hash + '-test' |
- else: # pragma: no cover |
- self.job_name = uuid.uuid4().hex |
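+    # When this revision already has samples, grow the sample size by ~50%;
+    # at least MINIMUM_SAMPLE_SIZE runs are needed to apply a two-tailed
+    # Mann-Whitney U test with p < 0.01.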
+    if self.test_run_count:
+      self.repeat_count = int(max(MINIMUM_SAMPLE_SIZE, math.ceil(
+          self.test_run_count * 1.5))) - self.test_run_count
+ |
api = self.bisector.api |
# Stores revision map for different repos eg, android-chrome, src, v8 etc. |
revision_ladder = {} |
@@ -470,102 +396,80 @@ class RevisionState(object): |
revision_ladder[top_revision.depot_name] = top_revision.commit_hash |
top_revision = top_revision.base_revision |
perf_test_properties = { |
- 'builder_name': self.bisector.get_perf_tester_name(), |
'properties': { |
'revision': top_revision.commit_hash, |
'parent_got_revision': top_revision.commit_hash, |
'parent_build_archive_url': self.build_url, |
'bisect_config': self._get_bisect_config_for_tester(), |
- 'job_name': self.job_name, |
'revision_ladder': revision_ladder, |
}, |
} |
- self.test_results_url = (self.bisector.api.GS_RESULTS_URL + |
- self.job_name + '.results') |
- if (api.m.bisect_tester.local_test_enabled() or |
- self.bisector.internal_bisect): # pragma: no cover |
- skip_download = self.bisector.last_tested_revision == self |
- self.bisector.last_tested_revision = self |
- overrides = perf_test_properties['properties'] |
- api.run_local_test_run(overrides, skip_download=skip_download) |
- else: |
- step_name = 'Triggering test job for ' + self.commit_hash |
- api.m.trigger(perf_test_properties, name=step_name) |
- |
- def retest(self): # pragma: no cover |
- # We need at least 5 samples for applying Mann-Whitney U test |
- # with P < 0.01, two-tailed . |
- target_sample_size = max(5, math.ceil(len(self.values) * 1.5)) |
- self.status = RevisionState.NEED_MORE_DATA |
- self.repeat_count = target_sample_size - len(self.values) |
- self.start_job() |
- self.bisector.wait_for(self) |
- |
- def _get_test_results(self): |
- """Tries to get the results of a test job from cloud storage.""" |
- api = self.bisector.api |
- try: |
- stdout = api.m.raw_io.output() |
- name = 'Get test results for build ' + self.commit_hash |
- step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, |
- name=name) |
- if not step_result.stdout: |
- raise api.m.step.StepFailure('Test for build %s failed' % |
- self.revision_string()) |
- except api.m.step.StepFailure as sf: # pragma: no cover |
- self.bisector.surface_result('TEST_FAILURE') |
- return {'results': {'errors': str(sf)}} |
- else: |
- return json.loads(step_result.stdout) |
+    # Skip re-downloading the build when this same revision was the last one
+    # tested on this machine.
+    skip_download = self.bisector.last_tested_revision == self
+    self.bisector.last_tested_revision = self
+ overrides = perf_test_properties['properties'] |
- def _check_revision_good(self): |
- """Determines if a revision is good or bad. |
+ def run_test_step_test_data(): |
+ """Returns a single step data object when called. |
+ |
+ These are expected to be populated by the test_api. |
+ """ |
+      if api._test_data['run_results'].get(self.commit_hash):
+ return api._test_data['run_results'][self.commit_hash].pop(0) |
+ return api._test_data['run_results']['default'] |
- Iteratively increment the sample size of the revision being tested, the last |
- known good revision, and the first known bad revision until a relationship |
- of significant difference can be established betweeb the results of the |
- revision being tested and one of the other two. |
- If the results do not converge towards finding a significant difference in |
- either direction, this is expected to timeout eventually. This scenario |
- should be rather rare, since it is expected that the fkbr and lkgr are |
- significantly different as a precondition. |
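+    # run_local_test_run is expected to return the results dict consumed by
+    # _read_test_results above ('valueset_paths', 'chartjson_paths',
+    # 'retcodes' and optionally 'errors').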
+ self._read_test_results(api.run_local_test_run( |
+ overrides, skip_download=skip_download, |
+ step_test_data=run_test_step_test_data |
+ )) |
+ |
+ def _check_revision_good(self): |
+ """Determines if a revision is good or bad. |
Returns: |
- True if the results of testing this revision are significantly different |
- from those of testing the earliest known bad revision. |
- False if they are instead significantly different form those of testing |
- the latest knwon good revision. |
+      True if the revision could be classified as either good or bad; False
+      if this cannot yet be determined from the available data.
""" |
+    # Do not reclassify revisions that already have a verdict; this matters
+    # for the reference range (the original good and bad revisions).
+ if self.good or self.bad: |
+ return True |
+ |
lkgr = self.bisector.lkgr |
fkbr = self.bisector.fkbr |
- |
if self.bisector.is_return_code_mode(): |
- return self.overall_return_code == lkgr.overall_return_code |
- |
- while True: |
- diff_from_good = self.bisector.significantly_different( |
- lkgr.values[:len(fkbr.values)], self.values) |
- diff_from_bad = self.bisector.significantly_different( |
- fkbr.values[:len(lkgr.values)], self.values) |
- |
- if diff_from_good and diff_from_bad: |
- # Multiple regressions. |
- # For now, proceed bisecting the biggest difference of the means. |
- dist_from_good = abs(self.mean_value - lkgr.mean_value) |
- dist_from_bad = abs(self.mean_value - fkbr.mean_value) |
- if dist_from_good > dist_from_bad: |
- # TODO(robertocn): Add way to handle the secondary regression |
- #self.bisector.handle_secondary_regression(self, fkbr) |
- return False |
- else: |
- #self.bisector.handle_secondary_regression(lkgr, self) |
- return True |
- |
- if diff_from_good or diff_from_bad: # pragma: no cover |
- return diff_from_bad |
+ if self.overall_return_code == lkgr.overall_return_code: |
+ self.good = True |
+ else: |
+ self.bad = True |
+ return True |
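+    # compare_revisions is assumed to return True/False once the comparison is
+    # conclusive, and None while more samples are still needed.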
+ diff_from_good = self.bisector.compare_revisions(self, lkgr) |
+ diff_from_bad = self.bisector.compare_revisions(self, fkbr) |
+    if diff_from_good is False and diff_from_bad is False:
+      # We have reached the maximum number of samples without establishing a
+      # significant difference in either direction; give up.
+ raise InconclusiveBisectException() |
+ if diff_from_good and diff_from_bad: |
+ # Multiple regressions. |
+ # For now, proceed bisecting the biggest difference of the means. |
+ dist_from_good = abs(self.mean - lkgr.mean) |
+ dist_from_bad = abs(self.mean - fkbr.mean) |
+ if dist_from_good > dist_from_bad: |
+ # TODO(robertocn): Add way to handle the secondary regression |
+ #self.bisector.handle_secondary_regression(self, fkbr) |
+ self.bad = True |
+ return True |
+ else: |
+ #self.bisector.handle_secondary_regression(lkgr, self) |
+ self.good = True |
+ return True |
+ if diff_from_good: |
+ self.bad = True |
+ return True |
+ elif diff_from_bad: # pragma: no cover |
+ self.good = True |
+ return True |
+ return False |
- self._next_retest() # pragma: no cover |
def revision_string(self): |
if self._rev_str: |
@@ -584,25 +488,24 @@ class RevisionState(object): |
self._rev_str = result |
return self._rev_str |
- def _next_retest(self): # pragma: no cover |
- """Chooses one of current, lkgr, fkbr to retest. |
- |
- Look for the smallest sample and retest that. If the last tested revision |
- is tied for the smallest sample, use that to take advantage of the fact |
- that it is already downloaded and unzipped. |
- """ |
- next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr, |
- key=lambda x: len(x.values)) |
- if (len(self.bisector.last_tested_revision.values) == |
- next_revision_to_test.values): |
- self.bisector.last_tested_revision.retest() |
- else: |
- next_revision_to_test.retest() |
- |
def __repr__(self): |
- if self.overall_return_code is not None: |
- return ('RevisionState(rev=%s, values=%r, overall_return_code=%r, ' |
- 'std_dev=%r)') % (self.revision_string(), self.values, |
+ if not self.test_run_count: |
+      return 'RevisionState(rev=%s, values=[])' % self.revision_string()
+ if self.bisector.is_return_code_mode(): |
+ return ('RevisionState(rev=%s, mean=%r, overall_return_code=%r, ' |
+ 'std_dev=%r)') % (self.revision_string(), self.mean, |
self.overall_return_code, self.std_dev) |
- return ('RevisionState(rev=%s, values=%r, mean_value=%r, std_dev=%r)' % ( |
- self.revision_string(), self.values, self.mean_value, self.std_dev)) |
+ return ('RevisionState(rev=%s, mean_value=%r, std_dev=%r)' % ( |
+ self.revision_string(), self.mean, self.std_dev)) |
+ |
+ @property |
+ def overall_return_code(self): |
+ if self.bisector.is_return_code_mode(): |
+ if self.return_codes: |
+        # Any non-zero exit code (including negative codes from killed or
+        # crashed runs) makes the overall result a failure.
+        return 0 if all(v == 0 for v in self.return_codes) else 1
+ raise ValueError('overall_return_code needs non-empty sample' |
+ ) # pragma: no cover |
+ raise ValueError('overall_return_code only applies to return_code bisects' |
+ ) # pragma: no cover |