Index: scripts/slave/recipe_modules/auto_bisect/revision_state.py |
diff --git a/scripts/slave/recipe_modules/auto_bisect/revision_state.py b/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
index e396742a0e429f93f7957e338668d8df0e281125..b7bfdfb96dce524457d60771d7cab2bd715e510a 100644 |
--- a/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
+++ b/scripts/slave/recipe_modules/auto_bisect/revision_state.py |
@@ -19,6 +19,7 @@ import re |
import uuid |
from . import depot_config |
+from exceptions import * |
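+# Assumed to provide UntestableRevisionException and InconclusiveBisectException
+# used below (i.e. a module local to this package rather than the Python 2
+# built-in `exceptions` module, which defines neither).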
# These relate to how to increase the number of repetitions during re-test |
MINIMUM_SAMPLE_SIZE = 5 |
@@ -30,22 +31,6 @@ SUCCESS, WARNINGS, FAILURE, SKIPPED, EXCEPTION = range(5) |
class RevisionState(object): |
"""Abstracts the state of a single revision on a bisect job.""" |
- # Possible values for the status attribute of RevisionState: |
- ( |
- NEW, # A revision_state object that has just been initialized. |
- BUILDING, # Requested a build for this revision, waiting for it. |
- TESTING, # A test job for this revision was triggered, waiting for it. |
- TESTED, # The test job completed with non-failing results. |
- FAILED, # Either the build or the test jobs failed or timed out. |
- ABORTED, # The build or test job was aborted. (For use in multi-secting). |
- SKIPPED, # A revision that was not built or tested for a special reason, |
- # such as those ranges that we know are broken, or when nudging |
- # revisions. |
- NEED_MORE_DATA, # Current number of test values is too small to establish |
- # a statistically significant difference between this |
- # revision and the revisions known to be good and bad. |
- ) = xrange(8) |
- |
def __init__(self, bisector, commit_hash, depot_name=None, |
base_revision=None): |
"""Creates a new instance to track the state of a revision. |
@@ -61,10 +46,10 @@ class RevisionState(object): |
super(RevisionState, self).__init__() |
self.bisector = bisector |
self._good = None |
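+    # Set to True when the build or a test run for this revision fails.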
+ self.failed = False |
self.deps = None |
self.test_results_url = None |
self.build_archived = False |
- self.status = RevisionState.NEW |
self.next_revision = None |
self.previous_revision = None |
self.job_name = None |
@@ -90,9 +75,11 @@ class RevisionState(object): |
else: |
self.needs_patch = False |
self.build_url = self.bisector.get_platform_gs_prefix() + self._gs_suffix() |
- self.values = [] |
- self.mean_value = None |
- self.overall_return_code = None |
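+    # Raw outputs collected from each test run: result file paths, debug
+    # values and return codes.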
+ self.valueset_paths = [] |
+ self.chartjson_paths = [] |
+ self.debug_values = [] |
+ self.return_codes = [] |
+ self.mean = None |
self.std_dev = None |
self._test_config = None |
self.build_number = None |
@@ -104,23 +91,6 @@ class RevisionState(object): |
'repeat_count', MINIMUM_SAMPLE_SIZE) |
@property |
- def tested(self): |
- return self.status in (RevisionState.TESTED,) |
- |
- @property |
- def in_progress(self): |
- return self.status in (RevisionState.BUILDING, RevisionState.TESTING, |
- RevisionState.NEED_MORE_DATA) |
- |
- @property |
- def failed(self): |
- return self.status == RevisionState.FAILED |
- |
- @property |
- def aborted(self): |
- return self.status == RevisionState.ABORTED |
- |
- @property |
def good(self): |
return self._good == True |
@@ -136,18 +106,40 @@ class RevisionState(object): |
def bad(self, value): |
self._good = not value |
+ @property |
+ def test_run_count(self): |
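+    """Number of test runs recorded for this revision so far."""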
+ return max( |
+ len(self.valueset_paths), |
+ len(self.chartjson_paths), |
+ len(self.return_codes)) |
+ |
def start_job(self): |
- """Starts a build, or a test job if the build is available.""" |
- if self.status == RevisionState.NEW and not self._is_build_archived(): |
- self._request_build() |
- self.status = RevisionState.BUILDING |
- return |
+ try: |
+ if not self._is_build_archived(): |
+ self._request_build() |
+ with self.bisector.api.m.step.nest('Waiting for build'): |
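+          # Poll until the build archive shows up in cloud storage, sleeping
+          # 20 minutes between checks.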
+ while not self._is_build_archived(): |
+            self.bisector.api.m.python.inline(
+                'sleeping',
+                """
+                import sys
+                import time
+                time.sleep(20*60)
+                sys.exit(0)
+                """)
+ if self._is_build_failed(): |
+ self.failed = True |
+ return |
- if self._is_build_archived() and self.status in ( |
- RevisionState.NEW, RevisionState.BUILDING, |
- RevisionState.NEED_MORE_DATA): |
self._do_test() |
- self.status = RevisionState.TESTING |
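+      # Keep adding samples to whichever of this revision, lkgr or fkbr has
+      # the fewest test runs until this revision can be classified.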
+ while not self._check_revision_good(): |
+ min(self, self.bisector.lkgr, self.bisector.fkbr, |
+            key=lambda x: x.test_run_count)._do_test()
+ |
+ except UntestableRevisionException: |
+ if self.reference_range: |
+ # TODO(robertocn): Consider nudging the revision here. |
+ raise InconclusiveBisectException() |
def deps_change(self): |
"""Uses `git show` to see if a given commit contains a DEPS change.""" |
@@ -162,9 +154,9 @@ class RevisionState(object): |
name = 'Checking DEPS for ' + self.commit_hash |
step_result = api.m.git( |
'show', '--name-only', '--pretty=format:', |
- self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name) |
- if self.bisector.dummy_builds and not self.commit_hash.startswith('dcdc'): |
- return False |
+ self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name, |
+ step_test_data=lambda: api._test_data['deps_change'][self.commit_hash] |
+ ) |
if 'DEPS' in step_result.stdout.splitlines(): # pragma: no cover |
return True |
return False # pragma: no cover |
@@ -211,11 +203,13 @@ class RevisionState(object): |
'fetch file %s:%s' % (self.commit_hash, depot_config.DEPS_FILENAME), |
api.resource('fetch_file.py'), |
[depot_config.DEPS_FILENAME, '--commit', self.commit_hash], |
- stdout=api.m.raw_io.output()) |
+ stdout=api.m.raw_io.output(), |
+ step_test_data=lambda: api._test_data['deps'][self.commit_hash] |
+ ) |
self.deps_file_contents = step_result.stdout |
try: |
deps_data = self._gen_deps_local_scope() |
- exec(self.deps_file_contents or 'deps = {}', {}, deps_data) |
+ exec (self.deps_file_contents or 'deps = {}') in {}, deps_data |
deps_data = deps_data['deps'] |
except ImportError: # pragma: no cover |
# TODO(robertocn): Implement manual parsing of DEPS when exec fails. |
@@ -246,40 +240,17 @@ class RevisionState(object): |
self.deps = results |
return |
- def update_status(self): |
- """Checks on the pending jobs and updates status accordingly. |
- |
- This method will check for the build to complete and then trigger the test, |
- or will wait for the test as appropriate. |
- |
- To wait for the test we try to get the buildbot job url from GS, and if |
- available, we query the status of such job. |
- """ |
- if self.status == RevisionState.BUILDING: |
- if self._is_build_archived(): |
- self.start_job() |
- elif self._is_build_failed(): |
- self.status = RevisionState.FAILED |
- elif (self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA) |
- and self._results_available()): |
- # If we have already decided whether the revision is good or bad we |
- # shouldn't check again |
- check_revision_goodness = not(self.good or self.bad) |
- self._read_test_results( |
- check_revision_goodness=check_revision_goodness) |
- # We assume _read_test_results may have changed the status to a broken |
- # state such as FAILED or ABORTED. |
- if self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA): |
- self.status = RevisionState.TESTED |
- |
def _is_build_archived(self): |
"""Checks if the revision is already built and archived.""" |
if not self.build_archived: |
api = self.bisector.api |
self.build_archived = api.gsutil_file_exists(self.build_url) |
- if self.bisector.dummy_builds: |
- self.build_archived = self.in_progress |
+    # With dummy builds, this is meant to return False the first time it is
+    # called and True on later calls, so that test expectations cover both
+    # cases.
+ if self.bisector.dummy_builds and not self.build_archived: |
+ self.build_archived = True |
+ return False |
return self.build_archived |
@@ -289,6 +260,7 @@ class RevisionState(object): |
fetch_result = api.m.url.fetch( build_url, step_name='fetch build details') |
return json.loads(fetch_result or '{}') |
+ # TODO(robertocn): Make this use buildbucket instead. |
def _is_build_failed(self): |
api = self.bisector.api |
current_build = None |
@@ -321,14 +293,6 @@ class RevisionState(object): |
current_build = self._fetch_build_info(base_url, self.build_number) |
return current_build.get('results') in [FAILURE, SKIPPED, EXCEPTION] |
- def _results_available(self): |
- """Checks if the results for the test job have been uploaded.""" |
- api = self.bisector.api |
- result = api.gsutil_file_exists(self.test_results_url) |
- if self.bisector.dummy_builds: |
- return self.in_progress |
- return result # pragma: no cover |
- |
def _gs_suffix(self): |
"""Provides the expected right half of the build filename. |
@@ -342,58 +306,20 @@ class RevisionState(object): |
name_parts.append(self.deps_sha) |
return '%s.zip' % '_'.join(name_parts) |
- def _read_test_results(self, check_revision_goodness=True): |
- """Gets the test results from GS and checks if the rev is good or bad.""" |
- test_results = self._get_test_results() |
- # Results will contain the keys 'results' and 'output' where output is the |
- # stdout of the command, and 'results' is itself a dict with the key |
- # 'values' unless the test failed, in which case 'results' will contain |
- # the 'error' key explaining the type of error. |
- results = test_results['results'] |
+ def _read_test_results(self, results): |
+    # `results` is a dict with paths to the chartjson files, paths to the
+    # valueset files and a list of return codes, plus an optional 'errors'
+    # entry.
if results.get('errors'): |
- self.status = RevisionState.FAILED |
+ self.failed = True |
if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover |
self.bisector.surface_result('MISSING_METRIC') |
- return |
- self.values += results['values'] |
- api = self.bisector.api |
- if test_results.get('retcodes') and test_results['retcodes'][-1] != 0 and ( |
- api.m.chromium.c.TARGET_PLATFORM == 'android'): #pragma: no cover |
- api.m.chromium_android.device_status() |
- current_connected_devices = api.m.chromium_android.devices |
- current_device = api.m.bisect_tester.device_to_test |
- if current_device not in current_connected_devices: |
- # We need to manually raise step failure here because we are catching |
- # them further down the line to enable return_code bisects and bisecting |
- # on benchmarks that are a little flaky. |
- raise api.m.step.StepFailure('Test device disconnected.') |
- if self.bisector.is_return_code_mode(): |
- retcodes = test_results['retcodes'] |
- self.overall_return_code = 0 if all(v == 0 for v in retcodes) else 1 |
- # Keeping mean_value for compatibility with dashboard. |
- # TODO(robertocn): refactor mean_value, specially when uploading results |
- # to dashboard. |
- self.mean_value = self.overall_return_code |
- elif self.values: |
- api = self.bisector.api |
- self.mean_value = api.m.math_utils.mean(self.values) |
- self.std_dev = api.m.math_utils.standard_deviation(self.values) |
- # Values were not found, but the test did not otherwise fail. |
+ raise UntestableRevisionException(results['errors']) |
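+    # In return_code mode only the exit codes are kept; otherwise keep the
+    # paths to the result files produced by the run.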
+ elif self.bisector.is_return_code_mode(): |
+ assert len(results['retcodes']) |
+ self.return_codes.extend(results['retcodes']) |
else: |
- self.status = RevisionState.FAILED |
- self.bisector.surface_result('MISSING_METRIC') |
- return |
- # If we have already decided on the goodness of this revision, we shouldn't |
- # recheck it. |
- if self.good or self.bad: |
- check_revision_goodness = False |
- # We cannot test the goodness of the initial rev range. |
- if (self.bisector.good_rev != self and self.bisector.bad_rev != self and |
- check_revision_goodness): |
- if self._check_revision_good(): |
- self.good = True |
- else: |
- self.bad = True |
+      self.valueset_paths.extend(results.get('valueset_paths', []))
+      self.chartjson_paths.extend(results.get('chartjson_paths', []))
def _request_build(self): |
"""Posts a request to buildbot to build this revision and archive it.""" |
@@ -457,10 +383,10 @@ class RevisionState(object): |
the test will be run on the same machine. Otherwise, this posts |
a request to buildbot to download and perf-test this build. |
""" |
- if self.bisector.bisect_config.get('dummy_job_names'): |
- self.job_name = self.commit_hash + '-test' |
- else: # pragma: no cover |
- self.job_name = uuid.uuid4().hex |
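+    # When this revision already has samples, grow the sample size by ~50%;
+    # at least MINIMUM_SAMPLE_SIZE runs are needed to apply a two-tailed
+    # Mann-Whitney U test with p < 0.01.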
+    if self.test_run_count:
+      self.repeat_count = int(max(MINIMUM_SAMPLE_SIZE, math.ceil(
+          self.test_run_count * 1.5))) - self.test_run_count
+ |
api = self.bisector.api |
# Stores revision map for different repos eg, android-chrome, src, v8 etc. |
revision_ladder = {} |
@@ -470,102 +396,80 @@ class RevisionState(object): |
revision_ladder[top_revision.depot_name] = top_revision.commit_hash |
top_revision = top_revision.base_revision |
perf_test_properties = { |
- 'builder_name': self.bisector.get_perf_tester_name(), |
'properties': { |
'revision': top_revision.commit_hash, |
'parent_got_revision': top_revision.commit_hash, |
'parent_build_archive_url': self.build_url, |
'bisect_config': self._get_bisect_config_for_tester(), |
- 'job_name': self.job_name, |
'revision_ladder': revision_ladder, |
}, |
} |
- self.test_results_url = (self.bisector.api.GS_RESULTS_URL + |
- self.job_name + '.results') |
- if (api.m.bisect_tester.local_test_enabled() or |
- self.bisector.internal_bisect): # pragma: no cover |
- skip_download = self.bisector.last_tested_revision == self |
- self.bisector.last_tested_revision = self |
- overrides = perf_test_properties['properties'] |
- api.run_local_test_run(overrides, skip_download=skip_download) |
- else: |
- step_name = 'Triggering test job for ' + self.commit_hash |
- api.m.trigger(perf_test_properties, name=step_name) |
- |
- def retest(self): # pragma: no cover |
- # We need at least 5 samples for applying Mann-Whitney U test |
- # with P < 0.01, two-tailed . |
- target_sample_size = max(5, math.ceil(len(self.values) * 1.5)) |
- self.status = RevisionState.NEED_MORE_DATA |
- self.repeat_count = target_sample_size - len(self.values) |
- self.start_job() |
- self.bisector.wait_for(self) |
- |
- def _get_test_results(self): |
- """Tries to get the results of a test job from cloud storage.""" |
- api = self.bisector.api |
- try: |
- stdout = api.m.raw_io.output() |
- name = 'Get test results for build ' + self.commit_hash |
- step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout, |
- name=name) |
- if not step_result.stdout: |
- raise api.m.step.StepFailure('Test for build %s failed' % |
- self.revision_string()) |
- except api.m.step.StepFailure as sf: # pragma: no cover |
- self.bisector.surface_result('TEST_FAILURE') |
- return {'results': {'errors': str(sf)}} |
- else: |
- return json.loads(step_result.stdout) |
+    # Skip re-downloading the build when this same revision was the last one
+    # tested on this machine.
+    skip_download = self.bisector.last_tested_revision == self
+    self.bisector.last_tested_revision = self
+ overrides = perf_test_properties['properties'] |
- def _check_revision_good(self): |
- """Determines if a revision is good or bad. |
+ def run_test_step_test_data(): |
+ """Returns a single step data object when called. |
+ |
+ These are expected to be populated by the test_api. |
+ """ |
+      if api._test_data['run_results'].get(self.commit_hash):
+ return api._test_data['run_results'][self.commit_hash].pop(0) |
+ return api._test_data['run_results']['default'] |
- Iteratively increment the sample size of the revision being tested, the last |
- known good revision, and the first known bad revision until a relationship |
- of significant difference can be established betweeb the results of the |
- revision being tested and one of the other two. |
- If the results do not converge towards finding a significant difference in |
- either direction, this is expected to timeout eventually. This scenario |
- should be rather rare, since it is expected that the fkbr and lkgr are |
- significantly different as a precondition. |
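+    # run_local_test_run is expected to return the results dict consumed by
+    # _read_test_results above ('valueset_paths', 'chartjson_paths',
+    # 'retcodes' and optionally 'errors').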
+ self._read_test_results(api.run_local_test_run( |
+ overrides, skip_download=skip_download, |
+ step_test_data=run_test_step_test_data |
+ )) |
+ |
+ def _check_revision_good(self): |
+ """Determines if a revision is good or bad. |
Returns: |
- True if the results of testing this revision are significantly different |
- from those of testing the earliest known bad revision. |
- False if they are instead significantly different form those of testing |
- the latest knwon good revision. |
+      True if the revision could be classified as either good or bad; False
+      if this cannot yet be determined from the available data.
""" |
+    # Do not reclassify revisions that already have a verdict; this matters
+    # for the reference range (the original good and bad revisions).
+ if self.good or self.bad: |
+ return True |
+ |
lkgr = self.bisector.lkgr |
fkbr = self.bisector.fkbr |
- |
if self.bisector.is_return_code_mode(): |
- return self.overall_return_code == lkgr.overall_return_code |
- |
- while True: |
- diff_from_good = self.bisector.significantly_different( |
- lkgr.values[:len(fkbr.values)], self.values) |
- diff_from_bad = self.bisector.significantly_different( |
- fkbr.values[:len(lkgr.values)], self.values) |
- |
- if diff_from_good and diff_from_bad: |
- # Multiple regressions. |
- # For now, proceed bisecting the biggest difference of the means. |
- dist_from_good = abs(self.mean_value - lkgr.mean_value) |
- dist_from_bad = abs(self.mean_value - fkbr.mean_value) |
- if dist_from_good > dist_from_bad: |
- # TODO(robertocn): Add way to handle the secondary regression |
- #self.bisector.handle_secondary_regression(self, fkbr) |
- return False |
- else: |
- #self.bisector.handle_secondary_regression(lkgr, self) |
- return True |
- |
- if diff_from_good or diff_from_bad: # pragma: no cover |
- return diff_from_bad |
+ if self.overall_return_code == lkgr.overall_return_code: |
+ self.good = True |
+ else: |
+ self.bad = True |
+ return True |
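+    # compare_revisions is assumed to return True/False once the comparison is
+    # conclusive, and None while more samples are still needed.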
+ diff_from_good = self.bisector.compare_revisions(self, lkgr) |
+ diff_from_bad = self.bisector.compare_revisions(self, fkbr) |
+    if diff_from_good is False and diff_from_bad is False:
+      # We have reached the maximum number of samples without establishing a
+      # significant difference in either direction; give up.
+ raise InconclusiveBisectException() |
+ if diff_from_good and diff_from_bad: |
+ # Multiple regressions. |
+ # For now, proceed bisecting the biggest difference of the means. |
+ dist_from_good = abs(self.mean - lkgr.mean) |
+ dist_from_bad = abs(self.mean - fkbr.mean) |
+ if dist_from_good > dist_from_bad: |
+ # TODO(robertocn): Add way to handle the secondary regression |
+ #self.bisector.handle_secondary_regression(self, fkbr) |
+ self.bad = True |
+ return True |
+ else: |
+ #self.bisector.handle_secondary_regression(lkgr, self) |
+ self.good = True |
+ return True |
+ if diff_from_good: |
+ self.bad = True |
+ return True |
+ elif diff_from_bad: # pragma: no cover |
+ self.good = True |
+ return True |
+ return False |
- self._next_retest() # pragma: no cover |
def revision_string(self): |
if self._rev_str: |
@@ -584,25 +488,24 @@ class RevisionState(object): |
self._rev_str = result |
return self._rev_str |
- def _next_retest(self): # pragma: no cover |
- """Chooses one of current, lkgr, fkbr to retest. |
- |
- Look for the smallest sample and retest that. If the last tested revision |
- is tied for the smallest sample, use that to take advantage of the fact |
- that it is already downloaded and unzipped. |
- """ |
- next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr, |
- key=lambda x: len(x.values)) |
- if (len(self.bisector.last_tested_revision.values) == |
- next_revision_to_test.values): |
- self.bisector.last_tested_revision.retest() |
- else: |
- next_revision_to_test.retest() |
- |
def __repr__(self): |
- if self.overall_return_code is not None: |
- return ('RevisionState(rev=%s, values=%r, overall_return_code=%r, ' |
- 'std_dev=%r)') % (self.revision_string(), self.values, |
+ if not self.test_run_count: |
+      return 'RevisionState(rev=%s, values=[])' % self.revision_string()
+ if self.bisector.is_return_code_mode(): |
+ return ('RevisionState(rev=%s, mean=%r, overall_return_code=%r, ' |
+ 'std_dev=%r)') % (self.revision_string(), self.mean, |
self.overall_return_code, self.std_dev) |
- return ('RevisionState(rev=%s, values=%r, mean_value=%r, std_dev=%r)' % ( |
- self.revision_string(), self.values, self.mean_value, self.std_dev)) |
+ return ('RevisionState(rev=%s, mean_value=%r, std_dev=%r)' % ( |
+ self.revision_string(), self.mean, self.std_dev)) |
+ |
+ @property |
+ def overall_return_code(self): |
+ if self.bisector.is_return_code_mode(): |
+ if self.return_codes: |
+        # Any non-zero exit code (including negative codes from killed or
+        # crashed runs) makes the overall result a failure.
+        return 0 if all(v == 0 for v in self.return_codes) else 1
+ raise ValueError('overall_return_code needs non-empty sample' |
+ ) # pragma: no cover |
+ raise ValueError('overall_return_code only applies to return_code bisects' |
+ ) # pragma: no cover |