Chromium Code Reviews

Unified Diff: scripts/slave/recipe_modules/auto_bisect/revision_state.py

Issue 2247373002: Refactor stages 1, 2 and test_api overhaul. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Addressing all early feedback. Created 4 years, 3 months ago
Index: scripts/slave/recipe_modules/auto_bisect/revision_state.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/revision_state.py b/scripts/slave/recipe_modules/auto_bisect/revision_state.py
index 4e0644bc4e14f7b9ef344d2258b9cfc6353f2b4d..fbd02936b17d4a00cbb58d531db0b78b0c4bd5be 100644
--- a/scripts/slave/recipe_modules/auto_bisect/revision_state.py
+++ b/scripts/slave/recipe_modules/auto_bisect/revision_state.py
@@ -10,6 +10,7 @@ class so that the bisect module and recipe can use it.
See perf_revision_state for an example.
"""
+import collections
import hashlib
import json
import math
@@ -18,31 +19,18 @@ import tempfile
import re
import uuid
-from . import depot_config
+from auto_bisect import depot_config
+from auto_bisect import bisect_exceptions
# These relate to how to increase the number of repetitions during re-test
MINIMUM_SAMPLE_SIZE = 5
INCREASE_FACTOR = 1.5
+NOT_SIGNIFICANTLY_DIFFERENT, SIGNIFICANTLY_DIFFERENT, NEED_MORE_DATA = range(3)
dtu 2016/09/13 23:57:27 I think Nat or Tony? said that whenever you're goi
RobertoCN 2016/09/21 22:22:48 Done.
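Note: the review thread above concerns the range-unpacking line, a common Python 2 idiom for lightweight enums; the patch trades the old eight-state class enum for these three module-level significance constants plus a plain `failed` flag. A minimal standalone sketch of how the tri-state constants read at their call sites (the classify() helper is hypothetical and mirrors the decision logic in _check_revision_good further down):

    NOT_SIGNIFICANTLY_DIFFERENT, SIGNIFICANTLY_DIFFERENT, NEED_MORE_DATA = range(3)

    def classify(diff_from_good, diff_from_bad):
        # Differs from the last known good revision -> behaves like a bad one.
        if diff_from_good == SIGNIFICANTLY_DIFFERENT:
            return 'bad'
        # Differs from the first known bad revision -> behaves like a good one.
        if diff_from_bad == SIGNIFICANTLY_DIFFERENT:
            return 'good'
        return 'need more data'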
+
class RevisionState(object):
"""Abstracts the state of a single revision on a bisect job."""
- # Possible values for the status attribute of RevisionState:
- (
- NEW, # A revision_state object that has just been initialized.
- BUILDING, # Requested a build for this revision, waiting for it.
- TESTING, # A test job for this revision was triggered, waiting for it.
- TESTED, # The test job completed with non-failing results.
- FAILED, # Either the build or the test jobs failed or timed out.
- ABORTED, # The build or test job was aborted. (For use in multi-secting).
- SKIPPED, # A revision that was not built or tested for a special reason,
- # such as those ranges that we know are broken, or when nudging
- # revisions.
- NEED_MORE_DATA, # Current number of test values is too small to establish
- # a statistically significant difference between this
- # revision and the revisions known to be good and bad.
- ) = xrange(8)
-
def __init__(self, bisector, commit_hash, depot_name=None,
base_revision=None):
"""Creates a new instance to track the state of a revision.
@@ -58,10 +46,10 @@ class RevisionState(object):
super(RevisionState, self).__init__()
self.bisector = bisector
self._good = None
+ self.failed = False
self.deps = None
self.test_results_url = None
self.build_archived = False
- self.status = RevisionState.NEW
self.next_revision = None
self.previous_revision = None
self.job_name = None
@@ -75,23 +63,18 @@ class RevisionState(object):
self.revision_overrides = {}
self.build_id = None
if self.base_revision:
- assert self.base_revision.deps_file_contents
self.needs_patch = True
self.revision_overrides[self.depot['src']] = self.commit_hash
- self.deps_patch, self.deps_file_contents = self.bisector.make_deps_patch(
- self.base_revision, self.base_revision.deps_file_contents,
- self.depot, self.commit_hash)
- self.deps_sha = hashlib.sha1(self.deps_patch).hexdigest()
- self.deps_sha_patch = self.bisector.make_deps_sha_file(self.deps_sha)
+ self.deps_sha = hashlib.sha1(self.revision_string()).hexdigest()
self.deps = dict(base_revision.deps)
self.deps[self.depot_name] = self.commit_hash
else:
self.needs_patch = False
self.build_url = self.bisector.get_platform_gs_prefix() + self._gs_suffix()
- self.values = []
- self.mean_value = None
- self.overall_return_code = None
- self.std_dev = None
+ self.valueset_paths = []
+ self.chartjson_paths = []
+ self.debug_values = []
+ self.return_codes = []
self._test_config = None
if self.bisector.test_type == 'perf':
@@ -101,23 +84,6 @@ class RevisionState(object):
'repeat_count', MINIMUM_SAMPLE_SIZE)
@property
- def tested(self):
- return self.status in (RevisionState.TESTED,)
-
- @property
- def in_progress(self):
- return self.status in (RevisionState.BUILDING, RevisionState.TESTING,
- RevisionState.NEED_MORE_DATA)
-
- @property
- def failed(self):
- return self.status == RevisionState.FAILED
-
- @property
- def aborted(self):
- return self.status == RevisionState.ABORTED
-
- @property
def good(self):
return self._good == True
@@ -133,18 +99,51 @@ class RevisionState(object):
def bad(self, value):
self._good = not value
+ @property
+ def test_run_count(self):
+ return max(
+ len(self.valueset_paths),
+ len(self.chartjson_paths),
+ len(self.return_codes))
+
+ @property
+ def mean(self):
+ if self.debug_values:
+ return float(sum(self.debug_values)) / len(self.debug_values)
+
+ @property
+ def std_dev(self):
+ if self.debug_values:
+ mn = self.mean
+ # Population standard deviation: divide the sum of squared
+ # deviations by the sample size before taking the square root.
+ return math.sqrt(
+ sum((x - mn) ** 2 for x in self.debug_values) /
+ len(self.debug_values))
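Note: a quick worked check of the two properties above, using a made-up sample of eight debug values: the mean is 40.0 / 8 = 5.0, and the population standard deviation is sqrt(32.0 / 8) = 2.0. A standalone sketch:

    import math

    debug_values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]  # invented sample
    mean = float(sum(debug_values)) / len(debug_values)       # 5.0
    std_dev = math.sqrt(
        sum((x - mean) ** 2 for x in debug_values) / len(debug_values))  # 2.0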
+
def start_job(self):
- """Starts a build, or a test job if the build is available."""
- if self.status == RevisionState.NEW and not self._is_build_archived():
- self._request_build()
- self.status = RevisionState.BUILDING
- return
-
- if self._is_build_archived() and self.status in (
- RevisionState.NEW, RevisionState.BUILDING,
- RevisionState.NEED_MORE_DATA):
+ api = self.bisector.api
+ try:
+ if not self._is_build_archived():
+ self._request_build()
+ with api.m.step.nest('Waiting for build'):
+ while not self._is_build_archived():
+ api.m.python.inline(
+ 'sleeping',
+ """
+ import sys
+ import time
+ time.sleep(20*60)
+ sys.exit(0)
+ """)
+ if self._is_build_failed():
+ self.failed = True
+ return
+
self._do_test()
- self.status = RevisionState.TESTING
+ # TODO(robertocn): Add a test covering this retest loop so that the
+ # pragma below can be removed.
+ while not self._check_revision_good(): # pragma: no cover
+ min(self, self.bisector.lkgr, self.bisector.fkbr,
+ key=lambda x: x.test_run_count)._do_test()
+
+ except bisect_exceptions.UntestableRevisionException:
+ self.failed = True
def deps_change(self):
"""Uses `git show` to see if a given commit contains a DEPS change."""
@@ -159,9 +158,9 @@ class RevisionState(object):
name = 'Checking DEPS for ' + self.commit_hash
step_result = api.m.git(
'show', '--name-only', '--pretty=format:',
- self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name)
- if self.bisector.dummy_builds and not self.commit_hash.startswith('dcdc'):
- return False
+ self.commit_hash, cwd=cwd, stdout=api.m.raw_io.output(), name=name,
+ step_test_data=lambda: api._test_data['deps_change'][self.commit_hash]
+ )
if 'DEPS' in step_result.stdout.splitlines(): # pragma: no cover
return True
return False # pragma: no cover
@@ -189,17 +188,15 @@ class RevisionState(object):
def read_deps(self, recipe_tester_name):
"""Sets the dependencies for this revision from the contents of DEPS."""
api = self.bisector.api
- if self.deps:
- return
if self.bisector.internal_bisect: # pragma: no cover
- self.deps_file_contents = self._read_content(
+ deps_file_contents = self._read_content(
depot_config.DEPOT_DEPS_NAME[self.depot_name]['url'],
depot_config.DEPOT_DEPS_NAME[self.depot_name]['deps_file'],
self.commit_hash)
# On April 5th, 2016 .DEPS.git was changed to DEPS on android-chrome repo,
# we are doing this in order to support both deps files.
- if not self.deps_file_contents:
- self.deps_file_contents = self._read_content(
+ if not deps_file_contents:
+ deps_file_contents = self._read_content(
depot_config.DEPOT_DEPS_NAME[self.depot_name]['url'],
depot_config.DEPS_FILENAME,
self.commit_hash)
@@ -208,11 +205,13 @@ class RevisionState(object):
'fetch file %s:%s' % (self.commit_hash, depot_config.DEPS_FILENAME),
api.resource('fetch_file.py'),
[depot_config.DEPS_FILENAME, '--commit', self.commit_hash],
- stdout=api.m.raw_io.output())
- self.deps_file_contents = step_result.stdout
+ stdout=api.m.raw_io.output(),
+ step_test_data=lambda: api._test_data['deps'][self.commit_hash]
+ )
+ deps_file_contents = step_result.stdout
try:
deps_data = self._gen_deps_local_scope()
- exec(self.deps_file_contents or 'deps = {}', {}, deps_data)
+ exec (deps_file_contents or 'deps = {}') in {}, deps_data
deps_data = deps_data['deps']
except ImportError: # pragma: no cover
# TODO(robertocn): Implement manual parsing of DEPS when exec fails.
@@ -243,40 +242,16 @@ class RevisionState(object):
self.deps = results
return
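Note: the exec in read_deps parses a DEPS file by evaluating it against a prepared local scope; a DEPS file is ordinary Python that assigns a `deps` dict. A minimal Python 2 illustration (the file contents and revision hash are invented, and the real code seeds the scope via _gen_deps_local_scope):

    deps_file_contents = (
        "deps = {'src/v8': "
        "'https://chromium.googlesource.com/v8/v8.git@deadbeef'}")
    scope = {}
    # Python 2 exec statement form, as used in the patch above.
    exec (deps_file_contents or 'deps = {}') in {}, scope
    print scope['deps']['src/v8']  # .../v8/v8.git@deadbeef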
- def update_status(self):
- """Checks on the pending jobs and updates status accordingly.
-
- This method will check for the build to complete and then trigger the test,
- or will wait for the test as appropriate.
-
- To wait for the test we try to get the buildbot job url from GS, and if
- available, we query the status of such job.
- """
- if self.status == RevisionState.BUILDING:
- if self._is_build_archived():
- self.start_job()
- elif self._is_build_failed(): # pragma: no cover
- self.status = RevisionState.FAILED
- elif (self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA)
- and self._results_available()):
- # If we have already decided whether the revision is good or bad we
- # shouldn't check again
- check_revision_goodness = not(self.good or self.bad)
- self._read_test_results(
- check_revision_goodness=check_revision_goodness)
- # We assume _read_test_results may have changed the status to a broken
- # state such as FAILED or ABORTED.
- if self.status in (RevisionState.TESTING, RevisionState.NEED_MORE_DATA):
- self.status = RevisionState.TESTED
-
def _is_build_archived(self):
"""Checks if the revision is already built and archived."""
if not self.build_archived:
api = self.bisector.api
- self.build_archived = api.gsutil_file_exists(self.build_url)
-
- if self.bisector.dummy_builds:
- self.build_archived = self.in_progress
+ self.build_archived = api.gsutil_file_exists(
+ self.build_url,
+ step_test_data=lambda: api._test_data.get(
+ 'gsutil_exists', {}).get(self.commit_hash).pop()
+ if api._test_data.get('gsutil_exists', {}).get(self.commit_hash)
+ else collections.namedtuple('retcode_attr', ['retcode'])(0))
dtu 2016/09/13 23:57:27 Maybe this is: api._test_data.get('gsutil_exists'
RobertoCN 2016/09/21 22:22:48 That wouldn't work because after popping the last
return self.build_archived
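Note on the review exchange above: the step_test_data lambda drains a per-commit queue of fake gsutil retcodes so that successive polls can simulate a build that is missing at first and archived later, and once the queue is empty it falls back to a success retcode; that fallback is why the reviewer's simpler one-liner would not work. A condensed standalone sketch of the consume-then-default pattern (the test-data shape is inferred from this diff, and a zero retcode is assumed to mean the file exists):

    import collections

    RetcodeAttr = collections.namedtuple('retcode_attr', ['retcode'])
    # pop() consumes from the end: one miss (1) first, then a hit (0).
    test_data = {'gsutil_exists': {'deadbeef': [RetcodeAttr(0), RetcodeAttr(1)]}}

    def next_gsutil_result(commit_hash):
        queue = test_data.get('gsutil_exists', {}).get(commit_hash)
        if queue:
            return queue.pop()  # queued response, drained last-first
        return RetcodeAttr(0)   # default to success once exhausted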
@@ -285,20 +260,11 @@ class RevisionState(object):
result = api.m.buildbucket.get_build(
self.build_id,
api.m.service_account.get_json_path(api.SERVICE_ACCOUNT),
- step_test_data=lambda: api.m.json.test_api.output_stream(
- {'build': {'result': 'SUCCESS', 'status': 'COMPLETED'}}
- ))
+ step_test_data=lambda: api.test_api.buildbot_job_status_mock(
+ api._test_data.get('build_status', {}).get(self.commit_hash, [])))
return (result.stdout['build']['status'] == 'COMPLETED' and
result.stdout['build'].get('result') != 'SUCCESS')
- def _results_available(self):
- """Checks if the results for the test job have been uploaded."""
- api = self.bisector.api
- result = api.gsutil_file_exists(self.test_results_url)
- if self.bisector.dummy_builds:
- return self.in_progress
- return result # pragma: no cover
-
def _gs_suffix(self):
"""Provides the expected right half of the build filename.
@@ -312,58 +278,23 @@ class RevisionState(object):
name_parts.append(self.deps_sha)
return '%s.zip' % '_'.join(name_parts)
- def _read_test_results(self, check_revision_goodness=True):
- """Gets the test results from GS and checks if the rev is good or bad."""
- test_results = self._get_test_results()
- # Results will contain the keys 'results' and 'output' where output is the
- # stdout of the command, and 'results' is itself a dict with the key
- # 'values' unless the test failed, in which case 'results' will contain
- # the 'error' key explaining the type of error.
- results = test_results['results']
- if results.get('errors'):
- self.status = RevisionState.FAILED
- if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
+ def _read_test_results(self, results):
+ # `results` is a dictionary containing paths to chartjson files, paths
+ # to valueset files, and a list of return codes.
+ self.return_codes.extend(results.get('retcodes', []))
+ if results.get('errors'): # pragma: no cover
+ self.failed = True
+ if 'MISSING_METRIC' in results.get('errors'):
self.bisector.surface_result('MISSING_METRIC')
- return
- self.values += results['values']
- api = self.bisector.api
- if test_results.get('retcodes') and test_results['retcodes'][-1] != 0 and (
- api.m.chromium.c.TARGET_PLATFORM == 'android'): #pragma: no cover
- api.m.chromium_android.device_status()
- current_connected_devices = api.m.chromium_android.devices
- current_device = api.m.bisect_tester.device_to_test
- if current_device not in current_connected_devices:
- # We need to manually raise step failure here because we are catching
- # them further down the line to enable return_code bisects and bisecting
- # on benchmarks that are a little flaky.
- raise api.m.step.StepFailure('Test device disconnected.')
- if self.bisector.is_return_code_mode():
- retcodes = test_results['retcodes']
- self.overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
- # Keeping mean_value for compatibility with dashboard.
- # TODO(robertocn): refactor mean_value, specially when uploading results
- # to dashboard.
- self.mean_value = self.overall_return_code
- elif self.values:
- api = self.bisector.api
- self.mean_value = api.m.math_utils.mean(self.values)
- self.std_dev = api.m.math_utils.standard_deviation(self.values)
- # Values were not found, but the test did not otherwise fail.
+ raise bisect_exceptions.UntestableRevisionException(results['errors'])
+ elif self.bisector.is_return_code_mode():
+ assert len(results['retcodes'])
else:
- self.status = RevisionState.FAILED
- self.bisector.surface_result('MISSING_METRIC')
- return
- # If we have already decided on the goodness of this revision, we shouldn't
- # recheck it.
- if self.good or self.bad:
- check_revision_goodness = False
- # We cannot test the goodness of the initial rev range.
- if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
- check_revision_goodness):
- if self._check_revision_good():
- self.good = True
- else:
- self.bad = True
+ self.valueset_paths.extend(results.get('valueset_paths') or [])
+ self.chartjson_paths.extend(results.get('chartjson_paths') or [])
+ if results.get('retcodes') and 0 not in results['retcodes']:
+ raise bisect_exceptions.UntestableRevisionException(
+ 'got non-zero return code on all runs for ' + self.commit_hash)
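Note: a hypothetical `results` payload matching the shape the comment at the top of this method describes (all paths are invented for illustration):

    results = {
        'valueset_paths': ['/tmp/run_1/valueset.json'],
        'chartjson_paths': ['/tmp/run_1/chartjson.json'],
        'retcodes': [0],
        # 'errors': ['MISSING_METRIC'],  # only present when the run failed
    }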
def _request_build(self):
"""Posts a request to buildbot to build this revision and archive it."""
@@ -431,10 +362,10 @@ class RevisionState(object):
the test will be run on the same machine. Otherwise, this posts
a request to buildbot to download and perf-test this build.
"""
- if self.bisector.bisect_config.get('dummy_job_names'):
- self.job_name = self.commit_hash + '-test'
- else: # pragma: no cover
- self.job_name = uuid.uuid4().hex
+ if self.test_run_count: # pragma: no cover
+ self.repeat_count = max(MINIMUM_SAMPLE_SIZE, math.ceil(
+ self.test_run_count * INCREASE_FACTOR)) - self.test_run_count
+
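Note: a worked pass through the resampling arithmetic above. It matches the retest() logic this patch deletes further down, which targeted at least five samples so a two-tailed Mann-Whitney U test at p < 0.01 could apply:

    import math

    def target_sample_size(n, minimum=5, factor=1.5):
        # minimum/factor mirror MINIMUM_SAMPLE_SIZE and INCREASE_FACTOR.
        return max(minimum, int(math.ceil(n * factor)))

    # target_sample_size(5) == 8  -> repeat_count of 3 more runs
    # target_sample_size(8) == 12 -> repeat_count of 4 more runs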
api = self.bisector.api
# Stores revision map for different repos eg, android-chrome, src, v8 etc.
revision_ladder = {}
@@ -444,102 +375,82 @@ class RevisionState(object):
revision_ladder[top_revision.depot_name] = top_revision.commit_hash
top_revision = top_revision.base_revision
perf_test_properties = {
- 'builder_name': self.bisector.get_perf_tester_name(),
'properties': {
'revision': top_revision.commit_hash,
'parent_got_revision': top_revision.commit_hash,
'parent_build_archive_url': self.build_url,
'bisect_config': self._get_bisect_config_for_tester(),
- 'job_name': self.job_name,
'revision_ladder': revision_ladder,
},
}
- self.test_results_url = (self.bisector.api.GS_RESULTS_URL +
- self.job_name + '.results')
- if (api.m.bisect_tester.local_test_enabled() or
- self.bisector.internal_bisect): # pragma: no cover
- skip_download = self.bisector.last_tested_revision == self
- self.bisector.last_tested_revision = self
- overrides = perf_test_properties['properties']
- api.run_local_test_run(overrides, skip_download=skip_download)
- else:
- step_name = 'Triggering test job for ' + self.commit_hash
- api.m.trigger(perf_test_properties, name=step_name)
-
- def retest(self): # pragma: no cover
- # We need at least 5 samples for applying Mann-Whitney U test
- # with P < 0.01, two-tailed .
- target_sample_size = max(5, math.ceil(len(self.values) * 1.5))
- self.status = RevisionState.NEED_MORE_DATA
- self.repeat_count = target_sample_size - len(self.values)
- self.start_job()
- self.bisector.wait_for(self)
-
- def _get_test_results(self):
- """Tries to get the results of a test job from cloud storage."""
- api = self.bisector.api
- try:
- stdout = api.m.raw_io.output()
- name = 'Get test results for build ' + self.commit_hash
- step_result = api.m.gsutil.cat(self.test_results_url, stdout=stdout,
- name=name)
- if not step_result.stdout:
- raise api.m.step.StepFailure('Test for build %s failed' %
- self.revision_string())
- except api.m.step.StepFailure as sf: # pragma: no cover
- self.bisector.surface_result('TEST_FAILURE')
- return {'results': {'errors': str(sf)}}
- else:
- return json.loads(step_result.stdout)
+ skip_download = self.bisector.last_tested_revision == self
+ self.bisector.last_tested_revision = self
+ overrides = perf_test_properties['properties']
- def _check_revision_good(self):
- """Determines if a revision is good or bad.
+ def run_test_step_test_data():
dtu 2016/09/13 23:57:27 Hmm, this feels out of place. Is there a way to pu
RobertoCN 2016/09/21 22:22:48 That makes sense. I'll give it a try in a separate
+ """Returns a single step data object when called.
- Iteratively increment the sample size of the revision being tested, the last
- known good revision, and the first known bad revision until a relationship
- of significant difference can be established between the results of the
- revision being tested and one of the other two.
+ These are expected to be populated by the test_api.
+ """
+ if api._test_data['run_results'].get(self.commit_hash):
+ return api._test_data['run_results'][self.commit_hash].pop(0)
+ return api._test_data['run_results']['default']
- If the results do not converge towards finding a significant difference in
- either direction, this is expected to timeout eventually. This scenario
- should be rather rare, since it is expected that the fkbr and lkgr are
- significantly different as a precondition.
+ self._read_test_results(api.run_local_test_run(
+ overrides, skip_download=skip_download,
+ step_test_data=run_test_step_test_data
+ ))
+
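Note: unlike the gsutil queue in _is_build_archived (drained with pop() from the end), run_test_step_test_data above consumes per-commit run results front-first with pop(0) and then falls back to a shared 'default' entry; the test_api is expected to populate both. A condensed sketch (dict contents invented):

    run_results = {
        'default': {'retcodes': [0]},
        'deadbeef': [{'retcodes': [1]}, {'retcodes': [0]}],
    }

    def next_run_result(commit_hash):
        if run_results.get(commit_hash):
            return run_results[commit_hash].pop(0)  # queued per-commit result
        return run_results['default']               # shared fallback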
+ def _check_revision_good(self): # pragma: no cover
+ """Determines if a revision is good or bad.
Returns:
- True if the results of testing this revision are significantly different
- from those of testing the earliest known bad revision.
- False if they are instead significantly different from those of testing
- the latest known good revision.
+ True if the revision is either good or bad, False if it cannot be
+ determined from the available data.
"""
+ # Do not reclassify revisions. Important for reference range.
+ if self.good or self.bad:
+ return True
+
lkgr = self.bisector.lkgr
fkbr = self.bisector.fkbr
-
if self.bisector.is_return_code_mode():
- return self.overall_return_code == lkgr.overall_return_code
-
- while True:
- diff_from_good = self.bisector.significantly_different(
- lkgr.values[:len(fkbr.values)], self.values)
- diff_from_bad = self.bisector.significantly_different(
- fkbr.values[:len(lkgr.values)], self.values)
-
- if diff_from_good and diff_from_bad:
- # Multiple regressions.
- # For now, proceed bisecting the biggest difference of the means.
- dist_from_good = abs(self.mean_value - lkgr.mean_value)
- dist_from_bad = abs(self.mean_value - fkbr.mean_value)
- if dist_from_good > dist_from_bad:
- # TODO(robertocn): Add way to handle the secondary regression
- #self.bisector.handle_secondary_regression(self, fkbr)
- return False
- else:
- #self.bisector.handle_secondary_regression(lkgr, self)
- return True
-
- if diff_from_good or diff_from_bad: # pragma: no cover
- return diff_from_bad
+ if self.overall_return_code == lkgr.overall_return_code:
+ self.good = True
+ else:
+ self.bad = True
+ return True
+ diff_from_good = self.bisector.compare_revisions(self, lkgr)
+ diff_from_bad = self.bisector.compare_revisions(self, fkbr)
+ if (diff_from_good == NOT_SIGNIFICANTLY_DIFFERENT and
+ diff_from_bad == NOT_SIGNIFICANTLY_DIFFERENT):
+ # We have reached the max number of samples and have not established
+ # difference, give up.
+ raise bisect_exceptions.InconclusiveBisectException()
+ if (diff_from_good == SIGNIFICANTLY_DIFFERENT and
+ diff_from_bad == SIGNIFICANTLY_DIFFERENT):
+ # Multiple regressions.
+ # For now, proceed bisecting the biggest difference of the means.
+ dist_from_good = abs(self.mean - lkgr.mean)
+ dist_from_bad = abs(self.mean - fkbr.mean)
+ if dist_from_good > dist_from_bad:
+ # TODO(robertocn): Add way to handle the secondary regression
+ #self.bisector.handle_secondary_regression(self, fkbr)
+ self.bad = True
+ return True
+ else:
+ #self.bisector.handle_secondary_regression(lkgr, self)
+ self.good = True
+ return True
+ if diff_from_good == SIGNIFICANTLY_DIFFERENT:
+ self.bad = True
+ return True
+ elif diff_from_bad == SIGNIFICANTLY_DIFFERENT:
+ self.good = True
+ return True
+ # At least one comparison returned NEED_MORE_DATA; caller should retest.
+ return False
- self._next_retest() # pragma: no cover
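Note: the four possible outcomes of _check_revision_good above, summarized (lkgr and fkbr are the last known good and first known bad revisions):

    diff_from_good    diff_from_bad     outcome
    not significant   not significant   raise InconclusiveBisectException
    significant       significant       multiple regressions: nearer mean decides
    significant       anything else     self.bad = True, return True
    anything else     significant       self.good = True, return True
    need more data    (no decision)     return False, caller retests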
def revision_string(self):
if self._rev_str:
@@ -558,25 +469,24 @@ class RevisionState(object):
self._rev_str = result
return self._rev_str
- def _next_retest(self): # pragma: no cover
- """Chooses one of current, lkgr, fkbr to retest.
-
- Look for the smallest sample and retest that. If the last tested revision
- is tied for the smallest sample, use that to take advantage of the fact
- that it is already downloaded and unzipped.
- """
- next_revision_to_test = min(self.bisector.lkgr, self, self.bisector.fkbr,
- key=lambda x: len(x.values))
- if (len(self.bisector.last_tested_revision.values) ==
- next_revision_to_test.values):
- self.bisector.last_tested_revision.retest()
- else:
- next_revision_to_test.retest()
-
- def __repr__(self):
- if self.overall_return_code is not None:
- return ('RevisionState(rev=%s, values=%r, overall_return_code=%r, '
- 'std_dev=%r)') % (self.revision_string(), self.values,
+ def __repr__(self): # pragma: no cover
+ if not self.test_run_count:
+ return 'RevisionState(rev=%s, values=[])' % self.revision_string()
+ if self.bisector.is_return_code_mode():
+ return ('RevisionState(rev=%s, mean=%r, overall_return_code=%r, '
+ 'std_dev=%r)') % (self.revision_string(), self.mean,
self.overall_return_code, self.std_dev)
- return ('RevisionState(rev=%s, values=%r, mean_value=%r, std_dev=%r)' % (
- self.revision_string(), self.values, self.mean_value, self.std_dev))
+ return ('RevisionState(rev=%s, mean_value=%r, std_dev=%r)' % (
+ self.revision_string(), self.mean, self.std_dev))
+
+ @property
+ def overall_return_code(self):
+ if self.bisector.is_return_code_mode():
+ if self.return_codes:
+ if max(self.return_codes):
+ return 1
+ return 0
+ raise ValueError('overall_return_code needs non-empty sample'
+ ) # pragma: no cover
+ raise ValueError('overall_return_code only applies to return_code bisects'
+ ) # pragma: no cover
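Note: the property above collapses per-run return codes into a single pass/fail bit for return-code bisects; any non-zero run marks the revision as failing. A quick standalone check of the same reduction:

    return_codes = [0, 0, 1]
    overall = 1 if max(return_codes) else 0  # any non-zero retcode -> 1
    assert overall == 1
    assert (1 if max([0, 0, 0]) else 0) == 0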
