scripts/slave/recipe_modules/auto_bisect/bisector.py - Issue 2247373002: Refactor stages 1, 2 and test_api overhaul.

Unified Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 2247373002: Refactor stages 1, 2 and test_api overhaul. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master

Patch Set: Addressing all early feedback. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/bisect_exceptions.py ('k') | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | scripts/slave/recipe_modules/auto_bisect/revision_state.py » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: scripts/slave/recipe_modules/auto_bisect/bisector.py

diff --git a/scripts/slave/recipe_modules/auto_bisect/bisector.py b/scripts/slave/recipe_modules/auto_bisect/bisector.py

index cd46eb187fbc2b818e67509b6c6a9350f9519313..c613d1be59539e72a907300714849a20cc0ebd7b 100644

--- a/scripts/slave/recipe_modules/auto_bisect/bisector.py

+++ b/scripts/slave/recipe_modules/auto_bisect/bisector.py

@@ -7,9 +7,9 @@ import re

import time

import urllib

-from . import config_validation

-from . import depot_config

-from . import revision_state

+from auto_bisect import config_validation

+from auto_bisect import depot_config

+from auto_bisect import revision_state

_DEPS_SHA_PATCH = """

diff --git DEPS.sha DEPS.sha

@@ -47,13 +47,6 @@ DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25

# How long to re-test the initial good-bad range for until significant

# difference is established.

REGRESSION_CHECK_TIMEOUT = 20 * 60 * 60 # 20 hours. A build times out after 24.

-# If we reach this number of samples on the reference range and have not

-# achieved statistical significance, bail.

-MAX_REQUIRED_SAMPLES = 15

-# Significance level to use for determining difference between revisions via

-# hypothesis testing.

-SIGNIFICANCE_LEVEL = 0.01

_FAILED_INITIAL_CONFIDENCE_ABORT_REASON = (

'The metric values for the initial "good" and "bad" revisions '

@@ -155,7 +148,9 @@ class Bisector(object):

commit_position = self._api.m.commit_position.construct(

branch='refs/heads/master', value=rev)

try:

- return self._api.m.crrev.to_commit_hash(commit_position)

+ return self._api.m.crrev.to_commit_hash(

+ commit_position,

+ step_test_data=lambda: self.api._test_data['hash_cp_map'][rev])

except self.api.m.step.StepFailure: # pragma: no cover

self.surface_result('BAD_REV')

raise

@@ -167,37 +162,39 @@ class Bisector(object):

def _is_sha1(s):

return bool(re.match('^[0-9A-Fa-f]{40}$', s))

- def significantly_different(

- self, list_a, list_b,

- significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover

- """Uses an external script to run hypothesis testing with scipy.

- The reason why we need an external script is that scipy is not available to

- the default python installed in all platforms. We instead rely on an

- anaconda environment to provide those packages.

- Args:

- list_a, list_b: Two lists representing samples to be compared.

- significance_level: Self-describing. As a decimal fraction.

+ def compare_revisions(self, revision_a, revision_b):

+ """

Returns:

- A boolean indicating whether the null hypothesis ~(that the lists are

- samples from the same population) can be rejected at the specified

- significance level.

+ SIGNIFICANTLY_DIFFERENT if the samples are significantly different.

+ NEED_MORE_DATA if there is not enough data to tell.

+ NOT_SIGNIFICANTLY_DIFFERENT if if there's enough data but still can't

+ tell the samples apart.

"""

- step_result = self.api.m.python(

- 'Checking sample difference',

- self.api.resource('significantly_different.py'),

- [json.dumps(list_a), json.dumps(list_b), str(significance_level)],

- stdout=self.api.m.json.output())

- results = step_result.stdout

- if results is None:

- assert self.dummy_builds

- return True

- significantly_different = results['significantly_different']

- step_result.presentation.logs[str(significantly_different)] = [

- 'See json.output for details']

- return significantly_different

+ output_format = 'chartjson'

+ values_a = revision_a.chartjson_paths

+ values_b = revision_b.chartjson_paths

+ if revision_a.valueset_paths and revision_b.valueset_paths:

+ output_format = 'valueset'

+ values_a = revision_a.valueset_paths

+ values_b = revision_b.valueset_paths

+ result = self.api.stat_compare(

+ values_a,

+ values_b,

+ self.bisect_config['metric'],

+ output_format=output_format,

+ step_test_data=lambda: self.api.test_api.compare_samples_data(

+ self.api._test_data.get('revision_data'), revision_a, revision_b))

+ revision_a.debug_values = result['sample_a']['debug_values']

+ revision_b.debug_values = result['sample_b']['debug_values']

+ if result['result'] == revision_state.NEED_MORE_DATA:

+ return revision_state.NEED_MORE_DATA

+ elif result['result']:

+ return revision_state.SIGNIFICANTLY_DIFFERENT

+ else: # pragma: no cover

+ return revision_state.NOT_SIGNIFICANTLY_DIFFERENT

def config_step(self):

"""Yields a step that prints the bisect config."""

@@ -235,8 +232,8 @@ class Bisector(object):

return self._api

def compute_relative_change(self):

- old_value = float(self.good_rev.mean_value)

- new_value = float(self.bad_rev.mean_value)

+ old_value = float(self.good_rev.mean or 0)

+ new_value = float(self.bad_rev.mean or 0)

if new_value and not old_value: # pragma: no cover

self.relative_change = ZERO_TO_NON_ZERO

@@ -245,145 +242,26 @@ class Bisector(object):

rel_change = self.api.m.math_utils.relative_change(old_value, new_value)

self.relative_change = '%.2f%%' % (100 * rel_change)

- def make_deps_sha_file(self, deps_sha):

- """Make a diff patch that creates DEPS.sha.

- Args:

- deps_sha (str): The hex digest of a SHA1 hash of the diff that patches

- DEPS.

- Returns:

- A string containing a git diff.

- """

- return _DEPS_SHA_PATCH % {'deps_sha': deps_sha}

- def _git_intern_file(self, file_contents, cwd, commit_hash):

- """Writes a file to the git database and produces its git hash.

- Args:

- file_contents (str): The contents of the file to be hashed and interned.

- cwd (recipe_config_types.Path): Path to the checkout whose repository the

- file is to be written to.

- commit_hash (str): An identifier for the step.

- Returns:

- A string containing the hash of the interned object.

- """

- cmd = 'hash-object -t blob -w --stdin'.split(' ')

- stdin = self.api.m.raw_io.input(file_contents)

- stdout = self.api.m.raw_io.output()

- step_name = 'Hashing modified DEPS file with revision ' + commit_hash

- step_result = self.api.m.git(*cmd, cwd=cwd, stdin=stdin, stdout=stdout,

- name=step_name)

- hash_string = step_result.stdout.splitlines()[0]

- try:

- if hash_string:

- int(hash_string, 16)

- return hash_string

- except ValueError: # pragma: no cover

- reason = 'Git did not output a valid hash for the interned file.'

- self.api.m.halt(reason)

- raise self.api.m.step.StepFailure(reason)

- def _gen_diff_patch(self, git_object_a, git_object_b, src_alias, dst_alias,

- cwd, deps_rev):

- """Produces a git diff patch.

- Args:

- git_object_a (str): Tree-ish git object identifier.

- git_object_b (str): Another tree-ish git object identifier.

- src_alias (str): Label to replace the tree-ish identifier on

- the resulting diff patch. (git_object_a)

- dst_alias (str): Same as above for (git_object_b)

- cwd (recipe_config_types.Path): Path to the checkout whose repo contains

- the objects to be compared.

- deps_rev (str): Deps revision to identify the patch generating step.

- Returns:

- A string containing the diff patch as produced by the 'git diff' command.

- """

- # The prefixes used in the command below are used to find and replace the

- # tree-ish git object id's on the diff output more easily.

- cmd = 'diff %s %s --src-prefix=IAMSRC: --dst-prefix=IAMDST:'

- cmd %= (git_object_a, git_object_b)

- cmd = cmd.split(' ')

- stdout = self.api.m.raw_io.output()

- step_name = 'Generating patch for %s to %s' % (git_object_a, deps_rev)

- step_result = self.api.m.git(*cmd, cwd=cwd, stdout=stdout, name=step_name)

- patch_text = step_result.stdout

- src_string = 'IAMSRC:' + git_object_a

- dst_string = 'IAMDST:' + git_object_b

- patch_text = patch_text.replace(src_string, src_alias)

- patch_text = patch_text.replace(dst_string, dst_alias)

- return patch_text

- def make_deps_patch(self, base_revision, base_file_contents,

- depot, new_commit_hash):

- """Make a diff patch that updates a specific dependency revision.

- Args:

- base_revision (RevisionState): The revision for which the DEPS file is to

- be patched.

- base_file_contents (str): The contents of the original DEPS file.

- depot (str): The dependency to modify.

- new_commit_hash (str): The revision to put in place of the old one.

- Returns:

- A pair containing the git diff patch that updates DEPS, and the

- full text of the modified DEPS file, both as strings.

- """

- original_contents = str(base_file_contents)

- patched_contents = str(original_contents)

- # Modify DEPS.

- deps_var = depot['deps_var']

- deps_item_regexp = re.compile(

- r'(?<=["\']%s["\']: ["\'])([a-fA-F0-9]+)(?=["\'])' % deps_var,

- re.MULTILINE)

- if not re.search(deps_item_regexp, original_contents): # pragma: no cover

- reason = 'DEPS file does not contain entry for ' + deps_var

- self.api.m.halt(reason)

- raise self.api.m.step.StepFailure(reason)

- patched_contents = re.sub(deps_item_regexp, new_commit_hash,

- original_contents)

- cwd = self.api.working_dir.join(

- depot_config.DEPOT_DEPS_NAME[base_revision.depot_name]['src'])

- deps_file = 'DEPS'

- # This is to support backward compatibility on android-chrome because

- # .DEPS.git got migrated to DEPS on April 5, 2016

- if (base_revision.depot_name == 'android-chrome' and

- self.api.m.path.exists(self.api.m.path.join(cwd, '.DEPS.git'))):

- deps_file = '.DEPS.git' # pragma: no cover

- interned_deps_hash = self._git_intern_file(

- patched_contents, cwd, new_commit_hash)

- patch_text = self._gen_diff_patch(

- '%s:%s' % (base_revision.commit_hash, deps_file),

- interned_deps_hash, deps_file, deps_file,

- cwd=cwd,

- deps_rev=new_commit_hash)

- return patch_text, patched_contents

def _expand_initial_revision_range(self):

"""Sets the initial contents of |self.revisions|."""

with self.api.m.step.nest('Expanding revision range'):

good_hash = self.good_rev.commit_hash

bad_hash = self.bad_rev.commit_hash

+ depot = self.good_rev.depot_name

step_name = 'for revisions %s:%s' % (good_hash, bad_hash)

revisions = self._revision_range(

start=good_hash,

end=bad_hash,

depot_name=self.base_depot,

step_name=step_name,

- exclude_end=True)

+ exclude_end=True,

+ step_test_data=lambda: self.api._test_data['revision_list'][depot]

dtu 2016/09/13 23:57:27 How do you decide whether to pass step_test_data h

RobertoCN 2016/09/21 22:22:48 I also do not like this. Ideally gsutil (for examp

+ )

self.revisions = [self.good_rev] + revisions + [self.bad_rev]

self._update_revision_list_indexes()

def _revision_range(self, start, end, depot_name, base_revision=None,

- step_name=None, exclude_end=False):

+ step_name=None, exclude_end=False, **kwargs):

"""Returns a list of RevisionState objects between |start| and |end|.

When expanding the initial revision range we want to exclude the last

@@ -412,7 +290,7 @@ class Bisector(object):

step_name,

self.api.resource('fetch_intervening_revisions.py'),

[start, end, depot_config.DEPOT_DEPS_NAME[depot_name]['url']],

- stdout=self.api.m.json.output())

+ stdout=self.api.m.json.output(), **kwargs)

except self.api.m.step.StepFailure: # pragma: no cover

self.surface_result('BAD_REV')

raise

@@ -521,7 +399,9 @@ class Bisector(object):

end=dep_revision_max,

depot_name=depot_name,

base_revision=min_revision,

- step_name=step_name)

+ step_name=step_name,

+ step_test_data=lambda:

+ self.api._test_data['revision_list'][depot_name])

new_revisions = self.revisions[:max_revision.list_index]

new_revisions += rev_list

new_revisions += self.revisions[max_revision.list_index:]

@@ -558,8 +438,8 @@ class Bisector(object):

True if the check passes (i.e. no problem), False if the change is not

a regression according to the improvement direction.

"""

- good = self.good_rev.mean_value

- bad = self.bad_rev.mean_value

+ good = self.good_rev.mean

+ bad = self.bad_rev.mean

if self.is_return_code_mode():

return True

@@ -594,32 +474,32 @@ class Bisector(object):

other in a statistically significant manner. False if such difference could

not be established in the time or sample size allowed.

"""

- if self.test_type == 'return_code':

+ if self.is_return_code_mode():

return (self.good_rev.overall_return_code !=

self.bad_rev.overall_return_code)

if self.bypass_stats_check:

- dummy_result = self.good_rev.values != self.bad_rev.values

+ self.compare_revisions(self.good_rev, self.bad_rev)

+ dummy_result = self.good_rev.mean != self.bad_rev.mean

if not dummy_result:

self._set_insufficient_confidence_warning()

return dummy_result

+ # TODO(robertocn): This step should not be necessary in some cases.

with self.api.m.step.nest('Re-testing reference range'):

expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT

while time.time() < expiration_time:

- if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:

- if self.significantly_different(self.good_rev.values,

- self.bad_rev.values):

+ if (self.good_rev.test_run_count >= 5

+ and self.bad_rev.test_run_count >= 5):

+ if self.compare_revisions(self.good_rev, self.bad_rev):

return True

- if len(self.good_rev.values) == len(self.bad_rev.values):

+ if self.good_rev.test_run_count == self.bad_rev.test_run_count:

revision_to_retest = self.last_tested_revision

else:

revision_to_retest = min(self.good_rev, self.bad_rev,

- key=lambda x: len(x.values))

- if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:

- revision_to_retest.retest()

- else:

- break

+ key=lambda x: x.test_run_count)

+ revision_to_retest._do_test()

self._set_insufficient_confidence_warning()

return False

@@ -641,14 +521,18 @@ class Bisector(object):

result = 'bisector.lkgr: %r\n' % self.lkgr

result += 'bisector.fkbr: %r\n\n' % self.fkbr

result += self._revision_value_table()

- if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):

- result += '\n' + self._t_test_results()

+ if (self.lkgr and self.lkgr.test_run_count and self.fkbr and

+ self.fkbr.test_run_count):

+ result += '\n' + '\n'.join([

+ 'LKGR values: %r' % list(self.lkgr.debug_values),

+ 'FKBR values: %r' % list(self.fkbr.debug_values),

+ ])

return result

def _revision_value_table(self):

"""Returns a string table showing revisions and their values."""

header = [['Revision', 'Values']]

- rows = [[r.revision_string(), str(r.values)] for r in self.revisions]

+ rows = [[r.revision_string(), str(r.debug_values)] for r in self.revisions]

return self._pretty_table(header + rows)

def _pretty_table(self, data):

@@ -657,20 +541,6 @@ class Bisector(object):

results.append('%-15s' * len(row) % tuple(row))

return '\n'.join(results)

- def _t_test_results(self):

- """Returns a string showing t-test results for lkgr and fkbr."""

- t, df, p = self.api.m.math_utils.welchs_t_test(

- self.lkgr.values, self.fkbr.values)

- lines = [

- 'LKGR values: %r' % self.lkgr.values,

- 'FKBR values: %r' % self.fkbr.values,

- 't-statistic: %r' % t,

- 'deg. of freedom: %r' % df,

- 'p-value: %r' % p,

- 'Confidence score: %r' % (100 * (1 - p))

- ]

- return '\n'.join(lines)

def print_result_debug_info(self):

"""Prints extra debug info at the end of the bisect process."""

lines = self._results_debug_message().splitlines()

@@ -695,7 +565,7 @@ class Bisector(object):

candidate_range = [revision for revision in

self.revisions[self.lkgr.list_index + 1:

self.fkbr.list_index]

- if not revision.tested and not revision.failed]

+ if not revision.failed]

if len(candidate_range) == 1:

return candidate_range[0]

if len(candidate_range) == 0:

@@ -738,7 +608,7 @@ class Bisector(object):

does not contain a DEPS change.

"""

if (revision.bad and revision.previous_revision and

- revision.previous_revision.good): # pragma: no cover

+ revision.previous_revision.good):

if revision.deps_change() and self._expand_deps_revisions(revision):

return False

self.culprit = revision

@@ -750,32 +620,9 @@ class Bisector(object):

return False

self.culprit = revision.next_revision

return True

- return False

- def wait_for_all(self, revision_list):

- """Waits for all revisions in list to finish."""

- for r in revision_list:

- self.wait_for(r)

- def wait_for(self, revision, nest_check=True):

- """Waits for the revision to finish its job."""

- if nest_check and not self.flags.get(

- 'do_not_nest_wait_for_revision'): # pragma: no cover

- with self.api.m.step.nest('Waiting for ' + revision.revision_string()):

- return self.wait_for(revision, nest_check=False)

- while True:

- revision.update_status()

- if revision.in_progress:

- self.api.m.python.inline(

- 'sleeping',

- """

- import sys

- import time

- time.sleep(20*60)

- sys.exit(0)

- """)

- else:

- break

+ # We'll never get here because revision adjacency is checked before this

+ # function is called.

+ assert False # pragma: no cover

def _update_candidate_range(self):

"""Updates lkgr and fkbr (last known good/first known bad) revisions.

@@ -783,7 +630,7 @@ class Bisector(object):

lkgr and fkbr are 'pointers' to the appropriate RevisionState objects in

bisectors.revisions."""

for r in self.revisions:

- if r.tested:

+ if r.test_run_count:

if r.good:

self.lkgr = r

elif r.bad:

@@ -821,7 +668,7 @@ class Bisector(object):

# TODO(prasadv): Refactor this code to remove hard coded values and use

# target_bit from the bot config. crbug.com/640287

- if 'android' in bot_name:

+ if 'android' in bot_name: # pragma: no cover

if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):

return 'android_arm64_perf_bisect_builder'

return 'android_perf_bisect_builder'

@@ -851,7 +698,7 @@ class Bisector(object):

# TODO(prasadv): Refactor this code to remove hard coded values and use

# target_bit from the bot config. crbug.com/640287

- if 'android' in bot_name:

+ if 'android' in bot_name: #pragma: no cover

if any(b in bot_name for b in ['arm64', 'nexus9', 'nexus5X']):

return 'gs://chrome-perf/Android arm64 Builder/full-build-linux_'

return 'gs://chrome-perf/Android Builder/full-build-linux_'

@@ -876,7 +723,7 @@ class Bisector(object):

def is_return_code_mode(self):

"""Checks whether this is a bisect on the test's exit code."""

- return self.bisect_config.get('test_type') == 'return_code'

+ return self.test_type == 'return_code'

def surface_result(self, result_string):

assert result_string in VALID_RESULT_CODES

@@ -891,10 +738,6 @@ class Bisector(object):

def get_result(self):

"""Returns the results as a jsonable object."""

config = self.bisect_config

- results_confidence = 0

- if self.culprit:

- results_confidence = self.api.m.math_utils.confidence_score(

- self.lkgr.values, self.fkbr.values)

if self.failed:

status = 'failed'

@@ -916,9 +759,8 @@ class Bisector(object):

'bisect_bot': self.get_perf_tester_name(),

'command': config['command'],

'test_type': config['test_type'],

- 'metric': config['metric'],

+ 'metric': config.get('metric'),

'change': self.relative_change,

- 'score': results_confidence,

'good_revision': self.good_rev.commit_hash,

'bad_revision': self.bad_rev.commit_hash,

'warnings': self.warnings,

@@ -950,14 +792,14 @@ class Bisector(object):

def _revision_data(self):

revision_rows = []

for r in self.revisions:

- if r.tested or r.aborted:

+ if r.test_run_count:

revision_rows.append({

'depot_name': r.depot_name,

'commit_hash': r.commit_hash,

'revision_string': r.revision_string(),

- 'mean_value': r.mean_value,

+ 'mean_value': r.mean,

'std_dev': r.std_dev,

- 'values': r.values,

+ 'values': r.debug_values,

'result': 'good' if r.good else 'bad' if r.bad else 'unknown',

})

return revision_rows