| Index: tools/auto_bisect/bisect_results.py
|
| diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py
|
| index 59cb94d7db07fa024ab389f283bf9f5eb8779f46..9bcaeb683a0fc6d146d3de043afba0e2148c46a9 100644
|
| --- a/tools/auto_bisect/bisect_results.py
|
| +++ b/tools/auto_bisect/bisect_results.py
|
| @@ -11,55 +11,152 @@ import source_control
|
| import ttest
|
|
|
|
|
| -def ConfidenceScore(good_results_lists, bad_results_lists):
|
| - """Calculates a confidence score.
|
| +class BisectResults(object):
|
| + """Contains results of the completed bisect.
|
| +
|
| + Properties:
|
| + error: Error message if the bisect failed.
|
| +
|
| + If the error is None, the following properties are present:
|
| + warnings: List of warnings from the bisect run.
|
| + state: BisectState object from which these results were generated.
|
| + first_working_revision: First good revision.
|
| + last_broken_revision: Last bad revision.
|
| +
|
| + If both of the above revisions are not None, the following properties are present:
|
| + culprit_revisions: A list of revisions containing the bad change that
|
| + introduced the failure.
|
| + other_regressions: A list of tuples representing other regressions that
|
| + may have occurred.
|
| + regression_size: For performance bisects, this is the relative change of
|
| + the mean metric value. For other bisects this field always contains
|
| + 'zero-to-nonzero'.
|
| + regression_std_err: For performance bisects, the pooled standard error
|
| + for the groups of good and bad runs. Not used for other bisects.
|
| + confidence: For performance bisects, the confidence that the good and
|
| + bad runs are distinct groups. Not used for non-performance bisects.
|
| + """
|
|
|
| - This score is a percentage which represents our degree of confidence in the
|
| - proposition that the good results and bad results are distinct groups, and
|
| - their differences aren't due to chance alone.
|
| + def __init__(self, bisect_state=None, depot_registry=None, opts=None,
|
| + runtime_warnings=None, error=None):
|
| + """Computes final bisect results after a bisect run is complete.
|
|
|
| + This constructor should be called in one of the following ways:
|
| + BisectResults(state, depot_registry, opts, runtime_warnings)
|
| + BisectResults(error=error)
|
|
|
| - Args:
|
| - good_results_lists: A list of lists of "good" result numbers.
|
| - bad_results_lists: A list of lists of "bad" result numbers.
|
| + The first form creates an object representing successful bisect results,
|
| + while the second creates an error result.
|
|
|
| - Returns:
|
| - A number in the range [0, 100].
|
| - """
|
| - # If there's only one item in either list, this means only one revision was
|
| - # classified good or bad; this isn't good enough evidence to make a decision.
|
| - # If an empty list was passed, that also implies zero confidence.
|
| - if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
|
| - return 0.0
|
| + Args:
|
| + bisect_state: BisectState object representing latest bisect state.
|
| + depot_registry: DepotDirectoryRegistry object with information on each
|
| + repository in the bisect_state.
|
| + opts: Options passed to the bisect run.
|
| + runtime_warnings: A list of warnings from the bisect run.
|
| + error: Error message. When error is not None, other arguments are ignored.
|
| + """
|
|
|
| - # Flatten the lists of results lists.
|
| - sample1 = sum(good_results_lists, [])
|
| - sample2 = sum(bad_results_lists, [])
|
| + self.error = error
|
| + if error is not None:
|
| + return
|
|
|
| - # If there were only empty lists in either of the lists (this is unexpected
|
| - # and normally shouldn't happen), then we also want to return 0.
|
| - if not sample1 or not sample2:
|
| - return 0.0
|
| + assert (bisect_state is not None and depot_registry is not None and
|
| + opts is not None and runtime_warnings is not None), (
|
| + 'Incorrect use of the BisectResults constructor. When error is '
|
| + 'None, all other arguments are required')
|
|
|
| - # The p-value is approximately the probability of obtaining the given set
|
| - # of good and bad values just by chance.
|
| - _, _, p_value = ttest.WelchsTTest(sample1, sample2)
|
| - return 100.0 * (1.0 - p_value)
|
| + self.state = bisect_state
|
|
|
| + rev_states = bisect_state.GetRevisionStates()
|
| + first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
|
| + self.first_working_revision = first_working_rev
|
| + self.last_broken_revision = last_broken_rev
|
|
|
| -class BisectResults(object):
|
| + if first_working_rev is not None and last_broken_rev is not None:
|
| + statistics = self._ComputeRegressionStatistics(
|
| + rev_states, first_working_rev, last_broken_rev)
|
| +
|
| + self.regression_size = statistics['regression_size']
|
| + self.regression_std_err = statistics['regression_std_err']
|
| + self.confidence = statistics['confidence']
|
| +
|
| + self.culprit_revisions = self._FindCulpritRevisions(
|
| + rev_states, depot_registry, first_working_rev, last_broken_rev)
|
| +
|
| + self.other_regressions = self._FindOtherRegressions(
|
| + rev_states, statistics['bad_greater_than_good'])
|
|
|
| - def __init__(self, depot_registry):
|
| - self._depot_registry = depot_registry
|
| - self.revision_data = {}
|
| - self.error = None
|
| + self.warnings = runtime_warnings + self._GetResultBasedWarnings(
|
| + self.culprit_revisions, opts, self.confidence)
|
|
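| Usage sketch: the two supported construction forms, with hypothetical
| state, registry and opts stand-ins for the objects a real bisect run
| would supply:
|
|     # Successful run: statistics, culprits and warnings get computed.
|     results = BisectResults(state, registry, opts, runtime_warnings=[])
|     if (results.first_working_revision is not None and
|         results.last_broken_revision is not None):
|       print results.confidence
|
|     # Failed run: only the error message is recorded.
|     results = BisectResults(error='Failed to build revision abc123.')
|     assert results.error is not None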
|
| @staticmethod
|
| - def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
|
| + def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
|
| + warnings = []
|
| + if len(culprit_revisions) > 1:
|
| + warnings.append('Due to build errors, the regression range could '
|
| + 'not be narrowed down to a single commit.')
|
| + if opts.repeat_test_count == 1:
|
| + warnings.append('Tests were only set to run once. This may '
|
| + 'be insufficient to get meaningful results.')
|
| + if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
|
| + warnings.append('Confidence is not high. Try bisecting again '
|
| + 'with an increased repeat_test_count, a larger '
|
| + 'revision range, or another metric.')
|
| + if not confidence:
|
| + warnings.append('Confidence score is 0%. Try bisecting again on '
|
| + 'another platform or another metric.')
|
| + return warnings
|
| +
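| For illustration, a low-confidence run whose test executed only once would
| collect two of the warnings above (FakeOpts is a hypothetical stand-in for
| the bisect options; HIGH_CONFIDENCE is the threshold from bisect_utils):
|
|     class FakeOpts(object):
|       repeat_test_count = 1  # triggers the "only set to run once" warning
|
|     warnings = BisectResults._GetResultBasedWarnings(
|         culprit_revisions=[('abc123', {}, 'chromium')],
|         opts=FakeOpts(), confidence=42.0)
|     # 0 < 42.0 < bisect_utils.HIGH_CONFIDENCE adds the "not high" warning.
|     assert len(warnings) == 2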
|
| + @staticmethod
|
| + def ConfidenceScore(good_results_lists, bad_results_lists,
|
| + accept_single_bad_or_good=False):
|
| + """Calculates a confidence score.
|
| +
|
| + This score is a percentage which represents our degree of confidence in the
|
| + proposition that the good results and bad results are distinct groups, and
|
| + their differences aren't due to chance alone.
|
| +
|
| +
|
| + Args:
|
| + good_results_lists: A list of lists of "good" result numbers.
|
| + bad_results_lists: A list of lists of "bad" result numbers.
|
| + accept_single_bad_or_good: If True, the confidence is computed even when
|
| + there is only one good or one bad revision; otherwise a single good or
|
| + bad revision always yields 0.0 confidence. This flag will likely go away
|
| + once the bisect range is expanded by one more revision in such cases.
|
| +
|
| + Returns:
|
| + A number in the range [0, 100].
|
| + """
|
| + # If there's only one item in either list, this means only one revision was
|
| + # classified good or bad; this isn't good enough evidence to make a
|
| + # decision. If an empty list was passed, that also implies zero confidence.
|
| + if not accept_single_bad_or_good:
|
| + if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
|
| + return 0.0
|
| +
|
| + # Flatten the lists of results lists.
|
| + sample1 = sum(good_results_lists, [])
|
| + sample2 = sum(bad_results_lists, [])
|
| +
|
| + # If there were only empty lists in either of the lists (this is unexpected
|
| + # and normally shouldn't happen), then we also want to return 0.
|
| + if not sample1 or not sample2:
|
| + return 0.0
|
| +
|
| + # The p-value is approximately the probability of obtaining the given set
|
| + # of good and bad values just by chance.
|
| + _, _, p_value = ttest.WelchsTTest(sample1, sample2)
|
| + return 100.0 * (1.0 - p_value)
|
| +
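| A sketch of the expected behaviour, assuming ttest.WelchsTTest returns a
| (t_statistic, degrees_of_freedom, p_value) tuple as the unpacking above
| implies:
|
|     good = [[10.0, 10.1, 10.2], [10.1, 10.0, 10.2]]
|     bad = [[15.0, 15.2, 15.1], [15.3, 15.1, 15.0]]
|     BisectResults.ConfidenceScore(good, bad)      # close to 100.0
|
|     # A single good (or bad) results list yields 0.0 by default...
|     BisectResults.ConfidenceScore(good[:1], bad)  # 0.0
|     # ...unless explicitly accepted, as _FindOtherRegressions does below.
|     BisectResults.ConfidenceScore(good[:1], bad,
|                                   accept_single_bad_or_good=True)  # > 0.0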
|
| + @classmethod
|
| + def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
|
| """Compiles a list of other possible regressions from the revision data.
|
|
|
| Args:
|
| - revision_data_sorted: Sorted list of (revision, revision data) pairs.
|
| + revision_states: Sorted list of RevisionState objects.
|
| bad_greater_than_good: Whether the result value at the "bad" revision is
|
| numerically greater than the result value at the "good" revision.
|
|
|
| @@ -69,13 +166,13 @@ class BisectResults(object):
|
| """
|
| other_regressions = []
|
| previous_values = []
|
| - previous_id = None
|
| - for current_id, current_data in revision_data_sorted:
|
| - current_values = current_data['value']
|
| - if current_values:
|
| - current_values = current_values['values']
|
| + prev_state = None
|
| + for revision_state in revision_states:
|
| + if revision_state.value:
|
| + current_values = revision_state.value['values']
|
| if previous_values:
|
| - confidence = ConfidenceScore(previous_values, [current_values])
|
| + confidence = cls.ConfidenceScore(previous_values, [current_values],
|
| + accept_single_bad_or_good=True)
|
| mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
|
| mean_of_current_runs = math_utils.Mean(current_values)
|
|
|
| @@ -83,178 +180,84 @@ class BisectResults(object):
|
| - # the overall regression. If the mean of the previous runs < the
|
| - # mean of the current runs, this local regression is in same
|
| - # direction.
|
| + # the overall regression. Since the revision states are ordered from
|
| + # the bad end to the good end, a local regression in the same direction
|
| + # means the mean of the previous runs > the mean of the current runs.
|
| - prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
|
| - is_same_direction = (prev_less_than_current if
|
| - bad_greater_than_good else not prev_less_than_current)
|
| + prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
|
| + is_same_direction = (prev_greater_than_current if
|
| + bad_greater_than_good else not prev_greater_than_current)
|
|
|
| # Only report potential regressions with high confidence.
|
| if is_same_direction and confidence > 50:
|
| - other_regressions.append([current_id, previous_id, confidence])
|
| + other_regressions.append([revision_state, prev_state, confidence])
|
| previous_values.append(current_values)
|
| - previous_id = current_id
|
| + prev_state = revision_state
|
| return other_regressions
|
|
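| The scan pools every value seen so far, so with states ordered from the bad
| end to the good end it flags boundaries like this one (illustrative values;
| the confidence must exceed 50 for the boundary to be reported):
|
|     previous_values = [[15.0, 15.1], [15.2, 15.0]]  # runs on the bad side
|     current_values = [10.0, 10.1]   # first run where the metric dropped
|     # bad_greater_than_good holds and prev mean (15.075) > current mean
|     # (10.05), so this boundary is recorded as a potential regression.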
|
| - def GetResultsDict(self):
|
| - """Prepares and returns information about the final resulsts as a dict.
|
| -
|
| - Returns:
|
| - A dictionary with the following fields
|
| -
|
| - 'first_working_revision': First good revision.
|
| - 'last_broken_revision': Last bad revision.
|
| - 'culprit_revisions': A list of revisions, which contain the bad change
|
| - introducing the failure.
|
| - 'other_regressions': A list of tuples representing other regressions,
|
| - which may have occured.
|
| - 'regression_size': For performance bisects, this is a relative change of
|
| - the mean metric value. For other bisects this field always contains
|
| - 'zero-to-nonzero'.
|
| - 'regression_std_err': For performance bisects, it is a pooled standard
|
| - error for groups of good and bad runs. Not used for other bisects.
|
| - 'confidence': For performance bisects, it is a confidence that the good
|
| - and bad runs are distinct groups. Not used for non-performance
|
| - bisects.
|
| - 'revision_data_sorted': dict mapping revision ids to data about that
|
| - revision. Each piece of revision data consists of a dict with the
|
| - following keys:
|
| -
|
| - 'passed': Represents whether the performance test was successful at
|
| - that revision. Possible values include: 1 (passed), 0 (failed),
|
| - '?' (skipped), 'F' (build failed).
|
| - 'depot': The depot that this revision is from (i.e. WebKit)
|
| - 'external': If the revision is a 'src' revision, 'external' contains
|
| - the revisions of each of the external libraries.
|
| - 'sort': A sort value for sorting the dict in order of commits.
|
| -
|
| - For example:
|
| - {
|
| - 'CL #1':
|
| - {
|
| - 'passed': False,
|
| - 'depot': 'chromium',
|
| - 'external': None,
|
| - 'sort': 0
|
| - }
|
| - }
|
| - """
|
| - revision_data_sorted = sorted(self.revision_data.iteritems(),
|
| - key = lambda x: x[1]['sort'])
|
| -
|
| - # Find range where it possibly broke.
|
| + @staticmethod
|
| + def FindBreakingRevRange(revision_states):
|
| first_working_revision = None
|
| - first_working_revision_index = -1
|
| last_broken_revision = None
|
| - last_broken_revision_index = -1
|
| +
|
| + for revision_state in revision_states:
|
| + if revision_state.passed == 1 and not first_working_revision:
|
| + first_working_revision = revision_state
|
| +
|
| + if not revision_state.passed:
|
| + last_broken_revision = revision_state
|
| +
|
| + return first_working_revision, last_broken_revision
|
| +
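| Because the revision states run from the bad end of the range to the good
| end, the first passing state and the last failing state bracket the culprit
| range. A minimal sketch with a namedtuple standing in for RevisionState:
|
|     import collections
|     FakeState = collections.namedtuple('FakeState', 'revision passed')
|     states = [FakeState('r4', 0), FakeState('r3', 0),  # broken (bad end)
|               FakeState('r2', 1), FakeState('r1', 1)]  # passing (good end)
|     first_working, last_broken = BisectResults.FindBreakingRevRange(states)
|     # first_working.revision == 'r2'; last_broken.revision == 'r3'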
|
| + @staticmethod
|
| + def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
|
| + last_broken_rev):
|
| + cwd = os.getcwd()
|
|
|
| culprit_revisions = []
|
| - other_regressions = []
|
| - regression_size = 0.0
|
| - regression_std_err = 0.0
|
| - confidence = 0.0
|
| -
|
| - for i in xrange(len(revision_data_sorted)):
|
| - k, v = revision_data_sorted[i]
|
| - if v['passed'] == 1:
|
| - if not first_working_revision:
|
| - first_working_revision = k
|
| - first_working_revision_index = i
|
| -
|
| - if not v['passed']:
|
| - last_broken_revision = k
|
| - last_broken_revision_index = i
|
| -
|
| - if last_broken_revision != None and first_working_revision != None:
|
| - broken_means = []
|
| - for i in xrange(0, last_broken_revision_index + 1):
|
| - if revision_data_sorted[i][1]['value']:
|
| - broken_means.append(revision_data_sorted[i][1]['value']['values'])
|
| -
|
| - working_means = []
|
| - for i in xrange(first_working_revision_index, len(revision_data_sorted)):
|
| - if revision_data_sorted[i][1]['value']:
|
| - working_means.append(revision_data_sorted[i][1]['value']['values'])
|
| -
|
| - # Flatten the lists to calculate mean of all values.
|
| - working_mean = sum(working_means, [])
|
| - broken_mean = sum(broken_means, [])
|
| -
|
| - # Calculate the approximate size of the regression
|
| - mean_of_bad_runs = math_utils.Mean(broken_mean)
|
| - mean_of_good_runs = math_utils.Mean(working_mean)
|
| -
|
| - regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
|
| + for i in xrange(last_broken_rev.index, first_working_rev.index):
|
| + depot_registry.ChangeToDepotDir(revision_states[i].depot)
|
| + info = source_control.QueryRevisionInfo(revision_states[i].revision)
|
| + culprit_revisions.append((revision_states[i].revision, info,
|
| + revision_states[i].depot))
|
| +
|
| + os.chdir(cwd)
|
| + return culprit_revisions
|
| +
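| Continuing the FindBreakingRevRange sketch above, and assuming each state's
| index attribute matches its position in the list, the loop walks
|
|     revision_states[last_broken_rev.index : first_working_rev.index]
|
| i.e. every still-suspect revision from the last broken one up to, but not
| including, the first working one ('r3' alone in that sketch), querying each
| commit's details from its own depot directory.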
|
| + @classmethod
|
| + def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
|
| + last_broken_rev):
|
| + # TODO(sergiyb): We assume that value has "values" key, which may not be
|
| + # the case for failure-bisects, where there is a single value only.
|
| + broken_means = [state.value['values']
|
| + for state in rev_states[:last_broken_rev.index+1]
|
| + if state.value]
|
| +
|
| + working_means = [state.value['values']
|
| + for state in rev_states[first_working_rev.index:]
|
| + if state.value]
|
| +
|
| + # Flatten the lists to calculate mean of all values.
|
| + working_mean = sum(working_means, [])
|
| + broken_mean = sum(broken_means, [])
|
| +
|
| + # Calculate the approximate size of the regression
|
| + mean_of_bad_runs = math_utils.Mean(broken_mean)
|
| + mean_of_good_runs = math_utils.Mean(working_mean)
|
| +
|
| + regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
|
| mean_of_bad_runs)
|
| - if math.isnan(regression_size):
|
| - regression_size = 'zero-to-nonzero'
|
| -
|
| - regression_std_err = math.fabs(math_utils.PooledStandardError(
|
| - [working_mean, broken_mean]) /
|
| - max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
|
| -
|
| - # Give a "confidence" in the bisect. At the moment we use how distinct the
|
| - # values are before and after the last broken revision, and how noisy the
|
| - # overall graph is.
|
| - confidence = ConfidenceScore(working_means, broken_means)
|
| -
|
| - culprit_revisions = []
|
| -
|
| - cwd = os.getcwd()
|
| - self._depot_registry.ChangeToDepotDir(
|
| - self.revision_data[last_broken_revision]['depot'])
|
| -
|
| - if self.revision_data[last_broken_revision]['depot'] == 'cros':
|
| - # Want to get a list of all the commits and what depots they belong
|
| - # to so that we can grab info about each.
|
| - cmd = ['repo', 'forall', '-c',
|
| - 'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
|
| - last_broken_revision, first_working_revision + 1)]
|
| - output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
|
| -
|
| - changes = []
|
| - assert not return_code, ('An error occurred while running '
|
| - '"%s"' % ' '.join(cmd))
|
| - last_depot = None
|
| - cwd = os.getcwd()
|
| - for l in output.split('\n'):
|
| - if l:
|
| - # Output will be in form:
|
| - # /path_to_depot
|
| - # /path_to_other_depot
|
| - # <SHA1>
|
| - # /path_again
|
| - # <SHA1>
|
| - # etc.
|
| - if l[0] == '/':
|
| - last_depot = l
|
| - else:
|
| - contents = l.split(' ')
|
| - if len(contents) > 1:
|
| - changes.append([last_depot, contents[0]])
|
| - for c in changes:
|
| - os.chdir(c[0])
|
| - info = source_control.QueryRevisionInfo(c[1])
|
| - culprit_revisions.append((c[1], info, None))
|
| - else:
|
| - for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
|
| - k, v = revision_data_sorted[i]
|
| - if k == first_working_revision:
|
| - break
|
| - self._depot_registry.ChangeToDepotDir(v['depot'])
|
| - info = source_control.QueryRevisionInfo(k)
|
| - culprit_revisions.append((k, info, v['depot']))
|
| - os.chdir(cwd)
|
| -
|
| - # Check for any other possible regression ranges.
|
| - other_regressions = self._FindOtherRegressions(
|
| - revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
|
| -
|
| - return {
|
| - 'first_working_revision': first_working_revision,
|
| - 'last_broken_revision': last_broken_revision,
|
| - 'culprit_revisions': culprit_revisions,
|
| - 'other_regressions': other_regressions,
|
| - 'regression_size': regression_size,
|
| - 'regression_std_err': regression_std_err,
|
| - 'confidence': confidence,
|
| - 'revision_data_sorted': revision_data_sorted
|
| - }
|
| + if math.isnan(regression_size):
|
| + regression_size = 'zero-to-nonzero'
|
| +
|
| + regression_std_err = math.fabs(math_utils.PooledStandardError(
|
| + [working_mean, broken_mean]) /
|
| + max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
|
| +
|
| + # Give a "confidence" in the bisect. At the moment we use how distinct the
|
| + # values are before and after the last broken revision, and how noisy the
|
| + # overall graph is.
|
| + confidence = cls.ConfidenceScore(working_means, broken_means)
|
| +
|
| + bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
|
| +
|
| + return {'regression_size': regression_size,
|
| + 'regression_std_err': regression_std_err,
|
| + 'confidence': confidence,
|
| + 'bad_greater_than_good': bad_greater_than_good}
|
|
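| A worked example of the statistics above, under the assumption that
| math_utils.RelativeChange(before, after) computes abs(after - before) /
| before and returns NaN when before is zero:
|
|     # Good runs averaging 10.0 and bad runs averaging 15.0 give:
|     #   regression_size = 100 * abs(15.0 - 10.0) / 10.0 = 50.0  (percent)
|     #   bad_greater_than_good = True
|     # regression_std_err expresses the pooled standard error of the two
|     # groups as a percentage of the smaller mean (floored at 0.0001).
|     # If every good run were 0.0, RelativeChange yields NaN and the size
|     # is reported as the string 'zero-to-nonzero' instead.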
|