| Index: tools/auto_bisect/bisect_results.py
|
| diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py
|
| index 59cb94d7db07fa024ab389f283bf9f5eb8779f46..9bcaeb683a0fc6d146d3de043afba0e2148c46a9 100644
|
| --- a/tools/auto_bisect/bisect_results.py
|
| +++ b/tools/auto_bisect/bisect_results.py
|
| @@ -11,55 +11,152 @@ import source_control
|
| import ttest
|
|
|
|
|
| -def ConfidenceScore(good_results_lists, bad_results_lists):
|
| - """Calculates a confidence score.
|
| +class BisectResults(object):
|
| + """Contains results of the completed bisect.
|
| +
|
| + Properties:
|
| + error: Error message if the bisect failed.
|
| +
|
| + If the error is None, the following properties are present:
|
| + warnings: List of warnings from the bisect run.
|
| + state: BisectState object from which these results were generated.
|
| + first_working_revision: First good revision.
|
| + last_broken_revision: Last bad revision.
|
| +
|
| + If both of the above revisions are not None, the following properties are present:
|
| + culprit_revisions: A list of revisions containing the bad change that
|
| + introduced the failure.
|
| + other_regressions: A list of tuples representing other regressions that
|
| + may have occurred.
|
| + regression_size: For performance bisects, this is the relative change of
|
| + the mean metric value. For other bisects this field always contains
|
| + 'zero-to-nonzero'.
|
| + regression_std_err: For performance bisects, the pooled standard error
|
| + for the groups of good and bad runs. Not used for other bisects.
|
| + confidence: For performance bisects, the confidence that the good and
|
| + bad runs are distinct groups. Not used for non-performance bisects.
|
| + """
|
|
|
| - This score is a percentage which represents our degree of confidence in the
|
| - proposition that the good results and bad results are distinct groups, and
|
| - their differences aren't due to chance alone.
|
| + def __init__(self, bisect_state=None, depot_registry=None, opts=None,
|
| + runtime_warnings=None, error=None):
|
| + """Computes final bisect results after a bisect run is complete.
|
|
|
| + This constructor should be called in one of the following ways:
|
| + BisectResults(state, depot_registry, opts, runtime_warnings)
|
| + BisectResults(error=error)
|
|
|
| - Args:
|
| - good_results_lists: A list of lists of "good" result numbers.
|
| - bad_results_lists: A list of lists of "bad" result numbers.
|
| + The first form creates an object representing successful bisect results,
|
| + while the second creates an error result.
|
|
|
| - Returns:
|
| - A number in the range [0, 100].
|
| - """
|
| - # If there's only one item in either list, this means only one revision was
|
| - # classified good or bad; this isn't good enough evidence to make a decision.
|
| - # If an empty list was passed, that also implies zero confidence.
|
| - if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
|
| - return 0.0
|
| + Args:
|
| + bisect_state: BisectState object representing latest bisect state.
|
| + depot_registry: DepotDirectoryRegistry object with information on each
|
| + repository in the bisect_state.
|
| + opts: Options passed to the bisect run.
|
| + runtime_warnings: A list of warnings from the bisect run.
|
| + error: Error message. When error is not None, other arguments are ignored.
|
| + """
|
|
|
| - # Flatten the lists of results lists.
|
| - sample1 = sum(good_results_lists, [])
|
| - sample2 = sum(bad_results_lists, [])
|
| + self.error = error
|
| + if error is not None:
|
| + return
|
|
|
| - # If there were only empty lists in either of the lists (this is unexpected
|
| - # and normally shouldn't happen), then we also want to return 0.
|
| - if not sample1 or not sample2:
|
| - return 0.0
|
| + assert (bisect_state is not None and depot_registry is not None and
|
| + opts is not None and runtime_warnings is not None), (
|
| + 'Incorrect use of the BisectResults constructor. When error is '
|
| + 'None, all other arguments are required')
|
|
|
| - # The p-value is approximately the probability of obtaining the given set
|
| - # of good and bad values just by chance.
|
| - _, _, p_value = ttest.WelchsTTest(sample1, sample2)
|
| - return 100.0 * (1.0 - p_value)
|
| + self.state = bisect_state
|
|
|
| + rev_states = bisect_state.GetRevisionStates()
|
| + first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
|
| + self.first_working_revision = first_working_rev
|
| + self.last_broken_revision = last_broken_rev
|
|
|
| -class BisectResults(object):
|
| + if first_working_rev is not None and last_broken_rev is not None:
|
| + statistics = self._ComputeRegressionStatistics(
|
| + rev_states, first_working_rev, last_broken_rev)
|
| +
|
| + self.regression_size = statistics['regression_size']
|
| + self.regression_std_err = statistics['regression_std_err']
|
| + self.confidence = statistics['confidence']
|
| +
|
| + self.culprit_revisions = self._FindCulpritRevisions(
|
| + rev_states, depot_registry, first_working_rev, last_broken_rev)
|
| +
|
| + self.other_regressions = self._FindOtherRegressions(
|
| + rev_states, statistics['bad_greater_than_good'])
|
|
|
| - def __init__(self, depot_registry):
|
| - self._depot_registry = depot_registry
|
| - self.revision_data = {}
|
| - self.error = None
|
| + self.warnings = runtime_warnings + self._GetResultBasedWarnings(
|
| + self.culprit_revisions, opts, self.confidence)
|
|
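| Usage sketch: the two supported construction forms, with hypothetical
| state, registry and opts stand-ins for the objects a real bisect run
| would supply:
|
|     # Successful run: statistics, culprits and warnings get computed.
|     results = BisectResults(state, registry, opts, runtime_warnings=[])
|     if (results.first_working_revision is not None and
|         results.last_broken_revision is not None):
|       print results.confidence
|
|     # Failed run: only the error message is recorded.
|     results = BisectResults(error='Failed to build revision abc123.')
|     assert results.error is not None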
|
| @staticmethod
|
| - def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
|
| + def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
|
| + warnings = []
|
| + if len(culprit_revisions) > 1:
|
| + warnings.append('Due to build errors, the regression range could '
|
| + 'not be narrowed down to a single commit.')
|
| + if opts.repeat_test_count == 1:
|
| + warnings.append('Tests were only set to run once. This may '
|
| + 'be insufficient to get meaningful results.')
|
| + if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
|
| + warnings.append('Confidence is not high. Try bisecting again '
|
| + 'with an increased repeat_test_count, a larger '
|
| + 'revision range, or another metric.')
|
| + if not confidence:
|
| + warnings.append('Confidence score is 0%. Try bisecting again on '
|
| + 'another platform or another metric.')
|
| + return warnings
|
| +
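| For illustration, a low-confidence run whose test executed only once would
| collect two of the warnings above (FakeOpts is a hypothetical stand-in for
| the bisect options; HIGH_CONFIDENCE is the threshold from bisect_utils):
|
|     class FakeOpts(object):
|       repeat_test_count = 1  # triggers the "only set to run once" warning
|
|     warnings = BisectResults._GetResultBasedWarnings(
|         culprit_revisions=[('abc123', {}, 'chromium')],
|         opts=FakeOpts(), confidence=42.0)
|     # 0 < 42.0 < bisect_utils.HIGH_CONFIDENCE adds the "not high" warning.
|     assert len(warnings) == 2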
|
| + @staticmethod
|
| + def ConfidenceScore(good_results_lists, bad_results_lists,
|
| + accept_single_bad_or_good=False):
|
| + """Calculates a confidence score.
|
| +
|
| + This score is a percentage which represents our degree of confidence in the
|
| + proposition that the good results and bad results are distinct groups, and
|
| + their differences aren't due to chance alone.
|
| +
|
| +
|
| + Args:
|
| + good_results_lists: A list of lists of "good" result numbers.
|
| + bad_results_lists: A list of lists of "bad" result numbers.
|
| + accept_single_bad_or_good: If True, the confidence is computed even when
|
| + there is only one good or one bad revision; otherwise a single good or
|
| + bad revision always yields 0.0 confidence. This flag will likely go away
|
| + once the bisect range is expanded by one more revision in such cases.
|
| +
|
| + Returns:
|
| + A number in the range [0, 100].
|
| + """
|
| + # If there's only one item in either list, this means only one revision was
|
| + # classified good or bad; this isn't good enough evidence to make a
|
| + # decision. If an empty list was passed, that also implies zero confidence.
|
| + if not accept_single_bad_or_good:
|
| + if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
|
| + return 0.0
|
| +
|
| + # Flatten the lists of results lists.
|
| + sample1 = sum(good_results_lists, [])
|
| + sample2 = sum(bad_results_lists, [])
|
| +
|
| + # If there were only empty lists in either of the lists (this is unexpected
|
| + # and normally shouldn't happen), then we also want to return 0.
|
| + if not sample1 or not sample2:
|
| + return 0.0
|
| +
|
| + # The p-value is approximately the probability of obtaining the given set
|
| + # of good and bad values just by chance.
|
| + _, _, p_value = ttest.WelchsTTest(sample1, sample2)
|
| + return 100.0 * (1.0 - p_value)
|
| +
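| A sketch of the expected behaviour, assuming ttest.WelchsTTest returns a
| (t_statistic, degrees_of_freedom, p_value) tuple as the unpacking above
| implies:
|
|     good = [[10.0, 10.1, 10.2], [10.1, 10.0, 10.2]]
|     bad = [[15.0, 15.2, 15.1], [15.3, 15.1, 15.0]]
|     BisectResults.ConfidenceScore(good, bad)      # close to 100.0
|
|     # A single good (or bad) results list yields 0.0 by default...
|     BisectResults.ConfidenceScore(good[:1], bad)  # 0.0
|     # ...unless explicitly accepted, as _FindOtherRegressions does below.
|     BisectResults.ConfidenceScore(good[:1], bad,
|                                   accept_single_bad_or_good=True)  # > 0.0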
|
| + @classmethod
|
| + def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
|
| """Compiles a list of other possible regressions from the revision data.
|
|
|
| Args:
|
| - revision_data_sorted: Sorted list of (revision, revision data) pairs.
|
| + revision_states: Sorted list of RevisionState objects.
|
| bad_greater_than_good: Whether the result value at the "bad" revision is
|
| numerically greater than the result value at the "good" revision.
|
|
|
| @@ -69,13 +166,13 @@ class BisectResults(object):
|
| """
|
| other_regressions = []
|
| previous_values = []
|
| - previous_id = None
|
| - for current_id, current_data in revision_data_sorted:
|
| - current_values = current_data['value']
|
| - if current_values:
|
| - current_values = current_values['values']
|
| + prev_state = None
|
| + for revision_state in revision_states:
|
| + if revision_state.value:
|
| + current_values = revision_state.value['values']
|
| if previous_values:
|
| - confidence = ConfidenceScore(previous_values, [current_values])
|
| + confidence = cls.ConfidenceScore(previous_values, [current_values],
|
| + accept_single_bad_or_good=True)
|
| mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
|
| mean_of_current_runs = math_utils.Mean(current_values)
|
|
|
| @@ -83,178 +180,84 @@ class BisectResults(object):
|
| - # the overall regression. If the mean of the previous runs < the
|
| - # mean of the current runs, this local regression is in same
|
| - # direction.
|
| + # the overall regression. Since the revision states are ordered from
|
| + # the bad end to the good end, a local regression in the same direction
|
| + # means the mean of the previous runs > the mean of the current runs.
|
| - prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
|
| - is_same_direction = (prev_less_than_current if
|
| - bad_greater_than_good else not prev_less_than_current)
|
| + prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
|
| + is_same_direction = (prev_greater_than_current if
|
| + bad_greater_than_good else not prev_greater_than_current)
|
|
|
| # Only report potential regressions with high confidence.
|
| if is_same_direction and confidence > 50:
|
| - other_regressions.append([current_id, previous_id, confidence])
|
| + other_regressions.append([revision_state, prev_state, confidence])
|
| previous_values.append(current_values)
|
| - previous_id = current_id
|
| + prev_state = revision_state
|
| return other_regressions
|
|
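| The scan pools every value seen so far, so with states ordered from the bad
| end to the good end it flags boundaries like this one (illustrative values;
| the confidence must exceed 50 for the boundary to be reported):
|
|     previous_values = [[15.0, 15.1], [15.2, 15.0]]  # runs on the bad side
|     current_values = [10.0, 10.1]   # first run where the metric dropped
|     # bad_greater_than_good holds and prev mean (15.075) > current mean
|     # (10.05), so this boundary is recorded as a potential regression.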
|
| - def GetResultsDict(self):
|
| - """Prepares and returns information about the final resulsts as a dict.
|
| -
|
| - Returns:
|
| - A dictionary with the following fields
|
| -
|
| - 'first_working_revision': First good revision.
|
| - 'last_broken_revision': Last bad revision.
|
| - 'culprit_revisions': A list of revisions, which contain the bad change
|
| - introducing the failure.
|
| - 'other_regressions': A list of tuples representing other regressions,
|
| - which may have occured.
|
| - 'regression_size': For performance bisects, this is a relative change of
|
| - the mean metric value. For other bisects this field always contains
|
| - 'zero-to-nonzero'.
|
| - 'regression_std_err': For performance bisects, it is a pooled standard
|
| - error for groups of good and bad runs. Not used for other bisects.
|
| - 'confidence': For performance bisects, it is a confidence that the good
|
| - and bad runs are distinct groups. Not used for non-performance
|
| - bisects.
|
| - 'revision_data_sorted': dict mapping revision ids to data about that
|
| - revision. Each piece of revision data consists of a dict with the
|
| - following keys:
|
| -
|
| - 'passed': Represents whether the performance test was successful at
|
| - that revision. Possible values include: 1 (passed), 0 (failed),
|
| - '?' (skipped), 'F' (build failed).
|
| - 'depot': The depot that this revision is from (i.e. WebKit)
|
| - 'external': If the revision is a 'src' revision, 'external' contains
|
| - the revisions of each of the external libraries.
|
| - 'sort': A sort value for sorting the dict in order of commits.
|
| -
|
| - For example:
|
| - {
|
| - 'CL #1':
|
| - {
|
| - 'passed': False,
|
| - 'depot': 'chromium',
|
| - 'external': None,
|
| - 'sort': 0
|
| - }
|
| - }
|
| - """
|
| - revision_data_sorted = sorted(self.revision_data.iteritems(),
|
| - key = lambda x: x[1]['sort'])
|
| -
|
| - # Find range where it possibly broke.
|
| + @staticmethod
|
| + def FindBreakingRevRange(revision_states):
|
| first_working_revision = None
|
| - first_working_revision_index = -1
|
| last_broken_revision = None
|
| - last_broken_revision_index = -1
|
| +
|
| + for revision_state in revision_states:
|
| + if revision_state.passed == 1 and not first_working_revision:
|
| + first_working_revision = revision_state
|
| +
|
| + if not revision_state.passed:
|
| + last_broken_revision = revision_state
|
| +
|
| + return first_working_revision, last_broken_revision
|
| +
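| Because the revision states run from the bad end of the range to the good
| end, the first passing state and the last failing state bracket the culprit
| range. A minimal sketch with a namedtuple standing in for RevisionState:
|
|     import collections
|     FakeState = collections.namedtuple('FakeState', 'revision passed')
|     states = [FakeState('r4', 0), FakeState('r3', 0),  # broken (bad end)
|               FakeState('r2', 1), FakeState('r1', 1)]  # passing (good end)
|     first_working, last_broken = BisectResults.FindBreakingRevRange(states)
|     # first_working.revision == 'r2'; last_broken.revision == 'r3'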
|
| + @staticmethod
|
| + def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
|
| + last_broken_rev):
|
| + cwd = os.getcwd()
|
|
|
| culprit_revisions = []
|
| - other_regressions = []
|
| - regression_size = 0.0
|
| - regression_std_err = 0.0
|
| - confidence = 0.0
|
| -
|
| - for i in xrange(len(revision_data_sorted)):
|
| - k, v = revision_data_sorted[i]
|
| - if v['passed'] == 1:
|
| - if not first_working_revision:
|
| - first_working_revision = k
|
| - first_working_revision_index = i
|
| -
|
| - if not v['passed']:
|
| - last_broken_revision = k
|
| - last_broken_revision_index = i
|
| -
|
| - if last_broken_revision != None and first_working_revision != None:
|
| - broken_means = []
|
| - for i in xrange(0, last_broken_revision_index + 1):
|
| - if revision_data_sorted[i][1]['value']:
|
| - broken_means.append(revision_data_sorted[i][1]['value']['values'])
|
| -
|
| - working_means = []
|
| - for i in xrange(first_working_revision_index, len(revision_data_sorted)):
|
| - if revision_data_sorted[i][1]['value']:
|
| - working_means.append(revision_data_sorted[i][1]['value']['values'])
|
| -
|
| - # Flatten the lists to calculate mean of all values.
|
| - working_mean = sum(working_means, [])
|
| - broken_mean = sum(broken_means, [])
|
| -
|
| - # Calculate the approximate size of the regression
|
| - mean_of_bad_runs = math_utils.Mean(broken_mean)
|
| - mean_of_good_runs = math_utils.Mean(working_mean)
|
| -
|
| - regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
|
| + for i in xrange(last_broken_rev.index, first_working_rev.index):
|
| + depot_registry.ChangeToDepotDir(revision_states[i].depot)
|
| + info = source_control.QueryRevisionInfo(revision_states[i].revision)
|
| + culprit_revisions.append((revision_states[i].revision, info,
|
| + revision_states[i].depot))
|
| +
|
| + os.chdir(cwd)
|
| + return culprit_revisions
|
| +
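| Continuing the FindBreakingRevRange sketch above, and assuming each state's
| index attribute matches its position in the list, the loop walks
|
|     revision_states[last_broken_rev.index : first_working_rev.index]
|
| i.e. every still-suspect revision from the last broken one up to, but not
| including, the first working one ('r3' alone in that sketch), querying each
| commit's details from its own depot directory.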
|
| + @classmethod
|
| + def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
|
| + last_broken_rev):
|
| + # TODO(sergiyb): We assume that value has "values" key, which may not be
|
| + # the case for failure-bisects, where there is a single value only.
|
| + broken_means = [state.value['values']
|
| + for state in rev_states[:last_broken_rev.index+1]
|
| + if state.value]
|
| +
|
| + working_means = [state.value['values']
|
| + for state in rev_states[first_working_rev.index:]
|
| + if state.value]
|
| +
|
| + # Flatten the lists to calculate mean of all values.
|
| + working_mean = sum(working_means, [])
|
| + broken_mean = sum(broken_means, [])
|
| +
|
| + # Calculate the approximate size of the regression
|
| + mean_of_bad_runs = math_utils.Mean(broken_mean)
|
| + mean_of_good_runs = math_utils.Mean(working_mean)
|
| +
|
| + regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
|
| mean_of_bad_runs)
|
| - if math.isnan(regression_size):
|
| - regression_size = 'zero-to-nonzero'
|
| -
|
| - regression_std_err = math.fabs(math_utils.PooledStandardError(
|
| - [working_mean, broken_mean]) /
|
| - max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
|
| -
|
| - # Give a "confidence" in the bisect. At the moment we use how distinct the
|
| - # values are before and after the last broken revision, and how noisy the
|
| - # overall graph is.
|
| - confidence = ConfidenceScore(working_means, broken_means)
|
| -
|
| - culprit_revisions = []
|
| -
|
| - cwd = os.getcwd()
|
| - self._depot_registry.ChangeToDepotDir(
|
| - self.revision_data[last_broken_revision]['depot'])
|
| -
|
| - if self.revision_data[last_broken_revision]['depot'] == 'cros':
|
| - # Want to get a list of all the commits and what depots they belong
|
| - # to so that we can grab info about each.
|
| - cmd = ['repo', 'forall', '-c',
|
| - 'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
|
| - last_broken_revision, first_working_revision + 1)]
|
| - output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
|
| -
|
| - changes = []
|
| - assert not return_code, ('An error occurred while running '
|
| - '"%s"' % ' '.join(cmd))
|
| - last_depot = None
|
| - cwd = os.getcwd()
|
| - for l in output.split('\n'):
|
| - if l:
|
| - # Output will be in form:
|
| - # /path_to_depot
|
| - # /path_to_other_depot
|
| - # <SHA1>
|
| - # /path_again
|
| - # <SHA1>
|
| - # etc.
|
| - if l[0] == '/':
|
| - last_depot = l
|
| - else:
|
| - contents = l.split(' ')
|
| - if len(contents) > 1:
|
| - changes.append([last_depot, contents[0]])
|
| - for c in changes:
|
| - os.chdir(c[0])
|
| - info = source_control.QueryRevisionInfo(c[1])
|
| - culprit_revisions.append((c[1], info, None))
|
| - else:
|
| - for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
|
| - k, v = revision_data_sorted[i]
|
| - if k == first_working_revision:
|
| - break
|
| - self._depot_registry.ChangeToDepotDir(v['depot'])
|
| - info = source_control.QueryRevisionInfo(k)
|
| - culprit_revisions.append((k, info, v['depot']))
|
| - os.chdir(cwd)
|
| -
|
| - # Check for any other possible regression ranges.
|
| - other_regressions = self._FindOtherRegressions(
|
| - revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
|
| -
|
| - return {
|
| - 'first_working_revision': first_working_revision,
|
| - 'last_broken_revision': last_broken_revision,
|
| - 'culprit_revisions': culprit_revisions,
|
| - 'other_regressions': other_regressions,
|
| - 'regression_size': regression_size,
|
| - 'regression_std_err': regression_std_err,
|
| - 'confidence': confidence,
|
| - 'revision_data_sorted': revision_data_sorted
|
| - }
|
| + if math.isnan(regression_size):
|
| + regression_size = 'zero-to-nonzero'
|
| +
|
| + regression_std_err = math.fabs(math_utils.PooledStandardError(
|
| + [working_mean, broken_mean]) /
|
| + max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
|
| +
|
| + # Give a "confidence" in the bisect. At the moment we use how distinct the
|
| + # values are before and after the last broken revision, and how noisy the
|
| + # overall graph is.
|
| + confidence = cls.ConfidenceScore(working_means, broken_means)
|
| +
|
| + bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
|
| +
|
| + return {'regression_size': regression_size,
|
| + 'regression_std_err': regression_std_err,
|
| + 'confidence': confidence,
|
| + 'bad_greater_than_good': bad_greater_than_good}
|
|
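| A worked example of the statistics above, under the assumption that
| math_utils.RelativeChange(before, after) computes abs(after - before) /
| before and returns NaN when before is zero:
|
|     # Good runs averaging 10.0 and bad runs averaging 15.0 give:
|     #   regression_size = 100 * abs(15.0 - 10.0) / 10.0 = 50.0  (percent)
|     #   bad_greater_than_good = True
|     # regression_std_err expresses the pooled standard error of the two
|     # groups as a percentage of the smaller mean (floored at 0.0001).
|     # If every good run were 0.0, RelativeChange yields NaN and the size
|     # is reported as the string 'zero-to-nonzero' instead.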
|