Chromium Code Reviews

Unified Diff: tools/auto_bisect/bisect_results.py

Issue 554283003: Refactored bisect results dicts into a separate class (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed comments Created 6 years, 3 months ago
Index: tools/auto_bisect/bisect_results.py
diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..144962fd7038fa516f57b955830031fc8b5aeedb
--- /dev/null
+++ b/tools/auto_bisect/bisect_results.py
@@ -0,0 +1,260 @@
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import math
+import os
+
+import bisect_utils
+import math_utils
+import ttest
+
+
+def ConfidenceScore(good_results_lists, bad_results_lists):
+  """Calculates a confidence score.
+
+  This score is a percentage which represents our degree of confidence in the
+  proposition that the good results and bad results are distinct groups, and
+  their differences aren't due to chance alone.
+
+  Args:
+    good_results_lists: A list of lists of "good" result numbers.
+    bad_results_lists: A list of lists of "bad" result numbers.
+
+  Returns:
+    A number in the range [0, 100].
+  """
+  # If there's only one item in either list, then only one revision was
+  # classified good or bad; this isn't good enough evidence to make a
+  # decision. If an empty list was passed, that also implies zero confidence.
+  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
+    return 0.0
+
+  # Flatten the lists of results lists.
+  sample1 = sum(good_results_lists, [])
+  sample2 = sum(bad_results_lists, [])
+
+  # If either sample contains only empty lists (this is unexpected and
+  # normally shouldn't happen), we also return zero confidence.
+  if not sample1 or not sample2:
+    return 0.0
+
+  # The p-value is approximately the probability of obtaining the given sets
+  # of good and bad values just by chance.
+  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
+  return 100.0 * (1.0 - p_value)
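As a quick sanity check of how this behaves, a sketch with invented numbers, assuming it runs inside tools/auto_bisect so the ttest import resolves:

good = [[2.0, 2.1, 2.0], [2.1, 2.0, 2.2]]  # results from two "good" revisions
bad = [[3.0, 3.1, 2.9], [3.0, 3.2, 3.1]]   # results from two "bad" revisions
print ConfidenceScore(good, bad)  # close to 100: clearly distinct groups
print ConfidenceScore(good, [])   # 0.0: an empty side means no evidence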
+
+
+class BisectResults(object):
+
+  def __init__(self, depot_registry, source_control):
+    self._depot_registry = depot_registry
+    self.revision_data = {}
+    self.error = None
+    self._source_control = source_control
+
+  @staticmethod
+  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
+    """Compiles a list of other possible regressions from the revision data.
+
+    Args:
+      revision_data_sorted: Sorted list of (revision, revision data) pairs.
+      bad_greater_than_good: Whether the result value at the "bad" revision is
+          numerically greater than the result value at the "good" revision.
+
+    Returns:
+      A list of [current_rev, previous_rev, confidence] for other places where
+      there may have been a regression.
+    """
+    other_regressions = []
+    previous_values = []
+    previous_id = None
+    for current_id, current_data in revision_data_sorted:
+      current_values = current_data['value']
+      if current_values:
+        current_values = current_values['values']
+        if previous_values:
+          confidence = ConfidenceScore(previous_values, [current_values])
+          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
+          mean_of_current_runs = math_utils.Mean(current_values)
+
+          # Check that the potential regression is in the same direction as
+          # the overall regression. If the mean of the previous runs < the
+          # mean of the current runs, this local regression is in the same
+          # direction.
+          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
+          is_same_direction = (prev_less_than_current if
+                               bad_greater_than_good else
+                               not prev_less_than_current)
+
+          # Only report potential regressions with high confidence.
+          if is_same_direction and confidence > 50:
+            other_regressions.append([current_id, previous_id, confidence])
+        previous_values.append(current_values)
+        previous_id = current_id
+    return other_regressions
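The structure this walks over is easiest to see with a concrete input; the revision ids and values below are invented, and the sketch assumes it runs inside tools/auto_bisect so math_utils and ttest import cleanly:

revision_data_sorted = [
    ('rev_a', {'value': {'values': [2.0, 2.1, 2.0]}, 'sort': 0}),
    ('rev_b', {'value': {'values': [2.1, 2.0, 2.2]}, 'sort': 1}),
    ('rev_c', {'value': {'values': [3.5, 3.6, 3.4]}, 'sort': 2}),
]
# Any flagged range comes back as [current_rev, previous_rev, confidence].
print BisectResults._FindOtherRegressions(revision_data_sorted, True)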
+
+  def GetResultsDict(self):
+    """Prepares and returns information about the final results as a dict.
+
+    Returns:
+      A dictionary with the following fields:
+
+      'first_working_revision': First good revision.
+      'last_broken_revision': Last bad revision.
+      'culprit_revisions': A list of revisions suspected of containing the
+          change that introduced the regression.
+      'other_regressions': A list of tuples representing other regressions
+          that may have occurred.
+      'regression_size': For performance bisects, this is the relative change
+          of the mean metric value. For other bisects this field always
+          contains 'zero-to-nonzero'.
+      'regression_std_err': For performance bisects, this is the pooled
+          standard error for the groups of good and bad runs. Not used for
+          other bisects.
+      'confidence': For performance bisects, this is a confidence that the
+          good and bad runs are distinct groups. Not used for non-performance
+          bisects.
+      'revision_data_sorted': A list of (revision id, revision data) pairs,
+          sorted in order of commits. Each revision data dict has the
+          following keys:
+
+          'passed': Whether the performance test was successful at that
+              revision. Possible values: 1 (passed), 0 (failed),
+              '?' (skipped), 'F' (build failed).
+          'depot': The depot that this revision is from (e.g. 'chromium').
+          'external': If the revision is a 'src' revision, 'external'
+              contains the revisions of each of the external libraries.
+          'sort': A sort value for sorting the entries in order of commits.
+
+          For example:
+          [('CL #1',
+            {
+              'passed': 0,
+              'depot': 'chromium',
+              'external': None,
+              'sort': 0
+            })]
+    """
+    revision_data_sorted = sorted(self.revision_data.iteritems(),
+                                  key=lambda x: x[1]['sort'])
+
+    # Find the range where it possibly broke.
+    first_working_revision = None
+    first_working_revision_index = -1
+    last_broken_revision = None
+    last_broken_revision_index = -1
+
+    culprit_revisions = []
+    other_regressions = []
+    regression_size = 0.0
+    regression_std_err = 0.0
+    confidence = 0.0
+
+    for i, (k, v) in enumerate(revision_data_sorted):
+      if v['passed'] == 1:
+        if first_working_revision is None:
+          first_working_revision = k
+          first_working_revision_index = i
+
+      if not v['passed']:
+        last_broken_revision = k
+        last_broken_revision_index = i
+
+    if last_broken_revision is not None and first_working_revision is not None:
+      broken_means = []
+      for i in xrange(0, last_broken_revision_index + 1):
+        if revision_data_sorted[i][1]['value']:
+          broken_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      working_means = []
+      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
+        if revision_data_sorted[i][1]['value']:
+          working_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      # Flatten the lists of result values to compute the overall means.
+      working_values = sum(working_means, [])
+      broken_values = sum(broken_means, [])
+
+      # Calculate the approximate size of the regression.
+      mean_of_bad_runs = math_utils.Mean(broken_values)
+      mean_of_good_runs = math_utils.Mean(working_values)
+
+      regression_size = 100 * math_utils.RelativeChange(
+          mean_of_good_runs, mean_of_bad_runs)
+      if math.isnan(regression_size):
+        regression_size = 'zero-to-nonzero'
+
+      regression_std_err = math.fabs(math_utils.PooledStandardError(
+          [working_values, broken_values]) /
+          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
+
+      # Give a "confidence" in the bisect. At the moment this is based on how
+      # distinct the values are before and after the last broken revision,
+      # and how noisy the overall graph is.
+      confidence = ConfidenceScore(working_means, broken_means)
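For intuition about regression_size, a worked example with invented numbers, assuming math_utils.RelativeChange(before, after) computes |after - before| / before:

mean_of_good_runs = 2.0  # hypothetical mean over all good-revision values
mean_of_bad_runs = 3.0   # hypothetical mean over all bad-revision values
relative_change = abs(mean_of_bad_runs - mean_of_good_runs) / mean_of_good_runs
print 100 * relative_change  # 50.0, i.e. the metric regressed by ~50%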
+
+      culprit_revisions = []
+
+      cwd = os.getcwd()
+      self._depot_registry.ChangeToDepotDir(
+          self.revision_data[last_broken_revision]['depot'])
+
+      if self.revision_data[last_broken_revision]['depot'] == 'cros':
+        # Get a list of all the commits and what depots they belong to so
+        # that we can grab info about each one.
+        cmd = ['repo', 'forall', '-c',
+               'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
+                   last_broken_revision, first_working_revision + 1)]
+        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
+
+        changes = []
+        assert not return_code, ('An error occurred while running '
+                                 '"%s"' % ' '.join(cmd))
+        last_depot = None
+        for line in output.split('\n'):
+          if line:
+            # Output will be in the form:
+            #   /path_to_depot
+            #   /path_to_other_depot
+            #   <SHA1>
+            #   /path_again
+            #   <SHA1>
+            #   etc.
+            if line[0] == '/':
+              last_depot = line
+            else:
+              contents = line.split(' ')
+              if len(contents) > 1:
+                changes.append([last_depot, contents[0]])
+        for c in changes:
+          os.chdir(c[0])
+          info = self._source_control.QueryRevisionInfo(c[1])
+          culprit_revisions.append((c[1], info, None))
+      else:
+        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
+          k, v = revision_data_sorted[i]
+          if k == first_working_revision:
+            break
+          self._depot_registry.ChangeToDepotDir(v['depot'])
+          info = self._source_control.QueryRevisionInfo(k)
+          culprit_revisions.append((k, info, v['depot']))
+      os.chdir(cwd)
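The 'repo forall' parsing in the 'cros' branch above is easy to check in isolation. A self-contained sketch, fed a fabricated output snippet (the paths and SHA1s are invented):

output = '/src/foo\nabc123 Fix the thing\n/src/bar\ndef456 Tweak codec\n'
changes = []
last_depot = None
for line in output.split('\n'):
  if line:
    if line[0] == '/':
      last_depot = line       # a depot path line
    else:
      contents = line.split(' ')
      if len(contents) > 1:   # "<SHA1> <subject>" line
        changes.append([last_depot, contents[0]])
print changes  # [['/src/foo', 'abc123'], ['/src/bar', 'def456']]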
+
+      # Check for any other possible regression ranges.
+      other_regressions = self._FindOtherRegressions(
+          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
+
+    return {
+        'first_working_revision': first_working_revision,
+        'last_broken_revision': last_broken_revision,
+        'culprit_revisions': culprit_revisions,
+        'other_regressions': other_regressions,
+        'regression_size': regression_size,
+        'regression_std_err': regression_std_err,
+        'confidence': confidence,
+        'revision_data_sorted': revision_data_sorted
+    }
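Finally, a hypothetical end-to-end sketch of how a caller might exercise this class, with fake collaborators standing in for the real depot registry and source control, and assuming it runs inside tools/auto_bisect so math_utils and ttest resolve:

class _FakeDepotRegistry(object):
  def ChangeToDepotDir(self, depot):
    pass  # the real registry would chdir into the depot's checkout

class _FakeSourceControl(object):
  def QueryRevisionInfo(self, revision):
    return {'subject': 'made-up commit info for %s' % revision}

results = BisectResults(_FakeDepotRegistry(), _FakeSourceControl())
results.revision_data = {
    'rev_bad': {'passed': 0, 'depot': 'chromium', 'external': None,
                'sort': 0, 'value': {'values': [3.0, 3.1]}},
    'rev_good': {'passed': 1, 'depot': 'chromium', 'external': None,
                 'sort': 1, 'value': {'values': [2.0, 2.1]}},
}
d = results.GetResultsDict()
print d['first_working_revision']  # 'rev_good'
print d['last_broken_revision']    # 'rev_bad'
print [rev for rev, _, _ in d['culprit_revisions']]  # ['rev_bad']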