Index: tools/auto_bisect/bisect_results.py |
diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..144962fd7038fa516f57b955830031fc8b5aeedb |
--- /dev/null |
+++ b/tools/auto_bisect/bisect_results.py |
@@ -0,0 +1,260 @@ |
+# Copyright 2014 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+import math |
+import os |
+ |
+import bisect_utils |
+import math_utils |
+import ttest |
+ |
+ |
+def ConfidenceScore(good_results_lists, bad_results_lists): |
+ """Calculates a confidence score. |
+ |
+ This score is a percentage which represents our degree of confidence in the |
+ proposition that the good results and bad results are distinct groups, and |
+ their differences aren't due to chance alone. |
+
+ Args: |
+ good_results_lists: A list of lists of "good" result numbers. |
+ bad_results_lists: A list of lists of "bad" result numbers. |
+ |
+ Returns: |
+ A number in the range [0, 100]. |
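+
+ Example (illustrative; the exact score depends on ttest.WelchsTTest):
+ ConfidenceScore([[10.0], [10.1]], [[12.0], [12.1]]) should score close
+ to 100, while heavily overlapping good and bad samples should score
+ close to 0.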
+ """ |
+ # If there's only one item in either list, this means only one revision was |
+ # classified good or bad; this isn't good enough evidence to make a decision. |
+ # If an empty list was passed, that also implies zero confidence. |
+ if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1: |
+ return 0.0 |
+ |
+ # Flatten the lists of results lists. |
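+ # For example, sum([[1, 2], [3]], []) == [1, 2, 3].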
+ sample1 = sum(good_results_lists, []) |
+ sample2 = sum(bad_results_lists, []) |
+ |
+ # If either flattened sample is empty (this is unexpected and normally
+ # shouldn't happen), we also return zero confidence.
+ if not sample1 or not sample2: |
+ return 0.0 |
+ |
+ # The p-value is approximately the probability of obtaining the given set |
+ # of good and bad values just by chance. |
+ _, _, p_value = ttest.WelchsTTest(sample1, sample2) |
+ return 100.0 * (1.0 - p_value) |
+ |
+ |
+class BisectResults(object): |
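+ """Holds the measured data for each tested revision and computes the
+ final results of a bisect run (see GetResultsDict)."""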
+ |
+ def __init__(self, depot_registry, source_control): |
+ self._depot_registry = depot_registry |
+ self.revision_data = {} |
+ self.error = None |
+ self._source_control = source_control |
+ |
+ @staticmethod |
+ def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good): |
+ """Compiles a list of other possible regressions from the revision data. |
+ |
+ Args: |
+ revision_data_sorted: Sorted list of (revision, revision data) pairs. |
+ bad_greater_than_good: Whether the result value at the "bad" revision is |
+ numerically greater than the result value at the "good" revision. |
+ |
+ Returns: |
+ A list of [current_rev, previous_rev, confidence] for other places where |
+ there may have been a regression. |
+ """ |
+ other_regressions = [] |
+ previous_values = [] |
+ previous_id = None |
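+ # Walk the revisions in commit order, comparing each revision's values
+ # against all of the values accumulated from the revisions before it.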
+ for current_id, current_data in revision_data_sorted: |
+ current_values = current_data['value'] |
+ if current_values: |
+ current_values = current_values['values'] |
+ if previous_values: |
+ confidence = ConfidenceScore(previous_values, [current_values]) |
+ mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
+ mean_of_current_runs = math_utils.Mean(current_values) |
+ |
+ # Check that the potential regression is in the same direction as the
+ # overall regression: if the bad value is greater than the good value,
+ # a local regression means the current runs' mean is greater than the
+ # previous runs' mean, and vice versa.
+ prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
+ is_same_direction = (prev_less_than_current if |
+ bad_greater_than_good else not prev_less_than_current) |
+ |
+ # Only report potential regressions with high confidence. |
+ if is_same_direction and confidence > 50: |
+ other_regressions.append([current_id, previous_id, confidence]) |
+ previous_values.append(current_values) |
+ previous_id = current_id |
+ return other_regressions |
+ |
+ def GetResultsDict(self): |
+ """Prepares and returns information about the final results as a dict.
+ |
+ Returns: |
+ A dictionary with the following fields:
+ |
+ 'first_working_revision': First good revision. |
+ 'last_broken_revision': Last bad revision. |
+ 'culprit_revisions': A list of suspect revisions, i.e. those that may
+ contain the change that introduced the failure.
+ 'other_regressions': A list of [current_rev, previous_rev, confidence]
+ entries for other places where a regression may have occurred.
+ 'regression_size': For performance bisects, this is a relative change of |
+ the mean metric value. For other bisects this field always contains |
+ 'zero-to-nonzero'. |
+ 'regression_std_err': For performance bisects, it is a pooled standard |
+ error for groups of good and bad runs. Not used for other bisects. |
+ 'confidence': For performance bisects, it is a confidence that the good |
+ and bad runs are distinct groups. Not used for non-performance |
+ bisects. |
+ 'revision_data_sorted': A list of (revision id, revision data) pairs,
+ sorted in order of commits. Each revision data entry is a dict with
+ the following keys:
+ |
+ 'passed': Represents whether the performance test was successful at |
+ that revision. Possible values include: 1 (passed), 0 (failed), |
+ '?' (skipped), 'F' (build failed). |
+ 'depot': The depot that this revision is from (e.g. WebKit).
+ 'external': If the revision is a 'src' revision, 'external' contains |
+ the revisions of each of the external libraries. |
+ 'sort': A sort value for sorting the dict in order of commits. |
+ |
+ For example:
+ [('CL #1',
+ {
+ 'passed': False,
+ 'depot': 'chromium',
+ 'external': None,
+ 'sort': 0
+ })]
+ """ |
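+ # Order revisions by their 'sort' field, which reflects commit order.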
+ revision_data_sorted = sorted(self.revision_data.iteritems(), |
+ key=lambda x: x[1]['sort'])
+ |
+ # Find range where it possibly broke. |
+ first_working_revision = None |
+ first_working_revision_index = -1 |
+ last_broken_revision = None |
+ last_broken_revision_index = -1 |
+ |
+ culprit_revisions = [] |
+ other_regressions = [] |
+ regression_size = 0.0 |
+ regression_std_err = 0.0 |
+ confidence = 0.0 |
+ |
+ for i in xrange(len(revision_data_sorted)): |
+ k, v = revision_data_sorted[i] |
+ if v['passed'] == 1: |
+ if not first_working_revision: |
+ first_working_revision = k |
+ first_working_revision_index = i |
+ |
+ if not v['passed']: |
+ last_broken_revision = k |
+ last_broken_revision_index = i |
+ |
+ if last_broken_revision is not None and first_working_revision is not None:
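+ # Collect the result values for every revision at or before the last
+ # broken revision, and separately for every revision at or after the
+ # first working revision; the two groups are compared below.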
+ broken_means = [] |
+ for i in xrange(0, last_broken_revision_index + 1): |
+ if revision_data_sorted[i][1]['value']: |
+ broken_means.append(revision_data_sorted[i][1]['value']['values']) |
+ |
+ working_means = [] |
+ for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
+ if revision_data_sorted[i][1]['value']: |
+ working_means.append(revision_data_sorted[i][1]['value']['values']) |
+ |
+ # Flatten the lists to calculate mean of all values. |
+ working_mean = sum(working_means, []) |
+ broken_mean = sum(broken_means, []) |
+ |
+ # Calculate the approximate size of the regression |
+ mean_of_bad_runs = math_utils.Mean(broken_mean) |
+ mean_of_good_runs = math_utils.Mean(working_mean) |
+ |
+ regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
+ mean_of_bad_runs) |
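+ # RelativeChange is expected to return NaN when a relative change is
+ # not meaningful (e.g. a functional bisect where the good runs are all
+ # zero); that case is reported as 'zero-to-nonzero' below.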
+ if math.isnan(regression_size): |
+ regression_size = 'zero-to-nonzero' |
+ |
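+ # Express the pooled standard error as a percentage of the smaller of
+ # the two means; the max(0.0001, ...) guards against division by zero.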
+ regression_std_err = math.fabs(math_utils.PooledStandardError( |
+ [working_mean, broken_mean]) / |
+ max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
+ |
+ # Give a "confidence" in the bisect. At the moment we use how distinct the |
+ # values are before and after the last broken revision, and how noisy the |
+ # overall graph is. |
+ confidence = ConfidenceScore(working_means, broken_means) |
+ |
+ culprit_revisions = [] |
+ |
+ cwd = os.getcwd() |
+ self._depot_registry.ChangeToDepotDir( |
+ self.revision_data[last_broken_revision]['depot']) |
+ |
+ if self.revision_data[last_broken_revision]['depot'] == 'cros': |
+ # We want a list of all the commits and the depots they belong to, so
+ # that we can look up info about each one.
+ cmd = ['repo', 'forall', '-c', |
+ 'pwd ; git log --pretty=oneline --before=%d --after=%d' % ( |
+ last_broken_revision, first_working_revision + 1)] |
+ output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd) |
+ |
+ changes = [] |
+ assert not return_code, ('An error occurred while running ' |
+ '"%s"' % ' '.join(cmd)) |
+ last_depot = None |
+ cwd = os.getcwd() |
+ for l in output.split('\n'): |
+ if l: |
+ # Output will be in form: |
+ # /path_to_depot |
+ # /path_to_other_depot |
+ # <SHA1> |
+ # /path_again |
+ # <SHA1> |
+ # etc. |
+ if l[0] == '/': |
+ last_depot = l |
+ else: |
+ contents = l.split(' ') |
+ if len(contents) > 1: |
+ changes.append([last_depot, contents[0]]) |
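+ # Look up commit info for each (depot path, SHA1) pair parsed above.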
+ for c in changes: |
+ os.chdir(c[0]) |
+ info = self._source_control.QueryRevisionInfo(c[1]) |
+ culprit_revisions.append((c[1], info, None)) |
+ else: |
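+ # Walk from the last broken revision toward the first working revision,
+ # collecting each revision in between as a possible culprit.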
+ for i in xrange(last_broken_revision_index, len(revision_data_sorted)): |
+ k, v = revision_data_sorted[i] |
+ if k == first_working_revision: |
+ break |
+ self._depot_registry.ChangeToDepotDir(v['depot']) |
+ info = self._source_control.QueryRevisionInfo(k) |
+ culprit_revisions.append((k, info, v['depot'])) |
+ os.chdir(cwd) |
+ |
+ # Check for any other possible regression ranges. |
+ other_regressions = self._FindOtherRegressions( |
+ revision_data_sorted, mean_of_bad_runs > mean_of_good_runs) |
+ |
+ return { |
+ 'first_working_revision': first_working_revision, |
+ 'last_broken_revision': last_broken_revision, |
+ 'culprit_revisions': culprit_revisions, |
+ 'other_regressions': other_regressions, |
+ 'regression_size': regression_size, |
+ 'regression_std_err': regression_std_err, |
+ 'confidence': confidence, |
+ 'revision_data_sorted': revision_data_sorted |
+ } |