OLD | NEW |
(Empty) | |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. |
| 4 |
| 5 import math |
| 6 import os |
| 7 |
| 8 import bisect_utils |
| 9 import math_utils |
| 10 import ttest |
| 11 |
| 12 |
def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # A single classified revision on either side means only one revision was
  # marked good or bad, which isn't enough evidence to make a decision. An
  # empty list likewise implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten each group of per-revision result lists into a single sample.
  good_sample = [value for results in good_results_lists for value in results]
  bad_sample = [value for results in bad_results_lists for value in results]

  # If either group contained nothing but empty lists (unexpected, and
  # normally shouldn't happen), that also means zero confidence.
  if not good_sample or not bad_sample:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(good_sample, bad_sample)
  return 100.0 * (1.0 - p_value)
| 47 |
| 48 |
class BisectResults(object):
  """Accumulates per-revision bisect data and derives the final results."""

  def __init__(self, depot_registry, source_control):
    """Initializes a BisectResults instance.

    Args:
      depot_registry: An object used to change the current working directory
          to the checkout directory of a given depot.
      source_control: An object used to query revision information from
          source control.
    """
    self._depot_registry = depot_registry
    # Maps revision ids to dicts of data about each revision; filled in
    # externally as revisions are tested.
    self.revision_data = {}
    # Holds an error message if the bisect failed, otherwise None.
    self.error = None
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        current_values = current_values['values']
        if previous_values:
          confidence = ConfidenceScore(previous_values, [current_values])
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
        previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields:

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions, which contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions,
          which may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': dict mapping revision ids to data about that
          revision. Each piece of revision data consists of a dict with the
          following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (i.e. WebKit)
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for sorting the dict in order of commits.

          For example:
          {
            'CL #1':
              {
                'passed': False,
                'depot': 'chromium',
                'external': None,
                'sort': 0
              }
          }
    """
    # dict.items() (rather than Python-2-only iteritems()) keeps this working
    # on both Python 2 and 3; the list is immediately consumed by sorted().
    revision_data_sorted = sorted(self.revision_data.items(),
                                  key=lambda x: x[1]['sort'])

    # Find range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    for i, (k, v) in enumerate(revision_data_sorted):
      if v['passed'] == 1:
        # Compare against None explicitly so that a falsy revision id
        # (e.g. 0 or '') can't be mistaken for "not found yet".
        if first_working_revision is None:
          first_working_revision = k
          first_working_revision_index = i

      # NOTE: only a 'passed' value of 0 counts as broken here; skipped ('?')
      # and build-failed ('F') revisions are truthy and are ignored.
      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

    if last_broken_revision is not None and first_working_revision is not None:
      # Collect the per-revision result values on the broken side (everything
      # up to and including the last broken revision) and on the working side
      # (everything from the first working revision onward).
      broken_means = []
      for i in range(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in range(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate mean of all values.
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression.
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
      # RelativeChange returns NaN for a zero-to-nonzero change (e.g. a
      # functional bisect); report that case symbolically.
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct
      # the values are before and after the last broken revision, and how
      # noisy the overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      # Remember the current directory so it can be restored after the
      # depot-directory changes below.
      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # Want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        cmd = ['repo', 'forall', '-c',
            'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
            last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        cwd = os.getcwd()
        for line in output.split('\n'):
          if line:
            # Output will be in form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1>
            # /path_again
            # <SHA1>
            # etc.
            if line[0] == '/':
              last_depot = line
            else:
              contents = line.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for change in changes:
          os.chdir(change[0])
          info = self._source_control.QueryRevisionInfo(change[1])
          culprit_revisions.append((change[1], info, None))
      else:
        # Walk from the last broken revision up to (but excluding) the first
        # working revision, gathering info about each candidate culprit.
        for i in range(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted
    }
OLD | NEW |