| OLD | NEW |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import math | 5 import math |
| 6 import os | 6 import os |
| 7 | 7 |
| 8 import bisect_utils | 8 import bisect_utils |
| 9 import math_utils | 9 import math_utils |
| 10 import source_control | 10 import source_control |
| (...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: | 102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: |
| 103 warnings.append('Confidence is not high. Try bisecting again ' | 103 warnings.append('Confidence is not high. Try bisecting again ' |
| 104 'with increased repeat_count, larger range, or ' | 104 'with increased repeat_count, larger range, or ' |
| 105 'on another metric.') | 105 'on another metric.') |
| 106 if not confidence: | 106 if not confidence: |
| 107 warnings.append('Confidence score is 0%. Try bisecting again on ' | 107 warnings.append('Confidence score is 0%. Try bisecting again on ' |
| 108 'another platform or another metric.') | 108 'another platform or another metric.') |
| 109 return warnings | 109 return warnings |
| 110 | 110 |
| 111 @staticmethod | 111 @staticmethod |
| 112 def ConfidenceScore(good_results_lists, bad_results_lists, | 112 def ConfidenceScore(sample1, sample2, |
| 113 accept_single_bad_or_good=False): | 113 accept_single_bad_or_good=False): |
| 114 """Calculates a confidence score. | 114 """Calculates a confidence score. |
| 115 | 115 |
| 116 This score is a percentage which represents our degree of confidence in the | 116 This score is a percentage which represents our degree of confidence in the |
| 117 proposition that the good results and bad results are distinct groups, and | 117 proposition that the good results and bad results are distinct groups, and |
| 118 their differences aren't due to chance alone. | 118 their differences aren't due to chance alone. |
| 119 | 119 |
| 120 | 120 |
| 121 Args: | 121 Args: |
| 122 good_results_lists: A list of lists of "good" result numbers. | 122 sample1: A flat list of "good" result numbers. |
| 123 bad_results_lists: A list of lists of "bad" result numbers. | 123 sample2: A flat list of "bad" result numbers. |
| 124 accept_single_bad_or_good: If True, computes confidence even if there is | 124 accept_single_bad_or_good: If True, computes confidence even if there is |
| 125 just one bad or good revision, otherwise single good or bad revision | 125 just one bad or good revision, otherwise single good or bad revision |
| 126 always returns 0.0 confidence. This flag will probably get away when | 126 always returns 0.0 confidence. This flag will probably go away when |
| 127 we will implement expanding the bisect range by one more revision for | 127 we implement expanding the bisect range by one more revision for |
| 128 such case. | 128 such a case. |
| 129 | 129 |
| 130 Returns: | 130 Returns: |
| 131 A number in the range [0, 100]. | 131 A number in the range [0, 100]. |
| 132 """ | 132 """ |
| 133 # If there's only one item in either list, this means only one revision was | 133 # If there's only one item in either list, this means only one revision was |
| 134 # classified good or bad; this isn't good enough evidence to make a | 134 # classified good or bad; this isn't good enough evidence to make a |
| 135 # decision. If an empty list was passed, that also implies zero confidence. | 135 # decision. If an empty list was passed, that also implies zero confidence. |
| 136 if not accept_single_bad_or_good: | 136 if not accept_single_bad_or_good: |
| 137 if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1: | 137 if len(sample1) <= 1 or len(sample2) <= 1: |
| 138 return 0.0 | 138 return 0.0 |
| 139 | 139 |
| 140 # Flatten the lists of results lists. | |
| 141 sample1 = sum(good_results_lists, []) | |
| 142 sample2 = sum(bad_results_lists, []) | |
| 143 | |
| 144 # If there were only empty lists in either of the lists (this is unexpected | 140 # If there were only empty lists in either of the lists (this is unexpected |
| 145 # and normally shouldn't happen), then we also want to return 0. | 141 # and normally shouldn't happen), then we also want to return 0. |
| 146 if not sample1 or not sample2: | 142 if not sample1 or not sample2: |
| 147 return 0.0 | 143 return 0.0 |
| 148 | 144 |
| 149 # The p-value is approximately the probability of obtaining the given set | 145 # The p-value is approximately the probability of obtaining the given set |
| 150 # of good and bad values just by chance. | 146 # of good and bad values just by chance. |
| 151 _, _, p_value = ttest.WelchsTTest(sample1, sample2) | 147 _, _, p_value = ttest.WelchsTTest(sample1, sample2) |
| 152 return 100.0 * (1.0 - p_value) | 148 return 100.0 * (1.0 - p_value) |
| 153 | 149 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 164 A list of [current_rev, previous_rev, confidence] for other places where | 160 A list of [current_rev, previous_rev, confidence] for other places where |
| 165 there may have been a regression. | 161 there may have been a regression. |
| 166 """ | 162 """ |
| 167 other_regressions = [] | 163 other_regressions = [] |
| 168 previous_values = [] | 164 previous_values = [] |
| 169 prev_state = None | 165 prev_state = None |
| 170 for revision_state in revision_states: | 166 for revision_state in revision_states: |
| 171 if revision_state.value: | 167 if revision_state.value: |
| 172 current_values = revision_state.value['values'] | 168 current_values = revision_state.value['values'] |
| 173 if previous_values: | 169 if previous_values: |
| 174 confidence = cls.ConfidenceScore(previous_values, [current_values], | 170 confidence_params = (sum(previous_values, []), |
| 171 sum([current_values], [])) |
| 172 confidence = cls.ConfidenceScore(*confidence_params, |
| 175 accept_single_bad_or_good=True) | 173 accept_single_bad_or_good=True) |
| 176 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) | 174 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
| 177 mean_of_current_runs = math_utils.Mean(current_values) | 175 mean_of_current_runs = math_utils.Mean(current_values) |
| 178 | 176 |
| 179 # Check that the potential regression is in the same direction as | 177 # Check that the potential regression is in the same direction as |
| 180 # the overall regression. If the mean of the previous runs < the | 178 # the overall regression. If the mean of the previous runs < the |
| 181 # mean of the current runs, this local regression is in same | 179 # mean of the current runs, this local regression is in same |
| 182 # direction. | 180 # direction. |
| 183 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs | 181 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs |
| 184 is_same_direction = (prev_greater_than_current if | 182 is_same_direction = (prev_greater_than_current if |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 246 if math.isnan(regression_size): | 244 if math.isnan(regression_size): |
| 247 regression_size = 'zero-to-nonzero' | 245 regression_size = 'zero-to-nonzero' |
| 248 | 246 |
| 249 regression_std_err = math.fabs(math_utils.PooledStandardError( | 247 regression_std_err = math.fabs(math_utils.PooledStandardError( |
| 250 [working_mean, broken_mean]) / | 248 [working_mean, broken_mean]) / |
| 251 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 249 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
| 252 | 250 |
| 253 # Give a "confidence" in the bisect. At the moment we use how distinct the | 251 # Give a "confidence" in the bisect. At the moment we use how distinct the |
| 254 # values are before and after the last broken revision, and how noisy the | 252 # values are before and after the last broken revision, and how noisy the |
| 255 # overall graph is. | 253 # overall graph is. |
| 256 confidence = cls.ConfidenceScore(working_means, broken_means) | 254 confidence_params = (sum(working_means, []), sum(broken_means, [])) |
| 255 confidence = cls.ConfidenceScore(*confidence_params) |
| 257 | 256 |
| 258 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs | 257 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs |
| 259 | 258 |
| 260 return {'regression_size': regression_size, | 259 return {'regression_size': regression_size, |
| 261 'regression_std_err': regression_std_err, | 260 'regression_std_err': regression_std_err, |
| 262 'confidence': confidence, | 261 'confidence': confidence, |
| 263 'bad_greater_than_good': bad_greater_than_good} | 262 'bad_greater_than_good': bad_greater_than_good} |
| OLD | NEW |