OLD | NEW |
1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import math | 5 import math |
6 import os | 6 import os |
7 | 7 |
8 import bisect_utils | 8 import bisect_utils |
9 import math_utils | 9 import math_utils |
10 import source_control | 10 import source_control |
(...skipping 91 matching lines...)
102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: | 102 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE: |
103 warnings.append('Confidence is not high. Try bisecting again ' | 103 warnings.append('Confidence is not high. Try bisecting again ' |
104 'with increased repeat_count, larger range, or ' | 104 'with increased repeat_count, larger range, or ' |
105 'on another metric.') | 105 'on another metric.') |
106 if not confidence: | 106 if not confidence: |
107 warnings.append('Confidence score is 0%. Try bisecting again on ' | 107 warnings.append('Confidence score is 0%. Try bisecting again on ' |
108 'another platform or another metric.') | 108 'another platform or another metric.') |
109 return warnings | 109 return warnings |
110 | 110 |
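For context, a rough sketch of how the two thresholds above behave. The cutoff is assumed here (bisect_utils.HIGH_CONFIDENCE is treated as 95 purely for illustration; the real constant is defined in bisect_utils and may differ):

    HIGH_CONFIDENCE = 95  # assumed stand-in for bisect_utils.HIGH_CONFIDENCE

    # Hypothetical confidence scores and the warning each would trigger.
    for confidence in (0.0, 42.5, 99.9):
      if 0 < confidence < HIGH_CONFIDENCE:
        print('%.1f%%: low-confidence warning' % confidence)
      elif not confidence:
        print('%.1f%%: zero-confidence warning' % confidence)
      else:
        print('%.1f%%: no warning' % confidence)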
111 @staticmethod | 111 @staticmethod |
112 def ConfidenceScore(good_results_lists, bad_results_lists, | 112 def ConfidenceScore(sample1, sample2, |
113 accept_single_bad_or_good=False): | 113 accept_single_bad_or_good=False): |
114 """Calculates a confidence score. | 114 """Calculates a confidence score. |
115 | 115 |
116 This score is a percentage which represents our degree of confidence in the | 116 This score is a percentage which represents our degree of confidence in the |
117 proposition that the good results and bad results are distinct groups, and | 117 proposition that the good results and bad results are distinct groups, and |
118 their differences aren't due to chance alone. | 118 their differences aren't due to chance alone. |
119 | 119 |
120 | 120 |
121 Args: | 121 Args: |
122 good_results_lists: A list of lists of "good" result numbers. | 122 sample1: A flat list of "good" result numbers. |
123 bad_results_lists: A list of lists of "bad" result numbers. | 123 sample2: A flat list of "bad" result numbers. |
124 accept_single_bad_or_good: If True, computes confidence even if there is | 124 accept_single_bad_or_good: If True, computes confidence even if there is |
125 just one bad or good revision; otherwise a single good or bad revision | 125 just one bad or good revision; otherwise a single good or bad revision |
126 always returns 0.0 confidence. This flag will probably go away once we | 126 always returns 0.0 confidence. This flag will probably go away once we |
127 implement expanding the bisect range by one more revision for such | 127 implement expanding the bisect range by one more revision for such |
128 cases. | 128 cases. |
129 | 129 |
130 Returns: | 130 Returns: |
131 A number in the range [0, 100]. | 131 A number in the range [0, 100]. |
132 """ | 132 """ |
133 # If there's only one item in either list, this means only one revision was | 133 # If there's only one item in either list, this means only one revision was |
134 # classified good or bad; this isn't good enough evidence to make a | 134 # classified good or bad; this isn't good enough evidence to make a |
135 # decision. If an empty list was passed, that also implies zero confidence. | 135 # decision. If an empty list was passed, that also implies zero confidence. |
136 if not accept_single_bad_or_good: | 136 if not accept_single_bad_or_good: |
137 if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1: | 137 if len(sample1) <= 1 or len(sample2) <= 1: |
138 return 0.0 | 138 return 0.0 |
139 | 139 |
140 # Flatten the lists of results lists. | |
141 sample1 = sum(good_results_lists, []) | |
142 sample2 = sum(bad_results_lists, []) | |
143 | |
144 # If there were only empty lists in either of the lists (this is unexpected | 140 # If either sample is empty (this is unexpected and normally shouldn't |
145 # and normally shouldn't happen), then we also want to return 0. | 141 # happen), then we also want to return 0. |
146 if not sample1 or not sample2: | 142 if not sample1 or not sample2: |
147 return 0.0 | 143 return 0.0 |
148 | 144 |
149 # The p-value is approximately the probability of obtaining the given set | 145 # The p-value is approximately the probability of obtaining the given set |
150 # of good and bad values just by chance. | 146 # of good and bad values just by chance. |
151 _, _, p_value = ttest.WelchsTTest(sample1, sample2) | 147 _, _, p_value = ttest.WelchsTTest(sample1, sample2) |
152 return 100.0 * (1.0 - p_value) | 148 return 100.0 * (1.0 - p_value) |
153 | 149 |
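A minimal usage sketch for the reworked ConfidenceScore, which now takes flat samples rather than lists of per-revision lists. The enclosing class name (BisectResults) and the sample values are assumptions for illustration; ttest.WelchsTTest is taken to return a (t-statistic, degrees-of-freedom, p-value) tuple, as the surrounding code implies:

    # Hypothetical flat samples; callers are now responsible for flattening
    # per-revision value lists before calling ConfidenceScore.
    good_sample = [2.1, 2.0, 2.2, 2.1]   # measurements from "good" revisions
    bad_sample = [3.4, 3.5, 3.3, 3.6]    # measurements from "bad" revisions

    confidence = BisectResults.ConfidenceScore(good_sample, bad_sample)
    # confidence == 100.0 * (1.0 - p_value): well-separated samples score
    # near 100, heavily overlapping samples score near 0.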
(...skipping 10 matching lines...)
164 A list of [current_rev, previous_rev, confidence] for other places where | 160 A list of [current_rev, previous_rev, confidence] for other places where |
165 there may have been a regression. | 161 there may have been a regression. |
166 """ | 162 """ |
167 other_regressions = [] | 163 other_regressions = [] |
168 previous_values = [] | 164 previous_values = [] |
169 prev_state = None | 165 prev_state = None |
170 for revision_state in revision_states: | 166 for revision_state in revision_states: |
171 if revision_state.value: | 167 if revision_state.value: |
172 current_values = revision_state.value['values'] | 168 current_values = revision_state.value['values'] |
173 if previous_values: | 169 if previous_values: |
174 confidence = cls.ConfidenceScore(previous_values, [current_values], | 170 confidence_params = (sum(previous_values, []), |
| 171 sum([current_values], [])) |
| 172 confidence = cls.ConfidenceScore(*confidence_params, |
175 accept_single_bad_or_good=True) | 173 accept_single_bad_or_good=True) |
176 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) | 174 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
177 mean_of_current_runs = math_utils.Mean(current_values) | 175 mean_of_current_runs = math_utils.Mean(current_values) |
178 | 176 |
179 # Check that the potential regression is in the same direction as | 177 # Check that the potential regression is in the same direction as |
180 # the overall regression. If the mean of the previous runs < the | 178 # the overall regression. If the mean of the previous runs < the |
181 # mean of the current runs, this local regression is in same | 179 # mean of the current runs, this local regression is in same |
182 # direction. | 180 # direction. |
183 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs | 181 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs |
184 is_same_direction = (prev_greater_than_current if | 182 is_same_direction = (prev_greater_than_current if |
(...skipping 61 matching lines...)
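The recurring sum(list_of_lists, []) expression in the updated callers is the flattening that the new flat-sample signature pushes onto call sites: it concatenates one level of nesting. Note that sum([current_values], []) amounts to a shallow copy of current_values, since the outer list holds a single element. A self-contained sketch with made-up values:

    previous_values = [[2.0, 2.1], [2.2, 2.0]]  # per-revision result lists
    current_values = [3.4, 3.5]                 # one revision's results

    flat_previous = sum(previous_values, [])    # [2.0, 2.1, 2.2, 2.0]
    flat_current = sum([current_values], [])    # [3.4, 3.5] (shallow copy)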
246 if math.isnan(regression_size): | 244 if math.isnan(regression_size): |
247 regression_size = 'zero-to-nonzero' | 245 regression_size = 'zero-to-nonzero' |
248 | 246 |
249 regression_std_err = math.fabs(math_utils.PooledStandardError( | 247 regression_std_err = math.fabs(math_utils.PooledStandardError( |
250 [working_mean, broken_mean]) / | 248 [working_mean, broken_mean]) / |
251 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 249 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
252 | 250 |
253 # Give a "confidence" in the bisect. At the moment we use how distinct the | 251 # Give a "confidence" in the bisect. At the moment we use how distinct the |
254 # values are before and after the last broken revision, and how noisy the | 252 # values are before and after the last broken revision, and how noisy the |
255 # overall graph is. | 253 # overall graph is. |
256 confidence = cls.ConfidenceScore(working_means, broken_means) | 254 confidence_params = (sum(working_means, []), sum(broken_means, [])) |
| 255 confidence = cls.ConfidenceScore(*confidence_params) |
257 | 256 |
258 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs | 257 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs |
259 | 258 |
260 return {'regression_size': regression_size, | 259 return {'regression_size': regression_size, |
261 'regression_std_err': regression_std_err, | 260 'regression_std_err': regression_std_err, |
262 'confidence': confidence, | 261 'confidence': confidence, |
263 'bad_greater_than_good': bad_greater_than_good} | 262 'bad_greater_than_good': bad_greater_than_good} |
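For reference, an illustrative shape of the returned dictionary for a clear regression. The numbers are made up; 'regression_size' falls back to the string 'zero-to-nonzero' when the computed size is NaN, as handled above:

    example_results = {
        'regression_size': 12.5,        # percent change (or 'zero-to-nonzero')
        'regression_std_err': 1.8,      # pooled standard error, in percent
        'confidence': 99.9,             # 100 * (1 - p_value) from Welch's t-test
        'bad_greater_than_good': True,  # direction of the regression
    }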