OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
7 | 7 |
8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
(...skipping 146 matching lines...) |
157 global DEPOT_DEPS_NAME | 157 global DEPOT_DEPS_NAME |
158 global DEPOT_NAMES | 158 global DEPOT_NAMES |
159 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + | 159 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + |
160 depot_info.items()) | 160 depot_info.items()) |
161 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() | 161 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() |
162 | 162 |
163 | 163 |
164 def CalculateTruncatedMean(data_set, truncate_percent): | 164 def CalculateTruncatedMean(data_set, truncate_percent): |
165 """Calculates the truncated mean of a set of values. | 165 """Calculates the truncated mean of a set of values. |
166 | 166 |
| 167 Note that this isn't just the mean of the set of values with the highest |
| 168 and lowest values discarded; the non-discarded values are also weighted |
| 169 differently depending on how many values are discarded. |
| 170 |
167 Args: | 171 Args: |
168 data_set: Set of values to use in calculation. | 172 data_set: Non-empty list of values. |
169 truncate_percent: The % from the upper/lower portions of the data set to | 173 truncate_percent: The % from the upper and lower portions of the data set |
170 discard, expressed as a value in [0, 1]. | 174 to discard, expressed as a value in [0, 1]. |
171 | 175 |
172 Returns: | 176 Returns: |
173 The truncated mean as a float. | 177 The truncated mean as a float. |
| 178 |
| 179 Raises: |
| 180 TypeError: The data set was empty after discarding values. |
174 """ | 181 """ |
175 if len(data_set) > 2: | 182 if len(data_set) > 2: |
176 data_set = sorted(data_set) | 183 data_set = sorted(data_set) |
177 | 184 |
178 discard_num_float = len(data_set) * truncate_percent | 185 discard_num_float = len(data_set) * truncate_percent |
179 discard_num_int = int(math.floor(discard_num_float)) | 186 discard_num_int = int(math.floor(discard_num_float)) |
180 kept_weight = len(data_set) - discard_num_float * 2 | 187 kept_weight = len(data_set) - discard_num_float * 2 |
181 | 188 |
182 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] | 189 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] |
183 | 190 |
184 weight_left = 1.0 - (discard_num_float - discard_num_int) | 191 weight_left = 1.0 - (discard_num_float - discard_num_int) |
185 | 192 |
186 if weight_left < 1: | 193 if weight_left < 1: |
187 # If the % to discard leaves a fractional portion, need to weight those | 194 # If the % to discard leaves a fractional portion, need to weight those |
188 # values. | 195 # values. |
189 unweighted_vals = data_set[1:len(data_set)-1] | 196 unweighted_vals = data_set[1:len(data_set)-1] |
190 weighted_vals = [data_set[0], data_set[len(data_set)-1]] | 197 weighted_vals = [data_set[0], data_set[len(data_set)-1]] |
191 weighted_vals = [w * weight_left for w in weighted_vals] | 198 weighted_vals = [w * weight_left for w in weighted_vals] |
192 data_set = weighted_vals + unweighted_vals | 199 data_set = weighted_vals + unweighted_vals |
193 else: | 200 else: |
194 kept_weight = len(data_set) | 201 kept_weight = len(data_set) |
195 | 202 |
196 truncated_mean = reduce(lambda x, y: float(x) + float(y), | 203 truncated_mean = reduce(lambda x, y: float(x) + float(y), |
197 data_set) / kept_weight | 204 data_set) / kept_weight |
198 | 205 |
199 return truncated_mean | 206 return truncated_mean |
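
A worked example of the fractional weighting above (illustration only; assumes CalculateTruncatedMean is in scope): with five values and truncate_percent=0.25, discard_num_float is 1.25, so one value is dropped from each end and the new outermost values carry weight 0.75.

    data = [10, 20, 30, 40, 50]
    # Kept set after discarding one value per side: [20, 30, 40].
    # 20 and 40 are weighted by 0.75 -> [15.0, 30.0, 30]; sum is 75.0.
    # kept_weight = 5 - 2 * 1.25 = 2.5, so the result is 75.0 / 2.5.
    print CalculateTruncatedMean(data, 0.25)  # -> 30.0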
200 | 207 |
201 | 208 |
202 def CalculateStandardDeviation(v): | 209 def CalculateMean(values): |
203 if len(v) == 1: | 210 """Calculates the arithmetic mean of a list of values.""" |
| 211 return CalculateTruncatedMean(values, 0.0) |
| 212 |
| 213 |
| 214 def CalculateConfidence(good_results_lists, bad_results_lists): |
| 215 """Calculates a confidence percentage. |
| 216 |
| 217 This is calculated based on how distinct the "good" and "bad" values are, |
| 218 and how noisy the results are. More precisely, the confidence is the quotient |
| 219 of the difference between the closest values across the good and bad groups |
| 220 and the sum of the standard deviations of the good and bad groups. |
| 221 |
| 222 TODO(qyearsley): Replace this confidence function with a function that |
| 223 uses a Student's t-test. The confidence would be (1 - p-value), where |
| 224 p-value is the probability of obtaining the given set of good and bad |
| 225 values just by chance. |
| 226 |
| 227 Args: |
| 228 good_results_lists: A list of lists of "good" result numbers. |
| 229 bad_results_lists: A list of lists of "bad" result numbers. |
| 230 |
| 231 Returns: |
| 232 A number in the range [0, 100]. |
| 233 """ |
| 234 # Get the distance between the two groups. |
| 235 means_good = map(CalculateMean, good_results_lists) |
| 236 means_bad = map(CalculateMean, bad_results_lists) |
| 237 bounds_good = (min(means_good), max(means_good)) |
| 238 bounds_bad = (min(means_bad), max(means_bad)) |
| 239 dist_between_groups = min( |
| 240 math.fabs(bounds_bad[1] - bounds_good[0]), |
| 241 math.fabs(bounds_bad[0] - bounds_good[1])) |
| 242 |
| 243 # Get the sum of the standard deviations of the two groups. |
| 244 good_results_flattened = sum(good_results_lists, []) |
| 245 bad_results_flattened = sum(bad_results_lists, []) |
| 246 stddev_good = CalculateStandardDeviation(good_results_flattened) |
| 247 stddev_bad = CalculateStandardDeviation(bad_results_flattened) |
| 248 stddev_sum = stddev_good + stddev_bad |
| 249 |
| 250 confidence = dist_between_groups / (max(0.0001, stddev_sum)) |
| 251 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) |
| 252 return confidence |
| 253 |
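
For intuition, a sketch of how this confidence behaves (illustration only; assumes the functions above are in scope): cleanly separated groups saturate at 100, while overlapping, noisy groups score near 0.

    good = [[1.0, 1.1], [0.9, 1.0]]
    bad = [[2.0, 2.1], [1.9, 2.0]]
    # dist_between_groups = min(|1.95 - 1.05|, |2.05 - 0.95|) = 0.9
    # stddev_good + stddev_bad ~= 0.082 + 0.082 = 0.163
    # 0.9 / 0.163 exceeds 1.0, so the result is clamped and scaled to 100.
    print CalculateConfidence(good, bad)  # -> 100

The t-test variant mentioned in the TODO could look roughly like this (a sketch only; scipy is an assumption, this script does not import it):

    from scipy import stats
    t_statistic, p_value = stats.ttest_ind(sum(good, []), sum(bad, []))
    confidence = int((1.0 - p_value) * 100.0)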
| 254 |
| 255 def CalculateStandardDeviation(values): |
| 256 """Calculates the sample standard deviation of the given list of values.""" |
| 257 if len(values) == 1: |
204 return 0.0 | 258 return 0.0 |
205 | 259 |
206 mean = CalculateTruncatedMean(v, 0.0) | 260 mean = CalculateMean(values) |
207 variances = [float(x) - mean for x in v] | 261 differences_from_mean = [float(x) - mean for x in values] |
208 variances = [x * x for x in variances] | 262 squared_differences = [float(x * x) for x in differences_from_mean] |
209 variance = reduce(lambda x, y: float(x) + float(y), variances) / (len(v) - 1) | 263 variance = sum(squared_differences) / (len(values) - 1) |
210 std_dev = math.sqrt(variance) | 264 std_dev = math.sqrt(variance) |
211 | 265 |
212 return std_dev | 266 return std_dev |
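
A quick check of the sample (n - 1) denominator above (illustration only):

    # mean = 2.5; squared differences sum to 5.0; 5.0 / (4 - 1) ~= 1.667;
    # sqrt(1.667) ~= 1.29.
    print CalculateStandardDeviation([1.0, 2.0, 3.0, 4.0])  # -> ~1.29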
213 | 267 |
214 | 268 |
215 def CalculatePooledStandardError(work_sets): | 269 def CalculatePooledStandardError(work_sets): |
216 numerator = 0.0 | 270 numerator = 0.0 |
217 denominator1 = 0.0 | 271 denominator1 = 0.0 |
218 denominator2 = 0.0 | 272 denominator2 = 0.0 |
219 | 273 |
220 for current_set in work_sets: | 274 for current_set in work_sets: |
221 std_dev = CalculateStandardDeviation(current_set) | 275 std_dev = CalculateStandardDeviation(current_set) |
222 numerator += (len(current_set) - 1) * std_dev ** 2 | 276 numerator += (len(current_set) - 1) * std_dev ** 2 |
223 denominator1 += len(current_set) - 1 | 277 denominator1 += len(current_set) - 1 |
224 denominator2 += 1.0 / len(current_set) | 278 denominator2 += 1.0 / len(current_set) |
225 | 279 |
226 if denominator1: | 280 if denominator1: |
227 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) | 281 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) |
228 return 0.0 | 282 return 0.0 |
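
For reference, this computes sqrt(sum((n_i - 1) * s_i**2) / sum(n_i - 1)) * sqrt(sum(1.0 / n_i)) over the given work sets. A small example (illustration only):

    # Both sets have stddev 1.0, so the pooled factor is 1.0 and the result
    # is sqrt(1.0 / 3 + 1.0 / 3) ~= 0.816.
    print CalculatePooledStandardError([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]])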
229 | 283 |
230 | 284 |
231 def CalculateStandardError(v): | 285 def CalculateStandardError(values): |
232 if len(v) <= 1: | 286 """Calculates the standard error of a list of values.""" |
| 287 if len(values) <= 1: |
233 return 0.0 | 288 return 0.0 |
234 | 289 |
235 std_dev = CalculateStandardDeviation(v) | 290 std_dev = CalculateStandardDeviation(values) |
236 | 291 |
237 return std_dev / math.sqrt(len(v)) | 292 return std_dev / math.sqrt(len(values)) |
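
And the standard error is just that standard deviation scaled by sqrt(n) (illustration only):

    # 1.29 / sqrt(4) ~= 0.645.
    print CalculateStandardError([1.0, 2.0, 3.0, 4.0])  # -> ~0.645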
238 | 293 |
239 | 294 |
240 def IsStringFloat(string_to_check): | 295 def IsStringFloat(string_to_check): |
241 """Checks whether or not the given string can be converted to a floating | 296 """Checks whether or not the given string can be converted to a floating |
242 point number. | 297 point number. |
243 | 298 |
244 Args: | 299 Args: |
245 string_to_check: Input string to check if it can be converted to a float. | 300 string_to_check: Input string to check if it can be converted to a float. |
246 | 301 |
247 Returns: | 302 Returns: |
(...skipping 2511 matching lines...) |
2759 | 2814 |
2760 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): | 2815 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): |
2761 other_regressions = [] | 2816 other_regressions = [] |
2762 previous_values = [] | 2817 previous_values = [] |
2763 previous_id = None | 2818 previous_id = None |
2764 for current_id, current_data in revision_data_sorted: | 2819 for current_id, current_data in revision_data_sorted: |
2765 current_values = current_data['value'] | 2820 current_values = current_data['value'] |
2766 if current_values: | 2821 if current_values: |
2767 current_values = current_values['values'] | 2822 current_values = current_values['values'] |
2768 if previous_values: | 2823 if previous_values: |
2769 confidence = self._CalculateConfidence(previous_values, | 2824 confidence = CalculateConfidence(previous_values, [current_values]) |
2770 [current_values]) | 2825 mean_of_prev_runs = CalculateMean(sum(previous_values, [])) |
2771 mean_of_prev_runs = CalculateTruncatedMean( | 2826 mean_of_current_runs = CalculateMean(current_values) |
2772 sum(previous_values, []), 0) | |
2773 mean_of_current_runs = CalculateTruncatedMean(current_values, 0) | |
2774 | 2827 |
2775 # Check that the potential regression is in the same direction as | 2828 # Check that the potential regression is in the same direction as |
2776 # the overall regression. If the mean of the previous runs < the | 2829 # the overall regression. If the mean of the previous runs < the |
2777 # mean of the current runs, this local regression is in the same | 2830 # mean of the current runs, this local regression is in the same |
2778 # direction. | 2831 # direction. |
2779 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs | 2832 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
2780 is_same_direction = (prev_less_than_current if | 2833 is_same_direction = (prev_less_than_current if |
2781 bad_greater_than_good else not prev_less_than_current) | 2834 bad_greater_than_good else not prev_less_than_current) |
2782 | 2835 |
2783 # Only report potential regressions with high confidence. | 2836 # Only report potential regressions with high confidence. |
2784 if is_same_direction and confidence > 50: | 2837 if is_same_direction and confidence > 50: |
2785 other_regressions.append([current_id, previous_id, confidence]) | 2838 other_regressions.append([current_id, previous_id, confidence]) |
2786 previous_values.append(current_values) | 2839 previous_values.append(current_values) |
2787 previous_id = current_id | 2840 previous_id = current_id |
2788 return other_regressions | 2841 return other_regressions |
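
For context, the loop above walks (revision id, data) pairs shaped roughly like this (a hypothetical sketch inferred from the dictionary accesses in the code, not actual revision data):

    revision_data_sorted = [
        ('rev_a', {'value': {'values': [10.0, 10.2]}}),
        ('rev_b', {'value': {'values': [10.1, 9.9]}}),
        # 'rev_c' would be flagged if its confidence against the accumulated
        # earlier runs exceeds 50 and the jump matches the overall direction.
        ('rev_c', {'value': {'values': [15.0, 15.3]}}),
    ]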
2789 | 2842 |
2790 def _CalculateConfidence(self, working_means, broken_means): | |
2791 bounds_working = [] | |
2792 bounds_broken = [] | |
2793 for m in working_means: | |
2794 current_mean = CalculateTruncatedMean(m, 0) | |
2795 if bounds_working: | |
2796 bounds_working[0] = min(current_mean, bounds_working[0]) | |
2797 bounds_working[1] = max(current_mean, bounds_working[0]) | |
2798 else: | |
2799 bounds_working = [current_mean, current_mean] | |
2800 for m in broken_means: | |
2801 current_mean = CalculateTruncatedMean(m, 0) | |
2802 if bounds_broken: | |
2803 bounds_broken[0] = min(current_mean, bounds_broken[0]) | |
2804 bounds_broken[1] = max(current_mean, bounds_broken[0]) | |
2805 else: | |
2806 bounds_broken = [current_mean, current_mean] | |
2807 dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]), | |
2808 math.fabs(bounds_broken[0] - bounds_working[1])) | |
2809 working_mean = sum(working_means, []) | |
2810 broken_mean = sum(broken_means, []) | |
2811 len_working_group = CalculateStandardDeviation(working_mean) | |
2812 len_broken_group = CalculateStandardDeviation(broken_mean) | |
2813 | |
2814 confidence = (dist_between_groups / ( | |
2815 max(0.0001, (len_broken_group + len_working_group )))) | |
2816 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) | |
2817 return confidence | |
2818 | 2843 |
2819 def _GetResultsDict(self, revision_data, revision_data_sorted): | 2844 def _GetResultsDict(self, revision_data, revision_data_sorted): |
2820 # Find range where it possibly broke. | 2845 # Find range where it possibly broke. |
2821 first_working_revision = None | 2846 first_working_revision = None |
2822 first_working_revision_index = -1 | 2847 first_working_revision_index = -1 |
2823 last_broken_revision = None | 2848 last_broken_revision = None |
2824 last_broken_revision_index = -1 | 2849 last_broken_revision_index = -1 |
2825 | 2850 |
2826 for i in xrange(len(revision_data_sorted)): | 2851 for i in xrange(len(revision_data_sorted)): |
2827 k, v = revision_data_sorted[i] | 2852 k, v = revision_data_sorted[i] |
(...skipping 15 matching lines...) |
2843 working_means = [] | 2868 working_means = [] |
2844 for i in xrange(first_working_revision_index, len(revision_data_sorted)): | 2869 for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
2845 if revision_data_sorted[i][1]['value']: | 2870 if revision_data_sorted[i][1]['value']: |
2846 working_means.append(revision_data_sorted[i][1]['value']['values']) | 2871 working_means.append(revision_data_sorted[i][1]['value']['values']) |
2847 | 2872 |
2848 # Flatten the lists to calculate mean of all values. | 2873 # Flatten the lists to calculate mean of all values. |
2849 working_mean = sum(working_means, []) | 2874 working_mean = sum(working_means, []) |
2850 broken_mean = sum(broken_means, []) | 2875 broken_mean = sum(broken_means, []) |
2851 | 2876 |
2852 # Calculate the approximate size of the regression | 2877 # Calculate the approximate size of the regression |
2853 mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0) | 2878 mean_of_bad_runs = CalculateMean(broken_mean) |
2854 mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0) | 2879 mean_of_good_runs = CalculateMean(working_mean) |
2855 | 2880 |
2856 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) / | 2881 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) / |
2857 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0 | 2882 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0 |
2858 | 2883 |
2859 regression_std_err = math.fabs(CalculatePooledStandardError( | 2884 regression_std_err = math.fabs(CalculatePooledStandardError( |
2860 [working_mean, broken_mean]) / | 2885 [working_mean, broken_mean]) / |
2861 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 2886 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
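
Worked numbers for the regression size above (illustration only): with mean_of_good_runs = 10.0 and mean_of_bad_runs = 12.0, regression_size is (12.0 / 10.0) * 100.0 - 100.0 = 20.0, i.e. a 20% regression. The max(0.0001, ...) term guards against division by zero.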
2862 | 2887 |
2863 # Give a "confidence" in the bisect. At the moment we use how distinct the | 2888 # Give a "confidence" in the bisect. At the moment we use how distinct the |
2864 # values are before and after the last broken revision, and how noisy the | 2889 # values are before and after the last broken revision, and how noisy the |
2865 # overall graph is. | 2890 # overall graph is. |
2866 confidence = self._CalculateConfidence(working_means, broken_means) | 2891 confidence = CalculateConfidence(working_means, broken_means) |
2867 | 2892 |
2868 culprit_revisions = [] | 2893 culprit_revisions = [] |
2869 | 2894 |
2870 cwd = os.getcwd() | 2895 cwd = os.getcwd() |
2871 self.ChangeToDepotWorkingDirectory( | 2896 self.ChangeToDepotWorkingDirectory( |
2872 revision_data[last_broken_revision]['depot']) | 2897 revision_data[last_broken_revision]['depot']) |
2873 | 2898 |
2874 if revision_data[last_broken_revision]['depot'] == 'cros': | 2899 if revision_data[last_broken_revision]['depot'] == 'cros': |
2875 # Want to get a list of all the commits and what depots they belong | 2900 # Want to get a list of all the commits and what depots they belong |
2876 # to so that we can grab info about each. | 2901 # to so that we can grab info about each. |
(...skipping 505 matching lines...) |
3382 # The perf dashboard scrapes the "results" step in order to comment on | 3407 # The perf dashboard scrapes the "results" step in order to comment on |
3383 # bugs. If you change this, please update the perf dashboard as well. | 3408 # bugs. If you change this, please update the perf dashboard as well. |
3384 bisect_utils.OutputAnnotationStepStart('Results') | 3409 bisect_utils.OutputAnnotationStepStart('Results') |
3385 print 'Error: %s' % e.message | 3410 print 'Error: %s' % e.message |
3386 if opts.output_buildbot_annotations: | 3411 if opts.output_buildbot_annotations: |
3387 bisect_utils.OutputAnnotationStepClosed() | 3412 bisect_utils.OutputAnnotationStepClosed() |
3388 return 1 | 3413 return 1 |
3389 | 3414 |
3390 if __name__ == '__main__': | 3415 if __name__ == '__main__': |
3391 sys.exit(main()) | 3416 sys.exit(main()) |