Chromium Code Reviews

Side by Side Diff: tools/bisect-perf-regression.py

Issue 220113012: Refactor CalculateConfidence and add unit test. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Update call-site of CalculateConfidence. Created 6 years, 8 months ago
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Performance Test Bisect Tool 6 """Performance Test Bisect Tool
7 7
8 This script bisects a series of changelists using binary search. It starts at 8 This script bisects a series of changelists using binary search. It starts at
9 a bad revision where a performance metric has regressed, and asks for a last 9 a bad revision where a performance metric has regressed, and asks for a last
10 known-good revision. It will then binary search across this revision range by 10 known-good revision. It will then binary search across this revision range by
(...skipping 146 matching lines...)
157 global DEPOT_DEPS_NAME 157 global DEPOT_DEPS_NAME
158 global DEPOT_NAMES 158 global DEPOT_NAMES
159 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + 159 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() +
160 depot_info.items()) 160 depot_info.items())
161 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() 161 DEPOT_NAMES = DEPOT_DEPS_NAME.keys()
162 162
163 163
164 def CalculateTruncatedMean(data_set, truncate_percent): 164 def CalculateTruncatedMean(data_set, truncate_percent):
165 """Calculates the truncated mean of a set of values. 165 """Calculates the truncated mean of a set of values.
166 166
167 Note that this isn't just the mean of the set of values with the highest
168 and lowest values discarded; the non-discarded values are also weighted
169 differently depending on how many values are discarded.
170
167 Args: 171 Args:
168 data_set: Set of values to use in calculation. 172 data_set: Non-empty list of values.
169 truncate_percent: The % from the upper/lower portions of the data set to 173 truncate_percent: The % from the upper and lower portions of the data set
170 discard, expressed as a value in [0, 1]. 174 to discard, expressed as a value in [0, 1].
171 175
172 Returns: 176 Returns:
173 The truncated mean as a float. 177 The truncated mean as a float.
178
179 Raises:
180 TypeError: The data set was empty after discarding values.
174 """ 181 """
175 if len(data_set) > 2: 182 if len(data_set) > 2:
176 data_set = sorted(data_set) 183 data_set = sorted(data_set)
177 184
178 discard_num_float = len(data_set) * truncate_percent 185 discard_num_float = len(data_set) * truncate_percent
179 discard_num_int = int(math.floor(discard_num_float)) 186 discard_num_int = int(math.floor(discard_num_float))
180 kept_weight = len(data_set) - discard_num_float * 2 187 kept_weight = len(data_set) - discard_num_float * 2
181 188
182 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] 189 data_set = data_set[discard_num_int:len(data_set)-discard_num_int]
183 190
184 weight_left = 1.0 - (discard_num_float - discard_num_int) 191 weight_left = 1.0 - (discard_num_float - discard_num_int)
185 192
186 if weight_left < 1: 193 if weight_left < 1:
187 # If the % to discard leaves a fractional portion, need to weight those 194 # If the % to discard leaves a fractional portion, need to weight those
188 # values. 195 # values.
189 unweighted_vals = data_set[1:len(data_set)-1] 196 unweighted_vals = data_set[1:len(data_set)-1]
190 weighted_vals = [data_set[0], data_set[len(data_set)-1]] 197 weighted_vals = [data_set[0], data_set[len(data_set)-1]]
191 weighted_vals = [w * weight_left for w in weighted_vals] 198 weighted_vals = [w * weight_left for w in weighted_vals]
192 data_set = weighted_vals + unweighted_vals 199 data_set = weighted_vals + unweighted_vals
193 else: 200 else:
194 kept_weight = len(data_set) 201 kept_weight = len(data_set)
195 202
196 truncated_mean = reduce(lambda x, y: float(x) + float(y), 203 truncated_mean = reduce(lambda x, y: float(x) + float(y),
197 data_set) / kept_weight 204 data_set) / kept_weight
198 205
199 return truncated_mean 206 return truncated_mean
200 207
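A quick worked example of the fractional weighting described in the new docstring note, with made-up numbers (assumes the module context above):

# Five values trimmed by 25%: discard_num_float = 5 * 0.25 = 1.25, so
# one value is fully dropped from each end, the next-innermost values
# (2 and 8) keep weight 1 - 0.25 = 0.75, and kept_weight = 5 - 2.5 = 2.5.
assert CalculateTruncatedMean([1, 2, 5, 8, 9], 0.25) == \
    (0.75 * 2 + 5 + 0.75 * 8) / 2.5  # == 5.0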
201 208
202 def CalculateStandardDeviation(v): 209 def CalculateMean(values):
203 if len(v) == 1: 210 """Calculates the arithmetic mean of a list of values."""
211 return CalculateTruncatedMean(values, 0.0)
212
213
214 def CalculateConfidence(good_results_lists, bad_results_lists):
215 """Calculates a confidence percentage.
216
217 This is calculated based on how distinct the "good" and "bad" values are,
218 and how noisy the results are. More precisely, the confidence is the quotient
219 of the difference between the closest values across the good and bad groups
220 and the sum of the standard deviations of the good and bad groups.
221
222 TODO(qyearsley): Replace this confidence function with a function that
223 uses a Student's t-test. The confidence would be (1 - p-value), where
224 p-value is the probability of obtaining the given set of good and bad
225 values just by chance.
226
227 Args:
228 good_results_lists: A list of lists of "good" result numbers.
229 bad_results_lists: A list of lists of "bad" result numbers.
230
231 Returns:
232 A number in the range [0, 100].
233 """
234 # Get the distance between the two groups.
235 means_good = map(CalculateMean, good_results_lists)
236 means_bad = map(CalculateMean, bad_results_lists)
237 bounds_good = (min(means_good), max(means_good))
238 bounds_bad = (min(means_bad), max(means_bad))
239 dist_between_groups = min(
240 math.fabs(bounds_bad[1] - bounds_good[0]),
241 math.fabs(bounds_bad[0] - bounds_good[1]))
242
243 # Get the sum of the standard deviations of the two groups.
244 good_results_flattened = sum(good_results_lists, [])
245 bad_results_flattened = sum(bad_results_lists, [])
246 stddev_good = CalculateStandardDeviation(good_results_flattened)
247 stddev_bad = CalculateStandardDeviation(bad_results_flattened)
248 stddev_sum = stddev_good + stddev_bad
249
250 confidence = dist_between_groups / (max(0.0001, stddev_sum))
251 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
252 return confidence
253
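A sanity check of the new function with made-up measurements: well-separated groups clamp to 100, while overlapping groups score low.

# Closest means are 0.9 apart while the stddevs sum to about 0.16, so
# the ratio exceeds 1.0 and is clamped, giving 100.
assert CalculateConfidence([[1.0, 1.1], [1.2, 1.1]],
                           [[2.0, 2.1], [2.2, 2.1]]) == 100
# Overlapping groups: a distance of ~0.1 against a stddev sum of ~0.28
# yields int(0.35 * 100) == 35.
assert CalculateConfidence([[1.0, 1.2]], [[1.1, 1.3]]) == 35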
254
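On the TODO above, a minimal sketch of the t-test variant it describes, assuming SciPy were available to this script (it is not a dependency today, and CalculateConfidenceTTest is a hypothetical name):

from scipy import stats  # Hypothetical dependency, not currently imported.

def CalculateConfidenceTTest(good_results_lists, bad_results_lists):
  """Returns 100 * (1 - p-value) from a two-sample Student's t-test."""
  good_values = sum(good_results_lists, [])  # Flatten the lists of lists.
  bad_values = sum(bad_results_lists, [])
  _, p_value = stats.ttest_ind(good_values, bad_values)
  return int((1.0 - p_value) * 100.0)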
255 def CalculateStandardDeviation(values):
256 """Calculates the sample standard deviation of the given list of values."""
257 if len(values) == 1:
204 return 0.0 258 return 0.0
205 259
206 mean = CalculateTruncatedMean(v, 0.0) 260 mean = CalculateMean(values)
207 variances = [float(x) - mean for x in v] 261 differences_from_mean = [float(x) - mean for x in values]
208 variances = [x * x for x in variances] 262 squared_differences = [float(x * x) for x in differences_from_mean]
209 variance = reduce(lambda x, y: float(x) + float(y), variances) / (len(v) - 1) 263 variance = sum(squared_differences) / (len(values) - 1)
210 std_dev = math.sqrt(variance) 264 std_dev = math.sqrt(variance)
211 265
212 return std_dev 266 return std_dev
213 267
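A short check of the sample (n - 1) standard deviation, with made-up values:

# mean([2, 4, 6]) is 4 and the squared differences are [4, 0, 4], so
# the variance is 8 / (3 - 1) = 4 and the standard deviation is 2.0.
assert CalculateStandardDeviation([2, 4, 6]) == 2.0
assert CalculateStandardDeviation([42]) == 0.0  # Single-sample guard.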
214 268
215 def CalculatePooledStandardError(work_sets): 269 def CalculatePooledStandardError(work_sets):
216 numerator = 0.0 270 numerator = 0.0
217 denominator1 = 0.0 271 denominator1 = 0.0
218 denominator2 = 0.0 272 denominator2 = 0.0
219 273
220 for current_set in work_sets: 274 for current_set in work_sets:
221 std_dev = CalculateStandardDeviation(current_set) 275 std_dev = CalculateStandardDeviation(current_set)
222 numerator += (len(current_set) - 1) * std_dev ** 2 276 numerator += (len(current_set) - 1) * std_dev ** 2
223 denominator1 += len(current_set) - 1 277 denominator1 += len(current_set) - 1
224 denominator2 += 1.0 / len(current_set) 278 denominator2 += 1.0 / len(current_set)
225 279
226 if denominator1: 280 if denominator1:
227 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) 281 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2)
228 return 0.0 282 return 0.0
229 283
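The loop above accumulates the pooled standard deviation, sqrt(sum((n_i - 1) * s_i**2) / sum(n_i - 1)), then scales it by sqrt(sum(1.0 / n_i)). A small check with two identical made-up sets (math is already imported at the top of this module):

# Each set [2, 4, 6] has s = 2.0 and n = 3, so the result is
# sqrt((2 * 4 + 2 * 4) / (2 + 2)) * sqrt(1 / 3.0 + 1 / 3.0).
assert abs(CalculatePooledStandardError([[2, 4, 6], [2, 4, 6]]) -
           2.0 * math.sqrt(2 / 3.0)) < 1e-12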
230 284
231 def CalculateStandardError(v): 285 def CalculateStandardError(values):
232 if len(v) <= 1: 286 """Calculates the standard error of a list of values."""
287 if len(values) <= 1:
233 return 0.0 288 return 0.0
234 289
235 std_dev = CalculateStandardDeviation(v) 290 std_dev = CalculateStandardDeviation(values)
236 291
237 return std_dev / math.sqrt(len(v)) 292 return std_dev / math.sqrt(len(values))
238 293
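And the per-set standard error, which divides the sample standard deviation by sqrt(n):

assert CalculateStandardError([2, 4, 6]) == 2.0 / math.sqrt(3)
assert CalculateStandardError([42]) == 0.0  # Too few samples for an error.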
239 294
240 def IsStringFloat(string_to_check): 295 def IsStringFloat(string_to_check):
241 """Checks whether or not the given string can be converted to a floating 296 """Checks whether or not the given string can be converted to a floating
242 point number. 297 point number.
243 298
244 Args: 299 Args:
245 string_to_check: Input string to check if it can be converted to a float. 300 string_to_check: Input string to check if it can be converted to a float.
246 301
247 Returns: 302 Returns:
(...skipping 2511 matching lines...)
2759 2814
2760 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): 2815 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good):
2761 other_regressions = [] 2816 other_regressions = []
2762 previous_values = [] 2817 previous_values = []
2763 previous_id = None 2818 previous_id = None
2764 for current_id, current_data in revision_data_sorted: 2819 for current_id, current_data in revision_data_sorted:
2765 current_values = current_data['value'] 2820 current_values = current_data['value']
2766 if current_values: 2821 if current_values:
2767 current_values = current_values['values'] 2822 current_values = current_values['values']
2768 if previous_values: 2823 if previous_values:
2769 confidence = self._CalculateConfidence(previous_values, 2824 confidence = CalculateConfidence(previous_values, [current_values])
2770 [current_values]) 2825 mean_of_prev_runs = CalculateMean(sum(previous_values, []))
2771 mean_of_prev_runs = CalculateTruncatedMean( 2826 mean_of_current_runs = CalculateMean(current_values)
2772 sum(previous_values, []), 0)
2773 mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
2774 2827
2775 # Check that the potential regression is in the same direction as 2828 # Check that the potential regression is in the same direction as
2776 # the overall regression. If the mean of the previous runs < the 2829 # the overall regression. If the mean of the previous runs < the
2777 # mean of the current runs, this local regression is in the same 2830 # mean of the current runs, this local regression is in the same
2778 # direction. 2831 # direction.
2779 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs 2832 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
2780 is_same_direction = (prev_less_than_current if 2833 is_same_direction = (prev_less_than_current if
2781 bad_greater_than_good else not prev_less_than_current) 2834 bad_greater_than_good else not prev_less_than_current)
2782 2835
2783 # Only report potential regressions with high confidence. 2836 # Only report potential regressions with high confidence.
2784 if is_same_direction and confidence > 50: 2837 if is_same_direction and confidence > 50:
2785 other_regressions.append([current_id, previous_id, confidence]) 2838 other_regressions.append([current_id, previous_id, confidence])
2786 previous_values.append(current_values) 2839 previous_values.append(current_values)
2787 previous_id = current_id 2840 previous_id = current_id
2788 return other_regressions 2841 return other_regressions
2789 2842
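For reference, a sketch of the input shape this method walks, inferred from the lookups above (revision ids and numbers are made up):

# Each entry pairs a revision id with its data dict; measurements live
# under ['value']['values'], and 'value' may be empty for unbuilt or
# failed revisions, which the loop skips.
revision_data_sorted = [
    ('a1b2c3', {'value': {'values': [10.0, 10.2]}}),
    ('d4e5f6', {'value': None}),                      # No results; skipped.
    ('078910', {'value': {'values': [14.8, 15.1]}}),  # Candidate regression.
]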
2790 def _CalculateConfidence(self, working_means, broken_means):
2791 bounds_working = []
2792 bounds_broken = []
2793 for m in working_means:
2794 current_mean = CalculateTruncatedMean(m, 0)
2795 if bounds_working:
2796 bounds_working[0] = min(current_mean, bounds_working[0])
2797 bounds_working[1] = max(current_mean, bounds_working[0])
2798 else:
2799 bounds_working = [current_mean, current_mean]
2800 for m in broken_means:
2801 current_mean = CalculateTruncatedMean(m, 0)
2802 if bounds_broken:
2803 bounds_broken[0] = min(current_mean, bounds_broken[0])
2804 bounds_broken[1] = max(current_mean, bounds_broken[0])
2805 else:
2806 bounds_broken = [current_mean, current_mean]
2807 dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
2808 math.fabs(bounds_broken[0] - bounds_working[1]))
2809 working_mean = sum(working_means, [])
2810 broken_mean = sum(broken_means, [])
2811 len_working_group = CalculateStandardDeviation(working_mean)
2812 len_broken_group = CalculateStandardDeviation(broken_mean)
2813
2814 confidence = (dist_between_groups / (
2815 max(0.0001, (len_broken_group + len_working_group ))))
2816 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
2817 return confidence
2818 2843
2819 def _GetResultsDict(self, revision_data, revision_data_sorted): 2844 def _GetResultsDict(self, revision_data, revision_data_sorted):
2820 # Find range where it possibly broke. 2845 # Find range where it possibly broke.
2821 first_working_revision = None 2846 first_working_revision = None
2822 first_working_revision_index = -1 2847 first_working_revision_index = -1
2823 last_broken_revision = None 2848 last_broken_revision = None
2824 last_broken_revision_index = -1 2849 last_broken_revision_index = -1
2825 2850
2826 for i in xrange(len(revision_data_sorted)): 2851 for i in xrange(len(revision_data_sorted)):
2827 k, v = revision_data_sorted[i] 2852 k, v = revision_data_sorted[i]
(...skipping 15 matching lines...)
2843 working_means = [] 2868 working_means = []
2844 for i in xrange(first_working_revision_index, len(revision_data_sorted)): 2869 for i in xrange(first_working_revision_index, len(revision_data_sorted)):
2845 if revision_data_sorted[i][1]['value']: 2870 if revision_data_sorted[i][1]['value']:
2846 working_means.append(revision_data_sorted[i][1]['value']['values']) 2871 working_means.append(revision_data_sorted[i][1]['value']['values'])
2847 2872
2848 # Flatten the lists to calculate mean of all values. 2873 # Flatten the lists to calculate mean of all values.
2849 working_mean = sum(working_means, []) 2874 working_mean = sum(working_means, [])
2850 broken_mean = sum(broken_means, []) 2875 broken_mean = sum(broken_means, [])
2851 2876
2852 # Calculate the approximate size of the regression 2877 # Calculate the approximate size of the regression
2853 mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0) 2878 mean_of_bad_runs = CalculateMean(broken_mean)
2854 mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0) 2879 mean_of_good_runs = CalculateMean(working_mean)
2855 2880
2856 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) / 2881 regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
2857 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0 2882 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
2858 2883
2859 regression_std_err = math.fabs(CalculatePooledStandardError( 2884 regression_std_err = math.fabs(CalculatePooledStandardError(
2860 [working_mean, broken_mean]) / 2885 [working_mean, broken_mean]) /
2861 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 2886 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
2862 2887
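A worked example of the two percentages above, with made-up means:

# If the good runs average 100.0 and the bad runs average 125.0, then
#   regression_size = fabs(125.0 / max(0.0001, 100.0)) * 100.0 - 100.0
#                   = 25.0, i.e. roughly a 25% regression,
# and regression_std_err expresses the pooled standard error as a
# percentage of the same smaller mean.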
2863 # Give a "confidence" in the bisect. At the moment we use how distinct the 2888 # Give a "confidence" in the bisect. At the moment we use how distinct the
2864 # values are before and after the last broken revision, and how noisy the 2889 # values are before and after the last broken revision, and how noisy the
2865 # overall graph is. 2890 # overall graph is.
2866 confidence = self._CalculateConfidence(working_means, broken_means) 2891 confidence = CalculateConfidence(working_means, broken_means)
2867 2892
2868 culprit_revisions = [] 2893 culprit_revisions = []
2869 2894
2870 cwd = os.getcwd() 2895 cwd = os.getcwd()
2871 self.ChangeToDepotWorkingDirectory( 2896 self.ChangeToDepotWorkingDirectory(
2872 revision_data[last_broken_revision]['depot']) 2897 revision_data[last_broken_revision]['depot'])
2873 2898
2874 if revision_data[last_broken_revision]['depot'] == 'cros': 2899 if revision_data[last_broken_revision]['depot'] == 'cros':
2875 # Want to get a list of all the commits and what depots they belong 2900 # Want to get a list of all the commits and what depots they belong
2876 # to so that we can grab info about each. 2901 # to so that we can grab info about each.
(...skipping 505 matching lines...)
3382 # The perf dashboard scrapes the "results" step in order to comment on 3407 # The perf dashboard scrapes the "results" step in order to comment on
3383 # bugs. If you change this, please update the perf dashboard as well. 3408 # bugs. If you change this, please update the perf dashboard as well.
3384 bisect_utils.OutputAnnotationStepStart('Results') 3409 bisect_utils.OutputAnnotationStepStart('Results')
3385 print 'Error: %s' % e.message 3410 print 'Error: %s' % e.message
3386 if opts.output_buildbot_annotations: 3411 if opts.output_buildbot_annotations:
3387 bisect_utils.OutputAnnotationStepClosed() 3412 bisect_utils.OutputAnnotationStepClosed()
3388 return 1 3413 return 1
3389 3414
3390 if __name__ == '__main__': 3415 if __name__ == '__main__':
3391 sys.exit(main()) 3416 sys.exit(main())