OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
7 | 7 |
8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
(...skipping 14 matching lines...) Expand all Loading... |
25 revision were merged in. | 25 revision were merged in. |
26 | 26 |
27 | 27 |
28 An example usage (using git hashes): | 28 An example usage (using git hashes): |
29 | 29 |
30 ./tools/bisect-perf-regression.py -c\ | 30 ./tools/bisect-perf-regression.py -c\ |
31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ | 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ |
32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ | 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ |
33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ | 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ |
34 -m shutdown/simple-user-quit | 34 -m shutdown/simple-user-quit |
35 | |
36 """ | 35 """ |
37 | 36 |
38 import copy | 37 import copy |
39 import datetime | 38 import datetime |
40 import errno | 39 import errno |
41 import hashlib | 40 import hashlib |
42 import math | 41 import math |
43 import optparse | 42 import optparse |
44 import os | 43 import os |
45 import re | 44 import re |
46 import shlex | 45 import shlex |
47 import shutil | 46 import shutil |
48 import StringIO | 47 import StringIO |
49 import sys | 48 import sys |
50 import time | 49 import time |
51 import zipfile | 50 import zipfile |
52 | 51 |
53 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) | 52 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) |
54 | 53 |
55 from auto_bisect import bisect_utils | 54 from auto_bisect import bisect_utils |
| 55 from auto_bisect import math_utils |
56 from auto_bisect import post_perf_builder_job as bisect_builder | 56 from auto_bisect import post_perf_builder_job as bisect_builder |
57 from auto_bisect import source_control as source_control_module | 57 from auto_bisect import source_control as source_control_module |
58 from telemetry.util import cloud_storage | 58 from telemetry.util import cloud_storage |
59 | 59 |
60 # The additional repositories that might need to be bisected. | 60 # The additional repositories that might need to be bisected. |
61 # If the repository has any dependent repositories (such as skia/src needs | 61 # If the repository has any dependent repositories (such as skia/src needs |
62 # skia/include and skia/gyp to be updated), specify them in the 'depends' | 62 # skia/include and skia/gyp to be updated), specify them in the 'depends' |
63 # so that they're synced appropriately. | 63 # so that they're synced appropriately. |
64 # Format is: | 64 # Format is: |
65 # src: path to the working directory. | 65 # src: path to the working directory. |
(...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
252 | 252 |
253 def _AddAdditionalDepotInfo(depot_info): | 253 def _AddAdditionalDepotInfo(depot_info): |
254 """Adds additional depot info to the global depot variables.""" | 254 """Adds additional depot info to the global depot variables.""" |
255 global DEPOT_DEPS_NAME | 255 global DEPOT_DEPS_NAME |
256 global DEPOT_NAMES | 256 global DEPOT_NAMES |
257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + | 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + |
258 depot_info.items()) | 258 depot_info.items()) |
259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() | 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() |
260 | 260 |
261 | 261 |
262 def CalculateTruncatedMean(data_set, truncate_percent): | 262 def ConfidenceScore(good_results_lists, bad_results_lists): |
263 """Calculates the truncated mean of a set of values. | |
264 | |
265 Note that this isn't just the mean of the set of values with the highest | |
266 and lowest values discarded; the non-discarded values are also weighted | |
267 differently depending how many values are discarded. | |
268 | |
269 Args: | |
270 data_set: Non-empty list of values. | |
271 truncate_percent: The % from the upper and lower portions of the data set | |
272 to discard, expressed as a value in [0, 1]. | |
273 | |
274 Returns: | |
275 The truncated mean as a float. | |
276 | |
277 Raises: | |
278 TypeError: The data set was empty after discarding values. | |
279 """ | |
280 if len(data_set) > 2: | |
281 data_set = sorted(data_set) | |
282 | |
283 discard_num_float = len(data_set) * truncate_percent | |
284 discard_num_int = int(math.floor(discard_num_float)) | |
285 kept_weight = len(data_set) - discard_num_float * 2 | |
286 | |
287 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] | |
288 | |
289 weight_left = 1.0 - (discard_num_float - discard_num_int) | |
290 | |
291 if weight_left < 1: | |
292 # If the % to discard leaves a fractional portion, need to weight those | |
293 # values. | |
294 unweighted_vals = data_set[1:len(data_set)-1] | |
295 weighted_vals = [data_set[0], data_set[len(data_set)-1]] | |
296 weighted_vals = [w * weight_left for w in weighted_vals] | |
297 data_set = weighted_vals + unweighted_vals | |
298 else: | |
299 kept_weight = len(data_set) | |
300 | |
301 truncated_mean = reduce(lambda x, y: float(x) + float(y), | |
302 data_set) / kept_weight | |
303 | |
304 return truncated_mean | |
305 | |
306 | |
307 def CalculateMean(values): | |
308 """Calculates the arithmetic mean of a list of values.""" | |
309 return CalculateTruncatedMean(values, 0.0) | |
310 | |
311 | |
312 def CalculateConfidence(good_results_lists, bad_results_lists): | |
313 """Calculates a confidence percentage. | 263 """Calculates a confidence percentage. |
314 | 264 |
315 This is calculated based on how distinct the "good" and "bad" values are, | 265 This is calculated based on how distinct the "good" and "bad" values are, |
316 and how noisy the results are. More precisely, the confidence is the quotient | 266 and how noisy the results are. More precisely, the confidence is the quotient |
317 of the difference between the closest values across the good and bad groups | 267 of the difference between the closest values across the good and bad groups |
318 and the sum of the standard deviations of the good and bad groups. | 268 and the sum of the standard deviations of the good and bad groups. |
319 | 269 |
320 TODO(qyearsley): Replace this confidence function with a function that | 270 TODO(qyearsley): Replace this confidence function with a function that |
321 uses a Student's t-test. The confidence would be (1 - p-value), where | 271 uses a Student's t-test. The confidence would be (1 - p-value), where |
322 p-value is the probability of obtaining the given set of good and bad | 272 p-value is the probability of obtaining the given set of good and bad |
323 values just by chance. | 273 values just by chance. |
324 | 274 |
325 Args: | 275 Args: |
326 good_results_lists: A list of lists of "good" result numbers. | 276 good_results_lists: A list of lists of "good" result numbers. |
327 bad_results_lists: A list of lists of "bad" result numbers. | 277 bad_results_lists: A list of lists of "bad" result numbers. |
328 | 278 |
329 Returns: | 279 Returns: |
330 A number in the range [0, 100]. | 280 A number in the range [0, 100]. |
331 """ | 281 """ |
332 # Get the distance between the two groups. | 282 # Get the distance between the two groups. |
333 means_good = map(CalculateMean, good_results_lists) | 283 means_good = map(math_utils.Mean, good_results_lists) |
334 means_bad = map(CalculateMean, bad_results_lists) | 284 means_bad = map(math_utils.Mean, bad_results_lists) |
335 bounds_good = (min(means_good), max(means_good)) | 285 bounds_good = (min(means_good), max(means_good)) |
336 bounds_bad = (min(means_bad), max(means_bad)) | 286 bounds_bad = (min(means_bad), max(means_bad)) |
337 dist_between_groups = min( | 287 dist_between_groups = min( |
338 math.fabs(bounds_bad[1] - bounds_good[0]), | 288 math.fabs(bounds_bad[1] - bounds_good[0]), |
339 math.fabs(bounds_bad[0] - bounds_good[1])) | 289 math.fabs(bounds_bad[0] - bounds_good[1])) |
340 | 290 |
341 # Get the sum of the standard deviations of the two groups. | 291 # Get the sum of the standard deviations of the two groups. |
342 good_results_flattened = sum(good_results_lists, []) | 292 good_results_flattened = sum(good_results_lists, []) |
343 bad_results_flattened = sum(bad_results_lists, []) | 293 bad_results_flattened = sum(bad_results_lists, []) |
344 stddev_good = CalculateStandardDeviation(good_results_flattened) | 294 stddev_good = math_utils.StandardDeviation(good_results_flattened) |
345 stddev_bad = CalculateStandardDeviation(bad_results_flattened) | 295 stddev_bad = math_utils.StandardDeviation(bad_results_flattened) |
346 stddev_sum = stddev_good + stddev_bad | 296 stddev_sum = stddev_good + stddev_bad |
347 | 297 |
348 confidence = dist_between_groups / (max(0.0001, stddev_sum)) | 298 confidence = dist_between_groups / (max(0.0001, stddev_sum)) |
349 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) | 299 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) |
350 return confidence | 300 return confidence |
351 | 301 |
352 | 302 |
353 def CalculateStandardDeviation(values): | |
354 """Calculates the sample standard deviation of the given list of values.""" | |
355 if len(values) == 1: | |
356 return 0.0 | |
357 | |
358 mean = CalculateMean(values) | |
359 differences_from_mean = [float(x) - mean for x in values] | |
360 squared_differences = [float(x * x) for x in differences_from_mean] | |
361 variance = sum(squared_differences) / (len(values) - 1) | |
362 std_dev = math.sqrt(variance) | |
363 | |
364 return std_dev | |
365 | |
366 | |
367 def CalculateRelativeChange(before, after): | |
368 """Returns the relative change of before and after, relative to before. | |
369 | |
370 There are several different ways to define relative difference between | |
371 two numbers; sometimes it is defined as relative to the smaller number, | |
372 or to the mean of the two numbers. This version returns the difference | |
373 relative to the first of the two numbers. | |
374 | |
375 Args: | |
376 before: A number representing an earlier value. | |
377 after: Another number, representing a later value. | |
378 | |
379 Returns: | |
380 A non-negative floating point number; 0.1 represents a 10% change. | |
381 """ | |
382 if before == after: | |
383 return 0.0 | |
384 if before == 0: | |
385 return float('nan') | |
386 difference = after - before | |
387 return math.fabs(difference / before) | |
388 | |
389 | |
390 def CalculatePooledStandardError(work_sets): | |
391 numerator = 0.0 | |
392 denominator1 = 0.0 | |
393 denominator2 = 0.0 | |
394 | |
395 for current_set in work_sets: | |
396 std_dev = CalculateStandardDeviation(current_set) | |
397 numerator += (len(current_set) - 1) * std_dev ** 2 | |
398 denominator1 += len(current_set) - 1 | |
399 denominator2 += 1.0 / len(current_set) | |
400 | |
401 if denominator1: | |
402 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) | |
403 return 0.0 | |
404 | |
405 | |
406 def CalculateStandardError(values): | |
407 """Calculates the standard error of a list of values.""" | |
408 if len(values) <= 1: | |
409 return 0.0 | |
410 | |
411 std_dev = CalculateStandardDeviation(values) | |
412 | |
413 return std_dev / math.sqrt(len(values)) | |
414 | |
415 | |
416 | |
417 | |
418 def GetSHA1HexDigest(contents): | 303 def GetSHA1HexDigest(contents): |
419 """Returns the SHA-1 hex digest of the given contents.""" | 304 """Returns the SHA-1 hex digest of the given contents.""" |
420 return hashlib.sha1(contents).hexdigest() | 305 return hashlib.sha1(contents).hexdigest() |
421 | 306 |
422 | 307 |
423 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): | 308 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): |
424 """Gets the archive file name for the given revision.""" | 309 """Gets the archive file name for the given revision.""" |
425 def PlatformName(): | 310 def PlatformName(): |
426 """Return a string to be used in paths for the platform.""" | 311 """Return a string to be used in paths for the platform.""" |
427 if bisect_utils.IsWindowsHost(): | 312 if bisect_utils.IsWindowsHost(): |
(...skipping 1546 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1974 'std_err': 0.0, | 1859 'std_err': 0.0, |
1975 'std_dev': 0.0, | 1860 'std_dev': 0.0, |
1976 'values': metric_values, | 1861 'values': metric_values, |
1977 } | 1862 } |
1978 | 1863 |
1979 print 'Results of performance test: Command returned with %d' % ( | 1864 print 'Results of performance test: Command returned with %d' % ( |
1980 overall_return_code) | 1865 overall_return_code) |
1981 print | 1866 print |
1982 else: | 1867 else: |
1983 # Need to get the average value if there were multiple values. | 1868 # Need to get the average value if there were multiple values. |
1984 truncated_mean = CalculateTruncatedMean(metric_values, | 1869 truncated_mean = math_utils.TruncatedMean( |
1985 self.opts.truncate_percent) | 1870 metric_values, self.opts.truncate_percent) |
1986 standard_err = CalculateStandardError(metric_values) | 1871 standard_err = math_utils.StandardError(metric_values) |
1987 standard_dev = CalculateStandardDeviation(metric_values) | 1872 standard_dev = math_utils.StandardDeviation(metric_values) |
1988 | 1873 |
1989 if self._IsBisectModeStandardDeviation(): | 1874 if self._IsBisectModeStandardDeviation(): |
1990 metric_values = [standard_dev] | 1875 metric_values = [standard_dev] |
1991 | 1876 |
1992 values = { | 1877 values = { |
1993 'mean': truncated_mean, | 1878 'mean': truncated_mean, |
1994 'std_err': standard_err, | 1879 'std_err': standard_err, |
1995 'std_dev': standard_dev, | 1880 'std_dev': standard_dev, |
1996 'values': metric_values, | 1881 'values': metric_values, |
1997 } | 1882 } |
(...skipping 1169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3167 | 3052 |
3168 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): | 3053 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): |
3169 other_regressions = [] | 3054 other_regressions = [] |
3170 previous_values = [] | 3055 previous_values = [] |
3171 previous_id = None | 3056 previous_id = None |
3172 for current_id, current_data in revision_data_sorted: | 3057 for current_id, current_data in revision_data_sorted: |
3173 current_values = current_data['value'] | 3058 current_values = current_data['value'] |
3174 if current_values: | 3059 if current_values: |
3175 current_values = current_values['values'] | 3060 current_values = current_values['values'] |
3176 if previous_values: | 3061 if previous_values: |
3177 confidence = CalculateConfidence(previous_values, [current_values]) | 3062 confidence = ConfidenceScore(previous_values, [current_values]) |
3178 mean_of_prev_runs = CalculateMean(sum(previous_values, [])) | 3063 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
3179 mean_of_current_runs = CalculateMean(current_values) | 3064 mean_of_current_runs = math_utils.Mean(current_values) |
3180 | 3065 |
3181 # Check that the potential regression is in the same direction as | 3066 # Check that the potential regression is in the same direction as |
3182 # the overall regression. If the mean of the previous runs < the | 3067 # the overall regression. If the mean of the previous runs < the |
3183 # mean of the current runs, this local regression is in same | 3068 # mean of the current runs, this local regression is in same |
3184 # direction. | 3069 # direction. |
3185 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs | 3070 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
3186 is_same_direction = (prev_less_than_current if | 3071 is_same_direction = (prev_less_than_current if |
3187 bad_greater_than_good else not prev_less_than_current) | 3072 bad_greater_than_good else not prev_less_than_current) |
3188 | 3073 |
3189 # Only report potential regressions with high confidence. | 3074 # Only report potential regressions with high confidence. |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3221 working_means = [] | 3106 working_means = [] |
3222 for i in xrange(first_working_revision_index, len(revision_data_sorted)): | 3107 for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
3223 if revision_data_sorted[i][1]['value']: | 3108 if revision_data_sorted[i][1]['value']: |
3224 working_means.append(revision_data_sorted[i][1]['value']['values']) | 3109 working_means.append(revision_data_sorted[i][1]['value']['values']) |
3225 | 3110 |
3226 # Flatten the lists to calculate mean of all values. | 3111 # Flatten the lists to calculate mean of all values. |
3227 working_mean = sum(working_means, []) | 3112 working_mean = sum(working_means, []) |
3228 broken_mean = sum(broken_means, []) | 3113 broken_mean = sum(broken_means, []) |
3229 | 3114 |
3230 # Calculate the approximate size of the regression | 3115 # Calculate the approximate size of the regression |
3231 mean_of_bad_runs = CalculateMean(broken_mean) | 3116 mean_of_bad_runs = math_utils.Mean(broken_mean) |
3232 mean_of_good_runs = CalculateMean(working_mean) | 3117 mean_of_good_runs = math_utils.Mean(working_mean) |
3233 | 3118 |
3234 regression_size = 100 * CalculateRelativeChange(mean_of_good_runs, | 3119 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
3235 mean_of_bad_runs) | 3120 mean_of_bad_runs) |
3236 if math.isnan(regression_size): | 3121 if math.isnan(regression_size): |
3237 regression_size = 'zero-to-nonzero' | 3122 regression_size = 'zero-to-nonzero' |
3238 | 3123 |
3239 regression_std_err = math.fabs(CalculatePooledStandardError( | 3124 regression_std_err = math.fabs(math_utils.PooledStandardError( |
3240 [working_mean, broken_mean]) / | 3125 [working_mean, broken_mean]) / |
3241 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 3126 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
3242 | 3127 |
3243 # Give a "confidence" in the bisect. At the moment we use how distinct the | 3128 # Give a "confidence" in the bisect. At the moment we use how distinct the |
3244 # values are before and after the last broken revision, and how noisy the | 3129 # values are before and after the last broken revision, and how noisy the |
3245 # overall graph is. | 3130 # overall graph is. |
3246 confidence = CalculateConfidence(working_means, broken_means) | 3131 confidence = ConfidenceScore(working_means, broken_means) |
3247 | 3132 |
3248 culprit_revisions = [] | 3133 culprit_revisions = [] |
3249 | 3134 |
3250 cwd = os.getcwd() | 3135 cwd = os.getcwd() |
3251 self.ChangeToDepotWorkingDirectory( | 3136 self.ChangeToDepotWorkingDirectory( |
3252 revision_data[last_broken_revision]['depot']) | 3137 revision_data[last_broken_revision]['depot']) |
3253 | 3138 |
3254 if revision_data[last_broken_revision]['depot'] == 'cros': | 3139 if revision_data[last_broken_revision]['depot'] == 'cros': |
3255 # Want to get a list of all the commits and what depots they belong | 3140 # Want to get a list of all the commits and what depots they belong |
3256 # to so that we can grab info about each. | 3141 # to so that we can grab info about each. |
(...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3764 except RuntimeError, e: | 3649 except RuntimeError, e: |
3765 if opts.output_buildbot_annotations: | 3650 if opts.output_buildbot_annotations: |
3766 # The perf dashboard scrapes the "results" step in order to comment on | 3651 # The perf dashboard scrapes the "results" step in order to comment on |
3767 # bugs. If you change this, please update the perf dashboard as well. | 3652 # bugs. If you change this, please update the perf dashboard as well. |
3768 bisect_utils.OutputAnnotationStepStart('Results') | 3653 bisect_utils.OutputAnnotationStepStart('Results') |
3769 print 'Error: %s' % e.message | 3654 print 'Error: %s' % e.message |
3770 if opts.output_buildbot_annotations: | 3655 if opts.output_buildbot_annotations: |
3771 bisect_utils.OutputAnnotationStepClosed() | 3656 bisect_utils.OutputAnnotationStepClosed() |
3772 return 1 | 3657 return 1 |
3773 | 3658 |
| 3659 |
3774 if __name__ == '__main__': | 3660 if __name__ == '__main__': |
3775 sys.exit(main()) | 3661 sys.exit(main()) |
OLD | NEW |