| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
| 7 | 7 |
| 8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
| 9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
| 10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
| (...skipping 14 matching lines...) |
| 25 revision were merged in. | 25 revision were merged in. |
| 26 | 26 |
| 27 | 27 |
| 28 An example usage (using git hashes): | 28 An example usage (using git hashes): |
| 29 | 29 |
| 30 ./tools/bisect-perf-regression.py -c\ | 30 ./tools/bisect-perf-regression.py -c\ |
| 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ | 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ |
| 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ | 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ |
| 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ | 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ |
| 34 -m shutdown/simple-user-quit | 34 -m shutdown/simple-user-quit |
| 35 | |
| 36 """ | 35 """ |
| 37 | 36 |
| 38 import copy | 37 import copy |
| 39 import datetime | 38 import datetime |
| 40 import errno | 39 import errno |
| 41 import hashlib | 40 import hashlib |
| 42 import math | 41 import math |
| 43 import optparse | 42 import optparse |
| 44 import os | 43 import os |
| 45 import re | 44 import re |
| 46 import shlex | 45 import shlex |
| 47 import shutil | 46 import shutil |
| 48 import StringIO | 47 import StringIO |
| 49 import sys | 48 import sys |
| 50 import time | 49 import time |
| 51 import zipfile | 50 import zipfile |
| 52 | 51 |
| 53 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) | 52 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) |
| 54 | 53 |
| 55 from auto_bisect import bisect_utils | 54 from auto_bisect import bisect_utils |
| 55 from auto_bisect import math_utils |
| 56 from auto_bisect import post_perf_builder_job as bisect_builder | 56 from auto_bisect import post_perf_builder_job as bisect_builder |
| 57 from auto_bisect import source_control as source_control_module | 57 from auto_bisect import source_control as source_control_module |
| 58 from telemetry.util import cloud_storage | 58 from telemetry.util import cloud_storage |
| 59 | 59 |
| 60 # The additional repositories that might need to be bisected. | 60 # The additional repositories that might need to be bisected. |
| 61 # If the repository has any dependent repositories (such as skia/src, which | 61 # If the repository has any dependent repositories (such as skia/src, which |
| 62 # needs skia/include and skia/gyp to be updated), specify them in the | 62 # needs skia/include and skia/gyp to be updated), specify them in the |
| 63 # 'depends' list so that they're synced appropriately. | 63 # 'depends' list so that they're synced appropriately. |
| 64 # Format is: | 64 # Format is: |
| 65 # src: path to the working directory. | 65 # src: path to the working directory. |
| (...skipping 186 matching lines...) |
| 252 | 252 |
| 253 def _AddAdditionalDepotInfo(depot_info): | 253 def _AddAdditionalDepotInfo(depot_info): |
| 254 """Adds additional depot info to the global depot variables.""" | 254 """Adds additional depot info to the global depot variables.""" |
| 255 global DEPOT_DEPS_NAME | 255 global DEPOT_DEPS_NAME |
| 256 global DEPOT_NAMES | 256 global DEPOT_NAMES |
| 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + | 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + |
| 258 depot_info.items()) | 258 depot_info.items()) |
| 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() | 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() |
| 260 | 260 |
| 261 | 261 |
| 262 def CalculateTruncatedMean(data_set, truncate_percent): | 262 def ConfidenceScore(good_results_lists, bad_results_lists): |
| 263 """Calculates the truncated mean of a set of values. | |
| 264 | |
| 265 Note that this isn't just the mean of the set of values with the highest | |
| 266 and lowest values discarded; the non-discarded values are also weighted | |
| 267 differently depending on how many values are discarded. | |
| 268 | |
| 269 Args: | |
| 270 data_set: Non-empty list of values. | |
| 271 truncate_percent: The % from the upper and lower portions of the data set | |
| 272 to discard, expressed as a value in [0, 1]. | |
| 273 | |
| 274 Returns: | |
| 275 The truncated mean as a float. | |
| 276 | |
| 277 Raises: | |
| 278 TypeError: The data set was empty after discarding values. | |
| 279 """ | |
| 280 if len(data_set) > 2: | |
| 281 data_set = sorted(data_set) | |
| 282 | |
| 283 discard_num_float = len(data_set) * truncate_percent | |
| 284 discard_num_int = int(math.floor(discard_num_float)) | |
| 285 kept_weight = len(data_set) - discard_num_float * 2 | |
| 286 | |
| 287 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] | |
| 288 | |
| 289 weight_left = 1.0 - (discard_num_float - discard_num_int) | |
| 290 | |
| 291 if weight_left < 1: | |
| 292 # If the % to discard leaves a fractional portion, need to weight those | |
| 293 # values. | |
| 294 unweighted_vals = data_set[1:len(data_set)-1] | |
| 295 weighted_vals = [data_set[0], data_set[len(data_set)-1]] | |
| 296 weighted_vals = [w * weight_left for w in weighted_vals] | |
| 297 data_set = weighted_vals + unweighted_vals | |
| 298 else: | |
| 299 kept_weight = len(data_set) | |
| 300 | |
| 301 truncated_mean = reduce(lambda x, y: float(x) + float(y), | |
| 302 data_set) / kept_weight | |
| 303 | |
| 304 return truncated_mean | |
| 305 | |
| 306 | |
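A small worked example of the fractional weighting described in the docstring above, run against the function as written in the old code (the math_utils.TruncatedMean that this CL switches to is assumed to behave the same way, since the change is a refactor):

    # 5 values with truncate_percent=0.25: 5 * 0.25 = 1.25 values are
    # discarded from each end. One whole value goes from each end, and
    # the new end values (2 and 4) keep only 1 - 0.25 = 0.75 of their
    # weight.
    data = [1, 2, 3, 4, 5]
    # Weighted sum: 2*0.75 + 3 + 4*0.75 = 7.5; kept weight: 5 - 2*1.25 = 2.5.
    print CalculateTruncatedMean(data, 0.25)  # prints 3.0
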
| 307 def CalculateMean(values): | |
| 308 """Calculates the arithmetic mean of a list of values.""" | |
| 309 return CalculateTruncatedMean(values, 0.0) | |
| 310 | |
| 311 | |
| 312 def CalculateConfidence(good_results_lists, bad_results_lists): | |
| 313 """Calculates a confidence percentage. | 263 """Calculates a confidence percentage. |
| 314 | 264 |
| 315 This is calculated based on how distinct the "good" and "bad" values are, | 265 This is calculated based on how distinct the "good" and "bad" values are, |
| 316 and how noisy the results are. More precisely, the confidence is the quotient | 266 and how noisy the results are. More precisely, the confidence is the quotient |
| 317 of the difference between the closest values across the good and bad groups | 267 of the difference between the closest values across the good and bad groups |
| 318 and the sum of the standard deviations of the good and bad groups. | 268 and the sum of the standard deviations of the good and bad groups. |
| 319 | 269 |
| 320 TODO(qyearsley): Replace this confidence function with a function that | 270 TODO(qyearsley): Replace this confidence function with a function that |
| 321 uses a Student's t-test. The confidence would be (1 - p-value), where | 271 uses a Student's t-test. The confidence would be (1 - p-value), where |
| 322 p-value is the probability of obtaining the given set of good and bad | 272 p-value is the probability of obtaining the given set of good and bad |
| 323 values just by chance. | 273 values just by chance. |
| 324 | 274 |
| 325 Args: | 275 Args: |
| 326 good_results_lists: A list of lists of "good" result numbers. | 276 good_results_lists: A list of lists of "good" result numbers. |
| 327 bad_results_lists: A list of lists of "bad" result numbers. | 277 bad_results_lists: A list of lists of "bad" result numbers. |
| 328 | 278 |
| 329 Returns: | 279 Returns: |
| 330 A number in the range [0, 100]. | 280 A number in the range [0, 100]. |
| 331 """ | 281 """ |
| 332 # Get the distance between the two groups. | 282 # Get the distance between the two groups. |
| 333 means_good = map(CalculateMean, good_results_lists) | 283 means_good = map(math_utils.Mean, good_results_lists) |
| 334 means_bad = map(CalculateMean, bad_results_lists) | 284 means_bad = map(math_utils.Mean, bad_results_lists) |
| 335 bounds_good = (min(means_good), max(means_good)) | 285 bounds_good = (min(means_good), max(means_good)) |
| 336 bounds_bad = (min(means_bad), max(means_bad)) | 286 bounds_bad = (min(means_bad), max(means_bad)) |
| 337 dist_between_groups = min( | 287 dist_between_groups = min( |
| 338 math.fabs(bounds_bad[1] - bounds_good[0]), | 288 math.fabs(bounds_bad[1] - bounds_good[0]), |
| 339 math.fabs(bounds_bad[0] - bounds_good[1])) | 289 math.fabs(bounds_bad[0] - bounds_good[1])) |
| 340 | 290 |
| 341 # Get the sum of the standard deviations of the two groups. | 291 # Get the sum of the standard deviations of the two groups. |
| 342 good_results_flattened = sum(good_results_lists, []) | 292 good_results_flattened = sum(good_results_lists, []) |
| 343 bad_results_flattened = sum(bad_results_lists, []) | 293 bad_results_flattened = sum(bad_results_lists, []) |
| 344 stddev_good = CalculateStandardDeviation(good_results_flattened) | 294 stddev_good = math_utils.StandardDeviation(good_results_flattened) |
| 345 stddev_bad = CalculateStandardDeviation(bad_results_flattened) | 295 stddev_bad = math_utils.StandardDeviation(bad_results_flattened) |
| 346 stddev_sum = stddev_good + stddev_bad | 296 stddev_sum = stddev_good + stddev_bad |
| 347 | 297 |
| 348 confidence = dist_between_groups / (max(0.0001, stddev_sum)) | 298 confidence = dist_between_groups / (max(0.0001, stddev_sum)) |
| 349 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) | 299 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) |
| 350 return confidence | 300 return confidence |
| 351 | 301 |
| 352 | 302 |
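As a sanity check of the quotient described in the docstring, here is an illustrative run against the implementation above (made-up numbers):

    good = [[10.0, 10.1], [10.2, 10.0]]  # per-list means: 10.05, 10.1
    bad = [[12.0, 12.1], [11.9, 12.0]]   # per-list means: 12.05, 11.95
    # Distance between the closest group bounds:
    #   min(|12.05 - 10.05|, |11.95 - 10.1|) = 1.85
    # Sum of the flattened standard deviations: ~0.096 + ~0.082 = ~0.178,
    # so the quotient is clamped to 1.0 and reported as 100.
    print ConfidenceScore(good, bad)  # prints 100
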
| 353 def CalculateStandardDeviation(values): | |
| 354 """Calculates the sample standard deviation of the given list of values.""" | |
| 355 if len(values) == 1: | |
| 356 return 0.0 | |
| 357 | |
| 358 mean = CalculateMean(values) | |
| 359 differences_from_mean = [float(x) - mean for x in values] | |
| 360 squared_differences = [float(x * x) for x in differences_from_mean] | |
| 361 variance = sum(squared_differences) / (len(values) - 1) | |
| 362 std_dev = math.sqrt(variance) | |
| 363 | |
| 364 return std_dev | |
| 365 | |
| 366 | |
| 367 def CalculateRelativeChange(before, after): | |
| 368 """Returns the relative change of before and after, relative to before. | |
| 369 | |
| 370 There are several different ways to define relative difference between | |
| 371 two numbers; sometimes it is defined as relative to the smaller number, | |
| 372 or to the mean of the two numbers. This version returns the difference | |
| 373 relative to the first of the two numbers. | |
| 374 | |
| 375 Args: | |
| 376 before: A number representing an earlier value. | |
| 377 after: Another number, representing a later value. | |
| 378 | |
| 379 Returns: | |
| 380 A non-negative floating point number; 0.1 represents a 10% change. | |
| 381 """ | |
| 382 if before == after: | |
| 383 return 0.0 | |
| 384 if before == 0: | |
| 385 return float('nan') | |
| 386 difference = after - before | |
| 387 return math.fabs(difference / before) | |
| 388 | |
| 389 | |
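For example (illustrative values; the floats matter here, because with int arguments the division above truncates under Python 2):

    print CalculateRelativeChange(200.0, 150.0)  # 0.25, i.e. a 25% change
    print CalculateRelativeChange(150.0, 200.0)  # ~0.333, relative to 'before'
    print CalculateRelativeChange(0.0, 5.0)      # nan: no baseline to compare
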
| 390 def CalculatePooledStandardError(work_sets): | |
| 391 numerator = 0.0 | |
| 392 denominator1 = 0.0 | |
| 393 denominator2 = 0.0 | |
| 394 | |
| 395 for current_set in work_sets: | |
| 396 std_dev = CalculateStandardDeviation(current_set) | |
| 397 numerator += (len(current_set) - 1) * std_dev ** 2 | |
| 398 denominator1 += len(current_set) - 1 | |
| 399 denominator2 += 1.0 / len(current_set) | |
| 400 | |
| 401 if denominator1: | |
| 402 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) | |
| 403 return 0.0 | |
| 404 | |
| 405 | |
| 406 def CalculateStandardError(values): | |
| 407 """Calculates the standard error of a list of values.""" | |
| 408 if len(values) <= 1: | |
| 409 return 0.0 | |
| 410 | |
| 411 std_dev = CalculateStandardDeviation(values) | |
| 412 | |
| 413 return std_dev / math.sqrt(len(values)) | |
| 414 | |
| 415 | |
| 416 | |
| 417 | |
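The standard error above is the sample standard deviation divided by sqrt(n); the pooled variant first combines the per-set variances weighted by their degrees of freedom, then does the same. An illustration with made-up numbers:

    values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
    # Sample standard deviation is ~2.138, so the standard error is
    # ~2.138 / sqrt(8) = ~0.756.
    print CalculateStandardError(values)
    # Pooled across two sets of three values each: ~1.33.
    print CalculatePooledStandardError([[2.0, 4.0, 4.0], [5.0, 7.0, 9.0]])
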
| 418 def GetSHA1HexDigest(contents): | 303 def GetSHA1HexDigest(contents): |
| 419 """Returns secured hash containing hexadecimal for the given contents.""" | 304 """Returns secured hash containing hexadecimal for the given contents.""" |
| 420 return hashlib.sha1(contents).hexdigest() | 305 return hashlib.sha1(contents).hexdigest() |
| 421 | 306 |
| 422 | 307 |
| 423 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): | 308 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): |
| 424 """Gets the archive file name for the given revision.""" | 309 """Gets the archive file name for the given revision.""" |
| 425 def PlatformName(): | 310 def PlatformName(): |
| 426 """Return a string to be used in paths for the platform.""" | 311 """Return a string to be used in paths for the platform.""" |
| 427 if bisect_utils.IsWindowsHost(): | 312 if bisect_utils.IsWindowsHost(): |
| (...skipping 1546 matching lines...) |
| 1974 'std_err': 0.0, | 1859 'std_err': 0.0, |
| 1975 'std_dev': 0.0, | 1860 'std_dev': 0.0, |
| 1976 'values': metric_values, | 1861 'values': metric_values, |
| 1977 } | 1862 } |
| 1978 | 1863 |
| 1979 print 'Results of performance test: Command returned with %d' % ( | 1864 print 'Results of performance test: Command returned with %d' % ( |
| 1980 overall_return_code) | 1865 overall_return_code) |
| 1981 print | 1866 print |
| 1982 else: | 1867 else: |
| 1983 # Need to get the average value if there were multiple values. | 1868 # Need to get the average value if there were multiple values. |
| 1984 truncated_mean = CalculateTruncatedMean(metric_values, | 1869 truncated_mean = math_utils.TruncatedMean( |
| 1985 self.opts.truncate_percent) | 1870 metric_values, self.opts.truncate_percent) |
| 1986 standard_err = CalculateStandardError(metric_values) | 1871 standard_err = math_utils.StandardError(metric_values) |
| 1987 standard_dev = CalculateStandardDeviation(metric_values) | 1872 standard_dev = math_utils.StandardDeviation(metric_values) |
| 1988 | 1873 |
| 1989 if self._IsBisectModeStandardDeviation(): | 1874 if self._IsBisectModeStandardDeviation(): |
| 1990 metric_values = [standard_dev] | 1875 metric_values = [standard_dev] |
| 1991 | 1876 |
| 1992 values = { | 1877 values = { |
| 1993 'mean': truncated_mean, | 1878 'mean': truncated_mean, |
| 1994 'std_err': standard_err, | 1879 'std_err': standard_err, |
| 1995 'std_dev': standard_dev, | 1880 'std_dev': standard_dev, |
| 1996 'values': metric_values, | 1881 'values': metric_values, |
| 1997 } | 1882 } |
| (...skipping 1169 matching lines...) |
| 3167 | 3052 |
| 3168 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): | 3053 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): |
| 3169 other_regressions = [] | 3054 other_regressions = [] |
| 3170 previous_values = [] | 3055 previous_values = [] |
| 3171 previous_id = None | 3056 previous_id = None |
| 3172 for current_id, current_data in revision_data_sorted: | 3057 for current_id, current_data in revision_data_sorted: |
| 3173 current_values = current_data['value'] | 3058 current_values = current_data['value'] |
| 3174 if current_values: | 3059 if current_values: |
| 3175 current_values = current_values['values'] | 3060 current_values = current_values['values'] |
| 3176 if previous_values: | 3061 if previous_values: |
| 3177 confidence = CalculateConfidence(previous_values, [current_values]) | 3062 confidence = ConfidenceScore(previous_values, [current_values]) |
| 3178 mean_of_prev_runs = CalculateMean(sum(previous_values, [])) | 3063 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
| 3179 mean_of_current_runs = CalculateMean(current_values) | 3064 mean_of_current_runs = math_utils.Mean(current_values) |
| 3180 | 3065 |
| 3181 # Check that the potential regression is in the same direction as | 3066 # Check that the potential regression is in the same direction as |
| 3182 # the overall regression. If the mean of the previous runs < the | 3067 # the overall regression. If the mean of the previous runs < the |
| 3183 # mean of the current runs, this local regression is in the same | 3068 # mean of the current runs, this local regression is in the same |
| 3184 # direction. | 3069 # direction. |
| 3185 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs | 3070 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
| 3186 is_same_direction = (prev_less_than_current if | 3071 is_same_direction = (prev_less_than_current if |
| 3187 bad_greater_than_good else not prev_less_than_current) | 3072 bad_greater_than_good else not prev_less_than_current) |
| 3188 | 3073 |
| 3189 # Only report potential regressions with high confidence. | 3074 # Only report potential regressions with high confidence. |
| (...skipping 31 matching lines...) |
| 3221 working_means = [] | 3106 working_means = [] |
| 3222 for i in xrange(first_working_revision_index, len(revision_data_sorted)): | 3107 for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
| 3223 if revision_data_sorted[i][1]['value']: | 3108 if revision_data_sorted[i][1]['value']: |
| 3224 working_means.append(revision_data_sorted[i][1]['value']['values']) | 3109 working_means.append(revision_data_sorted[i][1]['value']['values']) |
| 3225 | 3110 |
| 3226 # Flatten the lists to calculate mean of all values. | 3111 # Flatten the lists to calculate mean of all values. |
| 3227 working_mean = sum(working_means, []) | 3112 working_mean = sum(working_means, []) |
| 3228 broken_mean = sum(broken_means, []) | 3113 broken_mean = sum(broken_means, []) |
| 3229 | 3114 |
| 3230 # Calculate the approximate size of the regression | 3115 # Calculate the approximate size of the regression |
| 3231 mean_of_bad_runs = CalculateMean(broken_mean) | 3116 mean_of_bad_runs = math_utils.Mean(broken_mean) |
| 3232 mean_of_good_runs = CalculateMean(working_mean) | 3117 mean_of_good_runs = math_utils.Mean(working_mean) |
| 3233 | 3118 |
| 3234 regression_size = 100 * CalculateRelativeChange(mean_of_good_runs, | 3119 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
| 3235 mean_of_bad_runs) | 3120 mean_of_bad_runs) |
| 3236 if math.isnan(regression_size): | 3121 if math.isnan(regression_size): |
| 3237 regression_size = 'zero-to-nonzero' | 3122 regression_size = 'zero-to-nonzero' |
| 3238 | 3123 |
| 3239 regression_std_err = math.fabs(CalculatePooledStandardError( | 3124 regression_std_err = math.fabs(math_utils.PooledStandardError( |
| 3240 [working_mean, broken_mean]) / | 3125 [working_mean, broken_mean]) / |
| 3241 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 3126 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
| 3242 | 3127 |
| 3243 # Give a "confidence" in the bisect. At the moment we use how distinct the | 3128 # Give a "confidence" in the bisect. At the moment we use how distinct the |
| 3244 # values are before and after the last broken revision, and how noisy the | 3129 # values are before and after the last broken revision, and how noisy the |
| 3245 # overall graph is. | 3130 # overall graph is. |
| 3246 confidence = CalculateConfidence(working_means, broken_means) | 3131 confidence = ConfidenceScore(working_means, broken_means) |
| 3247 | 3132 |
| 3248 culprit_revisions = [] | 3133 culprit_revisions = [] |
| 3249 | 3134 |
| 3250 cwd = os.getcwd() | 3135 cwd = os.getcwd() |
| 3251 self.ChangeToDepotWorkingDirectory( | 3136 self.ChangeToDepotWorkingDirectory( |
| 3252 revision_data[last_broken_revision]['depot']) | 3137 revision_data[last_broken_revision]['depot']) |
| 3253 | 3138 |
| 3254 if revision_data[last_broken_revision]['depot'] == 'cros': | 3139 if revision_data[last_broken_revision]['depot'] == 'cros': |
| 3255 # Want to get a list of all the commits and what depots they belong | 3140 # Want to get a list of all the commits and what depots they belong |
| 3256 # to so that we can grab info about each. | 3141 # to so that we can grab info about each. |
| (...skipping 507 matching lines...) |
| 3764 except RuntimeError, e: | 3649 except RuntimeError, e: |
| 3765 if opts.output_buildbot_annotations: | 3650 if opts.output_buildbot_annotations: |
| 3766 # The perf dashboard scrapes the "results" step in order to comment on | 3651 # The perf dashboard scrapes the "results" step in order to comment on |
| 3767 # bugs. If you change this, please update the perf dashboard as well. | 3652 # bugs. If you change this, please update the perf dashboard as well. |
| 3768 bisect_utils.OutputAnnotationStepStart('Results') | 3653 bisect_utils.OutputAnnotationStepStart('Results') |
| 3769 print 'Error: %s' % e.message | 3654 print 'Error: %s' % e.message |
| 3770 if opts.output_buildbot_annotations: | 3655 if opts.output_buildbot_annotations: |
| 3771 bisect_utils.OutputAnnotationStepClosed() | 3656 bisect_utils.OutputAnnotationStepClosed() |
| 3772 return 1 | 3657 return 1 |
| 3773 | 3658 |
| 3659 |
| 3774 if __name__ == '__main__': | 3660 if __name__ == '__main__': |
| 3775 sys.exit(main()) | 3661 sys.exit(main()) |