Chromium Code Reviews

Side by Side Diff: tools/bisect-perf-regression.py

Issue 417013003: Move statistical functions in bisect script to their own module. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Respond to nit Created 6 years, 4 months ago
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Performance Test Bisect Tool 6 """Performance Test Bisect Tool
7 7
8 This script bisects a series of changelists using binary search. It starts at 8 This script bisects a series of changelists using binary search. It starts at
9 a bad revision where a performance metric has regressed, and asks for a last 9 a bad revision where a performance metric has regressed, and asks for a last
10 known-good revision. It will then binary search across this revision range by 10 known-good revision. It will then binary search across this revision range by
(...skipping 14 matching lines...)
25 revision were merged in. 25 revision were merged in.
26 26
27 27
28 An example usage (using git hashes): 28 An example usage (using git hashes):
29 29
30 ./tools/bisect-perf-regression.py -c\ 30 ./tools/bisect-perf-regression.py -c\
31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\
32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\
33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\
34 -m shutdown/simple-user-quit 34 -m shutdown/simple-user-quit
35
36 """ 35 """
37 36
38 import copy 37 import copy
39 import datetime 38 import datetime
40 import errno 39 import errno
41 import hashlib 40 import hashlib
42 import math 41 import math
43 import optparse 42 import optparse
44 import os 43 import os
45 import re 44 import re
46 import shlex 45 import shlex
47 import shutil 46 import shutil
48 import StringIO 47 import StringIO
49 import sys 48 import sys
50 import time 49 import time
51 import zipfile 50 import zipfile
52 51
53 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) 52 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry'))
54 53
55 from auto_bisect import bisect_utils 54 from auto_bisect import bisect_utils
55 from auto_bisect import math_utils
56 from auto_bisect import post_perf_builder_job as bisect_builder 56 from auto_bisect import post_perf_builder_job as bisect_builder
57 from auto_bisect import source_control as source_control_module 57 from auto_bisect import source_control as source_control_module
58 from telemetry.util import cloud_storage 58 from telemetry.util import cloud_storage
59 59
60 # The additional repositories that might need to be bisected. 60 # The additional repositories that might need to be bisected.
61 # If the repository has any dependent repositories (such as skia/src needs 61 # If the repository has any dependent repositories (such as skia/src needs
62 # skia/include and skia/gyp to be updated), specify them in the 'depends' 62 # skia/include and skia/gyp to be updated), specify them in the 'depends'
63 # so that they're synced appropriately. 63 # so that they're synced appropriately.
64 # Format is: 64 # Format is:
65 # src: path to the working directory. 65 # src: path to the working directory.
(...skipping 186 matching lines...)
252 252
253 def _AddAdditionalDepotInfo(depot_info): 253 def _AddAdditionalDepotInfo(depot_info):
254 """Adds additional depot info to the global depot variables.""" 254 """Adds additional depot info to the global depot variables."""
255 global DEPOT_DEPS_NAME 255 global DEPOT_DEPS_NAME
256 global DEPOT_NAMES 256 global DEPOT_NAMES
257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() +
258 depot_info.items()) 258 depot_info.items())
259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys()
260 260
261 261
262 def CalculateTruncatedMean(data_set, truncate_percent): 262 def ConfidenceScore(good_results_lists, bad_results_lists):
263 """Calculates the truncated mean of a set of values.
264
265 Note that this isn't just the mean of the set of values with the highest
266 and lowest values discarded; the non-discarded values are also weighted
267 differently depending how many values are discarded.
268
269 Args:
270 data_set: Non-empty list of values.
271 truncate_percent: The % from the upper and lower portions of the data set
272 to discard, expressed as a value in [0, 1].
273
274 Returns:
275 The truncated mean as a float.
276
277 Raises:
278 TypeError: The data set was empty after discarding values.
279 """
280 if len(data_set) > 2:
281 data_set = sorted(data_set)
282
283 discard_num_float = len(data_set) * truncate_percent
284 discard_num_int = int(math.floor(discard_num_float))
285 kept_weight = len(data_set) - discard_num_float * 2
286
287 data_set = data_set[discard_num_int:len(data_set)-discard_num_int]
288
289 weight_left = 1.0 - (discard_num_float - discard_num_int)
290
291 if weight_left < 1:
292 # If the % to discard leaves a fractional portion, need to weight those
293 # values.
294 unweighted_vals = data_set[1:len(data_set)-1]
295 weighted_vals = [data_set[0], data_set[len(data_set)-1]]
296 weighted_vals = [w * weight_left for w in weighted_vals]
297 data_set = weighted_vals + unweighted_vals
298 else:
299 kept_weight = len(data_set)
300
301 truncated_mean = reduce(lambda x, y: float(x) + float(y),
302 data_set) / kept_weight
303
304 return truncated_mean
305
306
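For reference, a worked example of the fractional weighting described in the docstring above (illustrative only; after this change the same behavior lives in math_utils.TruncatedMean):

  # Illustrative sketch of the truncation and weighting above.
  values = [1, 2, 3, 4, 5, 100]
  # With truncate_percent=0.25, 1.5 values are trimmed from each end:
  # 1 and 100 are dropped outright, while 2 and 5 are kept at half weight.
  # (0.5*2 + 3 + 4 + 0.5*5) / 3.0 == 3.5
  print CalculateTruncatedMean(values, 0.25)  # 3.5
  print CalculateTruncatedMean(values, 0.0)   # plain mean, ~19.17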
307 def CalculateMean(values):
308 """Calculates the arithmetic mean of a list of values."""
309 return CalculateTruncatedMean(values, 0.0)
310
311
312 def CalculateConfidence(good_results_lists, bad_results_lists):
313 """Calculates a confidence percentage. 263 """Calculates a confidence percentage.
314 264
315 This is calculated based on how distinct the "good" and "bad" values are, 265 This is calculated based on how distinct the "good" and "bad" values are,
316 and how noisy the results are. More precisely, the confidence is the quotient 266 and how noisy the results are. More precisely, the confidence is the quotient
317 of the difference between the closest mean values across the good and bad groups 267 of the difference between the closest mean values across the good and bad groups
318 and the sum of the standard deviations of the good and bad groups. 268 and the sum of the standard deviations of the good and bad groups.
319 269
320 TODO(qyearsley): Replace this confidence function with a function that 270 TODO(qyearsley): Replace this confidence function with a function that
321 uses a Student's t-test. The confidence would be (1 - p-value), where 271 uses a Student's t-test. The confidence would be (1 - p-value), where
322 p-value is the probability of obtaining the given a set of good and bad 272 p-value is the probability of obtaining the given a set of good and bad
323 values just by chance. 273 values just by chance.
324 274
325 Args: 275 Args:
326 good_results_lists: A list of lists of "good" result numbers. 276 good_results_lists: A list of lists of "good" result numbers.
327 bad_results_lists: A list of lists of "bad" result numbers. 277 bad_results_lists: A list of lists of "bad" result numbers.
328 278
329 Returns: 279 Returns:
330 A number in the range [0, 100]. 280 A number in the range [0, 100].
331 """ 281 """
332 # Get the distance between the two groups. 282 # Get the distance between the two groups.
333 means_good = map(CalculateMean, good_results_lists) 283 means_good = map(math_utils.Mean, good_results_lists)
334 means_bad = map(CalculateMean, bad_results_lists) 284 means_bad = map(math_utils.Mean, bad_results_lists)
335 bounds_good = (min(means_good), max(means_good)) 285 bounds_good = (min(means_good), max(means_good))
336 bounds_bad = (min(means_bad), max(means_bad)) 286 bounds_bad = (min(means_bad), max(means_bad))
337 dist_between_groups = min( 287 dist_between_groups = min(
338 math.fabs(bounds_bad[1] - bounds_good[0]), 288 math.fabs(bounds_bad[1] - bounds_good[0]),
339 math.fabs(bounds_bad[0] - bounds_good[1])) 289 math.fabs(bounds_bad[0] - bounds_good[1]))
340 290
341 # Get the sum of the standard deviations of the two groups. 291 # Get the sum of the standard deviations of the two groups.
342 good_results_flattened = sum(good_results_lists, []) 292 good_results_flattened = sum(good_results_lists, [])
343 bad_results_flattened = sum(bad_results_lists, []) 293 bad_results_flattened = sum(bad_results_lists, [])
344 stddev_good = CalculateStandardDeviation(good_results_flattened) 294 stddev_good = math_utils.StandardDeviation(good_results_flattened)
345 stddev_bad = CalculateStandardDeviation(bad_results_flattened) 295 stddev_bad = math_utils.StandardDeviation(bad_results_flattened)
346 stddev_sum = stddev_good + stddev_bad 296 stddev_sum = stddev_good + stddev_bad
347 297
348 confidence = dist_between_groups / (max(0.0001, stddev_sum)) 298 confidence = dist_between_groups / (max(0.0001, stddev_sum))
349 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) 299 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
350 return confidence 300 return confidence
351 301
352 302
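A small, hypothetical example of the quotient described in the docstring (not part of this change; numbers are made up):

  # Hypothetical numbers showing how the quotient maps to [0, 100].
  good = [[1, 2, 3, 4, 5]]  # one list of "good" runs: mean 3, stddev ~1.58
  bad = [[3, 4, 5, 6, 7]]   # one list of "bad" runs:  mean 5, stddev ~1.58
  # dist_between_groups = |5 - 3| = 2, stddev_sum ~= 3.16,
  # so confidence = int(min(1.0, 2 / 3.16) * 100) = 63.
  print ConfidenceScore(good, bad)  # 63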
353 def CalculateStandardDeviation(values):
354 """Calculates the sample standard deviation of the given list of values."""
355 if len(values) == 1:
356 return 0.0
357
358 mean = CalculateMean(values)
359 differences_from_mean = [float(x) - mean for x in values]
360 squared_differences = [float(x * x) for x in differences_from_mean]
361 variance = sum(squared_differences) / (len(values) - 1)
362 std_dev = math.sqrt(variance)
363
364 return std_dev
365
366
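Note this is the sample (n - 1) standard deviation; a quick illustration (not part of this change):

  # values = [2, 4, 6, 8]: mean = 5, squared differences = 9, 1, 1, 9,
  # variance = 20 / 3 ~= 6.67, so std_dev = sqrt(6.67) ~= 2.58.
  print CalculateStandardDeviation([2, 4, 6, 8])  # ~2.58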
367 def CalculateRelativeChange(before, after):
368 """Returns the relative change of before and after, relative to before.
369
370 There are several different ways to define relative difference between
371 two numbers; sometimes it is defined as relative to the smaller number,
372 or to the mean of the two numbers. This version returns the difference
373 relative to the first of the two numbers.
374
375 Args:
376 before: A number representing an earlier value.
377 after: Another number, representing a later value.
378
379 Returns:
380 A non-negative floating point number; 0.1 represents a 10% change.
381 """
382 if before == after:
383 return 0.0
384 if before == 0:
385 return float('nan')
386 difference = after - before
387 return math.fabs(difference / before)
388
389
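A short illustration of the asymmetry the docstring mentions (not part of this change):

  # The change is measured relative to the first argument.
  print CalculateRelativeChange(50.0, 60.0)  # 0.2, i.e. a 20% change
  print CalculateRelativeChange(60.0, 50.0)  # ~0.167
  print CalculateRelativeChange(0.0, 5.0)    # nan; undefined relative to zero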
390 def CalculatePooledStandardError(work_sets):
391 numerator = 0.0
392 denominator1 = 0.0
393 denominator2 = 0.0
394
395 for current_set in work_sets:
396 std_dev = CalculateStandardDeviation(current_set)
397 numerator += (len(current_set) - 1) * std_dev ** 2
398 denominator1 += len(current_set) - 1
399 denominator2 += 1.0 / len(current_set)
400
401 if denominator1:
402 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2)
403 return 0.0
404
405
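Since this function has no docstring, a sketch of what the loop computes (illustrative only):

  # Pooled standard error of k samples, as computed above:
  #   sqrt(sum((n_i - 1) * s_i**2) / sum(n_i - 1)) * sqrt(sum(1.0 / n_i))
  # Example: [[1, 2, 3], [2, 4, 6]] gives s_1 = 1, s_2 = 2, n_1 = n_2 = 3:
  #   sqrt((2*1 + 2*4) / 4.0) * sqrt(2 / 3.0) ~= 1.58 * 0.82 ~= 1.29
  print CalculatePooledStandardError([[1, 2, 3], [2, 4, 6]])  # ~1.29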
406 def CalculateStandardError(values):
407 """Calculates the standard error of a list of values."""
408 if len(values) <= 1:
409 return 0.0
410
411 std_dev = CalculateStandardDeviation(values)
412
413 return std_dev / math.sqrt(len(values))
414
415
416
417
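And the corresponding illustration for the standard error (not part of this change):

  # Standard error = sample standard deviation / sqrt(n).
  # values = [2, 4, 6, 8]: std_dev ~= 2.58, n = 4, so std_err ~= 1.29.
  print CalculateStandardError([2, 4, 6, 8])  # ~1.29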
418 def GetSHA1HexDigest(contents): 303 def GetSHA1HexDigest(contents):
419 """Returns the SHA-1 hex digest of the given contents.""" 304 """Returns the SHA-1 hex digest of the given contents."""
420 return hashlib.sha1(contents).hexdigest() 305 return hashlib.sha1(contents).hexdigest()
421 306
422 307
423 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): 308 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None):
424 """Gets the archive file name for the given revision.""" 309 """Gets the archive file name for the given revision."""
425 def PlatformName(): 310 def PlatformName():
426 """Return a string to be used in paths for the platform.""" 311 """Return a string to be used in paths for the platform."""
427 if bisect_utils.IsWindowsHost(): 312 if bisect_utils.IsWindowsHost():
(...skipping 1546 matching lines...)
1974 'std_err': 0.0, 1859 'std_err': 0.0,
1975 'std_dev': 0.0, 1860 'std_dev': 0.0,
1976 'values': metric_values, 1861 'values': metric_values,
1977 } 1862 }
1978 1863
1979 print 'Results of performance test: Command returned with %d' % ( 1864 print 'Results of performance test: Command returned with %d' % (
1980 overall_return_code) 1865 overall_return_code)
1981 print 1866 print
1982 else: 1867 else:
1983 # Need to get the average value if there were multiple values. 1868 # Need to get the average value if there were multiple values.
1984 truncated_mean = CalculateTruncatedMean(metric_values, 1869 truncated_mean = math_utils.TruncatedMean(
1985 self.opts.truncate_percent) 1870 metric_values, self.opts.truncate_percent)
1986 standard_err = CalculateStandardError(metric_values) 1871 standard_err = math_utils.StandardError(metric_values)
1987 standard_dev = CalculateStandardDeviation(metric_values) 1872 standard_dev = math_utils.StandardDeviation(metric_values)
1988 1873
1989 if self._IsBisectModeStandardDeviation(): 1874 if self._IsBisectModeStandardDeviation():
1990 metric_values = [standard_dev] 1875 metric_values = [standard_dev]
1991 1876
1992 values = { 1877 values = {
1993 'mean': truncated_mean, 1878 'mean': truncated_mean,
1994 'std_err': standard_err, 1879 'std_err': standard_err,
1995 'std_dev': standard_dev, 1880 'std_dev': standard_dev,
1996 'values': metric_values, 1881 'values': metric_values,
1997 } 1882 }
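As a hypothetical illustration of the summary built here (assuming a truncate_percent of 0.25; numbers are made up):

  # metric_values = [7.0, 7.5, 8.0] would be summarized roughly as:
  # values = {
  #     'mean': 7.5,      # math_utils.TruncatedMean(metric_values, 0.25)
  #     'std_err': 0.29,  # math_utils.StandardError(metric_values)
  #     'std_dev': 0.5,   # math_utils.StandardDeviation(metric_values)
  #     'values': [7.0, 7.5, 8.0],
  # }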
(...skipping 1169 matching lines...)
3167 3052
3168 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): 3053 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good):
3169 other_regressions = [] 3054 other_regressions = []
3170 previous_values = [] 3055 previous_values = []
3171 previous_id = None 3056 previous_id = None
3172 for current_id, current_data in revision_data_sorted: 3057 for current_id, current_data in revision_data_sorted:
3173 current_values = current_data['value'] 3058 current_values = current_data['value']
3174 if current_values: 3059 if current_values:
3175 current_values = current_values['values'] 3060 current_values = current_values['values']
3176 if previous_values: 3061 if previous_values:
3177 confidence = CalculateConfidence(previous_values, [current_values]) 3062 confidence = ConfidenceScore(previous_values, [current_values])
3178 mean_of_prev_runs = CalculateMean(sum(previous_values, [])) 3063 mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
3179 mean_of_current_runs = CalculateMean(current_values) 3064 mean_of_current_runs = math_utils.Mean(current_values)
3180 3065
3181 # Check that the potential regression is in the same direction as 3066 # Check that the potential regression is in the same direction as
3182 # the overall regression. If the mean of the previous runs < the 3067 # the overall regression. If the mean of the previous runs < the
3183 # mean of the current runs, this local regression is in same 3068 # mean of the current runs, this local regression is in same
3184 # direction. 3069 # direction.
3185 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs 3070 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
3186 is_same_direction = (prev_less_than_current if 3071 is_same_direction = (prev_less_than_current if
3187 bad_greater_than_good else not prev_less_than_current) 3072 bad_greater_than_good else not prev_less_than_current)
3188 3073
3189 # Only report potential regressions with high confidence. 3074 # Only report potential regressions with high confidence.
(...skipping 31 matching lines...)
3221 working_means = [] 3106 working_means = []
3222 for i in xrange(first_working_revision_index, len(revision_data_sorted)): 3107 for i in xrange(first_working_revision_index, len(revision_data_sorted)):
3223 if revision_data_sorted[i][1]['value']: 3108 if revision_data_sorted[i][1]['value']:
3224 working_means.append(revision_data_sorted[i][1]['value']['values']) 3109 working_means.append(revision_data_sorted[i][1]['value']['values'])
3225 3110
3226 # Flatten the lists to calculate mean of all values. 3111 # Flatten the lists to calculate mean of all values.
3227 working_mean = sum(working_means, []) 3112 working_mean = sum(working_means, [])
3228 broken_mean = sum(broken_means, []) 3113 broken_mean = sum(broken_means, [])
3229 3114
3230 # Calculate the approximate size of the regression 3115 # Calculate the approximate size of the regression
3231 mean_of_bad_runs = CalculateMean(broken_mean) 3116 mean_of_bad_runs = math_utils.Mean(broken_mean)
3232 mean_of_good_runs = CalculateMean(working_mean) 3117 mean_of_good_runs = math_utils.Mean(working_mean)
3233 3118
3234 regression_size = 100 * CalculateRelativeChange(mean_of_good_runs, 3119 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
3235 mean_of_bad_runs) 3120 mean_of_bad_runs)
3236 if math.isnan(regression_size): 3121 if math.isnan(regression_size):
3237 regression_size = 'zero-to-nonzero' 3122 regression_size = 'zero-to-nonzero'
3238 3123
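In other words, regression_size is the percent change relative to the mean of the good runs; a hypothetical example:

  # Hypothetical numbers: good runs averaging 200 ms, bad runs averaging 230 ms.
  print 100 * math_utils.RelativeChange(200.0, 230.0)  # 15.0, i.e. ~15% slower
  # A good mean of zero yields NaN, which the check above reports as
  # 'zero-to-nonzero'.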
3239 regression_std_err = math.fabs(CalculatePooledStandardError( 3124 regression_std_err = math.fabs(math_utils.PooledStandardError(
3240 [working_mean, broken_mean]) / 3125 [working_mean, broken_mean]) /
3241 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 3126 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
3242 3127
3243 # Give a "confidence" in the bisect. At the moment we use how distinct the 3128 # Give a "confidence" in the bisect. At the moment we use how distinct the
3244 # values are before and after the last broken revision, and how noisy the 3129 # values are before and after the last broken revision, and how noisy the
3245 # overall graph is. 3130 # overall graph is.
3246 confidence = CalculateConfidence(working_means, broken_means) 3131 confidence = ConfidenceScore(working_means, broken_means)
3247 3132
3248 culprit_revisions = [] 3133 culprit_revisions = []
3249 3134
3250 cwd = os.getcwd() 3135 cwd = os.getcwd()
3251 self.ChangeToDepotWorkingDirectory( 3136 self.ChangeToDepotWorkingDirectory(
3252 revision_data[last_broken_revision]['depot']) 3137 revision_data[last_broken_revision]['depot'])
3253 3138
3254 if revision_data[last_broken_revision]['depot'] == 'cros': 3139 if revision_data[last_broken_revision]['depot'] == 'cros':
3255 # Want to get a list of all the commits and what depots they belong 3140 # Want to get a list of all the commits and what depots they belong
3256 # to so that we can grab info about each. 3141 # to so that we can grab info about each.
(...skipping 507 matching lines...)
3764 except RuntimeError, e: 3649 except RuntimeError, e:
3765 if opts.output_buildbot_annotations: 3650 if opts.output_buildbot_annotations:
3766 # The perf dashboard scrapes the "results" step in order to comment on 3651 # The perf dashboard scrapes the "results" step in order to comment on
3767 # bugs. If you change this, please update the perf dashboard as well. 3652 # bugs. If you change this, please update the perf dashboard as well.
3768 bisect_utils.OutputAnnotationStepStart('Results') 3653 bisect_utils.OutputAnnotationStepStart('Results')
3769 print 'Error: %s' % e.message 3654 print 'Error: %s' % e.message
3770 if opts.output_buildbot_annotations: 3655 if opts.output_buildbot_annotations:
3771 bisect_utils.OutputAnnotationStepClosed() 3656 bisect_utils.OutputAnnotationStepClosed()
3772 return 1 3657 return 1
3773 3658
3659
3774 if __name__ == '__main__': 3660 if __name__ == '__main__':
3775 sys.exit(main()) 3661 sys.exit(main())