| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
| 7 | 7 |
| 8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
| 9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
| 10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
| (...skipping 14 matching lines...) |
| 25 revision were merged in. | 25 revision were merged in. |
| 26 | 26 |
| 27 | 27 |
| 28 An example usage (using git hashes): | 28 An example usage (using git hashes): |
| 29 | 29 |
| 30 ./tools/bisect-perf-regression.py -c\ | 30 ./tools/bisect-perf-regression.py -c\ |
| 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ | 31 "out/Release/performance_ui_tests --gtest_filter=ShutdownTest.SimpleUserQuit"\ |
| 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ | 32 -g 1f6e67861535121c5c819c16a666f2436c207e7b\ |
| 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ | 33 -b b732f23b4f81c382db0b23b9035f3dadc7d925bb\ |
| 34 -m shutdown/simple-user-quit | 34 -m shutdown/simple-user-quit |
| 35 | |
| 36 """ | 35 """ |
| 37 | 36 |
| 38 import copy | 37 import copy |
| 39 import datetime | 38 import datetime |
| 40 import errno | 39 import errno |
| 41 import hashlib | 40 import hashlib |
| 42 import math | 41 import math |
| 43 import optparse | 42 import optparse |
| 44 import os | 43 import os |
| 45 import re | 44 import re |
| 46 import shlex | 45 import shlex |
| 47 import shutil | 46 import shutil |
| 48 import StringIO | 47 import StringIO |
| 49 import sys | 48 import sys |
| 50 import time | 49 import time |
| 51 import zipfile | 50 import zipfile |
| 52 | 51 |
| 53 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) | 52 sys.path.append(os.path.join(os.path.dirname(__file__), 'telemetry')) |
| 54 | 53 |
| 55 from auto_bisect import bisect_utils | 54 from auto_bisect import bisect_utils |
| 55 from auto_bisect import math_utils |
| 56 from auto_bisect import post_perf_builder_job as bisect_builder | 56 from auto_bisect import post_perf_builder_job as bisect_builder |
| 57 from auto_bisect import source_control as source_control_module | 57 from auto_bisect import source_control as source_control_module |
| 58 from telemetry.util import cloud_storage | 58 from telemetry.util import cloud_storage |
| 59 | 59 |
| 60 # The additional repositories that might need to be bisected. | 60 # The additional repositories that might need to be bisected. |
| 61 # If the repository has any dependent repositories (such as skia/src, which | 61 # If the repository has any dependent repositories (such as skia/src, which |
| 62 # needs skia/include and skia/gyp to be updated), specify them in the | 62 # needs skia/include and skia/gyp to be updated), specify them in the |
| 63 # 'depends' list so that they're synced appropriately. | 63 # 'depends' list so that they're synced appropriately. |
| 64 # Format is: | 64 # Format is: |
| 65 # src: path to the working directory. | 65 # src: path to the working directory. |
| (...skipping 186 matching lines...) |
| 252 | 252 |
| 253 def _AddAdditionalDepotInfo(depot_info): | 253 def _AddAdditionalDepotInfo(depot_info): |
| 254 """Adds additional depot info to the global depot variables.""" | 254 """Adds additional depot info to the global depot variables.""" |
| 255 global DEPOT_DEPS_NAME | 255 global DEPOT_DEPS_NAME |
| 256 global DEPOT_NAMES | 256 global DEPOT_NAMES |
| 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + | 257 DEPOT_DEPS_NAME = dict(DEPOT_DEPS_NAME.items() + |
| 258 depot_info.items()) | 258 depot_info.items()) |
| 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() | 259 DEPOT_NAMES = DEPOT_DEPS_NAME.keys() |
| 260 | 260 |
| 261 | 261 |
| 262 def CalculateTruncatedMean(data_set, truncate_percent): | 262 def ConfidenceScore(good_results_lists, bad_results_lists): |
| 263 """Calculates the truncated mean of a set of values. | |
| 264 | |
| 265 Note that this isn't just the mean of the set of values with the highest | |
| 266 and lowest values discarded; the non-discarded values are also weighted | |
| 267 differently depending on how many values are discarded. | |
| 268 | |
| 269 Args: | |
| 270 data_set: Non-empty list of values. | |
| 271 truncate_percent: The % from the upper and lower portions of the data set | |
| 272 to discard, expressed as a value in [0, 1]. | |
| 273 | |
| 274 Returns: | |
| 275 The truncated mean as a float. | |
| 276 | |
| 277 Raises: | |
| 278 TypeError: The data set was empty after discarding values. | |
| 279 """ | |
| 280 if len(data_set) > 2: | |
| 281 data_set = sorted(data_set) | |
| 282 | |
| 283 discard_num_float = len(data_set) * truncate_percent | |
| 284 discard_num_int = int(math.floor(discard_num_float)) | |
| 285 kept_weight = len(data_set) - discard_num_float * 2 | |
| 286 | |
| 287 data_set = data_set[discard_num_int:len(data_set)-discard_num_int] | |
| 288 | |
| 289 weight_left = 1.0 - (discard_num_float - discard_num_int) | |
| 290 | |
| 291 if weight_left < 1: | |
| 292 # If the % to discard leaves a fractional portion, need to weight those | |
| 293 # values. | |
| 294 unweighted_vals = data_set[1:len(data_set)-1] | |
| 295 weighted_vals = [data_set[0], data_set[len(data_set)-1]] | |
| 296 weighted_vals = [w * weight_left for w in weighted_vals] | |
| 297 data_set = weighted_vals + unweighted_vals | |
| 298 else: | |
| 299 kept_weight = len(data_set) | |
| 300 | |
| 301 truncated_mean = reduce(lambda x, y: float(x) + float(y), | |
| 302 data_set) / kept_weight | |
| 303 | |
| 304 return truncated_mean | |
| 305 | |
| 306 | |
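A small worked example of the fractional weighting described in the docstring above, run against the function as written in the old code (the math_utils.TruncatedMean that this CL switches to is assumed to behave the same way, since the change is a refactor):

    # 5 values with truncate_percent=0.25: 5 * 0.25 = 1.25 values are
    # discarded from each end. One whole value goes from each end, and
    # the new end values (2 and 4) keep only 1 - 0.25 = 0.75 of their
    # weight.
    data = [1, 2, 3, 4, 5]
    # Weighted sum: 2*0.75 + 3 + 4*0.75 = 7.5; kept weight: 5 - 2*1.25 = 2.5.
    print CalculateTruncatedMean(data, 0.25)  # prints 3.0
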
| 307 def CalculateMean(values): | |
| 308 """Calculates the arithmetic mean of a list of values.""" | |
| 309 return CalculateTruncatedMean(values, 0.0) | |
| 310 | |
| 311 | |
| 312 def CalculateConfidence(good_results_lists, bad_results_lists): | |
| 313 """Calculates a confidence percentage. | 263 """Calculates a confidence percentage. |
| 314 | 264 |
| 315 This is calculated based on how distinct the "good" and "bad" values are, | 265 This is calculated based on how distinct the "good" and "bad" values are, |
| 316 and how noisy the results are. More precisely, the confidence is the quotient | 266 and how noisy the results are. More precisely, the confidence is the quotient |
| 317 of the difference between the closest values across the good and bad groups | 267 of the difference between the closest values across the good and bad groups |
| 318 and the sum of the standard deviations of the good and bad groups. | 268 and the sum of the standard deviations of the good and bad groups. |
| 319 | 269 |
| 320 TODO(qyearsley): Replace this confidence function with a function that | 270 TODO(qyearsley): Replace this confidence function with a function that |
| 321 uses a Student's t-test. The confidence would be (1 - p-value), where | 271 uses a Student's t-test. The confidence would be (1 - p-value), where |
| 322 p-value is the probability of obtaining the given set of good and bad | 272 p-value is the probability of obtaining the given set of good and bad |
| 323 values just by chance. | 273 values just by chance. |
| 324 | 274 |
| 325 Args: | 275 Args: |
| 326 good_results_lists: A list of lists of "good" result numbers. | 276 good_results_lists: A list of lists of "good" result numbers. |
| 327 bad_results_lists: A list of lists of "bad" result numbers. | 277 bad_results_lists: A list of lists of "bad" result numbers. |
| 328 | 278 |
| 329 Returns: | 279 Returns: |
| 330 A number in the range [0, 100]. | 280 A number in the range [0, 100]. |
| 331 """ | 281 """ |
| 332 # Get the distance between the two groups. | 282 # Get the distance between the two groups. |
| 333 means_good = map(CalculateMean, good_results_lists) | 283 means_good = map(math_utils.Mean, good_results_lists) |
| 334 means_bad = map(CalculateMean, bad_results_lists) | 284 means_bad = map(math_utils.Mean, bad_results_lists) |
| 335 bounds_good = (min(means_good), max(means_good)) | 285 bounds_good = (min(means_good), max(means_good)) |
| 336 bounds_bad = (min(means_bad), max(means_bad)) | 286 bounds_bad = (min(means_bad), max(means_bad)) |
| 337 dist_between_groups = min( | 287 dist_between_groups = min( |
| 338 math.fabs(bounds_bad[1] - bounds_good[0]), | 288 math.fabs(bounds_bad[1] - bounds_good[0]), |
| 339 math.fabs(bounds_bad[0] - bounds_good[1])) | 289 math.fabs(bounds_bad[0] - bounds_good[1])) |
| 340 | 290 |
| 341 # Get the sum of the standard deviations of the two groups. | 291 # Get the sum of the standard deviations of the two groups. |
| 342 good_results_flattened = sum(good_results_lists, []) | 292 good_results_flattened = sum(good_results_lists, []) |
| 343 bad_results_flattened = sum(bad_results_lists, []) | 293 bad_results_flattened = sum(bad_results_lists, []) |
| 344 stddev_good = CalculateStandardDeviation(good_results_flattened) | 294 stddev_good = math_utils.StandardDeviation(good_results_flattened) |
| 345 stddev_bad = CalculateStandardDeviation(bad_results_flattened) | 295 stddev_bad = math_utils.StandardDeviation(bad_results_flattened) |
| 346 stddev_sum = stddev_good + stddev_bad | 296 stddev_sum = stddev_good + stddev_bad |
| 347 | 297 |
| 348 confidence = dist_between_groups / (max(0.0001, stddev_sum)) | 298 confidence = dist_between_groups / (max(0.0001, stddev_sum)) |
| 349 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) | 299 confidence = int(min(1.0, max(confidence, 0.0)) * 100.0) |
| 350 return confidence | 300 return confidence |
| 351 | 301 |
| 352 | 302 |
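As a sanity check of the quotient described in the docstring, here is an illustrative run against the implementation above (made-up numbers):

    good = [[10.0, 10.1], [10.2, 10.0]]  # per-list means: 10.05, 10.1
    bad = [[12.0, 12.1], [11.9, 12.0]]   # per-list means: 12.05, 11.95
    # Distance between the closest group bounds:
    #   min(|12.05 - 10.05|, |11.95 - 10.1|) = 1.85
    # Sum of the flattened standard deviations: ~0.096 + ~0.082 = ~0.178,
    # so the quotient is clamped to 1.0 and reported as 100.
    print ConfidenceScore(good, bad)  # prints 100
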
| 353 def CalculateStandardDeviation(values): | |
| 354 """Calculates the sample standard deviation of the given list of values.""" | |
| 355 if len(values) == 1: | |
| 356 return 0.0 | |
| 357 | |
| 358 mean = CalculateMean(values) | |
| 359 differences_from_mean = [float(x) - mean for x in values] | |
| 360 squared_differences = [float(x * x) for x in differences_from_mean] | |
| 361 variance = sum(squared_differences) / (len(values) - 1) | |
| 362 std_dev = math.sqrt(variance) | |
| 363 | |
| 364 return std_dev | |
| 365 | |
| 366 | |
| 367 def CalculateRelativeChange(before, after): | |
| 368 """Returns the relative change of before and after, relative to before. | |
| 369 | |
| 370 There are several different ways to define relative difference between | |
| 371 two numbers; sometimes it is defined as relative to the smaller number, | |
| 372 or to the mean of the two numbers. This version returns the difference | |
| 373 relative to the first of the two numbers. | |
| 374 | |
| 375 Args: | |
| 376 before: A number representing an earlier value. | |
| 377 after: Another number, representing a later value. | |
| 378 | |
| 379 Returns: | |
| 380 A non-negative floating point number; 0.1 represents a 10% change. | |
| 381 """ | |
| 382 if before == after: | |
| 383 return 0.0 | |
| 384 if before == 0: | |
| 385 return float('nan') | |
| 386 difference = after - before | |
| 387 return math.fabs(difference / before) | |
| 388 | |
| 389 | |
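For example (illustrative values; the floats matter here, because with int arguments the division above truncates under Python 2):

    print CalculateRelativeChange(200.0, 150.0)  # 0.25, i.e. a 25% change
    print CalculateRelativeChange(150.0, 200.0)  # ~0.333, relative to 'before'
    print CalculateRelativeChange(0.0, 5.0)      # nan: no baseline to compare
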
| 390 def CalculatePooledStandardError(work_sets): | |
| 391 numerator = 0.0 | |
| 392 denominator1 = 0.0 | |
| 393 denominator2 = 0.0 | |
| 394 | |
| 395 for current_set in work_sets: | |
| 396 std_dev = CalculateStandardDeviation(current_set) | |
| 397 numerator += (len(current_set) - 1) * std_dev ** 2 | |
| 398 denominator1 += len(current_set) - 1 | |
| 399 denominator2 += 1.0 / len(current_set) | |
| 400 | |
| 401 if denominator1: | |
| 402 return math.sqrt(numerator / denominator1) * math.sqrt(denominator2) | |
| 403 return 0.0 | |
| 404 | |
| 405 | |
| 406 def CalculateStandardError(values): | |
| 407 """Calculates the standard error of a list of values.""" | |
| 408 if len(values) <= 1: | |
| 409 return 0.0 | |
| 410 | |
| 411 std_dev = CalculateStandardDeviation(values) | |
| 412 | |
| 413 return std_dev / math.sqrt(len(values)) | |
| 414 | |
| 415 | |
| 416 | |
| 417 | |
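The standard error above is the sample standard deviation divided by sqrt(n); the pooled variant first combines the per-set variances weighted by their degrees of freedom, then does the same. An illustration with made-up numbers:

    values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
    # Sample standard deviation is ~2.138, so the standard error is
    # ~2.138 / sqrt(8) = ~0.756.
    print CalculateStandardError(values)
    # Pooled across two sets of three values each: ~1.33.
    print CalculatePooledStandardError([[2.0, 4.0, 4.0], [5.0, 7.0, 9.0]])
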
| 418 def GetSHA1HexDigest(contents): | 303 def GetSHA1HexDigest(contents): |
| 419 """Returns secured hash containing hexadecimal for the given contents.""" | 304 """Returns secured hash containing hexadecimal for the given contents.""" |
| 420 return hashlib.sha1(contents).hexdigest() | 305 return hashlib.sha1(contents).hexdigest() |
| 421 | 306 |
| 422 | 307 |
| 423 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): | 308 def GetZipFileName(build_revision=None, target_arch='ia32', patch_sha=None): |
| 424 """Gets the archive file name for the given revision.""" | 309 """Gets the archive file name for the given revision.""" |
| 425 def PlatformName(): | 310 def PlatformName(): |
| 426 """Return a string to be used in paths for the platform.""" | 311 """Return a string to be used in paths for the platform.""" |
| 427 if bisect_utils.IsWindowsHost(): | 312 if bisect_utils.IsWindowsHost(): |
| (...skipping 1546 matching lines...) |
| 1974 'std_err': 0.0, | 1859 'std_err': 0.0, |
| 1975 'std_dev': 0.0, | 1860 'std_dev': 0.0, |
| 1976 'values': metric_values, | 1861 'values': metric_values, |
| 1977 } | 1862 } |
| 1978 | 1863 |
| 1979 print 'Results of performance test: Command returned with %d' % ( | 1864 print 'Results of performance test: Command returned with %d' % ( |
| 1980 overall_return_code) | 1865 overall_return_code) |
| 1981 print | 1866 print |
| 1982 else: | 1867 else: |
| 1983 # Need to get the average value if there were multiple values. | 1868 # Need to get the average value if there were multiple values. |
| 1984 truncated_mean = CalculateTruncatedMean(metric_values, | 1869 truncated_mean = math_utils.TruncatedMean( |
| 1985 self.opts.truncate_percent) | 1870 metric_values, self.opts.truncate_percent) |
| 1986 standard_err = CalculateStandardError(metric_values) | 1871 standard_err = math_utils.StandardError(metric_values) |
| 1987 standard_dev = CalculateStandardDeviation(metric_values) | 1872 standard_dev = math_utils.StandardDeviation(metric_values) |
| 1988 | 1873 |
| 1989 if self._IsBisectModeStandardDeviation(): | 1874 if self._IsBisectModeStandardDeviation(): |
| 1990 metric_values = [standard_dev] | 1875 metric_values = [standard_dev] |
| 1991 | 1876 |
| 1992 values = { | 1877 values = { |
| 1993 'mean': truncated_mean, | 1878 'mean': truncated_mean, |
| 1994 'std_err': standard_err, | 1879 'std_err': standard_err, |
| 1995 'std_dev': standard_dev, | 1880 'std_dev': standard_dev, |
| 1996 'values': metric_values, | 1881 'values': metric_values, |
| 1997 } | 1882 } |
| (...skipping 1169 matching lines...) |
| 3167 | 3052 |
| 3168 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): | 3053 def _FindOtherRegressions(self, revision_data_sorted, bad_greater_than_good): |
| 3169 other_regressions = [] | 3054 other_regressions = [] |
| 3170 previous_values = [] | 3055 previous_values = [] |
| 3171 previous_id = None | 3056 previous_id = None |
| 3172 for current_id, current_data in revision_data_sorted: | 3057 for current_id, current_data in revision_data_sorted: |
| 3173 current_values = current_data['value'] | 3058 current_values = current_data['value'] |
| 3174 if current_values: | 3059 if current_values: |
| 3175 current_values = current_values['values'] | 3060 current_values = current_values['values'] |
| 3176 if previous_values: | 3061 if previous_values: |
| 3177 confidence = CalculateConfidence(previous_values, [current_values]) | 3062 confidence = ConfidenceScore(previous_values, [current_values]) |
| 3178 mean_of_prev_runs = CalculateMean(sum(previous_values, [])) | 3063 mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
| 3179 mean_of_current_runs = CalculateMean(current_values) | 3064 mean_of_current_runs = math_utils.Mean(current_values) |
| 3180 | 3065 |
| 3181 # Check that the potential regression is in the same direction as | 3066 # Check that the potential regression is in the same direction as |
| 3182 # the overall regression. If the mean of the previous runs < the | 3067 # the overall regression. If the mean of the previous runs < the |
| 3183 # mean of the current runs, this local regression is in the same | 3068 # mean of the current runs, this local regression is in the same |
| 3184 # direction. | 3069 # direction. |
| 3185 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs | 3070 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
| 3186 is_same_direction = (prev_less_than_current if | 3071 is_same_direction = (prev_less_than_current if |
| 3187 bad_greater_than_good else not prev_less_than_current) | 3072 bad_greater_than_good else not prev_less_than_current) |
| 3188 | 3073 |
| 3189 # Only report potential regressions with high confidence. | 3074 # Only report potential regressions with high confidence. |
| (...skipping 31 matching lines...) |
| 3221 working_means = [] | 3106 working_means = [] |
| 3222 for i in xrange(first_working_revision_index, len(revision_data_sorted)): | 3107 for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
| 3223 if revision_data_sorted[i][1]['value']: | 3108 if revision_data_sorted[i][1]['value']: |
| 3224 working_means.append(revision_data_sorted[i][1]['value']['values']) | 3109 working_means.append(revision_data_sorted[i][1]['value']['values']) |
| 3225 | 3110 |
| 3226 # Flatten the lists to calculate mean of all values. | 3111 # Flatten the lists to calculate mean of all values. |
| 3227 working_mean = sum(working_means, []) | 3112 working_mean = sum(working_means, []) |
| 3228 broken_mean = sum(broken_means, []) | 3113 broken_mean = sum(broken_means, []) |
| 3229 | 3114 |
| 3230 # Calculate the approximate size of the regression | 3115 # Calculate the approximate size of the regression |
| 3231 mean_of_bad_runs = CalculateMean(broken_mean) | 3116 mean_of_bad_runs = math_utils.Mean(broken_mean) |
| 3232 mean_of_good_runs = CalculateMean(working_mean) | 3117 mean_of_good_runs = math_utils.Mean(working_mean) |
| 3233 | 3118 |
| 3234 regression_size = 100 * CalculateRelativeChange(mean_of_good_runs, | 3119 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
| 3235 mean_of_bad_runs) | 3120 mean_of_bad_runs) |
| 3236 if math.isnan(regression_size): | 3121 if math.isnan(regression_size): |
| 3237 regression_size = 'zero-to-nonzero' | 3122 regression_size = 'zero-to-nonzero' |
| 3238 | 3123 |
| 3239 regression_std_err = math.fabs(CalculatePooledStandardError( | 3124 regression_std_err = math.fabs(math_utils.PooledStandardError( |
| 3240 [working_mean, broken_mean]) / | 3125 [working_mean, broken_mean]) / |
| 3241 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 | 3126 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
| 3242 | 3127 |
| 3243 # Give a "confidence" in the bisect. At the moment we use how distinct the | 3128 # Give a "confidence" in the bisect. At the moment we use how distinct the |
| 3244 # values are before and after the last broken revision, and how noisy the | 3129 # values are before and after the last broken revision, and how noisy the |
| 3245 # overall graph is. | 3130 # overall graph is. |
| 3246 confidence = CalculateConfidence(working_means, broken_means) | 3131 confidence = ConfidenceScore(working_means, broken_means) |
| 3247 | 3132 |
| 3248 culprit_revisions = [] | 3133 culprit_revisions = [] |
| 3249 | 3134 |
| 3250 cwd = os.getcwd() | 3135 cwd = os.getcwd() |
| 3251 self.ChangeToDepotWorkingDirectory( | 3136 self.ChangeToDepotWorkingDirectory( |
| 3252 revision_data[last_broken_revision]['depot']) | 3137 revision_data[last_broken_revision]['depot']) |
| 3253 | 3138 |
| 3254 if revision_data[last_broken_revision]['depot'] == 'cros': | 3139 if revision_data[last_broken_revision]['depot'] == 'cros': |
| 3255 # Want to get a list of all the commits and what depots they belong | 3140 # Want to get a list of all the commits and what depots they belong |
| 3256 # to so that we can grab info about each. | 3141 # to so that we can grab info about each. |
| (...skipping 507 matching lines...) |
| 3764 except RuntimeError, e: | 3649 except RuntimeError, e: |
| 3765 if opts.output_buildbot_annotations: | 3650 if opts.output_buildbot_annotations: |
| 3766 # The perf dashboard scrapes the "results" step in order to comment on | 3651 # The perf dashboard scrapes the "results" step in order to comment on |
| 3767 # bugs. If you change this, please update the perf dashboard as well. | 3652 # bugs. If you change this, please update the perf dashboard as well. |
| 3768 bisect_utils.OutputAnnotationStepStart('Results') | 3653 bisect_utils.OutputAnnotationStepStart('Results') |
| 3769 print 'Error: %s' % e.message | 3654 print 'Error: %s' % e.message |
| 3770 if opts.output_buildbot_annotations: | 3655 if opts.output_buildbot_annotations: |
| 3771 bisect_utils.OutputAnnotationStepClosed() | 3656 bisect_utils.OutputAnnotationStepClosed() |
| 3772 return 1 | 3657 return 1 |
| 3773 | 3658 |
| 3659 |
| 3774 if __name__ == '__main__': | 3660 if __name__ == '__main__': |
| 3775 sys.exit(main()) | 3661 sys.exit(main()) |