| Index: tools/bisect-perf-regression.py
|
| diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
|
| index 8368e63e5a9d354556670c09f68db6214522d21b..6fd0289b86ee903d7694f27e9b414cd56cbf0cfa 100755
|
| --- a/tools/bisect-perf-regression.py
|
| +++ b/tools/bisect-perf-regression.py
|
| @@ -55,6 +55,7 @@ from auto_bisect import bisect_utils
|
| from auto_bisect import math_utils
|
| from auto_bisect import post_perf_builder_job as bisect_builder
|
| from auto_bisect import source_control as source_control_module
|
| +from auto_bisect import ttest
|
| from telemetry.util import cloud_storage
|
|
|
| # The additional repositories that might need to be bisected.
|
| @@ -260,44 +261,31 @@ def _AddAdditionalDepotInfo(depot_info):
|
|
|
|
|
| def ConfidenceScore(good_results_lists, bad_results_lists):
|
| - """Calculates a confidence percentage.
|
| + """Calculates a confidence score.
|
|
|
| - This is calculated based on how distinct the "good" and "bad" values are,
|
| - and how noisy the results are. More precisely, the confidence is the quotient
|
| - of the difference between the closest values across the good and bad groups
|
| - and the sum of the standard deviations of the good and bad groups.
|
| + This score is a percentage representing our confidence that the good
|
| + results and the bad results come from two distinct groups, i.e. that
|
| + the difference between them is not due to chance alone.
|
|
|
| - TODO(qyearsley): Replace this confidence function with a function that
|
| - uses a Student's t-test. The confidence would be (1 - p-value), where
|
| - p-value is the probability of obtaining the given a set of good and bad
|
| - values just by chance.
|
|
|
| Args:
|
| good_results_lists: A list of lists of "good" result numbers.
|
| bad_results_lists: A list of lists of "bad" result numbers.
|
|
|
| Returns:
|
| - A number between in the range [0, 100].
|
| + A number in the range [0, 100].
|
| """
|
| - # Get the distance between the two groups.
|
| - means_good = map(math_utils.Mean, good_results_lists)
|
| - means_bad = map(math_utils.Mean, bad_results_lists)
|
| - bounds_good = (min(means_good), max(means_good))
|
| - bounds_bad = (min(means_bad), max(means_bad))
|
| - dist_between_groups = min(
|
| - math.fabs(bounds_bad[1] - bounds_good[0]),
|
| - math.fabs(bounds_bad[0] - bounds_good[1]))
|
| -
|
| - # Get the sum of the standard deviations of the two groups.
|
| - good_results_flattened = sum(good_results_lists, [])
|
| - bad_results_flattened = sum(bad_results_lists, [])
|
| - stddev_good = math_utils.StandardDeviation(good_results_flattened)
|
| - stddev_bad = math_utils.StandardDeviation(bad_results_flattened)
|
| - stddev_sum = stddev_good + stddev_bad
|
| -
|
| - confidence = dist_between_groups / (max(0.0001, stddev_sum))
|
| - confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
|
| - return confidence
|
| + if not good_results_lists or not bad_results_lists:
|
| + return 0.0
|
| +
|
| + # Flatten the lists of results lists.
|
| + sample1 = sum(good_results_lists, [])
|
| + sample2 = sum(bad_results_lists, [])
|
| +
|
| + # The p-value is approximately the probability that a difference at least
|
| + # this large between the good and bad values would arise by chance alone.
|
| + _, _, p_value = ttest.WelchsTTest(sample1, sample2)
|
| + return 100.0 * (1.0 - p_value)
|
|
|
|
|
| def GetSHA1HexDigest(contents):
|
|
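For reference, here is a minimal sketch of the calculation the patched ConfidenceScore performs. The auto_bisect ttest module itself is not part of this change, so the sketch substitutes scipy.stats for it; that substitution, and the assumption that WelchsTTest returns a (t-statistic, degrees-of-freedom, p-value) tuple as the unpacking above suggests, are illustrative only.

    # Sketch only: scipy.stats stands in for auto_bisect.ttest.WelchsTTest.
    from scipy import stats


    def ConfidenceScoreSketch(good_results_lists, bad_results_lists):
      """Returns a confidence percentage in the range [0, 100]."""
      if not good_results_lists or not bad_results_lists:
        return 0.0

      # Flatten the per-revision lists of results into two samples.
      sample1 = sum(good_results_lists, [])
      sample2 = sum(bad_results_lists, [])

      # Welch's t-test does not assume the two samples share a variance. Its
      # p-value estimates how likely a difference at least this large would be
      # if both samples actually came from the same distribution.
      _, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)
      return 100.0 * (1.0 - p_value)


    # Well-separated groups score close to 100; overlapping, noisy groups
    # score much lower.
    print(ConfidenceScoreSketch([[2.0, 2.1], [2.2]], [[5.0, 5.1], [4.9]]))
    print(ConfidenceScoreSketch([[2.0, 4.0], [3.0]], [[2.5, 3.5], [3.1]]))

The real ttest module may compute the p-value differently (for example, without a scipy dependency); the scipy call above is only a stand-in to show where the returned percentage comes from.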
|