Chromium Code Reviews

Unified Diff: tools/bisect-perf-regression_test.py

Issue 413393002: Use Welch's t-test to calculate confidence scores in the bisect script. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Add Chromium copyright notice to ttest.py. Created 6 years, 5 months ago
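Background on the change: the old ConfidenceScore divided the distance between the closest group means by the sum of the standard deviations (see the removed test code below); the patch derives the score from Welch's t-test instead, using a ttest.py module that the patch also touches. The sketch below is not the code under review; it is a minimal illustration, assuming SciPy is available and assuming confidence is defined as 100 * (1 - p-value), of how such a score could be computed. The function name welch_confidence_score and the empty-group guard are illustrative choices, and single-point or zero-variance groups (both exercised by the new tests) would need extra handling that this sketch omits.

# Illustrative sketch only; not the reviewed implementation.
from scipy import stats  # assumption: SciPy available here for brevity


def welch_confidence_score(bad_value_lists, good_value_lists):
  """Returns a 0-100 score for how distinct two groups of samples are."""
  # Flatten the per-revision value lists into two flat samples.
  bad_sample = [v for values in bad_value_lists for v in values]
  good_sample = [v for values in good_value_lists for v in values]
  # An empty group cannot be compared at all, so report zero confidence.
  if not bad_sample or not good_sample:
    return 0.0
  # equal_var=False selects Welch's t-test (variances may differ).
  _t_statistic, p_value = stats.ttest_ind(bad_sample, good_sample,
                                          equal_var=False)
  # A small p-value means strong evidence that the groups differ.
  return 100.0 * (1.0 - p_value)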
Index: tools/bisect-perf-regression_test.py
diff --git a/tools/bisect-perf-regression_test.py b/tools/bisect-perf-regression_test.py
index 913a851bcd3e054898c7eb5167abeb3c44c0dcd5..d4e88d24f64a568a2ffaf31344396f1d5cc7f49d 100644
--- a/tools/bisect-perf-regression_test.py
+++ b/tools/bisect-perf-regression_test.py
@@ -31,36 +31,65 @@ class BisectPerfRegressionTest(unittest.TestCase):
"""Cleans up the test environment after each test method."""
pass
- def testConfidenceScore(self):
+ def testConfidenceScoreHigh(self):
"""Tests the confidence calculation."""
- bad_values = [[0, 1], [1, 2]]
- good_values = [[6, 7], [7, 8]]
- # Closest means are mean(1, 2) and mean(6, 7).
- distance = 6.5 - 1.5
- # Standard deviation of [n-1, n, n, n+1] is 0.8165.
- stddev_sum = 0.8165 + 0.8165
- # Expected confidence is an int in the range [0, 100].
- expected_confidence = min(100, int(100 * distance / float(stddev_sum)))
- self.assertEqual(
- expected_confidence,
- bisect_perf_module.ConfidenceScore(bad_values, good_values))
+ bad_values = [[0, 1, 1], [1, 2, 2]]
+ good_values = [[1, 2, 2], [3, 3, 4]]
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(95.0, confidence)
- def testConfidenceScoreZeroConfidence(self):
+ def testConfidenceScoreNotSoHigh(self):
+ """Tests the confidence calculation."""
+ bad_values = [[0, 1, 1], [1, 2, 2]]
+ good_values = [[1, 1, 1], [3, 3, 4]]
+ # The good and bad groups are closer together than in the above test,
+ # so the confidence that they're different is a little lower.
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(80.0, confidence)
+
+ def testConfidenceScoreZero(self):
"""Tests the confidence calculation when it's expected to be 0."""
- bad_values = [[0, 1], [1, 2], [4, 5], [0, 2]]
- good_values = [[4, 5], [6, 7], [7, 8]]
- # Both groups have value lists with means of 4.5, which means distance
- # between groups is zero, and thus confidence is zero.
- self.assertEqual(
- 0, bisect_perf_module.ConfidenceScore(bad_values, good_values))
+ bad_values = [[4, 5], [7, 6], [8, 7]]
+ good_values = [[8, 7], [6, 7], [5, 4]]
+ # The good and bad sets contain the same values, so the confidence that
+ # they're different should be zero.
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(0.0, confidence)
- def testConfidenceScoreMaxConfidence(self):
- """Tests the confidence calculation when it's expected to be 100."""
+ def testConfidenceScoreVeryHigh(self):
+ """Tests the confidence calculation when it's expected to be very high."""
bad_values = [[1, 1], [1, 1]]
good_values = [[1.2, 1.2], [1.2, 1.2]]
- # Standard deviation in both groups is zero, so confidence is 100.
- self.assertEqual(
- 100, bisect_perf_module.ConfidenceScore(bad_values, good_values))
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(99.9, confidence)
+
+ def testConfidenceScoreImbalance(self):
+ """Tests the confidence calculation when one set of numbers is small."""
+ bad_values = [[1.1, 1.2], [1.1, 1.2], [1.0, 1.3], [1.2, 1.3]]
+ good_values = [[1.4]]
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(80.0, confidence)
+
+ def testConfidenceScoreEmptyGroup(self):
+ """Tests the confidence calculation when one set of numbers is empty."""
+ bad_values = [[1.1, 1.2], [1.1, 1.2], [1.0, 1.3], [1.2, 1.3]]
+ good_values = []
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(0.0, confidence)
+
+ def testConfidenceScoreFunctionalTestResultsInconsistent(self):
+ """Tests the confidence calculation when 0/1 results are mixed between the groups."""
+ bad_values = [[1], [1], [0], [1], [1], [1], [0], [1]]
+ good_values = [[0], [0], [1], [0], [1], [0]]
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(80.0, confidence)
+
+ def testConfidenceScoreFunctionalTestResultsConsistent(self):
+ """Tests the confidence calculation when 0/1 results separate cleanly by group."""
+ bad_values = [[1], [1], [1], [1], [1], [1], [1], [1]]
+ good_values = [[0], [0], [0], [0], [0], [0]]
+ confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+ self.assertEqual(99.9, confidence)
def testParseDEPSStringManually(self):
"""Tests DEPS parsing."""

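For reference when reading the expectations above: Welch's t-test compares the means of two samples without assuming equal variances. The helper below writes out the textbook t statistic and the Welch-Satterthwaite degrees of freedom in plain Python; the names are illustrative, and this is a sketch of the standard formulas rather than the patch's ttest.py.

import math


def welch_t_and_df(sample1, sample2):
  """Returns (t statistic, degrees of freedom) for two samples."""
  n1, n2 = len(sample1), len(sample2)
  mean1 = sum(sample1) / float(n1)
  mean2 = sum(sample2) / float(n2)
  # Unbiased sample variances (zero-variance samples, as in
  # testConfidenceScoreVeryHigh above, would need special-casing).
  var1 = sum((x - mean1) ** 2 for x in sample1) / float(n1 - 1)
  var2 = sum((x - mean2) ** 2 for x in sample2) / float(n2 - 1)
  se1, se2 = var1 / n1, var2 / n2
  t = (mean1 - mean2) / math.sqrt(se1 + se2)
  # Welch-Satterthwaite approximation of the degrees of freedom.
  df = (se1 + se2) ** 2 / (se1 ** 2 / (n1 - 1) + se2 ** 2 / (n2 - 1))
  return t, df

From t and df one obtains a two-tailed p-value (via a t-distribution table or CDF); the smaller that p-value, the higher the reported confidence that the good and bad samples genuinely differ.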