Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(74)

Unified Diff: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Fixing multiple problems Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
diff --git a/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py b/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
index 41529e8b93566add7fc08abbeb90f8094497ff8c..62cc076ce9594976482d0c8bed4ca1003aa2c4af 100644
--- a/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
+++ b/scripts/slave/recipe_modules/auto_bisect/perf_revision_state.py
@@ -3,6 +3,7 @@
# found in the LICENSE file.
import json
+import math
import tempfile
import os
import uuid
@@ -22,9 +23,10 @@ class PerfRevisionState(revision_state.RevisionState):
self.values = []
self.mean_value = None
self.std_dev = None
+ self.custom_reps = None
self._test_config = None
- def _read_test_results(self):
+ def _read_test_results(self, check_revision_goodness=True):
"""Gets the test results from GS and checks if the rev is good or bad."""
test_results = self._get_test_results()
# Results will contain the keys 'results' and 'output' where output is the
@@ -37,7 +39,7 @@ class PerfRevisionState(revision_state.RevisionState):
if 'MISSING_METRIC' in results.get('errors'): # pragma: no cover
self.bisector.surface_result('MISSING_METRIC')
return
- self.values = results['values']
+ self.values = self.values or [] + results['values']
qyearsley 2016/01/26 19:26:00 `or` is less tightly binding than `+`, so this is equivalent to `self.values or ([] + results['values'])`, which is probably not what was intended.
RobertoCN 2016/02/01 17:29:50 Done.
if self.bisector.is_return_code_mode():
retcodes = test_results['retcodes']
overall_return_code = 0 if all(v == 0 for v in retcodes) else 1
@@ -52,7 +54,8 @@ class PerfRevisionState(revision_state.RevisionState):
self.bisector.surface_result('MISSING_METRIC')
return
# We cannot test the goodness of the initial rev range.
- if self.bisector.good_rev != self and self.bisector.bad_rev != self:
+ if (self.bisector.good_rev != self and self.bisector.bad_rev != self and
+ check_revision_goodness):
if self._check_revision_good():
self.good = True
else:
@@ -122,8 +125,16 @@ class PerfRevisionState(revision_state.RevisionState):
if k in required_test_properties:
result[k] = v
self._test_config = result
+ if self.custom_reps: # pragma: no cover
qyearsley 2016/01/26 19:26:00 Two spaces before #.
RobertoCN 2016/02/01 17:29:50 Done.
+ result['repeat_count'] = self.custom_reps
+ else:
+ # We need at the very least 3 data points. (e.g. that's the minimum for
+ # shapiro's test for normality [even though we don't really use it])
qyearsley 2016/01/26 19:26:00 The parenthetical part of this comment could probably be removed.
RobertoCN 2016/02/01 17:29:50 Done.
+ result['repeat_count'] = max(int(result['repeat_count']), 3)
return result
+ # It seems it should be okay to call this again to test the revision some
+ # more.
qyearsley 2016/01/26 19:26:00 I'm not sure whether this comment is helpful; maybe it could be removed.
RobertoCN 2016/02/01 17:29:50 Yep, I was supposed to remove it and forgot. Done
def _do_test(self):
"""Posts a request to buildbot to download and perf-test this build."""
if self.bisector.dummy_builds:
@@ -179,6 +190,12 @@ class PerfRevisionState(revision_state.RevisionState):
'job_name': self.job_name,
}
+ def retest(self, target_sample_size): # pragma: no cover
+ self.status = PerfRevisionState.NEED_MORE_DATA
+ self.custom_reps = target_sample_size - len(self.values)
+ self.start_job()
+ self.bisector.wait_for_any([self])
qyearsley 2016/01/26 19:26:00 This triggers the bisect to run another |self.custom_reps| repetitions for this revision, right?
RobertoCN 2016/02/01 17:29:50 Yeah.
+
def _get_test_results(self):
"""Tries to get the results of a test job from cloud storage."""
api = self.bisector.api
@@ -193,23 +210,42 @@ class PerfRevisionState(revision_state.RevisionState):
else:
return json.loads(step_result.stdout)
+ def _calculate_target_sample_size(self): # pragma: no cover
+ # Make the smallest bigger than the largest, by increasing it at least 33%
+ sizes = map(lambda x: len(x.values), [self.bisector.lkgr, self,
qyearsley 2016/01/26 19:26:00 Extra space after the comma.
RobertoCN 2016/02/01 17:29:50 Done.
+ self.bisector.fkbr])
+ smallest_sample_size = min(sizes)
+ target_sample_size = max(sizes) + 1
+ delta = target_sample_size - smallest_sample_size
dtu 2016/02/05 00:18:01 delta is unused.
+ min_increase = math.ceil(smallest_sample_size/3.0)
qyearsley 2016/01/26 19:26:00 Optional: could add spaces around the / operator.
RobertoCN 2016/02/01 17:29:50 Done.
+ return max(smallest_sample_size + min_increase, target_sample_size)
+
def _check_revision_good(self):
qyearsley 2016/01/26 19:26:00 Parts of the previous docstring could be kept here
RobertoCN 2016/02/01 17:29:51 Done.
- """Determines if a revision is good or bad.
-
- Note that our current approach is to determine whether it is closer to
- either the 'good' and 'bad' revisions given for the bisect job.
-
- Returns:
- True if this revision is closer to the initial good revision's value than
- to the initial bad revision's value. False otherwise.
- """
- # TODO: Reevaluate this approach
- bisector = self.bisector
- distance_to_good = abs(self.mean_value - bisector.good_rev.mean_value)
- distance_to_bad = abs(self.mean_value - bisector.bad_rev.mean_value)
- if distance_to_good < distance_to_bad:
- return True
- return False
+ diff_from_good = self.bisector.significantly_different(
+ self.bisector.lkgr.values, self.values)
+ diff_from_bad = self.bisector.significantly_different(
+ self.bisector.fkbr.values, self.values)
+ while not (diff_from_good or diff_from_bad): # pragma: no cover
+ target_sample_size = self._calculate_target_sample_size()
+ min(self.bisector.lkgr, self, self.bisector.fkbr,
+ key=lambda x: len(x.values)).retest(target_sample_size)
+ diff_from_good = self.bisector.significantly_different(
+ self.bisector.lkgr.values, self.values)
+ diff_from_bad = self.bisector.significantly_different(
+ self.bisector.fkbr.values, self.values)
+ if diff_from_good and diff_from_bad:
qyearsley 2016/01/26 19:26:00 A blank line above this if block might slightly help readability.
RobertoCN 2016/02/01 17:29:50 Done.
+ # Multiple regressions.
+ # For now, proceed bisecting the biggest difference of the means, and
+ # handle the secondary regression by spinning a second job.
qyearsley 2016/01/26 19:26:00 No second job is started as of this CL, right?
RobertoCN 2016/02/01 17:29:50 Correct, for now we'll go with the larger regression.
+ dist_from_good = abs(self.mean_value - self.bisector.lkgr.mean_value)
+ dist_from_bad = abs(self.mean_value - self.bisector.fkbr.mean_value)
+ if dist_from_good > dist_from_bad:
+ #self.bisector.handle_secondary_regression(self, self.bisector.fkbr)
+ return False
+ else:
+ #self.bisector.handle_secondary_regression(self.bisector.lkgr, self)
+ return True
+ return diff_from_bad # pragma: no cover
def __repr__(self):
return ('PerfRevisionState(cp=%s, values=%r, mean_value=%r, std_dev=%r)' %

Powered by Google App Engine
This is Rietveld 408576698