Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(525)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Making initial regression check have a timeout. Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import re 6 import re
7 import time
7 8
8 from . import bisect_results 9 from . import bisect_results
9 from . import depot_config 10 from . import depot_config
10 from . import revision_state 11 from . import revision_state
11 12
12 _DEPS_SHA_PATCH = """ 13 _DEPS_SHA_PATCH = """
13 diff --git DEPS.sha DEPS.sha 14 diff --git DEPS.sha DEPS.sha
14 new file mode 100644 15 new file mode 100644
15 --- /dev/null 16 --- /dev/null
16 +++ DEPS.sha 17 +++ DEPS.sha
(...skipping 18 matching lines...) Expand all
35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. 36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.
36 'MISSING_METRIC', # The metric was not found in the test text/json output. 37 'MISSING_METRIC', # The metric was not found in the test text/json output.
37 'LO_FINAL_CONF', # The bisect completed without a culprit. 38 'LO_FINAL_CONF', # The bisect completed without a culprit.
38 ) 39 )
39 40
40 # When we look for the next revision to build, we search nearby revisions 41 # When we look for the next revision to build, we search nearby revisions
41 # looking for a revision that's already been archived. Since we don't want 42 # looking for a revision that's already been archived. Since we don't want
42 # to move *too* far from the original revision, we'll cap the search at 25%. 43 # to move *too* far from the original revision, we'll cap the search at 25%.
43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25
44 45
46 # How long to re-test the initial good-bad range for until significant
47 # difference is established.
48 REGRESSION_CHECK_TIMEOUT = 6 * 60 * 60
dtu 2016/02/05 00:18:01 Is that six hours? Do we really have that much tim
49
50 # Significance level to use for determining difference between revisions via
51 # hypothesis testing.
52 SIGNIFICANCE_LEVEL = 0.05
53
45 54
46 class Bisector(object): 55 class Bisector(object):
47 """This class abstracts an ongoing bisect (or n-sect) job.""" 56 """This class abstracts an ongoing bisect (or n-sect) job."""
48 57
49 def __init__(self, api, bisect_config, revision_class, init_revisions=True): 58 def __init__(self, api, bisect_config, revision_class, init_revisions=True):
50 """Initializes the state of a new bisect job from a dictionary. 59 """Initializes the state of a new bisect job from a dictionary.
51 60
52 Note that the initial good_rev and bad_rev MUST resolve to a commit position 61 Note that the initial good_rev and bad_rev MUST resolve to a commit position
53 in the chromium repo. 62 in the chromium repo.
54 """ 63 """
(...skipping 15 matching lines...) Expand all
70 self.test_type = bisect_config.get('test_type', 'perf') 79 self.test_type = bisect_config.get('test_type', 'perf')
71 self.improvement_direction = int(bisect_config.get( 80 self.improvement_direction = int(bisect_config.get(
72 'improvement_direction', 0)) or None 81 'improvement_direction', 0)) or None
73 # Required initial confidence might be explicitly set to None, 82 # Required initial confidence might be explicitly set to None,
74 # which means that no initial confidence check should be done. 83 # which means that no initial confidence check should be done.
75 if ('required_initial_confidence' in bisect_config and 84 if ('required_initial_confidence' in bisect_config and
76 bisect_config['required_initial_confidence'] is None): 85 bisect_config['required_initial_confidence'] is None):
77 self.required_initial_confidence = None # pragma: no cover 86 self.required_initial_confidence = None # pragma: no cover
78 else: 87 else:
79 self.required_initial_confidence = float(bisect_config.get( 88 self.required_initial_confidence = float(bisect_config.get(
80 'required_initial_confidence', 80)) 89 'required_initial_confidence', 80))
dtu 2016/02/05 00:18:02 Should we rethink what this value is? Also make it
81 90
82 self.warnings = [] 91 self.warnings = []
83 92
84 # Status flags 93 # Status flags
85 self.initial_confidence = None 94 self.initial_confidence = None
86 self.failed_initial_confidence = False 95 self.failed_initial_confidence = False
87 self.failed = False 96 self.failed = False
88 self.failed_direction = False 97 self.failed_direction = False
89 self.lkgr = None # Last known good revision 98 self.lkgr = None # Last known good revision
90 self.fkbr = None # First known bad revision 99 self.fkbr = None # First known bad revision
(...skipping 16 matching lines...) Expand all
107 self.good_rev.good = True 116 self.good_rev.good = True
108 self.good_rev.read_deps(self.get_perf_tester_name()) 117 self.good_rev.read_deps(self.get_perf_tester_name())
109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ 118 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [
110 '%s: %s' % (key, value) for key, value in 119 '%s: %s' % (key, value) for key, value in
111 self.good_rev.deps.iteritems()] 120 self.good_rev.deps.iteritems()]
112 self.good_rev.deps = {} 121 self.good_rev.deps = {}
113 self.lkgr = self.good_rev 122 self.lkgr = self.good_rev
114 if init_revisions: 123 if init_revisions:
115 self._expand_chromium_revision_range() 124 self._expand_chromium_revision_range()
116 125
126 def significantly_different(
127 self, list_a, list_b,
128 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover
129 """Uses an external script to run hypothesis testing with scipy.
130
131 The reason why we need an external script is that scipy is not available to
132 the default python installed in all platforms. We instead rely on an
133 anaconda environment to provide those packages.
134
135 Args:
136 list_a, list_b: Two lists representing samples to be compared.
137 significance_level: Self-describing. As a decimal fraction.
138
139 Returns:
140 A boolean indicating whether the null hypothesis ~(that the lists are
141 samples from the same population) can be rejected at the specified
142 significance level.
143 """
144 step_result = self.api.m.python(
145 'Checking sample difference',
146 self.api.resource('significantly_different.py'),
147 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],
148 stdout=self.api.m.json.output())
149 results = step_result.stdout
150 if results is None:
151 assert self.dummy_builds
152 return True
153 significantly_different = results['significantly_different']
154 step_result.presentation.logs[str(significantly_different)] = [
155 'See json.output for details']
156 return significantly_different
157
117 def config_step(self): 158 def config_step(self):
118 """Yields a simple echo step that outputs the bisect config.""" 159 """Yields a simple echo step that outputs the bisect config."""
119 api = self.api 160 api = self.api
120 # bisect_config may come as a FrozenDict (which is not serializable). 161 # bisect_config may come as a FrozenDict (which is not serializable).
121 bisect_config = dict(self.bisect_config) 162 bisect_config = dict(self.bisect_config)
122 163
123 def fix_windows_backslashes(s): 164 def fix_windows_backslashes(s):
124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') 165 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')
125 return backslash_regex.sub(r'\\', s) 166 return backslash_regex.sub(r'\\', s)
126 167
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after
405 self.failed_direction = True 446 self.failed_direction = True
406 self.warnings.append('The initial regression range for return code ' 447 self.warnings.append('The initial regression range for return code '
407 'appears to show NO sign of a regression.') 448 'appears to show NO sign of a regression.')
408 449
409 def _set_failed_direction_results(self): # pragma: no cover 450 def _set_failed_direction_results(self): # pragma: no cover
410 self.failed_direction = True 451 self.failed_direction = True
411 self.warnings.append('The initial regression range appears to represent ' 452 self.warnings.append('The initial regression range appears to represent '
412 'an improvement rather than a regression, given the ' 453 'an improvement rather than a regression, given the '
413 'expected direction of improvement.') 454 'expected direction of improvement.')
414 455
415 def check_initial_confidence(self): 456 def check_initial_confidence(self): # pragma: no cover
416 """Checks that the initial range presents a clear enough regression. 457 """Checks that the initial range presents a clear enough regression.
417 458
418 We calculate the confidence of the results of the given 'good' 459 We calculate the confidence of the results of the given 'good'
419 and 'bad' revisions and compare it against the required confidence 460 and 'bad' revisions and compare it against the required confidence
420 set for the bisector. 461 set for the bisector.
421 462
422 Note that when a dummy regression confidence value has been set, that 463 Note that when a dummy regression confidence value has been set, that
423 is used instead. 464 is used instead.
424 """ 465 """
425 if self.test_type != 'perf': 466 if self.test_type != 'perf':
426 return True 467 return True
427 468
428 if self.required_initial_confidence is None: 469 if self.required_initial_confidence is None:
429 return True # pragma: no cover 470 return True # pragma: no cover
430 471
472 # TODO(robertocn): Remove all uses of "confidence".
431 if self.dummy_initial_confidence is not None: 473 if self.dummy_initial_confidence is not None:
432 self.initial_confidence = float( 474 self.initial_confidence = float(
433 self.dummy_initial_confidence) 475 self.dummy_initial_confidence)
476 if (float(self.initial_confidence) <
477 float(self.required_initial_confidence)):
478 self._set_insufficient_confidence_warning()
479 return False
480 return True
434 481
435 else: # pragma: no cover 482 if self.dummy_builds:
436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: 483 dummy_result = self.good_rev.values != self.bad_rev.values
437 # If there are too few values, the confidence score is not a good way to 484 if not dummy_result:
438 # determine whether the regression is reproducible. 485 self._set_insufficient_confidence_warning()
439 # TODO(robertocn): Investigate a straightforward approach to deal with 486 return dummy_result
dtu 2016/02/05 00:18:01 Seems like there's a bunch of technical debt relat
440 # these cases. Such as the mean of one group lying within the range of 487
441 # the other. 488 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT
442 return True 489 while time.time() < expiration_time:
443 self.initial_confidence = ( 490 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:
444 self.api.m.math_utils.confidence_score( 491 if self.significantly_different(self.good_rev.values,
445 self.good_rev.values, 492 self.bad_rev.values):
446 self.bad_rev.values)) 493 return True
447 if (self.initial_confidence < 494 min(self.good_rev, self.bad_rev, key=lambda x: len(x.values)).retest()
448 self.required_initial_confidence): # pragma: no cover 495 self._set_insufficient_confidence_warning()
449 self._set_insufficient_confidence_warning(self.initial_confidence) 496 return False
450 return False 497
451 return True
452 498
453 def get_exception(self): 499 def get_exception(self):
454 raise NotImplementedError() # pragma: no cover 500 raise NotImplementedError() # pragma: no cover
455 # TODO: should return an exception with the details of the failure. 501 # TODO: should return an exception with the details of the failure.
456 502
457 def _set_insufficient_confidence_warning( 503 def _set_insufficient_confidence_warning(
458 self, actual_confidence): # pragma: no cover 504 self): # pragma: no cover
459 """Adds a warning about the lack of initial regression confidence.""" 505 """Adds a warning about the lack of initial regression confidence."""
460 self.failed_initial_confidence = True 506 self.failed_initial_confidence = True
461 self.surface_result('LO_INIT_CONF') 507 self.surface_result('LO_INIT_CONF')
462 self.warnings.append( 508 self.warnings.append(
463 ('Bisect failed to reproduce the regression with enough confidence. ' 509 'Bisect failed to reproduce the regression with enough confidence.')
464 'Needed {:.2f}%, got {:.2f}%.').format(
465 self.required_initial_confidence, actual_confidence))
466 510
467 def _results_debug_message(self): 511 def _results_debug_message(self):
468 """Returns a string with values used to debug a bisect result.""" 512 """Returns a string with values used to debug a bisect result."""
469 result = 'bisector.lkgr: %r\n' % self.lkgr 513 result = 'bisector.lkgr: %r\n' % self.lkgr
470 result += 'bisector.fkbr: %r\n\n' % self.fkbr 514 result += 'bisector.fkbr: %r\n\n' % self.fkbr
471 result += self._revision_value_table() 515 result += self._revision_value_table()
472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): 516 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):
473 result += '\n' + self._t_test_results() 517 result += '\n' + self._t_test_results()
474 return result 518 return result
475 519
(...skipping 334 matching lines...) Expand 10 before | Expand all | Expand 10 after
810 854
811 def surface_result(self, result_string): 855 def surface_result(self, result_string):
812 assert result_string in VALID_RESULT_CODES 856 assert result_string in VALID_RESULT_CODES
813 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). 857 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).
814 result_code = prefix + result_string 858 result_code = prefix + result_string
815 assert len(result_code) <= 20 859 assert len(result_code) <= 20
816 if result_code not in self.result_codes: 860 if result_code not in self.result_codes:
817 self.result_codes.add(result_code) 861 self.result_codes.add(result_code)
818 properties = self.api.m.step.active_result.presentation.properties 862 properties = self.api.m.step.active_result.presentation.properties
819 properties['extra_result_code'] = sorted(self.result_codes) 863 properties['extra_result_code'] = sorted(self.result_codes)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698