Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(152)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1610203003: Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Rebasing Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import re 6 import re
7 import time
7 8
8 from . import bisect_results 9 from . import bisect_results
9 from . import depot_config 10 from . import depot_config
10 from . import revision_state 11 from . import revision_state
11 12
12 _DEPS_SHA_PATCH = """ 13 _DEPS_SHA_PATCH = """
13 diff --git DEPS.sha DEPS.sha 14 diff --git DEPS.sha DEPS.sha
14 new file mode 100644 15 new file mode 100644
15 --- /dev/null 16 --- /dev/null
16 +++ DEPS.sha 17 +++ DEPS.sha
(...skipping 18 matching lines...) Expand all
35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. 36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.
36 'MISSING_METRIC', # The metric was not found in the test text/json output. 37 'MISSING_METRIC', # The metric was not found in the test text/json output.
37 'LO_FINAL_CONF', # The bisect completed without a culprit. 38 'LO_FINAL_CONF', # The bisect completed without a culprit.
38 ) 39 )
39 40
# While looking for the next revision to build, nearby revisions are probed
# for one that has already been archived. To avoid drifting too far away from
# the revision originally chosen, the probe is capped at 25% of the range.
DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25
44 45
# Wall-clock budget (seconds) for repeatedly re-testing the initial good/bad
# range until a statistically significant difference is established.
REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60

# Upper bound on samples collected per reference-range revision; if this many
# samples still do not reach statistical significance, give up.
MAX_REQUIRED_SAMPLES = 50

# Significance level (decimal fraction) used when hypothesis-testing whether
# two revisions' samples differ.
SIGNIFICANCE_LEVEL = 0.01
45 57
46 class Bisector(object): 58 class Bisector(object):
47 """This class abstracts an ongoing bisect (or n-sect) job.""" 59 """This class abstracts an ongoing bisect (or n-sect) job."""
48 60
49 def __init__(self, api, bisect_config, revision_class, init_revisions=True): 61 def __init__(self, api, bisect_config, revision_class, init_revisions=True):
50 """Initializes the state of a new bisect job from a dictionary. 62 """Initializes the state of a new bisect job from a dictionary.
51 63
52 Note that the initial good_rev and bad_rev MUST resolve to a commit position 64 Note that the initial good_rev and bad_rev MUST resolve to a commit position
53 in the chromium repo. 65 in the chromium repo.
54 """ 66 """
55 super(Bisector, self).__init__() 67 super(Bisector, self).__init__()
56 self._api = api 68 self._api = api
57 self.ensure_sync_master_branch() 69 self.ensure_sync_master_branch()
58 self.bisect_config = bisect_config 70 self.bisect_config = bisect_config
59 self.config_step() 71 self.config_step()
60 self.revision_class = revision_class 72 self.revision_class = revision_class
61 self.result_codes = set() 73 self.result_codes = set()
74 self.last_tested_revision = None
62 75
63 # Test-only properties. 76 # Test-only properties.
64 # TODO: Replace these with proper mod_test_data. 77 # TODO: Replace these with proper mod_test_data.
65 self.dummy_initial_confidence = bisect_config.get( 78 self.dummy_initial_confidence = bisect_config.get(
66 'dummy_initial_confidence') 79 'dummy_initial_confidence')
67 self.dummy_builds = bisect_config.get('dummy_builds', False) 80 self.dummy_builds = bisect_config.get('dummy_builds', False)
68 81
69 # Load configuration items. 82 # Load configuration items.
70 self.test_type = bisect_config.get('test_type', 'perf') 83 self.test_type = bisect_config.get('test_type', 'perf')
71 self.improvement_direction = int(bisect_config.get( 84 self.improvement_direction = int(bisect_config.get(
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 self.good_rev.good = True 120 self.good_rev.good = True
108 self.good_rev.read_deps(self.get_perf_tester_name()) 121 self.good_rev.read_deps(self.get_perf_tester_name())
109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ 122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [
110 '%s: %s' % (key, value) for key, value in 123 '%s: %s' % (key, value) for key, value in
111 self.good_rev.deps.iteritems()] 124 self.good_rev.deps.iteritems()]
112 self.good_rev.deps = {} 125 self.good_rev.deps = {}
113 self.lkgr = self.good_rev 126 self.lkgr = self.good_rev
114 if init_revisions: 127 if init_revisions:
115 self._expand_chromium_revision_range() 128 self._expand_chromium_revision_range()
116 129
130 def significantly_different(
131 self, list_a, list_b,
132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover
133 """Uses an external script to run hypothesis testing with scipy.
134
135 The reason why we need an external script is that scipy is not available to
136 the default python installed in all platforms. We instead rely on an
137 anaconda environment to provide those packages.
138
139 Args:
140 list_a, list_b: Two lists representing samples to be compared.
141 significance_level: Self-describing. As a decimal fraction.
142
143 Returns:
144 A boolean indicating whether the null hypothesis ~(that the lists are
145 samples from the same population) can be rejected at the specified
146 significance level.
147 """
148 step_result = self.api.m.python(
149 'Checking sample difference',
150 self.api.resource('significantly_different.py'),
151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],
152 stdout=self.api.m.json.output())
153 results = step_result.stdout
154 if results is None:
155 assert self.dummy_builds
156 return True
157 significantly_different = results['significantly_different']
158 step_result.presentation.logs[str(significantly_different)] = [
159 'See json.output for details']
160 return significantly_different
161
117 def config_step(self): 162 def config_step(self):
118 """Yields a simple echo step that outputs the bisect config.""" 163 """Yields a simple echo step that outputs the bisect config."""
119 api = self.api 164 api = self.api
120 # bisect_config may come as a FrozenDict (which is not serializable). 165 # bisect_config may come as a FrozenDict (which is not serializable).
121 bisect_config = dict(self.bisect_config) 166 bisect_config = dict(self.bisect_config)
122 167
123 def fix_windows_backslashes(s): 168 def fix_windows_backslashes(s):
124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') 169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')
125 return backslash_regex.sub(r'\\', s) 170 return backslash_regex.sub(r'\\', s)
126 171
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after
405 self.failed_direction = True 450 self.failed_direction = True
406 self.warnings.append('The initial regression range for return code ' 451 self.warnings.append('The initial regression range for return code '
407 'appears to show NO sign of a regression.') 452 'appears to show NO sign of a regression.')
408 453
409 def _set_failed_direction_results(self): # pragma: no cover 454 def _set_failed_direction_results(self): # pragma: no cover
410 self.failed_direction = True 455 self.failed_direction = True
411 self.warnings.append('The initial regression range appears to represent ' 456 self.warnings.append('The initial regression range appears to represent '
412 'an improvement rather than a regression, given the ' 457 'an improvement rather than a regression, given the '
413 'expected direction of improvement.') 458 'expected direction of improvement.')
414 459
415 def check_initial_confidence(self): 460 def check_initial_confidence(self): # pragma: no cover
416 """Checks that the initial range presents a clear enough regression. 461 """Checks that the initial range presents a clear enough regression.
417 462
418 We calculate the confidence of the results of the given 'good' 463 We calculate the confidence of the results of the given 'good'
419 and 'bad' revisions and compare it against the required confidence 464 and 'bad' revisions and compare it against the required confidence
420 set for the bisector. 465 set for the bisector.
421 466
422 Note that when a dummy regression confidence value has been set, that 467 Note that when a dummy regression confidence value has been set, that
423 is used instead. 468 is used instead.
424 """ 469 """
425 if self.test_type != 'perf': 470 if self.test_type != 'perf':
426 return True 471 return True
427 472
428 if self.required_initial_confidence is None: 473 if self.required_initial_confidence is None:
429 return True # pragma: no cover 474 return True # pragma: no cover
430 475
476 # TODO(robertocn): Remove all uses of "confidence".
431 if self.dummy_initial_confidence is not None: 477 if self.dummy_initial_confidence is not None:
432 self.initial_confidence = float( 478 self.initial_confidence = float(
433 self.dummy_initial_confidence) 479 self.dummy_initial_confidence)
480 if (float(self.initial_confidence) <
481 float(self.required_initial_confidence)):
482 self._set_insufficient_confidence_warning()
483 return False
484 return True
434 485
435 else: # pragma: no cover 486 if self.dummy_builds:
436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5: 487 dummy_result = self.good_rev.values != self.bad_rev.values
437 # If there are too few values, the confidence score is not a good way to 488 if not dummy_result:
438 # determine whether the regression is reproducible. 489 self._set_insufficient_confidence_warning()
439 # TODO(robertocn): Investigate a straightforward approach to deal with 490 return dummy_result
440 # these cases. Such as the mean of one group lying within the range of 491
441 # the other. 492 with self.api.m.step.nest('Re-testing reference range'):
442 return True 493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT
443 self.initial_confidence = ( 494 while time.time() < expiration_time:
444 self.api.m.math_utils.confidence_score( 495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5:
445 self.good_rev.values, 496 if self.significantly_different(self.good_rev.values,
446 self.bad_rev.values)) 497 self.bad_rev.values):
447 if (self.initial_confidence < 498 return True
448 self.required_initial_confidence): # pragma: no cover 499 if len(self.good_rev.values) == len(self.bad_rev.values):
449 self._set_insufficient_confidence_warning(self.initial_confidence) 500 revision_to_retest = self.last_tested_revision
501 else:
502 revision_to_retest = min(self.good_rev, self.bad_rev,
503 key=lambda x: len(x.values))
504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:
505 revision_to_retest.retest()
506 else:
507 break
508 self._set_insufficient_confidence_warning()
450 return False 509 return False
451 return True 510
452 511
453 def get_exception(self): 512 def get_exception(self):
454 raise NotImplementedError() # pragma: no cover 513 raise NotImplementedError() # pragma: no cover
455 # TODO: should return an exception with the details of the failure. 514 # TODO: should return an exception with the details of the failure.
456 515
457 def _set_insufficient_confidence_warning( 516 def _set_insufficient_confidence_warning(
458 self, actual_confidence): # pragma: no cover 517 self): # pragma: no cover
459 """Adds a warning about the lack of initial regression confidence.""" 518 """Adds a warning about the lack of initial regression confidence."""
460 self.failed_initial_confidence = True 519 self.failed_initial_confidence = True
461 self.surface_result('LO_INIT_CONF') 520 self.surface_result('LO_INIT_CONF')
462 self.warnings.append( 521 self.warnings.append(
463 ('Bisect failed to reproduce the regression with enough confidence. ' 522 'Bisect failed to reproduce the regression with enough confidence.')
464 'Needed {:.2f}%, got {:.2f}%.').format(
465 self.required_initial_confidence, actual_confidence))
466 523
467 def _results_debug_message(self): 524 def _results_debug_message(self):
468 """Returns a string with values used to debug a bisect result.""" 525 """Returns a string with values used to debug a bisect result."""
469 result = 'bisector.lkgr: %r\n' % self.lkgr 526 result = 'bisector.lkgr: %r\n' % self.lkgr
470 result += 'bisector.fkbr: %r\n\n' % self.fkbr 527 result += 'bisector.fkbr: %r\n\n' % self.fkbr
471 result += self._revision_value_table() 528 result += self._revision_value_table()
472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): 529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):
473 result += '\n' + self._t_test_results() 530 result += '\n' + self._t_test_results()
474 return result 531 return result
475 532
(...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after
816 873
817 def surface_result(self, result_string): 874 def surface_result(self, result_string):
818 assert result_string in VALID_RESULT_CODES 875 assert result_string in VALID_RESULT_CODES
819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). 876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).
820 result_code = prefix + result_string 877 result_code = prefix + result_string
821 assert len(result_code) <= 20 878 assert len(result_code) <= 20
822 if result_code not in self.result_codes: 879 if result_code not in self.result_codes:
823 self.result_codes.add(result_code) 880 self.result_codes.add(result_code)
824 properties = self.api.m.step.active_result.presentation.properties 881 properties = self.api.m.step.active_result.presentation.properties
825 properties['extra_result_code'] = sorted(self.result_codes) 882 properties['extra_result_code'] = sorted(self.result_codes)
OLDNEW
« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/api.py ('k') | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698