Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(232)

Side by Side Diff: scripts/slave/recipe_modules/auto_bisect/bisector.py

Issue 1702013004: Revert of Iteratively increase sample size for good/bad classification. (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/build.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2015 The Chromium Authors. All rights reserved. 1 # Copyright 2015 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import json 5 import json
6 import re 6 import re
7 import time
8 7
9 from . import bisect_results 8 from . import bisect_results
10 from . import depot_config 9 from . import depot_config
11 from . import revision_state 10 from . import revision_state
12 11
13 _DEPS_SHA_PATCH = """ 12 _DEPS_SHA_PATCH = """
14 diff --git DEPS.sha DEPS.sha 13 diff --git DEPS.sha DEPS.sha
15 new file mode 100644 14 new file mode 100644
16 --- /dev/null 15 --- /dev/null
17 +++ DEPS.sha 16 +++ DEPS.sha
(...skipping 18 matching lines...) Expand all
36 'LO_INIT_CONF', # Bisect aborted early for lack of confidence. 35 'LO_INIT_CONF', # Bisect aborted early for lack of confidence.
37 'MISSING_METRIC', # The metric was not found in the test text/json output. 36 'MISSING_METRIC', # The metric was not found in the test text/json output.
38 'LO_FINAL_CONF', # The bisect completed without a culprit. 37 'LO_FINAL_CONF', # The bisect completed without a culprit.
39 ) 38 )
40 39
41 # When we look for the next revision to build, we search nearby revisions 40 # When we look for the next revision to build, we search nearby revisions
42 # looking for a revision that's already been archived. Since we don't want 41 # looking for a revision that's already been archived. Since we don't want
43 # to move *too* far from the original revision, we'll cap the search at 25%. 42 # to move *too* far from the original revision, we'll cap the search at 25%.
44 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25 43 DEFAULT_SEARCH_RANGE_PERCENTAGE = 0.25
45 44
46 # How long to re-test the initial good-bad range for until significant
47 # difference is established.
48 REGRESSION_CHECK_TIMEOUT = 2 * 60 * 60
49 # If we reach this number of samples on the reference range and have not
50 # achieved statistical significance, bail.
51 MAX_REQUIRED_SAMPLES = 50
52
53 # Significance level to use for determining difference between revisions via
54 # hypothesis testing.
55 SIGNIFICANCE_LEVEL = 0.01
56
57 45
58 class Bisector(object): 46 class Bisector(object):
59 """This class abstracts an ongoing bisect (or n-sect) job.""" 47 """This class abstracts an ongoing bisect (or n-sect) job."""
60 48
61 def __init__(self, api, bisect_config, revision_class, init_revisions=True): 49 def __init__(self, api, bisect_config, revision_class, init_revisions=True):
62 """Initializes the state of a new bisect job from a dictionary. 50 """Initializes the state of a new bisect job from a dictionary.
63 51
64 Note that the initial good_rev and bad_rev MUST resolve to a commit position 52 Note that the initial good_rev and bad_rev MUST resolve to a commit position
65 in the chromium repo. 53 in the chromium repo.
66 """ 54 """
67 super(Bisector, self).__init__() 55 super(Bisector, self).__init__()
68 self._api = api 56 self._api = api
69 self.ensure_sync_master_branch() 57 self.ensure_sync_master_branch()
70 self.bisect_config = bisect_config 58 self.bisect_config = bisect_config
71 self.config_step() 59 self.config_step()
72 self.revision_class = revision_class 60 self.revision_class = revision_class
73 self.result_codes = set() 61 self.result_codes = set()
74 self.last_tested_revision = None
75 62
76 # Test-only properties. 63 # Test-only properties.
77 # TODO: Replace these with proper mod_test_data. 64 # TODO: Replace these with proper mod_test_data.
78 self.dummy_initial_confidence = bisect_config.get( 65 self.dummy_initial_confidence = bisect_config.get(
79 'dummy_initial_confidence') 66 'dummy_initial_confidence')
80 self.dummy_builds = bisect_config.get('dummy_builds', False) 67 self.dummy_builds = bisect_config.get('dummy_builds', False)
81 68
82 # Load configuration items. 69 # Load configuration items.
83 self.test_type = bisect_config.get('test_type', 'perf') 70 self.test_type = bisect_config.get('test_type', 'perf')
84 self.improvement_direction = int(bisect_config.get( 71 self.improvement_direction = int(bisect_config.get(
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
120 self.good_rev.good = True 107 self.good_rev.good = True
121 self.good_rev.read_deps(self.get_perf_tester_name()) 108 self.good_rev.read_deps(self.get_perf_tester_name())
122 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [ 109 api.m.step.active_result.presentation.logs['Debug Good Revision DEPS'] = [
123 '%s: %s' % (key, value) for key, value in 110 '%s: %s' % (key, value) for key, value in
124 self.good_rev.deps.iteritems()] 111 self.good_rev.deps.iteritems()]
125 self.good_rev.deps = {} 112 self.good_rev.deps = {}
126 self.lkgr = self.good_rev 113 self.lkgr = self.good_rev
127 if init_revisions: 114 if init_revisions:
128 self._expand_chromium_revision_range() 115 self._expand_chromium_revision_range()
129 116
130 def significantly_different(
131 self, list_a, list_b,
132 significance_level=SIGNIFICANCE_LEVEL): # pragma: no cover
133 """Uses an external script to run hypothesis testing with scipy.
134
135 The reason why we need an external script is that scipy is not available to
136 the default python installed in all platforms. We instead rely on an
137 anaconda environment to provide those packages.
138
139 Args:
140 list_a, list_b: Two lists representing samples to be compared.
141 significance_level: Self-describing. As a decimal fraction.
142
143 Returns:
144 A boolean indicating whether the null hypothesis ~(that the lists are
145 samples from the same population) can be rejected at the specified
146 significance level.
147 """
148 step_result = self.api.m.python(
149 'Checking sample difference',
150 self.api.resource('significantly_different.py'),
151 [json.dumps(list_a), json.dumps(list_b), str(significance_level)],
152 stdout=self.api.m.json.output())
153 results = step_result.stdout
154 if results is None:
155 assert self.dummy_builds
156 return True
157 significantly_different = results['significantly_different']
158 step_result.presentation.logs[str(significantly_different)] = [
159 'See json.output for details']
160 return significantly_different
161
162 def config_step(self): 117 def config_step(self):
163 """Yields a simple echo step that outputs the bisect config.""" 118 """Yields a simple echo step that outputs the bisect config."""
164 api = self.api 119 api = self.api
165 # bisect_config may come as a FrozenDict (which is not serializable). 120 # bisect_config may come as a FrozenDict (which is not serializable).
166 bisect_config = dict(self.bisect_config) 121 bisect_config = dict(self.bisect_config)
167 122
168 def fix_windows_backslashes(s): 123 def fix_windows_backslashes(s):
169 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)') 124 backslash_regex = re.compile(r'(?<!\\)\\(?!\\)')
170 return backslash_regex.sub(r'\\', s) 125 return backslash_regex.sub(r'\\', s)
171 126
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after
450 self.failed_direction = True 405 self.failed_direction = True
451 self.warnings.append('The initial regression range for return code ' 406 self.warnings.append('The initial regression range for return code '
452 'appears to show NO sign of a regression.') 407 'appears to show NO sign of a regression.')
453 408
454 def _set_failed_direction_results(self): # pragma: no cover 409 def _set_failed_direction_results(self): # pragma: no cover
455 self.failed_direction = True 410 self.failed_direction = True
456 self.warnings.append('The initial regression range appears to represent ' 411 self.warnings.append('The initial regression range appears to represent '
457 'an improvement rather than a regression, given the ' 412 'an improvement rather than a regression, given the '
458 'expected direction of improvement.') 413 'expected direction of improvement.')
459 414
460 def check_initial_confidence(self): # pragma: no cover 415 def check_initial_confidence(self):
461 """Checks that the initial range presents a clear enough regression. 416 """Checks that the initial range presents a clear enough regression.
462 417
463 We calculate the confidence of the results of the given 'good' 418 We calculate the confidence of the results of the given 'good'
464 and 'bad' revisions and compare it against the required confidence 419 and 'bad' revisions and compare it against the required confidence
465 set for the bisector. 420 set for the bisector.
466 421
467 Note that when a dummy regression confidence value has been set, that 422 Note that when a dummy regression confidence value has been set, that
468 is used instead. 423 is used instead.
469 """ 424 """
470 if self.test_type != 'perf': 425 if self.test_type != 'perf':
471 return True 426 return True
472 427
473 if self.required_initial_confidence is None: 428 if self.required_initial_confidence is None:
474 return True # pragma: no cover 429 return True # pragma: no cover
475 430
476 # TODO(robertocn): Remove all uses of "confidence".
477 if self.dummy_initial_confidence is not None: 431 if self.dummy_initial_confidence is not None:
478 self.initial_confidence = float( 432 self.initial_confidence = float(
479 self.dummy_initial_confidence) 433 self.dummy_initial_confidence)
480 if (float(self.initial_confidence) <
481 float(self.required_initial_confidence)):
482 self._set_insufficient_confidence_warning()
483 return False
484 return True
485 434
486 if self.dummy_builds: 435 else: # pragma: no cover
487 dummy_result = self.good_rev.values != self.bad_rev.values 436 if len(self.good_rev.values) < 5 or len(self.bad_rev.values) < 5:
488 if not dummy_result: 437 # If there are too few values, the confidence score is not a good way to
489 self._set_insufficient_confidence_warning() 438 # determine whether the regression is reproducible.
490 return dummy_result 439 # TODO(robertocn): Investigate a straightforward approach to deal with
491 440 # these cases. Such as the mean of one group lying within the range of
492 with self.api.m.step.nest('Re-testing reference range'): 441 # the other.
493 expiration_time = time.time() + REGRESSION_CHECK_TIMEOUT 442 return True
494 while time.time() < expiration_time: 443 self.initial_confidence = (
495 if len(self.good_rev.values) >= 5 and len(self.bad_rev.values) >= 5: 444 self.api.m.math_utils.confidence_score(
496 if self.significantly_different(self.good_rev.values, 445 self.good_rev.values,
497 self.bad_rev.values): 446 self.bad_rev.values))
498 return True 447 if (self.initial_confidence <
499 if len(self.good_rev.values) == len(self.bad_rev.values): 448 self.required_initial_confidence): # pragma: no cover
500 revision_to_retest = self.last_tested_revision 449 self._set_insufficient_confidence_warning(self.initial_confidence)
501 else:
502 revision_to_retest = min(self.good_rev, self.bad_rev,
503 key=lambda x: len(x.values))
504 if len(revision_to_retest.values) < MAX_REQUIRED_SAMPLES:
505 revision_to_retest.retest()
506 else:
507 break
508 self._set_insufficient_confidence_warning()
509 return False 450 return False
510 451 return True
511 452
512 def get_exception(self): 453 def get_exception(self):
513 raise NotImplementedError() # pragma: no cover 454 raise NotImplementedError() # pragma: no cover
514 # TODO: should return an exception with the details of the failure. 455 # TODO: should return an exception with the details of the failure.
515 456
516 def _set_insufficient_confidence_warning( 457 def _set_insufficient_confidence_warning(
517 self): # pragma: no cover 458 self, actual_confidence): # pragma: no cover
518 """Adds a warning about the lack of initial regression confidence.""" 459 """Adds a warning about the lack of initial regression confidence."""
519 self.failed_initial_confidence = True 460 self.failed_initial_confidence = True
520 self.surface_result('LO_INIT_CONF') 461 self.surface_result('LO_INIT_CONF')
521 self.warnings.append( 462 self.warnings.append(
522 'Bisect failed to reproduce the regression with enough confidence.') 463 ('Bisect failed to reproduce the regression with enough confidence. '
464 'Needed {:.2f}%, got {:.2f}%.').format(
465 self.required_initial_confidence, actual_confidence))
523 466
524 def _results_debug_message(self): 467 def _results_debug_message(self):
525 """Returns a string with values used to debug a bisect result.""" 468 """Returns a string with values used to debug a bisect result."""
526 result = 'bisector.lkgr: %r\n' % self.lkgr 469 result = 'bisector.lkgr: %r\n' % self.lkgr
527 result += 'bisector.fkbr: %r\n\n' % self.fkbr 470 result += 'bisector.fkbr: %r\n\n' % self.fkbr
528 result += self._revision_value_table() 471 result += self._revision_value_table()
529 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values): 472 if (self.lkgr and self.lkgr.values and self.fkbr and self.fkbr.values):
530 result += '\n' + self._t_test_results() 473 result += '\n' + self._t_test_results()
531 return result 474 return result
532 475
(...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after
873 816
874 def surface_result(self, result_string): 817 def surface_result(self, result_string):
875 assert result_string in VALID_RESULT_CODES 818 assert result_string in VALID_RESULT_CODES
876 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n). 819 prefix = 'B4T_' # To avoid collision. Stands for bisect (abbr. `a la i18n).
877 result_code = prefix + result_string 820 result_code = prefix + result_string
878 assert len(result_code) <= 20 821 assert len(result_code) <= 20
879 if result_code not in self.result_codes: 822 if result_code not in self.result_codes:
880 self.result_codes.add(result_code) 823 self.result_codes.add(result_code)
881 properties = self.api.m.step.active_result.presentation.properties 824 properties = self.api.m.step.active_result.presentation.properties
882 properties['extra_result_code'] = sorted(self.result_codes) 825 properties['extra_result_code'] = sorted(self.result_codes)
OLDNEW
« no previous file with comments | « scripts/slave/recipe_modules/auto_bisect/api.py ('k') | scripts/slave/recipe_modules/auto_bisect/bisector_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698