OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Performance Test Bisect Tool | 6 """Performance Test Bisect Tool |
7 | 7 |
8 This script bisects a series of changelists using binary search. It starts at | 8 This script bisects a series of changelists using binary search. It starts at |
9 a bad revision where a performance metric has regressed, and asks for a last | 9 a bad revision where a performance metric has regressed, and asks for a last |
10 known-good revision. It will then binary search across this revision range by | 10 known-good revision. It will then binary search across this revision range by |
(...skipping 153 matching lines...)
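To make the docstring's search concrete, here is a minimal sketch of binary-search bisection over a revision range; BisectRevisions and is_good_revision are hypothetical stand-ins for the script's real sync/build/test machinery, not its actual API.

    def BisectRevisions(revisions, is_good_revision):
      """Returns the first bad revision in a range sorted oldest to newest.

      Assumes revisions[0] is known-good and revisions[-1] is known-bad.
      """
      good, bad = 0, len(revisions) - 1
      while bad - good > 1:
        mid = (good + bad) // 2
        # In the real script this step syncs, builds, and runs the perf test.
        if is_good_revision(revisions[mid]):
          good = mid
        else:
          bad = mid
      return revisions[bad]

Each iteration halves the candidate range, so a range of N revisions needs only about log2(N) build-and-test cycles.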
164 BUILD_RESULT_FAIL = 1 | 164 BUILD_RESULT_FAIL = 1 |
165 BUILD_RESULT_SKIPPED = 2 | 165 BUILD_RESULT_SKIPPED = 2 |
166 | 166 |
167 # Maximum time in seconds to wait after posting build request to tryserver. | 167 # Maximum time in seconds to wait after posting build request to tryserver. |
168 # TODO: Change these values based on the actual time taken by buildbots on | 168 # TODO: Change these values based on the actual time taken by buildbots on |
169 # the tryserver. | 169 # the tryserver. |
170 MAX_MAC_BUILD_TIME = 14400 | 170 MAX_MAC_BUILD_TIME = 14400 |
171 MAX_WIN_BUILD_TIME = 14400 | 171 MAX_WIN_BUILD_TIME = 14400 |
172 MAX_LINUX_BUILD_TIME = 14400 | 172 MAX_LINUX_BUILD_TIME = 14400 |
173 | 173 |
| 174 # The confidence percentage at which confidence can be considered "high". |
| 175 HIGH_CONFIDENCE = 95 |
| 176 |
174 # Patch template to add a new file, DEPS.sha, under the src folder. | 177 # Patch template to add a new file, DEPS.sha, under the src folder. |
175 # This file contains the SHA1 value of the DEPS changes made while | 178 # This file contains the SHA1 value of the DEPS changes made while |
176 # bisecting dependency repositories. This patch is sent along with the | 179 # bisecting dependency repositories. This patch is sent along with the |
177 # DEPS patch to the tryserver. Once a build is produced, the bisect | 180 # DEPS patch to the tryserver. Once a build is produced, the bisect |
178 # builders on the tryserver read the SHA1 value from this file and | 181 # builders on the tryserver read the SHA1 value from this file and |
179 # append it to the build archive filename. | 182 # append it to the build archive filename. |
180 DEPS_SHA_PATCH = """diff --git src/DEPS.sha src/DEPS.sha | 183 DEPS_SHA_PATCH = """diff --git src/DEPS.sha src/DEPS.sha |
181 new file mode 100644 | 184 new file mode 100644 |
182 --- /dev/null | 185 --- /dev/null |
183 +++ src/DEPS.sha | 186 +++ src/DEPS.sha |
184 @@ -0,0 +1 @@ | 187 @@ -0,0 +1 @@ |
185 +%(deps_sha)s | 188 +%(deps_sha)s |
186 """ | 189 """ |
187 | 190 |
188 # The possible values of the --bisect_mode flag, which determines what to | 191 # The possible values of the --bisect_mode flag, which determines what to |
189 # use when classifying a revision as "good" or "bad". | 192 # use when classifying a revision as "good" or "bad". |
190 BISECT_MODE_MEAN = 'mean' | 193 BISECT_MODE_MEAN = 'mean' |
191 BISECT_MODE_STD_DEV = 'std_dev' | 194 BISECT_MODE_STD_DEV = 'std_dev' |
192 BISECT_MODE_RETURN_CODE = 'return_code' | 195 BISECT_MODE_RETURN_CODE = 'return_code' |
193 | 196 |
194 # The perf dashboard specifically looks for the string | 197 # The perf dashboard looks for a string like "Estimated Confidence: 95%" |
195 # "Estimated Confidence: 95%" to decide whether or not to cc the author(s). | 198 # to decide whether or not to cc the author(s). If you change this, please |
196 # If you change this, please update the perf dashboard as well. | 199 # update the perf dashboard as well. |
197 RESULTS_BANNER = """ | 200 RESULTS_BANNER = """ |
198 ===== BISECT JOB RESULTS ===== | 201 ===== BISECT JOB RESULTS ===== |
199 Status: %(status)s | 202 Status: %(status)s |
200 | 203 |
201 Test Command: %(command)s | 204 Test Command: %(command)s |
202 Test Metric: %(metrics)s | 205 Test Metric: %(metrics)s |
203 Relative Change: %(change)s | 206 Relative Change: %(change)s |
204 Estimated Confidence: %(confidence)d%%""" | 207 Estimated Confidence: %(confidence)d%%""" |
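For illustration, the banner might be rendered as follows; all values here are made up. Note the trailing %% in the template, which prints a literal percent sign after the confidence value so the dashboard's expected string appears intact.

    print RESULTS_BANNER % {
        'status': 'Completed',
        'command': 'tools/perf/run_benchmark dummy_benchmark',  # hypothetical
        'metrics': 'warm_times/page_load_time',  # hypothetical
        'change': '11.35%',
        'confidence': 95,
    }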
205 | 208 |
206 # The perf dashboard specifically looks for the string | 209 # The perf dashboard specifically looks for the string |
(...skipping 66 matching lines...)
273 their differences aren't due to chance alone. | 276 their differences aren't due to chance alone. |
274 | 277 |
275 | 278 |
276 Args: | 279 Args: |
277 good_results_lists: A list of lists of "good" result numbers. | 280 good_results_lists: A list of lists of "good" result numbers. |
278 bad_results_lists: A list of lists of "bad" result numbers. | 281 bad_results_lists: A list of lists of "bad" result numbers. |
279 | 282 |
280 Returns: | 283 Returns: |
281 A number in the range [0, 100]. | 284 A number in the range [0, 100]. |
282 """ | 285 """ |
283 if not good_results_lists or not bad_results_lists: | 286 # If there's only one item in either list, this means only one revision was |
| 287 # classified good or bad; this isn't good enough evidence to make a decision. |
| 288 # If an empty list was passed, that also implies zero confidence. |
| 289 if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1: |
284 return 0.0 | 290 return 0.0 |
285 | 291 |
286 # Flatten the lists of results lists. | 292 # Flatten the lists of results lists. |
287 sample1 = sum(good_results_lists, []) | 293 sample1 = sum(good_results_lists, []) |
288 sample2 = sum(bad_results_lists, []) | 294 sample2 = sum(bad_results_lists, []) |
| 295 |
| 296 # If either list contained only empty lists (this is unexpected and |
| 297 # normally shouldn't happen), then we also want to return 0. |
289 if not sample1 or not sample2: | 298 if not sample1 or not sample2: |
290 return 0.0 | 299 return 0.0 |
291 | 300 |
292 # The p-value is approximately the probability of obtaining the given set | 301 # The p-value is approximately the probability of obtaining the given set |
293 # of good and bad values just by chance. | 302 # of good and bad values just by chance. |
294 _, _, p_value = ttest.WelchsTTest(sample1, sample2) | 303 _, _, p_value = ttest.WelchsTTest(sample1, sample2) |
295 return 100.0 * (1.0 - p_value) | 304 return 100.0 * (1.0 - p_value) |
296 | 305 |
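To see how the p-value maps to the returned confidence score, here is a self-contained sketch using scipy's Welch's t-test as a stand-in for the script's bundled ttest module, which this diff does not show; the sample values are made up.

    # scipy stands in for the internal ttest module for illustration only.
    from scipy import stats

    good_sample = [2.0, 2.1, 2.0, 1.9]  # flattened "good" results (made up)
    bad_sample = [3.0, 3.1, 2.9, 3.0]   # flattened "bad" results (made up)
    _, p_value = stats.ttest_ind(good_sample, bad_sample, equal_var=False)
    confidence = 100.0 * (1.0 - p_value)
    # Well-separated samples yield a p-value near 0, so the confidence
    # approaches 100; overlapping samples push it toward 0.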
297 | 306 |
298 def GetSHA1HexDigest(contents): | 307 def GetSHA1HexDigest(contents): |
(...skipping 2583 matching lines...)
2882 | 2891 |
2883 self._PrintTestedCommitsTable(revision_data_sorted, | 2892 self._PrintTestedCommitsTable(revision_data_sorted, |
2884 results_dict['first_working_revision'], | 2893 results_dict['first_working_revision'], |
2885 results_dict['last_broken_revision'], | 2894 results_dict['last_broken_revision'], |
2886 100, final_step=False) | 2895 100, final_step=False) |
2887 | 2896 |
2888 def _ConfidenceLevelStatus(self, results_dict): | 2897 def _ConfidenceLevelStatus(self, results_dict): |
2889 if not results_dict['confidence']: | 2898 if not results_dict['confidence']: |
2890 return None | 2899 return None |
2891 confidence_status = 'Successful with %(level)s confidence%(warning)s.' | 2900 confidence_status = 'Successful with %(level)s confidence%(warning)s.' |
2892 if results_dict['confidence'] >= 95: | 2901 if results_dict['confidence'] >= HIGH_CONFIDENCE: |
2893 level = 'high' | 2902 level = 'high' |
2894 else: | 2903 else: |
2895 level = 'low' | 2904 level = 'low' |
2896 warning = ' and warnings' | 2905 warning = ' and warnings' |
2897 if not self.warnings: | 2906 if not self.warnings: |
2898 warning = '' | 2907 warning = '' |
2899 return confidence_status % {'level': level, 'warning': warning} | 2908 return confidence_status % {'level': level, 'warning': warning} |
2900 | 2909 |
2901 def _GetViewVCLinkFromDepotAndHash(self, cl, depot): | 2910 def _GetViewVCLinkFromDepotAndHash(self, cl, depot): |
2902 info = self.source_control.QueryRevisionInfo(cl, | 2911 info = self.source_control.QueryRevisionInfo(cl, |
(...skipping 263 matching lines...)
3166 'confidence': confidence, | 3175 'confidence': confidence, |
3167 } | 3176 } |
3168 | 3177 |
3169 def _CheckForWarnings(self, results_dict): | 3178 def _CheckForWarnings(self, results_dict): |
3170 if len(results_dict['culprit_revisions']) > 1: | 3179 if len(results_dict['culprit_revisions']) > 1: |
3171 self.warnings.append('Due to build errors, regression range could ' | 3180 self.warnings.append('Due to build errors, regression range could ' |
3172 'not be narrowed down to a single commit.') | 3181 'not be narrowed down to a single commit.') |
3173 if self.opts.repeat_test_count == 1: | 3182 if self.opts.repeat_test_count == 1: |
3174 self.warnings.append('Tests were only set to run once. This may ' | 3183 self.warnings.append('Tests were only set to run once. This may ' |
3175 'be insufficient to get meaningful results.') | 3184 'be insufficient to get meaningful results.') |
3176 if results_dict['confidence'] < 100: | 3185 if 0 < results_dict['confidence'] < HIGH_CONFIDENCE: |
3177 if results_dict['confidence']: | 3186 self.warnings.append('Confidence is not high. Try bisecting again ' |
3178 self.warnings.append( | 3187 'with increased repeat_count, larger range, or ' |
3179 'Confidence is less than 100%. There could be other candidates ' | 3188 'on another metric.') |
3180 'for this regression. Try bisecting again with increased ' | 3189 if not results_dict['confidence']: |
3181 'repeat_count or on a sub-metric that shows the regression more ' | 3190 self.warnings.append('Confidence score is 0%. Try bisecting again on ' |
3182 'clearly.') | 3191 'another platform or another metric.') |
3183 else: | |
3184 self.warnings.append( | |
3185 'Confidence is 0%. Try bisecting again on another platform, with ' | |
3186 'increased repeat_count or on a sub-metric that shows the ' | |
3187 'regression more clearly.') | |
3188 | 3192 |
3189 def FormatAndPrintResults(self, bisect_results): | 3193 def FormatAndPrintResults(self, bisect_results): |
3190 """Prints the results from a bisection run in a readable format. | 3194 """Prints the results from a bisection run in a readable format. |
3191 | 3195 |
3192 Args: | 3196 Args: |
3193 bisect_results: The results from a bisection test run. | 3197 bisect_results: The results from a bisection test run. |
3194 """ | 3198 """ |
3195 revision_data = bisect_results['revision_data'] | 3199 revision_data = bisect_results['revision_data'] |
3196 revision_data_sorted = sorted(revision_data.iteritems(), | 3200 revision_data_sorted = sorted(revision_data.iteritems(), |
3197 key = lambda x: x[1]['sort']) | 3201 key = lambda x: x[1]['sort']) |
(...skipping 460 matching lines...)
3658 # bugs. If you change this, please update the perf dashboard as well. | 3662 # bugs. If you change this, please update the perf dashboard as well. |
3659 bisect_utils.OutputAnnotationStepStart('Results') | 3663 bisect_utils.OutputAnnotationStepStart('Results') |
3660 print 'Error: %s' % e.message | 3664 print 'Error: %s' % e.message |
3661 if opts.output_buildbot_annotations: | 3665 if opts.output_buildbot_annotations: |
3662 bisect_utils.OutputAnnotationStepClosed() | 3666 bisect_utils.OutputAnnotationStepClosed() |
3663 return 1 | 3667 return 1 |
3664 | 3668 |
3665 | 3669 |
3666 if __name__ == '__main__': | 3670 if __name__ == '__main__': |
3667 sys.exit(main()) | 3671 sys.exit(main()) |