Chromium Code Reviews

Index: tools/auto_bisect/bisect_perf_regression.py
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index ec3e118df267f5206f97389b10d752d1ddf47621..bb71f5b4e39c046e09a3e3337ad39e6b94e0d8e7 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -56,7 +56,7 @@ import math_utils
 import request_build
 import source_control as source_control_module
 import ttest
-from telemetry.util import cloud_storage
+from telemetry.util import cloud_storage  # pylint: disable=F0401

 # Below is the map of "depot" names to information about each depot. Each depot
 # is a repository, and in the process of bisecting, revision ranges in these
@@ -868,44 +868,218 @@ def _PrintStepTime(revision_data_sorted):
       seconds=int(step_perf_time_avg))


-def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
-  """Compiles a list of other possible regressions from the revision data.
+class BisectResults(object):
+  """This class holds the results of the bisect."""

-  Args:
-    revision_data_sorted: Sorted list of (revision, revision data) pairs.
-    bad_greater_than_good: Whether the result value at the "bad" revision is
-        numerically greater than the result value at the "good" revision.
+  def __init__(self, bisect_perf_metrics, source_control):
+    self._bisect_perf_metrics = bisect_perf_metrics
qyearsley 2014/09/19 07:35:16:
Initial thought: BisectPerformanceMetrics is a mas
Sergiy Byelozyorov 2014/09/19 13:36:22:
Done.
+    self.revision_data = {}
+    self.error = None
+    self._source_control = source_control

-  Returns:
-    A list of [current_rev, previous_rev, confidence] for other places where
-    there may have been a regression.
-  """
-  other_regressions = []
-  previous_values = []
-  previous_id = None
-  for current_id, current_data in revision_data_sorted:
-    current_values = current_data['value']
-    if current_values:
-      current_values = current_values['values']
-      if previous_values:
-        confidence = ConfidenceScore(previous_values, [current_values])
-        mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
-        mean_of_current_runs = math_utils.Mean(current_values)
-
-        # Check that the potential regression is in the same direction as
-        # the overall regression. If the mean of the previous runs < the
-        # mean of the current runs, this local regression is in same
-        # direction.
-        prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
-        is_same_direction = (prev_less_than_current if
-            bad_greater_than_good else not prev_less_than_current)
-
-        # Only report potential regressions with high confidence.
-        if is_same_direction and confidence > 50:
-          other_regressions.append([current_id, previous_id, confidence])
-      previous_values.append(current_values)
-      previous_id = current_id
-  return other_regressions
+  @staticmethod
+  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
+    """Compiles a list of other possible regressions from the revision data.
+
+    Args:
+      revision_data_sorted: Sorted list of (revision, revision data) pairs.
+      bad_greater_than_good: Whether the result value at the "bad" revision is
+          numerically greater than the result value at the "good" revision.
+
+    Returns:
+      A list of [current_rev, previous_rev, confidence] for other places where
+      there may have been a regression.
+    """
+    other_regressions = []
+    previous_values = []
+    previous_id = None
+    for current_id, current_data in revision_data_sorted:
+      current_values = current_data['value']
+      if current_values:
+        current_values = current_values['values']
+        if previous_values:
+          confidence = ConfidenceScore(previous_values, [current_values])
+          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
+          mean_of_current_runs = math_utils.Mean(current_values)
+
+          # Check that the potential regression is in the same direction as
+          # the overall regression. If the mean of the previous runs < the
+          # mean of the current runs, this local regression is in same
+          # direction.
+          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
+          is_same_direction = (prev_less_than_current if
+              bad_greater_than_good else not prev_less_than_current)
+
+          # Only report potential regressions with high confidence.
+          if is_same_direction and confidence > 50:
+            other_regressions.append([current_id, previous_id, confidence])
+        previous_values.append(current_values)
+        previous_id = current_id
+    return other_regressions
+
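For intuition about the direction filter above, here is a minimal, self-contained sketch of the same check on invented data. mean() and fake_confidence() are simplified stand-ins for math_utils.Mean and ConfidenceScore (the real score comes from the ttest module imported at the top of this file), so the numbers are illustrative only:

    # Hypothetical sketch of the same-direction filter in _FindOtherRegressions.
    def mean(values):
      return float(sum(values)) / len(values)

    def fake_confidence(previous_values, current_values):
      # Toy stand-in for ConfidenceScore: 100 if the flattened groups do not
      # overlap at all, else 0.
      prev_flat = sum(previous_values, [])
      return 100.0 if max(prev_flat) < min(current_values) else 0.0

    def is_same_direction(prev_runs, current_runs, bad_greater_than_good):
      prev_less_than_current = mean(sum(prev_runs, [])) < mean(current_runs)
      return (prev_less_than_current if bad_greater_than_good
              else not prev_less_than_current)

    # Two sampled runs before a suspected change-point and one run after it;
    # the metric regressed upward, so bad_greater_than_good is True.
    previous = [[10.1, 10.3], [10.2, 10.0]]
    current = [14.8, 15.1]
    if (is_same_direction(previous, current, True) and
        fake_confidence(previous, current) > 50):
      print 'potential regression in the same direction'
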
+  def GetResultsDict(self):
+    """Returns a dictionary with the following fields:
qyearsley 2014/09/19 07:35:16:
The first line of a docstring is generally a self-
Sergiy Byelozyorov 2014/09/19 13:36:22:
Done.
+
+    'first_working_revision': First good revision.
+    'last_broken_revision': Last bad revision.
+    'culprit_revisions': A list of revisions, which contain the bad change
+        introducing the failure.
+    'other_regressions': A list of tuples representing other regressions,
+        which may have occurred.
+    'regression_size': For performance bisects, this is a relative change of
+        the mean metric value. For other bisects this field always contains
+        'zero-to-nonzero'.
+    'regression_std_err': For performance bisects, it is a pooled standard
+        error for groups of good and bad runs. Not used for other bisects.
+    'confidence': For performance bisects, it is a confidence that the good
+        and bad runs are distinct groups. Not used for non-performance
+        bisects.

qyearsley 2014/09/19 07:35:16:
I think this formatting might look better if line
Sergiy Byelozyorov 2014/09/19 13:36:22:
Done.

+
+    'revision_data': The raw revision_data dict, mapping revision ids to data
+        about each revision.
+    'revision_data_sorted': List of (revision id, revision data) pairs,
+        sorted in the order of the commits. Each piece of revision data is a
+        dict with the following keys:
+
+        'passed': Represents whether the performance test was successful at
+            that revision. Possible values include: 1 (passed), 0 (failed),
+            '?' (skipped), 'F' (build failed).
+        'depot': The depot that this revision is from (i.e. WebKit)
+        'external': If the revision is a 'src' revision, 'external' contains
+            the revisions of each of the external libraries.
+        'sort': A sort value for sorting the dict in order of commits.
+
+        For example:
+        [('CL #1',
+          {
+            'passed': False,
+            'depot': 'chromium',
+            'external': None,
+            'sort': 0
+          })]
+    """
+    revision_data_sorted = sorted(self.revision_data.iteritems(),
+                                  key = lambda x: x[1]['sort'])
+
+    # Find range where it possibly broke.
+    first_working_revision = None
+    first_working_revision_index = -1
+    last_broken_revision = None
+    last_broken_revision_index = -1
+
+    culprit_revisions = []
+    other_regressions = []
+    regression_size = 0.0
+    regression_std_err = 0.0
+    confidence = 0.0
+
+    for i in xrange(len(revision_data_sorted)):
+      k, v = revision_data_sorted[i]
+      if v['passed'] == 1:
+        if not first_working_revision:
+          first_working_revision = k
+          first_working_revision_index = i
+
+      if not v['passed']:
+        last_broken_revision = k
+        last_broken_revision_index = i
+
+    if last_broken_revision != None and first_working_revision != None:
+      broken_means = []
+      for i in xrange(0, last_broken_revision_index + 1):
+        if revision_data_sorted[i][1]['value']:
+          broken_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      working_means = []
+      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
+        if revision_data_sorted[i][1]['value']:
+          working_means.append(revision_data_sorted[i][1]['value']['values'])
+
+      # Flatten the lists to calculate mean of all values.
+      working_mean = sum(working_means, [])
+      broken_mean = sum(broken_means, [])
+
+      # Calculate the approximate size of the regression
+      mean_of_bad_runs = math_utils.Mean(broken_mean)
+      mean_of_good_runs = math_utils.Mean(working_mean)
+
+      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
+                                                        mean_of_bad_runs)
+      if math.isnan(regression_size):
+        regression_size = 'zero-to-nonzero'
+
+      regression_std_err = math.fabs(math_utils.PooledStandardError(
+          [working_mean, broken_mean]) /
+          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
+
+      # Give a "confidence" in the bisect. At the moment we use how distinct the
+      # values are before and after the last broken revision, and how noisy the
+      # overall graph is.
+      confidence = ConfidenceScore(working_means, broken_means)
+
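A rough worked example of the statistics above, on invented sample values. relative_change and pooled_standard_error below are simplified stand-ins for the math_utils helpers (whose exact definitions live in math_utils.py), so the numbers are illustrative only:

    # Hypothetical worked example of the regression-size math above.
    import math

    def relative_change(before, after):
      # Simplified: magnitude of the change relative to the "before" value.
      return abs(after - before) / abs(before)

    def pooled_standard_error(groups):
      # Simplified pooled standard error across sample groups.
      dof = 0
      sum_of_squares = 0.0
      for group in groups:
        n = len(group)
        group_mean = sum(group) / float(n)
        sum_of_squares += sum((x - group_mean) ** 2 for x in group)
        dof += n - 1
      pooled_variance = sum_of_squares / dof
      return math.sqrt(pooled_variance * sum(1.0 / len(g) for g in groups))

    working_mean = [10.0, 10.2, 9.8]   # flattened values from good revisions
    broken_mean = [12.1, 11.9, 12.0]   # flattened values from bad revisions

    mean_good = sum(working_mean) / len(working_mean)
    mean_bad = sum(broken_mean) / len(broken_mean)
    regression_size = 100 * relative_change(mean_good, mean_bad)  # ~20%
    regression_std_err = math.fabs(
        pooled_standard_error([working_mean, broken_mean]) /
        max(0.0001, min(mean_good, mean_bad))) * 100.0
    print 'regression size: %.1f%%, std err: %.2f%%' % (regression_size,
                                                        regression_std_err)
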
+      culprit_revisions = []
+
+      cwd = os.getcwd()
+      self._bisect_perf_metrics.ChangeToDepotWorkingDirectory(
+          self.revision_data[last_broken_revision]['depot'])
+
+      if self.revision_data[last_broken_revision]['depot'] == 'cros':
+        # Want to get a list of all the commits and what depots they belong
+        # to so that we can grab info about each.
+        cmd = ['repo', 'forall', '-c',
+               'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
+                   last_broken_revision, first_working_revision + 1)]
+        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
+
+        changes = []
+        assert not return_code, ('An error occurred while running '
+                                 '"%s"' % ' '.join(cmd))
+        last_depot = None
+        cwd = os.getcwd()
+        for l in output.split('\n'):
+          if l:
+            # Output will be in form:
+            # /path_to_depot
+            # /path_to_other_depot
+            # <SHA1>
+            # /path_again
+            # <SHA1>
+            # etc.
+            if l[0] == '/':
+              last_depot = l
+            else:
+              contents = l.split(' ')
+              if len(contents) > 1:
+                changes.append([last_depot, contents[0]])
+        for c in changes:
+          os.chdir(c[0])
+          info = self._source_control.QueryRevisionInfo(c[1])
+          culprit_revisions.append((c[1], info, None))
+      else:
+        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
+          k, v = revision_data_sorted[i]
+          if k == first_working_revision:
+            break
+          self._bisect_perf_metrics.ChangeToDepotWorkingDirectory(v['depot'])
+          info = self._source_control.QueryRevisionInfo(k)
+          culprit_revisions.append((k, info, v['depot']))
+      os.chdir(cwd)
+
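The 'repo forall' output format described in the comment above parses with a small state machine: a line starting with '/' names the current depot (printed by pwd), and any other line is a '<SHA1> <subject>' entry from git log. A runnable sketch on a hypothetical output string:

    # Minimal sketch of the 'repo forall' output parsing above; the sample
    # output is invented (real output comes from the repo tool).
    sample_output = '\n'.join([
        '/src/third_party/depot_a',
        '1a2b3c4d Fix layout glitch',
        '/src/third_party/depot_b',
        '5e6f7a8b Update deps',
        '9c0d1e2f Tweak flag',
    ])

    changes = []
    last_depot = None
    for l in sample_output.split('\n'):
      if not l:
        continue
      if l[0] == '/':
        # A line starting with '/' is the depot path printed by 'pwd'.
        last_depot = l
      else:
        # Other lines are '<SHA1> <subject>' from 'git log --pretty=oneline'.
        contents = l.split(' ')
        if len(contents) > 1:
          changes.append([last_depot, contents[0]])

    print changes
    # [['/src/third_party/depot_a', '1a2b3c4d'],
    #  ['/src/third_party/depot_b', '5e6f7a8b'],
    #  ['/src/third_party/depot_b', '9c0d1e2f']]
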
+      # Check for any other possible regression ranges.
+      other_regressions = self._FindOtherRegressions(
+          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
+
+    return {
+        'first_working_revision': first_working_revision,
+        'last_broken_revision': last_broken_revision,
+        'culprit_revisions': culprit_revisions,
+        'other_regressions': other_regressions,
+        'regression_size': regression_size,
+        'regression_std_err': regression_std_err,
+        'confidence': confidence,
+        'revision_data': self.revision_data,
+        'revision_data_sorted': revision_data_sorted
+    }

 class BisectPerformanceMetrics(object):

@@ -2328,41 +2502,9 @@ class BisectPerformanceMetrics(object):
       metric: The performance metric to monitor.

     Returns:
-      A dict with 2 members, 'revision_data' and 'error'. On success,
-      'revision_data' will contain a dict mapping revision ids to
-      data about that revision. Each piece of revision data consists of a
-      dict with the following keys:
-
-        'passed': Represents whether the performance test was successful at
-            that revision. Possible values include: 1 (passed), 0 (failed),
-            '?' (skipped), 'F' (build failed).
-        'depot': The depot that this revision is from (i.e. WebKit)
-        'external': If the revision is a 'src' revision, 'external' contains
-            the revisions of each of the external libraries.
-        'sort': A sort value for sorting the dict in order of commits.
-
-        For example:
-        {
-          'error':None,
-          'revision_data':
-          {
-            'CL #1':
-            {
-              'passed': False,
-              'depot': 'chromium',
-              'external': None,
-              'sort': 0
-            }
-          }
-        }
-
-      If an error occurred, the 'error' field will contain the message and
-      'revision_data' will be empty.
+      A BisectResults object.
     """
-    results = {
-        'revision_data' : {},
-        'error' : None,
-    }
+    results = BisectResults(self, self.source_control)

     # Choose depot to bisect first
     target_depot = 'chromium'

@@ -2382,18 +2524,18 @@ class BisectPerformanceMetrics(object):
     os.chdir(cwd)

     if bad_revision is None:
-      results['error'] = 'Couldn\'t resolve [%s] to SHA1.' % bad_revision_in
+      results.error = 'Couldn\'t resolve [%s] to SHA1.' % bad_revision_in
       return results

     if good_revision is None:
-      results['error'] = 'Couldn\'t resolve [%s] to SHA1.' % good_revision_in
+      results.error = 'Couldn\'t resolve [%s] to SHA1.' % good_revision_in
       return results

     # Check that they didn't accidentally swap good and bad revisions.
     if not self.CheckIfRevisionsInProperOrder(
         target_depot, good_revision, bad_revision):
-      results['error'] = ('bad_revision < good_revision, did you swap these '
-                          'by mistake?')
+      results.error = ('bad_revision < good_revision, did you swap these '
+                       'by mistake?')
       return results

     bad_revision, good_revision = self.NudgeRevisionsIfDEPSChange(
         bad_revision, good_revision, good_revision_in)

@@ -2402,7 +2544,7 @@ class BisectPerformanceMetrics(object):

     cannot_bisect = self.CanPerformBisect(good_revision, bad_revision)
     if cannot_bisect:
-      results['error'] = cannot_bisect.get('error')
+      results.error = cannot_bisect.get('error')
       return results

     print 'Gathering revision range for bisection.'

@@ -2417,7 +2559,7 @@ class BisectPerformanceMetrics(object):
     # revision_data will store information about a revision such as the
     # depot it came from, the webkit/V8 revision at that time,
     # performance timing, build state, etc...
-    revision_data = results['revision_data']
+    revision_data = results.revision_data

     # revision_list is the list we're binary searching through at the moment.
     revision_list = []

@@ -2460,17 +2602,17 @@ class BisectPerformanceMetrics(object):
       bisect_utils.OutputAnnotationStepClosed()

       if bad_results[1]:
-        results['error'] = ('An error occurred while building and running '
+        results.error = ('An error occurred while building and running '
             'the \'bad\' reference value. The bisect cannot continue without '
             'a working \'bad\' revision to start from.\n\nError: %s' %
-            bad_results[0])
+        bad_results[0])
         return results

       if good_results[1]:
-        results['error'] = ('An error occurred while building and running '
+        results.error = ('An error occurred while building and running '
             'the \'good\' reference value. The bisect cannot continue without '
             'a working \'good\' revision to start from.\n\nError: %s' %
-            good_results[0])
+        good_results[0])
         return results

@@ -2535,9 +2677,9 @@ class BisectPerformanceMetrics(object):
           previous_revision)

       if not new_revision_list:
-        results['error'] = ('An error occurred attempting to retrieve '
-                            'revision range: [%s..%s]' %
-                            (earliest_revision, latest_revision))
+        results.error = ('An error occurred attempting to retrieve '
+                         'revision range: [%s..%s]' %
+                         (earliest_revision, latest_revision))
         return results

       _AddRevisionsIntoRevisionData(

@@ -2616,18 +2758,14 @@ class BisectPerformanceMetrics(object):
         bisect_utils.OutputAnnotationStepClosed()
     else:
       # Weren't able to sync and retrieve the revision range.
-      results['error'] = ('An error occurred attempting to retrieve revision '
-                          'range: [%s..%s]' % (good_revision, bad_revision))
+      results.error = ('An error occurred attempting to retrieve revision '
+                       'range: [%s..%s]' % (good_revision, bad_revision))

     return results

-  def _PrintPartialResults(self, results_dict):
-    revision_data = results_dict['revision_data']
-    revision_data_sorted = sorted(revision_data.iteritems(),
-                                  key = lambda x: x[1]['sort'])
-    results_dict = self._GetResultsDict(revision_data, revision_data_sorted)
-
-    self._PrintTestedCommitsTable(revision_data_sorted,
+  def _PrintPartialResults(self, results):
+    results_dict = results.GetResultsDict()
+    self._PrintTestedCommitsTable(results_dict['revision_data_sorted'],
                                   results_dict['first_working_revision'],
                                   results_dict['last_broken_revision'],
                                   100, final_step=False)

@@ -2800,125 +2938,6 @@ class BisectPerformanceMetrics(object):
           previous_data['depot'], previous_link)

-  def _GetResultsDict(self, revision_data, revision_data_sorted):
-    # Find range where it possibly broke.
-    first_working_revision = None
-    first_working_revision_index = -1
-    last_broken_revision = None
-    last_broken_revision_index = -1
-
-    culprit_revisions = []
-    other_regressions = []
-    regression_size = 0.0
-    regression_std_err = 0.0
-    confidence = 0.0
-
-    for i in xrange(len(revision_data_sorted)):
-      k, v = revision_data_sorted[i]
-      if v['passed'] == 1:
-        if not first_working_revision:
-          first_working_revision = k
-          first_working_revision_index = i
-
-      if not v['passed']:
-        last_broken_revision = k
-        last_broken_revision_index = i
-
-    if last_broken_revision != None and first_working_revision != None:
-      broken_means = []
-      for i in xrange(0, last_broken_revision_index + 1):
-        if revision_data_sorted[i][1]['value']:
-          broken_means.append(revision_data_sorted[i][1]['value']['values'])
-
-      working_means = []
-      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
-        if revision_data_sorted[i][1]['value']:
-          working_means.append(revision_data_sorted[i][1]['value']['values'])
-
-      # Flatten the lists to calculate mean of all values.
-      working_mean = sum(working_means, [])
-      broken_mean = sum(broken_means, [])
-
-      # Calculate the approximate size of the regression
-      mean_of_bad_runs = math_utils.Mean(broken_mean)
-      mean_of_good_runs = math_utils.Mean(working_mean)
-
-      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
-                                                        mean_of_bad_runs)
-      if math.isnan(regression_size):
-        regression_size = 'zero-to-nonzero'
-
-      regression_std_err = math.fabs(math_utils.PooledStandardError(
-          [working_mean, broken_mean]) /
-          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
-
-      # Give a "confidence" in the bisect. At the moment we use how distinct the
-      # values are before and after the last broken revision, and how noisy the
-      # overall graph is.
-      confidence = ConfidenceScore(working_means, broken_means)
-
-      culprit_revisions = []
-
-      cwd = os.getcwd()
-      self.ChangeToDepotWorkingDirectory(
-          revision_data[last_broken_revision]['depot'])
-
-      if revision_data[last_broken_revision]['depot'] == 'cros':
-        # Want to get a list of all the commits and what depots they belong
-        # to so that we can grab info about each.
-        cmd = ['repo', 'forall', '-c',
-               'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
-                   last_broken_revision, first_working_revision + 1)]
-        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
-
-        changes = []
-        assert not return_code, ('An error occurred while running '
-                                 '"%s"' % ' '.join(cmd))
-        last_depot = None
-        cwd = os.getcwd()
-        for l in output.split('\n'):
-          if l:
-            # Output will be in form:
-            # /path_to_depot
-            # /path_to_other_depot
-            # <SHA1>
-            # /path_again
-            # <SHA1>
-            # etc.
-            if l[0] == '/':
-              last_depot = l
-            else:
-              contents = l.split(' ')
-              if len(contents) > 1:
-                changes.append([last_depot, contents[0]])
-        for c in changes:
-          os.chdir(c[0])
-          info = self.source_control.QueryRevisionInfo(c[1])
-          culprit_revisions.append((c[1], info, None))
-      else:
-        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
-          k, v = revision_data_sorted[i]
-          if k == first_working_revision:
-            break
-          self.ChangeToDepotWorkingDirectory(v['depot'])
-          info = self.source_control.QueryRevisionInfo(k)
-          culprit_revisions.append((k, info, v['depot']))
-      os.chdir(cwd)
-
-      # Check for any other possible regression ranges.
-      other_regressions = _FindOtherRegressions(
-          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
-
-    return {
-        'first_working_revision': first_working_revision,
-        'last_broken_revision': last_broken_revision,
-        'culprit_revisions': culprit_revisions,
-        'other_regressions': other_regressions,
-        'regression_size': regression_size,
-        'regression_std_err': regression_std_err,
-        'confidence': confidence,
-    }
-
   def _CheckForWarnings(self, results_dict):
     if len(results_dict['culprit_revisions']) > 1:
       self.warnings.append('Due to build errors, regression range could '

@@ -2940,10 +2959,7 @@ class BisectPerformanceMetrics(object):
     Args:
      bisect_results: The results from a bisection test run.
     """
-    revision_data = bisect_results['revision_data']
-    revision_data_sorted = sorted(revision_data.iteritems(),
-                                  key = lambda x: x[1]['sort'])
-    results_dict = self._GetResultsDict(revision_data, revision_data_sorted)
+    results_dict = bisect_results.GetResultsDict()

     self._CheckForWarnings(results_dict)

@@ -2952,7 +2968,7 @@ class BisectPerformanceMetrics(object):

     print 'Full results of bisection:'
-    for current_id, current_data in revision_data_sorted:
+    for current_id, current_data in results_dict['revision_data_sorted']:
       build_status = current_data['passed']

       if type(build_status) is bool:

@@ -2980,12 +2996,12 @@ class BisectPerformanceMetrics(object):
         self._PrintRevisionInfo(cl, info, depot)
     if results_dict['other_regressions']:
       self._PrintOtherRegressions(results_dict['other_regressions'],
-                                  revision_data)
-    self._PrintTestedCommitsTable(revision_data_sorted,
+                                  results_dict['revision_data'])
+    self._PrintTestedCommitsTable(results_dict['revision_data_sorted'],
                                   results_dict['first_working_revision'],
                                   results_dict['last_broken_revision'],
                                   results_dict['confidence'])
-    _PrintStepTime(revision_data_sorted)
+    _PrintStepTime(results_dict['revision_data_sorted'])
     self._PrintReproSteps()
     _PrintThankYou()
     if self.opts.output_buildbot_annotations:

@@ -3396,8 +3412,8 @@ def main():
          opts.bad_revision,
          opts.good_revision,
          opts.metric)
-      if bisect_results['error']:
-        raise RuntimeError(bisect_results['error'])
+      if bisect_results.error:
+        raise RuntimeError(bisect_results.error)
       bisect_test.FormatAndPrintResults(bisect_results)
       return 0
     finally: