Index: tools/auto_bisect/bisect_perf_regression.py |
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py |
index ec3e118df267f5206f97389b10d752d1ddf47621..bb71f5b4e39c046e09a3e3337ad39e6b94e0d8e7 100755 |
--- a/tools/auto_bisect/bisect_perf_regression.py |
+++ b/tools/auto_bisect/bisect_perf_regression.py |
@@ -56,7 +56,7 @@ import math_utils |
import request_build |
import source_control as source_control_module |
import ttest |
-from telemetry.util import cloud_storage |
+from telemetry.util import cloud_storage # pylint: disable=F0401 |
# Below is the map of "depot" names to information about each depot. Each depot |
# is a repository, and in the process of bisecting, revision ranges in these |
@@ -868,44 +868,218 @@ def _PrintStepTime(revision_data_sorted): |
seconds=int(step_perf_time_avg)) |
-def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good): |
- """Compiles a list of other possible regressions from the revision data. |
+class BisectResults(object): |
+ """This class holds the results of the bisect.""" |
- Args: |
- revision_data_sorted: Sorted list of (revision, revision data) pairs. |
- bad_greater_than_good: Whether the result value at the "bad" revision is |
- numerically greater than the result value at the "good" revision. |
+ def __init__(self, bisect_perf_metrics, source_control): |
+ self._bisect_perf_metrics = bisect_perf_metrics |
qyearsley
2014/09/19 07:35:16
Initial thought: BisectPerformanceMetrics is a mas
Sergiy Byelozyorov
2014/09/19 13:36:22
Done.
|
+ self.revision_data = {} |
+ self.error = None |
+ self._source_control = source_control |
- Returns: |
- A list of [current_rev, previous_rev, confidence] for other places where |
- there may have been a regression. |
- """ |
- other_regressions = [] |
- previous_values = [] |
- previous_id = None |
- for current_id, current_data in revision_data_sorted: |
- current_values = current_data['value'] |
- if current_values: |
- current_values = current_values['values'] |
- if previous_values: |
- confidence = ConfidenceScore(previous_values, [current_values]) |
- mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
- mean_of_current_runs = math_utils.Mean(current_values) |
- |
- # Check that the potential regression is in the same direction as |
- # the overall regression. If the mean of the previous runs < the |
- # mean of the current runs, this local regression is in same |
- # direction. |
- prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
- is_same_direction = (prev_less_than_current if |
- bad_greater_than_good else not prev_less_than_current) |
- |
- # Only report potential regressions with high confidence. |
- if is_same_direction and confidence > 50: |
- other_regressions.append([current_id, previous_id, confidence]) |
- previous_values.append(current_values) |
- previous_id = current_id |
- return other_regressions |
+ @staticmethod |
+ def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good): |
+ """Compiles a list of other possible regressions from the revision data. |
+ |
+ Args: |
+ revision_data_sorted: Sorted list of (revision, revision data) pairs. |
+ bad_greater_than_good: Whether the result value at the "bad" revision is |
+ numerically greater than the result value at the "good" revision. |
+ |
+ Returns: |
+ A list of [current_rev, previous_rev, confidence] for other places where |
+ there may have been a regression. |
+ """ |
+ other_regressions = [] |
+ previous_values = [] |
+ previous_id = None |
+ for current_id, current_data in revision_data_sorted: |
+ current_values = current_data['value'] |
+ if current_values: |
+ current_values = current_values['values'] |
+ if previous_values: |
+ confidence = ConfidenceScore(previous_values, [current_values]) |
+ mean_of_prev_runs = math_utils.Mean(sum(previous_values, [])) |
+ mean_of_current_runs = math_utils.Mean(current_values) |
+ |
+ # Check that the potential regression is in the same direction as |
+ # the overall regression. If the mean of the previous runs < the |
+ # mean of the current runs, this local regression is in same |
+ # direction. |
+ prev_less_than_current = mean_of_prev_runs < mean_of_current_runs |
+ is_same_direction = (prev_less_than_current if |
+ bad_greater_than_good else not prev_less_than_current) |
+ |
+ # Only report potential regressions with high confidence. |
+ if is_same_direction and confidence > 50: |
+ other_regressions.append([current_id, previous_id, confidence]) |
+ previous_values.append(current_values) |
+ previous_id = current_id |
+ return other_regressions |
+ |
+ def GetResultsDict(self): |
+ """Returns a dictionary with the following fields |
qyearsley
2014/09/19 07:35:16
The first line of a docstring is generally a self-
Sergiy Byelozyorov
2014/09/19 13:36:22
Done.
|
+ |
+ 'first_working_revision': First good revision. |
+ 'last_broken_revision': Last bad revision. |
+ 'culprit_revisions': A list of revisions, which contain the bad change |
+ introducing the failure. |
+ 'other_regressions': A list of tuples representing other regressions, which |
+ may have occurred. |
+ 'regression_size': For performance bisects, this is a relative change of the |
+ mean metric value. For other bisects this field always |
+ contains 'zero-to-nonzero'. |
+ 'regression_std_err': For performance bisects, it is a pooled standard |
+ error for groups of good and bad runs. Not used for |
+ other bisects. |
+ 'confidence': For performance bisects, it is a confidence that the good and |
+ bad runs are distinct groups. Not used for non-performance |
+ bisects. |
qyearsley
2014/09/19 07:35:16
I think this formatting might look better if line
Sergiy Byelozyorov
2014/09/19 13:36:22
Done.
|
+ |
+ 'revision_data_sorted': list of (revision id, revision data) pairs, |
+ ordered by each revision's 'sort' value. Each piece of revision data |
+ consists of a dict with the following keys: |
+ |
+ 'passed': Represents whether the performance test was successful at |
+ that revision. Possible values include: 1 (passed), 0 (failed), |
+ '?' (skipped), 'F' (build failed). |
+ 'depot': The depot that this revision is from (i.e. WebKit) |
+ 'external': If the revision is a 'src' revision, 'external' contains |
+ the revisions of each of the external libraries. |
+ 'sort': A sort value for sorting the dict in order of commits. |
+ |
+ For example: |
+ { |
+ 'CL #1': |
+ { |
+ 'passed': False, |
+ 'depot': 'chromium', |
+ 'external': None, |
+ 'sort': 0 |
+ } |
+ } |
+ """ |
+ revision_data_sorted = sorted(self.revision_data.iteritems(), |
+ key = lambda x: x[1]['sort']) |
+ |
+ # Find range where it possibly broke. |
+ first_working_revision = None |
+ first_working_revision_index = -1 |
+ last_broken_revision = None |
+ last_broken_revision_index = -1 |
+ |
+ culprit_revisions = [] |
+ other_regressions = [] |
+ regression_size = 0.0 |
+ regression_std_err = 0.0 |
+ confidence = 0.0 |
+ |
+ for i in xrange(len(revision_data_sorted)): |
+ k, v = revision_data_sorted[i] |
+ if v['passed'] == 1: |
+ if not first_working_revision: |
+ first_working_revision = k |
+ first_working_revision_index = i |
+ |
+ if not v['passed']: |
+ last_broken_revision = k |
+ last_broken_revision_index = i |
+ |
+ if last_broken_revision != None and first_working_revision != None: |
+ broken_means = [] |
+ for i in xrange(0, last_broken_revision_index + 1): |
+ if revision_data_sorted[i][1]['value']: |
+ broken_means.append(revision_data_sorted[i][1]['value']['values']) |
+ |
+ working_means = [] |
+ for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
+ if revision_data_sorted[i][1]['value']: |
+ working_means.append(revision_data_sorted[i][1]['value']['values']) |
+ |
+ # Flatten the lists to calculate mean of all values. |
+ working_mean = sum(working_means, []) |
+ broken_mean = sum(broken_means, []) |
+ |
+ # Calculate the approximate size of the regression |
+ mean_of_bad_runs = math_utils.Mean(broken_mean) |
+ mean_of_good_runs = math_utils.Mean(working_mean) |
+ |
+ regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
+ mean_of_bad_runs) |
+ if math.isnan(regression_size): |
+ regression_size = 'zero-to-nonzero' |
+ |
+ regression_std_err = math.fabs(math_utils.PooledStandardError( |
+ [working_mean, broken_mean]) / |
+ max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
+ |
+ # Give a "confidence" in the bisect. At the moment we use how distinct the |
+ # values are before and after the last broken revision, and how noisy the |
+ # overall graph is. |
+ confidence = ConfidenceScore(working_means, broken_means) |
+ |
+ culprit_revisions = [] |
+ |
+ cwd = os.getcwd() |
+ self._bisect_perf_metrics.ChangeToDepotWorkingDirectory( |
+ self.revision_data[last_broken_revision]['depot']) |
+ |
+ if self.revision_data[last_broken_revision]['depot'] == 'cros': |
+ # Want to get a list of all the commits and what depots they belong |
+ # to so that we can grab info about each. |
+ cmd = ['repo', 'forall', '-c', |
+ 'pwd ; git log --pretty=oneline --before=%d --after=%d' % ( |
+ last_broken_revision, first_working_revision + 1)] |
+ output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd) |
+ |
+ changes = [] |
+ assert not return_code, ('An error occurred while running ' |
+ '"%s"' % ' '.join(cmd)) |
+ last_depot = None |
+ cwd = os.getcwd() |
+ for l in output.split('\n'): |
+ if l: |
+ # Output will be in form: |
+ # /path_to_depot |
+ # /path_to_other_depot |
+ # <SHA1> |
+ # /path_again |
+ # <SHA1> |
+ # etc. |
+ if l[0] == '/': |
+ last_depot = l |
+ else: |
+ contents = l.split(' ') |
+ if len(contents) > 1: |
+ changes.append([last_depot, contents[0]]) |
+ for c in changes: |
+ os.chdir(c[0]) |
+ info = self._source_control.QueryRevisionInfo(c[1]) |
+ culprit_revisions.append((c[1], info, None)) |
+ else: |
+ for i in xrange(last_broken_revision_index, len(revision_data_sorted)): |
+ k, v = revision_data_sorted[i] |
+ if k == first_working_revision: |
+ break |
+ self._bisect_perf_metrics.ChangeToDepotWorkingDirectory(v['depot']) |
+ info = self._source_control.QueryRevisionInfo(k) |
+ culprit_revisions.append((k, info, v['depot'])) |
+ os.chdir(cwd) |
+ |
+ # Check for any other possible regression ranges. |
+ other_regressions = self._FindOtherRegressions( |
+ revision_data_sorted, mean_of_bad_runs > mean_of_good_runs) |
+ |
+ return { |
+ 'first_working_revision': first_working_revision, |
+ 'last_broken_revision': last_broken_revision, |
+ 'culprit_revisions': culprit_revisions, |
+ 'other_regressions': other_regressions, |
+ 'regression_size': regression_size, |
+ 'regression_std_err': regression_std_err, |
+ 'confidence': confidence, |
+ 'revision_data_sorted': revision_data_sorted |
+ } |
class BisectPerformanceMetrics(object): |
@@ -2328,41 +2502,9 @@ class BisectPerformanceMetrics(object): |
metric: The performance metric to monitor. |
Returns: |
- A dict with 2 members, 'revision_data' and 'error'. On success, |
- 'revision_data' will contain a dict mapping revision ids to |
- data about that revision. Each piece of revision data consists of a |
- dict with the following keys: |
- |
- 'passed': Represents whether the performance test was successful at |
- that revision. Possible values include: 1 (passed), 0 (failed), |
- '?' (skipped), 'F' (build failed). |
- 'depot': The depot that this revision is from (i.e. WebKit) |
- 'external': If the revision is a 'src' revision, 'external' contains |
- the revisions of each of the external libraries. |
- 'sort': A sort value for sorting the dict in order of commits. |
- |
- For example: |
- { |
- 'error':None, |
- 'revision_data': |
- { |
- 'CL #1': |
- { |
- 'passed': False, |
- 'depot': 'chromium', |
- 'external': None, |
- 'sort': 0 |
- } |
- } |
- } |
- |
- If an error occurred, the 'error' field will contain the message and |
- 'revision_data' will be empty. |
+ A BisectResults object. |
""" |
- results = { |
- 'revision_data' : {}, |
- 'error' : None, |
- } |
+ results = BisectResults(self, self.source_control) |
# Choose depot to bisect first |
target_depot = 'chromium' |
@@ -2382,18 +2524,18 @@ class BisectPerformanceMetrics(object): |
os.chdir(cwd) |
if bad_revision is None: |
- results['error'] = 'Couldn\'t resolve [%s] to SHA1.' % bad_revision_in |
+ results.error = 'Couldn\'t resolve [%s] to SHA1.' % bad_revision_in |
return results |
if good_revision is None: |
- results['error'] = 'Couldn\'t resolve [%s] to SHA1.' % good_revision_in |
+ results.error = 'Couldn\'t resolve [%s] to SHA1.' % good_revision_in |
return results |
# Check that they didn't accidentally swap good and bad revisions. |
if not self.CheckIfRevisionsInProperOrder( |
target_depot, good_revision, bad_revision): |
- results['error'] = ('bad_revision < good_revision, did you swap these ' |
- 'by mistake?') |
+ results.error = ('bad_revision < good_revision, did you swap these ' |
+ 'by mistake?') |
return results |
bad_revision, good_revision = self.NudgeRevisionsIfDEPSChange( |
bad_revision, good_revision, good_revision_in) |
@@ -2402,7 +2544,7 @@ class BisectPerformanceMetrics(object): |
cannot_bisect = self.CanPerformBisect(good_revision, bad_revision) |
if cannot_bisect: |
- results['error'] = cannot_bisect.get('error') |
+ results.error = cannot_bisect.get('error') |
return results |
print 'Gathering revision range for bisection.' |
@@ -2417,7 +2559,7 @@ class BisectPerformanceMetrics(object): |
# revision_data will store information about a revision such as the |
# depot it came from, the webkit/V8 revision at that time, |
# performance timing, build state, etc... |
- revision_data = results['revision_data'] |
+ revision_data = results.revision_data |
# revision_list is the list we're binary searching through at the moment. |
revision_list = [] |
@@ -2460,17 +2602,17 @@ class BisectPerformanceMetrics(object): |
bisect_utils.OutputAnnotationStepClosed() |
if bad_results[1]: |
- results['error'] = ('An error occurred while building and running ' |
+ results.error = ('An error occurred while building and running ' |
'the \'bad\' reference value. The bisect cannot continue without ' |
'a working \'bad\' revision to start from.\n\nError: %s' % |
- bad_results[0]) |
+ bad_results[0]) |
return results |
if good_results[1]: |
- results['error'] = ('An error occurred while building and running ' |
+ results.error = ('An error occurred while building and running ' |
'the \'good\' reference value. The bisect cannot continue without ' |
'a working \'good\' revision to start from.\n\nError: %s' % |
- good_results[0]) |
+ good_results[0]) |
return results |
@@ -2535,9 +2677,9 @@ class BisectPerformanceMetrics(object): |
previous_revision) |
if not new_revision_list: |
- results['error'] = ('An error occurred attempting to retrieve ' |
- 'revision range: [%s..%s]' % |
- (earliest_revision, latest_revision)) |
+ results.error = ('An error occurred attempting to retrieve ' |
+ 'revision range: [%s..%s]' % |
+ (earliest_revision, latest_revision)) |
return results |
_AddRevisionsIntoRevisionData( |
@@ -2616,18 +2758,14 @@ class BisectPerformanceMetrics(object): |
bisect_utils.OutputAnnotationStepClosed() |
else: |
# Weren't able to sync and retrieve the revision range. |
- results['error'] = ('An error occurred attempting to retrieve revision ' |
- 'range: [%s..%s]' % (good_revision, bad_revision)) |
+ results.error = ('An error occurred attempting to retrieve revision ' |
+ 'range: [%s..%s]' % (good_revision, bad_revision)) |
return results |
- def _PrintPartialResults(self, results_dict): |
- revision_data = results_dict['revision_data'] |
- revision_data_sorted = sorted(revision_data.iteritems(), |
- key = lambda x: x[1]['sort']) |
- results_dict = self._GetResultsDict(revision_data, revision_data_sorted) |
- |
- self._PrintTestedCommitsTable(revision_data_sorted, |
+ def _PrintPartialResults(self, results): |
+ results_dict = results.GetResultsDict() |
+ self._PrintTestedCommitsTable(results_dict['revision_data_sorted'], |
results_dict['first_working_revision'], |
results_dict['last_broken_revision'], |
100, final_step=False) |
@@ -2800,125 +2938,6 @@ class BisectPerformanceMetrics(object): |
previous_data['depot'], previous_link) |
- def _GetResultsDict(self, revision_data, revision_data_sorted): |
- # Find range where it possibly broke. |
- first_working_revision = None |
- first_working_revision_index = -1 |
- last_broken_revision = None |
- last_broken_revision_index = -1 |
- |
- culprit_revisions = [] |
- other_regressions = [] |
- regression_size = 0.0 |
- regression_std_err = 0.0 |
- confidence = 0.0 |
- |
- for i in xrange(len(revision_data_sorted)): |
- k, v = revision_data_sorted[i] |
- if v['passed'] == 1: |
- if not first_working_revision: |
- first_working_revision = k |
- first_working_revision_index = i |
- |
- if not v['passed']: |
- last_broken_revision = k |
- last_broken_revision_index = i |
- |
- if last_broken_revision != None and first_working_revision != None: |
- broken_means = [] |
- for i in xrange(0, last_broken_revision_index + 1): |
- if revision_data_sorted[i][1]['value']: |
- broken_means.append(revision_data_sorted[i][1]['value']['values']) |
- |
- working_means = [] |
- for i in xrange(first_working_revision_index, len(revision_data_sorted)): |
- if revision_data_sorted[i][1]['value']: |
- working_means.append(revision_data_sorted[i][1]['value']['values']) |
- |
- # Flatten the lists to calculate mean of all values. |
- working_mean = sum(working_means, []) |
- broken_mean = sum(broken_means, []) |
- |
- # Calculate the approximate size of the regression |
- mean_of_bad_runs = math_utils.Mean(broken_mean) |
- mean_of_good_runs = math_utils.Mean(working_mean) |
- |
- regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs, |
- mean_of_bad_runs) |
- if math.isnan(regression_size): |
- regression_size = 'zero-to-nonzero' |
- |
- regression_std_err = math.fabs(math_utils.PooledStandardError( |
- [working_mean, broken_mean]) / |
- max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 |
- |
- # Give a "confidence" in the bisect. At the moment we use how distinct the |
- # values are before and after the last broken revision, and how noisy the |
- # overall graph is. |
- confidence = ConfidenceScore(working_means, broken_means) |
- |
- culprit_revisions = [] |
- |
- cwd = os.getcwd() |
- self.ChangeToDepotWorkingDirectory( |
- revision_data[last_broken_revision]['depot']) |
- |
- if revision_data[last_broken_revision]['depot'] == 'cros': |
- # Want to get a list of all the commits and what depots they belong |
- # to so that we can grab info about each. |
- cmd = ['repo', 'forall', '-c', |
- 'pwd ; git log --pretty=oneline --before=%d --after=%d' % ( |
- last_broken_revision, first_working_revision + 1)] |
- output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd) |
- |
- changes = [] |
- assert not return_code, ('An error occurred while running ' |
- '"%s"' % ' '.join(cmd)) |
- last_depot = None |
- cwd = os.getcwd() |
- for l in output.split('\n'): |
- if l: |
- # Output will be in form: |
- # /path_to_depot |
- # /path_to_other_depot |
- # <SHA1> |
- # /path_again |
- # <SHA1> |
- # etc. |
- if l[0] == '/': |
- last_depot = l |
- else: |
- contents = l.split(' ') |
- if len(contents) > 1: |
- changes.append([last_depot, contents[0]]) |
- for c in changes: |
- os.chdir(c[0]) |
- info = self.source_control.QueryRevisionInfo(c[1]) |
- culprit_revisions.append((c[1], info, None)) |
- else: |
- for i in xrange(last_broken_revision_index, len(revision_data_sorted)): |
- k, v = revision_data_sorted[i] |
- if k == first_working_revision: |
- break |
- self.ChangeToDepotWorkingDirectory(v['depot']) |
- info = self.source_control.QueryRevisionInfo(k) |
- culprit_revisions.append((k, info, v['depot'])) |
- os.chdir(cwd) |
- |
- # Check for any other possible regression ranges. |
- other_regressions = _FindOtherRegressions( |
- revision_data_sorted, mean_of_bad_runs > mean_of_good_runs) |
- |
- return { |
- 'first_working_revision': first_working_revision, |
- 'last_broken_revision': last_broken_revision, |
- 'culprit_revisions': culprit_revisions, |
- 'other_regressions': other_regressions, |
- 'regression_size': regression_size, |
- 'regression_std_err': regression_std_err, |
- 'confidence': confidence, |
- } |
- |
def _CheckForWarnings(self, results_dict): |
if len(results_dict['culprit_revisions']) > 1: |
self.warnings.append('Due to build errors, regression range could ' |
@@ -2940,10 +2959,7 @@ class BisectPerformanceMetrics(object): |
Args: |
bisect_results: The results from a bisection test run. |
""" |
- revision_data = bisect_results['revision_data'] |
- revision_data_sorted = sorted(revision_data.iteritems(), |
- key = lambda x: x[1]['sort']) |
- results_dict = self._GetResultsDict(revision_data, revision_data_sorted) |
+ results_dict = bisect_results.GetResultsDict() |
self._CheckForWarnings(results_dict) |
@@ -2952,7 +2968,7 @@ class BisectPerformanceMetrics(object): |
print 'Full results of bisection:' |
- for current_id, current_data in revision_data_sorted: |
+ for current_id, current_data in results_dict['revision_data_sorted']: |
build_status = current_data['passed'] |
if type(build_status) is bool: |
@@ -2980,12 +2996,12 @@ class BisectPerformanceMetrics(object): |
self._PrintRevisionInfo(cl, info, depot) |
if results_dict['other_regressions']: |
self._PrintOtherRegressions(results_dict['other_regressions'], |
- revision_data) |
- self._PrintTestedCommitsTable(revision_data_sorted, |
+ results_dict['revision_data']) |
+ self._PrintTestedCommitsTable(results_dict['revision_data_sorted'], |
results_dict['first_working_revision'], |
results_dict['last_broken_revision'], |
results_dict['confidence']) |
- _PrintStepTime(revision_data_sorted) |
+ _PrintStepTime(results_dict['revision_data_sorted']) |
self._PrintReproSteps() |
_PrintThankYou() |
if self.opts.output_buildbot_annotations: |
@@ -3396,8 +3412,8 @@ def main(): |
opts.bad_revision, |
opts.good_revision, |
opts.metric) |
- if bisect_results['error']: |
- raise RuntimeError(bisect_results['error']) |
+ if bisect_results.error: |
+ raise RuntimeError(bisect_results.error) |
bisect_test.FormatAndPrintResults(bisect_results) |
return 0 |
finally: |