Index: Tools/AutoSheriff/analysis.py |
diff --git a/Tools/AutoSheriff/analysis.py b/Tools/AutoSheriff/analysis.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..6d8fa930895c7b44417c9142f22aba5d0e036877 |
--- /dev/null |
+++ b/Tools/AutoSheriff/analysis.py |
@@ -0,0 +1,175 @@ |
+# Copyright 2014 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+import json |
+import collections |
+import operator |
+ |
+ |
+# Git ready, but only implemented for SVN atm. |
ojan
2014/07/22 02:01:23
I think we should only operate on monotonically increasing revision ids.
[comment truncated in extraction]
def ids_after_first_including_second(first, second):
  """Return the revision ids in the range (first, second], as a list.

  Returns [] when either endpoint is missing or when the ids are not
  numeric (e.g. git hashes) -- only SVN-style revisions are supported.
  """
  if not first or not second:
    return []
  try:
    # list() keeps both return paths the same type (and works identically
    # on Python 2 and 3, where range() is lazy).
    return list(range(int(first) + 1, int(second) + 1))
  except ValueError:
    # int() failed; likely passed a git hash.
    return []
+ |
+ |
+# Git ready, but only implemented for SVN atm. |
def is_ancestor_of(older, younger):
  """True when |older| precedes |younger| (numeric SVN revisions only)."""
  return int(younger) > int(older)
+ |
+ |
def is_decendant_of(younger, older):
  """True when |younger| comes after |older| (numeric SVN revisions only).

  Note: 'decendant' is a historical misspelling kept for callers.
  """
  # Equivalent to is_ancestor_of(older, younger), inlined here.
  return int(older) < int(younger)
+ |
+ |
def commit_compare(one, two):
  """cmp-style comparator ordering newer revisions first."""
  first, second = int(one), int(two)
  if first > second:
    return -1
  if first < second:
    return 1
  # This is technically not right, since commits can be non-comparable.
  return 0
+ |
+ |
def flatten_to_commit_list(passing, failing):
  """Flatten two name->revision dicts into a list of 'name:commit' strings."""
  if not passing or not failing:
    return []
  flattened = []
  for name, last_pass in passing.items():
    for commit in ids_after_first_including_second(last_pass, failing[name]):
      flattened.append('%s:%s' % (name, commit))
  return flattened
+ |
+ |
+# FIXME: Perhaps this should be done by the feeder? |
def assign_keys(alerts):
  """Tag each alert dict in place with a unique 'key' and return the list.

  We could come up with something more sophisticated if necessary.
  """
  for index, alert in enumerate(alerts):
    # 'f' prefix just so the key doesn't look like a number.
    alert['key'] = 'f%s' % index
  return alerts
+ |
+ |
+def _make_merge_dicts(reducer): |
+ def merge_dicts(one, two): |
+ if not one or not two: |
+ return None |
+ reduction = {} |
+ for key in set(one.keys() + two.keys()): |
+ one_value = one.get(key) |
+ two_value = two.get(key) |
+ reduction[key] = reducer(one_value, two_value) |
+ return reduction |
+ return merge_dicts |
+ |
+ |
def merge_regression_ranges(alerts):
  """Merge the regression ranges of |alerts| (must be non-empty).

  Returns (last_passing, first_failing): the newest merged passing
  revisions and the oldest merged failing revisions across all alerts.
  """
  def make_compare(compare_func):
    def compare(one, two):
      if one and two and compare_func(one, two):
        return one
      # Treat None as 'infinite' so always return the more limiting.
      # FIXME: 'passing' may wish to prefer None over a value?
      return two or one
    return compare

  def merge_all(dicts, reducer):
    # Explicit left fold over list comprehensions; unlike bare map/reduce
    # this behaves identically on Python 2 and 3.
    merge = _make_merge_dicts(reducer)
    merged = dicts[0]
    for entry in dicts[1:]:
      merged = merge(merged, entry)
    return merged

  # These don't handle the case where commits can't be compared (git branches).
  younger_commit = make_compare(is_decendant_of)
  last_passing = merge_all(
      [alert['passing_revisions'] for alert in alerts], younger_commit)

  older_commit = make_compare(is_ancestor_of)
  first_failing = merge_all(
      [alert['failing_revisions'] for alert in alerts], older_commit)

  return last_passing, first_failing
+ |
+ |
def reason_key_for_alert(alert):
  """Build a key used to group alerts that likely share a root cause."""
  # FIXME: May need something smarter for reason_key.
  reason = alert['reason']
  # If we don't understand the alert, just make it builder-unique.
  suffix = reason if reason else alert['builder_name']
  return '%s:%s' % (alert['step_name'], suffix)
+ |
+ |
def group_by_reason(alerts):
  """Group |alerts| by reason key, merging each group's regression range."""
  by_reason = collections.defaultdict(list)
  for alert in alerts:
    by_reason[reason_key_for_alert(alert)].append(alert)

  reason_groups = []
  # Use a distinct loop variable; the original shadowed the |alerts| param.
  for reason_key, group_alerts in by_reason.items():
    last_passing, first_failing = merge_regression_ranges(group_alerts)
    blame_list = flatten_to_commit_list(last_passing, first_failing)
    # FIXME: blame_list isn't filtered yet, but should be.
    reason_groups.append({
        'sort_key': reason_key,
        'merged_last_passing': last_passing,
        'merged_first_failing': first_failing,
        'likely_revisions': blame_list,
        # A real list (not a lazy map object) so this stays
        # JSON-serializable on Python 3 too.
        'failure_keys': [a['key'] for a in group_alerts],
    })
  return reason_groups
+ |
# http://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
# TODO: rename to snake_case (per review) once callers are updated.
def longestSubstringFinder(string1, string2):
  """Return the longest common substring of |string1| and |string2|.

  Naive O(len1 * len2) scan; fine for the short sort_key strings it's
  used on.
  """
  answer = ""
  len1, len2 = len(string1), len(string2)
  for i in range(len1):
    match = ""
    for j in range(len2):
      if i + j < len1 and string1[i + j] == string2[j]:
        match += string2[j]
      else:
        if len(match) > len(answer):
          answer = match
        match = ""
    # Bug fix: a match running to the end of string2 was previously
    # discarded without ever being compared against the best answer
    # (e.g. identical inputs returned "").
    if len(match) > len(answer):
      answer = match
  return answer
+ |
+ |
def range_key_for_group(group):
  """Build a key so groups sharing the same regression range can merge."""
  last_passing = group['merged_last_passing']
  first_failing = group['merged_first_failing']
  if not last_passing:
    # Even regressions where we don't know when they started can be
    # merged by our earliest known failure.
    parts = ['<=%s:%s' % item for item in first_failing.items()]
    range_key = ' '.join(parts)
  else:
    range_key = ' '.join(flatten_to_commit_list(last_passing, first_failing))
  # The sort_key prefix is a heuristic to avoid merging failures like
  # gclient revert + webkit_tests which just happened to pull exact
  # matching revisions when failing.
  # Review note (ojan, 2014/07/22): unsure whether grouping those
  # together would actually be wrong.
  return group['sort_key'][:3] + range_key
+ |
+ |
def merge_by_range(reason_groups):
  """Collapse reason groups that share an identical regression range.

  Merged entries get the longest common substring of the sort_keys and
  the sorted union of the failure_keys; results are ordered by sort_key.
  """
  if not reason_groups:
    return []
  # Removed: unused 'expected_keys' local (dead code).
  by_range = {}
  for group in reason_groups:
    range_key = range_key_for_group(group)
    existing = by_range.get(range_key)
    if not existing:
      # Shallow copy of group so merging doesn't mutate the caller's dict.
      by_range[range_key] = dict(group)
      continue

    # FIXME: It's possible we don't want to merge two keys with nothing
    # in common, e.g. bot_update and an unrelated step.
    # We only care about these two keys; the rest should be the same
    # between all groups. I guess we could assert that...
    by_range[range_key].update({
        'sort_key': longestSubstringFinder(existing['sort_key'],
                                           group['sort_key']),
        'failure_keys': sorted(set(existing['failure_keys'] +
                                   group['failure_keys'])),
    })

  return sorted(by_range.values(), key=operator.itemgetter('sort_key'))