Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2560723005: Implementing a new LogLinearModel-based CL classifier (Closed)
Patch Set: fixing typo Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 import logging
6 import math
7
8 from common import chrome_dependency_fetcher
9 from crash import changelist_classifier
10 from crash.loglinear.changelist_features import min_distance
11 from crash.loglinear.changelist_features import top_frame_index
12 from crash.loglinear.model import ToFeatureFunction
13 from crash.loglinear.model import UnnormalizedLogLinearModel
14 from crash.stacktrace import CallStack
15 from crash.stacktrace import Stacktrace
16
17
class LogLinearChangelistClassifier(object):
  """A ``LogLinearModel``-based implementation of CL classification.

  Instances are callable: ``classifier(report)`` returns the suspected
  changelists for a crash report, best suspect first.
  """

  def __init__(self, repository, weights, top_n_frames=7, top_n_results=3):
    """Builds the underlying log-linear model from the given weights.

    Args:
      repository (Repository): the Git repository for getting CLs to classify.
      weights (dict of float): the weights for the features. The keys of
        the dictionary are the names of the feature that weight is
        for. We take this argument as a dict rather than as a list so that
        callers needn't worry about what order to provide the weights in.
        Must contain the keys ``'TopFrameIndex'`` and ``'MinDistance'``.
      top_n_frames (int): how many frames of each callstack to look at.
      top_n_results (int): maximum number of results to return.

    Raises:
      ``KeyError`` if ``weights`` lacks one of the required feature names.
    """
    self._repository = repository
    self._top_n_frames = top_n_frames
    self._top_n_results = top_n_results

    feature_function = ToFeatureFunction([
        top_frame_index.TopFrameIndexFeature(top_n_frames),
        min_distance.MinDistanceFeature(),
    ])

    # The order of this list must line up with the order of the features
    # passed to ``ToFeatureFunction`` above.
    weight_list = [
        weights['TopFrameIndex'],
        weights['MinDistance'],
    ]

    self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

    # TODO(crbug.com/674262): remove the need for storing these weights.
    self._weights = weights

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def _LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``MatchResult``, then that result has zero probability of being
    blamed. We want to filter these results out, to clean up the output
    of classification; so this method encapsulates the logic of that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is negative infinity; else ``False``. In particular
      positive infinity, NaN and very large finite negatives are not
      treated as log(0).
    """
    return x < 0 and math.isinf(x)

  def _SingleFeatureScore(self, feature_value):
    """Returns the score (aka weighted value) of a ``FeatureValue``.

    This method is a hack for filtering the JSON output ``__call__``
    returns. If we really really need this, then we should probably move
    it to the classes defining loglinear models.

    Args:
      feature_value (FeatureValue): the feature value to check.

    Returns:
      The feature's value multiplied by the weight stored for its name;
      features without a stored weight are given a weight of zero.
    """
    return feature_value.value * self._weights.get(feature_value.name, 0.)

  def __call__(self, report):
    """Finds changelists suspected of being responsible for the crash report.

    Args:
      report (CrashReport): the report to be analyzed.

    Returns:
      List of Results, sorted by probability from highest to lowest, and
      containing at most ``top_n_results`` entries. The list is empty when
      the report has no regression range or no match results are found.
    """
    if not report.regression_range:
      logging.warning('ChangelistClassifier.__call__: Missing regression range '
                      'for report: %s', str(report))
      return []
    last_good_version, first_bad_version = report.regression_range
    logging.info('ChangelistClassifier.__call__: Regression range %s:%s',
                 last_good_version, first_bad_version)

    # Restrict analysis to just the top n frames in each callstack.
    stacktrace = Stacktrace([
        stack.SliceFrames(None, self._top_n_frames)
        for stack in report.stacktrace])

    # We are only interested in the deps in crash stack (the callstack that
    # caused the crash).
    # TODO(wrengr): we may want to receive the crash deps as an argument,
    # so that when this method is called via Findit.FindCulprit, we avoid
    # doing redundant work creating it.
    stack_deps = changelist_classifier.GetDepsInCrashStack(
        report.stacktrace.crash_stack,
        chrome_dependency_fetcher.ChromeDependencyFetcher(
            self._repository).GetDependency(report.crashed_version,
                                            report.platform))

    # Get dep and file to changelogs, stack_info and blame dicts.
    dep_rolls = chrome_dependency_fetcher.ChromeDependencyFetcher(
        self._repository).GetDependencyRollsDict(
            last_good_version, first_bad_version, report.platform)

    # Regression of a dep added/deleted (old_revision/new_revision is None) can
    # not be known for sure and this case rarely happens, so just filter them
    # out.
    regression_deps_rolls = {}
    for dep_path, dep_roll in dep_rolls.iteritems():
      if not dep_roll.old_revision or not dep_roll.new_revision:
        logging.info('Skip %s dependency %s',
                     'added' if dep_roll.new_revision else 'deleted', dep_path)
        continue
      regression_deps_rolls[dep_path] = dep_roll

    dep_to_file_to_changelogs, ignore_cls = (
        changelist_classifier.GetChangeLogsForFilesGroupedByDeps(
            regression_deps_rolls, stack_deps, self._repository))
    dep_to_file_to_stack_infos = (
        changelist_classifier.GetStackInfosForFilesGroupedByDeps(
            stacktrace, stack_deps))

    # Get the possible results.
    results = changelist_classifier.FindMatchResults(
        dep_to_file_to_changelogs,
        dep_to_file_to_stack_infos,
        stack_deps,
        self._repository,
        ignore_cls)
    if results is None:
      return []

    return self._RankResults(report, results)

  def _RankResults(self, report, results):
    """Scores candidate results and keeps the most probable ones.

    Each surviving result is annotated in place with ``confidence``,
    ``reasons`` and ``changed_files``.

    Args:
      report (CrashReport): the report the results were found for.
      results (iterable of Result): the candidate results to score.

    Returns:
      At most ``top_n_results`` results, ordered from the highest score to
      the lowest. Results whose score is log(0) are dropped.
    """
    features_given_report = self._model.Features(report)
    score_given_report = self._model.Score(report)
    scored_results = []
    for result in results:
      score = score_given_report(result)
      if self._LogZeroish(score):
        logging.debug('Discarding result because it has zero probability: %s',
                      str(result.ToDict()))
        continue

      result.confidence = score
      features = features_given_report(result)
      result.reasons = self.FormatReasons(features)
      result.changed_files = [
          changed_file.ToDict()
          for changed_file in self.AggregateChangedFiles(features)]
      scored_results.append(result)

    # Highest score first, so that the truncation below keeps the best
    # suspects rather than the worst ones. The sort is stable, so results
    # with equal scores keep their original relative order.
    scored_results.sort(key=lambda result: result.confidence, reverse=True)
    return scored_results[:self._top_n_results]

  def FormatReasons(self, features):
    """Collect and format a list of all ``FeatureValue.reason`` strings.

    Args:
      features (list of FeatureValue): the values whose ``reason``
        strings should be collected.

    Returns:
      A list of ``(str, float, str)`` triples; where the first string
      is the feature name, the float is some numeric representation
      of how much influence this feature exerts on the ``MatchResult``
      being blamed, and the final string is the ``FeatureValue.reason``.
      The list is sorted by feature name, just to ensure that it comes
      out in some canonical order. Features whose score is log(0) are
      left out.

      At present, the float is the log-domain score of the feature
      value. However, this isn't the best thing for UX reasons. In the
      future it might be replaced by the normal-domain score, or by
      the probability.
    """
    formatted_reasons = []
    for feature in features:
      feature_score = self._SingleFeatureScore(feature)
      if self._LogZeroish(feature_score): # pragma: no cover
        logging.debug('Discarding reasons from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      formatted_reasons.append((feature.name, feature_score, feature.reason))

    return sorted(formatted_reasons,
                  key=lambda formatted_reason: formatted_reason[0])

  def AggregateChangedFiles(self, features):
    """Merge multiple ``FeatureValue.changed_files`` lists into one.

    Note that the first ``ChangedFile`` seen for each file name is reused
    as the accumulator, and its ``reasons`` list is extended in place with
    the reasons of later entries for the same file.

    Args:
      features (list of FeatureValue): the values whose ``changed_files``
        lists should be aggregated.

    Returns:
      A list of ``ChangedFile`` objects sorted by file name. The sorting
      is not essential, but is provided to ease testing by ensuring the
      output is in some canonical order. Files coming from features whose
      score is log(0) are left out.

    Raises:
      ``ValueError`` if any file name is given inconsistent ``blame_url``s.
    """
    all_changed_files = {}
    for feature in features:
      if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover
        logging.debug('Discarding changed files from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      for changed_file in feature.changed_files or []:
        accumulated_changed_file = all_changed_files.get(changed_file.name)
        if accumulated_changed_file is None:
          all_changed_files[changed_file.name] = changed_file
          continue

        # Raise explicitly rather than ``assert``: an assert would surface
        # as ``AssertionError`` and is skipped entirely under ``python -O``.
        if accumulated_changed_file.blame_url != changed_file.blame_url:
          raise ValueError(
              'Blame URLs do not match: %s != %s'
              % (accumulated_changed_file.blame_url, changed_file.blame_url))
        accumulated_changed_file.reasons.extend(changed_file.reasons or [])

    return sorted(all_changed_files.values(),
                  key=lambda changed_file: changed_file.name)
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698