Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Side by Side Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)
Patch Set: Address comment. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2016 The Chromium Authors. All rights reserved. 1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 from collections import defaultdict 5 from collections import defaultdict
6 import logging 6 import logging
7 import math 7 import math
8 8
9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher
10 from crash import changelist_classifier 10 from crash import changelist_classifier
11 from crash.crash_report_with_dependencies import CrashReportWithDependencies 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies
12 from crash.loglinear.changelist_features import min_distance 12 from crash.loglinear.changelist_features import min_distance
13 from crash.loglinear.changelist_features import top_frame_index 13 from crash.loglinear.changelist_features import top_frame_index
14 from crash.loglinear.model import ToFeatureFunction 14 from crash.loglinear.feature import FeatureFunction
15 from crash.loglinear.model import UnnormalizedLogLinearModel 15 from crash.loglinear.model import UnnormalizedLogLinearModel
16 from crash.stacktrace import CallStack 16 from crash.stacktrace import CallStack
17 from crash.stacktrace import Stacktrace 17 from crash.stacktrace import Stacktrace
18 from crash.suspect import StackInfo 18 from crash.suspect import StackInfo
19 19
20 20
21 class LogLinearChangelistClassifier(object): 21 class LogLinearChangelistClassifier(object):
22 """A ``LogLinearModel``-based implementation of CL classification.""" 22 """A ``LogLinearModel``-based implementation of CL classification."""
23 23
24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3):
(...skipping 10 matching lines...) Expand all
35 for. We take this argument as a dict rather than as a list so that 35 for. We take this argument as a dict rather than as a list so that
36 callers needn't worry about what order to provide the weights in. 36 callers needn't worry about what order to provide the weights in.
37 top_n_frames (int): how many frames of each callstack to look at. 37 top_n_frames (int): how many frames of each callstack to look at.
38 top_n_suspects (int): maximum number of suspects to return. 38 top_n_suspects (int): maximum number of suspects to return.
39 """ 39 """
40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository)
41 self._get_repository = get_repository 41 self._get_repository = get_repository
42 self._top_n_frames = top_n_frames 42 self._top_n_frames = top_n_frames
43 self._top_n_suspects = top_n_suspects 43 self._top_n_suspects = top_n_suspects
44 44
45 feature_function = ToFeatureFunction([ 45 feature_function = FeatureFunction([
46 top_frame_index.TopFrameIndexFeature(top_n_frames), 46 top_frame_index.TopFrameIndexFeature(top_n_frames),
47 min_distance.MinDistanceFeature(), 47 min_distance.MinDistanceFeature(),
48 ]) 48 ])
49 49
50 weight_list = [ 50 self._model = UnnormalizedLogLinearModel(feature_function, weights)
51 weights['TopFrameIndex'],
52 weights['MinDistance'],
53 ]
54
55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list)
56
57 # TODO(crbug.com/674262): remove the need for storing these weights.
58 self._weights = weights
59
60 # TODO(crbug.com/673964): something better for detecting "close to log(0)".
61 def _LogZeroish(self, x):
62 """Determine whether a float is close enough to log(0).
63
64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given
65 ``Suspect``, then that suspect has zero probability of being the
66 culprit. We want to filter these suspects out, to clean up the
67 output of classification; so this method encapsulates the logic of
68 that check.
69
70 Args:
71 x (float): the float to check
72
73 Returns:
74 ``True`` if ``x`` is close enough to log(0); else ``False``.
75 """
76 return x < 0 and math.isinf(x)
77
78 def _SingleFeatureScore(self, feature_value):
79 """Returns the score (aka weighted value) of a ``FeatureValue``.
80
81 This function assumes the report's stacktrace has already had any necessary
82 preprocessing (like filtering or truncating) applied.
83
84 Args:
85 feature_value (FeatureValue): the feature value to check.
86
87 Returns:
88 The score of the feature value.
89 """
90 return feature_value.value * self._weights.get(feature_value.name, 0.)
91 51
92 def __call__(self, report): 52 def __call__(self, report):
93 """Finds changelists suspected of being responsible for the crash report. 53 """Finds changelists suspected of being responsible for the crash report.
94 54
95 Args: 55 Args:
96 report (CrashReport): the report to be analyzed. 56 report (CrashReport): the report to be analyzed.
97 57
98 Returns: 58 Returns:
99 List of ``Suspect``s, sorted by probability from highest to lowest. 59 List of ``Suspect``s, sorted by probability from highest to lowest.
100 """ 60 """
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
144 return changelist_classifier.FindSuspects( 104 return changelist_classifier.FindSuspects(
145 dep_to_file_to_changelogs, 105 dep_to_file_to_changelogs,
146 dep_to_file_to_stack_infos, 106 dep_to_file_to_stack_infos,
147 report.dependencies, 107 report.dependencies,
148 self._get_repository, 108 self._get_repository,
149 ignore_cls) 109 ignore_cls)
150 110
def RankSuspects(self, report, suspects):
  """Returns a lineup of the suspects in order of likelihood.

  Suspects with a discardable score or lower ranking than top_n_suspects
  will be filtered.

  Args:
    report (CrashReportWithDependencies): the crash we seek to explain.
    suspects (iterable of Suspect): the CLs to consider blaming for the crash.

  Returns:
    A list of suspects in order according to their likelihood, most
    likely first. This list contains elements of the ``suspects``
    list, where we mutate some of the fields to store information
    about why that suspect is being blamed (e.g., the ``confidence``,
    ``reasons``, and ``changed_files`` fields are updated). In
    addition to sorting the suspects, we also filter out those which
    are exceedingly unlikely or don't make the ``top_n_suspects`` cut.
  """
  # Score the suspects and organize them for outputting/returning.
  features_given_report = self._model.Features(report)
  score_given_report = self._model.Score(report)

  scored_suspects = []
  for suspect in suspects:
    score = score_given_report(suspect)
    if self._model.LogZeroish(score):
      # A log(0)-ish score means zero probability; drop the suspect.
      logging.debug('Discarding suspect because it has zero probability: %s'
          % str(suspect.ToDict()))
      continue

    suspect.confidence = score
    features = features_given_report(suspect)
    suspect.reasons = self._model.FormatReasons(features.itervalues())
    suspect.changed_files = [
        changed_file.ToDict() for changed_file in
        self._model.AggregateChangedFiles(features.itervalues())]
    scored_suspects.append(suspect)

  # BUG FIX: sort descending so the returned list is most-likely-first,
  # as documented; a plain ascending sort followed by the truncation
  # below would keep the *least* likely suspects instead.
  scored_suspects.sort(key=lambda suspect: suspect.confidence, reverse=True)
  return scored_suspects[:self._top_n_suspects]
189
def FormatReasons(self, features):
  """Gathers and formats the ``reason`` strings of the given features.

  Features whose weighted score is effectively log(0) are skipped,
  since they contribute no probability mass to the suspect.

  Args:
    features (list of FeatureValue): the values whose ``reason``
      strings should be collected.

  Returns:
    A list of ``(str, float, str)`` triples; where the first string is
    the feature name, the float is some numeric representation of how
    much influence this feature exerts on the ``Suspect`` being blamed,
    and the final string is the ``FeatureValue.reason``. The list is
    sorted by feature name, just to ensure that it comes out in some
    canonical order.

    At present, the float is the log-domain score of the feature
    value. However, this isn't the best thing for UX reasons. In the
    future it might be replaced by the normal-domain score, or by
    the probability.
  """
  triples = []
  for feature_value in features:
    score = self._SingleFeatureScore(feature_value)
    if not self._LogZeroish(score):
      triples.append((feature_value.name, score, feature_value.reason))
    else:  # pragma: no cover
      logging.debug('Discarding reasons from feature %s'
                    ' because it has zero probability' % feature_value.name)

  # Sort by feature name for a canonical output order.
  triples.sort(key=lambda triple: triple[0])
  return triples
222
def AggregateChangedFiles(self, features):
  """Merge multiple ``FeatureValue.changed_files`` lists into one.

  A file reported by more than one feature is collapsed into a single
  ``ChangedFile`` whose ``reasons`` list accumulates the reasons from
  every occurrence. Features scoring log(0) are ignored entirely.

  Args:
    features (list of FeatureValue): the values whose ``changed_files``
      lists should be aggregated.

  Returns:
    A list of ``ChangedFile`` objects sorted by file name. The sorting
    is not essential, but is provided to ease testing by ensuring the
    output is in some canonical order.

  Raises:
    ``ValueError`` if any file name is given inconsistent ``blame_url``s.
  """
  merged = {}
  for feature_value in features:
    if self._LogZeroish(
        self._SingleFeatureScore(feature_value)):  # pragma: no cover
      logging.debug('Discarding changed files from feature %s'
                    ' because it has zero probability' % feature_value.name)
      continue

    for new_file in feature_value.changed_files or []:
      seen_file = merged.get(new_file.name)
      if seen_file is None:
        # First time we see this file name; keep the object as-is.
        merged[new_file.name] = new_file
      else:
        if seen_file.blame_url != new_file.blame_url:  # pragma: no cover
          raise ValueError('Blame URLs do not match: %s != %s'
              % (seen_file.blame_url, new_file.blame_url))
        seen_file.reasons.extend(new_file.reasons or [])

  return sorted(merged.values(), key=lambda changed_file: changed_file.name)
OLDNEW
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/feature.py » ('j') | appengine/findit/crash/loglinear/feature.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698