| OLD | NEW |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import defaultdict | 5 from collections import defaultdict |
| 6 import logging | 6 import logging |
| 7 import math | 7 import math |
| 8 | 8 |
| 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher | 9 from common.chrome_dependency_fetcher import ChromeDependencyFetcher |
| 10 from crash import changelist_classifier | 10 from crash import changelist_classifier |
| 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies | 11 from crash.crash_report_with_dependencies import CrashReportWithDependencies |
| 12 from crash.loglinear.changelist_features import min_distance | 12 from crash.loglinear.changelist_features import min_distance |
| 13 from crash.loglinear.changelist_features import top_frame_index | 13 from crash.loglinear.changelist_features import top_frame_index |
| 14 from crash.loglinear.model import ToFeatureFunction | 14 from crash.loglinear.feature import FeatureFunction |
| 15 from crash.loglinear.model import UnnormalizedLogLinearModel | 15 from crash.loglinear.model import UnnormalizedLogLinearModel |
| 16 from crash.stacktrace import CallStack | 16 from crash.stacktrace import CallStack |
| 17 from crash.stacktrace import Stacktrace | 17 from crash.stacktrace import Stacktrace |
| 18 from crash.suspect import StackInfo | 18 from crash.suspect import StackInfo |
| 19 | 19 |
| 20 | 20 |
| 21 class LogLinearChangelistClassifier(object): | 21 class LogLinearChangelistClassifier(object): |
| 22 """A ``LogLinearModel``-based implementation of CL classification.""" | 22 """A ``LogLinearModel``-based implementation of CL classification.""" |
| 23 | 23 |
| 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): | 24 def __init__(self, get_repository, weights, top_n_frames=7, top_n_suspects=3): |
| (...skipping 10 matching lines...) Expand all Loading... |
| 35 for. We take this argument as a dict rather than as a list so that | 35 for. We take this argument as a dict rather than as a list so that |
| 36 callers needn't worry about what order to provide the weights in. | 36 callers needn't worry about what order to provide the weights in. |
| 37 top_n_frames (int): how many frames of each callstack to look at. | 37 top_n_frames (int): how many frames of each callstack to look at. |
| 38 top_n_suspects (int): maximum number of suspects to return. | 38 top_n_suspects (int): maximum number of suspects to return. |
| 39 """ | 39 """ |
| 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) | 40 self._dependency_fetcher = ChromeDependencyFetcher(get_repository) |
| 41 self._get_repository = get_repository | 41 self._get_repository = get_repository |
| 42 self._top_n_frames = top_n_frames | 42 self._top_n_frames = top_n_frames |
| 43 self._top_n_suspects = top_n_suspects | 43 self._top_n_suspects = top_n_suspects |
| 44 | 44 |
| 45 feature_function = ToFeatureFunction([ | 45 feature_function = FeatureFunction([ |
| 46 top_frame_index.TopFrameIndexFeature(top_n_frames), | 46 top_frame_index.TopFrameIndexFeature(top_n_frames), |
| 47 min_distance.MinDistanceFeature(), | 47 min_distance.MinDistanceFeature(), |
| 48 ]) | 48 ]) |
| 49 | 49 |
| 50 weight_list = [ | 50 self._model = UnnormalizedLogLinearModel(feature_function, weights) |
| 51 weights['TopFrameIndex'], | |
| 52 weights['MinDistance'], | |
| 53 ] | |
| 54 | |
| 55 self._model = UnnormalizedLogLinearModel(feature_function, weight_list) | |
| 56 | |
| 57 # TODO(crbug.com/674262): remove the need for storing these weights. | |
| 58 self._weights = weights | |
| 59 | |
| 60 # TODO(crbug.com/673964): something better for detecting "close to log(0)". | |
| 61 def _LogZeroish(self, x): | |
| 62 """Determine whether a float is close enough to log(0). | |
| 63 | |
| 64 If a ``FeatureValue`` has a (log-domain) score of -inf for a given | |
| 65 ``Suspect``, then that suspect has zero probability of being the | |
| 66 culprit. We want to filter these suspects out, to clean up the | |
| 67 output of classification; so this method encapsulates the logic of | |
| 68 that check. | |
| 69 | |
| 70 Args: | |
| 71 x (float): the float to check | |
| 72 | |
| 73 Returns: | |
| 74 ``True`` if ``x`` is close enough to log(0); else ``False``. | |
| 75 """ | |
| 76 return x < 0 and math.isinf(x) | |
| 77 | |
| 78 def _SingleFeatureScore(self, feature_value): | |
| 79 """Returns the score (aka weighted value) of a ``FeatureValue``. | |
| 80 | |
| 81 This function assumes the report's stacktrace has already had any necessary | |
| 82 preprocessing (like filtering or truncating) applied. | |
| 83 | |
| 84 Args: | |
| 85 feature_value (FeatureValue): the feature value to check. | |
| 86 | |
| 87 Returns: | |
| 88 The score of the feature value. | |
| 89 """ | |
| 90 return feature_value.value * self._weights.get(feature_value.name, 0.) | |
| 91 | 51 |
| 92 def __call__(self, report): | 52 def __call__(self, report): |
| 93 """Finds changelists suspected of being responsible for the crash report. | 53 """Finds changelists suspected of being responsible for the crash report. |
| 94 | 54 |
| 95 Args: | 55 Args: |
| 96 report (CrashReport): the report to be analyzed. | 56 report (CrashReport): the report to be analyzed. |
| 97 | 57 |
| 98 Returns: | 58 Returns: |
| 99 List of ``Suspect``s, sorted by probability from highest to lowest. | 59 List of ``Suspect``s, sorted by probability from highest to lowest. |
| 100 """ | 60 """ |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 144 return changelist_classifier.FindSuspects( | 104 return changelist_classifier.FindSuspects( |
| 145 dep_to_file_to_changelogs, | 105 dep_to_file_to_changelogs, |
| 146 dep_to_file_to_stack_infos, | 106 dep_to_file_to_stack_infos, |
| 147 report.dependencies, | 107 report.dependencies, |
| 148 self._get_repository, | 108 self._get_repository, |
| 149 ignore_cls) | 109 ignore_cls) |
| 150 | 110 |
def RankSuspects(self, report, suspects):
  """Returns a lineup of the suspects in order of likelihood.

  Suspects with a discardable score or lower ranking than top_n_suspects
  will be filtered.

  Args:
    report (CrashReportWithDependencies): the crash we seek to explain.
    suspects (iterable of Suspect): the CLs to consider blaming for the crash.

  Returns:
    A list of suspects in order according to their likelihood, most
    likely first. This list contains elements of the ``suspects`` list,
    where we mutate some of the fields to store information about why
    that suspect is being blamed (e.g., the ``confidence``, ``reasons``,
    and ``changed_files`` fields are updated). In addition to sorting the
    suspects, we also filter out those which are exceedingly unlikely
    or don't make the ``top_n_suspects`` cut.
  """
  # Score the suspects and organize them for outputting/returning.
  features_given_report = self._model.Features(report)
  score_given_report = self._model.Score(report)

  scored_suspects = []
  for suspect in suspects:
    score = score_given_report(suspect)
    if self._model.LogZeroish(score):
      # A log-domain score of -inf means zero probability of being the
      # culprit; drop such suspects from the lineup entirely.
      logging.debug('Discarding suspect because it has zero probability: %s',
                    suspect.ToDict())
      continue

    suspect.confidence = score
    features = features_given_report(suspect)
    suspect.reasons = self._model.FormatReasons(features.itervalues())
    suspect.changed_files = [
        changed_file.ToDict() for changed_file in
        self._model.AggregateChangedFiles(features.itervalues())]
    scored_suspects.append(suspect)

  # Sort from most to least confident. Without reverse=True the ascending
  # sort would make the top_n_suspects cut below return the *least*
  # likely suspects, contradicting this method's contract.
  scored_suspects.sort(key=lambda suspect: suspect.confidence, reverse=True)
  return scored_suspects[:self._top_n_suspects]
| 189 | |
| 190 def FormatReasons(self, features): | |
| 191 """Collect and format a list of all ``FeatureValue.reason`` strings. | |
| 192 | |
| 193 Args: | |
| 194 features (list of FeatureValue): the values whose ``reason`` | |
| 195 strings should be collected. | |
| 196 | |
| 197 Returns: | |
| 198 A list of ``(str, float, str)`` triples; where the first string is | |
| 199 the feature name, the float is some numeric representation of how | |
| 200 much influence this feature exerts on the ``Suspect`` being blamed, | |
| 201 and the final string is the ``FeatureValue.reason``. The list is | |
| 202 sorted by feature name, just to ensure that it comes out in some | |
| 203 canonical order. | |
| 204 | |
| 205 At present, the float is the log-domain score of the feature | |
| 206 value. However, this isn't the best thing for UX reasons. In the | |
| 207 future it might be replaced by the normal-domain score, or by | |
| 208 the probability. | |
| 209 """ | |
| 210 formatted_reasons = [] | |
| 211 for feature in features: | |
| 212 feature_score = self._SingleFeatureScore(feature) | |
| 213 if self._LogZeroish(feature_score): # pragma: no cover | |
| 214 logging.debug('Discarding reasons from feature %s' | |
| 215 ' because it has zero probability' % feature.name) | |
| 216 continue | |
| 217 | |
| 218 formatted_reasons.append((feature.name, feature_score, feature.reason)) | |
| 219 | |
| 220 return sorted(formatted_reasons, | |
| 221 key=lambda formatted_reason: formatted_reason[0]) | |
| 222 | |
| 223 def AggregateChangedFiles(self, features): | |
| 224 """Merge multiple``FeatureValue.changed_files`` lists into one. | |
| 225 | |
| 226 Args: | |
| 227 features (list of FeatureValue): the values whose ``changed_files`` | |
| 228 lists should be aggregated. | |
| 229 | |
| 230 Returns: | |
| 231 A list of ``ChangedFile`` objects sorted by file name. The sorting | |
| 232 is not essential, but is provided to ease testing by ensuring the | |
| 233 output is in some canonical order. | |
| 234 | |
| 235 Raises: | |
| 236 ``ValueError`` if any file name is given inconsistent ``blame_url``s. | |
| 237 """ | |
| 238 all_changed_files = {} | |
| 239 for feature in features: | |
| 240 if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover | |
| 241 logging.debug('Discarding changed files from feature %s' | |
| 242 ' because it has zero probability' % feature.name) | |
| 243 continue | |
| 244 | |
| 245 for changed_file in feature.changed_files or []: | |
| 246 accumulated_changed_file = all_changed_files.get(changed_file.name) | |
| 247 if accumulated_changed_file is None: | |
| 248 all_changed_files[changed_file.name] = changed_file | |
| 249 continue | |
| 250 | |
| 251 if (accumulated_changed_file.blame_url != | |
| 252 changed_file.blame_url): # pragma: no cover | |
| 253 raise ValueError('Blame URLs do not match: %s != %s' | |
| 254 % (accumulated_changed_file.blame_url, changed_file.blame_url)) | |
| 255 accumulated_changed_file.reasons.extend(changed_file.reasons or []) | |
| 256 | |
| 257 return sorted(all_changed_files.values(), | |
| 258 key=lambda changed_file: changed_file.name) | |
| OLD | NEW |