# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import math

from common import chrome_dependency_fetcher
from crash import changelist_classifier
from crash.loglinear.changelist_features import min_distance
from crash.loglinear.changelist_features import top_frame_index
from crash.loglinear.model import ToFeatureFunction
from crash.loglinear.model import UnnormalizedLogLinearModel
from crash.stacktrace import CallStack
from crash.stacktrace import Stacktrace

class LogLinearChangelistClassifier(object):
  """A ``LogLinearModel``-based implementation of CL classification."""

  def __init__(self, repository, weights, top_n_frames=7, top_n_suspects=3):
    """Args:
      repository (Repository): the Git repository for getting CLs to classify.
      weights (dict of float): the weights for the features. The keys of
        the dictionary are the names of the feature that weight is
        for. We take this argument as a dict rather than as a list so that
        callers needn't worry about what order to provide the weights in.
      top_n_frames (int): how many frames of each callstack to look at.
      top_n_suspects (int): maximum number of suspects to return.
    """
    self._repository = repository
    self._top_n_frames = top_n_frames
    self._top_n_suspects = top_n_suspects

    feature_function = ToFeatureFunction([
        top_frame_index.TopFrameIndexFeature(top_n_frames),
        min_distance.MinDistanceFeature(),
    ])

    # N.B., this list must be in the same order as the features passed to
    # ``ToFeatureFunction`` above, since the model pairs them positionally.
    weight_list = [
        weights['TopFrameIndex'],
        weights['MinDistance'],
    ]

    self._model = UnnormalizedLogLinearModel(feature_function, weight_list)

    # TODO(crbug.com/674262): remove the need for storing these weights.
    self._weights = weights

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def _LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``Suspect``, then that suspect has zero probability of being the
    culprit. We want to filter these suspects out, to clean up the
    output of classification; so this method encapsulates the logic of
    that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is close enough to log(0); else ``False``.
    """
    # Only -inf counts as log(0); +inf means infinitely likely, not
    # impossible.
    return x < 0 and math.isinf(x)

  def _SingleFeatureScore(self, feature_value):
    """Returns the score (aka weighted value) of a ``FeatureValue``.

    This method is a hack for filtering the JSON output ``__call__``
    returns. If we really really need this, then we should probably move
    it to the classes defining loglinear models.

    Args:
      feature_value (FeatureValue): the feature value to check.

    Returns:
      The score of the feature value. Features with no configured weight
      contribute a score of 0.
    """
    return feature_value.value * self._weights.get(feature_value.name, 0.)

  def __call__(self, report):
    """Finds changelists suspected of being responsible for the crash report.

    Args:
      report (CrashReport): the report to be analyzed.

    Returns:
      List of ``Suspect``s, sorted by probability from highest to lowest.
    """
    if not report.regression_range:
      logging.warning('ChangelistClassifier.__call__: Missing regression range '
                      'for report: %s', str(report))
      return []
    last_good_version, first_bad_version = report.regression_range
    logging.info('ChangelistClassifier.__call__: Regression range %s:%s',
                 last_good_version, first_bad_version)

    # Restrict analysis to just the top n frames in each callstack.
    stacktrace = Stacktrace([
        stack.SliceFrames(None, self._top_n_frames)
        for stack in report.stacktrace])

    # Construct the fetcher once and reuse it for both dependency lookups
    # below, rather than building a fresh one each time.
    dep_fetcher = chrome_dependency_fetcher.ChromeDependencyFetcher(
        self._repository)

    # We are only interested in the deps in crash stack (the callstack that
    # caused the crash).
    # TODO(wrengr): we may want to receive the crash deps as an argument,
    # so that when this method is called via Findit.FindCulprit, we avoid
    # doing redundant work creating it.
    stack_deps = changelist_classifier.GetDepsInCrashStack(
        report.stacktrace.crash_stack,
        dep_fetcher.GetDependency(report.crashed_version, report.platform))

    # Get dep and file to changelogs, stack_info and blame dicts.
    dep_rolls = dep_fetcher.GetDependencyRollsDict(
        last_good_version, first_bad_version, report.platform)

    # Regression of a dep added/deleted (old_revision/new_revision is None) can
    # not be known for sure and this case rarely happens, so just filter them
    # out.
    regression_deps_rolls = {}
    for dep_path, dep_roll in dep_rolls.items():
      if not dep_roll.old_revision or not dep_roll.new_revision:
        logging.info('Skip %s dependency %s',
                     'added' if dep_roll.new_revision else 'deleted', dep_path)
        continue
      regression_deps_rolls[dep_path] = dep_roll

    dep_to_file_to_changelogs, ignore_cls = (
        changelist_classifier.GetChangeLogsForFilesGroupedByDeps(
            regression_deps_rolls, stack_deps, self._repository))
    dep_to_file_to_stack_infos = (
        changelist_classifier.GetStackInfosForFilesGroupedByDeps(
            stacktrace, stack_deps))

    # Get the possible suspects.
    suspects = changelist_classifier.FindSuspects(
        dep_to_file_to_changelogs,
        dep_to_file_to_stack_infos,
        stack_deps,
        self._repository,
        ignore_cls)
    if suspects is None:
      return []

    # Score the suspects and organize them for outputting/returning.
    features_given_report = self._model.Features(report)
    score_given_report = self._model.Score(report)
    scored_suspects = []
    for suspect in suspects:
      score = score_given_report(suspect)
      if self._LogZeroish(score):
        logging.debug('Discarding suspect because it has zero probability: %s',
                      str(suspect.ToDict()))
        continue

      suspect.confidence = score
      features = features_given_report(suspect)
      suspect.reasons = self.FormatReasons(features)
      suspect.changed_files = [changed_file.ToDict()
          for changed_file in self.AggregateChangedFiles(features)]
      scored_suspects.append(suspect)

    # Sort from highest to lowest confidence, as documented above; otherwise
    # the truncation below would return the *least* likely suspects.
    scored_suspects.sort(key=lambda suspect: suspect.confidence, reverse=True)
    return scored_suspects[:self._top_n_suspects]

  def FormatReasons(self, features):
    """Collect and format a list of all ``FeatureValue.reason`` strings.

    Args:
      features (list of FeatureValue): the values whose ``reason``
        strings should be collected.

    Returns:
      A list of ``(str, float, str)`` triples; where the first string is
      the feature name, the float is some numeric representation of how
      much influence this feature exerts on the ``Suspect`` being blamed,
      and the final string is the ``FeatureValue.reason``. The list is
      sorted by feature name, just to ensure that it comes out in some
      canonical order.

      At present, the float is the log-domain score of the feature
      value. However, this isn't the best thing for UX reasons. In the
      future it might be replaced by the normal-domain score, or by
      the probability.
    """
    formatted_reasons = []
    for feature in features:
      feature_score = self._SingleFeatureScore(feature)
      if self._LogZeroish(feature_score):  # pragma: no cover
        logging.debug('Discarding reasons from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      formatted_reasons.append((feature.name, feature_score, feature.reason))

    return sorted(formatted_reasons,
                  key=lambda formatted_reason: formatted_reason[0])

  def AggregateChangedFiles(self, features):
    """Merge multiple ``FeatureValue.changed_files`` lists into one.

    Args:
      features (list of FeatureValue): the values whose ``changed_files``
        lists should be aggregated.

    Returns:
      A list of ``ChangedFile`` objects sorted by file name. The sorting
      is not essential, but is provided to ease testing by ensuring the
      output is in some canonical order.

    Raises:
      ``ValueError`` if any file name is given inconsistent ``blame_url``s.
    """
    all_changed_files = {}
    for feature in features:
      if self._LogZeroish(self._SingleFeatureScore(feature)):  # pragma: no cover
        logging.debug('Discarding changed files from feature %s'
                      ' because it has zero probability', feature.name)
        continue

      for changed_file in feature.changed_files or []:
        accumulated_changed_file = all_changed_files.get(changed_file.name)
        if accumulated_changed_file is None:
          all_changed_files[changed_file.name] = changed_file
          continue

        # Raise explicitly rather than via ``assert``: asserts are stripped
        # under ``python -O``, and the docstring promises a ``ValueError``
        # (an ``assert`` would raise ``AssertionError`` instead).
        if accumulated_changed_file.blame_url != changed_file.blame_url:
          raise ValueError('Blame URLs do not match: %s != %s'
              % (accumulated_changed_file.blame_url, changed_file.blame_url))
        accumulated_changed_file.reasons.extend(changed_file.reasons or [])

    return sorted(all_changed_files.values(),
                  key=lambda changed_file: changed_file.name)