Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1626)

Unified Diff: appengine/findit/crash/loglinear/changelist_classifier.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)
Patch Set: Address comment. Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: appengine/findit/crash/loglinear/changelist_classifier.py
diff --git a/appengine/findit/crash/loglinear/changelist_classifier.py b/appengine/findit/crash/loglinear/changelist_classifier.py
index f72ddd2461a692f3cd3637d9020de2581f5a749f..96de1a9b5952da53d9ed77dcad3d5fae8978209c 100644
--- a/appengine/findit/crash/loglinear/changelist_classifier.py
+++ b/appengine/findit/crash/loglinear/changelist_classifier.py
@@ -11,7 +11,7 @@ from crash import changelist_classifier
from crash.crash_report_with_dependencies import CrashReportWithDependencies
from crash.loglinear.changelist_features import min_distance
from crash.loglinear.changelist_features import top_frame_index
-from crash.loglinear.model import ToFeatureFunction
+from crash.loglinear.feature import FeatureFunction
from crash.loglinear.model import UnnormalizedLogLinearModel
from crash.stacktrace import CallStack
from crash.stacktrace import Stacktrace
@@ -42,52 +42,12 @@ class LogLinearChangelistClassifier(object):
self._top_n_frames = top_n_frames
self._top_n_suspects = top_n_suspects
- feature_function = ToFeatureFunction([
+ feature_function = FeatureFunction([
top_frame_index.TopFrameIndexFeature(top_n_frames),
min_distance.MinDistanceFeature(),
])
- weight_list = [
- weights['TopFrameIndex'],
- weights['MinDistance'],
- ]
-
- self._model = UnnormalizedLogLinearModel(feature_function, weight_list)
-
- # TODO(crbug.com/674262): remove the need for storing these weights.
- self._weights = weights
-
- # TODO(crbug.com/673964): something better for detecting "close to log(0)".
- def _LogZeroish(self, x):
- """Determine whether a float is close enough to log(0).
-
- If a ``FeatureValue`` has a (log-domain) score of -inf for a given
- ``Suspect``, then that suspect has zero probability of being the
- culprit. We want to filter these suspects out, to clean up the
- output of classification; so this method encapsulates the logic of
- that check.
-
- Args:
- x (float): the float to check
-
- Returns:
- ``True`` if ``x`` is close enough to log(0); else ``False``.
- """
- return x < 0 and math.isinf(x)
-
- def _SingleFeatureScore(self, feature_value):
- """Returns the score (aka weighted value) of a ``FeatureValue``.
-
- This function assumes the report's stacktrace has already had any necessary
- preprocessing (like filtering or truncating) applied.
-
- Args:
- feature_value (FeatureValue): the feature value to check.
-
- Returns:
- The score of the feature value.
- """
- return feature_value.value * self._weights.get(feature_value.name, 0.)
+ self._model = UnnormalizedLogLinearModel(feature_function, weights)
def __call__(self, report):
"""Finds changelists suspected of being responsible for the crash report.
@@ -151,9 +111,12 @@ class LogLinearChangelistClassifier(object):
def RankSuspects(self, report, suspects):
"""Returns a lineup of the suspects in order of likelihood.
+ Suspects with a discardable score or lower ranking than top_n_suspects
+ will be filtered.
+
Args:
report (CrashReportWithDependencies): the crash we seek to explain.
- suspects (list of Suspect): the CLs to consider blaming for the crash.
+ suspects (iterable of Suspect): the CLs to consider blaming for the crash.
Returns:
A list of suspects in order according to their likelihood. This
@@ -171,88 +134,18 @@ class LogLinearChangelistClassifier(object):
scored_suspects = []
for suspect in suspects:
score = score_given_report(suspect)
- if self._LogZeroish(score):
+ if self._model.LogZeroish(score):
logging.debug('Discarding suspect because it has zero probability: %s'
% str(suspect.ToDict()))
continue
suspect.confidence = score
features = features_given_report(suspect)
- suspect.reasons = self.FormatReasons(features)
+ suspect.reasons = self._model.FormatReasons(features.itervalues())
suspect.changed_files = [
- changed_file.ToDict()
- for changed_file in self.AggregateChangedFiles(features)]
+ changed_file.ToDict() for changed_file in
+ self._model.AggregateChangedFiles(features.itervalues())]
scored_suspects.append(suspect)
scored_suspects.sort(key=lambda suspect: suspect.confidence)
return scored_suspects[:self._top_n_suspects]
-
- def FormatReasons(self, features):
- """Collect and format a list of all ``FeatureValue.reason`` strings.
-
- Args:
- features (list of FeatureValue): the values whose ``reason``
- strings should be collected.
-
- Returns:
- A list of ``(str, float, str)`` triples; where the first string is
- the feature name, the float is some numeric representation of how
- much influence this feature exerts on the ``Suspect`` being blamed,
- and the final string is the ``FeatureValue.reason``. The list is
- sorted by feature name, just to ensure that it comes out in some
- canonical order.
-
- At present, the float is the log-domain score of the feature
- value. However, this isn't the best thing for UX reasons. In the
- future it might be replaced by the normal-domain score, or by
- the probability.
- """
- formatted_reasons = []
- for feature in features:
- feature_score = self._SingleFeatureScore(feature)
- if self._LogZeroish(feature_score): # pragma: no cover
- logging.debug('Discarding reasons from feature %s'
- ' because it has zero probability' % feature.name)
- continue
-
- formatted_reasons.append((feature.name, feature_score, feature.reason))
-
- return sorted(formatted_reasons,
- key=lambda formatted_reason: formatted_reason[0])
-
- def AggregateChangedFiles(self, features):
- """Merge multiple``FeatureValue.changed_files`` lists into one.
-
- Args:
- features (list of FeatureValue): the values whose ``changed_files``
- lists should be aggregated.
-
- Returns:
- A list of ``ChangedFile`` objects sorted by file name. The sorting
- is not essential, but is provided to ease testing by ensuring the
- output is in some canonical order.
-
- Raises:
- ``ValueError`` if any file name is given inconsistent ``blame_url``s.
- """
- all_changed_files = {}
- for feature in features:
- if self._LogZeroish(self._SingleFeatureScore(feature)): # pragma: no cover
- logging.debug('Discarding changed files from feature %s'
- ' because it has zero probability' % feature.name)
- continue
-
- for changed_file in feature.changed_files or []:
- accumulated_changed_file = all_changed_files.get(changed_file.name)
- if accumulated_changed_file is None:
- all_changed_files[changed_file.name] = changed_file
- continue
-
- if (accumulated_changed_file.blame_url !=
- changed_file.blame_url): # pragma: no cover
- raise ValueError('Blame URLs do not match: %s != %s'
- % (accumulated_changed_file.blame_url, changed_file.blame_url))
- accumulated_changed_file.reasons.extend(changed_file.reasons or [])
-
- return sorted(all_changed_files.values(),
- key=lambda changed_file: changed_file.name)
« no previous file with comments | « no previous file | appengine/findit/crash/loglinear/feature.py » ('j') | appengine/findit/crash/loglinear/feature.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698