appengine/findit/crash/loglinear/model.py - Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM.

Unified Diff: appengine/findit/crash/loglinear/model.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)

Patch Set: Address comments. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« appengine/findit/crash/loglinear/feature.py ('K') | « appengine/findit/crash/loglinear/feature.py ('k') | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: appengine/findit/crash/loglinear/model.py

diff --git a/appengine/findit/crash/loglinear/model.py b/appengine/findit/crash/loglinear/model.py

index b231e90e075534396bfbfbdccaefb5ecdf7ad1ad..c26de5d3b2584feaac1ccc90619553df0f7ad020 100644

--- a/appengine/findit/crash/loglinear/model.py

+++ b/appengine/findit/crash/loglinear/model.py

@@ -6,6 +6,7 @@

# in this file better. We avoid having separate todos per task; instead

# see that meta-issue ticket.

+import logging

import math

import numpy as np

# N.B., ``np.array`` can't take generators; you must pass explicit lists.

@@ -17,24 +18,6 @@ from libs.math.vectors import vsum

EPSILON = 0.00001

-def ToFeatureFunction(fs):

- """Given an array of scalar-valued functions, return an array-valued function.

- Args:

- fs (iterable): A collection of curried functions ``X -> Y -> A``.

- That is, given a particular ``x`` they return a function ``Y -> A``.

- Returns:

- A function ``X -> Y -> list(A)`` where for all ``x``, ``y``, and

- ``i`` we have that ``ToFeatureFunction(fs)(x)(y)[i] == fs[i](x)(y)``.

- """

- def _FeatureFunction(x):

- fxs = [f(x) for f in fs]

- return lambda y: [fx(y) for fx in fxs]

- return _FeatureFunction

class UnnormalizedLogLinearModel(object):

"""An unnormalized loglinear model.

@@ -67,19 +50,26 @@ class UnnormalizedLogLinearModel(object):

feature_function: A function ``X -> Y -> list(FeatureValue)``. N.B.,

for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

must be the same as the length of ``weights``.

- weights (list of float): coefficients for how important we consider

- each component of the feature vector for deciding which ``y``

- to blame.

+ weights (dict of float): the weights for the features. The keys of

+ the dictionary are the names of the feature that weight is

+ for. We take this argument as a dict rather than as a list so that

+ callers needn't worry about what order to provide the weights in.

epsilon (float): The absolute-error threshold for considering a

weight to be "equal to zero". N.B., this should be a positive

number, as we will compare it against the absolute value of

each weight.

"""

if epsilon is None:

- epsilon = EPSILON

- self._weights = np.array([

- w if isinstance(w, float) and math.fabs(w) >= epsilon else 0.

- for w in weights])

+ self._epsilon = EPSILON

+ else:

+ self._epsilon = epsilon

+ # TODO(crbug.com/680207) Filter zero weights, use sparse representaion of

+ # weight covector.

+ self._weights = {

+ name: weight for name, weight in weights.iteritems()

+ if self.IsNonZeroWeight(weight)

+ }

self._quadrance = None

@@ -89,7 +79,7 @@ class UnnormalizedLogLinearModel(object):

This outer wrapping takes each ``x`` to a memoized instance of

``_FeaturesGivenX``. That is, for each ``x`` we return a

- ``MemoizedFunction`` from ``Y`` to ``list(FeatureValue)``.

+ ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``.

"""

fx = feature_function(x)

def _FeaturesGivenX(y):

@@ -117,9 +107,41 @@ class UnnormalizedLogLinearModel(object):

# more efficient way, we should. In particular, we will want to

# make the weights sparse, in which case we need to use a sparse

# variant of the dot product.

- self._scores = MemoizedFunction(lambda x:

- self._features(x).map(lambda fxy:

- self.weights.dot(np.array([feature.value for feature in fxy]))))

+ self._scores = MemoizedFunction(lambda x: self._features(x).map(

+ lambda fxy: math.fsum(self.SingleFeatureScore(feature)

+ for feature in fxy.itervalues())))

+ def IsNonZeroWeight(self, weight):

+ return isinstance(weight, float) and math.fabs(weight) >= self._epsilon

+ def SingleFeatureScore(self, feature_value):

+ """Returns the score (aka weighted value) of a ``FeatureValue``.

+ Args:

+ feature_value (FeatureValue): the feature value to check.

+ Returns:

+ The score of the feature value.

+ """

+ return feature_value.value * self._weights.get(feature_value.name, 0.)

+ # TODO(crbug.com/673964): something better for detecting "close to log(0)".

+ def LogZeroish(self, x):

+ """Determine whether a float is close enough to log(0).

+ If a ``FeatureValue`` has a (log-domain) score of -inf for a given

+ ``Suspect``, then that suspect has zero probability of being the

+ culprit. We want to filter these suspects out, to clean up the

+ output of classification; so this method encapsulates the logic of

+ that check.

+ Args:

+ x (float): the float to check

+ Returns:

+ ``True`` if ``x`` is close enough to log(0); else ``False``.

+ """

+ return x < 0 and math.isinf(x)

def ClearWeightBasedMemos(self):

"""Clear all the memos that depend on the weight covector."""

@@ -135,9 +157,9 @@ class UnnormalizedLogLinearModel(object):

def weights(self):

"""The weight covector.

- At present we return the weights as an ``np.ndarray``, but in the

- future that may be replaced by a more general type which specifies

- the semantics rather than the implementation details.

+ At present we return the weights as an dict mapping feature name to its

+ weight, but in the future that may be replaced by a more general type which

+ specifies the semantics rather than the implementation details.

"""

return self._weights

@@ -147,12 +169,12 @@ class UnnormalizedLogLinearModel(object):

N.B., despite being popularly called the "l0-norm", this isn't

actually a norm in the mathematical sense."""

- return float(np.count_nonzero(self.weights))

+ return float(len(self.weights) - self.weights.values().count(0.))

@property

def l1(self): # pragma: no cover

"""The l1 (aka: Manhattan) norm of the weight covector."""

- return math.fsum(math.fabs(w) for w in self.weights)

+ return math.fsum(math.fabs(w) for w in self.weights.itervalues())

@property

def quadrance(self):

@@ -164,7 +186,8 @@ class UnnormalizedLogLinearModel(object):

the error introduced by squaring the square-root of an IEEE-754 float.

"""

if self._quadrance is None:

- self._quadrance = math.fsum(math.fabs(w)**2 for w in self.weights)

+ self._quadrance = math.fsum(

+ math.fabs(w)**2 for w in self.weights.itervalues())

return self._quadrance

@@ -185,7 +208,7 @@ class UnnormalizedLogLinearModel(object):

x (X): the value of the independent variable.

Returns:

- A ``MemoizedFunction`` of type ``Y -> np.array(float)``.

+ A ``MemoizedFunction`` of type ``Y -> dict(str to float)``.

"""

return self._features(x)

@@ -210,6 +233,77 @@ class UnnormalizedLogLinearModel(object):

"""

return self._scores(x)

+ def FormatReasons(self, features):

+ """Collect and format a list of all ``FeatureValue.reason`` strings.

+ Args:

+ features (iterable of FeatureValue): the values whose ``reason``

+ strings should be collected.

+ Returns:

+ A list of ``(str, float, str)`` triples; where the first string is

+ the feature name, the float is some numeric representation of how

+ much influence this feature exerts on the ``Suspect`` being blamed,

+ and the final string is the ``FeatureValue.reason``. The list is

+ sorted by feature name, just to ensure that it comes out in some

+ canonical order.

+ At present, the float is the log-domain score of the feature

+ value. However, this isn't the best thing for UX reasons. In the

+ future it might be replaced by the normal-domain score, or by

+ the probability.

+ """

+ formatted_reasons = []

+ for feature in features:

+ feature_score = self.SingleFeatureScore(feature)

+ if self.LogZeroish(feature_score): # pragma: no cover

+ logging.debug('Discarding reasons from feature %s'

+ ' because it has zero probability' % feature.name)

+ continue

+ formatted_reasons.append((feature.name, feature_score, feature.reason))

+ formatted_reasons.sort(key=lambda formatted_reason: formatted_reason[0])

+ return formatted_reasons

+ def AggregateChangedFiles(self, features):

+ """Merge multiple``FeatureValue.changed_files`` lists into one.

+ Args:

+ features (iterable of FeatureValue): the values whose ``changed_files``

+ lists should be aggregated.

+ Returns:

+ A list of ``ChangedFile`` objects sorted by file name. The sorting

+ is not essential, but is provided to ease testing by ensuring the

+ output is in some canonical order.

+ Raises:

+ ``ValueError`` if any file name is given inconsistent ``blame_url``s.

+ """

+ all_changed_files = {}

+ for feature in features:

+ if self.LogZeroish(self.SingleFeatureScore(feature)): # pragma: no cover

+ logging.debug('Discarding changed files from feature %s'

+ ' because it has zero probability' % feature.name)

+ continue

+ for changed_file in feature.changed_files or []:

+ accumulated_changed_file = all_changed_files.get(changed_file.name)

+ if accumulated_changed_file is None:

+ all_changed_files[changed_file.name] = changed_file

+ continue

+ if (accumulated_changed_file.blame_url !=

+ changed_file.blame_url): # pragma: no cover

+ raise ValueError('Blame URLs do not match: %s != %s'

+ % (accumulated_changed_file.blame_url, changed_file.blame_url))

+ accumulated_changed_file.reasons.extend(changed_file.reasons or [])

+ changed_files = all_changed_files.values()

+ changed_files.sort(key=lambda changed_file: changed_file.name)

+ return changed_files

class LogLinearModel(UnnormalizedLogLinearModel):

"""A loglinear probability model.

@@ -233,9 +327,10 @@ class LogLinearModel(UnnormalizedLogLinearModel):

feature_function: A function ``X -> Y -> list(float)``. N.B.,

for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

must be the same as the length of ``weights``.

- weights (list of float): coefficients for how important we consider

- each component of the feature vector for deciding which ``y``

- to blame.

+ weights (dict of float): the weights for the features. The keys of

+ the dictionary are the names of the feature that weight is

+ for. We take this argument as a dict rather than as a list so that

+ callers needn't worry about what order to provide the weights in.

epsilon (float): The absolute-error threshold for considering a

weight to be "equal to zero". N.B., this should be a positive

number, as we will compare it against the absolute value of