appengine/findit/crash/loglinear/model.py - Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together.

Unified Diff: appengine/findit/crash/loglinear/model.py

Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together. (Closed)

Patch Set: Fix nits. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« appengine/findit/crash/loglinear/feature.py ('K') | « appengine/findit/crash/loglinear/feature.py ('k') | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | appengine/findit/crash/loglinear/weight.py » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: appengine/findit/crash/loglinear/model.py

diff --git a/appengine/findit/crash/loglinear/model.py b/appengine/findit/crash/loglinear/model.py

index c26de5d3b2584feaac1ccc90619553df0f7ad020..e9ae7103217811539e428fc9473fe2c164e543eb 100644

--- a/appengine/findit/crash/loglinear/model.py

+++ b/appengine/findit/crash/loglinear/model.py

@@ -43,15 +43,19 @@ class UnnormalizedLogLinearModel(object):

rather than returning a probability per se.

"""

- def __init__(self, feature_function, weights, epsilon=None):

- """Construct a new model with the given weights and feature function.

+ def __init__(self, meta_feature, meta_weight, epsilon=None):

+ """Construct a new model with the meta_feature and meta_weight.

Args:

- feature_function: A function ``X -> Y -> list(FeatureValue)``. N.B.,

- for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

+ wrapped_feature: A function ``X -> Y -> list(FeatureValue)``. N.B.,

chanli 2017/01/19 04:26:06 Not in argument list above?

Sharu Jiang 2017/01/19 23:49:52 Oops, should have deleted it.

+ for all ``x`` and ``y`` the length of ``wrapped_feature(x)(y)``

must be the same as the length of ``weights``.

- weights (dict of float): the weights for the features. The keys of

- the dictionary are the names of the feature that weight is

+ meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``.

+ N.B., for all ``x`` and ``y`` the length of ``meta_feature(x)(y)``

+ must be the same as the length of ``meta_weight``.

+ All features.

+ meta_weight (MetaWeight): All weights, i.e. the weights for the features.

+ The keys of the dictionary are the names of the feature that weight is

for. We take this argument as a dict rather than as a list so that

callers needn't worry about what order to provide the weights in.

epsilon (float): The absolute-error threshold for considering a

@@ -64,84 +68,33 @@ class UnnormalizedLogLinearModel(object):

else:

self._epsilon = epsilon

chanli 2017/01/19 04:26:06 This if else can be simplified to self._epsilon

lijeffrey 2017/01/19 15:23:35 even simpler is self._epsilon = epsilon or EPSILON

Sharu Jiang 2017/01/19 23:49:52 if epsilon would be False if epsilon equals to zer

- # TODO(crbug.com/680207) Filter zero weights, use sparse representaion of

- # weight covector.

- self._weights = {

- name: weight for name, weight in weights.iteritems()

- if self.IsNonZeroWeight(weight)

- }

+ self._meta_weight = meta_weight

+ self._meta_weight.DropZeroWeights(self._epsilon)

self._quadrance = None

# TODO(crbug.com/674752): we need better names for ``self._features``.

def _Features(x):

- """Wrap ``feature_function`` to memoize things and ensure types.

+ """Wrap ``wrapped_feature`` to memoize things and ensure types.

This outer wrapping takes each ``x`` to a memoized instance of

``_FeaturesGivenX``. That is, for each ``x`` we return a

``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``.

"""

- fx = feature_function(x)

- def _FeaturesGivenX(y):

- """Wrap ``feature_function(x)`` to ensure appropriate types.

- This inner wrapper ensures that the resulting ``FeatureValue``

- array has the same length as the weight covector.

- """

- fxy = fx(y)

- # N.B., we're assuming that ``len(self.weights)`` is O(1).

- assert len(fxy) == len(self.weights), TypeError(

- "vector length mismatch: %d != %d" % (len(fxy), len(self.weights)))

- return fxy

# Memoize on ``Y``, to ensure we don't need to recompute

# ``FeatureValue``s nor recheck the lengths.

- return MemoizedFunction(_FeaturesGivenX)

+ return MemoizedFunction(meta_feature(x))

# Memoize on ``X``, to ensure we share the memo tables on ``Y``.

self._features = MemoizedFunction(_Features)

# TODO(crbug.com/674752): we need better names for ``self._scores``.

- # N.B., this is just the inner product of ``self.weights``

+ # N.B., this is just the inner product of ``self._meta_weight``

# against ``self._features(x)``. If we can compute this in some

# more efficient way, we should. In particular, we will want to

# make the weights sparse, in which case we need to use a sparse

# variant of the dot product.

self._scores = MemoizedFunction(lambda x: self._features(x).map(

- lambda fxy: math.fsum(self.SingleFeatureScore(feature)

- for feature in fxy.itervalues())))

- def IsNonZeroWeight(self, weight):

- return isinstance(weight, float) and math.fabs(weight) >= self._epsilon

- def SingleFeatureScore(self, feature_value):

- """Returns the score (aka weighted value) of a ``FeatureValue``.

- Args:

- feature_value (FeatureValue): the feature value to check.

- Returns:

- The score of the feature value.

- """

- return feature_value.value * self._weights.get(feature_value.name, 0.)

- # TODO(crbug.com/673964): something better for detecting "close to log(0)".

- def LogZeroish(self, x):

- """Determine whether a float is close enough to log(0).

- If a ``FeatureValue`` has a (log-domain) score of -inf for a given

- ``Suspect``, then that suspect has zero probability of being the

- culprit. We want to filter these suspects out, to clean up the

- output of classification; so this method encapsulates the logic of

- that check.

- Args:

- x (float): the float to check

- Returns:

- ``True`` if ``x`` is close enough to log(0); else ``False``.

- """

- return x < 0 and math.isinf(x)

+ lambda fxy: self._meta_weight * fxy))

def ClearWeightBasedMemos(self):

"""Clear all the memos that depend on the weight covector."""

@@ -154,14 +107,19 @@ class UnnormalizedLogLinearModel(object):

self._features.ClearMemos()

@property

- def weights(self):

+ def meta_weight(self):

"""The weight covector.

At present we return the weights as a dict mapping feature name to its

weight, but in the future that may be replaced by a more general type which

specifies the semantics rather than the implementation details.

"""

- return self._weights

+ return self._meta_weight

+ @meta_weight.setter

+ def meta_weight(self, new_meta_weight):

+ self._meta_weight = new_meta_weight

+ self._meta_weight.DropZeroWeights(self._epsilon)

@property

def l0(self): # pragma: no cover

@@ -169,12 +127,12 @@ class UnnormalizedLogLinearModel(object):

N.B., despite being popularly called the "l0-norm", this isn't

actually a norm in the mathematical sense."""

- return float(len(self.weights) - self.weights.values().count(0.))

+ return self._meta_weight.l0

@property

def l1(self): # pragma: no cover

"""The l1 (aka: Manhattan) norm of the weight covector."""

- return math.fsum(math.fabs(w) for w in self.weights.itervalues())

+ return self._meta_weight.l1

@property

def quadrance(self):

@@ -185,11 +143,7 @@ class UnnormalizedLogLinearModel(object):

as its own quantity in many places. Also, computing it directly avoids

the error introduced by squaring the square-root of an IEEE-754 float.

"""

- if self._quadrance is None:

- self._quadrance = math.fsum(

- math.fabs(w)**2 for w in self.weights.itervalues())

- return self._quadrance

+ return self._meta_weight.quadrance

@property

def l2(self):

@@ -201,6 +155,24 @@ class UnnormalizedLogLinearModel(object):

"""

return math.sqrt(self.quadrance)

+ # TODO(crbug.com/673964): something better for detecting "close to log(0)".

+ def LogZeroish(self, x):

+ """Determine whether a float is close enough to log(0).

+ If a ``FeatureValue`` has a (log-domain) score of -inf for a given

+ ``Suspect``, then that suspect has zero probability of being the

+ culprit. We want to filter these suspects out, to clean up the

+ output of classification; so this method encapsulates the logic of

+ that check.

+ Args:

+ x (float): the float to check

+ Returns:

+ ``True`` if ``x`` is close enough to log(0); else ``False``.

+ """

+ return x < 0 and math.isinf(x)

def Features(self, x):

"""Returns a function mapping ``y`` to its feature vector given ``x``.

@@ -233,77 +205,6 @@ class UnnormalizedLogLinearModel(object):

"""

return self._scores(x)

- def FormatReasons(self, features):

- """Collect and format a list of all ``FeatureValue.reason`` strings.

- Args:

- features (iterable of FeatureValue): the values whose ``reason``

- strings should be collected.

- Returns:

- A list of ``(str, float, str)`` triples; where the first string is

- the feature name, the float is some numeric representation of how

- much influence this feature exerts on the ``Suspect`` being blamed,

- and the final string is the ``FeatureValue.reason``. The list is

- sorted by feature name, just to ensure that it comes out in some

- canonical order.

- At present, the float is the log-domain score of the feature

- value. However, this isn't the best thing for UX reasons. In the

- future it might be replaced by the normal-domain score, or by

- the probability.

- """

- formatted_reasons = []

- for feature in features:

- feature_score = self.SingleFeatureScore(feature)

- if self.LogZeroish(feature_score): # pragma: no cover

- logging.debug('Discarding reasons from feature %s'

- ' because it has zero probability' % feature.name)

- continue

- formatted_reasons.append((feature.name, feature_score, feature.reason))

- formatted_reasons.sort(key=lambda formatted_reason: formatted_reason[0])

- return formatted_reasons

- def AggregateChangedFiles(self, features):

- """Merge multiple``FeatureValue.changed_files`` lists into one.

- Args:

- features (iterable of FeatureValue): the values whose ``changed_files``

- lists should be aggregated.

- Returns:

- A list of ``ChangedFile`` objects sorted by file name. The sorting

- is not essential, but is provided to ease testing by ensuring the

- output is in some canonical order.

- Raises:

- ``ValueError`` if any file name is given inconsistent ``blame_url``s.

- """

- all_changed_files = {}

- for feature in features:

- if self.LogZeroish(self.SingleFeatureScore(feature)): # pragma: no cover

- logging.debug('Discarding changed files from feature %s'

- ' because it has zero probability' % feature.name)

- continue

- for changed_file in feature.changed_files or []:

- accumulated_changed_file = all_changed_files.get(changed_file.name)

- if accumulated_changed_file is None:

- all_changed_files[changed_file.name] = changed_file

- continue

- if (accumulated_changed_file.blame_url !=

- changed_file.blame_url): # pragma: no cover

- raise ValueError('Blame URLs do not match: %s != %s'

- % (accumulated_changed_file.blame_url, changed_file.blame_url))

- accumulated_changed_file.reasons.extend(changed_file.reasons or [])

- changed_files = all_changed_files.values()

- changed_files.sort(key=lambda changed_file: changed_file.name)

- return changed_files

class LogLinearModel(UnnormalizedLogLinearModel):

"""A loglinear probability model.

@@ -312,7 +213,7 @@ class LogLinearModel(UnnormalizedLogLinearModel):

we can provide probabilities (not just scores). However, to do so we

require a specification of the subsets of ``Y`` for each ``x``.

"""

- def __init__(self, Y_given_X, feature_function, weights, epsilon=None):

+ def __init__(self, Y_given_X, meta_feature, meta_weight, epsilon=None):

"""Construct a new probabilistic model.

Args:

@@ -324,11 +225,12 @@ class LogLinearModel(UnnormalizedLogLinearModel):

needed for computing the partition function and expectation. N.B.,

we do not actually need to know/enumerate *all* of ``Y``,

only the subsets for each ``x``.

- feature_function: A function ``X -> Y -> list(float)``. N.B.,

- for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

+ meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``.

+ N.B., for all ``x`` and ``y`` the length of ``meta_feature(x)(y)``

must be the same as the length of ``meta_weight``.

- weights (dict of float): the weights for the features. The keys of

- the dictionary are the names of the feature that weight is

+ All features.

+ meta_weight (MetaWeight): All weights, i.e. the weights for the features.

+ The keys of the dictionary are the names of the feature that weight is

for. We take this argument as a dict rather than as a list so that

callers needn't worry about what order to provide the weights in.

epsilon (float): The absolute-error threshold for considering a

@@ -336,7 +238,7 @@ class LogLinearModel(UnnormalizedLogLinearModel):

number, as we will compare it against the absolute value of

each weight.

"""

- super(LogLinearModel, self).__init__(feature_function, weights, epsilon)

+ super(LogLinearModel, self).__init__(meta_feature, meta_weight, epsilon)

self._Y = Y_given_X

@@ -379,7 +281,7 @@ class LogLinearModel(UnnormalizedLogLinearModel):

def Probability(self, x):

"""The normal-domain distribution over ``y`` given ``x``.

- That is, ``self.Probability(x)(y)`` returns ``p(y | x; self.weights)``

+ That is, ``self.Probability(x)(y)`` returns ``p(y | x; self._meta_weight)``

which is the model's estimation of ``Pr(y|x)``.

If you need the log-probability, don't use this method. Instead,

@@ -420,4 +322,3 @@ class LogLinearModel(UnnormalizedLogLinearModel):

# method polymorphic in the return type of ``f`` then we'll need an

# API that provides both scaling and ``vsum``.

return vsum([prob_given_x(y) * f(y) for y in self._Y(x)])