| Index: appengine/findit/crash/loglinear.py |
| diff --git a/appengine/findit/crash/loglinear.py b/appengine/findit/crash/loglinear.py |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..cd54981c730e21a08558ef86b86724806cce7ccf |
| --- /dev/null |
| +++ b/appengine/findit/crash/loglinear.py |
| @@ -0,0 +1,267 @@ |
| +# Copyright 2016 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +import math |
| +import numpy as np |
| + |
| +from libs.math.functions import MemoizedFunction |
| +from libs.math.logarithms import logsumexp |
| +from libs.math.vectors import vsum |
| + |
| +# Threshold below which an absolute weight is treated as zero. |
| +EPSILON = 0.00001 |
| + |
| + |
| +def ToFeatureFunction(fs): |
| + """Given an array of scalar-valued functions, return a vector-valued function. |
| + |
| + Args: |
| + fs (iterable): A collection of functions from ``X`` to functions from |
|
Sharu Jiang
2016/12/06 20:49:19
X -> Y -> float should be more clear.
wrengr
2016/12/07 00:55:38
Can do. I wasn't sure if folks were comfortable wi
|
| + ``Y`` to ``float``. |
| + |
| + Returns: |
| + A function from ``X`` to a function from ``Y`` to a ``list`` of |
| + ``float`` where ``ToFeatureFunction(fs)(x)(y)[i] == fs[i](x)(y)``. |
| + """ |
| + def _FeatureFunction(x): |
|
Sharu Jiang
2016/12/06 20:49:19
What would X look like? (regression or stacktrace
wrengr
2016/12/07 00:55:38
X will be whatever information is given a priori a
Sharu Jiang
2016/12/07 18:40:04
Acknowledged.
|
| + fxs = [f(x) for f in fs] |
| + # TODO(wrengr): allow ``fx(y)`` to be a list, and flatten everything. |
| + return lambda y: [fx(y) for fx in fxs] |
| + |
| + return _FeatureFunction |
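A quick sketch of how ``ToFeatureFunction`` composes scalar features (toy
functions invented for illustration, not part of this CL):

    f0 = lambda x: lambda y: float(x == y)   # indicator feature
    f1 = lambda x: lambda y: float(len(y))   # length feature
    feature = ToFeatureFunction([f0, f1])
    assert feature('ab')('ab') == [1.0, 2.0]  # [f0('ab')('ab'), f1('ab')('ab')]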
| + |
| + |
| +# TODO(http://crbug.com/669639): lots of ways to make this code better. |
| +class LogLinearModel(object): |
| + """A loglinear probability model. |
| + |
| + We use this model for combining a bunch of different features in order |
| + to decide who to blame. In particular, the nice thing about loglinear |
| + models is that (a) they allow separating the amount of weight we |
| + give to each feature from the feature itself, and (b) they allow us |
| + to keep adding features at will without needing to worry about the |
| + fiddly details needed for it to be a probability model. |
| + |
| + Throughout the documentation we use the following conventional |
| + terminology. The independent variable (aka: data to be classified) is |
| + called ``X``, with particular values for that variable called ``x``. The |
| + dependent variable (aka: the label returned by classification) is |
| + called ``Y``, with particular values for that random variable called |
| + ``y``. The partition function is called ``Z``. And, somewhat |
| + non-conventionally, we will call the log of the partition function |
| + ``zeta``. |
| + """ |
| + |
| + def __init__(self, Y, feature_function, weights, epsilon=None): |
| + """ |
| + Args: |
| +      Y (iterable): the entire range of values for the dependent |
| +        variable. This is needed for computing the partition function. |
| + feature_function: A function from ``X`` to ``Y`` to a list of |
| + ``float``. N.B., for all ``x`` and ``y`` the length of |
| + ``feature_function(x)(y)`` must be the same as the length of |
| + ``weights``. |
| + weights (list of float): coefficients for how much we believe |
| + each component of the feature vector. |
| +      epsilon (float): The absolute-error threshold for considering a |
| +        weight to be "equal to zero". N.B., this should be a positive |
| +        number, as we will compare it against the absolute value of |
| +        each weight. Defaults to the module-level ``EPSILON`` when |
| +        ``None``. |
| + """ |
| + self._Y = frozenset(Y) |
|
Sharu Jiang
2016/12/07 00:05:06
Instead of passing all Y(changelogs) into LogLinea
wrengr
2016/12/07 00:55:38
Could do that. Of course, those constants need to
|
| + |
| +    if epsilon is None: |
| +      epsilon = EPSILON |
|
Sharu Jiang
2016/12/06 20:49:19
we should define a EPSILON constant.
wrengr
2016/12/07 00:55:38
Done.
|
| + # N.B., ``np.array`` can't take generators. |
| + self._weights = np.array([ |
| +        float(w) if math.fabs(w) >= epsilon else 0.0 |
| + for w in weights]) |
| + |
| + def _FeaturesMemoizedOnY(x): |
| + fx = feature_function(x) |
| + def _FeaturesCoercedToCovector(y): |
| + # N.B., ``np.array`` can't take generators. |
| + fxy = np.array(list(fx(y))) |
| + # N.B., we're assuming that ``len(self.weights)`` is O(1). |
| + assert len(fxy) == len(self.weights), TypeError( |
| + "vector length mismatch: %d != %d" % (len(fxy), len(self.weights))) |
| + return fxy |
| + return MemoizedFunction(_FeaturesCoercedToCovector) |
| + self._features = MemoizedFunction(_FeaturesMemoizedOnY) |
| + |
| + # N.B., this is just the inner product of ``self.weights`` |
| + # against ``self._features(x)``. If we can compute this in some |
| + # more efficient way, we should. In particular, we will want to |
| +    # make the weights sparse, in which case we need to use a sparse |
| + # variant of the dot product. |
| + self._scores = MemoizedFunction(lambda x: |
| + self._features(x).map(self.weights.dot)) |
| + |
| + def _LogZ(x): |
| + score_given_x = self._scores(x) |
| + return logsumexp([score_given_x(y) for y in self._Y]) |
| + self._zetas = MemoizedFunction(_LogZ) |
| + |
| + self._quadrance = None |
| + |
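A minimal construction sketch (toy values; assumes this module and its
``libs.math`` dependencies are importable, and reuses the illustrative
features from the ``ToFeatureFunction`` sketch above):

    f0 = lambda x: lambda y: float(x == y)
    f1 = lambda x: lambda y: float(len(y))
    model = LogLinearModel(
        Y=['a', 'bb'],
        feature_function=ToFeatureFunction([f0, f1]),
        weights=[2.0, 0.5])  # the docstring asks for floats

Later sketches below reuse this toy ``model``.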
| + def ClearWeightBasedMemos(self): |
| + """Clear all the memos that depend on the weight covector.""" |
| + self._quadrance = None |
| + self._scores.ClearMemos() |
| + self._zetas.ClearMemos() |
| + |
| + def ClearAllMemos(self): |
| + """Clear all memos, even those independent of the weight covector.""" |
| + self.ClearWeightBasedMemos() |
| + self._features.ClearMemos() |
| + |
| + @property |
| + def weights(self): |
| + """The weight covector. |
| + |
| + At present we return the weights as an ``np.ndarray``, but in the |
| + future that may be replaced by a more general type which specifies |
| + the semantics rather than the implementation details. |
| + """ |
| + return self._weights |
| + |
| + @property |
| + def l0(self): # pragma: no cover |
| + """The l0-norm of the weight covector. |
| + |
| + N.B., despite being popularly called the "l0-norm", this isn't |
| + actually a norm in the mathematical sense.""" |
| + return float(np.count_nonzero(self.weights)) |
| + |
| + @property |
| + def l1(self): # pragma: no cover |
| + """The l1 (aka: Manhattan) norm of the weight covector.""" |
| + return math.fsum(math.fabs(w) for w in self.weights) |
| + |
| + @property |
| + def quadrance(self): |
| + """The square of the l2 norm of the weight covector. |
| + |
| + This value is often more helpful to have direct access to, as it |
| + avoids the need for non-rational functions (e.g., sqrt) and shows up |
| + as its own quantity in many places. Also, computing it directly avoids |
| + the error introduced by squaring the square-root of an IEEE-754 float. |
| + """ |
| + if self._quadrance is None: |
| + self._quadrance = math.fsum(math.fabs(w)**2 for w in self.weights) |
| + |
| + return self._quadrance |
| + |
| + @property |
| + def l2(self): |
| + """The l2 (aka Euclidean) norm of the weight covector. |
| + |
| + If you need the square of the l2-norm, do not use this property. |
| + Instead, use the ``quadrance`` property which is more accurate than |
| + squaring this one. |
| + """ |
| + return math.sqrt(self.quadrance) |
| + |
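On the toy ``model`` above, the four weight-covector properties work out to
(hand-computed, for illustration only):

    model.l0         # 2.0  -- two nonzero weights
    model.l1         # 2.5  == |2.0| + |0.5|
    model.quadrance  # 4.25 == 2.0**2 + 0.5**2
    model.l2         # ~2.0616 == sqrt(4.25)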
| + # TODO(wrengr): avoid the extra indirection from this method to the |
| + # MemoizedFunction stored in the field. |
| + def Features(self, x): |
| + """Return a function mapping ``y`` to its feature vector given ``x``. |
| + |
| + Args: |
| +      x (X): the value of the independent variable. |
| + |
| + Returns: |
| + A ``MemoizedFunction`` from ``Y`` to an ``np.array`` of ``float``. |
| + """ |
| + return self._features(x) |
| + |
| + # TODO(wrengr): avoid the extra indirection from this method to the |
| + # MemoizedFunction stored in the field. |
| + def Score(self, x): |
| + """Return a function mapping ``y`` to its "score" given ``x``. |
| + |
| + Semantically, the "score" of ``y`` given ``x`` is the |
| + unnormalized log-domain conditional probability of ``y`` given |
| + ``x``. Operationally, we compute this by taking the inner product |
| + of the weight covector with the ``Features(x)(y)`` vector. The |
| + conditional probability itself can be computed by exponentiating |
| + the "score" and normalizing with the partition function. However, |
| + for determining the "best" ``y`` we don't need to bother computing |
| + the probability, because ``Score(x)`` is monotonic with respect to |
| + ``Probability(x)``. |
| + |
| + Args: |
| +      x (X): the value of the independent variable. |
| + |
| + Returns: |
| + A ``MemoizedFunction`` from ``Y`` to ``float``. |
| + """ |
| + return self._scores(x) |
| + |
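Since ``Score(x)`` is monotonic in the conditional probability, ranking
candidates needs only scores. With the toy ``model`` above:

    score = model.Score('a')
    score('a')   # 2.0*1.0 + 0.5*1.0 == 2.5
    score('bb')  # 2.0*0.0 + 0.5*2.0 == 1.0
    best = max(['a', 'bb'], key=score)  # 'a'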
| + def Z(self, x): |
| + """The normal-domain conditional partition-function given ``x``. |
| + |
| + If you need the log of the partition function, do not use this |
| +    method. Instead, use the ``LogZ`` method which computes it directly |
| + and thus is more accurate than taking the log of this one. |
| + |
| + Args: |
| +      x (X): the value of the independent variable. |
|
Sharu Jiang
2016/12/07 00:05:06
I think x is independent variable?
wrengr
2016/12/07 00:55:38
yep, good catch.
|
| + |
| + Returns: |
| + The normalizing constant of ``Probability(x)``. |
| + """ |
| +    return math.exp(self.LogZ(x)) |
| + |
| + # TODO(wrengr): avoid the extra indirection from this method to the |
| + # MemoizedFunction stored in the field. |
| +  def LogZ(self, x): |
|
Sharu Jiang
2016/12/07 00:05:06
To be consistent with other names, this should be
wrengr
2016/12/07 00:55:38
Done.
|
| + """The log-domain conditional partition-function given ``x``. |
| + |
| + Args: |
| +      x (X): the value of the independent variable. |
| + |
| + Returns: |
| + The normalizing constant of ``LogProbability(x)``. |
| + """ |
| + return self._zetas(x) |
| + |
| + def Probability(self, x): |
| + """The normal-domain distribution over ``y`` given ``x``. |
| + |
| + That is, ``self.Probability(x)(y)`` returns ``p(y | x; self.weights)`` |
| + which is the model's estimation of ``Pr(y|x)``. |
| + |
| + If you need the log-probability, don't use this method. Instead, |
| + use the ``LogProbability`` method which computes it directly and |
| + thus is more accurate than taking the log of this one. |
| + """ |
| + logprob_given_x = self.LogProbability(x) |
| + return lambda y: math.exp(logprob_given_x(y)) |
| + |
| + def LogProbability(self, x): |
| + """The log-domain distribution over ``y`` given ``x``. |
| + |
| + That is, ``self.LogProbability(x)(y)`` returns the log of |
| + ``self.Probability(x)(y)``. However, this method computes the |
| + log-probabilities directly and so is more accurate and efficient |
| + than taking the log of ``Probability``. |
| + """ |
| +    zeta_x = self.LogZ(x) |
| + score_given_x = self.Score(x) |
| + return lambda y: score_given_x(y) - zeta_x |
| + |
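A normalization sanity check on the toy ``model`` above (values hand-computed
for illustration):

    p = model.Probability('a')
    p('a')   # exp(2.5) / (exp(2.5) + exp(1.0)) ~= 0.8176
    p('bb')  # exp(1.0) / (exp(2.5) + exp(1.0)) ~= 0.1824
    assert abs(p('a') + p('bb') - 1.0) < 1e-9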
| + def Expectation(self, x, f): |
| + """Compute the expectation of a function with respect to ``Probability(x)``. |
|
Sharu Jiang
2016/12/07 00:05:06
Probability and Expectation are only used for trai
wrengr
2016/12/07 00:55:38
Semantically there's actually a three-way split ba
|
| + |
| + Args: |
| +      x (X): the value of the independent variable. |
| +      f (callable): a function from ``Y`` to ``np.ndarray`` of ``float``. |
| + |
| +    Returns: |
| +      An ``np.ndarray`` of ``float``. |
| + """ |
| + prob_given_x = self.Probability(x) |
| + # N.B., the ``*`` below is vector scaling! If we want to make this |
| + # method polymorphic in the return type of ``f`` then we'll need an |
| + # API that provides both scaling and ``vsum``. |
| + return vsum([prob_given_x(y) * f(y) for y in self._Y]) |
| + |
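One standard use of ``Expectation`` is the expected feature vector, which is
the second term in the gradient of a loglinear model's conditional
log-likelihood, ``f(x, y) - E[f(x, Y) | x]`` (textbook loglinear math, not
stated in this CL). With the toy ``model`` above:

    observed = model.Features('a')('a')                     # np.array([1.0, 1.0])
    expected = model.Expectation('a', model.Features('a'))  # E[f('a', Y) | x='a']
    gradient = observed - expected                          # ascent direction for the weights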