Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(627)

Unified Diff: appengine/findit/crash/loglinear.py

Issue 2517383005: Implementing loglinear classification (without training), for CL classification (Closed)
Patch Set: breaking apart normalized and unnormalized models Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | appengine/findit/crash/results.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: appengine/findit/crash/loglinear.py
diff --git a/appengine/findit/crash/loglinear.py b/appengine/findit/crash/loglinear.py
new file mode 100644
index 0000000000000000000000000000000000000000..147b820b729f76490f462c107c7ab06f7f533f0d
--- /dev/null
+++ b/appengine/findit/crash/loglinear.py
@@ -0,0 +1,302 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import math
+import numpy as np
+
+from libs.math.functions import MemoizedFunction
+from libs.math.logarithms import logsumexp
+from libs.math.vectors import vsum
+
+EPSILON = 0.00001
+
+
+def ToFeatureFunction(fs):
+ """Given an array of scalar-valued functions, return a vector-valued function.
+
+ Args:
+ fs (iterable): A collection of curried functions ``X -> Y -> float``.
+ That is, given a particular ``x`` they return a function ``Y -> float``.
+
+ Returns:
+ A function ``X -> Y -> list(float)`` where for all ``x``, ``y``, and
+ ``i`` we have that ``ToFeatureFunction(fs)(x)(y)[i] == fs[i](x)(y)``.
+ """
+ def _FeatureFunction(x):
+ fxs = [f(x) for f in fs]
+ # TODO(wrengr): allow ``fx(y)`` to be a list, and flatten everything.
+ return lambda y: [fx(y) for fx in fxs]
+
+ return _FeatureFunction
+
+
+# TODO(http://crbug.com/669639): lots of ways to make this code better.
+class UnnormalizedLogLinearModel(object):
+ """An unnormalized loglinear model.
+
+ We use this model for combining a bunch of different features in order
+ to decide who to blame. In particular, the nice thing about loglinear
+ models is that (a) they allow separating the amount of weight we
+ give to each feature from the feature itself, and (b) they allow us
+ to keep adding features at will without needing to worry about the
+ fiddly details needed for it to be a probability model.
+
+ Throughout the documentation we use the following conventional
+ terminology. The independent variable (aka: the given data which
+ sets up the classification problem) is called ``X``, where particular
+ values for that variable called ``x``. The dependent variable (aka:
+ the answers/labels returned by classification) is called ``Y``,
+ where particular values for that random variable called ``y``. The
+ partition function is called ``Z``. And, somewhat non-conventionally,
+ we will call the log of the partition function ``zeta``.
+
+ This class is distinct from ``LogLinearModel`` in that we do not require
+ a specification of ``Y``. This class is sufficient for determining
+ which ``y`` is the most likely, though it can only return a "score"
+ rather than returning a probability per se.
+ """
+
+ def __init__(self, feature_function, weights, epsilon=None):
+ """Construct a new model with the given weights and feature function.
+
+ Args:
+ feature_function: A function ``X -> Y -> list(float)``. N.B.,
+ for all ``x`` and ``y`` the length of ``feature_function(x)(y)``
+ must be the same as the length of ``weights``.
+ weights (list of float): coefficients for how important we consider
+ each component of the feature vector for deciding which ``y``
+ to blame.
+ epsilon (float): The absolute-error threshold for considering a
+ weight to be "equal to zero". N.B., this should be a positive
+ number, as we will compare it against the absolute value of
+ each weight.
+ """
+ if epsilon is None:
+ epsilon = EPSILON
+ # N.B., ``np.array`` can't take generators.
+ self._weights = np.array([
+ w if isinstance(w, float) and math.fabs(w) >= epsilon else 0.0
+ for w in weights])
+
+ self._quadrance = None
+
+ def _FeaturesMemoizedOnY(x):
+ fx = feature_function(x)
+ def _FeaturesCoercedToCovector(y):
+ # N.B., ``np.array`` can't take generators.
+ fxy = np.array(list(fx(y)))
+ # N.B., we're assuming that ``len(self.weights)`` is O(1).
+ assert len(fxy) == len(self.weights), TypeError(
+ "vector length mismatch: %d != %d" % (len(fxy), len(self.weights)))
+ return fxy
+ return MemoizedFunction(_FeaturesCoercedToCovector)
+ self._features = MemoizedFunction(_FeaturesMemoizedOnY)
+
+ # N.B., this is just the inner product of ``self.weights``
+ # against ``self._features(x)``. If we can compute this in some
+ # more efficient way, we should. In particular, we will want to
+ # make the weights sparse, in which case we need to use a sparse
+ # variant of the dot product.
+ self._scores = MemoizedFunction(lambda x:
+ self._features(x).map(self.weights.dot))
+
+ def ClearWeightBasedMemos(self):
+ """Clear all the memos that depend on the weight covector."""
+ self._quadrance = None
+ self._scores.ClearMemos()
+
+ def ClearAllMemos(self):
+ """Clear all memos, even those independent of the weight covector."""
+ self.ClearWeightBasedMemos()
+ self._features.ClearMemos()
+
+ @property
+ def weights(self):
+ """The weight covector.
+
+ At present we return the weights as an ``np.ndarray``, but in the
+ future that may be replaced by a more general type which specifies
+ the semantics rather than the implementation details.
+ """
+ return self._weights
+
+ @property
+ def l0(self): # pragma: no cover
+ """The l0-norm of the weight covector.
+
+ N.B., despite being popularly called the "l0-norm", this isn't
+ actually a norm in the mathematical sense."""
+ return float(np.count_nonzero(self.weights))
+
+ @property
+ def l1(self): # pragma: no cover
+ """The l1 (aka: Manhattan) norm of the weight covector."""
+ return math.fsum(math.fabs(w) for w in self.weights)
+
+ @property
+ def quadrance(self):
+ """The square of the l2 norm of the weight covector.
+
+ This value is often more helpful to have direct access to, as it
+ avoids the need for non-rational functions (e.g., sqrt) and shows up
+ as its own quantity in many places. Also, computing it directly avoids
+ the error introduced by squaring the square-root of an IEEE-754 float.
+ """
+ if self._quadrance is None:
+ self._quadrance = math.fsum(math.fabs(w)**2 for w in self.weights)
+
+ return self._quadrance
+
+ @property
+ def l2(self):
+ """The l2 (aka Euclidean) norm of the weight covector.
+
+ If you need the square of the l2-norm, do not use this property.
+ Instead, use the ``quadrance`` property which is more accurate than
+ squaring this one.
+ """
+ return math.sqrt(self.quadrance)
+
+ def Features(self, x):
+ """Return a function mapping ``y`` to its feature vector given ``x``.
+
+ Args:
+ x (X): the value of the independent variable.
+
+ Returns:
+ A ``MemoizedFunction`` of type ``Y -> np.array(float)``.
+ """
+ return self._features(x)
+
+ def Score(self, x):
+ """Return a function mapping ``y`` to its "score" given ``x``.
+
+ Semantically, the "score" of ``y`` given ``x`` is the
+ unnormalized log-domain conditional probability of ``y`` given
+ ``x``. Operationally, we compute this by taking the inner product
+ of the weight covector with the ``Features(x)(y)`` vector. The
+ conditional probability itself can be computed by exponentiating
+ the "score" and normalizing with the partition function. However,
+ for determining the "best" ``y`` we don't need to bother computing
+ the probability, because ``Score(x)`` is monotonic with respect to
+ ``Probability(x)``.
+
+ Args:
+ x (X): the value of the independent variable.
+
+ Returns:
+ A ``MemoizedFunction`` of type ``Y -> float``.
+ """
+ return self._scores(x)
+
+
+class LogLinearModel(UnnormalizedLogLinearModel):
Sharu Jiang 2016/12/07 18:40:04 one question, so the online/offline split is betwe
wrengr 2016/12/08 20:09:07 I'm not sure exactly which sense of "on-/offline"
+ """A loglinear probability model.
+
+ This class is distinct from ``UnnormalizedLogLinearModel`` in that
+ we can provide probabilities (not just scores). However, to do so we
+ require a specification of the entire set ``Y``.
+ """
+ def __init__(self, Y, feature_function, weights, epsilon=None):
+ """Construct a new probabilistic model.
+
+ Args:
+ Y (iterable): the entire range of values for the independent
+ variable. This is needed for computing the partition function.
+ feature_function: A function ``X -> Y -> list(float)``. N.B.,
+ for all ``x`` and ``y`` the length of ``feature_function(x)(y)``
+ must be the same as the length of ``weights``.
+ weights (list of float): coefficients for how important we consider
+ each component of the feature vector for deciding which ``y``
+ to blame.
+ epsilon (float): The absolute-error threshold for considering a
+ weight to be "equal to zero". N.B., this should be a positive
+ number, as we will compare it against the absolute value of
+ each weight.
+ """
+ super(LogLinearModel, self).__init__(feature_function, weights, epsilon)
+
+ self._Y = frozenset(Y)
+
+ def _LogZ(x):
+ score_given_x = self._scores(x)
+ return logsumexp([score_given_x(y) for y in self._Y])
+ self._zetas = MemoizedFunction(_LogZ)
+
+ def ClearWeightBasedMemos(self):
+ """Clear all the memos that depend on the weight covector."""
+ super(LogLinearModel, self).ClearWeightBasedMemos()
+ self._zetas.ClearMemos()
+
+ def Z(self, x):
+ """The normal-domain conditional partition-function given ``x``.
+
+ If you need the log of the partition function, do not use this
+ method. Instead, use the ``LogZ`` method which computes it directly
+ and thus is more accurate than taking the log of this one.
+
+ Args:
+ x (X): the value of the independent variable.
+
+ Returns:
+ The normalizing constant of ``Probability(x)``.
+ """
+ return math.exp(self.LogZ(x))
+
+ def LogZ(self, x):
+ """The log-domain conditional partition-function given ``x``.
+
+ Args:
+ x (X): the value of the independent variable.
+
+ Returns:
+ The normalizing constant of ``LogProbability(x)``.
+ """
+ return self._zetas(x)
+
+ def Probability(self, x):
+ """The normal-domain distribution over ``y`` given ``x``.
+
+ That is, ``self.Probability(x)(y)`` returns ``p(y | x; self.weights)``
+ which is the model's estimation of ``Pr(y|x)``.
+
+ If you need the log-probability, don't use this method. Instead,
+ use the ``LogProbability`` method which computes it directly and
+ thus is more accurate than taking the log of this one.
+ """
+ logprob_given_x = self.LogProbability(x)
+ return lambda y: math.exp(logprob_given_x(y))
+
+ def LogProbability(self, x):
+ """The log-domain distribution over ``y`` given ``x``.
+
+ That is, ``self.LogProbability(x)(y)`` returns the log of
+ ``self.Probability(x)(y)``. However, this method computes the
+ log-probabilities directly and so is more accurate and efficient
+ than taking the log of ``Probability``.
+ """
+ zeta_x = self.LogZ(x)
+ score_given_x = self.Score(x)
+ return lambda y: score_given_x(y) - zeta_x
+
+ def Expectation(self, x, f):
+ """Compute the expectation of a function with respect to ``Probability(x)``.
+
+ Args:
+ x (X): the value of the independent variable.
+ f: a function of type ``Y -> np.ndarray(float)``.
+
+ Returns:
+ An ``np.ndarray`` of ``float`` called the "expected value". N.B.,
+ the particular array returned may not actually be one that the
+ function returns; rather, it's a sort of average of all the results
+ returned. For more information you can take a look at Wikipedia
+ <https://en.wikipedia.org/wiki/Expected_value>.
+ """
+ prob_given_x = self.Probability(x)
+ # N.B., the ``*`` below is vector scaling! If we want to make this
+ # method polymorphic in the return type of ``f`` then we'll need an
+ # API that provides both scaling and ``vsum``.
+ return vsum([prob_given_x(y) * f(y) for y in self._Y])
+
« no previous file with comments | « no previous file | appengine/findit/crash/results.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698