appengine/findit/crash/loglinear.py - Issue 2517383005: Implementing loglinear classification (without training), for CL classification

Unified Diff: appengine/findit/crash/loglinear.py

Issue 2517383005: Implementing loglinear classification (without training), for CL classification (Closed)

Patch Set: breaking apart normalized and unnormalized models Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: appengine/findit/crash/loglinear.py

diff --git a/appengine/findit/crash/loglinear.py b/appengine/findit/crash/loglinear.py

new file mode 100644

index 0000000000000000000000000000000000000000..147b820b729f76490f462c107c7ab06f7f533f0d

--- /dev/null

+++ b/appengine/findit/crash/loglinear.py

@@ -0,0 +1,302 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+import math

+import numpy as np

+from libs.math.functions import MemoizedFunction

+from libs.math.logarithms import logsumexp

+from libs.math.vectors import vsum

+EPSILON = 0.00001

+def ToFeatureFunction(fs):

+ """Given an array of scalar-valued functions, return a vector-valued function.

+ Args:

+ fs (iterable): A collection of curried functions ``X -> Y -> float``.

+ That is, given a particular ``x`` they return a function ``Y -> float``.

+ Returns:

+ A function ``X -> Y -> list(float)`` where for all ``x``, ``y``, and

+ ``i`` we have that ``ToFeatureFunction(fs)(x)(y)[i] == fs[i](x)(y)``.

+ """

+ def _FeatureFunction(x):

+ fxs = [f(x) for f in fs]

+ # TODO(wrengr): allow ``fx(y)`` to be a list, and flatten everything.

+ return lambda y: [fx(y) for fx in fxs]

+ return _FeatureFunction

+# TODO(http://crbug.com/669639): lots of ways to make this code better.

+class UnnormalizedLogLinearModel(object):

+ """An unnormalized loglinear model.

+ We use this model for combining a bunch of different features in order

+ to decide who to blame. In particular, the nice thing about loglinear

+ models is that (a) they allow separating the amount of weight we

+ give to each feature from the feature itself, and (b) they allow us

+ to keep adding features at will without needing to worry about the

+ fiddly details needed for it to be a probability model.

+ Throughout the documentation we use the following conventional

+ terminology. The independent variable (aka: the given data which

+ sets up the classification problem) is called ``X``, where particular

+ values for that variable called ``x``. The dependent variable (aka:

+ the answers/labels returned by classification) is called ``Y``,

+ where particular values for that random variable called ``y``. The

+ partition function is called ``Z``. And, somewhat non-conventionally,

+ we will call the log of the partition function ``zeta``.

+ This class is distinct from ``LogLinearModel`` in that we do not require

+ a specification of ``Y``. This class is sufficient for determining

+ which ``y`` is the most likely, though it can only return a "score"

+ rather than returning a probability per se.

+ """

+ def __init__(self, feature_function, weights, epsilon=None):

+ """Construct a new model with the given weights and feature function.

+ Args:

+ feature_function: A function ``X -> Y -> list(float)``. N.B.,

+ for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

+ must be the same as the length of ``weights``.

+ weights (list of float): coefficients for how important we consider

+ each component of the feature vector for deciding which ``y``

+ to blame.

+ epsilon (float): The absolute-error threshold for considering a

+ weight to be "equal to zero". N.B., this should be a positive

+ number, as we will compare it against the absolute value of

+ each weight.

+ """

+ if epsilon is None:

+ epsilon = EPSILON

+ # N.B., ``np.array`` can't take generators.

+ self._weights = np.array([

+ w if isinstance(w, float) and math.fabs(w) >= epsilon else 0.0

+ for w in weights])

+ self._quadrance = None

+ def _FeaturesMemoizedOnY(x):

+ fx = feature_function(x)

+ def _FeaturesCoercedToCovector(y):

+ # N.B., ``np.array`` can't take generators.

+ fxy = np.array(list(fx(y)))

+ # N.B., we're assuming that ``len(self.weights)`` is O(1).

+ assert len(fxy) == len(self.weights), TypeError(

+ "vector length mismatch: %d != %d" % (len(fxy), len(self.weights)))

+ return fxy

+ return MemoizedFunction(_FeaturesCoercedToCovector)

+ self._features = MemoizedFunction(_FeaturesMemoizedOnY)

+ # N.B., this is just the inner product of ``self.weights``

+ # against ``self._features(x)``. If we can compute this in some

+ # more efficient way, we should. In particular, we will want to

+ # make the weights sparse, in which case we need to use a sparse

+ # variant of the dot product.

+ self._scores = MemoizedFunction(lambda x:

+ self._features(x).map(self.weights.dot))

+ def ClearWeightBasedMemos(self):

+ """Clear all the memos that depend on the weight covector."""

+ self._quadrance = None

+ self._scores.ClearMemos()

+ def ClearAllMemos(self):

+ """Clear all memos, even those independent of the weight covector."""

+ self.ClearWeightBasedMemos()

+ self._features.ClearMemos()

+ @property

+ def weights(self):

+ """The weight covector.

+ At present we return the weights as an ``np.ndarray``, but in the

+ future that may be replaced by a more general type which specifies

+ the semantics rather than the implementation details.

+ """

+ return self._weights

+ @property

+ def l0(self): # pragma: no cover

+ """The l0-norm of the weight covector.

+ N.B., despite being popularly called the "l0-norm", this isn't

+ actually a norm in the mathematical sense."""

+ return float(np.count_nonzero(self.weights))

+ @property

+ def l1(self): # pragma: no cover

+ """The l1 (aka: Manhattan) norm of the weight covector."""

+ return math.fsum(math.fabs(w) for w in self.weights)

+ @property

+ def quadrance(self):

+ """The square of the l2 norm of the weight covector.

+ This value is often more helpful to have direct access to, as it

+ avoids the need for non-rational functions (e.g., sqrt) and shows up

+ as its own quantity in many places. Also, computing it directly avoids

+ the error introduced by squaring the square-root of an IEEE-754 float.

+ """

+ if self._quadrance is None:

+ self._quadrance = math.fsum(math.fabs(w)**2 for w in self.weights)

+ return self._quadrance

+ @property

+ def l2(self):

+ """The l2 (aka Euclidean) norm of the weight covector.

+ If you need the square of the l2-norm, do not use this property.

+ Instead, use the ``quadrance`` property which is more accurate than

+ squaring this one.

+ """

+ return math.sqrt(self.quadrance)

+ def Features(self, x):

+ """Return a function mapping ``y`` to its feature vector given ``x``.

+ Args:

+ x (X): the value of the independent variable.

+ Returns:

+ A ``MemoizedFunction`` of type ``Y -> np.array(float)``.

+ """

+ return self._features(x)

+ def Score(self, x):

+ """Return a function mapping ``y`` to its "score" given ``x``.

+ Semantically, the "score" of ``y`` given ``x`` is the

+ unnormalized log-domain conditional probability of ``y`` given

+ ``x``. Operationally, we compute this by taking the inner product

+ of the weight covector with the ``Features(x)(y)`` vector. The

+ conditional probability itself can be computed by exponentiating

+ the "score" and normalizing with the partition function. However,

+ for determining the "best" ``y`` we don't need to bother computing

+ the probability, because ``Score(x)`` is monotonic with respect to

+ ``Probability(x)``.

+ Args:

+ x (X): the value of the independent variable.

+ Returns:

+ A ``MemoizedFunction`` of type ``Y -> float``.

+ """

+ return self._scores(x)

+class LogLinearModel(UnnormalizedLogLinearModel):

Sharu Jiang 2016/12/07 18:40:04 one question, so the online/offline split is betwe

wrengr 2016/12/08 20:09:07 I'm not sure exactly which sense of "on-/offline"

+ """A loglinear probability model.

+ This class is distinct from ``UnnormalizedLogLinearModel`` in that

+ we can provide probabilities (not just scores). However, to do so we

+ require a specification of the entire set ``Y``.

+ """

+ def __init__(self, Y, feature_function, weights, epsilon=None):

+ """Construct a new probabilistic model.

+ Args:

+ Y (iterable): the entire range of values for the independent

+ variable. This is needed for computing the partition function.

+ feature_function: A function ``X -> Y -> list(float)``. N.B.,

+ for all ``x`` and ``y`` the length of ``feature_function(x)(y)``

+ must be the same as the length of ``weights``.

+ weights (list of float): coefficients for how important we consider

+ each component of the feature vector for deciding which ``y``

+ to blame.

+ epsilon (float): The absolute-error threshold for considering a

+ weight to be "equal to zero". N.B., this should be a positive

+ number, as we will compare it against the absolute value of

+ each weight.

+ """

+ super(LogLinearModel, self).__init__(feature_function, weights, epsilon)

+ self._Y = frozenset(Y)

+ def _LogZ(x):

+ score_given_x = self._scores(x)

+ return logsumexp([score_given_x(y) for y in self._Y])

+ self._zetas = MemoizedFunction(_LogZ)

+ def ClearWeightBasedMemos(self):

+ """Clear all the memos that depend on the weight covector."""

+ super(LogLinearModel, self).ClearWeightBasedMemos()

+ self._zetas.ClearMemos()

+ def Z(self, x):

+ """The normal-domain conditional partition-function given ``x``.

+ If you need the log of the partition function, do not use this

+ method. Instead, use the ``LogZ`` method which computes it directly

+ and thus is more accurate than taking the log of this one.

+ Args:

+ x (X): the value of the independent variable.

+ Returns:

+ The normalizing constant of ``Probability(x)``.

+ """

+ return math.exp(self.LogZ(x))

+ def LogZ(self, x):

+ """The log-domain conditional partition-function given ``x``.

+ Args:

+ x (X): the value of the independent variable.

+ Returns:

+ The normalizing constant of ``LogProbability(x)``.

+ """

+ return self._zetas(x)

+ def Probability(self, x):

+ """The normal-domain distribution over ``y`` given ``x``.

+ That is, ``self.Probability(x)(y)`` returns ``p(y | x; self.weights)``

+ which is the model's estimation of ``Pr(y|x)``.

+ If you need the log-probability, don't use this method. Instead,

+ use the ``LogProbability`` method which computes it directly and

+ thus is more accurate than taking the log of this one.

+ """

+ logprob_given_x = self.LogProbability(x)

+ return lambda y: math.exp(logprob_given_x(y))

+ def LogProbability(self, x):

+ """The log-domain distribution over ``y`` given ``x``.

+ That is, ``self.LogProbability(x)(y)`` returns the log of

+ ``self.Probability(x)(y)``. However, this method computes the

+ log-probabilities directly and so is more accurate and efficient

+ than taking the log of ``Probability``.

+ """

+ zeta_x = self.LogZ(x)

+ score_given_x = self.Score(x)

+ return lambda y: score_given_x(y) - zeta_x

+ def Expectation(self, x, f):

+ """Compute the expectation of a function with respect to ``Probability(x)``.

+ Args:

+ x (X): the value of the independent variable.

+ f: a function of type ``Y -> np.ndarray(float)``.

+ Returns:

+ An ``np.ndarray`` of ``float`` called the "expected value". N.B.,

+ the particular array returned may not actually be one that the

+ function returns; rather, it's a sort of average of all the results

+ returned. For more information you can take a look at Wikipedia

+ <https://en.wikipedia.org/wiki/Expected_value>.

+ """

+ prob_given_x = self.Probability(x)

+ # N.B., the ``*`` below is vector scaling! If we want to make this

+ # method polymorphic in the return type of ``f`` then we'll need an

+ # API that provides both scaling and ``vsum``.

+ return vsum([prob_given_x(y) * f(y) for y in self._Y])

« no previous file with comments | « no previous file | appengine/findit/crash/results.py » ('j') | no next file with comments »