appengine/findit/crash/loglinear/training.py - Issue 2544493004: [Predator] Implement training for loglinear models

Unified Diff: appengine/findit/crash/loglinear/training.py

Issue 2544493004: [Predator] Implement training for loglinear models (Closed)

Patch Set: Breaking out the shared code of loglinear/{model,training}_test.py Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: appengine/findit/crash/loglinear/training.py

diff --git a/appengine/findit/crash/loglinear/training.py b/appengine/findit/crash/loglinear/training.py

new file mode 100644

index 0000000000000000000000000000000000000000..33f8e8c19e692668845823f50f331bfa9a618116

--- /dev/null

+++ b/appengine/findit/crash/loglinear/training.py

@@ -0,0 +1,179 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+import math

+import numpy as np

+# N.B., ``np.array`` can't take generators; you must pass explicit lists.

+import scipy.optimize as spo

+from crash.loglinear.model import LogLinearModel

+from libs.math.vectors import vsum

+# N.B., ``vsum`` can't take generators; you must pass explicit lists.

class TrainableLogLinearModel(LogLinearModel):
  """A loglinear model with some labelled data set for training the weights."""

  def __init__(self, Y, training_data, feature_function, initial_weights,
               epsilon=None):
    """Set up the model and precompute the observed feature vector.

    Args:
      Y (iterable): the entire range of values a label ``y`` can take.
        This is needed for computing the partition function.
      training_data (iterable): a collection of ``(x, y)`` pairs where
        ``y`` is the known-correct label for ``x``. The data is traversed
        once here and again by every ``LogLikelihood`` and
        ``LogLikelihoodGradient`` call, so a one-shot iterator (e.g. a
        generator) is copied into a list; re-iterable collections are
        kept by reference.
      feature_function: A function from ``X`` to ``Y`` to a list of
        ``float``. N.B., the length of the list must be the same for all
        ``x`` and ``y``, and must be the same as the length of the list
        of weights.
      initial_weights (list of float): the pre-training coefficients
        for how much we believe components of the feature vector. This
        provides the seed for training; this starting value shouldn't
        affect the final weights obtained by training (thanks to
        convexity), but will affect how long it takes for training
        to converge.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    super(TrainableLogLinearModel, self).__init__(
        Y, feature_function, initial_weights, epsilon)

    # An iterator is its own ``iter()``. Such an object would be exhausted
    # by the sum below, leaving every later pass over the training data
    # silently empty, so snapshot it first.
    if iter(training_data) is training_data:
      training_data = list(training_data)
    self._training_data = training_data

    # The sum of the feature vectors of the known-correct ``(x, y)`` pairs.
    # It does not depend on the weights, so it is computed only once.
    # N.B., ``vsum`` can't take generators; you must pass explicit lists.
    self._observed_feature_vector = vsum([
        self.FeaturesAsNumPyArray(x)(y)
        for x, y in self._training_data])

  # Even though this is identical to the superclass definition, we must
  # re-provide it in order to define the setter.
  @property
  def weights(self):
    """The weight covector.

    At present we return the weights as an ``np.ndarray``, but in the
    future that may be replaced by a more general type which specifies
    the semantics rather than the implementation details.
    """
    return self._weights

  @weights.setter
  def weights(self, new_weights): # pylint: disable=W0221
    """Mutate the weight covector, and clear memos as necessary.

    This setter attempts to avoid clearing memos whenever possible,
    but errs on the side of caution/correctness when it needs to. Memos
    are kept when ``new_weights`` is the stored array itself or is
    element-wise equal to it; otherwise they are cleared.

    Args:
      new_weights (np.ndarray): the new weights to use. Must have the
        same shape as the old ``np.ndarray``.

    Raises:
      TypeError: if ``new_weights`` is not an ``np.ndarray``, or if its
        shape differs from that of the current weights.
    """
    if new_weights is self._weights:
      return

    if not isinstance(new_weights, np.ndarray):
      raise TypeError('Expected an np.ndarray but got %s instead'
          % new_weights.__class__.__name__)

    if new_weights.shape != self._weights.shape:
      raise TypeError('Weight shape mismatch: %s != %s'
          % (new_weights.shape, self._weights.shape))

    # During training the objective and its gradient are evaluated through
    # this setter, and may be handed distinct array objects holding the
    # same values. Equal values mean the memos are still valid, so keep
    # them. (Arrays containing NaN never compare equal, so the memos are
    # cleared in that case.)
    if not np.array_equal(new_weights, self._weights):
      self.ClearWeightBasedMemos()
    self._weights = new_weights

  def FeaturesAsNumPyArray(self, x):
    """A variant of ``Features`` which returns a ``np.ndarray``.

    For training we need to have the feature function return an
    ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used
    elsewhere. This function performs the necessary conversion.

    N.B., at present we do not memoize this function. The underlying
    ``Features`` method is memoized, so we won't re-compute the features
    each time; but we will repeatedly copy the floats into newly allocated
    ``np.ndarray`` objects. If that turns out to be a performance
    bottleneck, we can add the extra layer of memoization to avoid that.

    Args:
      x: the input value handed to ``self.Features``.

    Returns:
      A function of ``y`` returning an ``np.ndarray`` built from the
      ``value`` attribute of each element of ``self.Features(x)(y)``.
    """
    fx = self.Features(x)
    # N.B., ``np.array`` can't take generators; you must pass explicit lists.
    return lambda y: np.array([fxy.value for fxy in fx(y)])

  def LogLikelihood(self):
    """The conditional log-likelihood of the training data.

    The conditional likelihood of the training data is the product
    of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so
    the conditional log-likelihood is the log of that. This is called
    "likelihood" because it is thought of as a function of the weight
    covector, with the training data held fixed.

    This is the ideal objective function for training the weights, as it
    will give us the MLE weight covector for the training data. However,
    in practice, we want to do regularization to ensure we don't overfit
    the training data and to reduce classification time by ensuring that
    the weight vector is sparse. Thus, the actual objective function
    will be the log-likelihood plus some penalty terms for regularization.

    Returns:
      The dot product of the current weights with the observed feature
      vector, minus the sum of ``LogZ(x)`` over every ``x`` in the
      training data.
    """
    observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data)
    observed_score = self.weights.dot(self._observed_feature_vector)
    return observed_score - observed_zeta

  def LogLikelihoodGradient(self):
    """The gradient (aka Jacobian) of ``LogLikelihood``.

    Returns:
      The observed feature vector minus the sum, over every ``x`` in the
      training data, of ``self.Expectation`` applied to that ``x``'s
      feature function.
    """
    # N.B., ``vsum`` can't take generators; you must pass explicit lists.
    expected_feature_vector = vsum([
        self.Expectation(x, self.FeaturesAsNumPyArray(x))
        for x, _ in self._training_data])
    return self._observed_feature_vector - expected_feature_vector

  def TrainWeights(self, l2_penalty):
    """Optimize the weight covector based on the training data.

    Minimizes the negated log-likelihood plus
    ``0.5 * l2_penalty * self.quadrance`` using BFGS, starting from the
    current weights.

    Args:
      l2_penalty (float): the hyperparameter for how much to penalize
        weight covectors far from zero.

    Returns:
      Nothing, but has the side effect of mutating the stored weights.

    Raises:
      Exception: if the optimizer reports that it did not succeed.
    """
    initial_weights = self.weights

    # We want to minimize the number of times we reset the weights since
    # that clears our memos. One might think we could do that in the
    # between-iterations callback; but actually, in a single iteration,
    # BFGS calls the objective function and gradient more than once with
    # different arguments; so, alas, we must reset the weights in both.
    # This is why the ``weights`` setter tries to avoid clearing memos
    # when possible.

    def objective_function(new_weights):
      self.weights = new_weights
      return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance

    def objective_function_gradient(new_weights):
      self.weights = new_weights
      # NOTE(review): this penalty gradient matches the penalty term in
      # ``objective_function`` provided ``quadrance`` (defined in the
      # superclass) is the squared Euclidean norm of the weights -- confirm.
      return -self.LogLikelihoodGradient() + l2_penalty * self.weights

    result = spo.minimize(
        objective_function,
        initial_weights,
        method='BFGS',
        jac=objective_function_gradient)

    if not result.success: # pragma: no cover
      # This should happen infrequently enough that there's no point in
      # logging it and attempting to carry on.
      raise Exception(
          'TrainableLogLinearModel.TrainWeights failed:'
          '\n\tReason: %s'
          '\n\tCurrent objective value: %s'
          '\n\tCurrent objective gradient: %s'
          '\n\tIterations: %d'
          '\n\tFunction evaluations: %d'
          '\n\tGradient evaluations: %d'
          % (result.message, result.fun, result.jac, result.nit, result.nfev,
             result.njev))

    # This shouldn't really be necessary, since we're resetting it
    # directly during training; but just to be safe/sure.
    self.weights = result.x

« no previous file with comments | « appengine/findit/crash/loglinear/test/training_test.py ('k') | no next file » | no next file with comments »