Chromium Code Reviews

Unified Diff: appengine/findit/crash/loglinear/training.py

Issue 2544493004: [Predator] Implement training for loglinear models (Closed)
Patch Set: Breaking out the shared code of loglinear/{model,training}_test.py Created 4 years ago
Index: appengine/findit/crash/loglinear/training.py
diff --git a/appengine/findit/crash/loglinear/training.py b/appengine/findit/crash/loglinear/training.py
new file mode 100644
index 0000000000000000000000000000000000000000..33f8e8c19e692668845823f50f331bfa9a618116
--- /dev/null
+++ b/appengine/findit/crash/loglinear/training.py
@@ -0,0 +1,179 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import math
+import numpy as np
+# N.B., ``np.array`` can't take generators; you must pass explicit lists.
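+# (E.g., ``np.array(float(i) for i in range(3))`` yields a useless 0-d
+# object array, whereas ``np.array([float(i) for i in range(3)])`` yields
+# the intended 1-d vector.)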
+import scipy.optimize as spo
+
+from crash.loglinear.model import LogLinearModel
+from libs.math.vectors import vsum
+# N.B., ``vsum`` can't take generators; you must pass explicit lists.
+
+
+class TrainableLogLinearModel(LogLinearModel):
+ """A loglinear model with some labelled data set for training the weights."""
+
+ def __init__(self, Y, training_data, feature_function, initial_weights,
+ epsilon=None):
+ """
+ Args:
+      Y (iterable): the entire range of values for the dependent
+        variable, i.e., every possible label. This is needed for
+        computing the partition function.
+ training_data (iterable): a collection of ``(x, y)`` pairs where
+ ``y`` is the known-correct label for ``x``.
+      feature_function: a curried function taking an ``x`` and returning
+        a function from ``y`` to a list of ``float``. N.B., the length
+        of the list must be the same for all ``x`` and ``y``, and must
+        match the length of the list of weights.
+ initial_weights (list of float): the pre-training coefficients
+ for how much we believe components of the feature vector. This
+ provides the seed for training; this starting value shouldn't
+ affect the final weights obtained by training (thanks to
+ convexity), but will affect how long it takes for training
+ to converge.
+ epsilon (float): The absolute-error threshold for considering a
+ weight to be "equal to zero". N.B., this should be a positive
+ number, as we will compare it against the absolute value of
+ each weight.
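+
+    Example (hypothetical; ``x1``, ``x2``, ``some_feature_function``, and
+    ``number_of_features`` are illustrative placeholders, not names from
+    this codebase):
+
+        model = TrainableLogLinearModel(
+            Y=[True, False],
+            training_data=[(x1, True), (x2, False)],
+            feature_function=some_feature_function,
+            initial_weights=np.zeros(number_of_features))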
+ """
+ super(TrainableLogLinearModel, self).__init__(
+ Y, feature_function, initial_weights, epsilon)
+ self._training_data = training_data
+
+ self._observed_feature_vector = vsum([
+ self.FeaturesAsNumPyArray(x)(y)
+ for x, y in self._training_data])
+
+ # Even though this is identical to the superclass definition, we must
+ # re-provide it in order to define the setter.
+ @property
+ def weights(self):
+ """The weight covector.
+
+ At present we return the weights as an ``np.ndarray``, but in the
+ future that may be replaced by a more general type which specifies
+ the semantics rather than the implementation details.
+ """
+ return self._weights
+
+ @weights.setter
+ def weights(self, new_weights): # pylint: disable=W0221
+ """Mutate the weight covector, and clear memos as necessary.
+
+ This setter attempts to avoid clearing memos whenever possible,
+ but errs on the side of caution/correctness when it needs to.
+
+ Args:
+ new_weights (np.ndarray): the new weights to use. Must have the
+ same shape as the old ``np.ndarray``.
+ """
+ if new_weights is self._weights:
+ return
+
+ if not isinstance(new_weights, np.ndarray):
+ raise TypeError('Expected an np.ndarray but got %s instead'
+ % new_weights.__class__.__name__)
+
+ if new_weights.shape != self._weights.shape:
+ raise TypeError('Weight shape mismatch: %s != %s'
+ % (new_weights.shape, self._weights.shape))
+
+ self.ClearWeightBasedMemos()
+ self._weights = new_weights
+
+ def FeaturesAsNumPyArray(self, x):
+ """A variant of ``Features`` which returns a ``np.ndarray``.
+
+ For training we need to have the feature function return an
+ ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used
+    elsewhere. This function performs the necessary conversion.
+
+ N.B., at present we do not memoize this function. The underlying
+ ``Features`` method is memoized, so we won't re-compute the features
+ each time; but we will repeatedly copy the floats into newly allocated
+ ``np.ndarray`` objects. If that turns out to be a performance
+ bottleneck, we can add the extra layer of memoization to avoid that.
+ """
+ fx = self.Features(x)
+ return lambda y: np.array([fxy.value for fxy in fx(y)])
+
+ def LogLikelihood(self):
+ """The conditional log-likelihood of the training data.
+
+ The conditional likelihood of the training data is the product
+ of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so
+ the conditional log-likelihood is the log of that. This is called
+ "likelihood" because it is thought of as a function of the weight
+ covector, with the training data held fixed.
+
+ This is the ideal objective function for training the weights, as it
+ will give us the MLE weight covector for the training data. However,
+ in practice, we want to do regularization to ensure we don't overfit
+ the training data and to reduce classification time by ensuring that
+ the weight vector is sparse. Thus, the actual objective function
+ will be the log-likelihood plus some penalty terms for regularization.
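+
+    Concretely, for a loglinear model ``Pr(y|x) = exp(weights . f(x, y))
+    / Z(x)``, the conditional log-likelihood is:
+
+        sum over (x, y) of [weights . f(x, y) - log Z(x)]
+          = weights . (sum of f(x, y)) - (sum of log Z(x))
+
+    which is exactly the ``observed_score - observed_zeta`` computed below.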
+ """
+ observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data)
+ observed_score = self.weights.dot(self._observed_feature_vector)
+ return observed_score - observed_zeta
+
+ def LogLikelihoodGradient(self):
+ """The gradient (aka Jacobian) of ``LogLikelihood``."""
+ expected_feature_vector = vsum([
+ self.Expectation(x, self.FeaturesAsNumPyArray(x))
+ for x, _ in self._training_data])
+ return self._observed_feature_vector - expected_feature_vector
+
+ def TrainWeights(self, l2_penalty):
+ """Optimize the weight covector based on the training data.
+
+ Args:
+ l2_penalty (float): the hyperparameter for how much to penalize
+ weight covectors far from zero.
+
+ Returns:
+ Nothing, but has the side effect of mutating the stored weights.
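+
+    Example (hypothetical usage; assumes a ``model`` constructed as in
+    the ``__init__`` example above):
+
+        model.TrainWeights(l2_penalty=0.1)
+        trained_weights = model.weights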
+ """
+ initial_weights = self.weights
+
+ # We want to minimize the number of times we reset the weights since
+    # that clears our memos. One might think we could reset them only in
+    # the between-iterations callback; but in fact, within a single
+    # iteration, BFGS calls the objective function and its gradient more
+    # than once, with different arguments; so, alas, we must set the
+    # weights in both functions.
+ # This is why the ``weights`` setter tries to avoid clearing memos
+ # when possible.
+
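+    # N.B., ``self.quadrance`` is the squared L2 norm of the weight
+    # covector, so ``0.5 * l2_penalty * self.quadrance`` is the standard
+    # L2 (Tikhonov) penalty. Its gradient is ``l2_penalty * self.weights``,
+    # which is why the penalty terms in the two functions below line up.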
+ def objective_function(new_weights):
+ self.weights = new_weights
+ return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance
+
+ def objective_function_gradient(new_weights):
+ self.weights = new_weights
+ return -self.LogLikelihoodGradient() + l2_penalty * self.weights
+
+ result = spo.minimize(
+ objective_function,
+ initial_weights,
+ method='BFGS',
+ jac=objective_function_gradient)
+
+ if not result.success: # pragma: no cover
+ # This should happen infrequently enough that there's no point in
+ # logging it and attempting to carry on.
+ raise Exception(
+ 'TrainableLogLinearModel.TrainWeights failed:'
+ '\n\tReason: %s'
+ '\n\tCurrent objective value: %s'
+ '\n\tCurrent objective gradient: %s'
+ '\n\tIterations: %d'
+ '\n\tFunction evaluations: %d'
+ '\n\tGradient evaluations: %d'
+ % (result.message, result.fun, result.jac, result.nit, result.nfev,
+ result.njev))
+
+ # This shouldn't really be necessary, since we're resetting it
+ # directly during training; but just to be safe/sure.
+ self.weights = result.x