Chromium Code Reviews| Index: appengine/findit/crash/loglinear/training.py |
| diff --git a/appengine/findit/crash/loglinear/training.py b/appengine/findit/crash/loglinear/training.py |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..01fcb4d55be5791331d048969460229672783ab2 |
| --- /dev/null |
| +++ b/appengine/findit/crash/loglinear/training.py |
| @@ -0,0 +1,179 @@ |
| +# Copyright 2016 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +import math |
| +import numpy as np |
| +# N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| +import scipy.optimize as spo |
| + |
| +from crash.loglinear.model import LogLinearModel |
| +from libs.math.vectors import vsum |
| +# N.B., ``vsum`` can't take generators; you must pass explicit lists. |
| + |
| + |
| +class TrainableLogLinearModel(LogLinearModel): |
| + """A loglinear model with some labelled data set for training the weights.""" |
| + |
| + def __init__(self, Y, training_data, feature_function, initial_weights, |
| + epsilon=None): |
|
Sharu Jiang
2016/12/21 21:30:01
nit: alignment.
|
| + """ |
| + Args: |
| + Y (iterable): the entire range of values for the dependent |
| + variable. This is needed for computing the partition function. |
| + training_data (iterable): a collection of ``(x, y)`` pairs where |
|
Sharu Jiang
2016/12/21 21:30:01
Using ``y`` denotation is a bit confusing, since w
wrengr
2016/12/21 22:34:32
``Y`` is the type of the second argument of the fe
|
| + ``y`` is the known-correct label for ``x``. |
| + feature_function: A function from ``X`` to ``Y`` to a list of |
| + ``float``. N.B., the length of the list must be the same for all |
| + ``x`` and ``y``, and must be the same as the length of the list |
| + of weights. |
| + initial_weights (list of float): the pre-training coefficients |
| + for how much we believe components of the feature vector. This |
| + provides the seed for training; this starting value shouldn't |
| + affect the final weights obtained by training (thanks to |
| + convexity), but will affect how long it takes for training |
| + to converge. |
| + epsilon (float): The absolute-error threshold for considering a |
| + weight to be "equal to zero". N.B., this should be a positive |
| + number, as we will compare it against the absolute value of |
| + each weight. |
| + """ |
| + super(TrainableLogLinearModel, self).__init__( |
| + Y, feature_function, initial_weights, epsilon) |
| + self._training_data = training_data |
| + |
| + self._observed_feature_vector = vsum([ |
| + self.FeaturesAsNumPyArray(x)(y) |
| + for x, y in self._training_data]) |
| + |
| + # Even though this is identical to the superclass definition, we must |
| + # re-provide it in order to define the setter. |
|
Sharu Jiang
2016/12/21 21:30:01
Interesting to know, so we cannot overwrite the se
wrengr
2016/12/21 22:34:32
The problem has to do with the way the @property d
|
| + @property |
| + def weights(self): |
| + """The weight covector. |
| + |
| + At present we return the weights as an ``np.ndarray``, but in the |
| + future that may be replaced by a more general type which specifies |
| + the semantics rather than the implementation details. |
| + """ |
| + return self._weights |
| + |
| + @weights.setter |
| + def weights(self, new_weights): # pylint: disable=W0221 |
| + """Mutate the weight covector, and clear memos as necessary. |
| + |
| + This setter attempts to avoid clearing memos whenever possible, |
| + but errs on the side of caution/correctness when it needs to. |
| + |
| + Args: |
| + new_weights (np.ndarray): the new weights to use. Must have the |
| + same shape as the old ``np.ndarray``. |
| + """ |
| + if new_weights is self._weights: |
| + return |
| + |
| + if not isinstance(new_weights, np.ndarray): |
| + raise TypeError('Expected an np.ndarray but got %s instead' |
| + % new_weights.__class__.__name__) |
| + |
| + if new_weights.shape != self._weights.shape: |
| + raise TypeError('Weight shape mismatch: %s != %s' |
| + % (new_weights.shape, self._weights.shape)) |
| + |
| + self.ClearWeightBasedMemos() |
| + self._weights = new_weights |
| + |
| + def FeaturesAsNumPyArray(self, x): |
| + """A variant of ``Features`` which returns a ``np.ndarray``. |
| + |
| + For training we need to have the feature function return an |
| + ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used |
| + elsewhere. This function performs the necessary conversion. |
| + |
| + N.B., at present we do not memoize this function. The underlying |
| + ``Features`` method is memoized, so we won't re-compute the features |
| + each time; but we will repeatedly copy the floats into newly allocated |
| + ``np.ndarray`` objects. If that turns out to be a performance |
| + bottleneck, we can add the extra layer of memoization to avoid that. |
| + """ |
| + fx = self.Features(x) |
| + return lambda y: np.array([fxy.value for fxy in fx(y)]) |
| + |
| + def LogLikelihood(self): |
| + """The conditional log-likelihood of the training data. |
| + |
| + The conditional likelihood of the training data is the product |
| + of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so |
| + the conditional log-likelihood is the log of that. This is called |
| + "likelihood" because it is thought of as a function of the weight |
| + covector, with the training data held fixed. |
| + |
| + This is the ideal objective function for training the weights, as it |
| + will give us the MLE weight covector for the training data. However, |
| + in practice, we want to do regularization to ensure we don't overfit |
| + the training data and to reduce classification time by ensuring that |
| + the weight vector is sparse. Thus, the objective actually minimized is |
| + the negated log-likelihood plus some penalty terms for regularization. |
| + """ |
| + observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) |
| + observed_score = self.weights.dot(self._observed_feature_vector) |
| + return observed_score - observed_zeta |
| + |
| + def LogLikelihoodGradient(self): |
| + """The gradient (aka Jacobian) of ``LogLikelihood``.""" |
| + expected_feature_vector = vsum([ |
| + self.Expectation(x, self.FeaturesAsNumPyArray(x)) |
| + for x, _ in self._training_data]) |
| + return self._observed_feature_vector - expected_feature_vector |
| + |
| + def TrainWeights(self, l2_penalty): |
| + """Optimize the weight covector based on the training data. |
| + |
| + Args: |
| + l2_penalty (float): the hyperparameter for how much to penalize |
| + weight covectors far from zero. |
| + |
| + Returns: |
| + Nothing, but has the side effect of mutating the stored weights. |
| + """ |
| + initial_weights = self.weights |
| + |
| + # We want to minimize the number of times we reset the weights since |
| + # that clears our memos. One might think we could do that in the |
| + # between-iterations callback; but actually, in a single iteration, |
| + # BFGS calls the objective function and gradient more than once with |
| + # different arguments; so, alas, we must reset the weights in both. |
| + # This is why the ``weights`` setter tries to avoid clearing memos |
| + # when possible. |
| + |
| + def objective_function(new_weights): |
| + self.weights = new_weights |
| + return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance |
| + |
| + def objective_function_gradient(new_weights): |
| + self.weights = new_weights |
| + return -self.LogLikelihoodGradient() + l2_penalty * self.weights |
| + |
| + result = spo.minimize( |
| + objective_function, |
| + initial_weights, |
| + method='BFGS', |
| + jac=objective_function_gradient) |
| + |
| + if not result.success: # pragma: no cover |
| + # This should happen infrequently enough that there's no point in |
| + # logging it and attempting to carry on. |
| + raise Exception( |
| + 'TrainableLogLinearModel.TrainWeights failed:' |
| + '\n\tReason: %s' |
| + '\n\tCurrent objective value: %s' |
| + '\n\tCurrent objective gradient: %s' |
| + '\n\tIterations: %d' |
| + '\n\tFunction evaluations: %d' |
| + '\n\tGradient evaluations: %d' |
| + % (result.message, result.fun, result.jac, result.nit, result.nfev, |
| + result.njev)) |
| + |
| + # This shouldn't really be necessary, since we're resetting it |
| + # directly during training; but just to be safe/sure. |
| + self.weights = result.x |