| OLD | NEW |
| (Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. |
| 4 |
| 5 import math |
| 6 import numpy as np |
| 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| 8 import scipy.optimize as spo |
| 9 |
| 10 from crash.loglinear.model import LogLinearModel |
| 11 from libs.math.vectors import vsum |
| 12 # N.B., ``vsum`` can't take generators; you must pass explicit lists. |
| 13 |
| 14 |
class TrainableLogLinearModel(LogLinearModel):
  """A loglinear model with some labelled data set for training the weights."""

  def __init__(self, Y, training_data, feature_function, initial_weights,
               epsilon=None):
    """
    Args:
      Y (iterable): the entire range of values for the independent
        variable. This is needed for computing the partition function.
      training_data (iterable): a collection of ``(x, y)`` pairs where
        ``y`` is the known-correct label for ``x``.
      feature_function: A function from ``X`` to ``Y`` to a list of
        ``float``. N.B., the length of the list must be the same for all
        ``x`` and ``y``, and must be the same as the length of the list
        of weights.
      initial_weights (list of float): the pre-training coefficients
        for how much we believe components of the feature vector. This
        provides the seed for training; this starting value shouldn't
        affect the final weights obtained by training (thanks to
        convexity), but will affect how long it takes for training
        to converge.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    super(TrainableLogLinearModel, self).__init__(
        Y, feature_function, initial_weights, epsilon)
    self._training_data = training_data

    # The sum of the observed feature vectors over the training data. The
    # training data is fixed for the life of this object, so we compute this
    # once up front; both ``LogLikelihood`` and ``LogLikelihoodGradient``
    # need it. N.B., ``vsum`` can't take generators; pass an explicit list.
    self._observed_feature_vector = vsum([
        self.FeaturesAsNumPyArray(x)(y)
        for x, y in self._training_data])

  # Even though this is identical to the superclass definition, we must
  # re-provide it in order to define the setter.
  @property
  def weights(self):
    """The weight covector.

    At present we return the weights as an ``np.ndarray``, but in the
    future that may be replaced by a more general type which specifies
    the semantics rather than the implementation details.
    """
    return self._weights

  @weights.setter
  def weights(self, new_weights):  # pylint: disable=W0221
    """Mutate the weight covector, and clear memos as necessary.

    This setter attempts to avoid clearing memos whenever possible,
    but errs on the side of caution/correctness when it needs to.

    Args:
      new_weights (np.ndarray): the new weights to use. Must have the
        same shape as the old ``np.ndarray``.

    Raises:
      TypeError: if ``new_weights`` is not an ``np.ndarray``, or if its
        shape differs from the current weights'.
    """
    if new_weights is self._weights:
      return

    if not isinstance(new_weights, np.ndarray):
      raise TypeError('Expected an np.ndarray but got %s instead'
                      % new_weights.__class__.__name__)

    if new_weights.shape != self._weights.shape:
      raise TypeError('Weight shape mismatch: %s != %s'
                      % (new_weights.shape, self._weights.shape))

    # A distinct array with numerically identical contents yields the same
    # memoized values, so keep the memos and just swap the array in. (BFGS
    # may evaluate the objective and its gradient at the same point with
    # different array objects, so this case does arise during training.)
    # N.B., ``np.array_equal`` returns False when NaNs are present, so we
    # conservatively fall through and clear the memos in that case.
    if np.array_equal(new_weights, self._weights):
      self._weights = new_weights
      return

    self.ClearWeightBasedMemos()
    self._weights = new_weights

  def FeaturesAsNumPyArray(self, x):
    """A variant of ``Features`` which returns a ``np.ndarray``.

    For training we need to have the feature function return an
    ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used
    elsewhere. This function performs the necessary conversion.

    N.B., at present we do not memoize this function. The underlying
    ``Features`` method is memoized, so we won't re-compute the features
    each time; but we will repeatedly copy the floats into newly allocated
    ``np.ndarray`` objects. If that turns out to be a performance
    bottleneck, we can add the extra layer of memoization to avoid that.
    """
    fx = self.Features(x)
    return lambda y: np.array([fxy.value for fxy in fx(y)])

  def LogLikelihood(self):
    """The conditional log-likelihood of the training data.

    The conditional likelihood of the training data is the product
    of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so
    the conditional log-likelihood is the log of that. This is called
    "likelihood" because it is thought of as a function of the weight
    covector, with the training data held fixed.

    This is the ideal objective function for training the weights, as it
    will give us the MLE weight covector for the training data. However,
    in practice, we want to do regularization to ensure we don't overfit
    the training data and to reduce classification time by ensuring that
    the weight vector is sparse. Thus, the actual objective function
    will be the log-likelihood plus some penalty terms for regularization.
    """
    # ``math.fsum`` gives an accurately-rounded float sum, which matters
    # when summing many log-partition values of varying magnitude.
    observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data)
    observed_score = self.weights.dot(self._observed_feature_vector)
    return observed_score - observed_zeta

  def LogLikelihoodGradient(self):
    """The gradient (aka Jacobian) of ``LogLikelihood``."""
    # N.B., ``vsum`` can't take generators; pass an explicit list.
    expected_feature_vector = vsum([
        self.Expectation(x, self.FeaturesAsNumPyArray(x))
        for x, _ in self._training_data])
    return self._observed_feature_vector - expected_feature_vector

  def TrainWeights(self, l2_penalty):
    """Optimize the weight covector based on the training data.

    Args:
      l2_penalty (float): the hyperparameter for how much to penalize
        weight covectors far from zero.

    Returns:
      Nothing, but has the side effect of mutating the stored weights.

    Raises:
      Exception: if the BFGS optimization fails to converge.
    """
    initial_weights = self.weights

    # We want to minimize the number of times we reset the weights since
    # that clears our memos. One might think we could do that in the
    # between-iterations callback; but actually, in a single iteration,
    # BFGS calls the objective function and gradient more than once with
    # different arguments; so, alas, we must reset the weights in both.
    # This is why the ``weights`` setter tries to avoid clearing memos
    # when possible.

    def objective_function(new_weights):
      # Negated because ``spo.minimize`` minimizes, and we want to
      # maximize the (L2-regularized) log-likelihood.
      self.weights = new_weights
      return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance

    def objective_function_gradient(new_weights):
      # The gradient of ``objective_function`` above.
      self.weights = new_weights
      return -self.LogLikelihoodGradient() + l2_penalty * self.weights

    result = spo.minimize(
        objective_function,
        initial_weights,
        method='BFGS',
        jac=objective_function_gradient)

    if not result.success:  # pragma: no cover
      # This should happen infrequently enough that there's no point in
      # logging it and attempting to carry on.
      raise Exception(
          'TrainableLogLinearModel.TrainWeights failed:'
          '\n\tReason: %s'
          '\n\tCurrent objective value: %s'
          '\n\tCurrent objective gradient: %s'
          '\n\tIterations: %d'
          '\n\tFunction evaluations: %d'
          '\n\tGradient evaluations: %d'
          % (result.message, result.fun, result.jac, result.nit, result.nfev,
             result.njev))

    # This shouldn't really be necessary, since we're resetting it
    # directly during training; but just to be safe/sure.
    self.weights = result.x
| OLD | NEW |