Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import math | 5 import math |
| 6 import numpy as np | 6 import numpy as np |
| 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. | 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| 8 import scipy.optimize as spo | 8 import scipy.optimize as spo |
| 9 | 9 |
| 10 from crash.loglinear.model import LogLinearModel | 10 from crash.loglinear.model import LogLinearModel |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 26 ``Y`` (it'll just take more computation time is all). This is | 26 ``Y`` (it'll just take more computation time is all). This is |
| 27 needed for computing the partition function and expectation. N.B., | 27 needed for computing the partition function and expectation. N.B., |
| 28 we do not actually need to know/enumerate of *all* of ``Y``, | 28 we do not actually need to know/enumerate of *all* of ``Y``, |
| 29 only the subsets for each ``x``. | 29 only the subsets for each ``x``. |
| 30 training_data (iterable): a collection of ``(x, y)`` pairs where | 30 training_data (iterable): a collection of ``(x, y)`` pairs where |
| 31 ``y`` is the known-correct label for ``x``. | 31 ``y`` is the known-correct label for ``x``. |
| 32 feature_function: A function from ``X`` to ``Y`` to a list of | 32 feature_function: A function from ``X`` to ``Y`` to a list of |
| 33 ``float``. N.B., the length of the list must be the same for all | 33 ``float``. N.B., the length of the list must be the same for all |
| 34 ``x`` and ``y``, and must be the same as the length of the list | 34 ``x`` and ``y``, and must be the same as the length of the list |
| 35 of weights. | 35 of weights. |
| 36 initial_weights (list of float): the pre-training coefficients | 36 initial_weights (dict from str to float): the pre-training coefficients |
| 37 for how much we believe components of the feature vector. This | 37 for how much we believe components of the feature vector. This |
| 38 provides the seed for training; this starting value shouldn't | 38 provides the seed for training; this starting value shouldn't |
| 39 affect the final weights obtained by training (thanks to | 39 affect the final weights obtained by training (thanks to |
| 40 convexity), but will affect how long it takes for training | 40 convexity), but will affect how long it takes for training |
| 41 to converge. | 41 to converge. |
| 42 N.B. The dict should not be sparse (only contains non-zero weights), | |
| 43 because we only train those features whose names are keys in this dict. | |
| 42 epsilon (float): The absolute-error threshold for considering a | 44 epsilon (float): The absolute-error threshold for considering a |
| 43 weight to be "equal to zero". N.B., this should be a positive | 45 weight to be "equal to zero". N.B., this should be a positive |
| 44 number, as we will compare it against the absolute value of | 46 number, as we will compare it against the absolute value of |
| 45 each weight. | 47 each weight. |
| 46 """ | 48 """ |
| 47 super(TrainableLogLinearModel, self).__init__( | 49 super(TrainableLogLinearModel, self).__init__( |
| 48 Y_given_X, feature_function, initial_weights, epsilon) | 50 Y_given_X, feature_function, initial_weights, epsilon) |
| 49 self._training_data = training_data | 51 self._training_data = training_data |
| 50 | 52 # Use self._weights instead of initial_weights, since self._weights already |
|
wrengr
2017/01/25 21:54:27
"initialz" -> "initial"
| |
| 53 # filtered zero weights in the __init__ of superclass. | |
| 54 self._feature_order = self._weights.keys() | |
| 55 self._np_weights = self._DictToNumPyArray(self._weights) | |
| 51 self._observed_feature_vector = vsum([ | 56 self._observed_feature_vector = vsum([ |
| 52 self.FeaturesAsNumPyArray(x)(y) | 57 self.FeaturesAsNumPyArray(x)(y) |
| 53 for x, y in self._training_data]) | 58 for x, y in self._training_data]) |
| 54 | 59 |
| 55 # Even though this is identical to the superclass definition, we must | |
| 56 # re-provide it in order to define the setter. | |
| 57 @property | 60 @property |
| 58 def weights(self): | 61 def np_weights(self): |
| 59 """The weight covector. | 62 """The NumPy Array of the weight covector.""" |
| 63 return self._np_weights | |
| 60 | 64 |
| 61 At present we return the weights as an ``np.ndarray``, but in the | 65 @np_weights.setter |
| 62 future that may be replaced by a more general type which specifies | 66 def np_weights(self, new_np_weights): # pylint: disable=W0221 |
| 63 the semantics rather than the implementation details. | |
| 64 """ | |
| 65 return self._weights | |
| 66 | |
| 67 @weights.setter | |
| 68 def weights(self, new_weights): # pylint: disable=W0221 | |
| 69 """Mutate the weight covector, and clear memos as necessary. | 67 """Mutate the weight covector, and clear memos as necessary. |
| 70 | 68 |
| 71 This setter attempts to avoid clearing memos whenever possible, | 69 This setter attempts to avoid clearing memos whenever possible, |
| 72 but errs on the side of caution/correctness when it needs to. | 70 but errs on the side of caution/correctness when it needs to. |
| 71 This setter also drops all the zero weights in the weight covector using |
| 72 self._epsilon. | |
| 73 | |
| 74 Note, the conversion between dict and np array is needed because model uses | |
| 75 dict to organize weights of features, however SciPy training (e.g. BFGS) |
| 76 needs numpy array to do computation. |
| 73 | 77 |
| 74 Args: | 78 Args: |
| 75 new_weights (np.ndarray): the new weights to use. Must have the | 79 new_np_weights (np.ndarray): the new weights to use. It will be converted |
| 76 same shape as the old ``np.ndarray``. | 80 to weights dict mapping feature_name to its weight. |
| 77 """ | 81 """ |
| 78 if new_weights is self._weights: | 82 if np.array_equal(self._np_weights, new_np_weights): |
| 79 return | 83 return |
| 80 | 84 |
| 81 if not isinstance(new_weights, np.ndarray): | 85 if not isinstance(new_np_weights, np.ndarray): |
| 82 raise TypeError('Expected an np.ndarray but got %s instead' | 86 raise TypeError('Expected an np.ndarray but got %s instead' % |
| 83 % new_weights.__class__.__name__) | 87 new_np_weights.__class__.__name__) |
| 84 | 88 |
| 85 if new_weights.shape != self._weights.shape: | 89 if new_np_weights.shape != self._np_weights.shape: |
| 86 raise TypeError('Weight shape mismatch: %s != %s' | 90 raise TypeError('Weight shape mismatch: %s != %s' % |
| 87 % (new_weights.shape, self._weights.shape)) | 91 (new_np_weights.shape, self._np_weights.shape)) |
| 88 | 92 |
| 93 self._np_weights = np.array(filter(self.IsNonZeroWeight, new_np_weights)) | |
| 89 self.ClearWeightBasedMemos() | 94 self.ClearWeightBasedMemos() |
| 90 self._weights = new_weights | 95 self._weights = self._NumPyArrayToDict(self._np_weights) |
| 96 self._feature_order = self._weights.keys() | |
| 97 | |
| 98 def _NumPyArrayToDict(self, np_weights): | |
| 99 """Converts numpy array to dict (mapping feature name to weight). | |
| 100 | |
| 101 Note, this conversion is needed because model uses weights dict to organize | |
| 102 weights for features, however SciPy training (e.g. BFGS) needs numpy array |
| 103 to do computation. |
| 104 | |
| 105 Args: | |
| 106 np_weights (np.ndarray): Weights which have the same order of | |
| 107 self._feature_order. Note, feature np array should also be serialized by | |
| 108 the same order as self._feature_order to match. | |
| 109 | |
| 110 Returns: | |
| 111 A dict mapping feature name to weight. | |
| 112 """ | |
| 113 return {feature_name: weight | |
| 114 for feature_name, weight in zip(self._feature_order, np_weights)} | |
| 115 | |
| 116 def _DictToNumPyArray(self, weights, default=0.): | |
| 117 """Converts dict (mapping feature name to weight) to numpy array.""" | |
| 118 return np.array([weights.get(feature_name, default) | |
| 119 for feature_name in self._feature_order]) | |
| 91 | 120 |
| 92 def FeaturesAsNumPyArray(self, x): | 121 def FeaturesAsNumPyArray(self, x): |
| 93 """A variant of ``Features`` which returns a ``np.ndarray``. | 122 """A variant of ``Features`` which returns a ``np.ndarray``. |
| 94 | 123 |
| 124 Note, the features np array should have the same order as in | |
| 125 self._feature_order to stay aligned with weights np array. | |
| 126 | |
| 95 For training we need to have the feature function return an | 127 For training we need to have the feature function return an |
| 96 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used | 128 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used |
| 97 elsewhere. This function performes the necessary conversion. | 129 elsewhere. This function performs the necessary conversion. |
| 98 | 130 |
| 99 N.B., at present we do not memoize this function. The underlying | 131 N.B., at present we do not memoize this function. The underlying |
| 100 ``Features`` method is memoized, so we won't re-compute the features | 132 ``Features`` method is memoized, so we won't re-compute the features |
| 101 each time; but we will repeatedly copy the floats into newly allocated | 133 each time; but we will repeatedly copy the floats into newly allocated |
| 102 ``np.ndarray`` objects. If that turns out to be a performance | 134 ``np.ndarray`` objects. If that turns out to be a performance |
| 103 bottleneck, we can add the extra layer of memoization to avoid that. | 135 bottleneck, we can add the extra layer of memoization to avoid that. |
| 104 """ | 136 """ |
| 105 fx = self.Features(x) | 137 fx = self.Features(x) |
| 106 return lambda y: np.array([fxy.value for fxy in fx(y)]) | 138 def FeaturesAsNumPyArrayGivenX(y): |
| 139 fxys = fx(y) | |
| 140 return np.array([fxys[feature_name].value | |
| 141 for feature_name in self._feature_order]) | |
| 142 | |
| 143 return FeaturesAsNumPyArrayGivenX | |
| 107 | 144 |
| 108 def LogLikelihood(self): | 145 def LogLikelihood(self): |
| 109 """The conditional log-likelihood of the training data. | 146 """The conditional log-likelihood of the training data. |
| 110 | 147 |
| 111 The conditional likelihood of the training data is the product | 148 The conditional likelihood of the training data is the product |
| 112 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so | 149 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so |
| 113 the conditional log-likelihood is the log of that. This is called | 150 the conditional log-likelihood is the log of that. This is called |
| 114 "likelihood" because it is thought of as a function of the weight | 151 "likelihood" because it is thought of as a function of the weight |
| 115 covector, with the training data held fixed. | 152 covector, with the training data held fixed. |
| 116 | 153 |
| 117 This is the ideal objective function for training the weights, as it | 154 This is the ideal objective function for training the weights, as it |
| 118 will give us the MLE weight covector for the training data. However, | 155 will give us the MLE weight covector for the training data. However, |
| 119 in practice, we want to do regularization to ensure we don't overfit | 156 in practice, we want to do regularization to ensure we don't overfit |
| 120 the training data and to reduce classification time by ensuring that | 157 the training data and to reduce classification time by ensuring that |
| 121 the weight vector is sparse. Thus, the actual objective function | 158 the weight vector is sparse. Thus, the actual objective function |
| 122 will be the log-likelihood plus some penalty terms for regularization. | 159 will be the log-likelihood plus some penalty terms for regularization. |
| 123 """ | 160 """ |
| 124 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) | 161 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) |
| 125 observed_score = self.weights.dot(self._observed_feature_vector) | 162 observed_score = self.np_weights.dot( |
| 163 self._observed_feature_vector) | |
| 126 return observed_score - observed_zeta | 164 return observed_score - observed_zeta |
| 127 | 165 |
| 128 def LogLikelihoodGradient(self): | 166 def LogLikelihoodGradient(self): |
| 129 """The gradient (aka Jacobian) of ``LogLikelihood``.""" | 167 """The gradient (aka Jacobian) of ``LogLikelihood``.""" |
| 130 expected_feature_vector = vsum([ | 168 expected_feature_vector = vsum([ |
| 131 self.Expectation(x, self.FeaturesAsNumPyArray(x)) | 169 self.Expectation(x, self.FeaturesAsNumPyArray(x)) |
| 132 for x, _ in self._training_data]) | 170 for x, _ in self._training_data]) |
| 133 return self._observed_feature_vector - expected_feature_vector | 171 return self._observed_feature_vector - expected_feature_vector |
| 134 | 172 |
| 135 def TrainWeights(self, l2_penalty): | 173 def TrainWeights(self, l2_penalty): |
| 136 """Optimize the weight covector based on the training data. | 174 """Optimize the weight covector based on the training data. |
| 137 | 175 |
| 138 Args: | 176 Args: |
| 139 l2_penalty (float): the hyperparameter for how much to penalize | 177 l2_penalty (float): the hyperparameter for how much to penalize |
| 140 weight covectors far from zero. | 178 weight covectors far from zero. |
| 141 | 179 |
| 142 Returns: | 180 Returns: |
| 143 Nothing, but has the side effect of mutating the stored weights. | 181 Nothing, but has the side effect of mutating the stored weights. |
| 144 """ | 182 """ |
| 145 initial_weights = self.weights | 183 initial_np_weights = self.np_weights |
| 146 | 184 |
| 147 # We want to minimize the number of times we reset the weights since | 185 # We want to minimize the number of times we reset the weights since |
| 148 # that clears our memos. One might think we could do that in the | 186 # that clears our memos. One might think we could do that in the |
| 149 # between-iterations callback; but actually, in a single iteration, | 187 # between-iterations callback; but actually, in a single iteration, |
| 150 # BFGS calls the objective function and gradient more than once with | 188 # BFGS calls the objective function and gradient more than once with |
| 151 # different arguments; so, alas, we must reset the weights in both. | 189 # different arguments; so, alas, we must reset the weights in both. |
| 152 # This is why the ``weights`` setter tries to avoid clearing memos | 190 # This is why the ``weights`` setter tries to avoid clearing memos |
| 153 # when possible. | 191 # when possible. |
| 154 | 192 |
| 155 def objective_function(new_weights): | 193 def objective_function(new_np_weights): |
| 156 self.weights = new_weights | 194 self.np_weights = new_np_weights |
| 157 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance | 195 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance |
| 158 | 196 |
| 159 def objective_function_gradient(new_weights): | 197 def objective_function_gradient(new_np_weights): |
| 160 self.weights = new_weights | 198 self.np_weights = new_np_weights |
| 161 return -self.LogLikelihoodGradient() + l2_penalty * self.weights | 199 return -self.LogLikelihoodGradient() + l2_penalty * self.np_weights |
| 162 | 200 |
| 163 result = spo.minimize( | 201 result = spo.minimize( |
| 164 objective_function, | 202 objective_function, |
| 165 initial_weights, | 203 initial_np_weights, |
| 166 method='BFGS', | 204 method='BFGS', |
| 167 jac=objective_function_gradient) | 205 jac=objective_function_gradient) |
| 168 | 206 |
| 169 if not result.success: # pragma: no cover | 207 if not result.success: # pragma: no cover |
| 170 # This should happen infrequently enough that there's no point in | 208 # This should happen infrequently enough that there's no point in |
| 171 # logging it and attempting to carry on. | 209 # logging it and attempting to carry on. |
| 172 raise Exception( | 210 raise Exception( |
| 173 'TrainableLogLinearModel.TrainWeights failed:' | 211 'TrainableLogLinearModel.TrainWeights failed:' |
| 174 '\n\tReason: %s' | 212 '\n\tReason: %s' |
| 175 '\n\tCurrent objective value: %s' | 213 '\n\tCurrent objective value: %s' |
| 176 '\n\tCurrent objective gradient: %s' | 214 '\n\tCurrent objective gradient: %s' |
| 177 '\n\tIterations: %d' | 215 '\n\tIterations: %d' |
| 178 '\n\tFunction evaluations: %d' | 216 '\n\tFunction evaluations: %d' |
| 179 '\n\tGradient evaluations: %d' | 217 '\n\tGradient evaluations: %d' |
| 180 % (result.message, result.fun, result.jac, result.nit, result.nfev, | 218 % (result.message, result.fun, result.jac, result.nit, result.nfev, |
| 181 result.njev)) | 219 result.njev)) |
| 182 | 220 |
| 183 # This shouldn't really be necessary, since we're resetting it | 221 # This shouldn't really be necessary, since we're resetting it |
| 184 # directly during training; but just to be safe/sure. | 222 # directly during training; but just to be safe/sure. |
| 185 self.weights = result.x | 223 self.np_weights = result.x |
| OLD | NEW |