Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import math | 5 import math |
| 6 import numpy as np | 6 import numpy as np |
| 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. | 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| 8 import scipy.optimize as spo | 8 import scipy.optimize as spo |
| 9 | 9 |
| 10 from crash.loglinear.model import LogLinearModel | 10 from crash.loglinear.model import LogLinearModel |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 26 ``Y`` (it'll just take more computation time is all). This is | 26 ``Y`` (it'll just take more computation time is all). This is |
| 27 needed for computing the partition function and expectation. N.B., | 27 needed for computing the partition function and expectation. N.B., |
| 28 we do not actually need to know/enumerate of *all* of ``Y``, | 28 we do not actually need to know/enumerate of *all* of ``Y``, |
| 29 only the subsets for each ``x``. | 29 only the subsets for each ``x``. |
| 30 training_data (iterable): a collection of ``(x, y)`` pairs where | 30 training_data (iterable): a collection of ``(x, y)`` pairs where |
| 31 ``y`` is the known-correct label for ``x``. | 31 ``y`` is the known-correct label for ``x``. |
| 32 feature_function: A function from ``X`` to ``Y`` to a list of | 32 feature_function: A function from ``X`` to ``Y`` to a list of |
| 33 ``float``. N.B., the length of the list must be the same for all | 33 ``float``. N.B., the length of the list must be the same for all |
| 34 ``x`` and ``y``, and must be the same as the length of the list | 34 ``x`` and ``y``, and must be the same as the length of the list |
| 35 of weights. | 35 of weights. |
| 36 initial_weights (list of float): the pre-training coefficients | 36 initial_weights (dict from str to float): the pre-training coefficients |
| 37 for how much we believe components of the feature vector. This | 37 for how much we believe components of the feature vector. This |
| 38 provides the seed for training; this starting value shouldn't | 38 provides the seed for training; this starting value shouldn't |
| 39 affect the final weights obtained by training (thanks to | 39 affect the final weights obtained by training (thanks to |
| 40 convexity), but will affect how long it takes for training | 40 convexity), but will affect how long it takes for training |
| 41 to converge. | 41 to converge. |
| 42 epsilon (float): The absolute-error threshold for considering a | 42 epsilon (float): The absolute-error threshold for considering a |
| 43 weight to be "equal to zero". N.B., this should be a positive | 43 weight to be "equal to zero". N.B., this should be a positive |
| 44 number, as we will compare it against the absolute value of | 44 number, as we will compare it against the absolute value of |
| 45 each weight. | 45 each weight. |
| 46 """ | 46 """ |
| 47 super(TrainableLogLinearModel, self).__init__( | 47 super(TrainableLogLinearModel, self).__init__( |
| 48 Y_given_X, feature_function, initial_weights, epsilon) | 48 Y_given_X, feature_function, initial_weights, epsilon) |
| 49 self._training_data = training_data | 49 self._training_data = training_data |
| 50 self._feature_order = initial_weights.keys() | |
|
wrengr
2017/01/11 20:38:30
You should explicitly convert this to a list (or t
Sharu Jiang
2017/01/12 01:41:38
I think the ``.keys()`` will return a list, but ex
wrengr
2017/01/12 18:16:16
So it does. Weird.
| |
| 50 | 51 |
| 51 self._observed_feature_vector = vsum([ | 52 self._observed_feature_vector = vsum([ |
| 52 self.FeaturesAsNumPyArray(x)(y) | 53 self.FeaturesAsNumPyArray(x)(y) |
| 53 for x, y in self._training_data]) | 54 for x, y in self._training_data]) |
| 54 | 55 |
| 55 # Even though this is identical to the superclass definition, we must | 56 # Even though this is identical to the superclass definition, we must |
| 56 # re-provide it in order to define the setter. | 57 # re-provide it in order to define the setter. |
| 57 @property | 58 @property |
| 58 def weights(self): | 59 def weights(self): |
| 59 """The weight covector. | 60 """The weight covector. |
| 60 | 61 |
| 61 At present we return the weights as an ``np.ndarray``, but in the | 62 At present we return the weights as an ``np.ndarray``, but in the |
| 62 future that may be replaced by a more general type which specifies | 63 future that may be replaced by a more general type which specifies |
| 63 the semantics rather than the implementation details. | 64 the semantics rather than the implementation details. |
| 64 """ | 65 """ |
| 65 return self._weights | 66 return self._weights |
| 66 | 67 |
| 67 @weights.setter | 68 @weights.setter |
| 68 def weights(self, new_weights): # pylint: disable=W0221 | 69 def weights(self, new_np_weights): # pylint: disable=W0221 |
| 69 """Mutate the weight covector, and clear memos as necessary. | 70 """Mutate the weight covector, and clear memos as necessary. |
| 70 | 71 |
| 71 This setter attempts to avoid clearing memos whenever possible, | 72 This setter attempts to avoid clearing memos whenever possible, |
| 72 but errs on the side of caution/correctness when it needs to. | 73 but errs on the side of caution/correctness when it needs to. |
| 73 | 74 |
| 74 Args: | 75 Args: |
| 75 new_weights (np.ndarray): the new weights to use. Must have the | 76 new_np_weights (np.ndarray): the new weights to use. It will be converted |
| 76 same shape as the old ``np.ndarray``. | 77 to weights dict mapping feature_name to its weight. |
| 77 """ | 78 """ |
| 78 if new_weights is self._weights: | 79 new_weights = self.NumPyArrayToWeights(new_np_weights) |
| 79 return | |
| 80 | |
| 81 if not isinstance(new_weights, np.ndarray): | |
| 82 raise TypeError('Expected an np.ndarray but got %s instead' | |
| 83 % new_weights.__class__.__name__) | |
| 84 | |
| 85 if new_weights.shape != self._weights.shape: | |
| 86 raise TypeError('Weight shape mismatch: %s != %s' | |
| 87 % (new_weights.shape, self._weights.shape)) | |
| 88 | |
| 89 self.ClearWeightBasedMemos() | 80 self.ClearWeightBasedMemos() |
| 90 self._weights = new_weights | 81 self._weights = new_weights |
|
wrengr
2017/01/11 20:38:31
To avoid excessive conversion back and forth, this
Sharu Jiang
2017/01/12 01:41:38
Done.
| |
| 91 | 82 |
| 92 def FeaturesAsNumPyArray(self, x): | 83 def FeaturesAsNumPyArray(self, x): |
| 93 """A variant of ``Features`` which returns a ``np.ndarray``. | 84 """A variant of ``Features`` which returns a ``np.ndarray``. |
| 94 | 85 |
| 86 Note, the features nparray should have the same order as in | |
| 87 self._feature_order to stay aligned with weights np array. | |
| 88 | |
| 95 For training we need to have the feature function return an | 89 For training we need to have the feature function return an |
| 96 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used | 90 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used |
| 97 elsewhere. This function performs the necessary conversion. | 91 elsewhere. This function performs the necessary conversion. |
| 98 | 92 |
| 99 N.B., at present we do not memoize this function. The underlying | 93 N.B., at present we do not memoize this function. The underlying |
| 100 ``Features`` method is memoized, so we won't re-compute the features | 94 ``Features`` method is memoized, so we won't re-compute the features |
| 101 each time; but we will repeatedly copy the floats into newly allocated | 95 each time; but we will repeatedly copy the floats into newly allocated |
| 102 ``np.ndarray`` objects. If that turns out to be a performance | 96 ``np.ndarray`` objects. If that turns out to be a performance |
| 103 bottleneck, we can add the extra layer of memoization to avoid that. | 97 bottleneck, we can add the extra layer of memoization to avoid that. |
| 104 """ | 98 """ |
| 105 fx = self.Features(x) | 99 fx = self.Features(x) |
| 106 return lambda y: np.array([fxy.value for fxy in fx(y)]) | 100 def FeaturesAsNumPyArrayGivenX(y): |
| 101 fxys = fx(y) | |
| 102 return np.array([fxys[feature_name].value if feature_name in fxys else 0. | |
|
wrengr
2017/01/11 20:38:30
is clearer to use ``fxys.get(feature_name, 0.)``
Sharu Jiang
2017/01/12 01:41:38
I have to do this, because I need ``fxys[feature_n
wrengr
2017/01/12 18:16:16
hrm... that does make it tricky.
This sort of thi
| |
| 103 for feature_name in self._feature_order]) | |
| 104 | |
| 105 return FeaturesAsNumPyArrayGivenX | |
| 106 | |
| 107 def WeightsAsNumPyArray(self): | |
| 108 """Converts dict (mapping feature name to weight) to numpy array. | |
| 109 | |
| 110 Note, this conversion is needed because model uses weights dict to organize | |
| 111 weights for features, however SciPy trainning (e.g. BFGS) needs numpy array | |
| 112 to do computaion. | |
| 113 """ | |
| 114 return np.array([self._weights.get(feature_name, 0.) | |
| 115 for feature_name in self._feature_order]) | |
|
wrengr
2017/01/11 20:38:31
Again, to avoid excessive conversion, we can do th
Sharu Jiang
2017/01/12 01:41:38
Done.
| |
| 116 | |
| 117 def NumPyArrayToWeights(self, np_weights): | |
| 118 """Converts numpy array to dict (mapping feature name to weight). | |
| 119 | |
| 120 Note, this conversion is needed because model uses weights dict to organize | |
| 121 weights for features, however SciPy training (e.g. BFGS) needs numpy array | |
| 122 to do computation. | |
| 123 | |
| 124 Args: | |
| 125 np_weights (np.ndarray): Weights which have the same order of | |
| 126 self._feature_order. Note, feature np array should also be serialized by |
| 127 the same order as self._feature_order to match. | |
| 128 | |
| 129 Returns: | |
| 130 A dict mapping feature name to weight. | |
| 131 """ | |
| 132 if not isinstance(np_weights, np.ndarray): | |
| 133 raise TypeError('Expected an np.ndarray but got %s instead' | |
| 134 % np_weights.__class__.__name__) | |
| 135 | |
| 136 return {feature_name: weight | |
| 137 for feature_name, weight in zip(self._feature_order, np_weights)} | |
| 107 | 138 |
| 108 def LogLikelihood(self): | 139 def LogLikelihood(self): |
| 109 """The conditional log-likelihood of the training data. | 140 """The conditional log-likelihood of the training data. |
| 110 | 141 |
| 111 The conditional likelihood of the training data is the product | 142 The conditional likelihood of the training data is the product |
| 112 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so | 143 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so |
| 113 the conditional log-likelihood is the log of that. This is called | 144 the conditional log-likelihood is the log of that. This is called |
| 114 "likelihood" because it is thought of as a function of the weight | 145 "likelihood" because it is thought of as a function of the weight |
| 115 covector, with the training data held fixed. | 146 covector, with the training data held fixed. |
| 116 | 147 |
| 117 This is the ideal objective function for training the weights, as it | 148 This is the ideal objective function for training the weights, as it |
| 118 will give us the MLE weight covector for the training data. However, | 149 will give us the MLE weight covector for the training data. However, |
| 119 in practice, we want to do regularization to ensure we don't overfit | 150 in practice, we want to do regularization to ensure we don't overfit |
| 120 the training data and to reduce classification time by ensuring that | 151 the training data and to reduce classification time by ensuring that |
| 121 the weight vector is sparse. Thus, the actual objective function | 152 the weight vector is sparse. Thus, the actual objective function |
| 122 will be the log-likelihood plus some penalty terms for regularization. | 153 will be the log-likelihood plus some penalty terms for regularization. |
| 123 """ | 154 """ |
| 124 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) | 155 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) |
| 125 observed_score = self.weights.dot(self._observed_feature_vector) | 156 observed_score = self.WeightsAsNumPyArray().dot( |
| 157 self._observed_feature_vector) | |
| 126 return observed_score - observed_zeta | 158 return observed_score - observed_zeta |
| 127 | 159 |
| 128 def LogLikelihoodGradient(self): | 160 def LogLikelihoodGradient(self): |
| 129 """The gradient (aka Jacobian) of ``LogLikelihood``.""" | 161 """The gradient (aka Jacobian) of ``LogLikelihood``.""" |
| 130 expected_feature_vector = vsum([ | 162 expected_feature_vector = vsum([ |
| 131 self.Expectation(x, self.FeaturesAsNumPyArray(x)) | 163 self.Expectation(x, self.FeaturesAsNumPyArray(x)) |
| 132 for x, _ in self._training_data]) | 164 for x, _ in self._training_data]) |
| 133 return self._observed_feature_vector - expected_feature_vector | 165 return self._observed_feature_vector - expected_feature_vector |
| 134 | 166 |
| 135 def TrainWeights(self, l2_penalty): | 167 def TrainWeights(self, l2_penalty): |
| 136 """Optimize the weight covector based on the training data. | 168 """Optimize the weight covector based on the training data. |
| 137 | 169 |
| 138 Args: | 170 Args: |
| 139 l2_penalty (float): the hyperparameter for how much to penalize | 171 l2_penalty (float): the hyperparameter for how much to penalize |
| 140 weight covectors far from zero. | 172 weight covectors far from zero. |
| 141 | 173 |
| 142 Returns: | 174 Returns: |
| 143 Nothing, but has the side effect of mutating the stored weights. | 175 Nothing, but has the side effect of mutating the stored weights. |
| 144 """ | 176 """ |
| 145 initial_weights = self.weights | 177 initial_weights = self.WeightsAsNumPyArray() |
| 146 | 178 |
| 147 # We want to minimize the number of times we reset the weights since | 179 # We want to minimize the number of times we reset the weights since |
| 148 # that clears our memos. One might think we could do that in the | 180 # that clears our memos. One might think we could do that in the |
| 149 # between-iterations callback; but actually, in a single iteration, | 181 # between-iterations callback; but actually, in a single iteration, |
| 150 # BFGS calls the objective function and gradient more than once with | 182 # BFGS calls the objective function and gradient more than once with |
| 151 # different arguments; so, alas, we must reset the weights in both. | 183 # different arguments; so, alas, we must reset the weights in both. |
| 152 # This is why the ``weights`` setter tries to avoid clearing memos | 184 # This is why the ``weights`` setter tries to avoid clearing memos |
| 153 # when possible. | 185 # when possible. |
| 154 | 186 |
| 155 def objective_function(new_weights): | 187 def objective_function(new_weights): |
| 156 self.weights = new_weights | 188 self.weights = new_weights |
| 157 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance | 189 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance |
| 158 | 190 |
| 159 def objective_function_gradient(new_weights): | 191 def objective_function_gradient(new_weights): |
| 160 self.weights = new_weights | 192 self.weights = new_weights |
| 161 return -self.LogLikelihoodGradient() + l2_penalty * self.weights | 193 return -self.LogLikelihoodGradient() + l2_penalty * new_weights |
|
wrengr
2017/01/11 20:38:31
I don't think that's any cleaner than the old code
Sharu Jiang
2017/01/12 01:41:38
This is because we need np_array here instead of a
wrengr
2017/01/12 18:16:16
Ah, right <chagrin>
| |
| 162 | 194 |
| 163 result = spo.minimize( | 195 result = spo.minimize( |
| 164 objective_function, | 196 objective_function, |
| 165 initial_weights, | 197 initial_weights, |
| 166 method='BFGS', | 198 method='BFGS', |
| 167 jac=objective_function_gradient) | 199 jac=objective_function_gradient) |
| 168 | 200 |
| 169 if not result.success: # pragma: no cover | 201 if not result.success: # pragma: no cover |
| 170 # This should happen infrequently enough that there's no point in | 202 # This should happen infrequently enough that there's no point in |
| 171 # logging it and attempting to carry on. | 203 # logging it and attempting to carry on. |
| 172 raise Exception( | 204 raise Exception( |
| 173 'TrainableLogLinearModel.TrainWeights failed:' | 205 'TrainableLogLinearModel.TrainWeights failed:' |
| 174 '\n\tReason: %s' | 206 '\n\tReason: %s' |
| 175 '\n\tCurrent objective value: %s' | 207 '\n\tCurrent objective value: %s' |
| 176 '\n\tCurrent objective gradient: %s' | 208 '\n\tCurrent objective gradient: %s' |
| 177 '\n\tIterations: %d' | 209 '\n\tIterations: %d' |
| 178 '\n\tFunction evaluations: %d' | 210 '\n\tFunction evaluations: %d' |
| 179 '\n\tGradient evaluations: %d' | 211 '\n\tGradient evaluations: %d' |
| 180 % (result.message, result.fun, result.jac, result.nit, result.nfev, | 212 % (result.message, result.fun, result.jac, result.nit, result.nfev, |
| 181 result.njev)) | 213 result.njev)) |
| 182 | 214 |
| 183 # This shouldn't really be necessary, since we're resetting it | 215 # This shouldn't really be necessary, since we're resetting it |
| 184 # directly during training; but just to be safe/sure. | 216 # directly during training; but just to be safe/sure. |
| 185 self.weights = result.x | 217 self.weights = result.x |
| OLD | NEW |