| OLD | NEW |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import OrderedDict |
| 5 import math | 6 import math |
| 6 import numpy as np | 7 import numpy as np |
| 7 # N.B., ``np.array`` can't take generators; you must pass explicit lists. | 8 # N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| 8 import scipy.optimize as spo | 9 import scipy.optimize as spo |
| 9 | 10 |
| 10 from crash.loglinear.model import LogLinearModel | 11 from crash.loglinear.model import LogLinearModel |
| 12 from crash.loglinear.weight import MetaWeight |
| 13 from crash.loglinear.weight import Weight |
| 14 from libs.meta_dict_serializer import GetSerializer |
| 11 from libs.math.vectors import vsum | 15 from libs.math.vectors import vsum |
| 12 # N.B., ``vsum`` can't take generators; you must pass explicit lists. | 16 # N.B., ``vsum`` can't take generators; you must pass explicit lists. |
| 13 | 17 |
| 14 | 18 |
| 15 class TrainableLogLinearModel(LogLinearModel): | 19 class TrainableLogLinearModel(LogLinearModel): |
| 16 """A loglinear model with some labelled data set for training the weights.""" | 20 """A loglinear model with labelled data set for training the meta_weight.""" |
| 17 | 21 |
| 18 def __init__(self, Y_given_X, training_data, feature_function, | 22 def __init__(self, Y_given_X, training_data, meta_feature, meta_weight, |
| 19 initial_weights, epsilon=None): | 23 epsilon=None): |
| 20 """ | 24 """ |
| 21 Args: | 25 Args: |
| 22 Y_given_X: a function from ``X`` to an iterable object giving the | 26 Y_given_X: a function from ``X`` to an iterable object giving the |
| 23 subset of ``Y`` which has non-zero probability given the | 27 subset of ``Y`` which has non-zero probability given the |
| 24 ``x``. When in doubt about whether some ``y`` has zero probability | 28 ``x``. When in doubt about whether some ``y`` has zero probability |
| 25 or not, it is always safe/correct to return a larger subset of | 29 or not, it is always safe/correct to return a larger subset of |
| 26 ``Y`` (it'll just take more computation time is all). This is | 30 ``Y`` (it'll just take more computation time is all). This is |
| 27 needed for computing the partition function and expectation. N.B., | 31 needed for computing the partition function and expectation. N.B., |
| 28 we do not actually need to know/enumerate of *all* of ``Y``, | 32 we do not actually need to know/enumerate of *all* of ``Y``, |
| 29 only the subsets for each ``x``. | 33 only the subsets for each ``x``. |
| 30 training_data (iterable): a collection of ``(x, y)`` pairs where | 34 training_data (iterable): a collection of ``(x, y)`` pairs where |
| 31 ``y`` is the known-correct label for ``x``. | 35 ``y`` is the known-correct label for ``x``. |
| 32 feature_function: A function from ``X`` to ``Y`` to a list of | 36 meta_feature: A function from ``X`` to ``Y`` to a list of |
| 33 ``float``. N.B., the length of the list must be the same for all | 37 ``float``. N.B., the length of the list must be the same for all |
| 34 ``x`` and ``y``, and must be the same as the length of the list | 38 ``x`` and ``y``, and must be the same as the length of the list |
| 35 of weights. | 39 of meta_weight. |
| 36 initial_weights (dict from str to float): the pre-training coefficients | 40 meta_weight (dict from str to (Vector)Weight): the pre-training |
| 37 for how much we believe components of the feature vector. This | 41 coefficients for how much we believe components of the feature vector. |
| 38 provides the seed for training; this starting value shouldn't | 42 This provides the seed for training; this starting value shouldn't |
| 39 affect the final weights obtained by training (thanks to | 43 affect the final meta_weight obtained by training (thanks to |
| 40 convexity), but will affect how long it takes for training | 44 convexity), but will affect how long it takes for training |
| 41 to converge. | 45 to converge. |
| 42 N.B. The dict should not be sparse (only contains non-zero weights), | 46 N.B. The dict should not be sparse (only contains non-zero meta_weight), |
| 43 because we only train those features whose names are keys in this dict. | 47 because we only train those features whose names are keys in this dict. |
| 44 epsilon (float): The absolute-error threshold for considering a | 48 epsilon (float): The absolute-error threshold for considering a |
| 45 weight to be "equal to zero". N.B., this should be a positive | 49 weight to be "equal to zero". N.B., this should be a positive |
| 46 number, as we will compare it against the absolute value of | 50 number, as we will compare it against the absolute value of |
| 47 each weight. | 51 each weight. |
| 48 """ | 52 """ |
| 49 super(TrainableLogLinearModel, self).__init__( | 53 super(TrainableLogLinearModel, self).__init__( |
| 50 Y_given_X, feature_function, initial_weights, epsilon) | 54 Y_given_X, meta_feature, meta_weight, epsilon) |
| 51 self._training_data = training_data | 55 self._training_data = training_data |
| 52 # Use self._weights instead of initialz_weights, since self._weights already | 56 # Use self._meta_weight instead of initial_meta_weight, |
| 53 # filtered zero weights in the __init__ of superclass. | 57 # since self._meta_weight already filtered zero meta_weight in the __init__ |
| 54 self._feature_order = self._weights.keys() | 58 # of superclass. |
| 55 self._np_weights = self._DictToNumPyArray(self._weights) | 59 self._serializer = GetSerializer(meta_feature) |
| 60 self._np_weight = self._MetaToNumPyArray(self.meta_weight) |
| 56 self._observed_feature_vector = vsum([ | 61 self._observed_feature_vector = vsum([ |
| 57 self.FeaturesAsNumPyArray(x)(y) | 62 self.FeaturesAsNumPyArray(x)(y) |
| 58 for x, y in self._training_data]) | 63 for x, y in self._training_data]) |
| 59 | 64 |
| 60 @property | 65 @property |
| 61 def np_weights(self): | 66 def np_weight(self): |
| 62 """The NumPy Array of the weight covector.""" | 67 """The NumPy Array of the weight covector.""" |
| 63 return self._np_weights | 68 return self._np_weight |
| 64 | 69 |
| 65 @np_weights.setter | 70 @np_weight.setter |
| 66 def np_weights(self, new_np_weights): # pylint: disable=W0221 | 71 def np_weight(self, new_np_weight): # pylint: disable=W0221 |
| 67 """Mutate the weight covector, and clear memos as necessary. | 72 """Mutate the weight covector, and clear memos as necessary. |
| 68 | 73 |
| 69 This setter attempts to avoid clearing memos whenever possible, | 74 This setter attempts to avoid clearing memos whenever possible, |
| 70 but errs on the side of caution/correctness when it needs to. | 75 but errs on the side of caution/correctness when it needs to. |
| 71 This setter also drop all the zero weights in weight covector using | 76 This setter also drop all the zero meta_weight in weight covector using |
| 72 self._epsilon. | 77 self._epsilon. |
| 73 | 78 |
| 74 Note, the conversion between dict and np array is needed because model uses | 79 Note, the conversion between dict and np array is needed because model uses |
| 75 dict to organize weights of features, however SciPy trainning (e.g. BFGS) | 80 dict to organize meta_weight of features, however SciPy training |
| 76 needs numpy array to do computaion. | 81 (e.g. BFGS) needs numpy array to do computation. |
| 77 | 82 |
| 78 Args: | 83 Args: |
| 79 new_np_weights (np.ndarray): the new weights to use. It will be converted | 84 new_np_weight (np.ndarray): the new meta_weight to use. It will be |
| 80 to weights dict mapping feature_name to its weight. | 85 converted to meta_weight dict mapping feature_name to its weight. |
| 81 """ | 86 """ |
| 82 if np.array_equal(self._np_weights, new_np_weights): | 87 if np.array_equal(self._np_weight, new_np_weight): |
| 83 return | 88 return |
| 84 | 89 |
| 85 if not isinstance(new_np_weights, np.ndarray): | 90 if not isinstance(new_np_weight, np.ndarray): |
| 86 raise TypeError('Expected an np.ndarray but got %s instead' % | 91 raise TypeError('Expected an np.ndarray but got %s instead' % |
| 87 new_np_weights.__class__.__name__) | 92 new_np_weight.__class__.__name__) |
| 88 | 93 |
| 89 if new_np_weights.shape != self._np_weights.shape: | 94 if new_np_weight.shape != self._np_weight.shape: |
| 90 raise TypeError('Weight shape mismatch: %s != %s' % | 95 raise TypeError('Weight shape mismatch: %s != %s' % |
| 91 (new_np_weights.shape, self._np_weights.shape)) | 96 (new_np_weight.shape, self._np_weight.shape)) |
| 92 | 97 |
| 93 self._np_weights = np.array(filter(self.IsNonZeroWeight, new_np_weights)) | 98 self._np_weight = new_np_weight |
| 99 self.meta_weight = self._NumPyArrayToMeta(self.np_weight) |
| 94 self.ClearWeightBasedMemos() | 100 self.ClearWeightBasedMemos() |
| 95 self._weights = self._NumPyArrayToDict(self._np_weights) | |
| 96 self._feature_order = self._weights.keys() | |
| 97 | 101 |
| 98 def _NumPyArrayToDict(self, np_weights): | 102 def _NumPyArrayToMeta(self, np_weight): |
| 99 """Converts numpy array to dict (mapping feature name to weight). | 103 """Converts numpy array to dict (mapping feature name to weight). |
| 100 | 104 |
| 101 Note, this conversion is needed because model uses weights dict to organize | 105 Note, this conversion is needed because model uses meta_weight dict to |
| 102 weights for features, however SciPy trainning (e.g. BFGS) needs numpy array | 106 organize meta_weight for features, however SciPy training (e.g. BFGS) needs |
| 103 to do computaion. | 107 numpy array to do computation. |
| 104 | 108 |
| 105 Args: | 109 Args: |
| 106 np_weights (np.ndarray): Weights which have the same order of | 110 np_weight (np.ndarray): meta_weight which have the same order of |
| 107 self._feature_order. Note, feature np array should also be serialized by | 111 self._ordered_feature_to_len. Note, feature np array should also be |
| 108 the same order as self._feature_order to match. | 112 serialized by the same order as self._ordered_feature_to_len to match. |
| 109 | 113 |
| 110 Returns: | 114 Returns: |
| 111 A dict mapping feature name to weight. | 115 A dict mapping feature name to weight. |
| 112 """ | 116 """ |
| 113 return {feature_name: weight | 117 return self._serializer.FromList(np_weight, meta_constructor=MetaWeight, |
| 114 for feature_name, weight in zip(self._feature_order, np_weights)} | 118 element_constructor=Weight) |
| 115 | 119 |
| 116 def _DictToNumPyArray(self, weights, default=0.): | 120 def _MetaToNumPyArray(self, meta_weight): |
| 117 """Converts dict (mapping feature name to weight) to numpy array.""" | 121 """Converts dict (mapping feature name to weight) to numpy array.""" |
| 118 return np.array([weights.get(feature_name, default) | 122 return np.array([weight.value |
| 119 for feature_name in self._feature_order]) | 123 for weight in self._serializer.ToList(meta_weight)]) |
| 120 | 124 |
| 121 def FeaturesAsNumPyArray(self, x): | 125 def FeaturesAsNumPyArray(self, x): |
| 122 """A variant of ``Features`` which returns a ``np.ndarray``. | 126 """A variant of ``Features`` which returns a ``np.ndarray``. |
| 123 | 127 |
| 124 Note, the features np array should have the same order as in | 128 Note, the features np array should have the same order as in |
| 125 self._feature_order to stay aligned with weights np array. | 129 self._ordered_feature_to_len to stay aligned with meta_weight np array. |
| 126 | 130 |
| 127 For training we need to have the feature function return an | 131 For training we need to have the feature function return an |
| 128 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used | 132 ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used |
| 129 elsewhere. This function performes the necessary conversion. | 133 elsewhere. This function performs the necessary conversion. |
| 130 | 134 |
| 131 N.B., at present we do not memoize this function. The underlying | 135 N.B., at present we do not memoize this function. The underlying |
| 132 ``Features`` method is memoized, so we won't re-compute the features | 136 ``Features`` method is memoized, so we won't re-compute the features |
| 133 each time; but we will repeatedly copy the floats into newly allocated | 137 each time; but we will repeatedly copy the floats into newly allocated |
| 134 ``np.ndarray`` objects. If that turns out to be a performance | 138 ``np.ndarray`` objects. If that turns out to be a performance |
| 135 bottleneck, we can add the extra layer of memoization to avoid that. | 139 bottleneck, we can add the extra layer of memoization to avoid that. |
| 136 """ | 140 """ |
| 137 fx = self.Features(x) | 141 return lambda y: np.array(self._serializer.ToList(self.Features(x)(y))) |
| 138 def FeaturesAsNumPyArrayGivenX(y): | |
| 139 fxys = fx(y) | |
| 140 return np.array([fxys[feature_name].value | |
| 141 for feature_name in self._feature_order]) | |
| 142 | |
| 143 return FeaturesAsNumPyArrayGivenX | |
| 144 | 142 |
| 145 def LogLikelihood(self): | 143 def LogLikelihood(self): |
| 146 """The conditional log-likelihood of the training data. | 144 """The conditional log-likelihood of the training data. |
| 147 | 145 |
| 148 The conditional likelihood of the training data is the product | 146 The conditional likelihood of the training data is the product |
| 149 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so | 147 of ``Pr(y|x)`` for each ``(x, y)`` pair in the training data; so |
| 150 the conditional log-likelihood is the log of that. This is called | 148 the conditional log-likelihood is the log of that. This is called |
| 151 "likelihood" because it is thought of as a function of the weight | 149 "likelihood" because it is thought of as a function of the weight |
| 152 covector, with the training data held fixed. | 150 covector, with the training data held fixed. |
| 153 | 151 |
| 154 This is the ideal objective function for training the weights, as it | 152 This is the ideal objective function for training the meta_weight, as it |
| 155 will give us the MLE weight covector for the training data. However, | 153 will give us the MLE weight covector for the training data. However, |
| 156 in practice, we want to do regularization to ensure we don't overfit | 154 in practice, we want to do regularization to ensure we don't overfit |
| 157 the training data and to reduce classification time by ensuring that | 155 the training data and to reduce classification time by ensuring that |
| 158 the weight vector is sparse. Thus, the actual objective function | 156 the weight vector is sparse. Thus, the actual objective function |
| 159 will be the log-likelihood plus some penalty terms for regularization. | 157 will be the log-likelihood plus some penalty terms for regularization. |
| 160 """ | 158 """ |
| 161 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) | 159 observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data) |
| 162 observed_score = self.np_weights.dot( | 160 observed_score = self.np_weight.dot( |
| 163 self._observed_feature_vector) | 161 self._observed_feature_vector) |
| 164 return observed_score - observed_zeta | 162 return observed_score - observed_zeta |
| 165 | 163 |
| 166 def LogLikelihoodGradient(self): | 164 def LogLikelihoodGradient(self): |
| 167 """The gradient (aka Jacobian) of ``LogLikelihood``.""" | 165 """The gradient (aka Jacobian) of ``LogLikelihood``.""" |
| 168 expected_feature_vector = vsum([ | 166 expected_feature_vector = vsum([ |
| 169 self.Expectation(x, self.FeaturesAsNumPyArray(x)) | 167 self.Expectation(x, self.FeaturesAsNumPyArray(x)) |
| 170 for x, _ in self._training_data]) | 168 for x, _ in self._training_data]) |
| 171 return self._observed_feature_vector - expected_feature_vector | 169 return self._observed_feature_vector - expected_feature_vector |
| 172 | 170 |
| 173 def TrainWeights(self, l2_penalty): | 171 def TrainWeights(self, l2_penalty): |
| 174 """Optimize the weight covector based on the training data. | 172 """Optimize the weight covector based on the training data. |
| 175 | 173 |
| 176 Args: | 174 Args: |
| 177 l2_penalty (float): the hyperparameter for how much to penalize | 175 l2_penalty (float): the hyperparameter for how much to penalize |
| 178 weight covectors far from zero. | 176 weight covectors far from zero. |
| 179 | 177 |
| 180 Returns: | 178 Returns: |
| 181 Nothing, but has the side effect of mutating the stored weights. | 179 Nothing, but has the side effect of mutating the stored meta_weight. |
| 182 """ | 180 """ |
| 183 initial_np_weights = self.np_weights | 181 initial_np_weight = self.np_weight |
| 184 | 182 |
| 185 # We want to minimize the number of times we reset the weights since | 183 # We want to minimize the number of times we reset the meta_weight since |
| 186 # that clears our memos. One might think we could do that in the | 184 # that clears our memos. One might think we could do that in the |
| 187 # between-iterations callback; but actually, in a single iteration, | 185 # between-iterations callback; but actually, in a single iteration, |
| 188 # BFGS calls the objective function and gradient more than once with | 186 # BFGS calls the objective function and gradient more than once with |
| 189 # different arguments; so, alas, we must reset the weights in both. | 187 # different arguments; so, alas, we must reset the meta_weight in both. |
| 190 # This is why the ``weights`` setter tries to avoid clearing memos | 188 # This is why the ``meta_weight`` setter tries to avoid clearing memos |
| 191 # when possible. | 189 # when possible. |
| 192 | 190 |
| 193 def objective_function(new_np_weights): | 191 def objective_function(new_np_weight): |
| 194 self.np_weights = new_np_weights | 192 self.np_weight = new_np_weight |
| 195 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance | 193 return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance |
| 196 | 194 |
| 197 def objective_function_gradient(new_np_weights): | 195 def objective_function_gradient(new_np_weight): |
| 198 self.np_weights = new_np_weights | 196 self.np_weight = new_np_weight |
| 199 return -self.LogLikelihoodGradient() + l2_penalty * self.np_weights | 197 return -self.LogLikelihoodGradient() + l2_penalty * self.np_weight |
| 200 | 198 |
| 201 result = spo.minimize( | 199 result = spo.minimize( |
| 202 objective_function, | 200 objective_function, |
| 203 initial_np_weights, | 201 initial_np_weight, |
| 204 method='BFGS', | 202 method='BFGS', |
| 205 jac=objective_function_gradient) | 203 jac=objective_function_gradient) |
| 206 | 204 |
| 207 if not result.success: # pragma: no cover | 205 if not result.success: # pragma: no cover |
| 208 # This should happen infrequently enough that there's no point in | 206 # This should happen infrequently enough that there's no point in |
| 209 # logging it and attempting to carry on. | 207 # logging it and attempting to carry on. |
| 210 raise Exception( | 208 raise Exception( |
| 211 'TrainableLogLinearModel.TrainWeights failed:' | 209 'TrainableLogLinearModel.TrainWeights failed:' |
| 212 '\n\tReason: %s' | 210 '\n\tReason: %s' |
| 213 '\n\tCurrent objective value: %s' | 211 '\n\tCurrent objective value: %s' |
| 214 '\n\tCurrent objective gradient: %s' | 212 '\n\tCurrent objective gradient: %s' |
| 215 '\n\tIterations: %d' | 213 '\n\tIterations: %d' |
| 216 '\n\tFunction evaluations: %d' | 214 '\n\tFunction evaluations: %d' |
| 217 '\n\tGradient evaluations: %d' | 215 '\n\tGradient evaluations: %d' |
| 218 % (result.message, result.fun, result.jac, result.nit, result.nfev, | 216 % (result.message, result.fun, result.jac, result.nit, result.nfev, |
| 219 result.njev)) | 217 result.njev)) |
| 220 | 218 |
| 221 # This shouldn't really be necessary, since we're resetting it | 219 # This shouldn't really be necessary, since we're resetting it |
| 222 # directly during training; but just to be safe/sure. | 220 # directly during training; but just to be safe/sure. |
| 223 self.np_weights = result.x | 221 self.np_weight = result.x |
| OLD | NEW |