Chromium Code Reviews

Unified Diff: appengine/findit/crash/loglinear/training.py

Issue 2617273002: [Predator] Move ``SingleFeatureScore`` to LLM. (Closed)
Patch Set: Address comments. Created 3 years, 11 months ago
Index: appengine/findit/crash/loglinear/training.py
diff --git a/appengine/findit/crash/loglinear/training.py b/appengine/findit/crash/loglinear/training.py
index f094848713c5f024c0cd5d594615c9c33e169f43..152a42216dfb24dd4d4226d8b44e8ae4d999f71c 100644
--- a/appengine/findit/crash/loglinear/training.py
+++ b/appengine/findit/crash/loglinear/training.py
@@ -33,7 +33,7 @@ class TrainableLogLinearModel(LogLinearModel):
``float``. N.B., the length of the list must be the same for all
``x`` and ``y``, and must be the same as the length of the list
of weights.
- initial_weights (list of float): the pre-training coefficients
+ initial_weights (dict from str to float): the pre-training coefficients
for how much we believe components of the feature vector. This
provides the seed for training; this starting value shouldn't
affect the final weights obtained by training (thanks to
@@ -47,51 +47,48 @@ class TrainableLogLinearModel(LogLinearModel):
super(TrainableLogLinearModel, self).__init__(
Y_given_X, feature_function, initial_weights, epsilon)
self._training_data = training_data
-
+ self._feature_order = list(initial_weights.keys())
+ self._np_weights = None
wrengr 2017/01/12 19:09:10 I think it'd be clearer to set the _weights and _np_weights in the initializer.
Sharu Jiang 2017/01/13 01:08:35 Done.
self._observed_feature_vector = vsum([
self.FeaturesAsNumPyArray(x)(y)
for x, y in self._training_data])
- # Even though this is identical to the superclass definition, we must
- # re-provide it in order to define the setter.
+ self.np_weights = self.WeightsToNumPyArrayWeights(initial_weights)
+
@property
- def weights(self):
- """The weight covector.
+ def np_weights(self):
+ """The NumPy Array of weight covector.
wrengr 2017/01/12 19:09:10 "of weight covector" -> "of the weight covector"
Sharu Jiang 2017/01/13 01:08:35 Done.
At present we return the weights as an ``np.ndarray``, but in the
future that may be replaced by a more general type which specifies
the semantics rather than the implementation details.
wrengr 2017/01/12 19:09:10 This paragraph should be deleted, since the whole point of this property is to return an ``np.ndarray``.
Sharu Jiang 2017/01/13 01:08:35 Done.
"""
- return self._weights
+ return self._np_weights
- @weights.setter
- def weights(self, new_weights): # pylint: disable=W0221
- """Mutate the weight covector, and clear memos as necessary.
+ @np_weights.setter
+ def np_weights(self, new_np_weights): # pylint: disable=W0221
+ """Mutate the np array weight covector, and clear memos as necessary.
wrengr 2017/01/12 19:09:10 It's fine to leave off the "np array" part, since the ``np_weights`` name already says it.
Sharu Jiang 2017/01/13 01:08:35 Done.
This setter attempts to avoid clearing memos whenever possible,
but errs on the side of caution/correctness when it needs to.
Args:
- new_weights (np.ndarray): the new weights to use. Must have the
- same shape as the old ``np.ndarray``.
+ new_np_weights (np.ndarray): the new weights to use. It will be converted
+ to a weights dict mapping each feature name to its weight.
"""
- if new_weights is self._weights:
+ if np.array_equal(self._np_weights, new_np_weights):
return
- if not isinstance(new_weights, np.ndarray):
- raise TypeError('Expected an np.ndarray but got %s instead'
- % new_weights.__class__.__name__)
-
- if new_weights.shape != self._weights.shape:
- raise TypeError('Weight shape mismatch: %s != %s'
- % (new_weights.shape, self._weights.shape))
-
+ self._np_weights = new_np_weights
self.ClearWeightBasedMemos()
- self._weights = new_weights
+ self._weights = self.NumPyArrayWeightsToWeights(new_np_weights)
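For reference, a minimal standalone sketch of the setter pattern above, using a hypothetical model with a single weight-based memo; the ``np.array_equal`` check is what lets the setter skip clearing memos when the weights are unchanged:

    import numpy as np

    class TinyModel(object):

      def __init__(self, np_weights):
        self._np_weights = np_weights
        self._memo = None  # some cached weight-dependent computation

      @property
      def np_weights(self):
        return self._np_weights

      @np_weights.setter
      def np_weights(self, new_np_weights):
        # Avoid invalidating memos when the weights did not actually change.
        if np.array_equal(self._np_weights, new_np_weights):
          return
        self._np_weights = new_np_weights
        self._memo = None  # stands in for ClearWeightBasedMemos()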
def FeaturesAsNumPyArray(self, x):
"""A variant of ``Features`` which returns a ``np.ndarray``.
+ Note, the features nparray should have the same order as in
wrengr 2017/01/12 19:09:10 "nparray" -> "np array"
Sharu Jiang 2017/01/13 01:08:35 Done.
+ self._feature_order to stay aligned with weights np array.
+
For training we need to have the feature function return an
``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used
elsewhere. This function performs the necessary conversion.
@@ -103,7 +100,48 @@ class TrainableLogLinearModel(LogLinearModel):
bottleneck, we can add the extra layer of memoization to avoid that.
"""
fx = self.Features(x)
- return lambda y: np.array([fxy.value for fxy in fx(y)])
+ def FeaturesAsNumPyArrayGivenX(y):
+ fxys = fx(y)
+ return np.array([fxys[feature_name].value if feature_name in fxys else 0.
wrengr 2017/01/12 19:09:10 If feature_name is not in fxys then that should be an error.
Sharu Jiang 2017/01/13 01:08:35 Done.
+ for feature_name in self._feature_order])
+
+ return FeaturesAsNumPyArrayGivenX
+
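A sketch of the serialization ``FeaturesAsNumPyArrayGivenX`` performs as written above, with hypothetical feature names and plain floats standing in for ``FeatureValue`` objects; features are laid out in the fixed feature order so positions line up with the weights array:

    import numpy as np

    feature_order = ['TouchedFile', 'MinDistance', 'TopFrame']  # hypothetical
    fxys = {'TouchedFile': 1., 'MinDistance': 0.5}

    features = np.array([fxys.get(name, 0.) for name in feature_order])
    # -> array([1. , 0.5, 0. ]); missing features default to 0.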
+ def WeightsAsNumPyArray(self):
wrengr 2017/01/12 19:09:10 This is unnecessary. Callers should use the ``np_weights`` property instead.
Sharu Jiang 2017/01/13 01:08:35 Done.
+ """Returns converted numpy array version of weights.
+
+ Note, this conversion is needed because model uses weights dict to organize
+ weights for features, however SciPy trainning (e.g. BFGS) needs numpy array
+ to do computaion.
+ """
+ return self.np_weights
+
+ def NumPyArrayWeightsToWeights(self, np_weights):
wrengr 2017/01/12 19:09:10 This "weights to weights" name is confusing. Also,
Sharu Jiang 2017/01/13 01:08:35 Done.
+ """Converts numpy array to dict (mapping feature name to weight).
+
+ Note, this conversion is needed because the model uses a weights dict to
+ organize weights for features, whereas SciPy training (e.g. BFGS) needs a
+ numpy array to do its computation.
+
+ Args:
+ np_weights (np.ndarray): Weights in the same order as self._feature_order.
+ Note, the feature np array should also be serialized in the same order as
+ self._feature_order to match.
+
+ Returns:
+ A dict mapping feature name to weight.
+ """
+ if not isinstance(np_weights, np.ndarray):
+ raise TypeError('Expected an np.ndarray but got %s instead'
+ % np_weights.__class__.__name__)
+
+ return {feature_name: weight
+ for feature_name, weight in zip(self._feature_order, np_weights)}
+
+ def WeightsToNumPyArrayWeights(self, weights, default=0.):
wrengr 2017/01/12 19:09:10 again, the name is confusing and it should also be private.
Sharu Jiang 2017/01/13 01:08:35 Done.
+ """Converts dict (mapping feature name to weight) to numpy array."""
+ return np.array([weights.get(feature_name, default)
+ for feature_name in self._feature_order])
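A round-trip sketch of the two converters above, assuming a fixed feature order; the dict form is what the model stores, while the numpy array form is what SciPy consumes:

    import numpy as np

    feature_order = ['a', 'b', 'c']  # hypothetical feature names
    weights = {'a': 1.0, 'b': -2.0}  # 'c' is absent, so it defaults to 0.

    # dict -> np.ndarray, as in WeightsToNumPyArrayWeights.
    np_weights = np.array([weights.get(name, 0.) for name in feature_order])
    # -> array([ 1., -2.,  0.])

    # np.ndarray -> dict, as in NumPyArrayWeightsToWeights.
    round_tripped = dict(zip(feature_order, np_weights))
    # -> {'a': 1.0, 'b': -2.0, 'c': 0.0}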
def LogLikelihood(self):
"""The conditional log-likelihood of the training data.
@@ -122,7 +160,8 @@ class TrainableLogLinearModel(LogLinearModel):
will be the log-likelihood plus some penalty terms for regularization.
"""
observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data)
- observed_score = self.weights.dot(self._observed_feature_vector)
+ observed_score = self.WeightsAsNumPyArray().dot(
+ self._observed_feature_vector)
return observed_score - observed_zeta
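As a worked sketch of the quantity computed above: each training pair (x, y) contributes w.f(x, y) - log Z(x) to the conditional log-likelihood, where Z(x) sums exp(w.f(x, y')) over all candidate y'. A toy version with hypothetical feature vectors:

    import math

    import numpy as np

    weights = np.array([1.0, -0.5])
    # One training example: the observed feature vector, plus the feature
    # vectors of all candidate labels (used to compute log Z(x)).
    observed_fxy = np.array([1., 0.])
    candidate_fxys = [np.array([1., 0.]), np.array([0., 1.])]

    log_z = math.log(math.fsum(
        math.exp(weights.dot(fxy)) for fxy in candidate_fxys))
    log_likelihood = weights.dot(observed_fxy) - log_z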
def LogLikelihoodGradient(self):
@@ -142,7 +181,7 @@ class TrainableLogLinearModel(LogLinearModel):
Returns:
Nothing, but has the side effect of mutating the stored weights.
"""
- initial_weights = self.weights
+ initial_np_weights = self.WeightsAsNumPyArray()
# We want to minimize the number of times we reset the weights since
# that clears our memos. One might think we could do that in the
@@ -152,17 +191,17 @@ class TrainableLogLinearModel(LogLinearModel):
# This is why the ``weights`` setter tries to avoid clearing memos
# when possible.
- def objective_function(new_weights):
- self.weights = new_weights
+ def objective_function(new_np_weights):
+ self.np_weights = new_np_weights
return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance
- def objective_function_gradient(new_weights):
- self.weights = new_weights
- return -self.LogLikelihoodGradient() + l2_penalty * self.weights
+ def objective_function_gradient(new_np_weights):
+ self.np_weights = new_np_weights
+ return -self.LogLikelihoodGradient() + l2_penalty * self.np_weights
result = spo.minimize(
objective_function,
- initial_weights,
+ initial_np_weights,
method='BFGS',
jac=objective_function_gradient)
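A self-contained sketch of the optimization call above, with a toy quadratic standing in for the negative log-likelihood; it shows the regularized objective/gradient pair and the BFGS call with an analytic jacobian (``l2_penalty`` and ``target`` are hypothetical values):

    import numpy as np
    import scipy.optimize as spo

    l2_penalty = 0.1
    target = np.array([1.0, -2.0])  # optimum of the unpenalized objective

    def objective_function(w):
      # Stands in for -LogLikelihood() + 0.5 * l2_penalty * quadrance.
      return 0.5 * np.sum((w - target) ** 2) + 0.5 * l2_penalty * w.dot(w)

    def objective_function_gradient(w):
      # Stands in for -LogLikelihoodGradient() + l2_penalty * weights.
      return (w - target) + l2_penalty * w

    result = spo.minimize(
        objective_function,
        np.zeros_like(target),  # initial weights
        method='BFGS',
        jac=objective_function_gradient)
    # result.x is approximately target / (1 + l2_penalty).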
@@ -182,4 +221,4 @@ class TrainableLogLinearModel(LogLinearModel):
# This shouldn't really be necessary, since we're resetting it
# directly during training; but just to be safe/sure.
- self.weights = result.x
+ self.np_weights = result.x
