| Index: appengine/findit/crash/loglinear/training.py
|
| diff --git a/appengine/findit/crash/loglinear/training.py b/appengine/findit/crash/loglinear/training.py
|
| index f01a4dd81c5dddf2c599c99d9697527a7a4a6924..78cc50c0cd3ed107ce3a866cd0aa9821dbceccd5 100644
|
| --- a/appengine/findit/crash/loglinear/training.py
|
| +++ b/appengine/findit/crash/loglinear/training.py
|
| @@ -2,21 +2,25 @@
|
| # Use of this source code is governed by a BSD-style license that can be
|
| # found in the LICENSE file.
|
|
|
| +from collections import OrderedDict
|
| import math
|
| import numpy as np
|
| # N.B., ``np.array`` can't take generators; you must pass explicit lists.
|
| import scipy.optimize as spo
|
|
|
| from crash.loglinear.model import LogLinearModel
|
| +from crash.loglinear.weight import MetaWeight
|
| +from crash.loglinear.weight import Weight
|
| +from libs.meta_dict_serializer import GetSerializer
|
| from libs.math.vectors import vsum
|
| # N.B., ``vsum`` can't take generators; you must pass explicit lists.
|
|
|
|
|
| class TrainableLogLinearModel(LogLinearModel):
|
| - """A loglinear model with some labelled data set for training the weights."""
|
| + """A loglinear model with labelled data set for training the meta_weight."""
|
|
|
| - def __init__(self, Y_given_X, training_data, feature_function,
|
| - initial_weights, epsilon=None):
|
| + def __init__(self, Y_given_X, training_data, meta_feature, meta_weight,
|
| + epsilon=None):
|
| """
|
| Args:
|
| Y_given_X: a function from ``X`` to an iterable object giving the
|
| @@ -29,17 +33,17 @@ class TrainableLogLinearModel(LogLinearModel):
|
| only the subsets for each ``x``.
|
| training_data (iterable): a collection of ``(x, y)`` pairs where
|
| ``y`` is the known-correct label for ``x``.
|
| - feature_function: A function from ``X`` to ``Y`` to a list of
|
| + meta_feature: A function from ``X`` to ``Y`` to a list of
|
| ``float``. N.B., the length of the list must be the same for all
|
| ``x`` and ``y``, and must be the same as the length of the list
|
| - of weights.
|
| - initial_weights (dict from str to float): the pre-training coefficients
|
| - for how much we believe components of the feature vector. This
|
| - provides the seed for training; this starting value shouldn't
|
| - affect the final weights obtained by training (thanks to
|
| + of meta_weight.
|
| + meta_weight (dict from str to (Vector)Weight): the pre-training
|
| + coefficients for how much we believe components of the feature vector.
|
| + This provides the seed for training; this starting value shouldn't
|
| + affect the final meta_weight obtained by training (thanks to
|
| convexity), but will affect how long it takes for training
|
| to converge.
|
| - N.B. The dict should not be sparse (only contains non-zero weights),
|
| + N.B. The dict should not be sparse (only contains non-zero meta_weight),
|
| because we only train those features whose names are keys in this dict.
|
| epsilon (float): The absolute-error threshold for considering a
|
| weight to be "equal to zero". N.B., this should be a positive
|
| @@ -47,82 +51,82 @@ class TrainableLogLinearModel(LogLinearModel):
|
| each weight.
|
| """
|
| super(TrainableLogLinearModel, self).__init__(
|
| - Y_given_X, feature_function, initial_weights, epsilon)
|
| + Y_given_X, meta_feature, meta_weight, epsilon)
|
| self._training_data = training_data
|
| - # Use self._weights instead of initialz_weights, since self._weights already
|
| - # filtered zero weights in the __init__ of superclass.
|
| - self._feature_order = self._weights.keys()
|
| - self._np_weights = self._DictToNumPyArray(self._weights)
|
| +    # Use self._meta_weight instead of initial_meta_weight,
|
| + # since self._meta_weight already filtered zero meta_weight in the __init__
|
| + # of superclass.
|
| + self._serializer = GetSerializer(meta_feature)
|
| + self._np_weight = self._MetaToNumPyArray(self.meta_weight)
|
| self._observed_feature_vector = vsum([
|
| self.FeaturesAsNumPyArray(x)(y)
|
| for x, y in self._training_data])
|
|
|
| @property
|
| - def np_weights(self):
|
| + def np_weight(self):
|
| """The NumPy Array of the weight covector."""
|
| - return self._np_weights
|
| + return self._np_weight
|
|
|
| - @np_weights.setter
|
| - def np_weights(self, new_np_weights): # pylint: disable=W0221
|
| + @np_weight.setter
|
| + def np_weight(self, new_np_weight): # pylint: disable=W0221
|
| """Mutate the weight covector, and clear memos as necessary.
|
|
|
| This setter attempts to avoid clearing memos whenever possible,
|
| but errs on the side of caution/correctness when it needs to.
|
| - This setter also drop all the zero weights in weight covector using
|
| +    This setter also drops all the zero meta_weight in weight covector using
|
| self._epsilon.
|
|
|
| Note, the conversion between dict and np array is needed because model uses
|
| - dict to organize weights of features, however SciPy trainning (e.g. BFGS)
|
| - needs numpy array to do computaion.
|
| +    dict to organize meta_weight of features, however SciPy training
|
| +    (e.g. BFGS) needs numpy array to do computation.
|
|
|
| Args:
|
| - new_np_weights (np.ndarray): the new weights to use. It will be converted
|
| - to weights dict mapping feature_name to its weight.
|
| + new_np_weight (np.ndarray): the new meta_weight to use. It will be
|
| + converted to meta_weight dict mapping feature_name to its weight.
|
| """
|
| - if np.array_equal(self._np_weights, new_np_weights):
|
| + if np.array_equal(self._np_weight, new_np_weight):
|
| return
|
|
|
| - if not isinstance(new_np_weights, np.ndarray):
|
| + if not isinstance(new_np_weight, np.ndarray):
|
| raise TypeError('Expected an np.ndarray but got %s instead' %
|
| - new_np_weights.__class__.__name__)
|
| + new_np_weight.__class__.__name__)
|
|
|
| - if new_np_weights.shape != self._np_weights.shape:
|
| + if new_np_weight.shape != self._np_weight.shape:
|
| raise TypeError('Weight shape mismatch: %s != %s' %
|
| - (new_np_weights.shape, self._np_weights.shape))
|
| + (new_np_weight.shape, self._np_weight.shape))
|
|
|
| - self._np_weights = np.array(filter(self.IsNonZeroWeight, new_np_weights))
|
| + self._np_weight = new_np_weight
|
| + self.meta_weight = self._NumPyArrayToMeta(self.np_weight)
|
| self.ClearWeightBasedMemos()
|
| - self._weights = self._NumPyArrayToDict(self._np_weights)
|
| - self._feature_order = self._weights.keys()
|
|
|
| - def _NumPyArrayToDict(self, np_weights):
|
| + def _NumPyArrayToMeta(self, np_weight):
|
| """Converts numpy array to dict (mapping feature name to weight).
|
|
|
| - Note, this conversion is needed because model uses weights dict to organize
|
| - weights for features, however SciPy trainning (e.g. BFGS) needs numpy array
|
| - to do computaion.
|
| + Note, this conversion is needed because model uses meta_weight dict to
|
| +    organize meta_weight for features, however SciPy training (e.g. BFGS) needs
|
| +    numpy array to do computation.
|
|
|
| Args:
|
| - np_weights (np.ndarray): Weights which have the same order of
|
| - self._feature_order. Note, feature np array should also be serialized by
|
| - the same order as self._feature_order to match.
|
| +      np_weight (np.ndarray): meta_weight which has the same order of
|
| +        self._ordered_feature_to_len. Note, feature np array should also be
|
| + serialized by the same order as self._ordered_feature_to_len to match.
|
|
|
| Returns:
|
| A dict mapping feature name to weight.
|
| """
|
| - return {feature_name: weight
|
| - for feature_name, weight in zip(self._feature_order, np_weights)}
|
| + return self._serializer.FromList(np_weight, meta_constructor=MetaWeight,
|
| + element_constructor=Weight)
|
|
|
| - def _DictToNumPyArray(self, weights, default=0.):
|
| + def _MetaToNumPyArray(self, meta_weight):
|
| """Converts dict (mapping feature name to weight) to numpy array."""
|
| - return np.array([weights.get(feature_name, default)
|
| - for feature_name in self._feature_order])
|
| + return np.array([weight.value
|
| + for weight in self._serializer.ToList(meta_weight)])
|
|
|
| def FeaturesAsNumPyArray(self, x):
|
| """A variant of ``Features`` which returns a ``np.ndarray``.
|
|
|
| - Note, the features np array should have the same order as in
|
| - self._feature_order to stay aligned with weights np array.
|
| +    Note, the features np array should have the same order as in
|
| + self._ordered_feature_to_len to stay aligned with meta_weight np array.
|
|
|
| For training we need to have the feature function return an
|
| ``np.ndarray(float)`` rather than the ``list(FeatureValue)`` used
|
| @@ -134,13 +138,7 @@ class TrainableLogLinearModel(LogLinearModel):
|
| ``np.ndarray`` objects. If that turns out to be a performance
|
| bottleneck, we can add the extra layer of memoization to avoid that.
|
| """
|
| - fx = self.Features(x)
|
| - def FeaturesAsNumPyArrayGivenX(y):
|
| - fxys = fx(y)
|
| - return np.array([fxys[feature_name].value
|
| - for feature_name in self._feature_order])
|
| -
|
| - return FeaturesAsNumPyArrayGivenX
|
| + return lambda y: np.array(self._serializer.ToList(self.Features(x)(y)))
|
|
|
| def LogLikelihood(self):
|
| """The conditional log-likelihood of the training data.
|
| @@ -151,7 +149,7 @@ class TrainableLogLinearModel(LogLinearModel):
|
| "likelihood" because it is thought of as a function of the weight
|
| covector, with the training data held fixed.
|
|
|
| - This is the ideal objective function for training the weights, as it
|
| + This is the ideal objective function for training the meta_weight, as it
|
| will give us the MLE weight covector for the training data. However,
|
| in practice, we want to do regularization to ensure we don't overfit
|
| the training data and to reduce classification time by ensuring that
|
| @@ -159,7 +157,7 @@ class TrainableLogLinearModel(LogLinearModel):
|
| will be the log-likelihood plus some penalty terms for regularization.
|
| """
|
| observed_zeta = math.fsum(self.LogZ(x) for x, _ in self._training_data)
|
| - observed_score = self.np_weights.dot(
|
| + observed_score = self.np_weight.dot(
|
| self._observed_feature_vector)
|
| return observed_score - observed_zeta
|
|
|
| @@ -178,29 +176,29 @@ class TrainableLogLinearModel(LogLinearModel):
|
| weight covectors far from zero.
|
|
|
| Returns:
|
| - Nothing, but has the side effect of mutating the stored weights.
|
| + Nothing, but has the side effect of mutating the stored meta_weight.
|
| """
|
| - initial_np_weights = self.np_weights
|
| + initial_np_weight = self.np_weight
|
|
|
| - # We want to minimize the number of times we reset the weights since
|
| + # We want to minimize the number of times we reset the meta_weight since
|
| # that clears our memos. One might think we could do that in the
|
| # between-iterations callback; but actually, in a single iteration,
|
| # BFGS calls the objective function and gradient more than once with
|
| - # different arguments; so, alas, we must reset the weights in both.
|
| - # This is why the ``weights`` setter tries to avoid clearing memos
|
| + # different arguments; so, alas, we must reset the meta_weight in both.
|
| + # This is why the ``meta_weight`` setter tries to avoid clearing memos
|
| # when possible.
|
|
|
| - def objective_function(new_np_weights):
|
| - self.np_weights = new_np_weights
|
| + def objective_function(new_np_weight):
|
| + self.np_weight = new_np_weight
|
| return -self.LogLikelihood() + 0.5 * l2_penalty * self.quadrance
|
|
|
| - def objective_function_gradient(new_np_weights):
|
| - self.np_weights = new_np_weights
|
| - return -self.LogLikelihoodGradient() + l2_penalty * self.np_weights
|
| + def objective_function_gradient(new_np_weight):
|
| + self.np_weight = new_np_weight
|
| + return -self.LogLikelihoodGradient() + l2_penalty * self.np_weight
|
|
|
| result = spo.minimize(
|
| objective_function,
|
| - initial_np_weights,
|
| + initial_np_weight,
|
| method='BFGS',
|
| jac=objective_function_gradient)
|
|
|
| @@ -220,4 +218,4 @@ class TrainableLogLinearModel(LogLinearModel):
|
|
|
| # This shouldn't really be necessary, since we're resetting it
|
| # directly during training; but just to be safe/sure.
|
| - self.np_weights = result.x
|
| + self.np_weight = result.x
|
|
|