Chromium Code Reviews| Index: appengine/findit/crash/loglinear/model.py |
| diff --git a/appengine/findit/crash/loglinear/model.py b/appengine/findit/crash/loglinear/model.py |
| index b231e90e075534396bfbfbdccaefb5ecdf7ad1ad..5196df1a0ef79f82a0924538c1f1260838a36ffc 100644 |
| --- a/appengine/findit/crash/loglinear/model.py |
| +++ b/appengine/findit/crash/loglinear/model.py |
| @@ -17,24 +17,6 @@ from libs.math.vectors import vsum |
| EPSILON = 0.00001 |
| -def ToFeatureFunction(fs): |
| - """Given an array of scalar-valued functions, return an array-valued function. |
| - |
| - Args: |
| - fs (iterable): A collection of curried functions ``X -> Y -> A``. |
| - That is, given a particular ``x`` they return a function ``Y -> A``. |
| - |
| - Returns: |
| - A function ``X -> Y -> list(A)`` where for all ``x``, ``y``, and |
| - ``i`` we have that ``ToFeatureFunction(fs)(x)(y)[i] == fs[i](x)(y)``. |
| - """ |
| - def _FeatureFunction(x): |
| - fxs = [f(x) for f in fs] |
| - return lambda y: [fx(y) for fx in fxs] |
| - |
| - return _FeatureFunction |
| - |
| - |
| class UnnormalizedLogLinearModel(object): |
| """An unnormalized loglinear model. |
| @@ -67,9 +49,10 @@ class UnnormalizedLogLinearModel(object): |
| feature_function: A function ``X -> Y -> list(FeatureValue)``. N.B., |
| for all ``x`` and ``y`` the length of ``feature_function(x)(y)`` |
| must be the same as the length of ``weights``. |
| - weights (list of float): coefficients for how important we consider |
| - each component of the feature vector for deciding which ``y`` |
| - to blame. |
| + weights (dict of float): the weights for the features. The keys of |
| + the dictionary are the names of the features that the weights are |
| + for. We take this argument as a dict rather than as a list so that |
| + callers needn't worry about what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| weight to be "equal to zero". N.B., this should be a positive |
| number, as we will compare it against the absolute value of |
| @@ -77,9 +60,12 @@ class UnnormalizedLogLinearModel(object): |
| """ |
| if epsilon is None: |
| epsilon = EPSILON |
| - self._weights = np.array([ |
| - w if isinstance(w, float) and math.fabs(w) >= epsilon else 0. |
| - for w in weights]) |
| + |
| + self._weights = { |
| + name: weight if isinstance(weight, float) and |
| + math.fabs(weight) >= epsilon else 0. |
|
wrengr
2017/01/11 20:38:30
The ``if...`` should be moved to after the ``for..
Sharu Jiang
2017/01/12 01:41:38
Done.
|
| + for name, weight in weights.iteritems() |
| + } |
| self._quadrance = None |
| @@ -89,7 +75,7 @@ class UnnormalizedLogLinearModel(object): |
| This outer wrapping takes each ``x`` to a memoized instance of |
| ``_FeaturesGivenX``. That is, for each ``x`` we return a |
| - ``MemoizedFunction`` from ``Y`` to ``list(FeatureValue)``. |
| + ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``. |
| """ |
| fx = feature_function(x) |
| def _FeaturesGivenX(y): |
| @@ -117,9 +103,23 @@ class UnnormalizedLogLinearModel(object): |
| # more efficient way, we should. In particular, we will want to |
| # make the weights sparse, in which case we need to use a sparse |
| # variant of the dot product. |
| - self._scores = MemoizedFunction(lambda x: |
| - self._features(x).map(lambda fxy: |
| - self.weights.dot(np.array([feature.value for feature in fxy])))) |
| + self._scores = MemoizedFunction(lambda x: self._features(x).map( |
| + lambda fxy: sum(self.SingleFeatureScore(feature) |
|
wrengr
2017/01/11 20:38:30
Should use ``math.fsum`` whenever adding floats; n
Sharu Jiang
2017/01/12 01:41:38
Done.
|
| + for feature in fxy.itervalues()))) |
|
wrengr
2017/01/11 20:38:30
N.B., you're not taking advantage of the sparsity
Sharu Jiang
2017/01/12 01:41:38
That's because I didn't filter those 0 weights in
wrengr
2017/01/12 18:16:16
Yeah, the weights will change during training, but
|
| + |
| + def SingleFeatureScore(self, feature_value): |
| + """Returns the score (aka weighted value) of a ``FeatureValue``. |
| + |
| + This function assumes the report's stacktrace has already had any necessary |
| + preprocessing (like filtering or truncating) applied. |
| + |
| + Args: |
| + feature_value (FeatureValue): the feature value to check. |
| + |
| + Returns: |
| + The score of the feature value. |
| + """ |
| + return feature_value.value * self._weights.get(feature_value.name, 0.) |
| def ClearWeightBasedMemos(self): |
| """Clear all the memos that depend on the weight covector.""" |
| @@ -135,9 +135,9 @@ class UnnormalizedLogLinearModel(object): |
| def weights(self): |
| """The weight covector. |
| - At present we return the weights as an ``np.ndarray``, but in the |
| - future that may be replaced by a more general type which specifies |
| - the semantics rather than the implementation details. |
| + At present we return the weights as a dict mapping feature name to its |
| + weight, but in the future that may be replaced by a more general type which |
| + specifies the semantics rather than the implementation details. |
| """ |
| return self._weights |
| @@ -147,12 +147,12 @@ class UnnormalizedLogLinearModel(object): |
| N.B., despite being popularly called the "l0-norm", this isn't |
| actually a norm in the mathematical sense.""" |
| - return float(np.count_nonzero(self.weights)) |
| + return float(np.count_nonzero(self.weights.itervalues())) |
|
wrengr
2017/01/11 20:38:30
You can't use ``np.count_nonzero`` anymore since `
Sharu Jiang
2017/01/12 01:41:38
Since I haven't filtered 0 weights yet, I just use
wrengr
2017/01/12 18:16:16
Testing for exact equality with 0.0 isn't reliable
|
| @property |
| def l1(self): # pragma: no cover |
| """The l1 (aka: Manhattan) norm of the weight covector.""" |
| - return math.fsum(math.fabs(w) for w in self.weights) |
| + return math.fsum(math.fabs(w) for w in self.weights.itervalues()) |
| @property |
| def quadrance(self): |
| @@ -164,7 +164,8 @@ class UnnormalizedLogLinearModel(object): |
| the error introduced by squaring the square-root of an IEEE-754 float. |
| """ |
| if self._quadrance is None: |
| - self._quadrance = math.fsum(math.fabs(w)**2 for w in self.weights) |
| + self._quadrance = math.fsum( |
| + math.fabs(w)**2 for w in self.weights.itervalues()) |
| return self._quadrance |
| @@ -185,7 +186,7 @@ class UnnormalizedLogLinearModel(object): |
| x (X): the value of the independent variable. |
| Returns: |
| - A ``MemoizedFunction`` of type ``Y -> np.array(float)``. |
| + A ``MemoizedFunction`` of type ``Y -> dict(str to float)``. |
| """ |
| return self._features(x) |
| @@ -233,9 +234,10 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| feature_function: A function ``X -> Y -> list(float)``. N.B., |
| for all ``x`` and ``y`` the length of ``feature_function(x)(y)`` |
| must be the same as the length of ``weights``. |
| - weights (list of float): coefficients for how important we consider |
| - each component of the feature vector for deciding which ``y`` |
| - to blame. |
| + weights (dict of float): the weights for the features. The keys of |
| + the dictionary are the names of the features that the weights are |
| + for. We take this argument as a dict rather than as a list so that |
| + callers needn't worry about what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| weight to be "equal to zero". N.B., this should be a positive |
| number, as we will compare it against the absolute value of |