Chromium Code Reviews| Index: appengine/findit/crash/loglinear/model.py |
| diff --git a/appengine/findit/crash/loglinear/model.py b/appengine/findit/crash/loglinear/model.py |
| index c26de5d3b2584feaac1ccc90619553df0f7ad020..e9ae7103217811539e428fc9473fe2c164e543eb 100644 |
| --- a/appengine/findit/crash/loglinear/model.py |
| +++ b/appengine/findit/crash/loglinear/model.py |
| @@ -43,15 +43,19 @@ class UnnormalizedLogLinearModel(object): |
| rather than returning a probability per se. |
| """ |
| - def __init__(self, feature_function, weights, epsilon=None): |
| - """Construct a new model with the given weights and feature function. |
| + def __init__(self, meta_feature, meta_weight, epsilon=None): |
| + """Construct a new model with the meta_feature and meta_weight. |
| Args: |
| - feature_function: A function ``X -> Y -> list(FeatureValue)``. N.B., |
| - for all ``x`` and ``y`` the length of ``feature_function(x)(y)`` |
| + wrapped_feature: A function ``X -> Y -> list(FeatureValue)``. N.B., |
|
chanli
2017/01/19 04:26:06
Not in argument list above?
Sharu Jiang
2017/01/19 23:49:52
Oops, should have deleted it.
|
| + for all ``x`` and ``y`` the length of ``wrapped_feature(x)(y)`` |
| must be the same as the length of ``weights``. |
| - weights (dict of float): the weights for the features. The keys of |
| - the dictionary are the names of the feature that weight is |
| + meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``. |
| + N.B., for all ``x`` and ``y`` the length of ``wrapped_feature(x)(y)`` |
| + must be the same as the length of ``weights``. |
| + All features. |
| + meta_weight (MetaWeight): All weights. the weights for the features. |
| + The keys of the dictionary are the names of the feature that weight is |
| for. We take this argument as a dict rather than as a list so that |
| callers needn't worry about what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| @@ -64,84 +68,33 @@ class UnnormalizedLogLinearModel(object): |
| else: |
| self._epsilon = epsilon |
|
chanli
2017/01/19 04:26:06
This if else can be simplified to
self._epsilon
lijeffrey
2017/01/19 15:23:35
even simpler is self._epsilon = epsilon or EPSILON
Sharu Jiang
2017/01/19 23:49:52
`if epsilon` would be False if epsilon equals zero
|
| - # TODO(crbug.com/680207) Filter zero weights, use sparse representaion of |
| - # weight covector. |
| - self._weights = { |
| - name: weight for name, weight in weights.iteritems() |
| - if self.IsNonZeroWeight(weight) |
| - } |
| + self._meta_weight = meta_weight |
| + self._meta_weight.DropZeroWeights(self._epsilon) |
| self._quadrance = None |
| - |
| # TODO(crbug.com/674752): we need better names for ``self._features``. |
| def _Features(x): |
| - """Wrap ``feature_function`` to memoize things and ensure types. |
| + """Wrap ``wrapped_feature`` to memoize things and ensure types. |
| This outer wrapping takes each ``x`` to a memoized instance of |
| ``_FeaturesGivenX``. That is, for each ``x`` we return a |
| ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``. |
| """ |
| - fx = feature_function(x) |
| - def _FeaturesGivenX(y): |
| - """Wrap ``feature_function(x)`` to ensure appropriate types. |
| - |
| - This inner wrapper ensures that the resulting ``FeatureValue`` |
| - array has the same length as the weight covector. |
| - """ |
| - fxy = fx(y) |
| - # N.B., we're assuming that ``len(self.weights)`` is O(1). |
| - assert len(fxy) == len(self.weights), TypeError( |
| - "vector length mismatch: %d != %d" % (len(fxy), len(self.weights))) |
| - return fxy |
| - |
| # Memoize on ``Y``, to ensure we don't need to recompute |
| # ``FeatureValue``s nor recheck the lengths. |
| - return MemoizedFunction(_FeaturesGivenX) |
| + return MemoizedFunction(meta_feature(x)) |
| # Memoize on ``X``, to ensure we share the memo tables on ``Y``. |
| self._features = MemoizedFunction(_Features) |
| # TODO(crbug.com/674752): we need better names for ``self._scores``. |
| - # N.B., this is just the inner product of ``self.weights`` |
| + # N.B., this is just the inner product of ``self._meta_weight`` |
| # against ``self._features(x)``. If we can compute this in some |
| # more efficient way, we should. In particular, we will want to |
| # make the weights sparse, in which case we need to use a sparse |
| # variant of the dot product. |
| self._scores = MemoizedFunction(lambda x: self._features(x).map( |
| - lambda fxy: math.fsum(self.SingleFeatureScore(feature) |
| - for feature in fxy.itervalues()))) |
| - |
| - def IsNonZeroWeight(self, weight): |
| - return isinstance(weight, float) and math.fabs(weight) >= self._epsilon |
| - |
| - def SingleFeatureScore(self, feature_value): |
| - """Returns the score (aka weighted value) of a ``FeatureValue``. |
| - |
| - Args: |
| - feature_value (FeatureValue): the feature value to check. |
| - |
| - Returns: |
| - The score of the feature value. |
| - """ |
| - return feature_value.value * self._weights.get(feature_value.name, 0.) |
| - |
| - # TODO(crbug.com/673964): something better for detecting "close to log(0)". |
| - def LogZeroish(self, x): |
| - """Determine whether a float is close enough to log(0). |
| - |
| - If a ``FeatureValue`` has a (log-domain) score of -inf for a given |
| - ``Suspect``, then that suspect has zero probability of being the |
| - culprit. We want to filter these suspects out, to clean up the |
| - output of classification; so this method encapsulates the logic of |
| - that check. |
| - |
| - Args: |
| - x (float): the float to check |
| - |
| - Returns: |
| - ``True`` if ``x`` is close enough to log(0); else ``False``. |
| - """ |
| - return x < 0 and math.isinf(x) |
| + lambda fxy: self._meta_weight * fxy)) |
| def ClearWeightBasedMemos(self): |
| """Clear all the memos that depend on the weight covector.""" |
| @@ -154,14 +107,19 @@ class UnnormalizedLogLinearModel(object): |
| self._features.ClearMemos() |
| @property |
| - def weights(self): |
| + def meta_weight(self): |
| """The weight covector. |
| At present we return the weights as an dict mapping feature name to its |
| weight, but in the future that may be replaced by a more general type which |
| specifies the semantics rather than the implementation details. |
| """ |
| - return self._weights |
| + return self._meta_weight |
| + |
| + @meta_weight.setter |
| + def meta_weight(self, new_meta_weight): |
| + self._meta_weight = new_meta_weight |
| + self._meta_weight.DropZeroWeights(self._epsilon) |
| @property |
| def l0(self): # pragma: no cover |
| @@ -169,12 +127,12 @@ class UnnormalizedLogLinearModel(object): |
| N.B., despite being popularly called the "l0-norm", this isn't |
| actually a norm in the mathematical sense.""" |
| - return float(len(self.weights) - self.weights.values().count(0.)) |
| + return self._meta_weight.l0 |
| @property |
| def l1(self): # pragma: no cover |
| """The l1 (aka: Manhattan) norm of the weight covector.""" |
| - return math.fsum(math.fabs(w) for w in self.weights.itervalues()) |
| + return self._meta_weight.l1 |
| @property |
| def quadrance(self): |
| @@ -185,11 +143,7 @@ class UnnormalizedLogLinearModel(object): |
| as its own quantity in many places. Also, computing it directly avoids |
| the error introduced by squaring the square-root of an IEEE-754 float. |
| """ |
| - if self._quadrance is None: |
| - self._quadrance = math.fsum( |
| - math.fabs(w)**2 for w in self.weights.itervalues()) |
| - |
| - return self._quadrance |
| + return self._meta_weight.quadrance |
| @property |
| def l2(self): |
| @@ -201,6 +155,24 @@ class UnnormalizedLogLinearModel(object): |
| """ |
| return math.sqrt(self.quadrance) |
| + # TODO(crbug.com/673964): something better for detecting "close to log(0)". |
| + def LogZeroish(self, x): |
| + """Determine whether a float is close enough to log(0). |
| + |
| + If a ``FeatureValue`` has a (log-domain) score of -inf for a given |
| + ``Suspect``, then that suspect has zero probability of being the |
| + culprit. We want to filter these suspects out, to clean up the |
| + output of classification; so this method encapsulates the logic of |
| + that check. |
| + |
| + Args: |
| + x (float): the float to check |
| + |
| + Returns: |
| + ``True`` if ``x`` is close enough to log(0); else ``False``. |
| + """ |
| + return x < 0 and math.isinf(x) |
| + |
| def Features(self, x): |
| """Returns a function mapping ``y`` to its feature vector given ``x``. |
| @@ -233,77 +205,6 @@ class UnnormalizedLogLinearModel(object): |
| """ |
| return self._scores(x) |
| - def FormatReasons(self, features): |
| - """Collect and format a list of all ``FeatureValue.reason`` strings. |
| - |
| - Args: |
| - features (iterable of FeatureValue): the values whose ``reason`` |
| - strings should be collected. |
| - |
| - Returns: |
| - A list of ``(str, float, str)`` triples; where the first string is |
| - the feature name, the float is some numeric representation of how |
| - much influence this feature exerts on the ``Suspect`` being blamed, |
| - and the final string is the ``FeatureValue.reason``. The list is |
| - sorted by feature name, just to ensure that it comes out in some |
| - canonical order. |
| - |
| - At present, the float is the log-domain score of the feature |
| - value. However, this isn't the best thing for UX reasons. In the |
| - future it might be replaced by the normal-domain score, or by |
| - the probability. |
| - """ |
| - formatted_reasons = [] |
| - for feature in features: |
| - feature_score = self.SingleFeatureScore(feature) |
| - if self.LogZeroish(feature_score): # pragma: no cover |
| - logging.debug('Discarding reasons from feature %s' |
| - ' because it has zero probability' % feature.name) |
| - continue |
| - |
| - formatted_reasons.append((feature.name, feature_score, feature.reason)) |
| - |
| - formatted_reasons.sort(key=lambda formatted_reason: formatted_reason[0]) |
| - return formatted_reasons |
| - |
| - def AggregateChangedFiles(self, features): |
| - """Merge multiple``FeatureValue.changed_files`` lists into one. |
| - |
| - Args: |
| - features (iterable of FeatureValue): the values whose ``changed_files`` |
| - lists should be aggregated. |
| - |
| - Returns: |
| - A list of ``ChangedFile`` objects sorted by file name. The sorting |
| - is not essential, but is provided to ease testing by ensuring the |
| - output is in some canonical order. |
| - |
| - Raises: |
| - ``ValueError`` if any file name is given inconsistent ``blame_url``s. |
| - """ |
| - all_changed_files = {} |
| - for feature in features: |
| - if self.LogZeroish(self.SingleFeatureScore(feature)): # pragma: no cover |
| - logging.debug('Discarding changed files from feature %s' |
| - ' because it has zero probability' % feature.name) |
| - continue |
| - |
| - for changed_file in feature.changed_files or []: |
| - accumulated_changed_file = all_changed_files.get(changed_file.name) |
| - if accumulated_changed_file is None: |
| - all_changed_files[changed_file.name] = changed_file |
| - continue |
| - |
| - if (accumulated_changed_file.blame_url != |
| - changed_file.blame_url): # pragma: no cover |
| - raise ValueError('Blame URLs do not match: %s != %s' |
| - % (accumulated_changed_file.blame_url, changed_file.blame_url)) |
| - accumulated_changed_file.reasons.extend(changed_file.reasons or []) |
| - |
| - changed_files = all_changed_files.values() |
| - changed_files.sort(key=lambda changed_file: changed_file.name) |
| - return changed_files |
| - |
| class LogLinearModel(UnnormalizedLogLinearModel): |
| """A loglinear probability model. |
| @@ -312,7 +213,7 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| we can provide probabilities (not just scores). However, to do so we |
| require a specification of the subsets of ``Y`` for each ``x``. |
| """ |
| - def __init__(self, Y_given_X, feature_function, weights, epsilon=None): |
| + def __init__(self, Y_given_X, meta_feature, meta_weight, epsilon=None): |
| """Construct a new probabilistic model. |
| Args: |
| @@ -324,11 +225,12 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| needed for computing the partition function and expectation. N.B., |
| we do not actually need to know/enumerate of *all* of ``Y``, |
| only the subsets for each ``x``. |
| - feature_function: A function ``X -> Y -> list(float)``. N.B., |
| - for all ``x`` and ``y`` the length of ``feature_function(x)(y)`` |
| + meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``. |
| + N.B., for all ``x`` and ``y`` the length of ``wrapped_feature(x)(y)`` |
| must be the same as the length of ``weights``. |
| - weights (dict of float): the weights for the features. The keys of |
| - the dictionary are the names of the feature that weight is |
| + All features. |
| + meta_weight (MetaWeight): All weights. the weights for the features. |
| + The keys of the dictionary are the names of the feature that weight is |
| for. We take this argument as a dict rather than as a list so that |
| callers needn't worry about what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| @@ -336,7 +238,7 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| number, as we will compare it against the absolute value of |
| each weight. |
| """ |
| - super(LogLinearModel, self).__init__(feature_function, weights, epsilon) |
| + super(LogLinearModel, self).__init__(meta_feature, meta_weight, epsilon) |
| self._Y = Y_given_X |
| @@ -379,7 +281,7 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| def Probability(self, x): |
| """The normal-domain distribution over ``y`` given ``x``. |
| - That is, ``self.Probability(x)(y)`` returns ``p(y | x; self.weights)`` |
| + That is, ``self.Probability(x)(y)`` returns ``p(y | x; self._meta_weight)`` |
| which is the model's estimation of ``Pr(y|x)``. |
| If you need the log-probability, don't use this method. Instead, |
| @@ -420,4 +322,3 @@ class LogLinearModel(UnnormalizedLogLinearModel): |
| # method polymorphic in the return type of ``f`` then we'll need an |
| # API that provides both scaling and ``vsum``. |
| return vsum([prob_given_x(y) * f(y) for y in self._Y(x)]) |
| - |