Chromium Code Reviews| Index: appengine/findit/crash/loglinear/feature.py |
| diff --git a/appengine/findit/crash/loglinear/feature.py b/appengine/findit/crash/loglinear/feature.py |
| index 51a160a499cb78e65652e32621632368e290e651..f1652373580b6450d75083b6dbed894e5c78268c 100644 |
| --- a/appengine/findit/crash/loglinear/feature.py |
| +++ b/appengine/findit/crash/loglinear/feature.py |
| @@ -3,9 +3,13 @@ |
| # found in the LICENSE file. |
| from collections import namedtuple |
| +import logging |
| import math |
| import libs.math.logarithms as lmath |
| +from libs.math.vectors import vsum |
| +from libs.meta_object import Element |
| +from libs.meta_object import MetaDict |
| def LinearlyScaled(value, maximum): |
| @@ -79,9 +83,8 @@ class ChangedFile(namedtuple('ChangedFile', |
| % (self.__class__.__name__, self.name, self.blame_url, self.reasons)) |
| -class FeatureValue(namedtuple('FeatureValue', |
| - ['name', 'value', 'reason', 'changed_files'])): # pragma: no cover |
| - """The result of an individual feature. |
| +class FeatureValue(Element): # pragma: no cover |
| + """The result of a feature. |
| Attributes: |
| name (str): the name of the feature producing this value. |
| @@ -95,17 +98,162 @@ class FeatureValue(namedtuple('FeatureValue', |
| """ |
| __slots__ = () |
| - def __new__(cls, name, value, reason, changed_files): |
| - return super(cls, FeatureValue).__new__(cls, |
| - str(name), float(value), str(reason), changed_files) |
| + def __init__(self, name, value, reason, changed_files): |
| + self._value = float(value) |
| + self._name = name |
| + self._reason = reason |
| + self._changed_files = changed_files |
| + |
| + @property |
| + def name(self): |
| + return self._name |
| + |
| + @property |
| + def value(self): |
| + return self._value |
| + |
| + @property |
| + def reason(self): |
| + return self._reason |
| + |
| + @property |
| + def changed_files(self): |
| + return self._changed_files |
| def __str__(self): |
| return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)' |
| % (self.__class__.__name__, self.name, self.value, self.reason, |
| self.changed_files)) |
| + def __len__(self): |
|
chanli
2017/01/19 04:26:06
Is this necessary? Since Element has the same len?
Sharu Jiang
2017/01/19 23:49:52
Yes, the element can be a list of anything, but as
|
| + return 1 |
| + |
| + def __mul__(self, number): |
| + return self._value * float(number) |
| + |
| + __rmul__ = __mul__ |
| + |
| + def __add__(self, number): |
| + return self._value + float(number) |
| + |
| + __radd__ = __add__ |
| + |
| + def __float__(self): |
| + return self._value |
| + |
| + def __eq__(self, other): |
| + return (self.name == other.name and self._value == other._value and |
| + self.reason == other.reason and |
| + self.changed_files == other.changed_files) |
| + |
| + def __ne__(self, other): |
| + return not self.__eq__(other) |
| + |
| + |
| +class MetaFeatureValue(MetaDict): |
| + """The result of a meta feature which groups a list of ``FeatureValue``s. |
| + |
| + N.B. ``MetaFeatureValue`` must have more than one ``FeatureValue``. |
| + |
| + Attributes: |
| + |
| + """ |
| + def __init__(self, name, feature_values): |
| + """ |
| + Args: |
| + feature_values (dict of FeatureValue/MetaFeatureValue): |
| + All the sub features that this ``MetaFeatureValue`` contains. |
| + """ |
| + super(MetaFeatureValue, self).__init__(feature_values) |
| + self._name = name |
| + self._reason = None |
| + self._changed_files = None |
| + |
| + @property |
| + def name(self): |
| + return self._name |
| + |
| + @property |
| + def reason(self): |
| + """Collect and format a list of all ``FeatureValue.reason`` strings. |
| + |
| + Returns: |
| + A str of reasons, one per line, each formatted as a |
| + "feature_name: feature_value -- reason" triple, where the first string is |
| + the feature name, the float is some numeric representation of how |
| + much influence this feature exerts on the ``Suspect`` being blamed, |
| + and the final string is the ``FeatureValue.reason``. The list is |
| + sorted by feature name, just to ensure that it comes out in some |
| + canonical order. |
| + |
| + At present, the float is the log-domain score of the feature |
| + value. However, this isn't the best thing for UX reasons. In the |
| + future it might be replaced by the normal-domain score, or by |
| + the probability. |
| + """ |
| + if self._reason: |
| + return self._reason |
| + |
| + formatted_reasons = [] |
| + for feature in self.itervalues(): |
| + if feature.reason: |
| + formatted_reasons.append('%s: %f -- %s' % (feature.name, |
| + feature.value, |
| + feature.reason)) |
| + |
| + formatted_reasons.sort() |
| + self._reason = '\n'.join(formatted_reasons) |
| + return self._reason |
| + |
| + @property |
| + def changed_files(self): |
| + """Merge multiple ``FeatureValue.changed_files`` lists into one. |
| + |
| + Returns: |
| + A list of ``ChangedFile`` objects sorted by file name. The sorting |
| + is not essential, but is provided to ease testing by ensuring the |
| + output is in some canonical order. |
| + |
| + Raises: |
| + ``ValueError`` if any file name is given inconsistent ``blame_url``s. |
| + """ |
| + if self._changed_files: |
| + return self._changed_files |
| + |
| + all_changed_files = {} |
| + for feature in self.itervalues(): |
| + if not feature.changed_files: |
| + continue |
| + |
| + for changed_file in feature.changed_files or []: |
| + accumulated_changed_file = all_changed_files.get(changed_file.name) |
| + if accumulated_changed_file is None: |
| + all_changed_files[changed_file.name] = changed_file |
| + continue |
| + |
| + if (accumulated_changed_file.blame_url != |
| + changed_file.blame_url): # pragma: no cover |
| + raise ValueError('Blame URLs do not match: %s != %s' |
| + % (accumulated_changed_file.blame_url, changed_file.blame_url)) |
| + accumulated_changed_file.reasons.extend(changed_file.reasons or []) |
| -class Feature(object): |
| + self._changed_files = all_changed_files.values() |
| + self._changed_files.sort(key=lambda changed_file: changed_file.name) |
| + return self._changed_files |
| + |
| + def __len__(self): |
| + return len(self._value) |
| + |
| + def __eq__(self, other): |
| + return (self.name == other.name and self._value == other._value and |
| + self.reason == other.reason and |
| + self.changed_files == other.changed_files) |
| + |
| + def __ne__(self, other): |
| + return not self.__eq__(other) |
| + |
| + |
| +class Feature(Element): |
| """Abstract base class for features used by loglinear models.""" |
| @property |
| @@ -120,8 +268,8 @@ class Feature(object): |
| ``X`` and ``Y``, as described in the documentation there. As an |
| example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is |
| ``Suspect``. Given those two types, this method is a curried function |
| - of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of type |
| - ``X``, we return a function of type ``Y -> FeatureValue``, where |
| + of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of |
| + type ``X``, we return a function of type ``Y -> FeatureValue``, where |
| the final result for each ``y`` of type ``Y`` is the value of that |
| ``y`` given that ``x``. |
| @@ -140,25 +288,74 @@ class Feature(object): |
| raise NotImplementedError() |
| -class FeatureFunction(object): |
| +class MetaFeature(MetaDict): |
| + """Abstract base class for meta features used by loglinear models. |
| + |
| + MetaFeature is a dict of (Meta)Features. |
| + """ |
| + |
| + @property |
| + def name(self): |
| + """The name of this feature.""" |
| + raise NotImplementedError() |
| + |
| + def __call__(self, x): |
| + """Returns a value for a ``y`` given some ``x``. |
| + |
| + The loglinear model this feature is used in will specify some types |
| + ``X`` and ``Y``, as described in the documentation there. As an |
| + example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is |
| + ``Suspect``. Given those two types, this method is a curried function |
| + of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of |
| + type ``X``, we return a function of type ``Y -> FeatureValue``, where |
| + the final result for each ``y`` of type ``Y`` is the value of that |
| + ``y`` given that ``x``. |
| + |
| + Values closer to zero indicate this feature has less to say about |
| + whether the ``y`` is to be blamed. Values further from zero indicate |
| + that this feature has more to say about it. (Whether this feature |
| + thinks the ``y`` should be blamed or should not be depends on the sign |
| + of the value and the sign of the weight given to this feature.) As |
| + special cases, a value of negative infinity means "do not blame this |
| + ``y`` no matter what any other features say", and a value of positive |
| + infinity means "definitely blame this ``y`` no matter what any other |
| + features say". Both of those special values should be used sparingly, |
| + since they override the model's ability to combine multiple sources of |
| + information and decide the culprit based on all the evidence together. |
| + """ |
| + raise NotImplementedError() |
| + |
| + |
| +class WrapperMetaFeature(MetaFeature): |
| """Given a dict of scalar-valued functions, return a dict-valued function. |
| + Note, the features that get wrapped should be independent of each other, which |
| + means their feature values can be computed independently. |
| + |
| + Either wrap single Feature or wrap features whose final results are computed |
| + independently. |
| + |
| Properties: |
| - fs (iterable of functions): A collection of curried functions |
| - ``X -> Y -> FeatureValue``. That is, given a particular ``x`` they |
| - return a function ``Y -> dict(FeatureValue)``. N.B. each function should |
| - have a name property. |
| + fs (Feature or iterable of (Meta)Features): A collection of curried |
| + functions ``X -> Y -> (Meta)FeatureValue``. That is, given a particular |
| + ``x`` they return a function ``Y -> dict(FeatureValue)``. N.B. each function |
| + should have a name property. |
| """ |
| def __init__(self, fs): |
| - self._fs = fs |
| + super(WrapperMetaFeature, self).__init__({f.name: f for f in fs or []}) |
| + |
| + @property |
| + def name(self): |
| + return 'WrapperFeature' |
| def __call__(self, x): |
| """Function mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue)``. |
| Returns: |
| - A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for |
| + A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for |
| all ``x``, ``y``, and for a feature f in fs, we have |
| ``WrapperMetaFeature(fs)(x)(y)[f.name] == f(x)(y)``. |
| """ |
| - name_to_fx = {f.name: f(x) for f in self._fs} |
| - return lambda y: {name: fx(y) for name, fx in name_to_fx.iteritems()} |
| + fxs = {name: f(x) for name, f in self.iteritems()} |
| + return lambda y: MetaFeatureValue( |
| + self.name, {name: fx(y) for name, fx in fxs.iteritems()}) |