| Index: appengine/findit/crash/loglinear/feature.py
|
| diff --git a/appengine/findit/crash/loglinear/feature.py b/appengine/findit/crash/loglinear/feature.py
|
| index 4633c9f1869866e3e76dff39fcf75fdf05609a90..f1652373580b6450d75083b6dbed894e5c78268c 100644
|
| --- a/appengine/findit/crash/loglinear/feature.py
|
| +++ b/appengine/findit/crash/loglinear/feature.py
|
| @@ -3,9 +3,13 @@
|
| # found in the LICENSE file.
|
|
|
| from collections import namedtuple
|
| +import logging
|
| import math
|
|
|
| import libs.math.logarithms as lmath
|
| +from libs.math.vectors import vsum
|
| +from libs.meta_object import Element
|
| +from libs.meta_object import MetaDict
|
|
|
|
|
| def LinearlyScaled(value, maximum):
|
| @@ -79,9 +83,8 @@ class ChangedFile(namedtuple('ChangedFile',
|
| % (self.__class__.__name__, self.name, self.blame_url, self.reasons))
|
|
|
|
|
| -class FeatureValue(namedtuple('FeatureValue',
|
| - ['name', 'value', 'reason', 'changed_files'])): # pragma: no cover
|
| - """The result of an individual feature.
|
| +class FeatureValue(Element): # pragma: no cover
|
| + """The result of a feature.
|
|
|
| Attributes:
|
| name (str): the name of the feature producing this value.
|
| @@ -95,17 +98,162 @@ class FeatureValue(namedtuple('FeatureValue',
|
| """
|
| __slots__ = ()
|
|
|
| - def __new__(cls, name, value, reason, changed_files):
|
| - return super(cls, FeatureValue).__new__(cls,
|
| - str(name), float(value), str(reason), changed_files)
|
| + def __init__(self, name, value, reason, changed_files):
|
| + self._value = float(value)
|
| + self._name = name
|
| + self._reason = reason
|
| + self._changed_files = changed_files
|
| +
|
| + @property
|
| + def name(self):
|
| + return self._name
|
| +
|
| + @property
|
| + def value(self):
|
| + return self._value
|
| +
|
| + @property
|
| + def reason(self):
|
| + return self._reason
|
| +
|
| + @property
|
| + def changed_files(self):
|
| + return self._changed_files
|
|
|
| def __str__(self):
|
| return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)'
|
| % (self.__class__.__name__, self.name, self.value, self.reason,
|
| self.changed_files))
|
|
|
| + def __len__(self):
|
| + return 1
|
| +
|
| + def __mul__(self, number):
|
| + return self._value * float(number)
|
| +
|
| + __rmul__ = __mul__
|
| +
|
| + def __add__(self, number):
|
| + return self._value + float(number)
|
| +
|
| + __radd__ = __add__
|
| +
|
| + def __float__(self):
|
| + return self._value
|
| +
|
| + def __eq__(self, other):
|
| + return (self.name == other.name and self._value == other._value and
|
| + self.reason == other.reason and
|
| + self.changed_files == other.changed_files)
|
| +
|
| + def __ne__(self, other):
|
| + return not self.__eq__(other)
|
| +
|
| +
|
| +class MetaFeatureValue(MetaDict):
|
| + """The result of a meta feature which groups a list of ``FeatureValue``s.
|
| +
|
| + N.B. ``MetaFeatureValue`` must have more than one ``FeatureValue``.
|
| +
|
| + Attributes:
|
| +
|
| + """
|
| + def __init__(self, name, feature_values):
|
| + """
|
| + Args:
|
| + feature_values (dict of FeatureValue/MetaFeatureValue):
|
| + All the sub features that this ``MetaFeatureValue`` contains.
|
| + """
|
| + super(MetaFeatureValue, self).__init__(feature_values)
|
| + self._name = name
|
| + self._reason = None
|
| + self._changed_files = None
|
| +
|
| + @property
|
| + def name(self):
|
| + return self._name
|
| +
|
| + @property
|
| + def reason(self):
|
| + """Collect and format a list of all ``FeatureValue.reason`` strings.
|
| +
|
| + Returns:
|
| + A str of reasons, one per line, each formatted as
|
| + "feature_name: feature_value -- reason"; where the first string is
|
| + the feature name, the float is some numeric representation of how
|
| + much influence this feature exerts on the ``Suspect`` being blamed,
|
| + and the final string is the ``FeatureValue.reason``. The list is
|
| + sorted by feature name, just to ensure that it comes out in some
|
| + canonical order.
|
| +
|
| + At present, the float is the log-domain score of the feature
|
| + value. However, this isn't the best thing for UX reasons. In the
|
| + future it might be replaced by the normal-domain score, or by
|
| + the probability.
|
| + """
|
| + if self._reason:
|
| + return self._reason
|
| +
|
| + formatted_reasons = []
|
| + for feature in self.itervalues():
|
| + if feature.reason:
|
| + formatted_reasons.append('%s: %f -- %s' % (feature.name,
|
| + feature.value,
|
| + feature.reason))
|
| +
|
| + formatted_reasons.sort()
|
| + self._reason = '\n'.join(formatted_reasons)
|
| + return self._reason
|
| +
|
| + @property
|
| + def changed_files(self):
|
| + """Merge multiple ``FeatureValue.changed_files`` lists into one.
|
| +
|
| + Returns:
|
| + A list of ``ChangedFile`` objects sorted by file name. The sorting
|
| + is not essential, but is provided to ease testing by ensuring the
|
| + output is in some canonical order.
|
| +
|
| + Raises:
|
| + ``ValueError`` if any file name is given inconsistent ``blame_url``s.
|
| + """
|
| + if self._changed_files:
|
| + return self._changed_files
|
| +
|
| + all_changed_files = {}
|
| + for feature in self.itervalues():
|
| + if not feature.changed_files:
|
| + continue
|
| +
|
| + for changed_file in feature.changed_files or []:
|
| + accumulated_changed_file = all_changed_files.get(changed_file.name)
|
| + if accumulated_changed_file is None:
|
| + all_changed_files[changed_file.name] = changed_file
|
| + continue
|
| +
|
| + if (accumulated_changed_file.blame_url !=
|
| + changed_file.blame_url): # pragma: no cover
|
| + raise ValueError('Blame URLs do not match: %s != %s'
|
| + % (accumulated_changed_file.blame_url, changed_file.blame_url))
|
| + accumulated_changed_file.reasons.extend(changed_file.reasons or [])
|
|
|
| -class Feature(object):
|
| + self._changed_files = all_changed_files.values()
|
| + self._changed_files.sort(key=lambda changed_file: changed_file.name)
|
| + return self._changed_files
|
| +
|
| + def __len__(self):
|
| + return len(self._value)
|
| +
|
| + def __eq__(self, other):
|
| + return (self.name == other.name and self._value == other._value and
|
| + self.reason == other.reason and
|
| + self.changed_files == other.changed_files)
|
| +
|
| + def __ne__(self, other):
|
| + return not self.__eq__(other)
|
| +
|
| +
|
| +class Feature(Element):
|
| """Abstract base class for features use by loglinear models."""
|
|
|
| @property
|
| @@ -120,8 +268,8 @@ class Feature(object):
|
| ``X`` and ``Y``, as described in the documentation there. As an
|
| example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is
|
| ``Suspect``. Given those two types, this method is a curried function
|
| - of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of type
|
| - ``X``, we return a function of type ``Y -> FeatureValue``, where
|
| + of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of
|
| + type ``X``, we return a function of type ``Y -> FeatureValue``, where
|
| the final result for each ``y`` of type ``Y`` is the value of that
|
| ``y`` given that ``x``.
|
|
|
| @@ -140,25 +288,74 @@ class Feature(object):
|
| raise NotImplementedError()
|
|
|
|
|
| -class FeatureFunction(object):
|
| - """Given an iterable of scalar-valued functions, return an dict function.
|
| +class MetaFeature(MetaDict):
|
| + """Abstract base class for meta features used by loglinear models.
|
| +
|
| + MetaFeature is a dict of (Meta)Features.
|
| + """
|
| +
|
| + @property
|
| + def name(self):
|
| + """The name of this feature."""
|
| + raise NotImplementedError()
|
| +
|
| + def __call__(self, x):
|
| + """Returns a value for a ``y`` given some ``x``.
|
| +
|
| + The loglinear model this feature is used in will specify some types
|
| + ``X`` and ``Y``, as described in the documentation there. As an
|
| + example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is
|
| + ``Suspect``. Given those two types, this method is a curried function
|
| + of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of
|
| + type ``X``, we return a function of type ``Y -> FeatureValue``, where
|
| + the final result for each ``y`` of type ``Y`` is the value of that
|
| + ``y`` given that ``x``.
|
| +
|
| + Values closer to zero indicate this feature has less to say about
|
| + whether the ``y`` is to be blamed. Values further from zero indicate
|
| + that this feature has more to say about it. (Whether this feature
|
| + thinks the ``y`` should be blamed or should not be depends on the sign
|
| + of the value and the sign of the weight given to this feature.) As
|
| + special cases, a value of negative infinity means "do not blame this
|
| + ``y`` no matter what any other features say", and a value of positive
|
| + infinity means "definitely blame this ``y`` no matter what any other
|
| + features say". Both of those special values should be used sparingly,
|
| + since they override the model's ability to combine multiple sources of
|
| + information and decide the culprit based on all the evidence together.
|
| + """
|
| + raise NotImplementedError()
|
| +
|
| +
|
| +class WrapperMetaFeature(MetaFeature):
|
| + """Given a dict of scalar-valued functions, return a dict-valued function.
|
| +
|
| + Note, the features that get wrapped should be independent of each other, which
|
| + means their feature values can be computed independently.
|
| +
|
| + Either wrap single Feature or wrap features whose final results are computed
|
| + independently.
|
|
|
| Properties:
|
| - fs (iterable of functions): A collection of curried functions
|
| - ``X -> Y -> FeatureValue``. That is, given a particular ``x`` they
|
| - return a function ``Y -> dict(FeatureValue)``. N.B. each function should
|
| - have a name property.
|
| + fs (Feature of iterable of (Meta)Features): A collection of curried
|
| + functions ``X -> Y -> (Meta)FeatureValue``. That is, given a particular
|
| + ``x`` they return a function ``Y -> dict(FeatureValue)``. N.B. each function
|
| + should have a name property.
|
| """
|
| def __init__(self, fs):
|
| - self._fs = fs
|
| + super(WrapperMetaFeature, self).__init__({f.name: f for f in fs or []})
|
| +
|
| + @property
|
| + def name(self):
|
| + return 'WrapperFeature'
|
|
|
| def __call__(self, x):
|
| """Fuction mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue).
|
|
|
| Returns:
|
| - A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for
|
| + A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for
|
| all ``x``, ``y``, and for a feature f in fs, we have
|
| ``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``.
|
| """
|
| - name_to_fx = {f.name: f(x) for f in self._fs}
|
| - return lambda y: {name: fx(y) for name, fx in name_to_fx.iteritems()}
|
| + fxs = {name: f(x) for name, f in self.iteritems()}
|
| + return lambda y: MetaFeatureValue(
|
| + self.name, {name: fx(y) for name, fx in fxs.iteritems()})
|
|
|