appengine/findit/crash/loglinear/feature.py - Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together.

Unified Diff: appengine/findit/crash/loglinear/feature.py

Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together. (Closed)

Patch Set: Rebase. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: appengine/findit/crash/loglinear/feature.py

diff --git a/appengine/findit/crash/loglinear/feature.py b/appengine/findit/crash/loglinear/feature.py

index 4633c9f1869866e3e76dff39fcf75fdf05609a90..f1652373580b6450d75083b6dbed894e5c78268c 100644

--- a/appengine/findit/crash/loglinear/feature.py

+++ b/appengine/findit/crash/loglinear/feature.py

@@ -3,9 +3,13 @@

# found in the LICENSE file.

from collections import namedtuple

+import logging

import math

import libs.math.logarithms as lmath

+from libs.math.vectors import vsum

+from libs.meta_object import Element

+from libs.meta_object import MetaDict

def LinearlyScaled(value, maximum):

@@ -79,9 +83,8 @@ class ChangedFile(namedtuple('ChangedFile',

% (self.__class__.__name__, self.name, self.blame_url, self.reasons))

-class FeatureValue(namedtuple('FeatureValue',

- ['name', 'value', 'reason', 'changed_files'])): # pragma: no cover

- """The result of an individual feature.

+class FeatureValue(Element): # pragma: no cover

+ """The result of an feature.

Attributes:

name (str): the name of the feature producing this value.

@@ -95,17 +98,162 @@ class FeatureValue(namedtuple('FeatureValue',

"""

__slots__ = ()

- def __new__(cls, name, value, reason, changed_files):

- return super(cls, FeatureValue).__new__(cls,

- str(name), float(value), str(reason), changed_files)

+ def __init__(self, name, value, reason, changed_files):

+ self._value = float(value)

+ self._name = name

+ self._reason = reason

+ self._changed_files = changed_files

+ @property

+ def name(self):

+ return self._name

+ @property

+ def value(self):

+ return self._value

+ @property

+ def reason(self):

+ return self._reason

+ @property

+ def changed_files(self):

+ return self._changed_files

def __str__(self):

return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)'

% (self.__class__.__name__, self.name, self.value, self.reason,

self.changed_files))

+ def __len__(self):

+ return 1

+ def __mul__(self, number):

+ return self._value * float(number)

+ __rmul__ = __mul__

+ def __add__(self, number):

+ return self._value + float(number)

+ __radd__ = __add__

+ def __float__(self):

+ return self._value

+ def __eq__(self, other):

+ return (self.name == other.name and self._value == other._value and

+ self.reason == other.reason and

+ self.changed_files == other.changed_files)

+ def __ne__(self, other):

+ return not self.__eq__(other)

+class MetaFeatureValue(MetaDict):

+ """The result of a meta feature which groups a list of ``FeatureValue``s.

+ N.B. ``MetaFeatureValue`` must have more than one ``FeatureValue``.

+ Attributes:

+ """

+ def __init__(self, name, feature_values):

+ """

+ Args:

+ feature_value (dict of FeatureValue/MetaFeatureValue):

+ All the sub features that this ``MetaFeatureValue`` contains.

+ """

+ super(MetaFeatureValue, self).__init__(feature_values)

+ self._name = name

+ self._reason = None

+ self._changed_files = None

+ @property

+ def name(self):

+ return self._name

+ @property

+ def reason(self):

+ """Collect and format a list of all ``FeatureValue.reason`` strings.

+ Returns:

+ A str of reasons, each line has a format

+ "feature_name: feature_value -- reason" triples; where the first string is

+ the feature name, the float is some numeric representation of how

+ much influence this feature exerts on the ``Suspect`` being blamed,

+ and the final string is the ``FeatureValue.reason``. The list is

+ sorted by feature name, just to ensure that it comes out in some

+ canonical order.

+ At present, the float is the log-domain score of the feature

+ value. However, this isn't the best thing for UX reasons. In the

+ future it might be replaced by the normal-domain score, or by

+ the probability.

+ """

+ if self._reason:

+ return self._reason

+ formatted_reasons = []

+ for feature in self.itervalues():

+ if feature.reason:

+ formatted_reasons.append('%s: %f -- %s' % (feature.name,

+ feature.value,

+ feature.reason))

+ formatted_reasons.sort()

+ self._reason = '\n'.join(formatted_reasons)

+ return self._reason

+ @property

+ def changed_files(self):

+ """Merge multiple``FeatureValue.changed_files`` lists into one.

+ Returns:

+ A list of ``ChangedFile`` objects sorted by file name. The sorting

+ is not essential, but is provided to ease testing by ensuring the

+ output is in some canonical order.

+ Raises:

+ ``ValueError`` if any file name is given inconsistent ``blame_url``s.

+ """

+ if self._changed_files:

+ return self._changed_files

+ all_changed_files = {}

+ for feature in self.itervalues():

+ if not feature.changed_files:

+ continue

+ for changed_file in feature.changed_files or []:

+ accumulated_changed_file = all_changed_files.get(changed_file.name)

+ if accumulated_changed_file is None:

+ all_changed_files[changed_file.name] = changed_file

+ continue

+ if (accumulated_changed_file.blame_url !=

+ changed_file.blame_url): # pragma: no cover

+ raise ValueError('Blame URLs do not match: %s != %s'

+ % (accumulated_changed_file.blame_url, changed_file.blame_url))

+ accumulated_changed_file.reasons.extend(changed_file.reasons or [])

-class Feature(object):

+ self._changed_files = all_changed_files.values()

+ self._changed_files.sort(key=lambda changed_file: changed_file.name)

+ return self._changed_files

+ def __len__(self):

+ return len(self._value)

+ def __eq__(self, other):

+ return (self.name == other.name and self._value == other._value and

+ self.reason == other.reason and

+ self.changed_files == other.changed_files)

+ def __ne__(self, other):

+ return not self.__eq__(other)

+class Feature(Element):

"""Abstract base class for features use by loglinear models."""

@property

@@ -120,8 +268,8 @@ class Feature(object):

``X`` and ``Y``, as described in the documentation there. As an

example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is

``Suspect``. Given those two types, this method is a curried function

- of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of type

- ``X``, we return a function of type ``Y -> FeatureValue``, where

+ of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of

+ type ``X``, we return a function of type ``Y -> FeatureValue``, where

the final result for each ``y`` of type ``Y`` is the value of that

``y`` given that ``x``.

@@ -140,25 +288,74 @@ class Feature(object):

raise NotImplementedError()

-class FeatureFunction(object):

- """Given an iterable of scalar-valued functions, return an dict function.

+class MetaFeature(MetaDict):

+ """Abstract base class for meta features use by loglinear models.

+ MetaFeature is a dict of (Meta)Features.

+ """

+ @property

+ def name(self):

+ """The name of this feature."""

+ raise NotImplementedError()

+ def __call__(self, x):

+ """Returns a value for a ``y`` given some ``x``.

+ The loglinear model this feature is used in will specify some types

+ ``X`` and ``Y``, as described in the documentation there. As an

+ example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is

+ ``Suspect``. Given those two types, this method is a curried function

+ of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of

+ type ``X``, we return a function of type ``Y -> FeatureValue``, where

+ the final result for each ``y`` of type ``Y`` is the value of that

+ ``y`` given that ``x``.

+ Values closer to zero indicate this feature has less to say about

+ whether the ``y`` is to be blamed. Values further from zero indicate

+ that this feature has more to say about it. (Whether this feature

+ thinks the ``y`` should be blamed or should not be depends on the sign

+ of the value and the sign of the weight given to this feature.) As

+ special cases, a value of negative infinity means "do not blame this

+ ``y`` no matter what any other features say", and a value of positive

+ infinity means "definitely blame this ``y`` no matter what any other

+ features say". Both of those special values should be used sparingly,

+ since they override the model's ability to combine multiple sources of

+ information and decide the cuplrit based on all the evidence together.

+ """

+ raise NotImplementedError()

+class WrapperMetaFeature(MetaFeature):

+ """Given a dict of scalar-valued functions, return an dict-valued function.

+ Note, the features that get wrapped should be independent to each other, which

+ means their feature values can be computed independently.

+ Either wrap single Feature or wrap features whose final results are computed

+ independently.

Properties:

- fs (iterable of functions): A collection of curried functions

- ``X -> Y -> FeatureValue``. That is, given a particular ``x`` they

- return a function ``Y -> dict(FeatureValue)``. N.B. each function should

- have a name property.

+ fs (Feature of iterable of (Meta)Features): A collection of curried

+ functions ``X -> Y -> (Meta)FeatureValue``. That is, given a particular

+ ``x`` they return a function ``Y -> dict(FeatureValue)``. N.B. each function

+ should have a name property.

"""

def __init__(self, fs):

- self._fs = fs

+ super(WrapperMetaFeature, self).__init__({f.name: f for f in fs or []})

+ @property

+ def name(self):

+ return 'WrapperFeature'

def __call__(self, x):

"""Fuction mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue).

Returns:

- A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for

+ A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for

all ``x``, ``y``, and for a feature f in fs, we have

``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``.

"""

- name_to_fx = {f.name: f(x) for f in self._fs}

- return lambda y: {name: fx(y) for name, fx in name_to_fx.iteritems()}

+ fxs = {name: f(x) for name, f in self.iteritems()}

+ return lambda y: MetaFeatureValue(

+ self.name, {name: fx(y) for name, fx in fxs.iteritems()})

« no previous file with comments | « appengine/findit/crash/loglinear/changelist_classifier.py ('k') | appengine/findit/crash/loglinear/model.py » ('j') | no next file with comments »