appengine/findit/crash/loglinear/feature.py - Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together.

Side by Side Diff: appengine/findit/crash/loglinear/feature.py

Issue 2625073003: [Predator] Add MetaWeight and MetaFeatureValue to group multiple weights and features together. (Closed)

Patch Set: Fix nits. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « appengine/findit/crash/loglinear/changelist_classifier.py ('k') | appengine/findit/crash/loglinear/model.py » ('j') | appengine/findit/crash/loglinear/model.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2016 The Chromium Authors. All rights reserved.	1 # Copyright 2016 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 from collections import namedtuple	5 from collections import namedtuple

	6 import logging

6 import math	7 import math

7	8

8 import libs.math.logarithms as lmath	9 import libs.math.logarithms as lmath

	10 from libs.math.vectors import vsum

	11 from libs.meta_object import Element

	12 from libs.meta_object import MetaDict

9	13

10	14

11 def LinearlyScaled(value, maximum):	15 def LinearlyScaled(value, maximum):

12 """Returns a value scaled linearly between 0 and 1.	16 """Returns a value scaled linearly between 0 and 1.

13	17

14 Args:	18 Args:

15 value (float): the value to be scaled.	19 value (float): the value to be scaled.

16 maximum (float): the maximum value to consider. Must be strictly	20 maximum (float): the maximum value to consider. Must be strictly

17 positive and finite (i.e., can't be zero nor infinity).	21 positive and finite (i.e., can't be zero nor infinity).

18	22

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
72 'file': self.name,	76 'file': self.name,

73 'blame_url': self.blame_url,	77 'blame_url': self.blame_url,

74 'info': '\n'.join(self.reasons)	78 'info': '\n'.join(self.reasons)

75 }	79 }

76	80

77 def __str__(self):	81 def __str__(self):

78 return ('%s(name = %s, blame_url = %s, reasons = %s)'	82 return ('%s(name = %s, blame_url = %s, reasons = %s)'

79 % (self.__class__.__name__, self.name, self.blame_url, self.reasons))	83 % (self.__class__.__name__, self.name, self.blame_url, self.reasons))

80	84

81	85

82 class FeatureValue(namedtuple('FeatureValue',	86 class FeatureValue(Element): # pragma: no cover

83 ['name', 'value', 'reason', 'changed_files'])): # pragma: no cover	87 """The result of an feature.

84 """The result of an individual feature.

85	88

86 Attributes:	89 Attributes:

87 name (str): the name of the feature producing this value.	90 name (str): the name of the feature producing this value.

88 value (convertable to float): the value itself. N.B. we call the	91 value (convertable to float): the value itself. N.B. we call the

89 ``float`` builtin function to coerce this value to float; thus	92 ``float`` builtin function to coerce this value to float; thus

90 it is okay to pass an ``int`` or ``bool`` value as well.	93 it is okay to pass an ``int`` or ``bool`` value as well.

91 reason (str): some explanation of where the value came from.	94 reason (str): some explanation of where the value came from.

92 changed_files (list of ChangedFile, or None): A list of files changed	95 changed_files (list of ChangedFile, or None): A list of files changed

93 by the ``Suspect`` annotated with reasons why the feature function	96 by the ``Suspect`` annotated with reasons why the feature function

94 generating this object blames those changes.	97 generating this object blames those changes.

95 """	98 """

96 __slots__ = ()	99 __slots__ = ()

97	100

98 def __new__(cls, name, value, reason, changed_files):	101 def __init__(self, name, value, reason, changed_files):

99 return super(cls, FeatureValue).__new__(cls,	102 self._value = float(value)

100 str(name), float(value), str(reason), changed_files)	103 self._name = name

	104 self._reason = reason

	105 self._changed_files = changed_files

	106

	107 @property

	108 def name(self):

	109 return self._name

	110

	111 @property

	112 def value(self):

	113 return self._value

	114

	115 @property

	116 def reason(self):

	117 return self._reason

	118

	119 @property

	120 def changed_files(self):

	121 return self._changed_files

101	122

102 def __str__(self):	123 def __str__(self):

103 return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)'	124 return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)'

104 % (self.__class__.__name__, self.name, self.value, self.reason,	125 % (self.__class__.__name__, self.name, self.value, self.reason,

105 self.changed_files))	126 self.changed_files))

106	127

	128 def __len__(self):
	chanli 2017/01/19 04:26:06: Is this necessary? Since Element has the same len?
	Sharu Jiang 2017/01/19 23:49:52 (in reply to chanli, 2017/01/19 04:26:06): Yes, the element can be a list of anything, but as long as it inherits the Element, the length should be one.
	129 return 1

107	130

108 class Feature(object):	131 def __mul__(self, number):

	132 return self._value * float(number)

	133

	134 __rmul__ = __mul__

	135

	136 def __add__(self, number):

	137 return self._value + float(number)

	138

	139 __radd__ = __add__

	140

	141 def __float__(self):

	142 return self._value

	143

	144 def __eq__(self, other):

	145 return (self.name == other.name and self._value == other._value and

	146 self.reason == other.reason and

	147 self.changed_files == other.changed_files)

	148

	149 def __ne__(self, other):

	150 return not self.__eq__(other)

	151

	152

	153 class MetaFeatureValue(MetaDict):

	154 """The result of a meta feature which groups a list of ``FeatureValue``s.

	155

	156 N.B. ``MetaFeatureValue`` must have more than one ``FeatureValue``.

	157

	158 Attributes:

	159

	160 """

	161 def __init__(self, name, feature_values):

	162 """

	163 Args:

	164 feature_value (dict of FeatureValue/MetaFeatureValue):

	165 All the sub features that this ``MetaFeatureValue`` contains.

	166 """

	167 super(MetaFeatureValue, self).__init__(feature_values)

	168 self._name = name

	169 self._reason = None

	170 self._changed_files = None

	171

	172 @property

	173 def name(self):

	174 return self._name

	175

	176 @property

	177 def reason(self):

	178 """Collect and format a list of all ``FeatureValue.reason`` strings.

	179

	180 Returns:

	181 A str of reasons, each line has a format

	182 "feature_name: feature_value -- reason" triples; where the first string is

	183 the feature name, the float is some numeric representation of how

	184 much influence this feature exerts on the ``Suspect`` being blamed,

	185 and the final string is the ``FeatureValue.reason``. The list is

	186 sorted by feature name, just to ensure that it comes out in some

	187 canonical order.

	188

	189 At present, the float is the log-domain score of the feature

	190 value. However, this isn't the best thing for UX reasons. In the

	191 future it might be replaced by the normal-domain score, or by

	192 the probability.

	193 """

	194 if self._reason:

	195 return self._reason

	196

	197 formatted_reasons = []

	198 for feature in self.itervalues():

	199 if feature.reason:

	200 formatted_reasons.append('%s: %f -- %s' % (feature.name,

	201 feature.value,

	202 feature.reason))

	203

	204 formatted_reasons.sort()

	205 self._reason = '\n'.join(formatted_reasons)

	206 return self._reason

	207

	208 @property

	209 def changed_files(self):

	210 """Merge multiple``FeatureValue.changed_files`` lists into one.

	211

	212 Returns:

	213 A list of ``ChangedFile`` objects sorted by file name. The sorting

	214 is not essential, but is provided to ease testing by ensuring the

	215 output is in some canonical order.

	216

	217 Raises:

	218 ``ValueError`` if any file name is given inconsistent ``blame_url``s.

	219 """

	220 if self._changed_files:

	221 return self._changed_files

	222

	223 all_changed_files = {}

	224 for feature in self.itervalues():

	225 if not feature.changed_files:

	226 continue

	227

	228 for changed_file in feature.changed_files or []:

	229 accumulated_changed_file = all_changed_files.get(changed_file.name)

	230 if accumulated_changed_file is None:

	231 all_changed_files[changed_file.name] = changed_file

	232 continue

	233

	234 if (accumulated_changed_file.blame_url !=

	235 changed_file.blame_url): # pragma: no cover

	236 raise ValueError('Blame URLs do not match: %s != %s'

	237 % (accumulated_changed_file.blame_url, changed_file.blame_url))

	238 accumulated_changed_file.reasons.extend(changed_file.reasons or [])

	239

	240 self._changed_files = all_changed_files.values()

	241 self._changed_files.sort(key=lambda changed_file: changed_file.name)

	242 return self._changed_files

	243

	244 def __len__(self):

	245 return len(self._value)

	246

	247 def __eq__(self, other):

	248 return (self.name == other.name and self._value == other._value and

	249 self.reason == other.reason and

	250 self.changed_files == other.changed_files)

	251

	252 def __ne__(self, other):

	253 return not self.__eq__(other)

	254

	255

	256 class Feature(Element):

109 """Abstract base class for features use by loglinear models."""	257 """Abstract base class for features use by loglinear models."""

110	258

111 @property	259 @property

112 def name(self):	260 def name(self):

113 """The name of this feature."""	261 """The name of this feature."""

114 raise NotImplementedError()	262 raise NotImplementedError()

115	263

116 def __call__(self, report):	264 def __call__(self, report):

117 """Returns a value for a ``y`` given some ``x``.	265 """Returns a value for a ``y`` given some ``x``.

118	266

119 The loglinear model this feature is used in will specify some types	267 The loglinear model this feature is used in will specify some types

120 ``X`` and ``Y``, as described in the documentation there. As an	268 ``X`` and ``Y``, as described in the documentation there. As an

121 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is	269 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is

122 ``Suspect``. Given those two types, this method is a curried function	270 ``Suspect``. Given those two types, this method is a curried function

123 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of type	271 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of

124 ``X``, we return a function of type ``Y -> FeatureValue``, where	272 type ``X``, we return a function of type ``Y -> FeatureValue``, where

125 the final result for each ``y`` of type ``Y`` is the value of that	273 the final result for each ``y`` of type ``Y`` is the value of that

126 ``y`` given that ``x``.	274 ``y`` given that ``x``.

127	275

128 Values closer to zero indicate this feature has less to say about	276 Values closer to zero indicate this feature has less to say about

129 whether the ``y`` is to be blamed. Values further from zero indicate	277 whether the ``y`` is to be blamed. Values further from zero indicate

130 that this feature has more to say about it. (Whether this feature	278 that this feature has more to say about it. (Whether this feature

131 thinks the ``y`` should be blamed or should not be depends on the sign	279 thinks the ``y`` should be blamed or should not be depends on the sign

132 of the value and the sign of the weight given to this feature.) As	280 of the value and the sign of the weight given to this feature.) As

133 special cases, a value of negative infinity means "do not blame this	281 special cases, a value of negative infinity means "do not blame this

134 ``y`` no matter what any other features say", and a value of positive	282 ``y`` no matter what any other features say", and a value of positive

135 infinity means "definitely blame this ``y`` no matter what any other	283 infinity means "definitely blame this ``y`` no matter what any other

136 features say". Both of those special values should be used sparingly,	284 features say". Both of those special values should be used sparingly,

137 since they override the model's ability to combine multiple sources of	285 since they override the model's ability to combine multiple sources of

138 information and decide the cuplrit based on all the evidence together.	286 information and decide the cuplrit based on all the evidence together.

139 """	287 """

140 raise NotImplementedError()	288 raise NotImplementedError()

141	289

142	290

143 class FeatureFunction(object):	291 class MetaFeature(MetaDict):

	292 """Abstract base class for meta features use by loglinear models.

	293

	294 MetaFeature is a dict of (Meta)Features.

	295 """

	296

	297 @property

	298 def name(self):

	299 """The name of this feature."""

	300 raise NotImplementedError()

	301

	302 def __call__(self, x):

	303 """Returns a value for a ``y`` given some ``x``.

	304

	305 The loglinear model this feature is used in will specify some types

	306 ``X`` and ``Y``, as described in the documentation there. As an

	307 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is

	308 ``Suspect``. Given those two types, this method is a curried function

	309 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of

	310 type ``X``, we return a function of type ``Y -> FeatureValue``, where

	311 the final result for each ``y`` of type ``Y`` is the value of that

	312 ``y`` given that ``x``.

	313

	314 Values closer to zero indicate this feature has less to say about

	315 whether the ``y`` is to be blamed. Values further from zero indicate

	316 that this feature has more to say about it. (Whether this feature

	317 thinks the ``y`` should be blamed or should not be depends on the sign

	318 of the value and the sign of the weight given to this feature.) As

	319 special cases, a value of negative infinity means "do not blame this

	320 ``y`` no matter what any other features say", and a value of positive

	321 infinity means "definitely blame this ``y`` no matter what any other

	322 features say". Both of those special values should be used sparingly,

	323 since they override the model's ability to combine multiple sources of

	324 information and decide the cuplrit based on all the evidence together.

	325 """

	326 raise NotImplementedError()

	327

	328

	329 class WrapperMetaFeature(MetaFeature):

144 """Given a dict of scalar-valued functions, return an dict-valued function.	330 """Given a dict of scalar-valued functions, return an dict-valued function.

145	331

	332 Note, the features that get wrapped should be independent to each other, which

	333 means their feature values can be computed independently.

	334

	335 Either wrap single Feature or wrap features whose final results are computed

	336 independently.

	337

146 Properties:	338 Properties:

147 fs (iterable of functions): A collection of curried functions	339 fs (Feature of iterable of (Meta)Features): A collection of curried

148 ``X -> Y -> FeatureValue``. That is, given a particular ``x`` they	340 functions ``X -> Y -> (Meta)FeatureValue``. That is, given a particular

149 return a function ``Y -> dict(FeatureValue)``. N.B. each function should	341 ``x`` they return a function ``Y -> dict(FeatureValue)``. N.B. each function

150 have a name property.	342 should have a name property.

151 """	343 """

152 def __init__(self, fs):	344 def __init__(self, fs):

153 self._fs = fs	345 super(WrapperMetaFeature, self).__init__({f.name: f for f in fs or []})

	346

	347 @property

	348 def name(self):

	349 return 'WrapperFeature'

154	350

155 def __call__(self, x):	351 def __call__(self, x):

156 """Fuction mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue).	352 """Fuction mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue).

157	353

158 Returns:	354 Returns:

159 A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for	355 A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for

160 all ``x``, ``y``, and for a feature f in fs, we have	356 all ``x``, ``y``, and for a feature f in fs, we have

161 ``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``.	357 ``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``.

162 """	358 """

163 name_to_fx = {f.name: f(x) for f in self._fs}	359 fxs = {name: f(x) for name, f in self.iteritems()}

164 return lambda y: {name: fx(y) for name, fx in name_to_fx.iteritems()}	360 return lambda y: MetaFeatureValue(

	361 self.name, {name: fx(y) for name, fx in fxs.iteritems()})

OLD	NEW