Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from collections import namedtuple | 5 from collections import namedtuple |
| 6 import logging | |
| 6 import math | 7 import math |
| 7 | 8 |
| 8 import libs.math.logarithms as lmath | 9 import libs.math.logarithms as lmath |
| 10 from libs.math.vectors import vsum | |
| 11 from libs.meta_object import Element | |
| 12 from libs.meta_object import MetaDict | |
| 9 | 13 |
| 10 | 14 |
| 11 def LinearlyScaled(value, maximum): | 15 def LinearlyScaled(value, maximum): |
| 12 """Returns a value scaled linearly between 0 and 1. | 16 """Returns a value scaled linearly between 0 and 1. |
| 13 | 17 |
| 14 Args: | 18 Args: |
| 15 value (float): the value to be scaled. | 19 value (float): the value to be scaled. |
| 16 maximum (float): the maximum value to consider. Must be strictly | 20 maximum (float): the maximum value to consider. Must be strictly |
| 17 positive and finite (i.e., can't be zero nor infinity). | 21 positive and finite (i.e., can't be zero nor infinity). |
| 18 | 22 |
| (...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 72 'file': self.name, | 76 'file': self.name, |
| 73 'blame_url': self.blame_url, | 77 'blame_url': self.blame_url, |
| 74 'info': '\n'.join(self.reasons) | 78 'info': '\n'.join(self.reasons) |
| 75 } | 79 } |
| 76 | 80 |
| 77 def __str__(self): | 81 def __str__(self): |
| 78 return ('%s(name = %s, blame_url = %s, reasons = %s)' | 82 return ('%s(name = %s, blame_url = %s, reasons = %s)' |
| 79 % (self.__class__.__name__, self.name, self.blame_url, self.reasons)) | 83 % (self.__class__.__name__, self.name, self.blame_url, self.reasons)) |
| 80 | 84 |
| 81 | 85 |
| 82 class FeatureValue(namedtuple('FeatureValue', | 86 class FeatureValue(Element): # pragma: no cover |
| 83 ['name', 'value', 'reason', 'changed_files'])): # pragma: no cover | 87 """The result of a feature. |
| 84 """The result of an individual feature. | |
| 85 | 88 |
| 86 Attributes: | 89 Attributes: |
| 87 name (str): the name of the feature producing this value. | 90 name (str): the name of the feature producing this value. |
| 88 value (convertible to float): the value itself. N.B. we call the | 91 value (convertible to float): the value itself. N.B. we call the |
| 89 ``float`` builtin function to coerce this value to float; thus | 92 ``float`` builtin function to coerce this value to float; thus |
| 90 it is okay to pass an ``int`` or ``bool`` value as well. | 93 it is okay to pass an ``int`` or ``bool`` value as well. |
| 91 reason (str): some explanation of where the value came from. | 94 reason (str): some explanation of where the value came from. |
| 92 changed_files (list of ChangedFile, or None): A list of files changed | 95 changed_files (list of ChangedFile, or None): A list of files changed |
| 93 by the ``Suspect`` annotated with reasons why the feature function | 96 by the ``Suspect`` annotated with reasons why the feature function |
| 94 generating this object blames those changes. | 97 generating this object blames those changes. |
| 95 """ | 98 """ |
| 96 __slots__ = () | 99 __slots__ = () |
| 97 | 100 |
| 98 def __new__(cls, name, value, reason, changed_files): | 101 def __init__(self, name, value, reason, changed_files): |
| 99 return super(cls, FeatureValue).__new__(cls, | 102 self._value = float(value) |
| 100 str(name), float(value), str(reason), changed_files) | 103 self._name = name |
| 104 self._reason = reason | |
| 105 self._changed_files = changed_files | |
| 106 | |
| 107 @property | |
| 108 def name(self): | |
| 109 return self._name | |
| 110 | |
| 111 @property | |
| 112 def value(self): | |
| 113 return self._value | |
| 114 | |
| 115 @property | |
| 116 def reason(self): | |
| 117 return self._reason | |
| 118 | |
| 119 @property | |
| 120 def changed_files(self): | |
| 121 return self._changed_files | |
| 101 | 122 |
| 102 def __str__(self): | 123 def __str__(self): |
| 103 return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)' | 124 return ('%s(name = %s, value = %f, reason = %s, changed_files = %s)' |
| 104 % (self.__class__.__name__, self.name, self.value, self.reason, | 125 % (self.__class__.__name__, self.name, self.value, self.reason, |
| 105 self.changed_files)) | 126 self.changed_files)) |
| 106 | 127 |
| 128 def __len__(self): | |
|
chanli
2017/01/19 04:26:06
Is this necessary? Since Element has the same len?
Sharu Jiang
2017/01/19 23:49:52
Yes, the element can be a list of anything, but as
| |
| 129 return 1 | |
| 107 | 130 |
| 108 class Feature(object): | 131 def __mul__(self, number): |
| 132 return self._value * float(number) | |
| 133 | |
| 134 __rmul__ = __mul__ | |
| 135 | |
| 136 def __add__(self, number): | |
| 137 return self._value + float(number) | |
| 138 | |
| 139 __radd__ = __add__ | |
| 140 | |
| 141 def __float__(self): | |
| 142 return self._value | |
| 143 | |
| 144 def __eq__(self, other): | |
| 145 return (self.name == other.name and self._value == other._value and | |
| 146 self.reason == other.reason and | |
| 147 self.changed_files == other.changed_files) | |
| 148 | |
| 149 def __ne__(self, other): | |
| 150 return not self.__eq__(other) | |
| 151 | |
| 152 | |
| 153 class MetaFeatureValue(MetaDict): | |
| 154 """The result of a meta feature which groups a list of ``FeatureValue``s. | |
| 155 | |
| 156 N.B. ``MetaFeatureValue`` must have more than one ``FeatureValue``. | |
| 157 | |
| 158 Attributes: | |
| 159 | |
| 160 """ | |
| 161 def __init__(self, name, feature_values): | |
| 162 """ | |
| 163 Args: | |
| 164 feature_values (dict of FeatureValue/MetaFeatureValue): | |
| 165 All the sub features that this ``MetaFeatureValue`` contains. | |
| 166 """ | |
| 167 super(MetaFeatureValue, self).__init__(feature_values) | |
| 168 self._name = name | |
| 169 self._reason = None | |
| 170 self._changed_files = None | |
| 171 | |
| 172 @property | |
| 173 def name(self): | |
| 174 return self._name | |
| 175 | |
| 176 @property | |
| 177 def reason(self): | |
| 178 """Collect and format a list of all ``FeatureValue.reason`` strings. | |
| 179 | |
| 180 Returns: | |
| 181 A str of reasons, each line has a format | |
| 182 "feature_name: feature_value -- reason" triples; where the first string is | |
| 183 the feature name, the float is some numeric representation of how | |
| 184 much influence this feature exerts on the ``Suspect`` being blamed, | |
| 185 and the final string is the ``FeatureValue.reason``. The list is | |
| 186 sorted by feature name, just to ensure that it comes out in some | |
| 187 canonical order. | |
| 188 | |
| 189 At present, the float is the log-domain score of the feature | |
| 190 value. However, this isn't the best thing for UX reasons. In the | |
| 191 future it might be replaced by the normal-domain score, or by | |
| 192 the probability. | |
| 193 """ | |
| 194 if self._reason: | |
| 195 return self._reason | |
| 196 | |
| 197 formatted_reasons = [] | |
| 198 for feature in self.itervalues(): | |
| 199 if feature.reason: | |
| 200 formatted_reasons.append('%s: %f -- %s' % (feature.name, | |
| 201 feature.value, | |
| 202 feature.reason)) | |
| 203 | |
| 204 formatted_reasons.sort() | |
| 205 self._reason = '\n'.join(formatted_reasons) | |
| 206 return self._reason | |
| 207 | |
| 208 @property | |
| 209 def changed_files(self): | |
| 210 """Merge multiple ``FeatureValue.changed_files`` lists into one. | |
| 211 | |
| 212 Returns: | |
| 213 A list of ``ChangedFile`` objects sorted by file name. The sorting | |
| 214 is not essential, but is provided to ease testing by ensuring the | |
| 215 output is in some canonical order. | |
| 216 | |
| 217 Raises: | |
| 218 ``ValueError`` if any file name is given inconsistent ``blame_url``s. | |
| 219 """ | |
| 220 if self._changed_files: | |
| 221 return self._changed_files | |
| 222 | |
| 223 all_changed_files = {} | |
| 224 for feature in self.itervalues(): | |
| 225 if not feature.changed_files: | |
| 226 continue | |
| 227 | |
| 228 for changed_file in feature.changed_files or []: | |
| 229 accumulated_changed_file = all_changed_files.get(changed_file.name) | |
| 230 if accumulated_changed_file is None: | |
| 231 all_changed_files[changed_file.name] = changed_file | |
| 232 continue | |
| 233 | |
| 234 if (accumulated_changed_file.blame_url != | |
| 235 changed_file.blame_url): # pragma: no cover | |
| 236 raise ValueError('Blame URLs do not match: %s != %s' | |
| 237 % (accumulated_changed_file.blame_url, changed_file.blame_url)) | |
| 238 accumulated_changed_file.reasons.extend(changed_file.reasons or []) | |
| 239 | |
| 240 self._changed_files = all_changed_files.values() | |
| 241 self._changed_files.sort(key=lambda changed_file: changed_file.name) | |
| 242 return self._changed_files | |
| 243 | |
| 244 def __len__(self): | |
| 245 return len(self._value) | |
| 246 | |
| 247 def __eq__(self, other): | |
| 248 return (self.name == other.name and self._value == other._value and | |
| 249 self.reason == other.reason and | |
| 250 self.changed_files == other.changed_files) | |
| 251 | |
| 252 def __ne__(self, other): | |
| 253 return not self.__eq__(other) | |
| 254 | |
| 255 | |
| 256 class Feature(Element): | |
| 109 """Abstract base class for features used by loglinear models.""" | 257 """Abstract base class for features used by loglinear models.""" |
| 110 | 258 |
| 111 @property | 259 @property |
| 112 def name(self): | 260 def name(self): |
| 113 """The name of this feature.""" | 261 """The name of this feature.""" |
| 114 raise NotImplementedError() | 262 raise NotImplementedError() |
| 115 | 263 |
| 116 def __call__(self, report): | 264 def __call__(self, report): |
| 117 """Returns a value for a ``y`` given some ``x``. | 265 """Returns a value for a ``y`` given some ``x``. |
| 118 | 266 |
| 119 The loglinear model this feature is used in will specify some types | 267 The loglinear model this feature is used in will specify some types |
| 120 ``X`` and ``Y``, as described in the documentation there. As an | 268 ``X`` and ``Y``, as described in the documentation there. As an |
| 121 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is | 269 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is |
| 122 ``Suspect``. Given those two types, this method is a curried function | 270 ``Suspect``. Given those two types, this method is a curried function |
| 123 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of type | 271 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of |
| 124 ``X``, we return a function of type ``Y -> FeatureValue``, where | 272 type ``X``, we return a function of type ``Y -> FeatureValue``, where |
| 125 the final result for each ``y`` of type ``Y`` is the value of that | 273 the final result for each ``y`` of type ``Y`` is the value of that |
| 126 ``y`` given that ``x``. | 274 ``y`` given that ``x``. |
| 127 | 275 |
| 128 Values closer to zero indicate this feature has less to say about | 276 Values closer to zero indicate this feature has less to say about |
| 129 whether the ``y`` is to be blamed. Values further from zero indicate | 277 whether the ``y`` is to be blamed. Values further from zero indicate |
| 130 that this feature has more to say about it. (Whether this feature | 278 that this feature has more to say about it. (Whether this feature |
| 131 thinks the ``y`` should be blamed or should not be depends on the sign | 279 thinks the ``y`` should be blamed or should not be depends on the sign |
| 132 of the value and the sign of the weight given to this feature.) As | 280 of the value and the sign of the weight given to this feature.) As |
| 133 special cases, a value of negative infinity means "do not blame this | 281 special cases, a value of negative infinity means "do not blame this |
| 134 ``y`` no matter what any other features say", and a value of positive | 282 ``y`` no matter what any other features say", and a value of positive |
| 135 infinity means "definitely blame this ``y`` no matter what any other | 283 infinity means "definitely blame this ``y`` no matter what any other |
| 136 features say". Both of those special values should be used sparingly, | 284 features say". Both of those special values should be used sparingly, |
| 137 since they override the model's ability to combine multiple sources of | 285 since they override the model's ability to combine multiple sources of |
| 138 information and decide the culprit based on all the evidence together. | 286 information and decide the culprit based on all the evidence together. |
| 139 """ | 287 """ |
| 140 raise NotImplementedError() | 288 raise NotImplementedError() |
| 141 | 289 |
| 142 | 290 |
| 143 class FeatureFunction(object): | 291 class MetaFeature(MetaDict): |
| 292 """Abstract base class for meta features used by loglinear models. | |
| 293 | |
| 294 MetaFeature is a dict of (Meta)Features. | |
| 295 """ | |
| 296 | |
| 297 @property | |
| 298 def name(self): | |
| 299 """The name of this feature.""" | |
| 300 raise NotImplementedError() | |
| 301 | |
| 302 def __call__(self, x): | |
| 303 """Returns a value for a ``y`` given some ``x``. | |
| 304 | |
| 305 The loglinear model this feature is used in will specify some types | |
| 306 ``X`` and ``Y``, as described in the documentation there. As an | |
| 307 example: for the CL classifier, ``X`` is ``CrashReport`` and ``Y`` is | |
| 308 ``Suspect``. Given those two types, this method is a curried function | |
| 309 of type ``X -> Y -> FeatureValue``. That is, given some ``x`` of | |
| 310 type ``X``, we return a function of type ``Y -> FeatureValue``, where | |
| 311 the final result for each ``y`` of type ``Y`` is the value of that | |
| 312 ``y`` given that ``x``. | |
| 313 | |
| 314 Values closer to zero indicate this feature has less to say about | |
| 315 whether the ``y`` is to be blamed. Values further from zero indicate | |
| 316 that this feature has more to say about it. (Whether this feature | |
| 317 thinks the ``y`` should be blamed or should not be depends on the sign | |
| 318 of the value and the sign of the weight given to this feature.) As | |
| 319 special cases, a value of negative infinity means "do not blame this | |
| 320 ``y`` no matter what any other features say", and a value of positive | |
| 321 infinity means "definitely blame this ``y`` no matter what any other | |
| 322 features say". Both of those special values should be used sparingly, | |
| 323 since they override the model's ability to combine multiple sources of | |
| 324 information and decide the culprit based on all the evidence together. | |
| 325 """ | |
| 326 raise NotImplementedError() | |
| 327 | |
| 328 | |
| 329 class WrapperMetaFeature(MetaFeature): | |
| 144 """Given a dict of scalar-valued functions, return a dict-valued function. | 330 """Given a dict of scalar-valued functions, return a dict-valued function. |
| 145 | 331 |
| 332 Note, the features that get wrapped should be independent of each other, which | |
| 333 means their feature values can be computed independently. | |
| 334 | |
| 335 Either wrap single Feature or wrap features whose final results are computed | |
| 336 independently. | |
| 337 | |
| 146 Properties: | 338 Properties: |
| 147 fs (iterable of functions): A collection of curried functions | 339 fs (Feature of iterable of (Meta)Features): A collection of curried |
| 148 ``X -> Y -> FeatureValue``. That is, given a particular ``x`` they | 340 functions ``X -> Y -> (Meta)FeatureValue``. That is, given a particular |
| 149 return a function ``Y -> dict(FeatureValue)``. N.B. each function should | 341 ``x`` they return a function ``Y -> dict(FeatureValue)``. N.B. each function |
| 150 have a name property. | 342 should have a name property. |
| 151 """ | 343 """ |
| 152 def __init__(self, fs): | 344 def __init__(self, fs): |
| 153 self._fs = fs | 345 super(WrapperMetaFeature, self).__init__({f.name: f for f in fs or []}) |
| 346 | |
| 347 @property | |
| 348 def name(self): | |
| 349 return 'WrapperFeature' | |
| 154 | 350 |
| 155 def __call__(self, x): | 351 def __call__(self, x): |
| 156 """Function mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue)``. | 352 """Function mapping ``X -> Y -> dict(FeatureValue.name to FeatureValue)``. |
| 157 | 353 |
| 158 Returns: | 354 Returns: |
| 159 A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for | 355 A function ``X -> Y -> dict(FeatureValue.name to FeatureValue)`` where for |
| 160 all ``x``, ``y``, and for a feature f in fs, we have | 356 all ``x``, ``y``, and for a feature f in fs, we have |
| 161 ``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``. | 357 ``FeatureFunction(fs)(x)(y)[f.name] == f(x)(y)``. |
| 162 """ | 358 """ |
| 163 name_to_fx = {f.name: f(x) for f in self._fs} | 359 fxs = {name: f(x) for name, f in self.iteritems()} |
| 164 return lambda y: {name: fx(y) for name, fx in name_to_fx.iteritems()} | 360 return lambda y: MetaFeatureValue( |
| 361 self.name, {name: fx(y) for name, fx in fxs.iteritems()}) | |
| OLD | NEW |