appengine/findit/crash/loglinear/model.py - Issue 2560723005: Implementing a new LogLinearModel-based CL classifier

Side by Side Diff: appengine/findit/crash/loglinear/model.py

Issue 2560723005: Implementing a new LogLinearModel-based CL classifier (Closed)

Patch Set: rebase Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « appengine/findit/crash/loglinear/changelist_classifier.py ('k') | appengine/findit/crash/loglinear/test/changelist_classifier_test.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # Copyright 2016 The Chromium Authors. All rights reserved.	1 # Copyright 2016 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 # TODO(http://crbug.com/669639): there are lots of ways to make the code	5 # TODO(http://crbug.com/669639): there are lots of ways to make the code

6 # in this file better. We avoid having separate todos per task; instead	6 # in this file better. We avoid having separate todos per task; instead

7 # see that meta-issue ticket.	7 # see that meta-issue ticket.

8	8

9 import math	9 import math

10 import numpy as np	10 import numpy as np

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
77 """	77 """

78 if epsilon is None:	78 if epsilon is None:

79 epsilon = EPSILON	79 epsilon = EPSILON

80 self._weights = np.array([	80 self._weights = np.array([

81 w if isinstance(w, float) and math.fabs(w) >= epsilon else 0.	81 w if isinstance(w, float) and math.fabs(w) >= epsilon else 0.

82 for w in weights])	82 for w in weights])

83	83

84 self._quadrance = None	84 self._quadrance = None

85	85

86 # TODO(crbug.com/674752): we need better names for ``self._features``.	86 # TODO(crbug.com/674752): we need better names for ``self._features``.

87 def _FeaturesMemoizedOnY(x):	87 def _Features(x):

	88 """Wrap ``feature_function`` to memoize things and ensure types.

	89

	90 This outer wrapping takes each ``x`` to a memoized instance of

	91 ``_FeaturesGivenX``. That is, for each ``x`` we return a

	92 ``MemoizedFunction`` from ``Y`` to ``list(FeatureValue)``.

	93 """

88 fx = feature_function(x)	94 fx = feature_function(x)

89 def _TypeCheckFeatures(y):	95 def _FeaturesGivenX(y):

	96 """Wrap ``feature_function(x)`` to ensure appropriate types.

	97

	98 This inner wrapper ensures that the resulting ``FeatureValue``

	99 array has the same length as the weight covector.

	100 """

90 fxy = fx(y)	101 fxy = fx(y)

91 # N.B., we're assuming that ``len(self.weights)`` is O(1).	102 # N.B., we're assuming that ``len(self.weights)`` is O(1).

92 assert len(fxy) == len(self.weights), TypeError(	103 assert len(fxy) == len(self.weights), TypeError(

93 "vector length mismatch: %d != %d" % (len(fxy), len(self.weights)))	104 "vector length mismatch: %d != %d" % (len(fxy), len(self.weights)))

94 return fxy	105 return fxy

95 return MemoizedFunction(_TypeCheckFeatures)	106

96 self._features = MemoizedFunction(_FeaturesMemoizedOnY)	107 # Memoize on ``Y``, to ensure we don't need to recompute

	108 # ``FeatureValue``s nor recheck the lengths.

	109 return MemoizedFunction(_FeaturesGivenX)

	110

	111 # Memoize on ``X``, to ensure we share the memo tables on ``Y``.

	112 self._features = MemoizedFunction(_Features)

97	113

98 # TODO(crbug.com/674752): we need better names for ``self._scores``.	114 # TODO(crbug.com/674752): we need better names for ``self._scores``.

99 # N.B., this is just the inner product of ``self.weights``	115 # N.B., this is just the inner product of ``self.weights``

100 # against ``self._features(x)``. If we can compute this in some	116 # against ``self._features(x)``. If we can compute this in some

101 # more efficient way, we should. In particular, we will want to	117 # more efficient way, we should. In particular, we will want to

102 # make the weights sparse, in which case we need to use a sparse	118 # make the weights sparse, in which case we need to use a sparse

103 # variant of the dot product.	119 # variant of the dot product.

104 self._scores = MemoizedFunction(lambda x:	120 self._scores = MemoizedFunction(lambda x:

105 self._features(x).map(lambda fxy:	121 self._features(x).map(lambda fxy:

106 self.weights.dot(np.array(map(lambda feature:	122 self.weights.dot(np.array([feature.value for feature in fxy]))))

107 feature.value, fxy)))))

108	123

109 def ClearWeightBasedMemos(self):	124 def ClearWeightBasedMemos(self):

110 """Clear all the memos that depend on the weight covector."""	125 """Clear all the memos that depend on the weight covector."""

111 self._quadrance = None	126 self._quadrance = None

112 self._scores.ClearMemos()	127 self._scores.ClearMemos()

113	128

114 def ClearAllMemos(self):	129 def ClearAllMemos(self):

115 """Clear all memos, even those independent of the weight covector."""	130 """Clear all memos, even those independent of the weight covector."""

116 self.ClearWeightBasedMemos()	131 self.ClearWeightBasedMemos()

117 self._features.ClearMemos()	132 self._features.ClearMemos()

(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after
298 function returns; rather, it's a sort of average of all the results	313 function returns; rather, it's a sort of average of all the results

299 returned. For more information you can take a look at Wikipedia	314 returned. For more information you can take a look at Wikipedia

300 <https://en.wikipedia.org/wiki/Expected_value>.	315 <https://en.wikipedia.org/wiki/Expected_value>.

301 """	316 """

302 prob_given_x = self.Probability(x)	317 prob_given_x = self.Probability(x)

303 # N.B., the ``*`` below is vector scaling! If we want to make this	318 # N.B., the ``*`` below is vector scaling! If we want to make this

304 # method polymorphic in the return type of ``f`` then we'll need an	319 # method polymorphic in the return type of ``f`` then we'll need an

305 # API that provides both scaling and ``vsum``.	320 # API that provides both scaling and ``vsum``.

306 return vsum([prob_given_x(y) * f(y) for y in self._Y])	321 return vsum([prob_given_x(y) * f(y) for y in self._Y])

307	322

OLD	NEW