Chromium Code Reviews| Index: appengine/findit/crash/detect_regression_range.py |
| diff --git a/appengine/findit/crash/detect_regression_range.py b/appengine/findit/crash/detect_regression_range.py |
| index 1c50c2700d1e34701efbe9239a122cf20a41bfc7..219edc51f3a59b34545c156cab03385fb9aefdc6 100644 |
| --- a/appengine/findit/crash/detect_regression_range.py |
| +++ b/appengine/findit/crash/detect_regression_range.py |
| @@ -18,30 +18,36 @@ _SPIKINESS_THRESHOLD = 20 |
| # TODO(wrengr): make this streaming, rather than holding onto the whole list. |
| -def GetSpikes(data_series, alpha=_DEFAULT_ALPHA, |
| +def GetSpikes(events, get_value, alpha=_DEFAULT_ALPHA, |
| threshold=_SPIKINESS_THRESHOLD): |
| """Given a time series, determine the regression ranges for each |
| anomolous spike. |
| - The time series is represented by a list of (x,y) tuples, where each |
| - tuple denotes an event, the position of the tuple in the list indicates |
| - the relative ordering of events, x is an abstract name for the point |
| - in time the event occured, and y is the value associated with that |
| - event. N.B., this function doesn't care at all about what the x values |
| - actually look like; we just pass them around as abstract values, |
| - thus they can be whatever the caller wants. |
| - |
| - We use exponential smoothing as our model for determining whether |
| - an event is a spike or not. This model has two parameters: the alpha |
| - parameter determines how readily we update the running average for each |
| - new event, and the threshold parameter determines when an event is |
| - considered anomolous enough to be called a spike. N.B., exponential |
| - smoothing is considered a naive model for analysing time series, |
| - because it cannot account for trends; but since we are only interested |
| - in detecting deviation from zero, this model is good enough for us. |
| + The time series is represented by a list of "events" together with a |
| + function for computing the "value" of each event. The events themselves |
| + can be any sort of object (including None), all we care about is their |
| + value according to the valuation function. We assume the list is given |
| + with the events already in order, but otherwise we do not care about |
| + their exact position within the list. We return a list of pairs of |
| + events, where the two events in each pair are ajacent in the original |
| + list. Whenever we determine that the value of some event deviates too |
| + much from the values of previous events, we will return a pair of the |
| + event which deviates too much, together with the immediately previous |
| + event. |
| + |
| + We use exponential smoothing as our model for determining when there |
| + are spikes in the events' values. This model has two parameters: |
| + the alpha parameter determines how readily we update the running |
| + average each time we see a new event, and the threshold parameter |
| + determines when an event's value is considered anomolous enough to |
| + be called a spike. N.B., exponential smoothing is considered a naive |
| + model for analysing time series, because it cannot account for trends; |
| + but since we are only interested in detecting deviation from zero, |
| + this model is good enough for us. |
| Args: |
| - data_series (list): A list of (x, y) tuples where y is a number. |
| + events (list): A list of objects. |
| + get_value (callable): a function from event objects to a number. |
| alpha (float): In (0, 1], controls how we balance between evidence |
| from new events vs the running average. When alpha=0 we completely |
| ignore the new event; when alpha=1 we completely ignore the running |
| @@ -50,43 +56,42 @@ def GetSpikes(data_series, alpha=_DEFAULT_ALPHA, |
| a spike. |
| Returns: |
| - A list of (y0,y1) pairs denoting the regression range for each |
| - spike. That is, y0 is the abstract name for the point in time just |
| - before the spike, and y1 is the abstract name for the point in time |
| - just after the spike. If data_series has fewer than 2 elements, |
| + A list of (e0,e1) pairs denoting the regression range for each |
| + spike. That is, e0 is the event just before the spike, and e1 is the |
| + event just after the spike. If data_series has fewer than 2 elements, |
| then we return the empty list rather than returning None. |
| """ |
| + # TODO(wrengr): assert callable(get_value), so we can give a decent |
| + # error message if it's not. |
| + |
| # TODO(wrengr): would it be better to return None so callers fail fast? |
| - if not data_series or len(data_series) == 1: |
| + if not events or len(events) == 1: |
| return [] |
| - logging.info('For data series %s', repr(data_series)) |
| + logging.info('For data series %s', repr(events)) |
| logging.info('Alpha is %f', alpha) |
| logging.info('Threshold is %f', threshold) |
| # TODO(wrengr): do we care enough about precision to avoid the floating |
| # point issues with computing things the naive way? |
| spikes = [] |
| - previous_x, previous_mean = data_series[0] |
| - for event in data_series[1:]: |
| - x, y = event |
| - |
| - current_mean = (1 - alpha) * previous_mean + alpha * y |
| + previous_event = events[0] |
| + previous_mean = get_value(previous_event) |
| + for event in events[1:]: |
| + current_value = get_value(event) |
| + current_mean = (1 - alpha) * previous_mean + alpha * current_value |
| # Spikiness is basically relative increase. Add epsilon to both |
| # numerator and denominator to avoid dividing by zero error. |
| spikiness = ((current_mean - previous_mean + _EPSILON) / |
| (previous_mean + _EPSILON)) |
| - logging.info('The spikiness of data %s is %s', |
| + logging.info('The spikiness of event %s is %s', |
| repr(event), spikiness) |
| if spikiness > threshold: |
| - spikes.append((previous_x,x)) |
| + spikes.append((previous_event, event)) |
| + previous_event = event |
| previous_mean = current_mean |
| - previous_x = x |
| - |
| - # Because we don't know the x for the next event after the final one, |
| - # we ignore spikes at the very end of the series. |
| return spikes |
| @@ -138,12 +143,16 @@ def DetectRegressionRange(historic_metadata, max_win_size=_MAXIMUM_WINDOW_SIZE): |
| if not historic_metadata: |
| return None |
| + # TODO(http://crbug.com/644406): we should be able to remove this code |
| + # for converting historic_metadata to pairs now. Just need to update |
| + # the unit tests to match. |
| + |
| # Truncate the historic data so we only analyze data for max_win_size of |
| # latest versions. |
| versions_to_cpm = GetAttributesListFromHistoricData( |
| historic_metadata[-max_win_size:], ['chrome_version', 'cpm']) |
|
Sharu Jiang
2016/09/08 00:48:24
We don't need this anymore.
We can just use:
spik
wrengr (wrong one)
2016/09/08 18:12:13
Right. I just hadn't gotten around to deleting the
Sharu Jiang
2016/09/08 18:26:08
Acknowledged.
|
| - spikes = GetSpikes(versions_to_cpm) |
| + spikes = GetSpikes(versions_to_cpm, lambda x: x[1]) |
| if not spikes: |
| logging.warning('Failed to find spikes in history data %s' % repr( |
| @@ -151,5 +160,5 @@ def DetectRegressionRange(historic_metadata, max_win_size=_MAXIMUM_WINDOW_SIZE): |
| return None |
| # Only return the latest regression range. |
| - return spikes[-1] |
| + return spikes[-1][0] |
|
Sharu Jiang
2016/09/08 00:48:24
I think this should be
return spikes[-1][0]['chro
wrengr (wrong one)
2016/09/08 18:12:13
Do you mean you want DetectRegressionRange to (con
Sharu Jiang
2016/09/08 18:26:08
The final result of DetectRegressionRange should s
|