Chromium Code Reviews| Index: appengine/findit/crash/detect_regression_range.py |
| diff --git a/appengine/findit/crash/detect_regression_range.py b/appengine/findit/crash/detect_regression_range.py |
| index deaca24b1d09a466bf4bcf207cff689af4e9c415..c579e762acad0717403ce9d0b4fb11d7274c6d68 100644 |
| --- a/appengine/findit/crash/detect_regression_range.py |
| +++ b/appengine/findit/crash/detect_regression_range.py |
| @@ -4,108 +4,96 @@ |
| import logging |
| -# Maximum number of versions to look back. |
| +# Default value for the maximum number of versions to look back. |
| _MAXIMUM_WINDOW_SIZE = 30 |
| -# Add epsilon to avoid dividing by zero when computing spike score. |
| +# Add epsilon to avoid dividing by zero when computing spikiness. |
| _EPSILON = 0.00000001 |
| -# Default value to control weight of current data when computing spike score. |
| +# Default value to control weight of current data when computing spikiness. |
| _DEFAULT_ALPHA = 0.9 |
| -# Threshold of spike score. |
| -_SPIKENESS_THRESHOLD = 20 |
| +# Threshold for calling something a spike. |
| +_SPIKINESS_THRESHOLD = 20 |
| -# TODO(http://crbug.com/644406): This code needs cleaning/refactoring |
| -def GetSpikeIndexes(data_series, alpha=_DEFAULT_ALPHA, |
| - threshold=_SPIKENESS_THRESHOLD): |
| - """Given a time series, determine anomolous spikes. |
| +# TODO(wrengr): make this streaming, rather than holding onto the whole list. |
| +def GetSpikes(events, get_value, alpha=_DEFAULT_ALPHA, |
| + threshold=_SPIKINESS_THRESHOLD): |
| + """Given a time series, determine the regression ranges for each |
|
Sharu Jiang
2016/09/08 21:33:14
nit: A convention, for long doc string, keep the f
stgao
2016/09/09 18:13:59
nit: one line summary here.
|
| + anomalous spike. |
| - The time series is represented by a list of (x,y) tuples, where the |
| - position of the tuple in the list is the time at which the event |
| - occured, x is an abstract name for that time, and y is the value of |
| - the time series at that point in time. We detect spikes by determining |
| - when the time series diverges too much from the running average (of |
| - the y values). The alpha parameter determines how readily we update |
| - the running average; which is to say, how much weight we give each |
| - new event compared to the weight we give to the current running |
| - average. The threshold parameter defines when we've deviated 'too far' |
| - from the running average, and therefore say that this event is in fact |
| - a spike. |
| + The time series is represented by a list of "events" together with a |
| + function for computing the "value" of each event. The events themselves |
|
stgao
2016/09/09 18:13:58
Information in this paragraph could be merged into
|
| + can be any sort of object (including None), all we care about is their |
| + value according to the valuation function. We assume the list is given |
| + with the events already in order, but otherwise we do not care about |
| + their exact position within the list. We return a list of pairs of |
| + events, where the two events in each pair are adjacent in the original |
| + list. Whenever we determine that the value of some event deviates too |
| + much from the values of previous events, we will return a pair of the |
| + event which deviates too much, together with the immediately previous |
| + event. |
| + |
| + We use exponential smoothing as our model for determining when there |
| + are spikes in the events' values. This model has two parameters: |
| + the alpha parameter determines how readily we update the running |
| + average each time we see a new event, and the threshold parameter |
| + determines when an event's value is considered anomalous enough to |
| + be called a spike. N.B., exponential smoothing is considered a naive |
| + model for analysing time series, because it cannot account for trends; |
| + but since we are only interested in detecting deviation from zero, |
| + this model is good enough for us. |
| Args: |
| - data_series (list): A list of (x, y) tuples where y is a number. |
| - alpha (float): In (0, 1], it controls the weight of current data |
| - when computing the running mean, a higher value has more weight. |
| - threshold (float): Threshold for when something counts as a spike. |
| + events (list): A list of objects. |
| + get_value (callable): a function from event objects to a number. |
| + alpha (float): In (0, 1], controls how we balance between evidence |
| + from new events vs the running average. When alpha=0 we completely |
| + ignore the new event; when alpha=1 we completely ignore the running |
| + average. |
| + threshold (float): Threshold for when something is considered to be |
| + a spike. |
| Returns: |
| - A list of spike indexes (i.e., positions in the data_series list; |
| - not the abstract names associated with those indices) |
| + A list of (e0,e1) pairs denoting the regression range for each |
| + spike. That is, e0 is the event just before the spike, and e1 is the |
| + event just after the spike. If data_series has fewer than 2 elements, |
| + then we return the empty list rather than returning None. |
| """ |
| - spike_indexes = [] |
| + # TODO(wrengr): assert callable(get_value), so we can give a decent |
| + # error message if it's not. |
| - if not data_series or len(data_series) == 1: |
| - return spike_indexes |
| + # TODO(wrengr): would it be better to return None so callers fail fast? |
| + if not events or len(events) == 1: |
| + return [] |
| - logging.info('For data series %s', repr(data_series)) |
| + logging.info('For data series %s', repr(events)) |
| + logging.info('Alpha is %f', alpha) |
| logging.info('Threshold is %f', threshold) |
| - previous_mean = data_series[0][1] |
| - for i, (_, y) in enumerate(data_series[1:]): |
| - current_mean = (1 - alpha) * previous_mean + alpha * y |
| - |
| - # Spike score is basically relative increase. Add epsilon |
| - # to both numerator and denominator to avoid dividing by zero error. |
| - spike_score = ((current_mean - previous_mean + _EPSILON) / |
| - (previous_mean + _EPSILON)) |
| - logging.info('The spike score for data %s is %s', |
| - repr(data_series[i]), spike_score) |
| - if spike_score > threshold: |
| - spike_indexes.append(i + 1) |
| - |
| + # TODO(wrengr): do we care enough about precision to avoid the floating |
| + # point issues of computing the mean in the naive/straightforward way? |
| + spikes = [] |
| + previous_event = events[0] |
| + previous_mean = get_value(previous_event) |
| + for event in events[1:]: |
| + current_value = get_value(event) |
| + current_mean = (1 - alpha) * previous_mean + alpha * current_value |
| + |
| + # Spikiness is basically relative increase. Add epsilon to both |
| + # numerator and denominator to avoid dividing by zero error. |
| + spikiness = ((current_mean - previous_mean + _EPSILON) / |
| + (previous_mean + _EPSILON)) |
| + logging.info('The spikiness of event %s is %s', |
| + repr(event), spikiness) |
|
stgao
2016/09/09 18:13:58
nit: could it fix in one line?
|
| + if spikiness > threshold: |
| + spikes.append((previous_event, event)) |
| + |
| + previous_event = event |
| previous_mean = current_mean |
| - return spike_indexes |
| - |
| - |
| -def GetRegressionRangeFromSpike(spike_index, versions): |
| - """Get the regression range based on spike_index and versions.""" |
| - if spike_index < 1 or spike_index >= len(versions): |
| - return None |
| - |
| - return (versions[spike_index - 1], versions[spike_index]) |
| - |
| - |
| -def GetAttributesListFromHistoricData(historic_metadata, attributes): |
| - """Returns a list of attributes from historic_metadata. |
| - |
| - Args: |
| - historic_metadata (list): A list of dict of metadata, for example: |
| - [{'chrome_version': '1', 'cpm': 0}, {'chrome_version': '2', 'cpm': 0}] |
| - attributes (list): A list of attribute names. |
| - |
| - Returns: |
| - A list of strs(attributes has only 1 element), or tuples |
| - """ |
| - if not attributes: |
| - return [] |
| - |
| - attributes_list = [] |
| - |
| - for data in historic_metadata: |
| - attributes_entry = [] |
| - |
| - for attribute in attributes: |
| - attributes_entry.append(data[attribute]) |
| - |
| - if len(attributes_entry) == 1: |
| - attributes_list.append(attributes_entry[0]) |
| - else: |
| - attributes_list.append(tuple(attributes_entry)) |
| - |
| - return attributes_list |
| + return spikes |
| def DetectRegressionRange(historic_metadata, max_win_size=_MAXIMUM_WINDOW_SIZE): |
| @@ -127,16 +115,14 @@ def DetectRegressionRange(historic_metadata, max_win_size=_MAXIMUM_WINDOW_SIZE): |
| # Truncate the historic data so we only analyze data for max_win_size of |
| # latest versions. |
| - versions_to_cpm = GetAttributesListFromHistoricData( |
| - historic_metadata[-max_win_size:], ['chrome_version', 'cpm']) |
| - |
| - versions, _ = zip(*versions_to_cpm) |
| - spike_indexes = GetSpikeIndexes(versions_to_cpm) |
| + spikes = GetSpikes(historic_metadata[-max_win_size:], lambda x: x['cpm']) |
| - if not spike_indexes: |
| + if not spikes: |
| logging.warning('Failed to find spikes in history data %s' % repr( |
| historic_metadata)) |
| return None |
| - # Only return the latest regression range. |
| - return GetRegressionRangeFromSpike(spike_indexes[-1], versions) |
| + # Only return the last/most-recent regression range. |
| + last_good, first_bad = spikes[-1] |
| + return last_good['chrome_version'], first_bad['chrome_version'] |
| + |