OLD | NEW |
| (Empty) |
1 # Copyright 2013 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """A collection of statistical utility functions to be used by metrics.""" | |
6 | |
7 import bisect | |
8 import math | |
9 | |
10 | |
11 def Clamp(value, low=0.0, high=1.0): | |
12 """Clamp a value between some low and high value.""" | |
13 return min(max(value, low), high) | |
14 | |
15 | |
16 def NormalizeSamples(samples): | |
17 """Sorts the samples, and map them linearly to the range [0,1]. | |
18 | |
19 They're mapped such that for the N samples, the first sample is 0.5/N and the | |
20 last sample is (N-0.5)/N. | |
21 | |
22 Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N, | |
23 twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case | |
24 we don't want to distinguish between these two cases, as our original domain | |
25 is not bounded (it is for Monte Carlo integration, where discrepancy was | |
26 first used). | |
27 """ | |
28 if not samples: | |
29 return samples, 1.0 | |
30 samples = sorted(samples) | |
31 low = min(samples) | |
32 high = max(samples) | |
33 new_low = 0.5 / len(samples) | |
34 new_high = (len(samples)-0.5) / len(samples) | |
35 if high-low == 0.0: | |
36 return samples, 1.0 | |
37 scale = (new_high - new_low) / (high - low) | |
38 for i in xrange(0, len(samples)): | |
39 samples[i] = float(samples[i] - low) * scale + new_low | |
40 return samples, scale | |
41 | |
42 | |
43 def Discrepancy(samples, interval_multiplier=10000): | |
44 """Computes the discrepancy of a set of 1D samples from the interval [0,1]. | |
45 | |
46 The samples must be sorted. | |
47 | |
48 http://en.wikipedia.org/wiki/Low-discrepancy_sequence | |
49 http://mathworld.wolfram.com/Discrepancy.html | |
50 """ | |
51 if not samples: | |
52 return 1.0 | |
53 | |
54 max_local_discrepancy = 0 | |
55 locations = [] | |
56 # For each location, stores the number of samples less than that location. | |
57 left = [] | |
58 # For each location, stores the number of samples less than or equal to that | |
59 # location. | |
60 right = [] | |
61 | |
62 interval_count = len(samples) * interval_multiplier | |
63 # Compute number of locations the will roughly result in the requested number | |
64 # of intervals. | |
65 location_count = int(math.ceil(math.sqrt(interval_count*2))) | |
66 inv_sample_count = 1.0 / len(samples) | |
67 | |
68 # Generate list of equally spaced locations. | |
69 for i in xrange(0, location_count): | |
70 location = float(i) / (location_count-1) | |
71 locations.append(location) | |
72 left.append(bisect.bisect_left(samples, location)) | |
73 right.append(bisect.bisect_right(samples, location)) | |
74 | |
75 # Iterate over the intervals defined by any pair of locations. | |
76 for i in xrange(0, len(locations)): | |
77 for j in xrange(i, len(locations)): | |
78 # Compute length of interval and number of samples in the interval. | |
79 length = locations[j] - locations[i] | |
80 count = right[j] - left[i] | |
81 | |
82 # Compute local discrepancy and update max_local_discrepancy. | |
83 local_discrepancy = abs(float(count)*inv_sample_count - length) | |
84 max_local_discrepancy = max(local_discrepancy, max_local_discrepancy) | |
85 | |
86 return max_local_discrepancy | |
87 | |
88 | |
89 def TimestampsDiscrepancy(timestamps, absolute=True, | |
90 interval_multiplier=10000): | |
91 """A discrepancy based metric for measuring timestamp jank. | |
92 | |
93 TimestampsDiscrepancy quantifies the largest area of jank observed in a series | |
94 of timestamps. Note that this is different from metrics based on the | |
95 max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and | |
96 B = [0,1,2,3,5,7] have the same max_time_interval = 2, but | |
97 Discrepancy(B) > Discrepancy(A). | |
98 | |
99 Two variants of discrepancy can be computed: | |
100 | |
101 Relative discrepancy is following the original definition of | |
102 discrepancy. It characterized the largest area of jank, relative to the | |
103 duration of the entire time stamp series. We normalize the raw results, | |
104 because the best case discrepancy for a set of N samples is 1/N (for | |
105 equally spaced samples), and we want our metric to report 0.0 in that | |
106 case. | |
107 | |
108 Absolute discrepancy also characterizes the largest area of jank, but its | |
109 value wouldn't change (except for imprecisions due to a low | |
110 |interval_multiplier|) if additional 'good' intervals were added to an | |
111 exisiting list of time stamps. Its range is [0,inf] and the unit is | |
112 milliseconds. | |
113 | |
114 The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same | |
115 absolute discrepancy, but D has lower relative discrepancy than C. | |
116 | |
117 |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each | |
118 S_i is a time stamp series. In that case, the discrepancy D(S) is: | |
119 D(S) = max(D(S_1), D(S_2), ..., D(S_N)) | |
120 """ | |
121 if not timestamps: | |
122 return 1.0 | |
123 | |
124 if isinstance(timestamps[0], list): | |
125 range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps] | |
126 return max(range_discrepancies) | |
127 | |
128 samples, sample_scale = NormalizeSamples(timestamps) | |
129 discrepancy = Discrepancy(samples, interval_multiplier) | |
130 inv_sample_count = 1.0 / len(samples) | |
131 if absolute: | |
132 # Compute absolute discrepancy | |
133 discrepancy /= sample_scale | |
134 else: | |
135 # Compute relative discrepancy | |
136 discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count)) | |
137 return discrepancy | |
138 | |
139 | |
140 def DurationsDiscrepancy(durations, absolute=True, | |
141 interval_multiplier=10000): | |
142 """A discrepancy based metric for measuring duration jank. | |
143 | |
144 DurationsDiscrepancy computes a jank metric which measures how irregular a | |
145 given sequence of intervals is. In order to minimize jank, each duration | |
146 should be equally long. This is similar to how timestamp jank works, | |
147 and we therefore reuse the timestamp discrepancy function above to compute a | |
148 similar duration discrepancy number. | |
149 | |
150 Because timestamp discrepancy is defined in terms of timestamps, we first | |
151 convert the list of durations to monotonically increasing timestamps. | |
152 | |
153 Args: | |
154 durations: List of interval lengths in milliseconds. | |
155 absolute: See TimestampsDiscrepancy. | |
156 interval_multiplier: See TimestampsDiscrepancy. | |
157 """ | |
158 timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0])[1:] | |
159 return TimestampsDiscrepancy(timestamps, absolute, interval_multiplier) | |
160 | |
161 | |
162 def ArithmeticMean(numerator, denominator): | |
163 """Calculates arithmetic mean. | |
164 | |
165 Both numerator and denominator can be given as either individual | |
166 values or lists of values which will be summed. | |
167 | |
168 Args: | |
169 numerator: A quantity that represents a sum total value. | |
170 denominator: A quantity that represents a count of the number of things. | |
171 | |
172 Returns: | |
173 The arithmetic mean value, or 0 if the denominator value was 0. | |
174 """ | |
175 numerator_total = Total(numerator) | |
176 denominator_total = Total(denominator) | |
177 return DivideIfPossibleOrZero(numerator_total, denominator_total) | |
178 | |
179 | |
180 def Total(data): | |
181 """Returns the float value of a number or the sum of a list.""" | |
182 if type(data) == float: | |
183 total = data | |
184 elif type(data) == int: | |
185 total = float(data) | |
186 elif type(data) == list: | |
187 total = float(sum(data)) | |
188 else: | |
189 raise TypeError | |
190 return total | |
191 | |
192 | |
193 def DivideIfPossibleOrZero(numerator, denominator): | |
194 """Returns the quotient, or zero if the denominator is zero.""" | |
195 if not denominator: | |
196 return 0.0 | |
197 else: | |
198 return numerator / denominator | |
199 | |
200 | |
201 def GeneralizedMean(values, exponent): | |
202 """See http://en.wikipedia.org/wiki/Generalized_mean""" | |
203 if not values: | |
204 return 0.0 | |
205 sum_of_powers = 0.0 | |
206 for v in values: | |
207 sum_of_powers += v ** exponent | |
208 return (sum_of_powers / len(values)) ** (1.0/exponent) | |
209 | |
210 | |
211 def Median(values): | |
212 """Gets the median of a list of values.""" | |
213 return Percentile(values, 50) | |
214 | |
215 | |
216 def Percentile(values, percentile): | |
217 """Calculates the value below which a given percentage of values fall. | |
218 | |
219 For example, if 17% of the values are less than 5.0, then 5.0 is the 17th | |
220 percentile for this set of values. When the percentage doesn't exactly | |
221 match a rank in the list of values, the percentile is computed using linear | |
222 interpolation between closest ranks. | |
223 | |
224 Args: | |
225 values: A list of numerical values. | |
226 percentile: A number between 0 and 100. | |
227 | |
228 Returns: | |
229 The Nth percentile for the list of values, where N is the given percentage. | |
230 """ | |
231 if not values: | |
232 return 0.0 | |
233 sorted_values = sorted(values) | |
234 n = len(values) | |
235 percentile /= 100.0 | |
236 if percentile <= 0.5 / n: | |
237 return sorted_values[0] | |
238 elif percentile >= (n - 0.5) / n: | |
239 return sorted_values[-1] | |
240 else: | |
241 floor_index = int(math.floor(n * percentile - 0.5)) | |
242 floor_value = sorted_values[floor_index] | |
243 ceil_value = sorted_values[floor_index+1] | |
244 alpha = n * percentile - 0.5 - floor_index | |
245 return floor_value + alpha * (ceil_value - floor_value) | |
246 | |
247 | |
248 def GeometricMean(values): | |
249 """Compute a rounded geometric mean from an array of values.""" | |
250 if not values: | |
251 return None | |
252 # To avoid infinite value errors, make sure no value is less than 0.001. | |
253 new_values = [] | |
254 for value in values: | |
255 if value > 0.001: | |
256 new_values.append(value) | |
257 else: | |
258 new_values.append(0.001) | |
259 # Compute the sum of the log of the values. | |
260 log_sum = sum(map(math.log, new_values)) | |
261 # Raise e to that sum over the number of values. | |
262 mean = math.pow(math.e, (log_sum / len(new_values))) | |
263 # Return the rounded mean. | |
264 return int(round(mean)) | |
265 | |
OLD | NEW |