| Index: dashboard/dashboard/pinpoint/mann_whitney_u.py
|
| diff --git a/dashboard/dashboard/pinpoint/mann_whitney_u.py b/dashboard/dashboard/pinpoint/mann_whitney_u.py
|
| deleted file mode 100644
|
| index 770f5aadd4370e91552fb9d2e2dce2be5e11566f..0000000000000000000000000000000000000000
|
| --- a/dashboard/dashboard/pinpoint/mann_whitney_u.py
|
| +++ /dev/null
|
| @@ -1,90 +0,0 @@
|
| -# Copyright 2016 The Chromium Authors. All rights reserved.
|
| -# Use of this source code is governed by a BSD-style license that can be
|
| -# found in the LICENSE file.
|
| -
|
| -"""Pure Python implementation of the Mann-Whitney U test.
|
| -
|
| -This code is adapted from SciPy:
|
| - https://github.com/scipy/scipy/blob/master/scipy/stats/stats.py
|
| -Which is provided under a BSD-style license.
|
| -
|
| -There is also a JavaScript version in Catapult:
|
| - https://github.com/catapult-project/catapult/blob/master/tracing/third_party/mannwhitneyu/mannwhitneyu.js
|
| -"""
|
| -
|
| -import itertools
|
| -import math
|
| -
|
| -
|
| -def MannWhitneyU(x, y):
|
| - """Computes the Mann-Whitney rank test on samples x and y.
|
| -
|
| - The distribution of U is approximately normal for large samples. This
|
| - implementation uses the normal approximation, so it's recommended to have
|
| - sample sizes > 20.
|
| - """
|
| - n1 = len(x)
|
| - n2 = len(y)
|
| - ranked = _RankData(x + y)
|
| - rankx = ranked[0:n1] # get the x-ranks
|
| - u1 = n1*n2 + n1*(n1+1)/2.0 - sum(rankx) # calc U for x
|
| - u2 = n1*n2 - u1 # remainder is U for y
|
| - t = _TieCorrectionFactor(ranked)
|
| - if t == 0:
|
| - raise ValueError('All numbers are identical in mannwhitneyu')
|
| - sd = math.sqrt(t * n1 * n2 * (n1+n2+1) / 12.0)
|
| -
|
| - mean_rank = n1*n2/2.0 + 0.5
|
| - big_u = max(u1, u2)
|
| -
|
| - z = (big_u - mean_rank) / sd
|
| - return 2 * _NormSf(abs(z))
|
| -
|
| -
|
| -def _RankData(a):
|
| - """Assigns ranks to data. Ties are given the mean of the ranks of the items.
|
| -
|
| - This is called "fractional ranking":
|
| - https://en.wikipedia.org/wiki/Ranking
|
| - """
|
| - sorter = _ArgSortReverse(a)
|
| - ranked_min = [0] * len(sorter)
|
| - for i, j in reversed(list(enumerate(sorter))):
|
| - ranked_min[j] = i
|
| -
|
| - sorter = _ArgSort(a)
|
| - ranked_max = [0] * len(sorter)
|
| - for i, j in enumerate(sorter):
|
| - ranked_max[j] = i
|
| -
|
| - return [1 + (x+y)/2.0 for x, y in zip(ranked_min, ranked_max)]
|
| -
|
| -
|
| -def _ArgSort(a):
|
| - """Returns the indices that would sort an array.
|
| -
|
| - Ties are given indices in ordinal order."""
|
| - return sorted(range(len(a)), key=a.__getitem__)
|
| -
|
| -
|
| -def _ArgSortReverse(a):
|
| - """Returns the indices that would sort an array.
|
| -
|
| - Ties are given indices in reverse ordinal order."""
|
| - return list(reversed(sorted(range(len(a)), key=a.__getitem__, reverse=True)))
|
| -
|
| -
|
| -def _TieCorrectionFactor(rankvals):
|
| - """Tie correction factor for ties in the Mann-Whitney U test."""
|
| - arr = sorted(rankvals)
|
| - cnt = [len(list(group)) for _, group in itertools.groupby(arr)]
|
| - size = len(arr)
|
| - if size < 2:
|
| - return 1.0
|
| - else:
|
| - return 1.0 - sum(x**3 - x for x in cnt) / float(size**3 - size)
|
| -
|
| -
|
| -def _NormSf(x):
|
| - """Survival function of the standard normal distribution. (1 - cdf)"""
|
| - return (1 - math.erf(x/math.sqrt(2))) / 2
|
|
|