| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| | 3 import argparse |
| | 4 import numpy |
| 3 import sys | 5 import sys |
| 4 from scipy.stats import mannwhitneyu | 6 from scipy.stats import mannwhitneyu |
| | 7 from scipy.stats import sem |
| 5 | 8 |
| 6 SIGNIFICANCE_THRESHOLD = 0.0001 | 9 SIGNIFICANCE_THRESHOLD = 0.0001 |
| 7 | 10 |
| | 11 parser = argparse.ArgumentParser( |
| | 12     formatter_class=argparse.RawDescriptionHelpFormatter, |
| | 13     description='Compare performance of two runs from nanobench.') |
| | 14 parser.add_argument('--use_means', action='store_true', default=False, |
| | 15                     help='Use means to calculate performance ratios.') |
| | 16 parser.add_argument('baseline', help='Baseline file.') |
| | 17 parser.add_argument('experiment', help='Experiment file.') |
| | 18 args = parser.parse_args() |
| | 19 |
| 8 a,b = {},{} | 20 a,b = {},{} |
| 9 for (path, d) in [(sys.argv[1], a), (sys.argv[2], b)]: | 21 for (path, d) in [(args.baseline, a), (args.experiment, b)]: |
| 10     for line in open(path): | 22     for line in open(path): |
| 11         try: | 23         try: |
| 12             tokens = line.split() | 24             tokens = line.split() |
| 13             if tokens[0] != "Samples:": | 25             if tokens[0] != "Samples:": |
| 14                 continue | 26                 continue |
| 15             samples = tokens[1:-1] | 27             samples = tokens[1:-1] |
| 16             label = tokens[-1] | 28             label = tokens[-1] |
| 17             d[label] = map(float, samples) | 29             d[label] = map(float, samples) |
| 18         except: | 30         except: |
| 19             pass | 31             pass |
| 20 | 32 |
| 21 common = set(a.keys()).intersection(b.keys()) | 33 common = set(a.keys()).intersection(b.keys()) |
| 22 | 34 |
| 23 ps = [] | 35 ps = [] |
| 24 for key in common: | 36 for key in common: |
| 25     _, p = mannwhitneyu(a[key], b[key])  # Non-parametric analogue of the t-test; doesn't assume a normal dist. | 37     _, p = mannwhitneyu(a[key], b[key])  # Non-parametric analogue of the t-test; doesn't assume a normal dist. |
| 26     am, bm = min(a[key]), min(b[key]) | 38     if args.use_means: |
| 27     ps.append((bm/am, p, key, am, bm)) | 39         am, bm = numpy.mean(a[key]), numpy.mean(b[key]) |
| | 40         asem, bsem = sem(a[key]), sem(b[key]) |
| | 41     else: |
| | 42         am, bm = min(a[key]), min(b[key]) |
| | 43         asem, bsem = 0, 0 |
| | 44     ps.append((bm/am, p, key, am, bm, asem, bsem)) |
| 28 ps.sort(reverse=True) | 45 ps.sort(reverse=True) |
| 29 | 46 |
| 30 def humanize(ns): | 47 def humanize(ns): |
| 31     for threshold, suffix in [(1e9, 's'), (1e6, 'ms'), (1e3, 'us'), (1e0, 'ns')]: | 48     for threshold, suffix in [(1e9, 's'), (1e6, 'ms'), (1e3, 'us'), (1e0, 'ns')]: |
| 32         if ns > threshold: | 49         if ns > threshold: |
| 33             return "%.3g%s" % (ns/threshold, suffix) | 50             return "%.3g%s" % (ns/threshold, suffix) |
| 34 | 51 |
| 35 maxlen = max(map(len, common)) | 52 maxlen = max(map(len, common)) |
| 36 | 53 |
| 37 # We print only significant changes in benchmark timing distribution. | 54 # We print only significant changes in benchmark timing distribution. |
| 38 bonferroni = SIGNIFICANCE_THRESHOLD / len(ps)  # Adjust for the fact we've run multiple tests. | 55 bonferroni = SIGNIFICANCE_THRESHOLD / len(ps)  # Adjust for the fact we've run multiple tests. |
| 39 for ratio, p, key, am, bm in ps: | 56 for ratio, p, key, am, bm, asem, bsem in ps: |
| 40     if p < bonferroni: | 57     if p < bonferroni: |
| 41         str_ratio = ('%.2gx' if ratio < 1 else '%.3gx') % ratio | 58         str_ratio = ('%.2gx' if ratio < 1 else '%.3gx') % ratio |
| 42         print '%*s\t%6s -> %6s\t%s' % (maxlen, key, humanize(am), humanize(bm), str_ratio) | 59         if args.use_means: |
| | 60             print '%*s\t%6s(%6s) -> %6s(%6s)\t%s' % (maxlen, key, humanize(am), humanize(asem), |
| | 61                 humanize(bm), humanize(bsem), str_ratio) |
| | 62         else: |
| | 63             print '%*s\t%6s -> %6s\t%s' % (maxlen, key, humanize(am), humanize(bm), str_ratio) |
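With the argparse change, the script takes the two nanobench output files as named positional arguments instead of raw sys.argv indexing. A hypothetical invocation, assuming the script is saved as compare.py (the filename and input paths are placeholders, not from the diff):

    python compare.py baseline.txt experiment.txt              # ratios from per-benchmark minimums
    python compare.py --use_means baseline.txt experiment.txt  # ratios from means, with SEM shown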
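The parsing loop keys off lines whose first token is "Samples:", reading every token up to the last as a float sample and the final token as the benchmark label; anything that fails to parse is silently skipped. An illustrative input line (the numbers and benchmark name are invented):

    Samples: 120.5 118.9 119.7 121.0 my_benchmark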
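The statistical core is untouched by this change but easy to miss in the diff: one Mann-Whitney U test per benchmark, with the significance threshold divided by the number of tests (a Bonferroni correction) so that running many tests doesn't inflate the false-positive rate. A minimal, self-contained sketch of that logic, with invented sample data and an arbitrary test count, mirroring the script's Python 2 / SciPy usage:

    import numpy
    from scipy.stats import mannwhitneyu, sem

    # Invented timing samples (ns) for one benchmark across two runs.
    baseline   = [105.0, 101.0,  99.0, 103.0, 100.0, 102.0, 104.0]
    experiment = [ 91.0,  90.0,  93.0,  89.0,  92.0,  90.5,  91.5]

    # Mann-Whitney U compares the two sample distributions without
    # assuming they are normal.
    _, p = mannwhitneyu(baseline, experiment)

    # If we test N benchmarks at threshold t, roughly N*t of them would
    # look "significant" by chance alone; dividing t by N (Bonferroni)
    # keeps the family-wise error rate near t.
    n_benchmarks = 50  # hypothetical count of benchmarks common to both runs
    bonferroni = 0.0001 / n_benchmarks

    verdict = 'significant' if p < bonferroni else 'not significant'
    print '%s: p=%g (threshold %g), mean %.4g -> %.4g (sem %.2g -> %.2g)' % (
        verdict, p, bonferroni, numpy.mean(baseline), numpy.mean(experiment),
        sem(baseline), sem(experiment))

Note that with only a handful of samples per benchmark, the Mann-Whitney p-value has a floor well above such a strict threshold; the script relies on nanobench collecting enough samples per benchmark for real regressions to clear it.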