cluster_telemetry/csv_merger.py - Issue 231433003: Repeat Chromium Tryserver page_set runs to reduce variance.

Side by Side Diff: cluster_telemetry/csv_merger.py

Issue 231433003: Repeat Chromium Tryserver page_set runs to reduce variance. (Closed) Base URL: https://skia.googlesource.com/buildbot.git@master

Patch Set: Move to cluster_telemetry directory Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.	2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.

3 # Use of this source code is governed by a BSD-style license that can be	3 # Use of this source code is governed by a BSD-style license that can be

4 # found in the LICENSE file.	4 # found in the LICENSE file.

5	5

6 """Python utility to merge many CSV files into a single file."""	6 """Python utility to merge many CSV files into a single file.

	7

	8 If there are multiple CSV files with the same TELEMETRY_PAGE_NAME_KEY then the

	9 median of all values is stored in the resultant CSV file.

	10 """

	11

7	12

8 import csv	13 import csv

9 import glob	14 import glob

10 import optparse	15 import optparse

11 import os	16 import os

12 import sys	17 import sys

13	18

14	19

	20 TELEMETRY_PAGE_NAME_KEY = 'page_name'

	21

	22

15 class CsvMerger(object):	23 class CsvMerger(object):

16 """Class that merges many CSV files into a single file."""	24 """Class that merges many CSV files into a single file."""

17	25

18 def __init__(self, csv_dir, output_csv_name):	26 def __init__(self, csv_dir, output_csv_name):

19 """Constructs a CsvMerge instance."""	27 """Constructs a CsvMerge instance."""

20 self._input_csv_files = sorted([	28 self._input_csv_files = sorted([

21 os.path.join(csv_dir, f) for f in	29 os.path.join(csv_dir, f) for f in

22 glob.glob(os.path.join(csv_dir, '*.csv'))	30 glob.glob(os.path.join(csv_dir, '*.csv'))

23 if os.path.getsize(os.path.join(csv_dir, f))])	31 if os.path.getsize(os.path.join(csv_dir, f))])

24 self._output_csv_name = os.path.join(csv_dir, output_csv_name)	32 self._output_csv_name = os.path.join(csv_dir, output_csv_name)

25	33

26 def _GetFieldNames(self):	34 def _GetFieldNames(self):

27 field_names = set()	35 field_names = set()

28 for csv_file in self._input_csv_files:	36 for csv_file in self._input_csv_files:

29 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames)	37 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames)

30 return field_names	38 return field_names

31	39

	40 def _GetMedian(self, l):

	41 """Returns the median value from the specified list."""

	42 l.sort()

	43 length = len(l)

	44 if not length % 2:

	45 return (l[(length/2) - 1] + l[length/2]) / 2

	46 else:

	47 return l[length/2]

	48

	49 def _GetRowWithMedianValues(self, rows):

	50 """Parses the specified rows and returns a single row with median values."""

	51 fieldname_to_values = {}

	52 for row in rows:

	53 for fieldname in row:

	54 if fieldname == TELEMETRY_PAGE_NAME_KEY:

	55 fieldname_to_values[fieldname] = row[fieldname]

	56 continue

	57 try:

	58 value = float(row[fieldname])

	59 except ValueError:

	60 # We expected only floats, cannot compare strings. Skip this field.

	61 continue

	62 if fieldname in fieldname_to_values:

	63 fieldname_to_values[fieldname].append(value)

	64 else:

	65 fieldname_to_values[fieldname] = [value]

	66

	67 median_row = {}

	68 for fieldname, values in fieldname_to_values.items():

	69 if fieldname == TELEMETRY_PAGE_NAME_KEY:

	70 median_row[fieldname] = values

	71 continue

	72 median_row[fieldname] = self._GetMedian(values)

	73

	74 print

	75 print 'For rows: %s' % rows

	76 print 'Median row is %s' % median_row

	77 print

	78 return median_row

	79

32 def Merge(self):	80 def Merge(self):

33 """Method that does the CSV merging."""	81 """Method that does the CSV merging."""

34 field_names = self._GetFieldNames()	82 field_names = self._GetFieldNames()

35 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files),	83 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files),

36 len(field_names))	84 len(field_names))

37	85

	86 # List that will contain all rows read from the CSV files. It will also

	87 # combine all rows found with the same TELEMETRY_PAGE_NAME_KEY into one

	88 # with median values.

	89 csv_rows = []

	90

	91 # Dictionary containing all the encountered page names. If a page name that

	92 # is already in the dictionary is encountered then the median of its

	93 # values is used.

	94 page_names_to_rows = {}

	95

	96 for csv_file in self._input_csv_files:

	97 dict_reader = csv.DictReader(open(csv_file, 'r'))

	98 for row in dict_reader:

	99 if TELEMETRY_PAGE_NAME_KEY in row:

	100 # Add rows found with 'page_name' to a different dictionary for

	101 # processing.

	102 if row[TELEMETRY_PAGE_NAME_KEY] in page_names_to_rows:

	103 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]].append(row)

	104 else:

	105 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]] = [row]

	106 else:

	107 # Add rows found without TELEMETRY_PAGE_NAME_KEY to the final list of

	108 # rows, they require no further processing.

	109 csv_rows.append(row)

	110

	111 if page_names_to_rows:

	112 for page_name in page_names_to_rows:

	113 rows = page_names_to_rows[page_name]

	114 median_row = self._GetRowWithMedianValues(rows)

	115 # Add a single row that contains median values from all rows with the

	116 # same TELEMETRY_PAGE_NAME_KEY.

	117 csv_rows.append(median_row)

	118

	119 # Write all rows in csv_rows to the specified output CSV.

38 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names)	120 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names)

39 dict_writer.writeheader()	121 dict_writer.writeheader()

40

41 total_rows = 0	122 total_rows = 0

42	123 for row in csv_rows:

43 for csv_file in self._input_csv_files:	124 dict_writer.writerow(row)

44 print 'Merging %s' % csv_file	125 total_rows += 1

45

46 dict_reader = csv.DictReader(open(csv_file, 'r'))

47 for row in dict_reader:

48 dict_writer.writerow(row)

49 total_rows += 1

50	126

51 print 'Successfully merged %d rows' % total_rows	127 print 'Successfully merged %d rows' % total_rows

52	128

53	129

54 if '__main__' == __name__:	130 if '__main__' == __name__:

55 option_parser = optparse.OptionParser()	131 option_parser = optparse.OptionParser()

56 option_parser.add_option(	132 option_parser.add_option(

57 '', '--csv_dir',	133 '', '--csv_dir',

58 help='Directory that contains the csv files to be merged. This directory'	134 help='Directory that contains the csv files to be merged. This directory'

59 ' will also contain the merged CSV.')	135 ' will also contain the merged CSV.')

60 option_parser.add_option(	136 option_parser.add_option(

61 '', '--output_csv_name',	137 '', '--output_csv_name',

62 help='The name of the resultant merged CSV. It will be outputted to the '	138 help='The name of the resultant merged CSV. It will be outputted to the '

63 '--csv_dir')	139 '--csv_dir')

64 options, unused_args = option_parser.parse_args()	140 options, unused_args = option_parser.parse_args()

65 if not options.csv_dir or not options.output_csv_name:	141 if not options.csv_dir or not options.output_csv_name:

66 option_parser.error('Must specify both csv_dir and output_csv_name')	142 option_parser.error('Must specify both csv_dir and output_csv_name')

67	143

68 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge())	144 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge())

OLD	NEW

« no previous file with comments | « no previous file | cluster_telemetry/telemetry_slave_scripts/vm_run_telemetry.sh » ('j') | no next file with comments »