Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(71)

Side by Side Diff: cluster_telemetry/csv_merger.py

Issue 231433003: Repeat Chromium Tryserver page_set runs to reduce variance. (Closed) Base URL: https://skia.googlesource.com/buildbot.git@master
Patch Set: Move to cluster_telemetry directory Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | cluster_telemetry/telemetry_slave_scripts/vm_run_telemetry.sh » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Python utility to merge many CSV files into a single file.""" 6 """Python utility to merge many CSV files into a single file.
7
8 If there are multiple CSV files with the same TELEMETRY_PAGE_NAME_KEY then the
9 median of all values is stored in the resultant CSV file.
10 """
11
7 12
8 import csv 13 import csv
9 import glob 14 import glob
10 import optparse 15 import optparse
11 import os 16 import os
12 import sys 17 import sys
13 18
14 19
20 TELEMETRY_PAGE_NAME_KEY = 'page_name'
21
22
15 class CsvMerger(object): 23 class CsvMerger(object):
16 """Class that merges many CSV files into a single file.""" 24 """Class that merges many CSV files into a single file."""
17 25
18 def __init__(self, csv_dir, output_csv_name): 26 def __init__(self, csv_dir, output_csv_name):
19 """Constructs a CsvMerge instance.""" 27 """Constructs a CsvMerge instance."""
20 self._input_csv_files = sorted([ 28 self._input_csv_files = sorted([
21 os.path.join(csv_dir, f) for f in 29 os.path.join(csv_dir, f) for f in
22 glob.glob(os.path.join(csv_dir, '*.csv')) 30 glob.glob(os.path.join(csv_dir, '*.csv'))
23 if os.path.getsize(os.path.join(csv_dir, f))]) 31 if os.path.getsize(os.path.join(csv_dir, f))])
24 self._output_csv_name = os.path.join(csv_dir, output_csv_name) 32 self._output_csv_name = os.path.join(csv_dir, output_csv_name)
25 33
26 def _GetFieldNames(self): 34 def _GetFieldNames(self):
27 field_names = set() 35 field_names = set()
28 for csv_file in self._input_csv_files: 36 for csv_file in self._input_csv_files:
29 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames) 37 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames)
30 return field_names 38 return field_names
31 39
40 def _GetMedian(self, l):
41 """Returns the median value from the specified list."""
42 l.sort()
43 length = len(l)
44 if not length % 2:
45 return (l[(length/2) - 1] + l[length/2]) / 2
46 else:
47 return l[length/2]
48
49 def _GetRowWithMedianValues(self, rows):
50 """Parses the specified rows and returns a single row with median values."""
51 fieldname_to_values = {}
52 for row in rows:
53 for fieldname in row:
54 if fieldname == TELEMETRY_PAGE_NAME_KEY:
55 fieldname_to_values[fieldname] = row[fieldname]
56 continue
57 try:
58 value = float(row[fieldname])
59 except ValueError:
60 # We expected only floats, cannot compare strings. Skip this field.
61 continue
62 if fieldname in fieldname_to_values:
63 fieldname_to_values[fieldname].append(value)
64 else:
65 fieldname_to_values[fieldname] = [value]
66
67 median_row = {}
68 for fieldname, values in fieldname_to_values.items():
69 if fieldname == TELEMETRY_PAGE_NAME_KEY:
70 median_row[fieldname] = values
71 continue
72 median_row[fieldname] = self._GetMedian(values)
73
74 print
75 print 'For rows: %s' % rows
76 print 'Median row is %s' % median_row
77 print
78 return median_row
79
32 def Merge(self): 80 def Merge(self):
33 """Method that does the CSV merging.""" 81 """Method that does the CSV merging."""
34 field_names = self._GetFieldNames() 82 field_names = self._GetFieldNames()
35 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files), 83 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files),
36 len(field_names)) 84 len(field_names))
37 85
86 # List that will contain all rows read from the CSV files. It will also
87 # combine all rows found with the same TELEMETRY_PAGE_NAME_KEY into one
88 # with median values.
89 csv_rows = []
90
91 # Dictionary containing all the encountered page names. If a page name that
92 # is already in the dictionary is encountered then the median of its
93 # values is used.
94 page_names_to_rows = {}
95
96 for csv_file in self._input_csv_files:
97 dict_reader = csv.DictReader(open(csv_file, 'r'))
98 for row in dict_reader:
99 if TELEMETRY_PAGE_NAME_KEY in row:
100 # Add rows found with 'page_name' to a different dictionary for
101 # processing.
102 if row[TELEMETRY_PAGE_NAME_KEY] in page_names_to_rows:
103 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]].append(row)
104 else:
105 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]] = [row]
106 else:
107 # Add rows found without TELEMETRY_PAGE_NAME_KEY to the final list of
108 # rows, they require no further processing.
109 csv_rows.append(row)
110
111 if page_names_to_rows:
112 for page_name in page_names_to_rows:
113 rows = page_names_to_rows[page_name]
114 median_row = self._GetRowWithMedianValues(rows)
115 # Add a single row that contains median values from all rows with the
116 # same TELEMETRY_PAGE_NAME_KEY.
117 csv_rows.append(median_row)
118
119 # Write all rows in csv_rows to the specified output CSV.
38 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names) 120 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names)
39 dict_writer.writeheader() 121 dict_writer.writeheader()
40
41 total_rows = 0 122 total_rows = 0
42 123 for row in csv_rows:
43 for csv_file in self._input_csv_files: 124 dict_writer.writerow(row)
44 print 'Merging %s' % csv_file 125 total_rows += 1
45
46 dict_reader = csv.DictReader(open(csv_file, 'r'))
47 for row in dict_reader:
48 dict_writer.writerow(row)
49 total_rows += 1
50 126
51 print 'Successfully merged %d rows' % total_rows 127 print 'Successfully merged %d rows' % total_rows
52 128
53 129
54 if '__main__' == __name__: 130 if '__main__' == __name__:
55 option_parser = optparse.OptionParser() 131 option_parser = optparse.OptionParser()
56 option_parser.add_option( 132 option_parser.add_option(
57 '', '--csv_dir', 133 '', '--csv_dir',
58 help='Directory that contains the csv files to be merged. This directory' 134 help='Directory that contains the csv files to be merged. This directory'
59 ' will also contain the merged CSV.') 135 ' will also contain the merged CSV.')
60 option_parser.add_option( 136 option_parser.add_option(
61 '', '--output_csv_name', 137 '', '--output_csv_name',
62 help='The name of the resultant merged CSV. It will be outputted to the ' 138 help='The name of the resultant merged CSV. It will be outputted to the '
63 '--csv_dir') 139 '--csv_dir')
64 options, unused_args = option_parser.parse_args() 140 options, unused_args = option_parser.parse_args()
65 if not options.csv_dir or not options.output_csv_name: 141 if not options.csv_dir or not options.output_csv_name:
66 option_parser.error('Must specify both csv_dir and output_csv_name') 142 option_parser.error('Must specify both csv_dir and output_csv_name')
67 143
68 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge()) 144 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge())
OLDNEW
« no previous file with comments | « no previous file | cluster_telemetry/telemetry_slave_scripts/vm_run_telemetry.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698