| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 """Python utility to merge many CSV files into a single file.""" | 6 """Python utility to merge many CSV files into a single file. |
| 7 |
| 8 If there are multiple CSV files with the same TELEMETRY_PAGE_NAME_KEY then the |
| 9 median of all values is stored in the resultant CSV file. |
| 10 """ |
| 11 |
| 7 | 12 |
| 8 import csv | 13 import csv |
| 9 import glob | 14 import glob |
| 10 import optparse | 15 import optparse |
| 11 import os | 16 import os |
| 12 import sys | 17 import sys |
| 13 | 18 |
| 14 | 19 |
| 20 TELEMETRY_PAGE_NAME_KEY = 'page_name' |
| 21 |
| 22 |
| 15 class CsvMerger(object): | 23 class CsvMerger(object): |
| 16 """Class that merges many CSV files into a single file.""" | 24 """Class that merges many CSV files into a single file.""" |
| 17 | 25 |
| 18 def __init__(self, csv_dir, output_csv_name): | 26 def __init__(self, csv_dir, output_csv_name): |
| 19 """Constructs a CsvMerge instance.""" | 27 """Constructs a CsvMerge instance.""" |
| 20 self._input_csv_files = sorted([ | 28 self._input_csv_files = sorted([ |
| 21 os.path.join(csv_dir, f) for f in | 29 os.path.join(csv_dir, f) for f in |
| 22 glob.glob(os.path.join(csv_dir, '*.csv')) | 30 glob.glob(os.path.join(csv_dir, '*.csv')) |
| 23 if os.path.getsize(os.path.join(csv_dir, f))]) | 31 if os.path.getsize(os.path.join(csv_dir, f))]) |
| 24 self._output_csv_name = os.path.join(csv_dir, output_csv_name) | 32 self._output_csv_name = os.path.join(csv_dir, output_csv_name) |
| 25 | 33 |
| 26 def _GetFieldNames(self): | 34 def _GetFieldNames(self): |
| 27 field_names = set() | 35 field_names = set() |
| 28 for csv_file in self._input_csv_files: | 36 for csv_file in self._input_csv_files: |
| 29 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames) | 37 field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames) |
| 30 return field_names | 38 return field_names |
| 31 | 39 |
| 40 def _GetMedian(self, l): |
| 41 """Returns the median value from the specified list.""" |
| 42 l.sort() |
| 43 length = len(l) |
| 44 if not length % 2: |
| 45 return (l[(length/2) - 1] + l[length/2]) / 2 |
| 46 else: |
| 47 return l[length/2] |
| 48 |
| 49 def _GetRowWithMedianValues(self, rows): |
| 50 """Parses the specified rows and returns a single row with median values.""" |
| 51 fieldname_to_values = {} |
| 52 for row in rows: |
| 53 for fieldname in row: |
| 54 if fieldname == TELEMETRY_PAGE_NAME_KEY: |
| 55 fieldname_to_values[fieldname] = row[fieldname] |
| 56 continue |
| 57 try: |
| 58 value = float(row[fieldname]) |
| 59 except ValueError: |
| 60 # We expected only floats, cannot compare strings. Skip this field. |
| 61 continue |
| 62 if fieldname in fieldname_to_values: |
| 63 fieldname_to_values[fieldname].append(value) |
| 64 else: |
| 65 fieldname_to_values[fieldname] = [value] |
| 66 |
| 67 median_row = {} |
| 68 for fieldname, values in fieldname_to_values.items(): |
| 69 if fieldname == TELEMETRY_PAGE_NAME_KEY: |
| 70 median_row[fieldname] = values |
| 71 continue |
| 72 median_row[fieldname] = self._GetMedian(values) |
| 73 |
| 74 print |
| 75 print 'For rows: %s' % rows |
| 76 print 'Median row is %s' % median_row |
| 77 print |
| 78 return median_row |
| 79 |
| 32 def Merge(self): | 80 def Merge(self): |
| 33 """Method that does the CSV merging.""" | 81 """Method that does the CSV merging.""" |
| 34 field_names = self._GetFieldNames() | 82 field_names = self._GetFieldNames() |
| 35 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files), | 83 print 'Merging %d csv files into %d columns' % (len(self._input_csv_files), |
| 36 len(field_names)) | 84 len(field_names)) |
| 37 | 85 |
| 86 # List that will contain all rows read from the CSV files. It will also |
| 87 # combine all rows found with the same TELEMETRY_PAGE_NAME_KEY into one |
| 88 # with median values. |
| 89 csv_rows = [] |
| 90 |
| 91 # Dictionary containing all the encountered page names. If a page name that |
| 92 # is already in the dictionary is encountered then the median of its |
| 93 # values is used. |
| 94 page_names_to_rows = {} |
| 95 |
| 96 for csv_file in self._input_csv_files: |
| 97 dict_reader = csv.DictReader(open(csv_file, 'r')) |
| 98 for row in dict_reader: |
| 99 if TELEMETRY_PAGE_NAME_KEY in row: |
| 100 # Add rows found with 'page_name' to a different dictionary for |
| 101 # processing. |
| 102 if row[TELEMETRY_PAGE_NAME_KEY] in page_names_to_rows: |
| 103 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]].append(row) |
| 104 else: |
| 105 page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]] = [row] |
| 106 else: |
| 107 # Add rows found without TELEMETRY_PAGE_NAME_KEY to the final list of |
| 108 # rows, they require no further processing. |
| 109 csv_rows.append(row) |
| 110 |
| 111 if page_names_to_rows: |
| 112 for page_name in page_names_to_rows: |
| 113 rows = page_names_to_rows[page_name] |
| 114 median_row = self._GetRowWithMedianValues(rows) |
| 115 # Add a single row that contains median values from all rows with the |
| 116 # same TELEMETRY_PAGE_NAME_KEY. |
| 117 csv_rows.append(median_row) |
| 118 |
| 119 # Write all rows in csv_rows to the specified output CSV. |
| 38 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names) | 120 dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names) |
| 39 dict_writer.writeheader() | 121 dict_writer.writeheader() |
| 40 | |
| 41 total_rows = 0 | 122 total_rows = 0 |
| 42 | 123 for row in csv_rows: |
| 43 for csv_file in self._input_csv_files: | 124 dict_writer.writerow(row) |
| 44 print 'Merging %s' % csv_file | 125 total_rows += 1 |
| 45 | |
| 46 dict_reader = csv.DictReader(open(csv_file, 'r')) | |
| 47 for row in dict_reader: | |
| 48 dict_writer.writerow(row) | |
| 49 total_rows += 1 | |
| 50 | 126 |
| 51 print 'Successfully merged %d rows' % total_rows | 127 print 'Successfully merged %d rows' % total_rows |
| 52 | 128 |
| 53 | 129 |
| 54 if '__main__' == __name__: | 130 if '__main__' == __name__: |
| 55 option_parser = optparse.OptionParser() | 131 option_parser = optparse.OptionParser() |
| 56 option_parser.add_option( | 132 option_parser.add_option( |
| 57 '', '--csv_dir', | 133 '', '--csv_dir', |
| 58 help='Directory that contains the csv files to be merged. This directory' | 134 help='Directory that contains the csv files to be merged. This directory' |
| 59 ' will also contain the merged CSV.') | 135 ' will also contain the merged CSV.') |
| 60 option_parser.add_option( | 136 option_parser.add_option( |
| 61 '', '--output_csv_name', | 137 '', '--output_csv_name', |
| 62 help='The name of the resultant merged CSV. It will be outputted to the ' | 138 help='The name of the resultant merged CSV. It will be outputted to the ' |
| 63 '--csv_dir') | 139 '--csv_dir') |
| 64 options, unused_args = option_parser.parse_args() | 140 options, unused_args = option_parser.parse_args() |
| 65 if not options.csv_dir or not options.output_csv_name: | 141 if not options.csv_dir or not options.output_csv_name: |
| 66 option_parser.error('Must specify both csv_dir and output_csv_name') | 142 option_parser.error('Must specify both csv_dir and output_csv_name') |
| 67 | 143 |
| 68 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge()) | 144 sys.exit(CsvMerger(options.csv_dir, options.output_csv_name).Merge()) |
| OLD | NEW |