| Index: cluster_telemetry/csv_merger.py
|
| diff --git a/cluster_telemetry/csv_merger.py b/cluster_telemetry/csv_merger.py
|
| index ee84bcfaa5489dce2bcddbd4910e15bfb920b91c..8afe29314a59c9962eb40679ab274e0629f0a210 100644
|
| --- a/cluster_telemetry/csv_merger.py
|
| +++ b/cluster_telemetry/csv_merger.py
|
| @@ -3,7 +3,12 @@
|
| # Use of this source code is governed by a BSD-style license that can be
|
| # found in the LICENSE file.
|
|
|
| -"""Python utility to merge many CSV files into a single file."""
|
| +"""Python utility to merge many CSV files into a single file.
|
| +
|
| +If multiple rows share the same TELEMETRY_PAGE_NAME_KEY value then the median
|
| +of each numeric field across those rows is stored in the resultant CSV file.
|
| +"""
|
| +
|
|
|
| import csv
|
| import glob
|
| @@ -12,6 +17,9 @@ import os
|
| import sys
|
|
|
|
|
| +TELEMETRY_PAGE_NAME_KEY = 'page_name'
|
| +
|
| +
|
| class CsvMerger(object):
|
| """Class that merges many CSV files into a single file."""
|
|
|
| @@ -29,24 +37,92 @@ class CsvMerger(object):
|
| field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames)
|
| return field_names
|
|
|
| + def _GetMedian(self, l):
|
| + """Returns the median value from the specified list."""
|
| + l.sort()
|
| + length = len(l)
|
| + if not length % 2:
|
| + return (l[(length/2) - 1] + l[length/2]) / 2
|
| + else:
|
| + return l[length/2]
|
| +
|
| + def _GetRowWithMedianValues(self, rows):
|
| + """Parses the specified rows and returns a single row with median values."""
|
| + fieldname_to_values = {}
|
| + for row in rows:
|
| + for fieldname in row:
|
| + if fieldname == TELEMETRY_PAGE_NAME_KEY:
|
| + fieldname_to_values[fieldname] = row[fieldname]
|
| + continue
|
| + try:
|
| + value = float(row[fieldname])
|
| + except ValueError:
|
| +          # Non-numeric values cannot be used in a median. Skip this value.
|
| + continue
|
| + if fieldname in fieldname_to_values:
|
| + fieldname_to_values[fieldname].append(value)
|
| + else:
|
| + fieldname_to_values[fieldname] = [value]
|
| +
|
| + median_row = {}
|
| + for fieldname, values in fieldname_to_values.items():
|
| + if fieldname == TELEMETRY_PAGE_NAME_KEY:
|
| + median_row[fieldname] = values
|
| + continue
|
| + median_row[fieldname] = self._GetMedian(values)
|
| +
|
| + print
|
| + print 'For rows: %s' % rows
|
| + print 'Median row is %s' % median_row
|
| + print
|
| + return median_row
|
| +
|
| def Merge(self):
|
| """Method that does the CSV merging."""
|
| field_names = self._GetFieldNames()
|
| print 'Merging %d csv files into %d columns' % (len(self._input_csv_files),
|
| len(field_names))
|
|
|
| - dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names)
|
| - dict_writer.writeheader()
|
| +    # List of all rows that will be written to the output CSV. Rows that share
|
| +    # the same TELEMETRY_PAGE_NAME_KEY value are combined into a single row
|
| +    # with median values before being added.
|
| + csv_rows = []
|
|
|
| - total_rows = 0
|
| +    # Dictionary mapping each encountered page name to the list of rows that
|
| +    # have it. Rows that share a page name are later merged into one row of
|
| +    # median values.
|
| + page_names_to_rows = {}
|
|
|
| for csv_file in self._input_csv_files:
|
| - print 'Merging %s' % csv_file
|
| -
|
| dict_reader = csv.DictReader(open(csv_file, 'r'))
|
| for row in dict_reader:
|
| - dict_writer.writerow(row)
|
| - total_rows += 1
|
| + if TELEMETRY_PAGE_NAME_KEY in row:
|
| +          # Group rows that have TELEMETRY_PAGE_NAME_KEY by their page name
|
| +          # for later median processing.
|
| + if row[TELEMETRY_PAGE_NAME_KEY] in page_names_to_rows:
|
| + page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]].append(row)
|
| + else:
|
| + page_names_to_rows[row[TELEMETRY_PAGE_NAME_KEY]] = [row]
|
| + else:
|
| + # Add rows found without TELEMETRY_PAGE_NAME_KEY to the final list of
|
| + # rows, they require no further processing.
|
| + csv_rows.append(row)
|
| +
|
| + if page_names_to_rows:
|
| + for page_name in page_names_to_rows:
|
| + rows = page_names_to_rows[page_name]
|
| + median_row = self._GetRowWithMedianValues(rows)
|
| + # Add a single row that contains median values from all rows with the
|
| + # same TELEMETRY_PAGE_NAME_KEY.
|
| + csv_rows.append(median_row)
|
| +
|
| + # Write all rows in csv_rows to the specified output CSV.
|
| + dict_writer = csv.DictWriter(open(self._output_csv_name, 'w'), field_names)
|
| + dict_writer.writeheader()
|
| + total_rows = 0
|
| + for row in csv_rows:
|
| + dict_writer.writerow(row)
|
| + total_rows += 1
|
|
|
| print 'Successfully merged %d rows' % total_rows
|
|
|
|
|