cluster_telemetry/csv_merger.py - Issue 231433003: Repeat Chromium Tryserver page_set runs to reduce variance.

Unified Diff: cluster_telemetry/csv_merger.py

Issue 231433003: Repeat Chromium Tryserver page_set runs to reduce variance. (Closed) Base URL: https://skia.googlesource.com/buildbot.git@master

Patch Set: Move to cluster_telemetry directory Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: cluster_telemetry/csv_merger.py

diff --git a/cluster_telemetry/csv_merger.py b/cluster_telemetry/csv_merger.py

index ee84bcfaa5489dce2bcddbd4910e15bfb920b91c..8afe29314a59c9962eb40679ab274e0629f0a210 100644

--- a/cluster_telemetry/csv_merger.py

+++ b/cluster_telemetry/csv_merger.py

@@ -3,7 +3,12 @@

# Use of this source code is governed by a BSD-style license that can be

# found in the LICENSE file.

-"""Python utility to merge many CSV files into a single file."""

+"""Python utility to merge many CSV files into a single file.

+If several rows (possibly from different CSV files) share the same value in

+the TELEMETRY_PAGE_NAME_KEY column, they are merged into one row holding the

+median of each numeric field.

+"""

import csv

import glob

@@ -12,6 +17,9 @@ import os

import sys

+TELEMETRY_PAGE_NAME_KEY = 'page_name'

class CsvMerger(object):

"""Class that merges many CSV files into a single file."""

@@ -29,24 +37,92 @@ class CsvMerger(object):

field_names.update(csv.DictReader(open(csv_file, 'r')).fieldnames)

return field_names

+ def _GetMedian(self, l):

+ """Returns the median value from the specified list."""

+ l.sort()

+ length = len(l)

+ if not length % 2:

+ return (l[(length/2) - 1] + l[length/2]) / 2

+ else:

+ return l[length/2]

def _GetRowWithMedianValues(self, rows):
  """Parses the specified rows and returns a single row with median values.

  Args:
    rows: List of dictionaries mapping field names to raw cell values, as
        produced by csv.DictReader.

  Returns:
    A dictionary with one entry per field that had at least one numeric
    value, holding the median of those values. The TELEMETRY_PAGE_NAME_KEY
    field, when present, is copied through unchanged (the value from the
    last row that has it wins). Fields with no numeric values are omitted.
  """
  median_row = {}
  fieldname_to_values = {}
  for row in rows:
    for fieldname, raw_value in row.items():
      if fieldname == TELEMETRY_PAGE_NAME_KEY:
        # The page name identifies the row; it is carried over as is.
        median_row[fieldname] = raw_value
        continue
      try:
        value = float(raw_value)
      except (TypeError, ValueError):
        # We expected only floats, cannot compare strings. Skip this field.
        # TypeError covers the None / list placeholders csv.DictReader
        # produces for rows that are shorter or longer than the header.
        continue
      fieldname_to_values.setdefault(fieldname, []).append(value)

  for fieldname, values in fieldname_to_values.items():
    median_row[fieldname] = self._GetMedian(values)

  # Parenthesised single-argument prints behave the same on Python 2 and 3.
  print('')
  print('For rows: %s' % rows)
  print('Median row is %s' % median_row)
  print('')
  return median_row

def Merge(self):
  """Method that does the CSV merging.

  Reads every row from the input CSV files. Rows that have a
  TELEMETRY_PAGE_NAME_KEY field are grouped by its value and each group is
  collapsed into a single row of median values; all other rows are passed
  through unchanged. The result is written to the output CSV, pass-through
  rows first, followed by one median row per page name in the order the
  page names were first encountered.
  """
  field_names = self._GetFieldNames()
  print('Merging %d csv files into %d columns' % (len(self._input_csv_files),
                                                  len(field_names)))

  # Rows that will be written to the output CSV.
  csv_rows = []
  # Maps each encountered page name to all the rows that reported it.
  page_names_to_rows = {}
  # Page names in first-seen order, so the output order is deterministic.
  page_names_in_order = []

  for csv_file in self._input_csv_files:
    # The with-statement closes each input file as soon as it is consumed.
    with open(csv_file, 'r') as input_file:
      for row in csv.DictReader(input_file):
        if TELEMETRY_PAGE_NAME_KEY not in row:
          # Rows without a page name require no further processing.
          csv_rows.append(row)
          continue
        page_name = row[TELEMETRY_PAGE_NAME_KEY]
        if page_name not in page_names_to_rows:
          page_names_to_rows[page_name] = []
          page_names_in_order.append(page_name)
        page_names_to_rows[page_name].append(row)

  # Add a single row that contains median values from all rows with the
  # same TELEMETRY_PAGE_NAME_KEY.
  for page_name in page_names_in_order:
    csv_rows.append(
        self._GetRowWithMedianValues(page_names_to_rows[page_name]))

  # Write all rows in csv_rows to the specified output CSV; closing the file
  # guarantees the data is flushed before the success message is printed.
  with open(self._output_csv_name, 'w') as output_file:
    dict_writer = csv.DictWriter(output_file, field_names)
    dict_writer.writeheader()
    for row in csv_rows:
      dict_writer.writerow(row)

  print('Successfully merged %d rows' % len(csv_rows))

« no previous file with comments | « no previous file | cluster_telemetry/telemetry_slave_scripts/vm_run_telemetry.sh » ('j') | no next file with comments »