Index: chrome/test/functional/dataset-converter.py |
=================================================================== |
--- chrome/test/functional/dataset-converter.py (revision 0) |
+++ chrome/test/functional/dataset-converter.py (revision 0) |
@@ -0,0 +1,155 @@ |
+#!/usr/bin/python |
+# Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+ |
+"""Takes in a CSV profiles file and outputs to a pyAuto dictionary list format |
Nirnimesh
2011/02/07 22:20:02
what's a pyauto dictionary list format?
dyu1
2011/02/07 23:06:59
Changed the wording. Input is a csv file with a bu
|
+for converting Autofill Profile datasets. |
+ |
+Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
+""" |
Nirnimesh
2011/02/07 22:20:02
Do you really need dataset.txt?
Why not just have
dyu1
2011/02/07 23:06:59
Then I have to manually create this list. I was gi
|
+ |
+# Input and output filenames. Each may be a relative path (as below) or a |
+# full path such as 'c:\folder\file' or '/home/folder/file'. |
+INPUT_FILE = r"../data/autofill/dataset.txt" |
+OUTPUT_FILE = r"../data/autofill/dataset_duplicate-profiles.txt" |
+ |
+# Control what is printed to the screen during conversion: |
+# DISPLAY_INPUT_LINES - print each input line with its running count. |
+# DISPLAY_CONVERTED_LINES - print each converted output line. |
+DISPLAY_INPUT_LINES = True |
+DISPLAY_CONVERTED_LINES = False |
+ |
+# Field names, in the order the values appear in each '|'-separated input |
+# line. The same names are used as the regexp group names and as the keys |
+# of every dictionary written to the output file. |
+FIELDS = [ |
+ u'NAME_FIRST', |
+ u'NAME_MIDDLE', |
+ u'NAME_LAST', |
+ u'EMAIL_ADDRESS', |
+ u'COMPANY_NAME', |
+ u'ADDRESS_HOME_LINE1', |
+ u'ADDRESS_HOME_LINE2', |
+ u'ADDRESS_HOME_CITY', |
+ u'ADDRESS_HOME_STATE', |
+ u'ADDRESS_HOME_ZIP', |
+ u'ADDRESS_HOME_COUNTRY', |
+ u'PHONE_HOME_WHOLE_NUMBER', |
+ u'PHONE_FAX_WHOLE_NUMBER', |
+] |
+ |
+import codecs |
+import os |
+import re |
+import sys |
+ |
+class Converter(object): |
+ """Converts '|'-separated profile lines into dictionary-literal lines.""" |
+ def __init__(self, fields, filein, fileout): |
+ """ |
+ Builds the input-matching regexp and the output format string. |
+ |
+ Args: |
+ fields: ordered list of field names, one per '|'-separated input value. |
+ filein: path of the input file to read. |
+ fileout: path of the output file to write. |
+ |
+ The pattern is a regular expression which has named parenthesis groups |
+ like this (?P<name>...) in order to match the '|' separated fields. |
+ If we had only the NAME_FIRST and NAME_MIDDLE fields (e.g 'Jared|JV') our |
+ pattern would be: "(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)$" |
+ |
+ This means that '(?P<NAME_FIRST> regexp)\|' matches whatever regular |
+ expression is inside the parentheses, and indicates the start and end of a |
+ group; the contents of a group can be retrieved after a match has been |
+ performed using the symbolic group name 'NAME_FIRST'. |
+ |
+ The regexp is '.*?'. The '.*' part means to match 0 or more repetitions of |
+ any character. The following '?' makes the regexp non-greedy, meaning it |
+ will stop at the first occurrence of the '|' character (escaped in the |
+ pattern). |
+ |
+ For '(?P<NAME_MIDDLE>.*?)$' there is no '|' at the end, so we have '$' to |
+ indicate the end of the line. |
+ |
+ The full pattern is constructed once, here, from the 'fields' list. |
+ |
+ The out_line_pattern for one field: "{u'NAME_FIRST': u'%s'," |
+ is ready to accept the value for the 'NAME_FIRST' field once it is extracted |
+ from an input line using the above group pattern. |
+ |
+ 'pattern' is used in __getRec(line) to construct and return a dictionary |
+ from a line. |
+ |
+ 'out_line_pattern' is used in 'convert()' to construct the final dataset |
+ line that will be printed to the output file. |
+ """ |
+ # Keep a copy so later changes to the caller's list do not affect us. |
+ self.fields = fields[:] |
+ self.pattern = '(?P<%s>.*?)' %fields[0] |
+ for key in fields[1:]: |
+ self.pattern += '\|(?P<%s>.*?)' %key |
+ self.pattern = self.pattern + "$" |
+ |
+ self.out_line_pattern = u"{" |
+ for key in fields: |
+ # The inner "%s" re-inserts a literal %s placeholder for the value. |
+ self.out_line_pattern += u"u'%s': u'%s', " %(key, "%s") |
+ # [:-1] drops only the trailing space, so the last ',' stays before '}'. |
+ self.out_line_pattern = self.out_line_pattern[:-1] + "},\n" |
+ |
+ self.filein = filein |
+ self.fileout = fileout |
+ |
+ def __getRec(self, line): |
+ """ |
+ Constructs and returns a dictionary mapping each field name to its value |
+ in 'line', using self.pattern (see the constructor above). |
+ |
+ Single quotes in the line are backslash-escaped first, because the values |
+ are later written inside u'...' literals by out_line_pattern. |
+ |
+ NOTE(review): presumably returns None (by falling off the end) when the |
+ line does not match; the nesting of 'return' cannot be confirmed from this |
+ flattened diff. |
+ """ |
+ # NOTE(review): only "'" is escaped; a backslash in the input is passed |
+ # through unchanged. re.compile() is also called for both patterns on |
+ # every line. |
+ rePat = re.compile("'", re.UNICODE) |
+ line = rePat.sub(r"\'", line) |
+ rePat = re.compile(self.pattern, re.UNICODE) |
+ m = rePat.match(line) |
+ if m: |
+ outrec = {} |
+ for key in self.fields: |
+ outrec[key] = m.group(key) |
+ return outrec |
+ |
+ def convert(self, display_input_lines, display_converted_lines): |
+ """ |
+ Reads self.filein, converts every non-empty line that matches the pattern |
+ and writes the result to self.fileout between a '[' and a ']' line. |
+ |
+ Args: |
+ display_input_lines: if true, print each input line with its count. |
+ display_converted_lines: if true, print each converted line. |
+ |
+ The out_line_pattern is applied here. Each field needs to be formatted with |
+ a tuple of values, one for each '%s' it contains. |
+ This is done in the line: |
+ out_line = self.out_line_pattern %tuple( |
+ [outrec[key] for key in self.fields]) |
+ For two fields, this translates to: |
+ out_line = "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',}," % ( |
+ outrec['NAME_FIRST'], outrec['NAME_MIDDLE']) |
+ """ |
+ with open(self.filein) as fin: |
+ with codecs.open(self.fileout, mode = "wb", |
+ encoding = "utf-8-sig") as fout: |
+ # i counts the lines converted so far. |
+ i = 0 |
+ fout.write("[") |
+ fout.write(os.linesep) |
+ for line in fin.readlines(): |
+ line = line.strip() |
+ if not line: |
+ continue |
+ # The input bytes are decoded as UTF-8 before matching. |
+ line = unicode(line, 'UTF-8') |
+ outrec = self.__getRec(line) |
+ if outrec: |
+ i += 1 |
+ out_line = self.out_line_pattern %tuple( |
+ [outrec[key] for key in self.fields]) |
+ # NOTE(review): out_line already ends with "\n", so the os.linesep |
+ # written after it adds an extra line terminator per record. |
+ fout.write(out_line) |
+ fout.write(os.linesep) |
+ # NOTE(review): sys.stdout.encoding may be None when output is not a |
+ # terminal, which would make encode() fail - confirm. |
+ if display_input_lines: |
+ print "\n%d: %s" %(i, line.encode(sys.stdout.encoding, 'ignore')) |
+ if display_converted_lines: |
+ print "\tconverted to: %s" %out_line.encode( |
+ sys.stdout.encoding, 'ignore') |
+ else: |
+ # 'not i % 10' is true on every tenth converted line. |
+ if not display_input_lines and not i % 10: |
+ print "\t%d lines converted so far!" %i |
+ |
+ fout.write("]") |
+ fout.write(os.linesep) |
+ print "%d lines converted SUCCESSFULLY!" %i |
+ print "--- FINISHED ---" |
+ |
+ |
+def main(): |
+ """Converts INPUT_FILE into OUTPUT_FILE using the module-level settings.""" |
+ c = Converter(FIELDS, INPUT_FILE, OUTPUT_FILE) |
+ c.convert(DISPLAY_INPUT_LINES, DISPLAY_CONVERTED_LINES) |
+ |
+if __name__ == '__main__': |
+ main() |