Chromium Code Reviews| Index: chrome/test/functional/dataset-converter.py |
| =================================================================== |
| --- chrome/test/functional/dataset-converter.py (revision 0) |
| +++ chrome/test/functional/dataset-converter.py (revision 0) |
| @@ -0,0 +1,155 @@ |
| +#!/usr/bin/python |
| +# Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| + |
| +"""Takes in a CSV profiles file and outputs to a pyAuto dictionary list format |
|
Nirnimesh
2011/02/07 22:20:02
what's a pyauto dictionary list format?
dyu1
2011/02/07 23:06:59
Changed the wording. Input is a csv file with a bu
|
| +for converting Autofill Profile datasets. |
| + |
| +Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
| +""" |
|
Nirnimesh
2011/02/07 22:20:02
Do you really need dataset.txt?
Why not just have
dyu1
2011/02/07 23:06:59
Then I have to manually create this list. I was gi
|
| + |
| +# Input and output filenames. Each can be a full path, e.g. |
| +# 'c:\folder\file' or '/home/folder/file'. |
| +INPUT_FILE = r"../data/autofill/dataset.txt" |
| +OUTPUT_FILE = r"../data/autofill/dataset_duplicate-profiles.txt" |
| + |
| +# Controls output display on the screen. |
| +DISPLAY_INPUT_LINES = True |
| +DISPLAY_CONVERTED_LINES = False |
| + |
| +FIELDS = [ |
| + u'NAME_FIRST', |
| + u'NAME_MIDDLE', |
| + u'NAME_LAST', |
| + u'EMAIL_ADDRESS', |
| + u'COMPANY_NAME', |
| + u'ADDRESS_HOME_LINE1', |
| + u'ADDRESS_HOME_LINE2', |
| + u'ADDRESS_HOME_CITY', |
| + u'ADDRESS_HOME_STATE', |
| + u'ADDRESS_HOME_ZIP', |
| + u'ADDRESS_HOME_COUNTRY', |
| + u'PHONE_HOME_WHOLE_NUMBER', |
| + u'PHONE_FAX_WHOLE_NUMBER', |
| +] |
| + |
| +import codecs |
| +import os |
| +import re |
| +import sys |
| + |
| +class Converter(object): |
| + def __init__(self, fields, filein, fileout): |
| + """ |
| + The pattern is a regular expression which has named parenthesis groups |
| + like this (?P<name>...) in order to match the '|' separated fields. |
| + If we had only the NAME_FIRST and NAME_MIDDLE fields (e.g 'Jared|JV') our |
| + pattern would be: "(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)$" |
| + |
| + This means that '(?P<NAME_FIRST> regexp)\|' matches whatever regular |
| + expression is inside the parentheses, and indicates the start and end of a |
| + group; the contents of a group can be retrieved after a match has been |
| + performed using the symbolic group name 'NAME_FIRST'. |
| + |
| + The regexp is '.*?'. The '.*' part matches 0 or more repetitions of any |
| + character. The trailing '?' makes the match non-greedy, meaning it will |
| + stop at the first occurrence of the '|' character (escaped in the pattern). |
| + |
| + For '(?P<NAME_MIDDLE>.*?)$' there is no '|' at the end, so we have '$' to |
| + indicate the end of the line. |
| + |
| + The full pattern is constructed once from the FIELDS list. |
| + |
| + The out_line_pattern for one field: "{u'NAME_FIRST': u'%s'," |
| + is ready to accept the value for the 'NAME_FIRST' field once it is extracted |
| + from an input line using the above group pattern. |
| + |
| + 'pattern' is used in __getRec(line) to construct and return a dictionary |
| + from a line. |
| + |
| + 'out_line_pattern' is used in 'convert()' to construct the final dataset |
| + line that will be printed to the output file. |
| + """ |
| + self.fields = fields[:] |
| + self.pattern = '(?P<%s>.*?)' %fields[0] |
| + for key in fields[1:]: |
| + self.pattern += '\|(?P<%s>.*?)' %key |
| + self.pattern = self.pattern + "$" |
| + |
| + self.out_line_pattern = u"{" |
| + for key in fields: |
| + self.out_line_pattern += u"u'%s': u'%s', " %(key, "%s") |
| + self.out_line_pattern = self.out_line_pattern[:-1] + "},\n" |
| + |
| + self.filein = filein |
| + self.fileout = fileout |
| + |
| + def __getRec(self, line): |
| + """ |
| + Constructs and returns a dictionary from a line using patterns. |
| + See constructor above. |
| + """ |
| + rePat = re.compile("'", re.UNICODE) |
| + line = rePat.sub(r"\'", line) |
| + rePat = re.compile(self.pattern, re.UNICODE) |
| + m = rePat.match(line) |
| + if m: |
| + outrec = {} |
| + for key in self.fields: |
| + outrec[key] = m.group(key) |
| + return outrec |
| + |
| + def convert(self, display_input_lines, display_converted_lines): |
| + """ |
| + The out_line_pattern is filled in here. It contains one '%s' per field, so |
| + it must be formatted with a tuple holding one value for each field. |
| + This is done in the line: |
| + out_line = self.out_line_pattern %tuple( |
| + [outrec[key] for key in self.fields]) |
| + For two fields, this translates to: |
| + out_line = "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',}," % ( |
| + outrec['NAME_FIRST'], outrec['NAME_MIDDLE']) |
| + """ |
| + with open(self.filein) as fin: |
| + with codecs.open(self.fileout, mode = "wb", |
| + encoding = "utf-8-sig") as fout: |
| + i = 0 |
| + fout.write("[") |
| + fout.write(os.linesep) |
| + for line in fin.readlines(): |
| + line = line.strip() |
| + if not line: |
| + continue |
| + line = unicode(line, 'UTF-8') |
| + outrec = self.__getRec(line) |
| + if outrec: |
| + i += 1 |
| + out_line = self.out_line_pattern %tuple( |
| + [outrec[key] for key in self.fields]) |
| + fout.write(out_line) |
| + fout.write(os.linesep) |
| + if display_input_lines: |
| + print "\n%d: %s" %(i, line.encode(sys.stdout.encoding, 'ignore')) |
| + if display_converted_lines: |
| + print "\tconverted to: %s" %out_line.encode( |
| + sys.stdout.encoding, 'ignore') |
| + else: |
| + if not display_input_lines and not i % 10: |
| + print "\t%d lines converted so far!" %i |
| + |
| + fout.write("]") |
| + fout.write(os.linesep) |
| + print "%d lines converted SUCCESSFULLY!" %i |
| + print "--- FINISHED ---" |
| + |
| + |
| +def main(): |
| + c = Converter(FIELDS, INPUT_FILE, OUTPUT_FILE) |
| + c.convert(DISPLAY_INPUT_LINES, DISPLAY_CONVERTED_LINES) |
| + |
| +if __name__ == '__main__': |
| + main() |