Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/python | |
| 2 # Copyright (c) 2010 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 | |
| 7 """Takes in a '|'-separated profiles file and outputs a PyAuto dictionary list | |
|
Nirnimesh
2011/02/07 22:20:02
what's a pyauto dictionary list format?
dyu1
2011/02/07 23:06:59
Changed the wording. Input is a csv file with a bu
| |
| 8 for converting Autofill Profile datasets. | |
| 9 | |
| 10 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. | |
| 11 """ | |
|
Nirnimesh
2011/02/07 22:20:02
Do you really need dataset.txt?
Why not just have
dyu1
2011/02/07 23:06:59
Then I have to manually create this list. I was gi
| |
| 12 | |
| 13 # Input and output filenames. Each may be a full path, | |
| 14 # e.g. 'c:\folder\file' or '/home/folder/file'. | |
| 15 INPUT_FILE = r"../data/autofill/dataset.txt" | |
| 16 OUTPUT_FILE = r"../data/autofill/dataset_duplicate-profiles.txt" | |
| 17 | |
| 18 # Controls output display on the screen. | |
| 19 DISPLAY_INPUT_LINES = True | |
| 20 DISPLAY_CONVERTED_LINES = False | |
| 21 | |
| 22 FIELDS = [ | |
| 23 u'NAME_FIRST', | |
| 24 u'NAME_MIDDLE', | |
| 25 u'NAME_LAST', | |
| 26 u'EMAIL_ADDRESS', | |
| 27 u'COMPANY_NAME', | |
| 28 u'ADDRESS_HOME_LINE1', | |
| 29 u'ADDRESS_HOME_LINE2', | |
| 30 u'ADDRESS_HOME_CITY', | |
| 31 u'ADDRESS_HOME_STATE', | |
| 32 u'ADDRESS_HOME_ZIP', | |
| 33 u'ADDRESS_HOME_COUNTRY', | |
| 34 u'PHONE_HOME_WHOLE_NUMBER', | |
| 35 u'PHONE_FAX_WHOLE_NUMBER', | |
| 36 ] | |
| 37 | |
| 38 import codecs | |
| 39 import os | |
| 40 import re | |
| 41 import sys | |
| 42 | |
class Converter(object):
  r"""Converts '|'-separated profile lines into dictionary-literal lines.

  Every non-blank input line that matches 'pattern' is written to the output
  file as one dictionary literal; the literals are wrapped in '[' ... ']'.
  """

  def __init__(self, fields, filein, fileout):
    r"""Builds the input regular expression and the output format string.

    Args:
      fields: Non-empty list of field names, in the order their values appear
          on an input line. Each name is used both as a regex group name and
          as an output dictionary key.
      filein: Path of the input file (decoded as UTF-8).
      fileout: Path of the output file (written as UTF-8 with a BOM).

    Raises:
      ValueError: If fields is empty.

    'pattern' holds one named, non-greedy group '(?P<name>.*?)' per field.
    The groups are joined by an escaped '|' and the expression ends with '$',
    so for the two fields NAME_FIRST and NAME_MIDDLE (e.g. 'Jared|JV') it is:
      (?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)$
    '.*?' is non-greedy, so each group stops at the first following '|'; the
    last group has no trailing '|' and runs up to the end of the line. After
    a match, each value is retrieved through its symbolic group name.
    'pattern' is used by __getRec() to turn a line into a dictionary.

    'out_line_pattern' has one '%s' slot per field and is used by convert()
    to build the line written to the output file. For the same two fields:
      {u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',},
    """
    if not fields:
      raise ValueError("fields must contain at least one field name")
    self.fields = fields[:]

    # Raw string: the pipe must reach the regex engine escaped, and '\|' is
    # not a valid escape sequence in an ordinary string literal.
    self.pattern = r'\|'.join(
        '(?P<%s>.*?)' % key for key in self.fields) + '$'

    entries = u"".join(u"u'%s': u'%s', " % (key, "%s") for key in self.fields)
    # Drop the space after the last entry; its comma is kept on purpose.
    self.out_line_pattern = u"{" + entries[:-1] + u"},\n"

    self.filein = filein
    self.fileout = fileout

  @staticmethod
  def _console_text(text):
    """Returns text reduced to what the console encoding can represent."""
    # sys.stdout.encoding can be None (e.g. redirected output on Python 2).
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    encoded = text.encode(encoding, 'ignore')
    if str is bytes:
      # Python 2: byte strings print as-is.
      return encoded
    return encoded.decode(encoding, 'ignore')

  def __getRec(self, line):
    """Constructs a dictionary from a line using 'pattern'.

    Backslashes and single quotes are escaped first so that every value can
    be placed inside a u'...' literal without changing or breaking it.

    Args:
      line: One stripped, decoded input line.

    Returns:
      A dictionary mapping each field name to its escaped value, or None if
      the line does not match 'pattern'.
    """
    # Backslashes go first; otherwise the ones added for quotes get doubled.
    line = line.replace("\\", "\\\\").replace("'", "\\'")
    match = re.match(self.pattern, line, re.UNICODE)
    if match is None:
      return None
    return dict((key, match.group(key)) for key in self.fields)

  def convert(self, display_input_lines, display_converted_lines):
    """Converts the input file and writes the dictionary list to the output.

    Each converted line is formatted with one value per field:
      out_line = self.out_line_pattern % tuple(
          [outrec[key] for key in self.fields])
    which, for two fields, is equivalent to:
      out_line = "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',}," % (
          outrec['NAME_FIRST'], outrec['NAME_MIDDLE'])

    Blank lines are ignored. Lines that do not match 'pattern' are skipped
    and reported on stderr.

    Args:
      display_input_lines: If true, print every converted input line.
      display_converted_lines: If true, print every converted output line.
          If both flags are false, a progress message is printed after every
          tenth converted line.
    """
    converted = 0
    with open(self.filein, "rb") as fin:
      with codecs.open(self.fileout, mode="wb", encoding="utf-8-sig") as fout:
        fout.write(u"[")
        fout.write(os.linesep)
        for line_number, raw_line in enumerate(fin, 1):
          raw_line = raw_line.strip()
          if not raw_line:
            continue
          # 'utf-8-sig' keeps a leading byte order mark out of the first field.
          line = raw_line.decode("utf-8-sig")
          outrec = self.__getRec(line)
          if outrec is None:
            sys.stderr.write(
                "Skipping line %d: it does not match the %d-field pattern.\n"
                % (line_number, len(self.fields)))
            continue

          converted += 1
          out_line = self.out_line_pattern % tuple(
              [outrec[key] for key in self.fields])
          fout.write(out_line)
          fout.write(os.linesep)

          if display_input_lines:
            print("\n%d: %s" % (converted, self._console_text(line)))
          if display_converted_lines:
            print("\tconverted to: %s" % self._console_text(out_line))
          elif not display_input_lines and converted % 10 == 0:
            print("\t%d lines converted so far!" % converted)

        fout.write(u"]")
        fout.write(os.linesep)

    print("")
    print("%d lines converted SUCCESSFULLY!" % converted)
    print("--- FINISHED ---")
    print("")
| 148 | |
| 149 | |
def main():
  """Runs the conversion with the module-level file names and display flags."""
  converter = Converter(FIELDS, INPUT_FILE, OUTPUT_FILE)
  converter.convert(display_input_lines=DISPLAY_INPUT_LINES,
                    display_converted_lines=DISPLAY_CONVERTED_LINES)


if __name__ == '__main__':
  main()
| OLD | NEW |