OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python |
| 2 # Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 |
| 7 """Takes in a CSV profiles file and outputs to a pyAuto dictionary list format |
| 8 for converting Autofill Profile datasets. |
| 9 |
| 10 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
| 11 """ |
| 12 |
# Input and output file names.  Each one may be a relative path (resolved
# against the current working directory) or a full path such as
# 'c:\folder\file' or '/home/folder/file'.
INPUT_FILE = r"../data/autofill/dataset.txt"
OUTPUT_FILE = r"../data/autofill/dataset_duplicate-profiles.txt"

# Console verbosity while converting: echo every input line and/or every
# converted output line.
DISPLAY_INPUT_LINES = True
DISPLAY_CONVERTED_LINES = False

# Profile field names, listed in the order in which their values appear in
# each '|'-separated input line.
FIELDS = [
    # Name and contact.
    u'NAME_FIRST', u'NAME_MIDDLE', u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    # Home address.
    u'ADDRESS_HOME_LINE1', u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY', u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP', u'ADDRESS_HOME_COUNTRY',
    # Phone numbers.
    u'PHONE_HOME_WHOLE_NUMBER', u'PHONE_FAX_WHOLE_NUMBER',
]
| 37 |
| 38 import codecs |
| 39 import re |
| 40 import sys |
| 41 |
class Converter(object):
    r"""Converts '|'-separated profile lines into pyAuto dictionary-list text.

    Every non-empty input line is expected to hold one value per entry of
    'fields', separated by '|' (e.g. 'Jared|JV' for the two fields
    NAME_FIRST and NAME_MIDDLE).  Each matching line is written to the
    output file as one dictionary literal:

        {u'NAME_FIRST': u'Jared', u'NAME_MIDDLE': u'JV',},

    and the whole output is wrapped in '[' ... ']'.  Lines that do not
    match the field pattern are skipped.
    """

    def __init__(self, fields, filein, fileout):
        r"""Builds the input regular expression and the output line template.

        'pattern' is a regular expression with one named group,
        (?P<name>...), per field, in order to match the '|'-separated
        values.  For only the NAME_FIRST and NAME_MIDDLE fields it is:

            (?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)$

        Each group body is '.*?': '.*' matches 0 or more repetitions of any
        character and the trailing '?' makes it non-greedy, so a group stops
        at the first following '|' (escaped in the pattern).  The last group
        has no '|' after it, so '$' anchors it to the end of the line.  The
        value of a group is retrieved after a match through its symbolic
        name, e.g. 'NAME_FIRST'.  __getRec() uses the pattern to turn a line
        into a dictionary.

        'out_line_pattern' is the %-format template of one output line; it
        holds one "u'FIELD': u'%s', " entry per field and is filled in by
        convert() with the values extracted from an input line.

        Args:
          fields: sequence of field names, in input column order.
          filein: path of the '|'-separated, UTF-8 encoded input file.
          fileout: path of the output file to create.
        """
        self.fields = fields[:]

        # Raw strings keep the regex escape '\|' out of Python's own string
        # escape processing.
        self.pattern = r'\|'.join(
            '(?P<%s>.*?)' % key for key in fields) + '$'
        # Compiled once here instead of once per input line.
        self._line_re = re.compile(self.pattern, re.UNICODE)

        entries = u''.join(u"u'%s': u'%%s', " % key for key in fields)
        # Drop the trailing space; the trailing comma stays, which is still
        # a valid dictionary literal.
        self.out_line_pattern = (u"{" + entries)[:-1] + u"},\n"

        self.filein = filein
        self.fileout = fileout

    @staticmethod
    def _console_text(text):
        """Returns unicode 'text' in a form that is safe to print.

        Characters the console encoding cannot represent are dropped.
        sys.stdout.encoding can be None (e.g. Python 2 with redirected
        output); 'ascii' is used in that case instead of failing.
        """
        encoding = getattr(sys.stdout, 'encoding', None) or 'ascii'
        encoded = text.encode(encoding, 'ignore')
        if isinstance(encoded, str):
            # Python 2: the encoded byte string is what print expects.
            return encoded
        # Python 3: print wants text, so go back from bytes.
        return encoded.decode(encoding, 'ignore')

    def __getRec(self, line):
        """Constructs a dictionary from one input line using 'pattern'.

        Backslashes and single quotes are escaped first, so that every value
        can be embedded verbatim inside a single-quoted u'...' literal of
        the output file.

        Args:
          line: unicode string holding one '|'-separated record.

        Returns:
          A dictionary mapping each field name to its (escaped) value, or
          None when the line does not match the pattern.
        """
        # Backslashes must be doubled before the quote escape adds its own.
        escaped = line.replace("\\", "\\\\").replace("'", "\\'")
        match = self._line_re.match(escaped)
        if match is None:
            return None
        return dict((key, match.group(key)) for key in self.fields)

    def convert(self, display_input_lines, display_converted_lines):
        """Converts the input file and writes the result to the output file.

        Each output line is 'out_line_pattern' formatted with the tuple of
        the record's values, taken in 'fields' order.  For two fields this
        translates to:

            out_line = "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',}," % (
                outrec['NAME_FIRST'], outrec['NAME_MIDDLE'])

        Blank lines and lines that do not match the field pattern are
        skipped.  The output is written as UTF-8 with a BOM ('utf-8-sig').

        Args:
          display_input_lines: if true, print every converted input line.
          display_converted_lines: if true, print every output line.  When
            both flags are false a progress message is printed every 10
            converted lines.
        """
        count = 0
        # Read bytes and decode per line; this behaves the same on
        # Python 2 and Python 3.
        with open(self.filein, 'rb') as fin:
            with codecs.open(self.fileout, mode="wb",
                             encoding="utf-8-sig") as fout:
                fout.write(u"[\n")
                for raw_line in fin:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        continue
                    line = raw_line.decode('UTF-8')
                    outrec = self.__getRec(line)
                    if outrec is None:
                        continue
                    count += 1
                    out_line = self.out_line_pattern % tuple(
                        outrec[key] for key in self.fields)
                    fout.write(out_line)
                    if display_input_lines:
                        print("\n%d: %s" % (count, self._console_text(line)))
                    if display_converted_lines:
                        print("\tconverted to: %s"
                              % self._console_text(out_line))
                    elif not display_input_lines and not count % 10:
                        print("\t%d lines converted so far!" % count)
                fout.write(u"]\n")

        # print("") rather than print(): the latter prints '()' on Python 2.
        print("")
        print("%d lines converted SUCCESSFULLY!" % count)
        print("--- FINISHED ---")
        print("")
| 144 |
| 145 |
def main():
    """Converts INPUT_FILE into OUTPUT_FILE using the module-level settings."""
    converter = Converter(fields=FIELDS, filein=INPUT_FILE,
                          fileout=OUTPUT_FILE)
    converter.convert(display_input_lines=DISPLAY_INPUT_LINES,
                      display_converted_lines=DISPLAY_CONVERTED_LINES)


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
OLD | NEW |