OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/python | |
2 # Copyright (c) 2010 The Chromium Authors. All rights reserved. | |
Nirnimesh
2011/02/07 23:26:20
2011
dyu1
2011/02/09 19:44:56
Done.
| |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
Nirnimesh
2011/02/07 23:26:20
Remove blank line
dyu1
2011/02/09 19:44:56
Done.
| |
6 | |
7 """Takes in a CSV profiles file and outputs to a dictionary list format | |
8 for converting Autofill profile datasets. | |
9 | |
10 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. | |
11 """ | |
12 | |
13 # Input and output filenames. Each can be a full | |
14 # path, e.g. 'c:\folder\file' or '/home/folder/file'. | |
15 INPUT_FILE = r"../data/autofill/dataset.txt" | |
Nirnimesh
2011/02/07 23:26:20
Can these be moved inside the class as member var
dyu1
2011/02/09 19:44:56
Done.
| |
16 OUTPUT_FILE = r"../data/autofill/dataset_duplicate-profiles.txt" | |
17 | |
18 # Controls output display on the screen. | |
19 DISPLAY_INPUT_LINES = True | |
20 DISPLAY_CONVERTED_LINES = False | |
21 | |
22 FIELDS = [ | |
23 u'NAME_FIRST', | |
24 u'NAME_MIDDLE', | |
25 u'NAME_LAST', | |
26 u'EMAIL_ADDRESS', | |
27 u'COMPANY_NAME', | |
28 u'ADDRESS_HOME_LINE1', | |
29 u'ADDRESS_HOME_LINE2', | |
30 u'ADDRESS_HOME_CITY', | |
31 u'ADDRESS_HOME_STATE', | |
32 u'ADDRESS_HOME_ZIP', | |
33 u'ADDRESS_HOME_COUNTRY', | |
34 u'PHONE_HOME_WHOLE_NUMBER', | |
35 u'PHONE_FAX_WHOLE_NUMBER', | |
36 ] | |
37 | |
38 import codecs | |
39 import os | |
40 import re | |
41 import sys | |
42 | |
Nirnimesh
2011/02/07 23:26:20
Leave another blank line
dyu1
2011/02/09 19:44:56
Done.
| |
43 class Converter(object): | |
Nirnimesh
2011/02/07 23:26:20
Use a more descriptive name. |Converter| is too ge
dyu1
2011/02/09 19:44:56
Done.
| |
44 def __init__(self, fields, filein, fileout): | |
Nirnimesh
2011/02/07 23:26:20
Explain the args. Add an "Args:" section in the do
dyu1
2011/02/09 19:44:56
Done.
| |
45 """ | |
46 The pattern is a regular expression which has named parenthesis groups | |
Nirnimesh
2011/02/07 23:26:20
An example would explain this much better
dyu1
2011/02/09 19:44:56
Done.
| |
47 like this (?P<name>...) in order to match the '|' separated fields. | |
48 If we had only the NAME_FIRST and NAME_MIDDLE fields (e.g. 'Jared|JV') our | |
49 pattern would be: "(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)$" | |
50 | |
51 This means that '(?P<NAME_FIRST> regexp)\|' matches whatever regular | |
52 expression is inside the parentheses, and indicates the start and end of a | |
53 group; the contents of a group can be retrieved after a match has been | |
54 performed using the symbolic group name 'NAME_FIRST'. | |
55 | |
56 The regexp is '.*?'. The '.*' part matches 0 or more repetitions of any | |
57 character. The following '?' makes the regexp non-greedy, meaning it will | |
58 stop at the first occurrence of the '|' character (escaped in the pattern). | |
59 | |
60 For '(?P<NAME_MIDDLE>.*?)$' there is no '|' at the end, so we have '$' to | |
61 indicate the end of the line. | |
62 | |
63 The full pattern is constructed once, from the FIELDS list. | |
64 | |
65 The out_line_pattern for one field: "{u'NAME_FIRST': u'%s'," | |
66 is ready to accept the value for the 'NAME_FIRST' field once it is extracted | |
67 from an input line using the above group pattern. | |
68 | |
69 'pattern' is used in __getRec(line) to construct and return a dictionary | |
70 from a line. | |
71 | |
72 'out_line_pattern' is used in 'convert()' to construct the final dataset | |
73 line that will be printed to the output file. | |
74 """ | |
75 self.fields = fields[:] | |
Nirnimesh
2011/02/07 23:26:20
self._fields
use _ prefix for member vars
Repeat f
dyu1
2011/02/09 19:44:56
Done.
| |
76 self.pattern = '(?P<%s>.*?)' %fields[0] | |
77 for key in fields[1:]: | |
78 self.pattern += '\|(?P<%s>.*?)' %key | |
Nirnimesh
2011/02/07 23:26:20
Isn't it easier to split by '|'?
re.split('|', lin
dyu1
2011/02/09 19:44:56
Done.
| |
79 self.pattern = self.pattern + "$" | |
80 | |
81 self.out_line_pattern = u"{" | |
82 for key in fields: | |
83 self.out_line_pattern += u"u'%s': u'%s', " %(key, "%s") | |
84 self.out_line_pattern = self.out_line_pattern[:-1] + "},\n" | |
85 | |
86 self.filein = filein | |
87 self.fileout = fileout | |
88 | |
89 def __getRec(self, line): | |
Nirnimesh
2011/02/07 23:26:20
why double _?
Method names begin with Cap letter.
dyu1
2011/02/09 19:44:56
Based on this http://docs.python.org/tutorial/clas
| |
90 """ | |
91 Constructs and returns a dictionary from a line using patterns. | |
Nirnimesh
2011/02/07 23:26:20
Merge with last line. Repeat for all methods
dyu1
2011/02/09 19:44:56
Done.
| |
92 See constructor above. | |
93 """ | |
94 rePat = re.compile("'", re.UNICODE) | |
Nirnimesh
2011/02/07 23:26:20
the style guide prohibits camelCase style for loca
dyu1
2011/02/09 19:44:56
Done.
| |
95 line = rePat.sub(r"\'", line) | |
96 rePat = re.compile(self.pattern, re.UNICODE) | |
97 m = rePat.match(line) | |
98 if m: | |
99 outrec = {} | |
100 for key in self.fields: | |
101 outrec[key] = m.group(key) | |
102 return outrec | |
103 | |
104 def convert(self, display_input_lines, display_converted_lines): | |
105 """ | |
106 The out_line_pattern is applied here. It holds one '%s' placeholder per | |
107 field and is formatted with a tuple containing the value of each field. | |
108 This is done in the line: | |
109 out_line = self.out_line_pattern %tuple( | |
110 [outrec[key] for key in self.fields]) | |
111 For two fields, this translates to: | |
112 out_line = "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s',}," % ( | |
113 outrec['NAME_FIRST'], outrec['NAME_MIDDLE']) | |
114 """ | |
115 with open(self.filein) as fin: | |
116 with codecs.open(self.fileout, mode = "wb", | |
117 encoding = "utf-8-sig") as fout: | |
118 i = 0 | |
119 fout.write("[") | |
120 fout.write(os.linesep) | |
121 for line in fin.readlines(): | |
122 line = line.strip() | |
123 if not line: | |
124 continue | |
125 line = unicode(line, 'UTF-8') | |
126 outrec = self.__getRec(line) | |
127 if outrec: | |
128 i += 1 | |
129 out_line = self.out_line_pattern %tuple( | |
130 [outrec[key] for key in self.fields]) | |
131 fout.write(out_line) | |
132 fout.write(os.linesep) | |
133 if display_input_lines: | |
134 print "\n%d: %s" %(i, line.encode(sys.stdout.encoding, 'ignore')) | |
135 if display_converted_lines: | |
136 print "\tconverted to: %s" %out_line.encode( | |
137 sys.stdout.encoding, 'ignore') | |
138 else: | |
139 if not display_input_lines and not i % 10: | |
140 print "\t%d lines converted so far!" %i | |
141 | |
142 fout.write("]") | |
143 fout.write(os.linesep) | |
144 print | |
145 print "%d lines converted SUCCESSFULLY!" %i | |
146 print "--- FINISHED ---" | |
147 print | |
148 | |
149 | |
150 def main(): | |
151 c = Converter(FIELDS, INPUT_FILE, OUTPUT_FILE) | |
Nirnimesh
2011/02/07 23:26:20
Do you really need OUTPUT_FILE?
You could just ret
dyu1
2011/02/09 19:44:56
Will write this script so that it be standalone an
| |
152 c.convert(DISPLAY_INPUT_LINES, DISPLAY_CONVERTED_LINES) | |
153 | |
154 if __name__ == '__main__': | |
155 main() | |
OLD | NEW |