OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/python | |
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 """Converts profile datasets to dictionary list for Autofill profiles. | |
7 | |
8 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. | |
9 """ | |
10 | |
11 import codecs | |
12 import logging | |
13 import os | |
14 import re | |
15 import sys | |
16 | |
17 | |
class NullHandler(logging.Handler):
  """Logging handler that silently discards every record it is given.

  An instance is attached to the DatasetConverter logger, so that logger
  always has at least one handler even when no console output is requested.
  """

  def emit(self, record):
    """Ignores |record|; nothing is written anywhere."""
    pass
21 | |
22 | |
class DatasetConverter(object):
  """Converts a '|'-separated profile dataset into Autofill dictionaries.

  Every non-blank input line that holds exactly len(_fields) values separated
  by '|' becomes one dictionary keyed by the names in _fields. Lines without
  any '|' are skipped silently; lines with a different number of fields are
  skipped after logging a warning.
  """

  # Dictionary keys, in the order their values appear within an input record.
  _fields = [
    u'NAME_FIRST',
    u'NAME_MIDDLE',
    u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    u'ADDRESS_HOME_LINE1',
    u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY',
    u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP',
    u'ADDRESS_HOME_COUNTRY',
    u'PHONE_HOME_WHOLE_NUMBER',
    u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # Template for one output line: "{u'KEY': u'%s', ..., u'KEY': u'%s',},".
  # Built with a generator expression so that no loop variable is left behind
  # as a class attribute.
  _output_pattern = (
      u'{' + u' '.join(u"u'%s': u'%%s'," % name for name in _fields) + u'},')
  _re_single_quote = re.compile("'", re.UNICODE)
  # One logger shared by all instances; the NullHandler guarantees it always
  # has a handler, even when console logging is never requested.
  _logger = logging.getLogger(__name__)
  _logger.addHandler(NullHandler())
  # Console handler shared by all instances, created on first request only.
  _console_handler = None
  # Aliases of the logging module levels, kept for backward compatibility.
  # New callers should pass logging.INFO / WARNING / ERROR directly.
  info_level = logging.INFO
  warning_level = logging.WARNING
  error_level = logging.ERROR

  def __init__(self, input_filename, output_filename=None, logging_level=None):
    r"""Constructs a dataset converter object.

    Full input pattern:
      '(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)\|(?P<NAME_LAST>.*?)\|
      (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|(?P<ADDRESS_HOME_LINE1>.*?)
      \|(?P<ADDRESS_HOME_LINE2>.*?)\|(?P<ADDRESS_HOME_CITY>.*?)\|
      (?P<ADDRESS_HOME_STATE>.*?)\|(?P<ADDRESS_HOME_ZIP>.*?)\|
      (?P<ADDRESS_HOME_COUNTRY>.*?)\|
      (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$'

    Full output pattern:
      "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
      u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
      u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
      u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
      u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
      u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset. It is joined onto
          the directory of the running script (sys.argv[0]).
      output_filename: name and path of the converted file. With the default
          of None no file is written and Convert() only returns its result.
      logging_level: verbosity of the shared logger, e.g. logging.INFO. With
          the default of None no console output is set up.

    Raises:
      IOError: error if input file does not exist.
    """
    if logging_level:
      # The logger is shared by every instance, so attach the console handler
      # only once; otherwise each new converter would duplicate every message.
      if DatasetConverter._console_handler is None:
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        self._logger.addHandler(console)
        DatasetConverter._console_handler = console
      self._logger.setLevel(logging_level)

    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      msg = 'File "%s" does not exist' % self._input_filename
      self._logger.error(msg)
      raise IOError(msg)
    self._output_filename = output_filename

  @staticmethod
  def _ConsoleSafe(text):
    """Returns |text| without the characters the console cannot encode.

    Falls back to UTF-8 when sys.stdout reports no encoding (for example when
    the output is piped), instead of failing on an encoding of None.
    """
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    return text.encode(encoding, 'ignore').decode(encoding)

  @staticmethod
  def _WriteLine(output_file, text):
    """Writes |text| and a line separator to |output_file| unless it is None."""
    if output_file is not None:
      output_file.write(text)
      output_file.write(os.linesep)

  def _CreateDictionaryFromRecord(self, record):
    """Constructs and returns a dictionary from a record in the dataset file.

    Escapes single quotation first and uses split('|') to separate values.
    The values are then paired, in order, with the keys listed in _fields.

    Shortened example (a real record must hold one value per key in _fields):
      The record u'John|...|Doe|...' yields
      {
        u'NAME_FIRST': u'John',
        ...
        u'NAME_LAST': u'Doe',
        ...
      }

    Because escaping happens before splitting, the values in the returned
    dictionary contain the escaped form (u"\\'") of any single quote.

    Args:
      record: row of record from the dataset file.

    Returns:
      A dictionary representing a single record from the dataset file, or
      None if the record contains no '|' or does not have exactly
      len(_fields) fields.
    """
    # Ignore irrelevant record lines that do not contain '|'.
    if '|' not in record:
      return None
    # Escaping single quote: "'" -> "\'"
    record = self._re_single_quote.sub(r"\'", record)
    values = record.split('|')
    # Reject records that have more or fewer fields than expected.
    if len(values) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s',
          len(values), self._record_length, record)
      return None
    return dict(zip(self._fields, values))

  def Convert(self):
    """Converts the input dataset into a list of profile dictionaries.

    Reads the input file line by line as UTF-8, skips blank lines and lines
    rejected by _CreateDictionaryFromRecord(), and logs every converted
    record. If an output filename was given, the converted records are also
    written to that file (UTF-8 with signature) as one dictionary literal per
    line, enclosed in '[' and ']' lines.

    Returns:
      List of dictionaries, one per converted record, in input order.
    """
    converted = []
    # Read bytes and decode per line so the code behaves the same on every
    # Python version; whitespace is stripped before decoding.
    with open(self._input_filename, 'rb') as input_file:
      if self._output_filename:
        output_file = codecs.open(self._output_filename, mode='wb',
                                  encoding='utf-8-sig')
      else:
        output_file = None
      try:
        self._WriteLine(output_file, u'[')
        for raw_line in input_file:
          raw_line = raw_line.strip()
          if not raw_line:
            continue
          line = raw_line.decode('UTF-8')
          output_record = self._CreateDictionaryFromRecord(line)
          if not output_record:
            continue
          converted.append(output_record)
          output_line = self._output_pattern % tuple(
              output_record[key] for key in self._fields)
          self._WriteLine(output_file, output_line)
          self._logger.info('%d: %s', len(converted), self._ConsoleSafe(line))
          self._logger.info('\tconverted to: %s',
                            self._ConsoleSafe(output_line))
        self._WriteLine(output_file, u']')
        self._logger.info('%d lines converted SUCCESSFULLY!', len(converted))
        self._logger.info('--- FINISHED ---')
        return converted
      finally:
        if output_file is not None:
          output_file.close()
172 | |
173 | |
def main():
  """Converts the Autofill dataset and writes the duplicate-profiles file.

  Both paths are handed to DatasetConverter; the input path is resolved by
  the converter relative to the directory of the running script. Conversion
  progress is logged at INFO level.
  """
  converter = DatasetConverter(
      r'../data/autofill/dataset.txt',
      r'../data/autofill/dataset_duplicate-profiles.txt',
      logging.INFO)
  converter.Convert()
179 | |
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()
OLD | NEW |