Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/python | |
| 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 """Converts profile datasets to dictionary list for Autofill profiles. | |
| 7 | |
| 8 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. | |
| 9 """ | |
| 10 | |
| 11 import codecs | |
| 12 import logging | |
| 13 import os | |
| 14 import re | |
| 15 import sys | |
| 16 | |
| 17 | |
class NullHandler(logging.Handler):
  """Logging handler that silently discards every record it is given."""

  def emit(self, record):
    """Ignores the record; nothing is formatted or written.

    Args:
      record: the log record handed over by the logging framework.
    """
    return None
|
dennis_jeffrey
2011/02/16 19:43:29
Right now it looks like you will never see any log
dyu1
2011/02/17 20:38:06
Done.
| |
| 21 | |
|
dennis_jeffrey
2011/02/16 19:43:29
Put one more blank line here, to separate these tw
dyu1
2011/02/17 20:38:06
Done.
| |
class DatasetConverter(object):
  """Converts a '|'-separated profile dataset into Autofill profile dicts.

  Every usable input line must hold exactly one value per entry of _fields,
  separated by '|'. Each such line becomes a dictionary keyed by _fields and,
  when an output file is requested, one dictionary literal line in that file.
  """

  # Profile keys, in the order their values appear on each dataset line.
  _fields = [
    u'NAME_FIRST',
    u'NAME_MIDDLE',
    u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    u'ADDRESS_HOME_LINE1',
    u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY',
    u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP',
    u'ADDRESS_HOME_COUNTRY',
    u'PHONE_HOME_WHOLE_NUMBER',
    u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # Template for one output line: a dict literal with a '%s' slot per field.
  # The [:-1] drops only the trailing space, so the final field keeps its
  # comma and the line ends with "',},".
  _output_pattern = (
      u'{' +
      u''.join(u"u'%s': u'%%s', " % key for key in _fields)[:-1] +
      u'},')
  _re_single_quote = re.compile("'", re.UNICODE)
  _logger = logging.getLogger(__name__)
  # Prefer the standard library's NullHandler where it exists; older
  # interpreters lack it, so fall back to this module's own NullHandler.
  _logger.addHandler(
      logging.NullHandler() if hasattr(logging, 'NullHandler')
      else NullHandler())

  def __init__(self, input_filename, output_filename=None):
    r"""Constructs a dataset converter object.

    Full input pattern:
      '(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)\|(?P<NAME_LAST>.*?)\|
      (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|(?P<ADDRESS_HOME_LINE1>.*?)
      \|(?P<ADDRESS_HOME_LINE2>.*?)\|(?P<ADDRESS_HOME_CITY>.*?)\|
      (?P<ADDRESS_HOME_STATE>.*?)\|(?P<ADDRESS_HOME_ZIP>.*?)\|
      (?P<ADDRESS_HOME_COUNTRY>.*?)\|
      (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$'

    Full output pattern:
      "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
      u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
      u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
      u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
      u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
      u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset, resolved relative
          to the directory of the running script (sys.argv[0]).
      output_filename: name and path of the converted file, used exactly as
          given. Default is None, in which case no file is written.

    Raises:
      IOError: if the resolved input file does not exist.
    """
    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      raise IOError('File "%s" does not exist' % self._input_filename)
    self._output_filename = output_filename

  @staticmethod
  def _LogSafe(text):
    """Returns text without characters the console encoding cannot represent.

    sys.stdout.encoding can be None (for example when output is redirected),
    so UTF-8 is used as the fallback instead of passing None to encode().
    """
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    return text.encode(encoding, 'ignore').decode(encoding, 'ignore')

  def _CreateDictionaryFromRecord(self, line):
    """Constructs and returns a dictionary from a record in the dataset file.

    Every single quote is first prefixed with a backslash, then the line is
    split on '|' and the values are paired with _fields in order.
    Example:
      A line with one value per field, such as
        u'John|Q|Doe|...|5551234'
      yields
        {
          u'NAME_FIRST': u'John',
          u'NAME_MIDDLE': u'Q',
          u'NAME_LAST': u'Doe',
          ...
          u'PHONE_FAX_WHOLE_NUMBER': u'5551234',
        }

    Args:
      line: row of record from the dataset file.

    Returns:
      A dictionary mapping each field name to its (quote-escaped) value, or
      None when the line has no '|' or the wrong number of fields.
    """
    # Ignore irrelevant record lines that do not contain '|'.
    if '|' not in line:
      return None
    # Escaping single quote: "'" -> "\'"
    # NOTE(review): backslashes already present in the data are not escaped;
    # confirm the dataset never contains them.
    line = self._re_single_quote.sub(r"\'", line)
    values = line.split('|')
    # Reject lines that have more or fewer fields than expected.
    if len(values) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s',
          len(values), self._record_length, line)
      return None
    return dict(zip(self._fields, values))

  def _Convert(self, input_file, output_file):
    """Converts every usable line of input_file into a profile dictionary.

    Blank lines and lines rejected by _CreateDictionaryFromRecord are
    skipped. When output_file is given, the result is also written to it as
    a '[' line, one formatted dictionary line per record, and a ']' line.

    Args:
      input_file: iterable of dataset lines; byte strings are decoded as
          UTF-8.
      output_file: writable file object for the converted dictionary list,
          or None to skip writing.

    Returns:
      A list holding one dictionary per converted line.
    """
    converted = []
    write_output = output_file is not None
    if write_output:
      output_file.write('[')
      output_file.write(os.linesep)
    for line in input_file:
      line = line.strip()
      if not line:
        continue
      # Decode explicitly so the result does not depend on platform defaults.
      if isinstance(line, bytes):
        line = line.decode('utf-8')
      output_record = self._CreateDictionaryFromRecord(line)
      if not output_record:
        continue
      converted.append(output_record)
      output_line = self._output_pattern % tuple(
          output_record[key] for key in self._fields)
      if write_output:
        output_file.write(output_line)
        output_file.write(os.linesep)
      self._logger.info('%d: %s', len(converted), self._LogSafe(line))
      self._logger.info('\tconverted to: %s', self._LogSafe(output_line))
    if write_output:
      output_file.write(']')
      output_file.write(os.linesep)
    self._logger.info('%d lines converted SUCCESSFULLY!', len(converted))
    self._logger.info('--- FINISHED ---')
    return converted

  def Convert(self):
    """Converts the input dataset given at construction time.

    Reads the input file and, if an output filename was supplied, writes the
    converted dictionary list to it encoded as 'utf-8-sig'.

    Returns:
      A list holding one dictionary per converted line.
    """
    # Binary mode: _Convert decodes each line as UTF-8 itself.
    with open(self._input_filename, 'rb') as input_file:
      if not self._output_filename:
        return self._Convert(input_file, None)
      with codecs.open(self._output_filename, mode='wb',
                       encoding='utf-8-sig') as output_file:
        return self._Convert(input_file, output_file)
| 168 | |
| 169 | |
def main():
  """Converts the bundled Autofill dataset when this file is run as a script.

  Reads '../data/autofill/dataset.txt' and writes the converted dictionary
  list to '../data/autofill/dataset_duplicate-profiles.txt'.
  """
  # The converter only reports progress and malformed lines through the
  # logging module; without a configured handler that output is discarded.
  logging.basicConfig(level=logging.INFO)
  c = DatasetConverter(r'../data/autofill/dataset.txt',
                       r'../data/autofill/dataset_duplicate-profiles.txt')
  c.Convert()


if __name__ == '__main__':
  main()
| OLD | NEW |