OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python |
| 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 """Converts profile datasets to dictionary list for Autofill profiles. |
| 7 |
| 8 Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
| 9 """ |
| 10 |
| 11 import codecs |
| 12 import logging |
| 13 import os |
| 14 import re |
| 15 import sys |
| 16 |
| 17 |
class DatasetConverter(object):
  """Converts a "|"-separated Autofill profile dataset into dictionaries.

  Every valid input line holds exactly one value per entry of _fields, in
  that order. Convert() returns the records as a list of dictionaries and,
  when an output filename was supplied, also writes them to that file as a
  Python list literal.
  """

  # Dictionary keys, in the order the values appear on each input line.
  _fields = [
    u'NAME_FIRST',
    u'NAME_MIDDLE',
    u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    u'ADDRESS_HOME_LINE1',
    u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY',
    u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP',
    u'ADDRESS_HOME_COUNTRY',
    u'PHONE_HOME_WHOLE_NUMBER',
    u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # One "u'KEY': u'%s'" slot per field. The ",}," tail (comma before the
  # closing brace) is kept so generated files stay byte-identical to the ones
  # produced by earlier versions of this script.
  _output_pattern = (u'{' +
                     u', '.join(u"u'%s': u'%%s'" % key for key in _fields) +
                     u',},')
  _re_single_quote = re.compile("'", re.UNICODE)
  _logger = logging.getLogger(__name__)
  # Console handler shared by all instances, so that constructing several
  # converters does not print every log message several times.
  _console_handler = None

  def __init__(self, input_filename, output_filename=None,
               logging_level=logging.ERROR):
    r"""Constructs a dataset converter object.

    Full input pattern:
      '(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)\|(?P<NAME_LAST>.*?)\|
      (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|(?P<ADDRESS_HOME_LINE1>.*?)
      \|(?P<ADDRESS_HOME_LINE2>.*?)\|(?P<ADDRESS_HOME_CITY>.*?)\|
      (?P<ADDRESS_HOME_STATE>.*?)\|(?P<ADDRESS_HOME_ZIP>.*?)\|
      (?P<ADDRESS_HOME_COUNTRY>.*?)\|
      (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$'

    Full output pattern:
      "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
      u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
      u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
      u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
      u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
      u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset. It is joined onto
                      the directory of the running script (sys.argv[0]), so
                      an absolute path is used unchanged.
      output_filename: name and path of the converted file, default is None
                       (nothing is written). Used exactly as given.
      logging_level: set verbosity levels, default is ERROR.

    Raises:
      IOError: error if input file does not exist.
    """
    cls = DatasetConverter
    if cls._console_handler is None:
      cls._console_handler = logging.StreamHandler()
      self._logger.addHandler(cls._console_handler)
    cls._console_handler.setLevel(logging_level)
    # A handler level alone is not enough: the logger itself drops records
    # below its effective level (WARNING unless configured otherwise), so
    # lower it when the requested verbosity would otherwise be filtered out.
    if not self._logger.isEnabledFor(logging_level):
      self._logger.setLevel(logging_level)

    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      msg = 'File "%s" does not exist' % self._input_filename
      self._logger.error(msg)
      raise IOError(msg)
    self._output_filename = output_filename

  @staticmethod
  def _ConsoleSafe(text):
    """Returns text without the characters the console cannot represent.

    sys.stdout.encoding is None when stdout is not attached to a terminal
    (for example when piped), so UTF-8 is used as the fallback.
    """
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    return text.encode(encoding, 'ignore').decode(encoding, 'ignore')

  def _CreateDictionaryFromRecord(self, record):
    """Constructs and returns a dictionary from a record in the dataset file.

    Escapes single quotes first and then uses split('|') to separate values.
    A record is only accepted when it yields exactly one value per entry of
    _fields; the values are paired with the field names in order.

    Example:
      With every field present, a record that starts with
        u"John|Q|O'Neil|..."
      produces
        {
          u'NAME_FIRST': u'John',
          u'NAME_MIDDLE': u'Q',
          u'NAME_LAST': u"O\\'Neil",
          ...
        }
      Note that the escaped form of the quote is what ends up in the value.

    Args:
      record: row of record from the dataset file.

    Returns:
      None if the current record line is invalid (no "|" at all, or a wrong
      number of fields), otherwise a dictionary representing a single record
      from the dataset file.
    """
    # Ignore irrelevant record lines that do not contain '|'.
    if '|' not in record:
      return None
    # Escaping single quote: "'" -> "\'"
    # NOTE(review): backslashes already present in the data are not escaped,
    # so such a value would not round-trip through the generated literal.
    record = self._re_single_quote.sub(r"\'", record)
    values = record.split('|')
    # Check for case when a record may have more or less fields than expected.
    if len(values) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s',
          len(values), self._record_length, record)
      return None
    return dict(zip(self._fields, values))

  def Convert(self):
    """Function to convert input data into the desired output format.

    Blank lines and lines rejected by _CreateDictionaryFromRecord are
    skipped. When an output filename was given, the accepted records are
    written to it (UTF-8 with BOM) as "[", one formatted record per line,
    and "]".

    Returns:
      List that holds all the dictionaries.
    """
    # Read bytes and decode explicitly: this behaves the same on Python 2 and
    # Python 3, where the unicode() builtin no longer exists.
    with open(self._input_filename, 'rb') as input_file:
      output_file = None
      if self._output_filename:
        output_file = codecs.open(self._output_filename, mode='wb',
                                  encoding='utf-8-sig')
      try:
        # Hoisted so the console-safe copies are only built when they will
        # actually be logged.
        log_info = self._logger.isEnabledFor(logging.INFO)
        list_of_dict = []
        if output_file:
          output_file.write(u'[')
          output_file.write(os.linesep)
        for raw_line in input_file:
          raw_line = raw_line.strip()
          if not raw_line:
            continue
          line = raw_line.decode('utf-8')
          output_record = self._CreateDictionaryFromRecord(line)
          if not output_record:
            continue
          list_of_dict.append(output_record)
          output_line = self._output_pattern % tuple(
              output_record[key] for key in self._fields)
          if output_file:
            output_file.write(output_line)
            output_file.write(os.linesep)
          if log_info:
            self._logger.info('%d: %s', len(list_of_dict),
                              self._ConsoleSafe(line))
            self._logger.info('\tconverted to: %s',
                              self._ConsoleSafe(output_line))
        if output_file:
          output_file.write(u']')
          output_file.write(os.linesep)
        self._logger.info('%d lines converted SUCCESSFULLY!',
                          len(list_of_dict))
        self._logger.info('--- FINISHED ---')
        return list_of_dict
      finally:
        if output_file:
          output_file.close()
| 167 |
| 168 |
def main():
  """Converts the bundled Autofill dataset with INFO-level logging.

  Builds a DatasetConverter for the fixed input and output paths below and
  runs the conversion; the list returned by Convert() is discarded.
  """
  converter = DatasetConverter(
      r'../data/autofill/dataset.txt',
      output_filename=r'../data/autofill/dataset_duplicate-profiles.txt',
      logging_level=logging.INFO)
  converter.Convert()
| 174 |
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == '__main__':
  main()
OLD | NEW |