Index: chrome/test/functional/autofill_dataset_converter.py |
=================================================================== |
--- chrome/test/functional/autofill_dataset_converter.py (revision 0) |
+++ chrome/test/functional/autofill_dataset_converter.py (revision 0) |
@@ -0,0 +1,181 @@ |
+#!/usr/bin/python |
+# Copyright (c) 2011 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+"""Converts profile datasets to dictionary list for Autofill profiles. |
+ |
+Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
+""" |
+ |
+import codecs |
+import logging |
+import os |
+import re |
+import sys |
+ |
+ |
+class NullHandler(logging.Handler): |
+ def emit(self, record): |
+ pass |
+ |
+ |
+class DatasetConverter(object): |
+ _fields = [ |
+ u'NAME_FIRST', |
+ u'NAME_MIDDLE', |
+ u'NAME_LAST', |
+ u'EMAIL_ADDRESS', |
+ u'COMPANY_NAME', |
+ u'ADDRESS_HOME_LINE1', |
+ u'ADDRESS_HOME_LINE2', |
+ u'ADDRESS_HOME_CITY', |
+ u'ADDRESS_HOME_STATE', |
+ u'ADDRESS_HOME_ZIP', |
+ u'ADDRESS_HOME_COUNTRY', |
+ u'PHONE_HOME_WHOLE_NUMBER', |
+ u'PHONE_FAX_WHOLE_NUMBER', |
+ ] |
+ _record_length = len(_fields) |
+ _output_pattern = u'{' |
+ for key in _fields: |
+ _output_pattern += u"u'%s': u'%%s', " % key |
+ _output_pattern = _output_pattern[:-1] + '},' |
+ _re_single_quote = re.compile("'", re.UNICODE) |
+ _logger = logging.getLogger(__name__) |
+ _logger.addHandler(NullHandler()) |
dennis_jeffrey
2011/02/17 22:58:35
In the rest of this file, you use "self._logger",
dyu1
2011/02/18 00:31:47
Done.
|
+ info_level = logging.INFO |
+ warning_level = logging.WARNING |
+ error_level = logging.ERROR |
dennis_jeffrey
2011/02/17 22:58:35
I think there's no need to define "info_level", "w
dyu1
2011/02/18 00:31:47
Done.
|
+ |
+ def __init__(self, input_filename, output_filename=None, logging_level=None): |
dennis_jeffrey
2011/02/17 22:58:35
Rather than having "logging_level" default to "Non
dyu1
2011/02/18 00:31:47
Done.
|
+ """Constructs a dataset converter object. |
+ |
+ Full input pattern: |
+ '(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)\|(?P<NAME_LAST>.*?)\| |
+ (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|(?P<ADDRESS_HOME_LINE1>.*?) |
+ \|(?P<ADDRESS_HOME_LINE2>.*?)\|(?P<ADDRESS_HOME_CITY>.*?)\| |
+ (?P<ADDRESS_HOME_STATE>.*?)\|(?P<ADDRESS_HOME_ZIP>.*?)\| |
+ (?P<ADDRESS_HOME_COUNTRY>.*?)\| |
+ (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$' |
+ |
+ Full output pattern: |
+ "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s', |
+ u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1': |
+ u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s', |
+ u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s', |
+ u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s', |
+ u'PHONE_FAX_WHOLE_NUMBER': u'%s',}," |
+ |
+ Args: |
+ input_filename: name and path of the input dataset. |
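+ output_filename: name and path of the converted file; default is None, |
+ in which case no output file is written. |
+ logging_level: verbosity level for the logger; default is None, in which |
+ case no console handler is attached. |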
+ |
+ Raises: |
+ IOError: error if input file does not exist. |
+ """ |
+ if logging_level: |
+ console = logging.StreamHandler() |
+ console.setLevel(logging.INFO) |
+ self._logger.addHandler(console) |
+ self._logger.setLevel(logging_level) |
dennis_jeffrey
2011/02/17 22:58:35
Right now, if the default logging level of "None"
dyu1
2011/02/18 00:31:47
Done.
|
+ |
+ self._input_filename = os.path.join(os.path.dirname(sys.argv[0]), |
+ input_filename) |
+ if not os.path.isfile(self._input_filename): |
+ msg = 'File "%s" does not exist' % self._input_filename |
+ self._logger.error(msg) |
+ raise IOError(msg) |
+ self._output_filename = output_filename |
+ |
+ def _CreateDictionaryFromRecord(self, record): |
+ """Constructs and returns a dictionary from a record in the dataset file. |
+ |
+ Escapes single quotation first and uses split('|') to separate values. |
+ Example: |
+ Takes an argument as a string u'John|Doe|Mountain View' |
+ and returns a dictionary |
+ { |
+ u'NAME_FIRST': u'John', |
+ u'NAME_LAST': u'Doe', |
+ u'ADDRESS_HOME_CITY': u'Mountain View', |
+ } |
dennis_jeffrey
2011/02/17 22:58:35
You may want to also mention in the comment here t
dyu1
2011/02/18 00:31:47
Done.
|
+ |
+ Args: |
+ record: row of record from the dataset file. |
+ |
+ Returns: |
+ A dictionary representing a single record from the dataset file. |
dennis_jeffrey
2011/02/17 22:58:35
The method may also potentially return None if the
dyu1
2011/02/18 00:31:47
Done.
|
+ """ |
+ # Ignore irrelevant record lines that do not contain '|'. |
+ if not '|' in record: |
+ return |
+ # Escaping single quote: "'" -> "\'" |
+ record = self._re_single_quote.sub(r"\'", record) |
+ record_list = record.split('|') |
+ if record_list: |
+ # Check for case when a record may have more or less fields than expected. |
+ if len(record_list) != self._record_length: |
+ self._logger.warning( |
+ 'A "|" separated line has %d fields instead of %d: %s' % ( |
+ len(record_list), self._record_length, record)) |
+ return |
+ out_record = {} |
+ i = 0 |
+ for key in self._fields: |
+ out_record[key] = record_list[i] |
+ i += 1 |
dennis_jeffrey
2011/02/17 22:58:35
There's a cool way in python to iterate through a
dyu1
2011/02/18 00:31:47
Done.
|
+ return out_record |
+ |
+ def Convert(self): |
+ """Converts the input dataset and returns it as a list of dictionaries.""" |
dennis_jeffrey
2011/02/17 22:58:35
This function can return something, so you should
dennis_jeffrey
2011/02/17 22:58:35
Since you've removed the "_Convert()" function, th
dyu1
2011/02/18 00:31:47
Done.
dyu1
2011/02/18 00:31:47
Done.
|
+ with open(self._input_filename) as input_file: |
+ if self._output_filename: |
+ output_file = codecs.open(self._output_filename, mode='wb', |
+ encoding='utf-8-sig') |
+ else: |
+ output_file = None |
+ try: |
+ list_of_dict = [] |
+ i = 0 |
+ if output_file: |
+ output_file.write('[') |
+ output_file.write(os.linesep) |
+ for line in input_file.readlines(): |
+ line = line.strip() |
+ if not line: |
+ continue |
+ line = unicode(line, 'UTF-8') |
+ output_record = self._CreateDictionaryFromRecord(line) |
+ if output_record: |
+ i += 1 |
+ list_of_dict.append(output_record) |
+ output_line = self._output_pattern % tuple( |
+ [output_record[key] for key in self._fields]) |
+ if output_file: |
+ output_file.write(output_line) |
+ output_file.write(os.linesep) |
+ self._logger.info('%d: %s' % (i, line.encode(sys.stdout.encoding, |
+ 'ignore'))) |
+ self._logger.info('\tconverted to: %s' % |
+ output_line.encode(sys.stdout.encoding, 'ignore')) |
+ if output_file: |
+ output_file.write(']') |
+ output_file.write(os.linesep) |
+ self._logger.info('%d lines converted SUCCESSFULLY!' % i) |
+ self._logger.info('--- FINISHED ---') |
+ return list_of_dict |
+ finally: |
+ if output_file: |
+ output_file.close() |
+ |
+ |
+def main(): |
+ c = DatasetConverter(r'../data/autofill/dataset.txt', |
+ r'../data/autofill/dataset_duplicate-profiles.txt', |
+ DatasetConverter.info_level) |
dennis_jeffrey
2011/02/17 22:58:35
I recommend changing
"DatasetConverter.info_level
dyu1
2011/02/18 00:31:47
Done.
|
+ c.Convert() |
+ |
+if __name__ == '__main__': |
+ main() |