Chromium Code Reviews| Index: chrome/test/functional/autofill_dataset_converter.py |
| =================================================================== |
| --- chrome/test/functional/autofill_dataset_converter.py (revision 0) |
| +++ chrome/test/functional/autofill_dataset_converter.py (revision 0) |
| @@ -0,0 +1,181 @@ |
| +#!/usr/bin/python |
| +# Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +"""Converts profile datasets to dictionary list for Autofill profiles. |
| + |
| +Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
| +""" |
| + |
| +import codecs |
| +import logging |
| +import os |
| +import re |
| +import sys |
| + |
| + |
class NullHandler(logging.Handler):
  """Logging handler that silently discards every record it receives.

  Installed on the module logger so that importing code does not trigger
  Python 2's 'no handlers could be found' warning when no real handler
  has been configured.
  """

  def emit(self, record):
    """Drop *record* without producing any output."""
| + |
class DatasetConverter(object):
  """Converts a '|'-separated profile dataset into Autofill dictionaries."""

  # Autofill field names, in the order their values appear in each record.
  _fields = [
    u'NAME_FIRST',
    u'NAME_MIDDLE',
    u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    u'ADDRESS_HOME_LINE1',
    u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY',
    u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP',
    u'ADDRESS_HOME_COUNTRY',
    u'PHONE_HOME_WHOLE_NUMBER',
    u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # Build the "{u'FIELD': u'%s', ...}," output template once, at
  # class-definition time.
  _output_pattern = u'{'
  for key in _fields:
    _output_pattern += u"u'%s': u'%%s', " % key
  _output_pattern = _output_pattern[:-1] + '},'
  _re_single_quote = re.compile("'", re.UNICODE)
  _logger = logging.getLogger(__name__)
  # Swallow log records unless the caller installs a real handler.
  _logger.addHandler(NullHandler())

  # Verbosity levels callers may pass as |logging_level| without having to
  # import the logging module themselves.
  info_level = logging.INFO
  warning_level = logging.WARNING
  error_level = logging.ERROR

  def __init__(self, input_filename, output_filename=None, logging_level=None):
    """Constructs a dataset converter object.

    Full input pattern:
    '(?P<NAME_FIRST>.*?)\|(?P<MIDDLE_NAME>.*?)\|(?P<NAME_LAST>.*?)\|
    (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|(?P<ADDRESS_HOME_LINE1>.*?)
    \|(?P<ADDRESS_HOME_LINE2>.*?)\|(?P<ADDRESS_HOME_CITY>.*?)\|
    (?P<ADDRESS_HOME_STATE>.*?)\|(?P<ADDRESS_HOME_ZIP>.*?)\|
    (?P<ADDRESS_HOME_COUNTRY>.*?)\|
    (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$'

    Full output pattern:
    "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
    u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
    u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
    u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
    u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
    u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset, relative to the
          directory containing this script.
      output_filename: name and path of the converted file; defaults to None,
          in which case no output file is written.
      logging_level: verbosity level for console output; defaults to None,
          which leaves console logging disabled.

    Raises:
      IOError: error if input file does not exist.
    """
    if logging_level:
      console = logging.StreamHandler()
      # Honor the requested level on the handler too; a hard-coded INFO here
      # would silently drop messages below INFO even when the caller asked
      # for a more verbose level.
      console.setLevel(logging_level)
      self._logger.addHandler(console)
      self._logger.setLevel(logging_level)

    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      msg = 'File "%s" does not exist' % self._input_filename
      self._logger.error(msg)
      raise IOError(msg)
    self._output_filename = output_filename

  def _CreateDictionaryFromRecord(self, record):
    """Constructs and returns a dictionary from a record in the dataset file.

    Escapes single quotation first and uses split('|') to separate values.
    Example:
      Take an argument as a string u'John|Doe|Mountain View'
      and returns a dictionary
      {
        u'NAME_FIRST': u'John',
        u'NAME_LAST': u'Doe',
        u'ADDRESS_HOME_CITY': u'Mountain View',
      }

    Args:
      record: row of record from the dataset file.

    Returns:
      A dictionary representing a single record from the dataset file, or
      None if the record contains no '|' separator or does not have the
      expected number of fields.
    """
    # Ignore irrelevant record lines that do not contain '|'.
    if '|' not in record:
      return None
    # Escaping single quote: "'" -> "\'"
    record = self._re_single_quote.sub(r"\'", record)
    record_list = record.split('|')
    # Check for case when a record may have more or less fields than expected.
    # (split() always returns at least one element, so no emptiness check is
    # needed.)
    if len(record_list) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s' % (
              len(record_list), self._record_length, record))
      return None
    # Pair each field name with its value positionally.
    return dict(zip(self._fields, record_list))

  def Convert(self):
    """Converts the input dataset into the desired output format.

    Writes the converted records to the output file when one was requested
    at construction time.

    Returns:
      A list of dictionaries, one per successfully converted record.
    """
    with open(self._input_filename) as input_file:
      if self._output_filename:
        output_file = codecs.open(self._output_filename, mode='wb',
                                  encoding='utf-8-sig')
      else:
        output_file = None
      try:
        list_of_dict = []
        i = 0
        # sys.stdout.encoding is None when stdout is redirected; fall back
        # to UTF-8 so .encode() below cannot raise.
        console_encoding = sys.stdout.encoding or 'utf-8'
        if output_file:
          output_file.write('[')
          output_file.write(os.linesep)
        # Iterate the file lazily instead of materializing it via readlines().
        for line in input_file:
          line = line.strip()
          if not line:
            continue
          line = unicode(line, 'UTF-8')
          output_record = self._CreateDictionaryFromRecord(line)
          if output_record:
            i += 1
            list_of_dict.append(output_record)
            output_line = self._output_pattern % tuple(
                [output_record[key] for key in self._fields])
            if output_file:
              output_file.write(output_line)
              output_file.write(os.linesep)
            self._logger.info(
                '%d: %s' % (i, line.encode(console_encoding, 'ignore')))
            self._logger.info(
                '\tconverted to: %s' %
                output_line.encode(console_encoding, 'ignore'))
        if output_file:
          output_file.write(']')
          output_file.write(os.linesep)
        self._logger.info('%d lines converted SUCCESSFULLY!' % i)
        self._logger.info('--- FINISHED ---')
        return list_of_dict
      finally:
        if output_file:
          output_file.close()
| + |
| + |
def main():
  """Converts the default Autofill dataset and writes the duplicate-profile
  output file next to it."""
  # Pass the standard logging level directly rather than going through the
  # DatasetConverter.info_level alias.
  converter = DatasetConverter(
      r'../data/autofill/dataset.txt',
      r'../data/autofill/dataset_duplicate-profiles.txt',
      logging.INFO)
  converter.Convert()


if __name__ == '__main__':
  main()