Chromium Code Reviews| Index: chrome/test/functional/dataset_converter.py |
| =================================================================== |
| --- chrome/test/functional/dataset_converter.py (revision 0) |
| +++ chrome/test/functional/dataset_converter.py (revision 0) |
| @@ -0,0 +1,176 @@ |
| +#!/usr/bin/python |
| +# Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +"""Converts profile datasets to dictionary list for Autofill profiles. |
| + |
| +Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
| +""" |
| + |
| +import codecs |
| +import logging |
| +import os |
| +import re |
| +import sys |
| + |
| + |
class NullHandler(logging.Handler):
  """Logging handler that silently discards every record it receives.

  An instance is attached to DatasetConverter's logger so that the logger
  always has at least one handler; records still propagate to any handlers
  the application installs on ancestor loggers.
  """

  def emit(self, record):
    """Ignores the record.

    Args:
      record: the logging.LogRecord to discard.
    """
    pass
|
dennis_jeffrey
2011/02/16 19:43:29
Right now it looks like you will never see any log
dyu1
2011/02/17 20:38:06
Done.
|
| + |
|
dennis_jeffrey
2011/02/16 19:43:29
Put one more blank line here, to separate these tw
dyu1
2011/02/17 20:38:06
Done.
|
class DatasetConverter(object):
  """Converts a '|'-separated profile dataset into Autofill profile dicts.

  Every usable input line holds one value per entry of _fields, in that
  order. Convert() returns the records as a list of dictionaries and can
  also write them to a file as a Python-style list literal.
  """

  # Dictionary keys, in the order their values appear in a dataset record.
  _fields = [
      u'NAME_FIRST',
      u'NAME_MIDDLE',
      u'NAME_LAST',
      u'EMAIL_ADDRESS',
      u'COMPANY_NAME',
      u'ADDRESS_HOME_LINE1',
      u'ADDRESS_HOME_LINE2',
      u'ADDRESS_HOME_CITY',
      u'ADDRESS_HOME_STATE',
      u'ADDRESS_HOME_ZIP',
      u'ADDRESS_HOME_COUNTRY',
      u'PHONE_HOME_WHOLE_NUMBER',
      u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # Template for one output line: "{u'NAME_FIRST': u'%s', ..., u'...': u'%s',},"
  _output_pattern = (
      u'{' +
      u', '.join([u"u'%s': u'%%s'" % key for key in _fields]) +
      u',},')
  _re_single_quote = re.compile("'", re.UNICODE)
  _logger = logging.getLogger(__name__)
  # Keep the logger quiet unless the application configures logging. Use the
  # standard library's NullHandler where it exists and fall back to this
  # module's NullHandler on interpreters that predate it.
  _logger.addHandler(
      logging.NullHandler() if hasattr(logging, 'NullHandler')
      else NullHandler())

  def __init__(self, input_filename, output_filename=None):
    r"""Constructs a dataset converter object.

    Full input pattern (wrapped here for readability):
      '(?P<NAME_FIRST>.*?)\|(?P<NAME_MIDDLE>.*?)\|(?P<NAME_LAST>.*?)\|
      (?P<EMAIL_ADDRESS>.*?)\|(?P<COMPANY_NAME>.*?)\|
      (?P<ADDRESS_HOME_LINE1>.*?)\|(?P<ADDRESS_HOME_LINE2>.*?)\|
      (?P<ADDRESS_HOME_CITY>.*?)\|(?P<ADDRESS_HOME_STATE>.*?)\|
      (?P<ADDRESS_HOME_ZIP>.*?)\|(?P<ADDRESS_HOME_COUNTRY>.*?)\|
      (?P<PHONE_HOME_WHOLE_NUMBER>.*?)\|(?P<PHONE_FAX_WHOLE_NUMBER>.*?)$'

    Full output pattern (wrapped here for readability):
      "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
      u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
      u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
      u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
      u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
      u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset. It is joined onto
          the directory of the running script (sys.argv[0]), so a relative
          path is resolved against that directory and an absolute path is
          used unchanged.
      output_filename: name and path of the converted file, used exactly as
          given. Defaults to None, in which case no file is written.

    Raises:
      IOError: if the resolved input path is not an existing file.
    """
    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      raise IOError('File "%s" does not exist' % self._input_filename)
    self._output_filename = output_filename

  def _CreateDictionaryFromRecord(self, line):
    """Constructs and returns a dictionary from a record in the dataset file.

    Single quotes are escaped first, then the line is split on '|' and the
    values are paired with _fields in order.

    Example:
      The record u'John||Doe|...' (one value per entry of _fields) yields
      {
        u'NAME_FIRST': u'John',
        u'NAME_MIDDLE': u'',
        u'NAME_LAST': u'Doe',
        ...
      }

    Args:
      line: one row of the dataset file, as text.

    Returns:
      A dictionary mapping every entry of _fields to its value, or None when
      the line contains no '|' or has the wrong number of fields.
    """
    # Ignore irrelevant lines that do not contain '|'.
    if '|' not in line:
      return None
    # Escape single quotes so each value can sit inside the single-quoted
    # literals of _output_pattern.
    # NOTE(review): the returned values keep this escaped form, and
    # backslashes are not escaped. Confirm callers expect that before
    # changing it.
    line = self._re_single_quote.sub(r"\'", line)
    values = line.split('|')
    # Reject lines that have more or fewer fields than expected.
    if len(values) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s',
          len(values), self._record_length, line)
      return None
    return dict(zip(self._fields, values))

  def _Convert(self, input_file, output_file):
    """Converts every usable line of input_file into a dictionary.

    Blank lines and lines rejected by _CreateDictionaryFromRecord are
    skipped. When output_file is given, the records are written between a
    '[' line and a ']' line, one _output_pattern-formatted record per line.

    Args:
      input_file: iterable of dataset lines. Byte strings are decoded as
          UTF-8 (a leading byte order mark is dropped); text is used as is.
      output_file: writable file object for the converted list, or None to
          skip writing.

    Returns:
      A list holding one dictionary per converted record.
    """
    list_of_dict = []
    if output_file:
      output_file.write('[')
      output_file.write(os.linesep)
    # Iterate lazily rather than loading the whole file with readlines().
    for line in input_file:
      line = line.strip()
      if not line:
        continue
      if isinstance(line, bytes):
        line = line.decode('utf-8-sig')
      output_record = self._CreateDictionaryFromRecord(line)
      if not output_record:
        continue
      list_of_dict.append(output_record)
      output_line = self._output_pattern % tuple(
          [output_record[key] for key in self._fields])
      if output_file:
        output_file.write(output_line)
        output_file.write(os.linesep)
      # Pass the text as logging arguments: nothing is formatted unless a
      # handler wants the record, and no stdout encoding is needed (it can
      # be None when output is redirected).
      self._logger.info('%d: %s', len(list_of_dict), line)
      self._logger.info('\tconverted to: %s', output_line)
    if output_file:
      output_file.write(']')
      output_file.write(os.linesep)
    self._logger.info('%d lines converted SUCCESSFULLY!', len(list_of_dict))
    self._logger.info('--- FINISHED ---')
    return list_of_dict

  def Convert(self):
    """Converts the input dataset given to the constructor.

    The output file is written only when an output filename was supplied.
    It is encoded as UTF-8 with a leading byte order mark.

    Returns:
      A list holding one dictionary per converted record.
    """
    # Read bytes so decoding does not depend on the platform default
    # encoding; _Convert decodes each line as UTF-8.
    with open(self._input_filename, 'rb') as input_file:
      if not self._output_filename:
        return self._Convert(input_file, None)
      with codecs.open(self._output_filename, mode='wb',
                       encoding='utf-8-sig') as output_file:
        return self._Convert(input_file, output_file)
| + |
| + |
def main():
  """Converts the bundled Autofill dataset when this file is run as a script.

  The input path is resolved relative to this script's directory by
  DatasetConverter; the output path is used as given.
  """
  # The converter's logger only has a no-op handler attached, so without a
  # root handler its progress messages and skipped-record warnings would be
  # discarded when the script is run directly.
  logging.basicConfig(level=logging.INFO, format='%(message)s')
  c = DatasetConverter(r'../data/autofill/dataset.txt',
                       r'../data/autofill/dataset_duplicate-profiles.txt')
  c.Convert()


if __name__ == '__main__':
  main()