Index: chrome/test/functional/dataset_converter.py |
=================================================================== |
--- chrome/test/functional/dataset_converter.py (revision 0) |
+++ chrome/test/functional/dataset_converter.py (revision 0) |
@@ -0,0 +1,176 @@ |
+#!/usr/bin/python |
+# Copyright (c) 2011 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+"""Converts profile datasets to dictionary list for Autofill profiles. |
+ |
+Used for test autofill.AutoFillTest.testMergeDuplicateProfilesInAutofill. |
+""" |
+ |
+import codecs |
+import logging |
+import os |
+import re |
+import sys |
+ |
+ |
class NullHandler(logging.Handler):
  """Logging handler that silently discards every record it receives."""

  def emit(self, record):
    """Drops `record` without writing it anywhere.

    Args:
      record: the logging.LogRecord handed over by the logging framework.
    """
    pass
dennis_jeffrey
2011/02/16 19:43:29
Right now it looks like you will never see any log
dyu1
2011/02/17 20:38:06
Done.
|
+ |
dennis_jeffrey
2011/02/16 19:43:29
Put one more blank line here, to separate these tw
dyu1
2011/02/17 20:38:06
Done.
|
class DatasetConverter(object):
  """Converts a '|'-separated profile dataset into Autofill dictionaries.

  Every usable input line holds exactly one value per entry of `_fields`,
  in that order, separated by '|'. Each such line becomes one dictionary
  that maps the field name to the value found on the line.
  """

  # Field names, in the order in which their values appear on an input line.
  _fields = [
    u'NAME_FIRST',
    u'NAME_MIDDLE',
    u'NAME_LAST',
    u'EMAIL_ADDRESS',
    u'COMPANY_NAME',
    u'ADDRESS_HOME_LINE1',
    u'ADDRESS_HOME_LINE2',
    u'ADDRESS_HOME_CITY',
    u'ADDRESS_HOME_STATE',
    u'ADDRESS_HOME_ZIP',
    u'ADDRESS_HOME_COUNTRY',
    u'PHONE_HOME_WHOLE_NUMBER',
    u'PHONE_FAX_WHOLE_NUMBER',
  ]
  _record_length = len(_fields)
  # Template for one output line: a dictionary literal with one '%s' slot per
  # field. The trailing space left by the last entry is cut off before the
  # closing "},", so a line ends in "u'%s',},".
  _output_pattern = u'{' + u''.join(
      u"u'%s': u'%%s', " % name for name in _fields)
  _output_pattern = _output_pattern[:-1] + u'},'
  _re_single_quote = re.compile("'", re.UNICODE)
  _logger = logging.getLogger(__name__)
  # Attach a do-nothing handler to the module logger. The standard-library
  # handler is preferred; interpreters that predate it fall back to the
  # NullHandler class defined in this module.
  try:
    _logger.addHandler(logging.NullHandler())
  except AttributeError:
    _logger.addHandler(NullHandler())

  def __init__(self, input_filename, output_filename=None):
    """Constructs a dataset converter object.

    Input record layout (one record per line):
      NAME_FIRST|NAME_MIDDLE|NAME_LAST|EMAIL_ADDRESS|COMPANY_NAME|
      ADDRESS_HOME_LINE1|ADDRESS_HOME_LINE2|ADDRESS_HOME_CITY|
      ADDRESS_HOME_STATE|ADDRESS_HOME_ZIP|ADDRESS_HOME_COUNTRY|
      PHONE_HOME_WHOLE_NUMBER|PHONE_FAX_WHOLE_NUMBER

    Full output pattern (one converted record per line):
      "{u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
      u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s', u'ADDRESS_HOME_LINE1':
      u'%s', u'ADDRESS_HOME_LINE2': u'%s', u'ADDRESS_HOME_CITY': u'%s',
      u'ADDRESS_HOME_STATE': u'%s', u'ADDRESS_HOME_ZIP': u'%s',
      u'ADDRESS_HOME_COUNTRY': u'%s', u'PHONE_HOME_WHOLE_NUMBER': u'%s',
      u'PHONE_FAX_WHOLE_NUMBER': u'%s',},"

    Args:
      input_filename: name and path of the input dataset. It is joined onto
          the directory of the running script (sys.argv[0]); an absolute
          path is therefore used unchanged.
      output_filename: name and path of the converted file, default is None,
          in which case no file is written.
          NOTE(review): unlike input_filename this path is used exactly as
          given and is not resolved against the script directory - confirm
          that this is intended.

    Raises:
      IOError: if the resolved input file does not exist.
    """
    self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                        input_filename)
    if not os.path.isfile(self._input_filename):
      raise IOError('File "%s" does not exist' % self._input_filename)
    self._output_filename = output_filename

  def _CreateDictionaryFromRecord(self, line):
    """Constructs and returns a dictionary from a record in the dataset file.

    Single quotes are backslash-escaped first, then the line is separated
    into values with split('|'). The escaping is kept in the values of the
    returned dictionary.

    Example:
      A line with thirteen values such as
        u'John|Q|Doe|jd@example.com|...|555-1234|555-5678'
      yields
        {
          u'NAME_FIRST': u'John',
          u'NAME_MIDDLE': u'Q',
          u'NAME_LAST': u'Doe',
          ...
          u'PHONE_FAX_WHOLE_NUMBER': u'555-5678',
        }

    Args:
      line: row of record from the dataset file, as a unicode string.

    Returns:
      A dictionary mapping every name in _fields to its value on the line,
      or None when the line contains no '|' or does not hold exactly one
      value per field (the latter case is logged as a warning).
    """
    # Ignore irrelevant record lines that do not contain '|'.
    if '|' not in line:
      return None
    # Escaping single quote: "'" -> "\'"
    line = self._re_single_quote.sub(r"\'", line)
    values = line.split('|')
    # Reject lines that have more or fewer fields than expected.
    if len(values) != self._record_length:
      self._logger.warning(
          'A "|" separated line has %d fields instead of %d: %s',
          len(values), self._record_length, line)
      return None
    return dict(zip(self._fields, values))

  def _Convert(self, input_file, output_file):
    """Converts every usable line of input_file into a dictionary.

    Blank lines and lines rejected by _CreateDictionaryFromRecord are
    skipped. When output_file is given, the result is also written to it:
    a line holding '[', one line per converted record formatted with
    _output_pattern, and a line holding ']'.

    Args:
      input_file: iterable of dataset lines (an open file, for example).
          Byte strings are decoded as UTF-8; text lines are used as they are.
      output_file: writable text stream for the converted dictionary list,
          or None to skip writing.

    Returns:
      List of the dictionaries built from the converted lines, in input
      order.

    Raises:
      UnicodeDecodeError: if a byte-string line is not valid UTF-8.
    """
    converted = []
    if output_file is not None:
      output_file.write(u'[')
      output_file.write(os.linesep)
    for line in input_file:
      line = line.strip()
      if not line:
        continue
      # Decode once, right after reading; everything below works on text.
      if isinstance(line, bytes):
        line = line.decode('utf-8')
      output_record = self._CreateDictionaryFromRecord(line)
      if not output_record:
        continue
      converted.append(output_record)
      output_line = self._output_pattern % tuple(
          output_record[key] for key in self._fields)
      if output_file is not None:
        output_file.write(output_line)
        output_file.write(os.linesep)
      # Text is handed to the logger unencoded: encoding it here with
      # sys.stdout.encoding fails whenever that attribute is None.
      self._logger.info('%d: %s', len(converted), line)
      self._logger.info('\tconverted to: %s', output_line)
    if output_file is not None:
      output_file.write(u']')
      output_file.write(os.linesep)
    self._logger.info('%d lines converted SUCCESSFULLY!', len(converted))
    self._logger.info('--- FINISHED ---')
    return converted

  def Convert(self):
    """Converts the input dataset given to the constructor.

    Reads the input file and, when an output filename was supplied, writes
    the converted dictionary list to that file encoded as 'utf-8-sig'.

    Returns:
      List of dictionaries, one per converted line of the input file.
    """
    # Binary mode: _Convert performs the UTF-8 decoding itself.
    with open(self._input_filename, 'rb') as input_file:
      if not self._output_filename:
        return self._Convert(input_file, None)
      with codecs.open(self._output_filename, mode='wb',
                       encoding='utf-8-sig') as output_file:
        return self._Convert(input_file, output_file)
+ |
+ |
def main():
  """Converts the bundled Autofill dataset and writes the result to a file.

  The input path is resolved by DatasetConverter against the directory of
  the running script; the output path is passed through as given.
  """
  converter = DatasetConverter(
      r'../data/autofill/dataset.txt',
      r'../data/autofill/dataset_duplicate-profiles.txt')
  converter.Convert()
+ |
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
  main()