| Index: chrome/tools/webforms_extractor.py
|
| diff --git a/chrome/tools/webforms_extractor.py b/chrome/tools/webforms_extractor.py
|
| old mode 100644
|
| new mode 100755
|
| index 71fed7c15bfe37d8efac4a22d5f641f6fb0fae6d..dc50ea92065eb1010f820a6d6c4b74dc75242c39
|
| --- a/chrome/tools/webforms_extractor.py
|
| +++ b/chrome/tools/webforms_extractor.py
|
| @@ -1,253 +1,253 @@
|
| -#!/usr/bin/python
|
| -# Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
| -# Use of this source code is governed by a BSD-style license that can be found
|
| -# in the LICENSE file.
|
| -
|
| -"""Extracts registration forms from the corresponding HTML files.
|
| -
|
| -Used for extracting forms within HTML files. This script is used in
|
| -conjunction with the webforms_aggregator.py script, which aggregates web pages
|
| -with fillable forms (i.e registration forms).
|
| -
|
| -The purpose of this script is to extract out all non-form elements that may be
|
| -causing parsing errors and timeout issues when running browser_tests.
|
| -
|
| -This script extracts all forms from a HTML file.
|
| -If there are multiple forms per downloaded site, multiple files are created
|
| -for each form.
|
| -
|
| -Used as a standalone script but assumes that it is run from the directory in
|
| -which it is checked into.
|
| -
|
| -Usage: forms_extractor.py [options]
|
| -
|
| -Options:
|
| - -l LOG_LEVEL, --log_level=LOG_LEVEL,
|
| - LOG_LEVEL: debug, info, warning or error [default: error]
|
| - -j, --js extracts javascript elements from web form.
|
| - -h, --help show this help message and exit
|
| -"""
|
| -
|
| -import glob
|
| -import logging
|
| -from optparse import OptionParser
|
| -import os
|
| -import re
|
| -import sys
|
| -
|
| -
|
| -class FormsExtractor(object):
|
| - """Extracts HTML files, leaving only registration forms from the HTML file."""
|
| - _HTML_FILES_PATTERN = r'*.html'
|
| - _HTML_FILE_PREFIX = r'grabber-'
|
| - _FORM_FILE_PREFIX = r'grabber-stripped-'
|
| -
|
| - _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
|
| - 'heuristics', 'input')
|
| - _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
|
| - 'heuristics', 'input')
|
| -
|
| - logger = logging.getLogger(__name__)
|
| - log_handlers = {'StreamHandler': None}
|
| -
|
| - # This pattern is used for retrieving the form location comment located at the
|
| - # top of each downloaded HTML file indicating where the form originated from.
|
| - _RE_FORM_LOCATION_PATTERN = re.compile(
|
| - ur"""
|
| - <!--Form\s{1}Location: # Starting of form location comment.
|
| - .*? # Any characters (non-greedy).
|
| - --> # Ending of the form comment.
|
| - """, re.U | re.S | re.I | re.X)
|
| -
|
| - # This pattern is used for removing all script code.
|
| - _RE_SCRIPT_PATTERN = re.compile(
|
| - ur"""
|
| - <script # A new opening '<script' tag.
|
| - \b # The end of the word 'script'.
|
| - .*? # Any characters (non-greedy).
|
| - > # Ending of the (opening) tag: '>'.
|
| - .*? # Any characters (non-greedy) between the tags.
|
| - </script\s*> # The '</script>' closing tag.
|
| - """, re.U | re.S | re.I | re.X)
|
| -
|
| - # This pattern is used for removing all href js code.
|
| - _RE_HREF_JS_PATTERN = re.compile(
|
| - ur"""
|
| - \bhref # The word href and its beginning.
|
| - \s*=\s* # The '=' with all whitespace before and after it.
|
| - (?P<quote>[\'\"]) # A single or double quote which is captured.
|
| - \s*javascript\s*: # The word 'javascript:' with any whitespace possible.
|
| - .*? # Any characters (non-greedy) between the quotes.
|
| - \1 # The previously captured single or double quote.
|
| - """, re.U | re.S | re.I | re.X)
|
| -
|
| - _RE_EVENT_EXPR = (
|
| - ur"""
|
| - \b # The beginning of a new word.
|
| - on\w+? # All words starting with 'on' (non-greedy)
|
| - # example: |onmouseover|.
|
| - \s*=\s* # The '=' with all whitespace before and after it.
|
| - (?P<quote>[\'\"]) # A captured single or double quote.
|
| - .*? # Any characters (non-greedy) between the quotes.
|
| - \1 # The previously captured single or double quote.
|
| - """)
|
| -
|
| - # This pattern is used for removing code with js events, such as |onload|.
|
| - # By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
|
| - # pattern matches to strings such as '<tr class="nav"
|
| - # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
|
| - _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
|
| - ur"""
|
| - < # Matches character '<'.
|
| - [^<>]*? # Matches any characters except '<' and '>' (non-greedy).""" +
|
| - _RE_EVENT_EXPR +
|
| - ur"""
|
| - [^<>]*? # Matches any characters except '<' and '>' (non-greedy).
|
| - > # Matches character '>'.
|
| - """, re.U | re.S | re.I | re.X)
|
| -
|
| - # Adds whitespace chars at the end of the matched event. Also match trailing
|
| - # whitespaces for JS events. Do not match leading whitespace.
|
| - # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
|
| - # considered valid HTML.
|
| - _RE_EVENT_PATTERN = re.compile(
|
| - _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
|
| -
|
| - # This pattern is used for finding form elements.
|
| - _RE_FORM_PATTERN = re.compile(
|
| - ur"""
|
| - <form # A new opening '<form' tag.
|
| - \b # The end of the word 'form'.
|
| - .*? # Any characters (non-greedy).
|
| - > # Ending of the (opening) tag: '>'.
|
| - .*? # Any characters (non-greedy) between the tags.
|
| - </form\s*> # The '</form>' closing tag.
|
| - """, re.U | re.S | re.I | re.X)
|
| -
|
| - def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
|
| - output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
|
| - """Creates a FormsExtractor object.
|
| -
|
| - Args:
|
| - input_dir: the directory of HTML files.
|
| - output_dir: the directory where the registration form files will be
|
| - saved.
|
| - logging_level: verbosity level, default is None.
|
| -
|
| - Raises:
|
| - IOError exception if input directory doesn't exist.
|
| - """
|
| - if logging_level:
|
| - if not self.log_handlers['StreamHandler']:
|
| - console = logging.StreamHandler()
|
| - console.setLevel(logging.DEBUG)
|
| - self.log_handlers['StreamHandler'] = console
|
| - self.logger.addHandler(console)
|
| - self.logger.setLevel(logging_level)
|
| - else:
|
| - if self.log_handlers['StreamHandler']:
|
| - self.logger.removeHandler(self.log_handlers['StreamHandler'])
|
| - self.log_handlers['StreamHandler'] = None
|
| -
|
| - self._input_dir = input_dir
|
| - self._output_dir = output_dir
|
| - if not os.path.isdir(self._input_dir):
|
| - error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
|
| - self.logger.error('Error: %s', error_msg)
|
| - raise IOError(error_msg)
|
| - if not os.path.isdir(output_dir):
|
| - os.makedirs(output_dir)
|
| - self._form_location_comment = ''
|
| -
|
| - def _SubstituteAllEvents(self, matchobj):
|
| - """Remove all js events that are present as attributes within a tag.
|
| -
|
| - Args:
|
| - matchobj: A regexp |re.MatchObject| containing text that has at least one
|
| - event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
|
| - onmouseout="mOut1(this);">|.
|
| -
|
| - Returns:
|
| - The text containing the tag with all the attributes except for the tags
|
| - with events. Example: |<tr class="nav">|.
|
| - """
|
| - tag_with_all_attrs = matchobj.group(0)
|
| - return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
|
| -
|
| - def Extract(self, strip_js_only):
|
| - """Extracts and saves the extracted registration forms.
|
| -
|
| - Iterates through all the HTML files.
|
| -
|
| - Args:
|
| - strip_js_only: If True, only Javascript is stripped from the HTML content.
|
| - Otherwise, all non-form elements are stripped.
|
| - """
|
| - pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
|
| - html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
|
| - for filename in html_files:
|
| - self.logger.info('Stripping file "%s" ...', filename)
|
| - with open(filename, 'U') as f:
|
| - html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
|
| - self._SubstituteAllEvents,
|
| - self._RE_HREF_JS_PATTERN.sub(
|
| - '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
|
| -
|
| - form_filename = os.path.split(filename)[1] # Path dropped.
|
| - form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
|
| - (form_filename, extension) = os.path.splitext(form_filename)
|
| - form_filename = (self._FORM_FILE_PREFIX + form_filename +
|
| - '%s' + extension)
|
| - form_filename = os.path.join(self._output_dir, form_filename)
|
| - if strip_js_only:
|
| - form_filename = form_filename % ''
|
| - try:
|
| - with open(form_filename, 'w') as f:
|
| - f.write(html_content)
|
| - except IOError as e:
|
| - self.logger.error('Error: %s', e)
|
| - continue
|
| - else: # Remove all non form elements.
|
| - match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
|
| - if match:
|
| - form_location_comment = match.group() + os.linesep
|
| - else:
|
| - form_location_comment = ''
|
| - forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
|
| - for form_number, form_match in enumerate(forms_iterator, start=1):
|
| - form_content = form_match.group()
|
| - numbered_form_filename = form_filename % form_number
|
| - try:
|
| - with open(numbered_form_filename, 'w') as f:
|
| - f.write(form_location_comment)
|
| - f.write(form_content)
|
| - except IOError as e:
|
| - self.logger.error('Error: %s', e)
|
| - continue
|
| - self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
|
| -
|
| -
|
| -def main():
|
| - # Command line options.
|
| - parser = OptionParser()
|
| - parser.add_option(
|
| - '-l', '--log_level', metavar='LOG_LEVEL', default='error',
|
| - help='LOG_LEVEL: debug, info, warning or error [default: %default]')
|
| - parser.add_option(
|
| - '-j', '--js', dest='js', action='store_true', default=False,
|
| - help='Removes all javascript elements [default: %default]')
|
| -
|
| - (options, args) = parser.parse_args()
|
| - options.log_level = options.log_level.upper()
|
| - if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
|
| - print 'Wrong log_level argument.'
|
| - parser.print_help()
|
| - sys.exit(1)
|
| -
|
| - options.log_level = getattr(logging, options.log_level)
|
| - extractor = FormsExtractor(logging_level=options.log_level)
|
| - extractor.Extract(options.js)
|
| -
|
| -
|
| -if __name__ == '__main__':
|
| - main()
|
| +#!/usr/bin/env python
|
| +# Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be found
|
| +# in the LICENSE file.
|
| +
|
| +"""Extracts registration forms from the corresponding HTML files.
|
| +
|
| +Used for extracting forms within HTML files. This script is used in
|
| +conjunction with the webforms_aggregator.py script, which aggregates web pages
|
| +with fillable forms (i.e., registration forms).
|
| +
|
| +The purpose of this script is to strip out all non-form elements that may be
|
| +causing parsing errors and timeout issues when running browser_tests.
|
| +
|
| +This script extracts all forms from an HTML file.
|
| +If there are multiple forms per downloaded site, a separate file is created
|
| +for each form.
|
| +
|
| +Used as a standalone script but assumes that it is run from the directory in
|
| +which it is checked in.
|
| +
|
| +Usage: webforms_extractor.py [options]
|
| +
|
| +Options:
|
| + -l LOG_LEVEL, --log_level=LOG_LEVEL,
|
| + LOG_LEVEL: debug, info, warning or error [default: error]
|
| + -j, --js strips only javascript from each page; other HTML is kept.
|
| + -h, --help show this help message and exit
|
| +"""
|
| +
|
| +import glob
|
| +import logging
|
| +from optparse import OptionParser
|
| +import os
|
| +import re
|
| +import sys
|
| +
|
| +
|
| +class FormsExtractor(object):
|
| + """Extracts HTML files, leaving only registration forms from the HTML file."""
|
| + _HTML_FILES_PATTERN = r'*.html'
|
| + _HTML_FILE_PREFIX = r'grabber-'
|
| + _FORM_FILE_PREFIX = r'grabber-stripped-'
|
| +
|
| + _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
|
| + 'heuristics', 'input')
|
| + _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
|
| + 'heuristics', 'input')
|
| +
|
| + logger = logging.getLogger(__name__)
|
| + log_handlers = {'StreamHandler': None}
|
| +
|
| + # This pattern is used for retrieving the form location comment located at the
|
| + # top of each downloaded HTML file indicating where the form originated from.
|
| + _RE_FORM_LOCATION_PATTERN = re.compile(
|
| + ur"""
|
| + <!--Form\s{1}Location: # Starting of form location comment.
|
| + .*? # Any characters (non-greedy).
|
| + --> # Ending of the form comment.
|
| + """, re.U | re.S | re.I | re.X)
|
| +
|
| + # This pattern is used for removing all script code.
|
| + _RE_SCRIPT_PATTERN = re.compile(
|
| + ur"""
|
| + <script # A new opening '<script' tag.
|
| + \b # The end of the word 'script'.
|
| + .*? # Any characters (non-greedy).
|
| + > # Ending of the (opening) tag: '>'.
|
| + .*? # Any characters (non-greedy) between the tags.
|
| + </script\s*> # The '</script>' closing tag.
|
| + """, re.U | re.S | re.I | re.X)
|
| +
|
| + # This pattern is used for removing all href js code.
|
| + _RE_HREF_JS_PATTERN = re.compile(
|
| + ur"""
|
| + \bhref # The word href and its beginning.
|
| + \s*=\s* # The '=' with all whitespace before and after it.
|
| + (?P<quote>[\'\"]) # A single or double quote which is captured.
|
| + \s*javascript\s*: # The word 'javascript:' with any whitespace possible.
|
| + .*? # Any characters (non-greedy) between the quotes.
|
| + \1 # The previously captured single or double quote.
|
| + """, re.U | re.S | re.I | re.X)
|
| +
|
| + _RE_EVENT_EXPR = (
|
| + ur"""
|
| + \b # The beginning of a new word.
|
| + on\w+? # All words starting with 'on' (non-greedy)
|
| + # example: |onmouseover|.
|
| + \s*=\s* # The '=' with all whitespace before and after it.
|
| + (?P<quote>[\'\"]) # A captured single or double quote.
|
| + .*? # Any characters (non-greedy) between the quotes.
|
| + \1 # The previously captured single or double quote.
|
| + """)
|
| +
|
| + # This pattern is used for removing code with js events, such as |onload|.
|
| + # By adding the leading |ur'<[^<>]*?'| and the trailing |ur'[^<>]*?>'| the
|
| + # pattern matches strings such as '<tr class="nav"
|
| + # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
|
| + _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
|
| + ur"""
|
| + < # Matches character '<'.
|
| + [^<>]*? # Matches any characters except '<' and '>' (non-greedy).""" +
|
| + _RE_EVENT_EXPR +
|
| + ur"""
|
| + [^<>]*? # Matches any characters except '<' and '>' (non-greedy).
|
| + > # Matches character '>'.
|
| + """, re.U | re.S | re.I | re.X)
|
| +
|
| + # Adds whitespace chars at the end of the matched event. Also match trailing
|
| + # whitespaces for JS events. Do not match leading whitespace.
|
| + # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
|
| + # considered valid HTML.
|
| + _RE_EVENT_PATTERN = re.compile(
|
| + _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
|
| +
|
| + # This pattern is used for finding form elements.
|
| + _RE_FORM_PATTERN = re.compile(
|
| + ur"""
|
| + <form # A new opening '<form' tag.
|
| + \b # The end of the word 'form'.
|
| + .*? # Any characters (non-greedy).
|
| + > # Ending of the (opening) tag: '>'.
|
| + .*? # Any characters (non-greedy) between the tags.
|
| + </form\s*> # The '</form>' closing tag.
|
| + """, re.U | re.S | re.I | re.X)
|
| +
|
| + def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
|
| + output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
|
| + """Creates a FormsExtractor object.
|
| +
|
| + Args:
|
| + input_dir: the directory of HTML files.
|
| + output_dir: the directory where the registration form files will be
|
| + saved.
|
| + logging_level: verbosity level, default is None.
|
| +
|
| + Raises:
|
| + IOError exception if input directory doesn't exist.
|
| + """
|
| + if logging_level:
|
| + if not self.log_handlers['StreamHandler']:
|
| + console = logging.StreamHandler()
|
| + console.setLevel(logging.DEBUG)
|
| + self.log_handlers['StreamHandler'] = console
|
| + self.logger.addHandler(console)
|
| + self.logger.setLevel(logging_level)
|
| + else:
|
| + if self.log_handlers['StreamHandler']:
|
| + self.logger.removeHandler(self.log_handlers['StreamHandler'])
|
| + self.log_handlers['StreamHandler'] = None
|
| +
|
| + self._input_dir = input_dir
|
| + self._output_dir = output_dir
|
| + if not os.path.isdir(self._input_dir):
|
| + error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
|
| + self.logger.error('Error: %s', error_msg)
|
| + raise IOError(error_msg)
|
| + if not os.path.isdir(output_dir):
|
| + os.makedirs(output_dir)
|
| + self._form_location_comment = ''
|
| +
|
| + def _SubstituteAllEvents(self, matchobj):
|
| + """Remove all js events that are present as attributes within a tag.
|
| +
|
| + Args:
|
| + matchobj: A regexp |re.MatchObject| containing text that has at least one
|
| + event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
|
| + onmouseout="mOut1(this);">|.
|
| +
|
| + Returns:
|
| + The text containing the tag with all the attributes except for those
|
| + with events. Example: |<tr class="nav">|.
|
| + """
|
| + tag_with_all_attrs = matchobj.group(0)
|
| + return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
|
| +
|
| + def Extract(self, strip_js_only):
|
| + """Extracts and saves the extracted registration forms.
|
| +
|
| + Iterates through all the HTML files.
|
| +
|
| + Args:
|
| + strip_js_only: If True, only Javascript is stripped from the HTML content.
|
| + Otherwise, all non-form elements are stripped.
|
| + """
|
| + pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
|
| + html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
|
| + for filename in html_files:
|
| + self.logger.info('Stripping file "%s" ...', filename)
|
| + with open(filename, 'U') as f:
|
| + html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
|
| + self._SubstituteAllEvents,
|
| + self._RE_HREF_JS_PATTERN.sub(
|
| + '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
|
| +
|
| + form_filename = os.path.split(filename)[1] # Path dropped.
|
| + form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
|
| + (form_filename, extension) = os.path.splitext(form_filename)
|
| + form_filename = (self._FORM_FILE_PREFIX + form_filename +
|
| + '%s' + extension)
|
| + form_filename = os.path.join(self._output_dir, form_filename)
|
| + if strip_js_only:
|
| + form_filename = form_filename % ''
|
| + try:
|
| + with open(form_filename, 'w') as f:
|
| + f.write(html_content)
|
| + except IOError as e:
|
| + self.logger.error('Error: %s', e)
|
| + continue
|
| + else: # Remove all non form elements.
|
| + match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
|
| + if match:
|
| + form_location_comment = match.group() + os.linesep
|
| + else:
|
| + form_location_comment = ''
|
| + forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
|
| + for form_number, form_match in enumerate(forms_iterator, start=1):
|
| + form_content = form_match.group()
|
| + numbered_form_filename = form_filename % form_number
|
| + try:
|
| + with open(numbered_form_filename, 'w') as f:
|
| + f.write(form_location_comment)
|
| + f.write(form_content)
|
| + except IOError as e:
|
| + self.logger.error('Error: %s', e)
|
| + continue
|
| + self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
|
| +
|
| +
|
| +def main():
|
| + parser = OptionParser()
|
| + parser.add_option(
|
| + '-l', '--log_level', metavar='LOG_LEVEL', default='error',
|
| + help='LOG_LEVEL: debug, info, warning or error [default: %default]')
|
| + parser.add_option(
|
| + '-j', '--js', dest='js', action='store_true', default=False,
|
| + help='Removes all javascript elements [default: %default]')
|
| +
|
| + (options, args) = parser.parse_args()
|
| + options.log_level = options.log_level.upper()
|
| + if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
|
| + print 'Wrong log_level argument.'
|
| + parser.print_help()
|
| + return 1
|
| +
|
| + options.log_level = getattr(logging, options.log_level)
|
| + extractor = FormsExtractor(logging_level=options.log_level)
|
| + extractor.Extract(options.js)
|
| + return 0
|
| +
|
| +
|
| +if __name__ == '__main__':
|
| + sys.exit(main())
|
|
|