Index: third_party/closure_linter/closure_linter/javascripttokenizer.py |
diff --git a/third_party/closure_linter/closure_linter/javascripttokenizer.py b/third_party/closure_linter/closure_linter/javascripttokenizer.py |
deleted file mode 100755 |
index 964db7ce2fdcb4b17af1a8c565c1a08f7fb94fa6..0000000000000000000000000000000000000000 |
--- a/third_party/closure_linter/closure_linter/javascripttokenizer.py |
+++ /dev/null |
@@ -1,478 +0,0 @@ |
-#!/usr/bin/env python |
-# |
-# Copyright 2007 The Closure Linter Authors. All Rights Reserved. |
-# |
-# Licensed under the Apache License, Version 2.0 (the "License"); |
-# you may not use this file except in compliance with the License. |
-# You may obtain a copy of the License at |
-# |
-# http://www.apache.org/licenses/LICENSE-2.0 |
-# |
-# Unless required by applicable law or agreed to in writing, software |
-# distributed under the License is distributed on an "AS-IS" BASIS, |
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
-# See the License for the specific language governing permissions and |
-# limitations under the License. |
- |
-"""Regular expression based JavaScript parsing classes.""" |
- |
-__author__ = ('robbyw@google.com (Robert Walker)', |
- 'ajp@google.com (Andy Perelson)') |
- |
-import copy |
-import re |
- |
-from closure_linter import javascripttokens |
-from closure_linter.common import matcher |
-from closure_linter.common import tokenizer |
- |
-# Shorthand |
-Type = javascripttokens.JavaScriptTokenType |
-Matcher = matcher.Matcher |
- |
- |
-class JavaScriptModes(object): |
- """Enumeration of the different matcher modes used for JavaScript.""" |
- TEXT_MODE = 'text' |
- SINGLE_QUOTE_STRING_MODE = 'single_quote_string' |
- DOUBLE_QUOTE_STRING_MODE = 'double_quote_string' |
- TEMPLATE_STRING_MODE = 'template_string' |
- BLOCK_COMMENT_MODE = 'block_comment' |
- DOC_COMMENT_MODE = 'doc_comment' |
- DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces' |
- LINE_COMMENT_MODE = 'line_comment' |
- PARAMETER_MODE = 'parameter' |
- FUNCTION_MODE = 'function' |
- |
- |
-class JavaScriptTokenizer(tokenizer.Tokenizer): |
- """JavaScript tokenizer. |
- |
- Convert JavaScript code in to an array of tokens. |
- """ |
- |
- # Useful patterns for JavaScript parsing. |
- IDENTIFIER_CHAR = r'A-Za-z0-9_$' |
- |
- # Number patterns based on: |
- # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html |
- MANTISSA = r""" |
- (\d+(?!\.)) | # Matches '10' |
- (\d+\.(?!\d)) | # Matches '10.' |
- (\d*\.\d+) # Matches '.5' or '10.5' |
- """ |
- DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA |
- HEX_LITERAL = r'0[xX][0-9a-fA-F]+' |
- NUMBER = re.compile(r""" |
- ((%s)|(%s)) |
- """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE) |
- |
- # Strings come in three parts - first we match the start of the string, then |
- # the contents, then the end. The contents consist of any character except a |
- # backslash or end of string, or a backslash followed by any character, or a |
- # backslash followed by end of line to support correct parsing of multi-line |
- # strings. |
- SINGLE_QUOTE = re.compile(r"'") |
- SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+") |
- DOUBLE_QUOTE = re.compile(r'"') |
- DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+') |
- # Template strings are different from normal strings in that they do not |
- # require escaping of end of lines in order to be multi-line. |
- TEMPLATE_QUOTE = re.compile(r'`') |
- TEMPLATE_QUOTE_TEXT = re.compile(r'([^`]|$)+') |
- |
- START_SINGLE_LINE_COMMENT = re.compile(r'//') |
- END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$') |
- |
- START_DOC_COMMENT = re.compile(r'/\*\*') |
- START_BLOCK_COMMENT = re.compile(r'/\*') |
- END_BLOCK_COMMENT = re.compile(r'\*/') |
- BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+') |
- |
- # Comment text is anything that we are not going to parse into another special |
- # token like (inline) flags or end comments. Complicated regex to match |
- # most normal characters, and '*', '{', '}', and '@' when we are sure that |
- # it is safe. Expression [^*{\s]@ must come first, or the other options will |
- # match everything before @, and we won't match @'s that aren't part of flags |
- # like in email addresses in the @author tag. |
- DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+') |
- DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+') |
- # Match anything that is allowed in a type definition, except for tokens |
- # needed to parse it (and the lookahead assertion for "*/"). |
- DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+') |
- |
- # Match the prefix ' * ' that starts every line of jsdoc. Want to include |
- # spaces after the '*', but nothing else that occurs after a '*', and don't |
- # want to match the '*' in '*/'. |
- DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))') |
- |
- START_BLOCK = re.compile('{') |
- END_BLOCK = re.compile('}') |
- |
- REGEX_CHARACTER_CLASS = r""" |
- \[ # Opening bracket |
- ([^\]\\]|\\.)* # Anything but a ] or \, |
- # or a backslash followed by anything |
- \] # Closing bracket |
- """ |
- # We ensure the regex is followed by one of the above tokens to avoid |
- # incorrectly parsing something like x / y / z as x REGEX(/ y /) z |
- POST_REGEX_LIST = [ |
- ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}'] |
- |
- REGEX = re.compile(r""" |
- / # opening slash |
- (?!\*) # not the start of a comment |
- (\\.|[^\[\/\\]|(%s))* # a backslash followed by anything, |
- # or anything but a / or [ or \, |
- # or a character class |
- / # closing slash |
- [gimsx]* # optional modifiers |
- (?=\s*(%s)) |
- """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)), |
- re.VERBOSE) |
- |
- ANYTHING = re.compile(r'.*') |
- PARAMETERS = re.compile(r'[^\)]+') |
- CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*') |
- |
- FUNCTION_DECLARATION = re.compile(r'\bfunction\b') |
- |
- OPENING_PAREN = re.compile(r'\(') |
- CLOSING_PAREN = re.compile(r'\)') |
- |
- OPENING_BRACKET = re.compile(r'\[') |
- CLOSING_BRACKET = re.compile(r'\]') |
- |
- # We omit these JS keywords from the list: |
- # function - covered by FUNCTION_DECLARATION. |
- # delete, in, instanceof, new, typeof - included as operators. |
- # this - included in identifiers. |
- # null, undefined - not included, should go in some "special constant" list. |
- KEYWORD_LIST = [ |
- 'break', |
- 'case', |
- 'catch', |
- 'continue', |
- 'default', |
- 'do', |
- 'else', |
- 'finally', |
- 'for', |
- 'if', |
- 'return', |
- 'switch', |
- 'throw', |
- 'try', |
- 'var', |
- 'while', |
- 'with', |
- ] |
- |
- # List of regular expressions to match as operators. Some notes: for our |
- # purposes, the comma behaves similarly enough to a normal operator that we |
- # include it here. r'\bin\b' actually matches 'in' surrounded by boundary |
- # characters - this may not match some very esoteric uses of the in operator. |
- # Operators that are subsets of larger operators must come later in this list |
- # for proper matching, e.g., '>>' must come AFTER '>>>'. |
- OPERATOR_LIST = [ |
- ',', |
- r'\+\+', |
- '===', |
- '!==', |
- '>>>=', |
- '>>>', |
- '==', |
- '>=', |
- '<=', |
- '!=', |
- '<<=', |
- '>>=', |
- '<<', |
- '>>', |
- '=>', |
- '>', |
- '<', |
- r'\+=', |
- r'\+', |
- '--', |
- r'\^=', |
- '-=', |
- '-', |
- '/=', |
- '/', |
- r'\*=', |
- r'\*', |
- '%=', |
- '%', |
- '&&', |
- r'\|\|', |
- '&=', |
- '&', |
- r'\|=', |
- r'\|', |
- '=', |
- '!', |
- ':', |
- r'\?', |
- r'\^', |
- r'\bdelete\b', |
- r'\bin\b', |
- r'\binstanceof\b', |
- r'\bnew\b', |
- r'\btypeof\b', |
- r'\bvoid\b', |
- r'\.', |
- ] |
- OPERATOR = re.compile('|'.join(OPERATOR_LIST)) |
- |
- WHITESPACE = re.compile(r'\s+') |
- SEMICOLON = re.compile(r';') |
- # Technically JavaScript identifiers can't contain '.', but we treat a set of |
- # nested identifiers as a single identifier, except for trailing dots. |
- NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR |
- IDENTIFIER = re.compile(NESTED_IDENTIFIER) |
- |
- SIMPLE_LVALUE = re.compile(r""" |
- (?P<identifier>%s) # a valid identifier |
- (?=\s* # optional whitespace |
- \= # look ahead to equal sign |
- (?!=)) # not follwed by equal |
- """ % NESTED_IDENTIFIER, re.VERBOSE) |
- |
- # A doc flag is a @ sign followed by non-space characters that appears at the |
- # beginning of the line, after whitespace, or after a '{'. The look-behind |
- # check is necessary to not match someone@google.com as a flag. |
- DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)') |
- # To properly parse parameter names and complex doctypes containing |
- # whitespace, we need to tokenize whitespace into a token after certain |
- # doctags. All statetracker.HAS_TYPE that are not listed here must not contain |
- # any whitespace in their types. |
- DOC_FLAG_LEX_SPACES = re.compile( |
- r'(^|(?<=\s))@(?P<name>%s)\b' % |
- '|'.join([ |
- 'const', |
- 'enum', |
- 'export', |
- 'extends', |
- 'final', |
- 'implements', |
- 'package', |
- 'param', |
- 'private', |
- 'protected', |
- 'public', |
- 'return', |
- 'type', |
- 'typedef' |
- ])) |
- |
- DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)') |
- |
- DOC_TYPE_BLOCK_START = re.compile(r'[<(]') |
- DOC_TYPE_BLOCK_END = re.compile(r'[>)]') |
- DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]') |
- |
- # Star followed by non-slash, i.e a star that does not end a comment. |
- # This is used for TYPE_GROUP below. |
- SAFE_STAR = r'(\*(?!/))' |
- |
- COMMON_DOC_MATCHERS = [ |
- # Find the end of the comment. |
- Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT, |
- JavaScriptModes.TEXT_MODE), |
- |
- # Tokenize documented flags like @private. |
- Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG), |
- Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG, |
- JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE), |
- |
- # Encountering a doc flag should leave lex spaces mode. |
- Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE), |
- |
- # Tokenize braces so we can find types. |
- Matcher(START_BLOCK, Type.DOC_START_BRACE), |
- Matcher(END_BLOCK, Type.DOC_END_BRACE), |
- |
- # And some more to parse types. |
- Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK), |
- Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK), |
- |
- Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER), |
- Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT), |
- |
- Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)] |
- |
- # When text is not matched, it is given this default type based on mode. |
- # If unspecified in this map, the default default is Type.NORMAL. |
- JAVASCRIPT_DEFAULT_TYPES = { |
- JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT, |
- JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT |
- } |
- |
- @classmethod |
- def BuildMatchers(cls): |
- """Builds the token matcher group. |
- |
- The token matcher groups work as follows: it is a list of Matcher objects. |
- The matchers will be tried in this order, and the first to match will be |
- returned. Hence the order is important because the matchers that come first |
- overrule the matchers that come later. |
- |
- Returns: |
- The completed token matcher group. |
- """ |
- # Match a keyword string followed by a non-identifier character in order to |
- # not match something like doSomething as do + Something. |
- keyword = re.compile('(%s)((?=[^%s])|$)' % ( |
- '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR)) |
- return { |
- |
- # Matchers for basic text mode. |
- JavaScriptModes.TEXT_MODE: [ |
- # Check a big group - strings, starting comments, and regexes - all |
- # of which could be intertwined. 'string with /regex/', |
- # /regex with 'string'/, /* comment with /regex/ and string */ (and |
- # so on) |
- Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT, |
- JavaScriptModes.DOC_COMMENT_MODE), |
- Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT, |
- JavaScriptModes.BLOCK_COMMENT_MODE), |
- Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT, |
- Type.START_SINGLE_LINE_COMMENT), |
- Matcher(cls.START_SINGLE_LINE_COMMENT, |
- Type.START_SINGLE_LINE_COMMENT, |
- JavaScriptModes.LINE_COMMENT_MODE), |
- Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START, |
- JavaScriptModes.SINGLE_QUOTE_STRING_MODE), |
- Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START, |
- JavaScriptModes.DOUBLE_QUOTE_STRING_MODE), |
- Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_START, |
- JavaScriptModes.TEMPLATE_STRING_MODE), |
- Matcher(cls.REGEX, Type.REGEX), |
- |
- # Next we check for start blocks appearing outside any of the items |
- # above. |
- Matcher(cls.START_BLOCK, Type.START_BLOCK), |
- Matcher(cls.END_BLOCK, Type.END_BLOCK), |
- |
- # Then we search for function declarations. |
- Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION, |
- JavaScriptModes.FUNCTION_MODE), |
- |
- # Next, we convert non-function related parens to tokens. |
- Matcher(cls.OPENING_PAREN, Type.START_PAREN), |
- Matcher(cls.CLOSING_PAREN, Type.END_PAREN), |
- |
- # Next, we convert brackets to tokens. |
- Matcher(cls.OPENING_BRACKET, Type.START_BRACKET), |
- Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET), |
- |
- # Find numbers. This has to happen before operators because |
- # scientific notation numbers can have + and - in them. |
- Matcher(cls.NUMBER, Type.NUMBER), |
- |
- # Find operators and simple assignments |
- Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE), |
- Matcher(cls.OPERATOR, Type.OPERATOR), |
- |
- # Find key words and whitespace. |
- Matcher(keyword, Type.KEYWORD), |
- Matcher(cls.WHITESPACE, Type.WHITESPACE), |
- |
- # Find identifiers. |
- Matcher(cls.IDENTIFIER, Type.IDENTIFIER), |
- |
- # Finally, we convert semicolons to tokens. |
- Matcher(cls.SEMICOLON, Type.SEMICOLON)], |
- |
- # Matchers for single quote strings. |
- JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [ |
- Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT), |
- Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END, |
- JavaScriptModes.TEXT_MODE)], |
- |
- # Matchers for double quote strings. |
- JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [ |
- Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT), |
- Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END, |
- JavaScriptModes.TEXT_MODE)], |
- |
- # Matchers for template strings. |
- JavaScriptModes.TEMPLATE_STRING_MODE: [ |
- Matcher(cls.TEMPLATE_QUOTE_TEXT, Type.STRING_TEXT), |
- Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_END, |
- JavaScriptModes.TEXT_MODE)], |
- |
- # Matchers for block comments. |
- JavaScriptModes.BLOCK_COMMENT_MODE: [ |
- # First we check for exiting a block comment. |
- Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT, |
- JavaScriptModes.TEXT_MODE), |
- |
- # Match non-comment-ending text.. |
- Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)], |
- |
- # Matchers for doc comments. |
- JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [ |
- Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)], |
- |
- JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [ |
- Matcher(cls.WHITESPACE, Type.COMMENT), |
- Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)], |
- |
- # Matchers for single line comments. |
- JavaScriptModes.LINE_COMMENT_MODE: [ |
- # We greedy match until the end of the line in line comment mode. |
- Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)], |
- |
- # Matchers for code after the function keyword. |
- JavaScriptModes.FUNCTION_MODE: [ |
- # Must match open paren before anything else and move into parameter |
- # mode, otherwise everything inside the parameter list is parsed |
- # incorrectly. |
- Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS, |
- JavaScriptModes.PARAMETER_MODE), |
- Matcher(cls.WHITESPACE, Type.WHITESPACE), |
- Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)], |
- |
- # Matchers for function parameters |
- JavaScriptModes.PARAMETER_MODE: [ |
- # When in function parameter mode, a closing paren is treated |
- # specially. Everything else is treated as lines of parameters. |
- Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS, |
- JavaScriptModes.TEXT_MODE), |
- Matcher(cls.PARAMETERS, Type.PARAMETERS, |
- JavaScriptModes.PARAMETER_MODE)]} |
- |
- def __init__(self, parse_js_doc=True): |
- """Create a tokenizer object. |
- |
- Args: |
- parse_js_doc: Whether to do detailed parsing of javascript doc comments, |
- or simply treat them as normal comments. Defaults to parsing JsDoc. |
- """ |
- matchers = self.BuildMatchers() |
- if not parse_js_doc: |
- # Make a copy so the original doesn't get modified. |
- matchers = copy.deepcopy(matchers) |
- matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[ |
- JavaScriptModes.BLOCK_COMMENT_MODE] |
- |
- tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers, |
- self.JAVASCRIPT_DEFAULT_TYPES) |
- |
- def _CreateToken(self, string, token_type, line, line_number, values=None): |
- """Creates a new JavaScriptToken object. |
- |
- Args: |
- string: The string of input the token contains. |
- token_type: The type of token. |
- line: The text of the line this token is in. |
- line_number: The line number of the token. |
- values: A dict of named values within the token. For instance, a |
- function declaration may have a value called 'name' which captures the |
- name of the function. |
- """ |
- return javascripttokens.JavaScriptToken(string, token_type, line, |
- line_number, values, line_number) |