Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(704)

Side by Side Diff: third_party/closure_linter/closure_linter/javascripttokenizer.py

Issue 2592193002: Remove closure_linter from Chrome (Closed)
Patch Set: Created 3 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 #
3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS-IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 """Regular expression based JavaScript parsing classes."""
18
19 __author__ = ('robbyw@google.com (Robert Walker)',
20 'ajp@google.com (Andy Perelson)')
21
22 import copy
23 import re
24
25 from closure_linter import javascripttokens
26 from closure_linter.common import matcher
27 from closure_linter.common import tokenizer
28
# Shorthand aliases for the token-type enumeration and the matcher class, both
# of which are referenced many times in the matcher tables below.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
32
33
class JavaScriptModes(object):
  """Names of the matcher modes the JavaScript tokenizer switches between.

  Each value is a plain string used as a key into the mode -> matcher-list
  mapping, so the values themselves must stay unique.
  """

  # Ordinary source text.
  TEXT_MODE = 'text'

  # Inside a string literal.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  TEMPLATE_STRING_MODE = 'template_string'

  # Inside a comment.
  LINE_COMMENT_MODE = 'line_comment'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'

  # After the 'function' keyword and inside its parameter list.
  FUNCTION_MODE = 'function'
  PARAMETER_MODE = 'parameter'
46
47
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Converts JavaScript code into tokens. The class-level patterns below are
  combined into per-mode matcher lists by BuildMatchers(), and the base
  tokenizer.Tokenizer is initialised with those lists, starting in
  JavaScriptModes.TEXT_MODE.
  """

  # Useful patterns for JavaScript parsing. This is the inside of a character
  # class, not a complete pattern.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA is only usable inside a pattern compiled with re.VERBOSE, because
  # it carries layout whitespace and '#' comments.
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
  # Template strings are different from normal strings in that they do not
  # require escaping of end of lines in order to be multi-line.
  TEMPLATE_QUOTE = re.compile(r'`')
  TEMPLATE_QUOTE_TEXT = re.compile(r'([^`]|$)+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  # A '//' that is the last thing on the line, i.e. a comment with no text.
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  # Any run of characters that does not contain the closing '*/'.
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. The alternative [^*{}\s]@ must come first, or the other options
  # will match everything before @, and we won't match @'s that aren't part of
  # flags like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  # Same as DOC_COMMENT_TEXT, except that whitespace is not consumed.
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  # Match anything that is allowed in a type definition, except for tokens
  # needed to parse it (and the lookahead assertion for "*/").
  DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  # Only usable inside a pattern compiled with re.VERBOSE.
  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens listed here to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof, void - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = [
      'break',
      'case',
      'catch',
      'continue',
      'default',
      'do',
      'else',
      'finally',
      'for',
      'if',
      'return',
      'switch',
      'throw',
      'try',
      'var',
      'while',
      'with',
  ]

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [
      ',',
      r'\+\+',
      '===',
      '!==',
      '>>>=',
      '>>>',
      '==',
      '>=',
      '<=',
      '!=',
      '<<=',
      '>>=',
      '<<',
      '>>',
      '=>',
      '>',
      '<',
      r'\+=',
      r'\+',
      '--',
      r'\^=',
      '-=',
      '-',
      '/=',
      '/',
      r'\*=',
      r'\*',
      '%=',
      '%',
      '&&',
      r'\|\|',
      '&=',
      '&',
      r'\|=',
      r'\|',
      '=',
      '!',
      ':',
      r'\?',
      r'\^',
      r'\bdelete\b',
      r'\bin\b',
      r'\binstanceof\b',
      r'\bnew\b',
      r'\btypeof\b',
      r'\bvoid\b',
      r'\.',
  ]
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier, except for trailing dots.
  NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  # An identifier that is the target of a plain '=' assignment. The '=' itself
  # is only looked at, not consumed, and '==' is excluded.
  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning of
  # the line or after whitespace.  The look-behind check is necessary to not
  # match someone@google.com as a flag.  Flags that directly follow a '{' are
  # handled separately by DOC_INLINE_FLAG.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names and complex doctypes containing
  # whitespace, we need to tokenize whitespace into a token after certain
  # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
  # any whitespace in their types.
  DOC_FLAG_LEX_SPACES = re.compile(
      r'(^|(?<=\s))@(?P<name>%s)\b' %
      '|'.join([
          'const',
          'enum',
          'export',
          'extends',
          'final',
          'implements',
          'package',
          'param',
          'private',
          'protected',
          'public',
          'return',
          'type',
          'typedef'
      ]))

  # A flag that immediately follows an opening brace, e.g. the '@link' in
  # '{@link ...}'.
  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
  DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
  DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes. Order matters: the matchers are
  # tried in sequence (see BuildMatchers), so DOC_FLAG_LEX_SPACES must precede
  # the more general DOC_FLAG.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # And some more to parse types.
      Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
      Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),

      Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
      Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),

      # NOTE(review): the trailing (None, True) presumably means "no mode
      # change" and "only match at the start of a line" - confirm against
      # the Matcher constructor.
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    A new dict is built on every call; the Matcher objects taken from
    cls.COMMON_DOC_MATCHERS are shared between calls.

    Returns:
      The completed token matcher group: a dict mapping each JavaScriptModes
      value to the ordered list of Matcher objects used in that mode.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_START,
                    JavaScriptModes.TEMPLATE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for template strings.
        JavaScriptModes.TEMPLATE_STRING_MODE: [
            Matcher(cls.TEMPLATE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments in which whitespace is tokenized separately.
        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially. Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
          When false, the doc comment mode reuses the block comment matchers.
    """
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Only one top-level entry of the mode -> matcher-list mapping is rebound
      # here, so a shallow copy is enough to leave the dict returned by
      # BuildMatchers() untouched.  A deep copy would also try to copy the
      # compiled patterns given to the Matcher objects, and copy.deepcopy()
      # raises TypeError for compiled patterns on Python versions before 3.7.
      matchers = dict(matchers)
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly constructed javascripttokens.JavaScriptToken.
    """
    # line_number is deliberately passed twice (fourth and sixth positional
    # arguments).  NOTE(review): the sixth is presumably the token's original
    # line number - confirm against the JavaScriptToken constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698