#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""
18 | |
__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand aliases used throughout the matcher tables in this module.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
32 | |
33 | |
class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript.

  Each value is a plain string naming one mode of the tokenizer.  The
  tokenizer keeps a separate, ordered list of matchers per mode.
  """
  # Ordinary code outside of any string or comment.
  TEXT_MODE = 'text'

  # Inside string literals, one mode per quoting style.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  TEMPLATE_STRING_MODE = 'template_string'

  # Inside comments.
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'

  # After the 'function' keyword and inside its parameter list.
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'
46 | |
47 | |
class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.

  The class attributes below are the regular expressions (and the lists they
  are assembled from) used by BuildMatchers() to build the per-mode matcher
  table that is handed to tokenizer.Tokenizer.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA is only ever used inside a re.VERBOSE pattern, so the embedded
  # whitespace and comments are ignored by the regex engine.
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
  # Template strings are different from normal strings in that they do not
  # require escaping of end of lines in order to be multi-line.
  TEMPLATE_QUOTE = re.compile(r'`')
  TEMPLATE_QUOTE_TEXT = re.compile(r'([^`]|$)+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  # A '//' that is the last thing on the line, i.e. an empty line comment.
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  # Match anything that is allowed in a type definition, except for tokens
  # needed to parse it (and the lookahead assertion for "*/").
  DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = [
      'break',
      'case',
      'catch',
      'continue',
      'default',
      'do',
      'else',
      'finally',
      'for',
      'if',
      'return',
      'switch',
      'throw',
      'try',
      'var',
      'while',
      'with',
  ]

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [
      ',',
      r'\+\+',
      '===',
      '!==',
      '>>>=',
      '>>>',
      '==',
      '>=',
      '<=',
      '!=',
      '<<=',
      '>>=',
      '<<',
      '>>',
      '=>',
      '>',
      '<',
      r'\+=',
      r'\+',
      '--',
      r'\^=',
      '-=',
      '-',
      '/=',
      '/',
      r'\*=',
      r'\*',
      '%=',
      '%',
      '&&',
      r'\|\|',
      '&=',
      '&',
      r'\|=',
      r'\|',
      '=',
      '!',
      ':',
      r'\?',
      r'\^',
      r'\bdelete\b',
      r'\bin\b',
      r'\binstanceof\b',
      r'\bnew\b',
      r'\btypeof\b',
      r'\bvoid\b',
      r'\.',
  ]
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier, except for trailing dots.
  NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not followed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning of
  # the line or after whitespace.  The look-behind check is necessary to not
  # match someone@google.com as a flag.  Flags directly after a '{' are matched
  # separately by DOC_INLINE_FLAG.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names and complex doctypes containing
  # whitespace, we need to tokenize whitespace into a token after certain
  # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
  # any whitespace in their types.
  DOC_FLAG_LEX_SPACES = re.compile(
      r'(^|(?<=\s))@(?P<name>%s)\b' %
      '|'.join([
          'const',
          'enum',
          'export',
          'extends',
          'final',
          'implements',
          'package',
          'param',
          'private',
          'protected',
          'public',
          'return',
          'type',
          'typedef'
      ]))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
  DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
  DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # NOTE(review): nothing in this class references SAFE_STAR (the TYPE_GROUP
  # pattern it was written for is not defined here); it is kept in case
  # external code still uses it.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.  Order matters: the first
  # matcher that matches wins.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # And some more to parse types.
      Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
      Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),

      Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
      Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),

      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    A new dict is assembled on every call; only the Matcher objects in
    COMMON_DOC_MATCHERS are shared between calls.

    Returns:
      The completed token matcher group: a dict mapping each JavaScriptModes
      value to its ordered list of Matcher objects.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_START,
                    JavaScriptModes.TEMPLATE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for template strings.
        JavaScriptModes.TEMPLATE_STRING_MODE: [
            Matcher(cls.TEMPLATE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially. Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    # BuildMatchers() assembles a brand-new dict on every call, so the result
    # can be adjusted in place without copying it first.  A deep copy would
    # also have to copy the compiled patterns handed to the Matcher objects
    # (presumably stored on them), which copy.deepcopy refuses to do before
    # Python 3.7.
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Treat doc comments exactly like ordinary block comments.
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly constructed javascripttokens.JavaScriptToken.
    """
    # line_number is deliberately passed twice; the sixth positional argument
    # is presumably the token's original line number - confirm against the
    # JavaScriptToken constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)
OLD | NEW |