third_party/closure_linter/closure_linter/javascripttokenizer.py - Issue 2592193002: Remove closure_linter from Chrome

Side by Side Diff: third_party/closure_linter/closure_linter/javascripttokenizer.py

Issue 2592193002: Remove closure_linter from Chrome (Closed)

Patch Set: Created 3 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/closure_linter/closure_linter/javascriptstatetracker_test.py ('k') | third_party/closure_linter/closure_linter/javascripttokens.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 #!/usr/bin/env python

2 #

3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.

4 #

5 # Licensed under the Apache License, Version 2.0 (the "License");

6 # you may not use this file except in compliance with the License.

7 # You may obtain a copy of the License at

8 #

9 # http://www.apache.org/licenses/LICENSE-2.0

10 #

11 # Unless required by applicable law or agreed to in writing, software

12 # distributed under the License is distributed on an "AS-IS" BASIS,

13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14 # See the License for the specific language governing permissions and

15 # limitations under the License.

16

17 """Regular expression based JavaScript parsing classes."""

18

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand aliases so the matcher tables can be written compactly.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher

32

33

class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript.

  Each attribute is a plain string naming one tokenizer mode.  The values are
  used as dictionary keys for the per-mode matcher lists and as the target
  mode of matchers that switch modes.
  """
  # Ordinary code, outside of any string or comment.
  TEXT_MODE = 'text'

  # Inside a string literal, one mode per delimiter.
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  TEMPLATE_STRING_MODE = 'template_string'

  # Inside a comment.
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'

  # After the function keyword and inside its parameter list.
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'

46

47

class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.

  The class attributes below are the regular-expression building blocks.
  BuildMatchers() arranges them into one ordered matcher list per
  JavaScriptModes value; __init__ hands that table to the base tokenizer.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
  # Template strings are different from normal strings in that they do not
  # require escaping of end of lines in order to be multi-line.
  TEMPLATE_QUOTE = re.compile(r'`')
  TEMPLATE_QUOTE_TEXT = re.compile(r'([^`]|$)+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  # '/**' opens a doc comment; it is tried before the plain '/*' opener in
  # BuildMatchers so that doc comments are not mistaken for block comments.
  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  # Any character that is not a star, or a star that does not close the
  # comment.
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
  # Match anything that is allowed in a type definition, except for tokens
  # needed to parse it (and the lookahead assertion for "*/").
  DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the above tokens to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = [
      'break',
      'case',
      'catch',
      'continue',
      'default',
      'do',
      'else',
      'finally',
      'for',
      'if',
      'return',
      'switch',
      'throw',
      'try',
      'var',
      'while',
      'with',
  ]

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  OPERATOR_LIST = [
      ',',
      r'\+\+',
      '===',
      '!==',
      '>>>=',
      '>>>',
      '==',
      '>=',
      '<=',
      '!=',
      '<<=',
      '>>=',
      '<<',
      '>>',
      '=>',
      '>',
      '<',
      r'\+=',
      r'\+',
      '--',
      r'\^=',
      '-=',
      '-',
      '/=',
      '/',
      r'\*=',
      r'\*',
      '%=',
      '%',
      '&&',
      r'\|\|',
      '&=',
      '&',
      r'\|=',
      r'\|',
      '=',
      '!',
      ':',
      r'\?',
      r'\^',
      r'\bdelete\b',
      r'\bin\b',
      r'\binstanceof\b',
      r'\bnew\b',
      r'\btypeof\b',
      r'\bvoid\b',
      r'\.',
  ]
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier, except for trailing dots.
  NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by non-space characters that appears at the
  # beginning of the line, after whitespace, or after a '{'.  The look-behind
  # check is necessary to not match someone@google.com as a flag.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names and complex doctypes containing
  # whitespace, we need to tokenize whitespace into a token after certain
  # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
  # any whitespace in their types.
  DOC_FLAG_LEX_SPACES = re.compile(
      r'(^|(?<=\s))@(?P<name>%s)\b' %
      '|'.join([
          'const',
          'enum',
          'export',
          'extends',
          'final',
          'implements',
          'package',
          'param',
          'private',
          'protected',
          'public',
          'return',
          'type',
          'typedef'
      ]))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
  DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
  DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  # Matchers shared by both doc comment modes.  BuildMatchers() appends the
  # mode-specific comment text matchers after these.
  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),

      # And some more to parse types.
      Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
      Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),

      Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
      Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),

      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  @classmethod
  def BuildMatchers(cls):
    """Builds the token matcher group.

    The token matcher groups work as follows: it is a list of Matcher objects.
    The matchers will be tried in this order, and the first to match will be
    returned.  Hence the order is important because the matchers that come first
    overrule the matchers that come later.

    Returns:
      The completed token matcher group: a dict that maps each JavaScriptModes
      value to its ordered list of Matcher objects.
    """
    # Match a keyword string followed by a non-identifier character in order to
    # not match something like doSomething as do + Something.
    keyword = re.compile('(%s)((?=[^%s])|$)' % (
        '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    return {

        # Matchers for basic text mode.
        JavaScriptModes.TEXT_MODE: [
            # Check a big group - strings, starting comments, and regexes - all
            # of which could be intertwined.  'string with /regex/',
            # /regex with 'string'/, /* comment with /regex/ and string */ (and
            # so on)
            Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
                    JavaScriptModes.DOC_COMMENT_MODE),
            Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                    JavaScriptModes.BLOCK_COMMENT_MODE),
            Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT),
            Matcher(cls.START_SINGLE_LINE_COMMENT,
                    Type.START_SINGLE_LINE_COMMENT,
                    JavaScriptModes.LINE_COMMENT_MODE),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                    JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                    JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_START,
                    JavaScriptModes.TEMPLATE_STRING_MODE),
            Matcher(cls.REGEX, Type.REGEX),

            # Next we check for start blocks appearing outside any of the items
            # above.
            Matcher(cls.START_BLOCK, Type.START_BLOCK),
            Matcher(cls.END_BLOCK, Type.END_BLOCK),

            # Then we search for function declarations.
            Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                    JavaScriptModes.FUNCTION_MODE),

            # Next, we convert non-function related parens to tokens.
            Matcher(cls.OPENING_PAREN, Type.START_PAREN),
            Matcher(cls.CLOSING_PAREN, Type.END_PAREN),

            # Next, we convert brackets to tokens.
            Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
            Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),

            # Find numbers.  This has to happen before operators because
            # scientific notation numbers can have + and - in them.
            Matcher(cls.NUMBER, Type.NUMBER),

            # Find operators and simple assignments
            Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
            Matcher(cls.OPERATOR, Type.OPERATOR),

            # Find key words and whitespace.
            Matcher(keyword, Type.KEYWORD),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),

            # Find identifiers.
            Matcher(cls.IDENTIFIER, Type.IDENTIFIER),

            # Finally, we convert semicolons to tokens.
            Matcher(cls.SEMICOLON, Type.SEMICOLON)],

        # Matchers for single quote strings.
        JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
            Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for double quote strings.
        JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
            Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for template strings.
        JavaScriptModes.TEMPLATE_STRING_MODE: [
            Matcher(cls.TEMPLATE_QUOTE_TEXT, Type.STRING_TEXT),
            Matcher(cls.TEMPLATE_QUOTE, Type.TEMPLATE_STRING_END,
                    JavaScriptModes.TEXT_MODE)],

        # Matchers for block comments.
        JavaScriptModes.BLOCK_COMMENT_MODE: [
            # First we check for exiting a block comment.
            Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                    JavaScriptModes.TEXT_MODE),

            # Match non-comment-ending text..
            Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],

        # Matchers for doc comments.
        JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],

        JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
            Matcher(cls.WHITESPACE, Type.COMMENT),
            Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

        # Matchers for single line comments.
        JavaScriptModes.LINE_COMMENT_MODE: [
            # We greedy match until the end of the line in line comment mode.
            Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

        # Matchers for code after the function keyword.
        JavaScriptModes.FUNCTION_MODE: [
            # Must match open paren before anything else and move into parameter
            # mode, otherwise everything inside the parameter list is parsed
            # incorrectly.
            Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE),
            Matcher(cls.WHITESPACE, Type.WHITESPACE),
            Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],

        # Matchers for function parameters
        JavaScriptModes.PARAMETER_MODE: [
            # When in function parameter mode, a closing paren is treated
            # specially. Everything else is treated as lines of parameters.
            Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                    JavaScriptModes.TEXT_MODE),
            Matcher(cls.PARAMETERS, Type.PARAMETERS,
                    JavaScriptModes.PARAMETER_MODE)]}

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.BuildMatchers()
    if not parse_js_doc:
      # Make a copy so the original doesn't get modified.
      matchers = copy.deepcopy(matchers)
      # Doc comments are then tokenized with the block comment matchers, so
      # none of the doc-specific tokens (flags, braces, types) are produced.
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
          function declaration may have a value called 'name' which captures the
          name of the function.

    Returns:
      A javascripttokens.JavaScriptToken built from the arguments.
    """
    # line_number is deliberately passed twice; the trailing positional
    # argument is presumably the token's original line number - confirm
    # against the JavaScriptToken constructor.
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values, line_number)

OLD	NEW