sdk/lib/html/scripts/pegparser.py - Issue 11691009: Moved most of html lib generating scripts into tools.

Side by Side Diff: sdk/lib/html/scripts/pegparser.py

Issue 11691009: Moved most of html lib generating scripts into tools. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 #!/usr/bin/python

2 # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

3 # for details. All rights reserved. Use of this source code is governed by a

4 # BSD-style license that can be found in the LICENSE file.

5

6 import logging

7 import re

8 import weakref

9

10 _logger = logging.getLogger('pegparser')

11

12 # functions can refer to each other, hence creating infinite loops. The

13 # following hashmap is used to memoize functions that were already compiled.

14 _compiled_functions_memory = weakref.WeakKeyDictionary()

15

16 _regex_type = type(re.compile(r''))

17 _list_type = type([])

18 _function_type = type(lambda func: 0)

19

20

21 class _PegParserState(object):

22 """Object for storing parsing state variables and options"""

23

24 def __init__(self, text, whitespace_rule, strings_are_tokens):

25 # Parsing state:

26 self.text = text

27 self.is_whitespace_mode = False

28

29 # Error message helpers:

30 self.max_pos = None

31 self.max_rule = None

32

33 # Parsing options:

34 self.whitespace_rule = whitespace_rule

35 self.strings_are_tokens = strings_are_tokens

36

37

38 class _PegParserRule(object):

39 """Base class for all rules"""

40

41 def __init__(self):

42 return

43

44 def __str__(self):

45 return self.__class__.__name__

46

47 def _match_impl(self, state, pos):

48 """Default implementation of the matching algorithm.

49 Should be overwritten by sub-classes.

50 """

51 raise RuntimeError('_match_impl not implemented')

52

53 def match(self, state, pos):

54 """Matches the rule against the text in the given position.

55

56 The actual rule evaluation is delegated to _match_impl,

57 while this function deals mostly with support tasks such as

58 skipping whitespace, debug information and data for exception.

59

60 Args:

61 state -- the current parsing state and options.

62 pos -- the current offset in the text.

63

64 Returns:

65 (next position, value) if the rule matches, or

66 (None, None) if it doesn't.

67 """

68 if not state.is_whitespace_mode:

69 # Skip whitespace

70 pos = _skip_whitespace(state, pos)

71

72 # Track position for possible error messaging

73 if pos > state.max_pos:

74 # Store position and the rule.

75 state.max_pos = pos

76 if isinstance(self, _StringRule):

77 state.max_rule = [self]

78 else:

79 state.max_rule = []

80 elif pos == state.max_pos:

81 if isinstance(self, _StringRule):

82 state.max_rule.append(self)

83

84 if _logger.isEnabledFor(logging.DEBUG):

85 # Used for debugging

86 _logger.debug('Try: pos=%s char=%s rule=%s' % \

87 (pos, state.text[pos:pos + 1], self))

88

89 # Delegate the matching logic to the the specialized function.

90 res = self._match_impl(state, pos)

91

92 if not state.is_whitespace_mode \

93 and _logger.isEnabledFor(logging.DEBUG):

94 # More debugging information

95 (nextPos, ast) = res

96 if nextPos is not None:

97 _logger.debug('Match! pos=%s char=%s rule=%s' % \

98 (pos, state.text[pos:pos + 1], self))

99 else:

100 _logger.debug('Fail. pos=%s char=%s rule=%s' % \

101 (pos, state.text[pos:pos + 1], self))

102

103 return res

104

105

106 def _compile(rule):

107 """Recursively compiles user-defined rules into parser rules.

108 Compilation is performed by converting strings, regular expressions, lists

109 and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule

110 (respectively). Memoization is used to avoid infinite recursion as rules

111 may refer to each other."""

112 if rule is None:

113 raise RuntimeError('None is not a valid rule')

114 elif isinstance(rule, str):

115 return _StringRule(rule)

116 elif isinstance(rule, _regex_type):

117 return _RegExpRule(rule)

118 elif isinstance(rule, _list_type):

119 return SEQUENCE(*rule)

120 elif isinstance(rule, _function_type):

121 # Memoize compiled functions to avoid infinite compliation loops.

122 if rule in _compiled_functions_memory:

123 return _compiled_functions_memory[rule]

124 else:

125 compiled_function = _FunctionRule(rule)

126 _compiled_functions_memory[rule] = compiled_function

127 compiled_function._sub_rule = _compile(rule())

128 return compiled_function

129 elif isinstance(rule, _PegParserRule):

130 return rule

131 else:

132 raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule))

133

134

135 def _skip_whitespace(state, pos):

136 """Returns the next non-whitespace position.

137 This is done by matching the optional whitespace_rule with the current

138 text."""

139 if not state.whitespace_rule:

140 return pos

141 state.is_whitespace_mode = True

142 nextPos = pos

143 while nextPos is not None:

144 pos = nextPos

145 (nextPos, ast) = state.whitespace_rule.match(state, pos)

146 state.is_whitespace_mode = False

147 return pos

148

149

150 class _StringRule(_PegParserRule):

151 """This rule tries to match a whole string."""

152

153 def __init__(self, string):

154 """Constructor.

155 Args:

156 string -- string to match.

157 """

158 _PegParserRule.__init__(self)

159 self._string = string

160

161 def __str__(self):

162 return '"%s"' % self._string

163

164 def _match_impl(self, state, pos):

165 """Tries to match the string at the current position"""

166 if state.text.startswith(self._string, pos):

167 nextPos = pos + len(self._string)

168 if state.strings_are_tokens:

169 return (nextPos, None)

170 else:

171 return (nextPos, self._string)

172 return (None, None)

173

174

175 class _RegExpRule(_PegParserRule):

176 """This rule tries to matches a regular expression."""

177

178 def __init__(self, reg_exp):

179 """Constructor.

180 Args:

181 reg_exp -- a regular expression used in matching.

182 """

183 _PegParserRule.__init__(self)

184 self.reg_exp = reg_exp

185

186 def __str__(self):

187 return 'regexp'

188

189 def _match_impl(self, state, pos):

190 """Tries to match the regular expression with current text"""

191 matchObj = self.reg_exp.match(state.text, pos)

192 if matchObj:

193 matchStr = matchObj.group()

194 return (pos + len(matchStr), matchStr)

195 return (None, None)

196

197

198 class _FunctionRule(_PegParserRule):

199 """Function rule wraps a rule defined via a Python function.

200

201 Defining rules via functions helps break the grammar into parts, labeling

202 the ast, and supporting recursive definitions in the grammar

203

204 Usage Example:

205 def Func(): return ['function', TOKEN('('), TOKEN(')')]

206 def Var(): return OR('x', 'y')

207 def Program(): return OR(Func, Var)

208

209 When matched with 'function()', will return the tuple:

210 ('Program', ('Func', 'function'))

211 When matched with 'x', will return the tuple:

212 ('Program', ('Var', 'x'))

213

214 Functions who's name begins with '_' will not be labelled. This is useful

215 for creating utility rules. Extending the example above:

216

217 def _Program(): return OR(Func, Var)

218

219 When matched with 'function()', will return the tuple:

220 ('Func', 'function')

221 """

222

223 def __init__(self, func):

224 """Constructor.

225 Args:

226 func -- the original function will be used for labeling output.

227 """

228 _PegParserRule.__init__(self)

229 self._func = func

230 # Sub-rule is compiled by _compile to avoid infinite recursion.

231 self._sub_rule = None

232

233 def __str__(self):

234 return self._func.__name__

235

236 def _match_impl(self, state, pos):

237 """Simply invokes the sub rule"""

238 (nextPos, ast) = self._sub_rule.match(state, pos)

239 if nextPos is not None:

240 if not self._func.__name__.startswith('_'):

241 ast = (self._func.__name__, ast)

242 return (nextPos, ast)

243 return (None, None)

244

245

246 class SEQUENCE(_PegParserRule):

247 """This rule expects all given rules to match in sequence.

248 Note that SEQUENCE is equivalent to a rule composed of a Python list of

249 rules.

250 Usage example: SEQUENCE('A', 'B', 'C')

251 or: ['A', 'B', 'C']

252 Will match 'ABC' but not 'A', 'B' or ''.

253 """

254 def __init__(self, *rules):

255 """Constructor.

256 Args:

257 rules -- one or more rules to match.

258 """

259 _PegParserRule.__init__(self)

260 self._sub_rules = []

261 for rule in rules:

262 self._sub_rules.append(_compile(rule))

263

264 def _match_impl(self, state, pos):

265 """Tries to match all the sub rules"""

266 sequence = []

267 for rule in self._sub_rules:

268 (nextPos, ast) = rule.match(state, pos)

269 if nextPos is not None:

270 if ast:

271 if isinstance(ast, _list_type):

272 sequence.extend(ast)

273 else:

274 sequence.append(ast)

275 pos = nextPos

276 else:

277 return (None, None)

278 return (pos, sequence)

279

280

281 class OR(_PegParserRule):

282 """This rule matches one and only one of multiple sub-rules.

283 Usage example: OR('A', 'B', 'C')

284 Will match 'A', 'B' or 'C'.

285 """

286 def __init__(self, *rules):

287 """Constructor.

288 Args:

289 rules -- rules to choose from.

290 """

291 _PegParserRule.__init__(self)

292 self._sub_rules = []

293 for rule in rules:

294 self._sub_rules.append(_compile(rule))

295

296 def _match_impl(self, state, pos):

297 """Tries to match at leat one of the sub rules"""

298 for rule in self._sub_rules:

299 (nextPos, ast) = rule.match(state, pos)

300 if nextPos is not None:

301 return (nextPos, ast)

302 return (None, None)

303

304

305 class MAYBE(_PegParserRule):

306 """Will try to match the given rule, tolerating absence.

307 Usage example: MAYBE('A')

308 Will match 'A' but also ''.

309 """

310 def __init__(self, rule):

311 """Constructor.

312 Args:

313 rule -- the rule that may be absent.

314 """

315 _PegParserRule.__init__(self)

316 self._sub_rule = _compile(rule)

317

318 def _match_impl(self, state, pos):

319 """Tries to match at leat one of the sub rules"""

320 (nextPos, ast) = self._sub_rule.match(state, pos)

321 if nextPos is not None:

322 return (nextPos, ast)

323 return (pos, None)

324

325

326 class MANY(_PegParserRule):

327 """Will try to match the given rule one or more times.

328 Usage example 1: MANY('A')

329 Will match 'A', 'AAAAA' but not ''.

330 Usage example 2: MANY('A', separator=',')

331 Will match 'A', 'A,A' but not 'AA'.

332 """

333

334 def __init__(self, rule, separator=None):

335 """Constructor.

336 Args:

337 rule -- the rule to match multiple times.

338 separator -- this optional rule is used to match separators.

339 """

340 _PegParserRule.__init__(self)

341 self._sub_rule = _compile(rule)

342 self._separator = _compile(separator) if separator else None

343

344 def _match_impl(self, state, pos):

345 res = []

346 count = 0

347 while True:

348 if count > 0 and self._separator:

349 (nextPos, ast) = self._separator.match(state, pos)

350 if nextPos is not None:

351 pos = nextPos

352 if ast:

353 res.append(ast)

354 else:

355 break

356 (nextPos, ast) = self._sub_rule.match(state, pos)

357 if nextPos is None:

358 break

359 count += 1

360 pos = nextPos

361 res.append(ast)

362 if count > 0:

363 return (pos, res)

364 return (None, None)

365

366

367 class TOKEN(_PegParserRule):

368 """The matched rule will not appear in the the output.

369 Usage example: ['A', TOKEN('.'), 'B']

370 When matching 'A.B', will return the sequence ['A', 'B'].

371 """

372

373 def __init__(self, rule):

374 """Constructor.

375 Args:

376 rule -- the rule to match.

377 """

378 _PegParserRule.__init__(self)

379 self._sub_rule = _compile(rule)

380

381 def _match_impl(self, state, pos):

382 (nextPos, ast) = self._sub_rule.match(state, pos)

383 if nextPos is not None:

384 return (nextPos, None)

385 return (None, None)

386

387

388 class LABEL(_PegParserRule):

389 """The matched rule will appear in the output with the given label.

390 Usage example: LABEL('number', re.compile(r'[0-9]+'))

391 When matched with '1234', will return ('number', '1234').

392

393 Keyword arguments:

394 label -- a string.

395 rule -- the rule to match.

396 """

397

398 def __init__(self, label, rule):

399 """Constructor.

400 Args:

401 rule -- the rule to match.

402 """

403 _PegParserRule.__init__(self)

404 self._label = label

405 self._sub_rule = _compile(rule)

406

407 def _match_impl(self, state, pos):

408 (nextPos, ast) = self._sub_rule.match(state, pos)

409 if nextPos is not None:

410 return (nextPos, (self._label, ast))

411 return (None, None)

412

413

414 class RAISE(_PegParserRule):

415 """Raises a SyntaxError with a user-provided message.

416 Usage example: ['A','B', RAISE('should have not gotten here')]

417 Will not match 'A' but will raise an exception for 'AB'.

418 This rule is useful mostly for debugging grammars.

419 """

420 def __init__(self, message):

421 """Constructor.

422 Args:

423 message -- the message for the raised exception.

424 """

425 _PegParserRule.__init__(self)

426 self._message = message

427

428 def _match_impl(self, state, pos):

429 raise RuntimeError(self._message)

430

431

432 class PegParser(object):

433 """PegParser class.

434 This generic parser can be configured with rules to parse a wide

435 range of inputs.

436 """

437

438 def __init__(self, root_rule, whitespace_rule=None,

439 strings_are_tokens=False):

440 """Initializes a PegParser with rules and parsing options.

441

442 Args:

443 root_rule -- the top level rule to start matching at. Rule can be

444 a regular expression, a string, or one of the special rules

445 such as SEQUENCE, MANY, OR, etc.

446 whitespace_rule -- used to identify and strip whitespace. Default

447 isNone, configuring the parser to not tolerate whitespace.

448 strings_are_tokens -- by default string rules are not treated as

449 tokens. In many programming languages, strings are tokens,

450 so this should be set to True.

451 """

452 self._strings_are_tokens = strings_are_tokens

453 self._root_rule = _compile(root_rule)

454 if whitespace_rule is None:

455 self._whitespace_rule = None

456 else:

457 self._whitespace_rule = _compile(whitespace_rule)

458

459 def parse(self, text, start_pos=0):

460 """Parses the given text input

461 Args:

462 text -- data to parse.

463 start_pos -- the offset to start parsing at.

464

465 Returns:

466 An abstract syntax tree, with nodes being pairs of the format

467 (label, value), where label is a string or a function, and value

468 is a string, a pair or a list of pairs.

469 """

470

471 def calculate_line_number_and_offset(globalOffset):

472 """Calculates the line number and in-line offset"""

473 i = 0

474 lineNumber = 1

475 lineOffset = 0

476 lineData = []

477 while i < globalOffset and i < len(text):

478 if text[i] == '\n':

479 lineNumber += 1

480 lineOffset = 0

481 lineData = []

482 else:

483 lineData.append(text[i])

484 lineOffset += 1

485 i += 1

486 while i < len(text) and text[i] != '\n':

487 lineData.append(text[i])

488 i += 1

489 return (lineNumber, lineOffset, ''.join(lineData))

490

491 def analyze_result(state, pos, ast):

492 """Analyze match output"""

493 if pos is not None:

494 # Its possible that matching is successful but trailing

495 # whitespace remains, so skip it.

496 pos = _skip_whitespace(state, pos)

497 if pos == len(state.text):

498 # End of intput reached. Success!

499 return ast

500

501 # Failure - analyze and raise an error.

502 (lineNumber, lineOffset, lineData) = \

503 calculate_line_number_and_offset(state.max_pos)

504 message = 'unexpected error'

505 if state.max_rule:

506 set = {}

507 map(set.__setitem__, state.max_rule, [])

508

509 def to_str(item):

510 return item.__str__()

511

512 expected = ' or '.join(map(to_str, set.keys()))

513 found = state.text[state.max_pos:state.max_pos + 1]

514 message = 'Expected %s but "%s" found: "%s"' % \

515 (expected, found, lineData)

516 raise SyntaxError(

517 'At line %s offset %s: %s' % \

518 (lineNumber, lineOffset, message))

519

520 # Initialize state

521 state = _PegParserState(text,

522 whitespace_rule=self._whitespace_rule,

523 strings_are_tokens=self._strings_are_tokens)

524

525 # Match and analyze result

526 (pos, ast) = self._root_rule.match(state, start_pos)

527 return analyze_result(state, pos, ast)

OLD	NEW

« no previous file with comments | « sdk/lib/html/scripts/multiemitter_test.py ('k') | sdk/lib/html/scripts/pegparser_test.py » ('j') | no next file with comments »