Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(127)

Side by Side Diff: sdk/lib/html/scripts/pegparser.py

Issue 11691009: Moved most of html lib generating scripts into tools. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « sdk/lib/html/scripts/multiemitter_test.py ('k') | sdk/lib/html/scripts/pegparser_test.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/python
2 # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
3 # for details. All rights reserved. Use of this source code is governed by a
4 # BSD-style license that can be found in the LICENSE file.
5
6 import logging
7 import re
8 import weakref
9
10 _logger = logging.getLogger('pegparser')
11
12 # functions can refer to each other, hence creating infinite loops. The
13 # following hashmap is used to memoize functions that were already compiled.
14 _compiled_functions_memory = weakref.WeakKeyDictionary()
15
16 _regex_type = type(re.compile(r''))
17 _list_type = type([])
18 _function_type = type(lambda func: 0)
19
20
21 class _PegParserState(object):
22 """Object for storing parsing state variables and options"""
23
24 def __init__(self, text, whitespace_rule, strings_are_tokens):
25 # Parsing state:
26 self.text = text
27 self.is_whitespace_mode = False
28
29 # Error message helpers:
30 self.max_pos = None
31 self.max_rule = None
32
33 # Parsing options:
34 self.whitespace_rule = whitespace_rule
35 self.strings_are_tokens = strings_are_tokens
36
37
38 class _PegParserRule(object):
39 """Base class for all rules"""
40
41 def __init__(self):
42 return
43
44 def __str__(self):
45 return self.__class__.__name__
46
47 def _match_impl(self, state, pos):
48 """Default implementation of the matching algorithm.
49 Should be overwritten by sub-classes.
50 """
51 raise RuntimeError('_match_impl not implemented')
52
53 def match(self, state, pos):
54 """Matches the rule against the text in the given position.
55
56 The actual rule evaluation is delegated to _match_impl,
57 while this function deals mostly with support tasks such as
58 skipping whitespace, debug information and data for exception.
59
60 Args:
61 state -- the current parsing state and options.
62 pos -- the current offset in the text.
63
64 Returns:
65 (next position, value) if the rule matches, or
66 (None, None) if it doesn't.
67 """
68 if not state.is_whitespace_mode:
69 # Skip whitespace
70 pos = _skip_whitespace(state, pos)
71
72 # Track position for possible error messaging
73 if pos > state.max_pos:
74 # Store position and the rule.
75 state.max_pos = pos
76 if isinstance(self, _StringRule):
77 state.max_rule = [self]
78 else:
79 state.max_rule = []
80 elif pos == state.max_pos:
81 if isinstance(self, _StringRule):
82 state.max_rule.append(self)
83
84 if _logger.isEnabledFor(logging.DEBUG):
85 # Used for debugging
86 _logger.debug('Try: pos=%s char=%s rule=%s' % \
87 (pos, state.text[pos:pos + 1], self))
88
89 # Delegate the matching logic to the the specialized function.
90 res = self._match_impl(state, pos)
91
92 if not state.is_whitespace_mode \
93 and _logger.isEnabledFor(logging.DEBUG):
94 # More debugging information
95 (nextPos, ast) = res
96 if nextPos is not None:
97 _logger.debug('Match! pos=%s char=%s rule=%s' % \
98 (pos, state.text[pos:pos + 1], self))
99 else:
100 _logger.debug('Fail. pos=%s char=%s rule=%s' % \
101 (pos, state.text[pos:pos + 1], self))
102
103 return res
104
105
106 def _compile(rule):
107 """Recursively compiles user-defined rules into parser rules.
108 Compilation is performed by converting strings, regular expressions, lists
109 and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule
110 (respectively). Memoization is used to avoid infinite recursion as rules
111 may refer to each other."""
112 if rule is None:
113 raise RuntimeError('None is not a valid rule')
114 elif isinstance(rule, str):
115 return _StringRule(rule)
116 elif isinstance(rule, _regex_type):
117 return _RegExpRule(rule)
118 elif isinstance(rule, _list_type):
119 return SEQUENCE(*rule)
120 elif isinstance(rule, _function_type):
121 # Memoize compiled functions to avoid infinite compliation loops.
122 if rule in _compiled_functions_memory:
123 return _compiled_functions_memory[rule]
124 else:
125 compiled_function = _FunctionRule(rule)
126 _compiled_functions_memory[rule] = compiled_function
127 compiled_function._sub_rule = _compile(rule())
128 return compiled_function
129 elif isinstance(rule, _PegParserRule):
130 return rule
131 else:
132 raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule))
133
134
135 def _skip_whitespace(state, pos):
136 """Returns the next non-whitespace position.
137 This is done by matching the optional whitespace_rule with the current
138 text."""
139 if not state.whitespace_rule:
140 return pos
141 state.is_whitespace_mode = True
142 nextPos = pos
143 while nextPos is not None:
144 pos = nextPos
145 (nextPos, ast) = state.whitespace_rule.match(state, pos)
146 state.is_whitespace_mode = False
147 return pos
148
149
150 class _StringRule(_PegParserRule):
151 """This rule tries to match a whole string."""
152
153 def __init__(self, string):
154 """Constructor.
155 Args:
156 string -- string to match.
157 """
158 _PegParserRule.__init__(self)
159 self._string = string
160
161 def __str__(self):
162 return '"%s"' % self._string
163
164 def _match_impl(self, state, pos):
165 """Tries to match the string at the current position"""
166 if state.text.startswith(self._string, pos):
167 nextPos = pos + len(self._string)
168 if state.strings_are_tokens:
169 return (nextPos, None)
170 else:
171 return (nextPos, self._string)
172 return (None, None)
173
174
175 class _RegExpRule(_PegParserRule):
176 """This rule tries to matches a regular expression."""
177
178 def __init__(self, reg_exp):
179 """Constructor.
180 Args:
181 reg_exp -- a regular expression used in matching.
182 """
183 _PegParserRule.__init__(self)
184 self.reg_exp = reg_exp
185
186 def __str__(self):
187 return 'regexp'
188
189 def _match_impl(self, state, pos):
190 """Tries to match the regular expression with current text"""
191 matchObj = self.reg_exp.match(state.text, pos)
192 if matchObj:
193 matchStr = matchObj.group()
194 return (pos + len(matchStr), matchStr)
195 return (None, None)
196
197
198 class _FunctionRule(_PegParserRule):
199 """Function rule wraps a rule defined via a Python function.
200
201 Defining rules via functions helps break the grammar into parts, labeling
202 the ast, and supporting recursive definitions in the grammar
203
204 Usage Example:
205 def Func(): return ['function', TOKEN('('), TOKEN(')')]
206 def Var(): return OR('x', 'y')
207 def Program(): return OR(Func, Var)
208
209 When matched with 'function()', will return the tuple:
210 ('Program', ('Func', 'function'))
211 When matched with 'x', will return the tuple:
212 ('Program', ('Var', 'x'))
213
214 Functions who's name begins with '_' will not be labelled. This is useful
215 for creating utility rules. Extending the example above:
216
217 def _Program(): return OR(Func, Var)
218
219 When matched with 'function()', will return the tuple:
220 ('Func', 'function')
221 """
222
223 def __init__(self, func):
224 """Constructor.
225 Args:
226 func -- the original function will be used for labeling output.
227 """
228 _PegParserRule.__init__(self)
229 self._func = func
230 # Sub-rule is compiled by _compile to avoid infinite recursion.
231 self._sub_rule = None
232
233 def __str__(self):
234 return self._func.__name__
235
236 def _match_impl(self, state, pos):
237 """Simply invokes the sub rule"""
238 (nextPos, ast) = self._sub_rule.match(state, pos)
239 if nextPos is not None:
240 if not self._func.__name__.startswith('_'):
241 ast = (self._func.__name__, ast)
242 return (nextPos, ast)
243 return (None, None)
244
245
246 class SEQUENCE(_PegParserRule):
247 """This rule expects all given rules to match in sequence.
248 Note that SEQUENCE is equivalent to a rule composed of a Python list of
249 rules.
250 Usage example: SEQUENCE('A', 'B', 'C')
251 or: ['A', 'B', 'C']
252 Will match 'ABC' but not 'A', 'B' or ''.
253 """
254 def __init__(self, *rules):
255 """Constructor.
256 Args:
257 rules -- one or more rules to match.
258 """
259 _PegParserRule.__init__(self)
260 self._sub_rules = []
261 for rule in rules:
262 self._sub_rules.append(_compile(rule))
263
264 def _match_impl(self, state, pos):
265 """Tries to match all the sub rules"""
266 sequence = []
267 for rule in self._sub_rules:
268 (nextPos, ast) = rule.match(state, pos)
269 if nextPos is not None:
270 if ast:
271 if isinstance(ast, _list_type):
272 sequence.extend(ast)
273 else:
274 sequence.append(ast)
275 pos = nextPos
276 else:
277 return (None, None)
278 return (pos, sequence)
279
280
281 class OR(_PegParserRule):
282 """This rule matches one and only one of multiple sub-rules.
283 Usage example: OR('A', 'B', 'C')
284 Will match 'A', 'B' or 'C'.
285 """
286 def __init__(self, *rules):
287 """Constructor.
288 Args:
289 rules -- rules to choose from.
290 """
291 _PegParserRule.__init__(self)
292 self._sub_rules = []
293 for rule in rules:
294 self._sub_rules.append(_compile(rule))
295
296 def _match_impl(self, state, pos):
297 """Tries to match at leat one of the sub rules"""
298 for rule in self._sub_rules:
299 (nextPos, ast) = rule.match(state, pos)
300 if nextPos is not None:
301 return (nextPos, ast)
302 return (None, None)
303
304
305 class MAYBE(_PegParserRule):
306 """Will try to match the given rule, tolerating absence.
307 Usage example: MAYBE('A')
308 Will match 'A' but also ''.
309 """
310 def __init__(self, rule):
311 """Constructor.
312 Args:
313 rule -- the rule that may be absent.
314 """
315 _PegParserRule.__init__(self)
316 self._sub_rule = _compile(rule)
317
318 def _match_impl(self, state, pos):
319 """Tries to match at leat one of the sub rules"""
320 (nextPos, ast) = self._sub_rule.match(state, pos)
321 if nextPos is not None:
322 return (nextPos, ast)
323 return (pos, None)
324
325
326 class MANY(_PegParserRule):
327 """Will try to match the given rule one or more times.
328 Usage example 1: MANY('A')
329 Will match 'A', 'AAAAA' but not ''.
330 Usage example 2: MANY('A', separator=',')
331 Will match 'A', 'A,A' but not 'AA'.
332 """
333
334 def __init__(self, rule, separator=None):
335 """Constructor.
336 Args:
337 rule -- the rule to match multiple times.
338 separator -- this optional rule is used to match separators.
339 """
340 _PegParserRule.__init__(self)
341 self._sub_rule = _compile(rule)
342 self._separator = _compile(separator) if separator else None
343
344 def _match_impl(self, state, pos):
345 res = []
346 count = 0
347 while True:
348 if count > 0 and self._separator:
349 (nextPos, ast) = self._separator.match(state, pos)
350 if nextPos is not None:
351 pos = nextPos
352 if ast:
353 res.append(ast)
354 else:
355 break
356 (nextPos, ast) = self._sub_rule.match(state, pos)
357 if nextPos is None:
358 break
359 count += 1
360 pos = nextPos
361 res.append(ast)
362 if count > 0:
363 return (pos, res)
364 return (None, None)
365
366
367 class TOKEN(_PegParserRule):
368 """The matched rule will not appear in the the output.
369 Usage example: ['A', TOKEN('.'), 'B']
370 When matching 'A.B', will return the sequence ['A', 'B'].
371 """
372
373 def __init__(self, rule):
374 """Constructor.
375 Args:
376 rule -- the rule to match.
377 """
378 _PegParserRule.__init__(self)
379 self._sub_rule = _compile(rule)
380
381 def _match_impl(self, state, pos):
382 (nextPos, ast) = self._sub_rule.match(state, pos)
383 if nextPos is not None:
384 return (nextPos, None)
385 return (None, None)
386
387
388 class LABEL(_PegParserRule):
389 """The matched rule will appear in the output with the given label.
390 Usage example: LABEL('number', re.compile(r'[0-9]+'))
391 When matched with '1234', will return ('number', '1234').
392
393 Keyword arguments:
394 label -- a string.
395 rule -- the rule to match.
396 """
397
398 def __init__(self, label, rule):
399 """Constructor.
400 Args:
401 rule -- the rule to match.
402 """
403 _PegParserRule.__init__(self)
404 self._label = label
405 self._sub_rule = _compile(rule)
406
407 def _match_impl(self, state, pos):
408 (nextPos, ast) = self._sub_rule.match(state, pos)
409 if nextPos is not None:
410 return (nextPos, (self._label, ast))
411 return (None, None)
412
413
414 class RAISE(_PegParserRule):
415 """Raises a SyntaxError with a user-provided message.
416 Usage example: ['A','B', RAISE('should have not gotten here')]
417 Will not match 'A' but will raise an exception for 'AB'.
418 This rule is useful mostly for debugging grammars.
419 """
420 def __init__(self, message):
421 """Constructor.
422 Args:
423 message -- the message for the raised exception.
424 """
425 _PegParserRule.__init__(self)
426 self._message = message
427
428 def _match_impl(self, state, pos):
429 raise RuntimeError(self._message)
430
431
432 class PegParser(object):
433 """PegParser class.
434 This generic parser can be configured with rules to parse a wide
435 range of inputs.
436 """
437
438 def __init__(self, root_rule, whitespace_rule=None,
439 strings_are_tokens=False):
440 """Initializes a PegParser with rules and parsing options.
441
442 Args:
443 root_rule -- the top level rule to start matching at. Rule can be
444 a regular expression, a string, or one of the special rules
445 such as SEQUENCE, MANY, OR, etc.
446 whitespace_rule -- used to identify and strip whitespace. Default
447 isNone, configuring the parser to not tolerate whitespace.
448 strings_are_tokens -- by default string rules are not treated as
449 tokens. In many programming languages, strings are tokens,
450 so this should be set to True.
451 """
452 self._strings_are_tokens = strings_are_tokens
453 self._root_rule = _compile(root_rule)
454 if whitespace_rule is None:
455 self._whitespace_rule = None
456 else:
457 self._whitespace_rule = _compile(whitespace_rule)
458
459 def parse(self, text, start_pos=0):
460 """Parses the given text input
461 Args:
462 text -- data to parse.
463 start_pos -- the offset to start parsing at.
464
465 Returns:
466 An abstract syntax tree, with nodes being pairs of the format
467 (label, value), where label is a string or a function, and value
468 is a string, a pair or a list of pairs.
469 """
470
471 def calculate_line_number_and_offset(globalOffset):
472 """Calculates the line number and in-line offset"""
473 i = 0
474 lineNumber = 1
475 lineOffset = 0
476 lineData = []
477 while i < globalOffset and i < len(text):
478 if text[i] == '\n':
479 lineNumber += 1
480 lineOffset = 0
481 lineData = []
482 else:
483 lineData.append(text[i])
484 lineOffset += 1
485 i += 1
486 while i < len(text) and text[i] != '\n':
487 lineData.append(text[i])
488 i += 1
489 return (lineNumber, lineOffset, ''.join(lineData))
490
491 def analyze_result(state, pos, ast):
492 """Analyze match output"""
493 if pos is not None:
494 # Its possible that matching is successful but trailing
495 # whitespace remains, so skip it.
496 pos = _skip_whitespace(state, pos)
497 if pos == len(state.text):
498 # End of intput reached. Success!
499 return ast
500
501 # Failure - analyze and raise an error.
502 (lineNumber, lineOffset, lineData) = \
503 calculate_line_number_and_offset(state.max_pos)
504 message = 'unexpected error'
505 if state.max_rule:
506 set = {}
507 map(set.__setitem__, state.max_rule, [])
508
509 def to_str(item):
510 return item.__str__()
511
512 expected = ' or '.join(map(to_str, set.keys()))
513 found = state.text[state.max_pos:state.max_pos + 1]
514 message = 'Expected %s but "%s" found: "%s"' % \
515 (expected, found, lineData)
516 raise SyntaxError(
517 'At line %s offset %s: %s' % \
518 (lineNumber, lineOffset, message))
519
520 # Initialize state
521 state = _PegParserState(text,
522 whitespace_rule=self._whitespace_rule,
523 strings_are_tokens=self._strings_are_tokens)
524
525 # Match and analyze result
526 (pos, ast) = self._root_rule.match(state, start_pos)
527 return analyze_result(state, pos, ast)
OLDNEW
« no previous file with comments | « sdk/lib/html/scripts/multiemitter_test.py ('k') | sdk/lib/html/scripts/pegparser_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698