OLD | NEW |
| (Empty) |
1 #!/usr/bin/python | |
2 # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
3 # for details. All rights reserved. Use of this source code is governed by a | |
4 # BSD-style license that can be found in the LICENSE file. | |
5 | |
6 import logging | |
7 import re | |
8 import weakref | |
9 | |
10 _logger = logging.getLogger('pegparser') | |
11 | |
12 # functions can refer to each other, hence creating infinite loops. The | |
13 # following hashmap is used to memoize functions that were already compiled. | |
14 _compiled_functions_memory = weakref.WeakKeyDictionary() | |
15 | |
16 _regex_type = type(re.compile(r'')) | |
17 _list_type = type([]) | |
18 _function_type = type(lambda func: 0) | |
19 | |
20 | |
21 class _PegParserState(object): | |
22 """Object for storing parsing state variables and options""" | |
23 | |
24 def __init__(self, text, whitespace_rule, strings_are_tokens): | |
25 # Parsing state: | |
26 self.text = text | |
27 self.is_whitespace_mode = False | |
28 | |
29 # Error message helpers: | |
30 self.max_pos = None | |
31 self.max_rule = None | |
32 | |
33 # Parsing options: | |
34 self.whitespace_rule = whitespace_rule | |
35 self.strings_are_tokens = strings_are_tokens | |
36 | |
37 | |
38 class _PegParserRule(object): | |
39 """Base class for all rules""" | |
40 | |
41 def __init__(self): | |
42 return | |
43 | |
44 def __str__(self): | |
45 return self.__class__.__name__ | |
46 | |
47 def _match_impl(self, state, pos): | |
48 """Default implementation of the matching algorithm. | |
49 Should be overwritten by sub-classes. | |
50 """ | |
51 raise RuntimeError('_match_impl not implemented') | |
52 | |
53 def match(self, state, pos): | |
54 """Matches the rule against the text in the given position. | |
55 | |
56 The actual rule evaluation is delegated to _match_impl, | |
57 while this function deals mostly with support tasks such as | |
58 skipping whitespace, debug information and data for exception. | |
59 | |
60 Args: | |
61 state -- the current parsing state and options. | |
62 pos -- the current offset in the text. | |
63 | |
64 Returns: | |
65 (next position, value) if the rule matches, or | |
66 (None, None) if it doesn't. | |
67 """ | |
68 if not state.is_whitespace_mode: | |
69 # Skip whitespace | |
70 pos = _skip_whitespace(state, pos) | |
71 | |
72 # Track position for possible error messaging | |
73 if pos > state.max_pos: | |
74 # Store position and the rule. | |
75 state.max_pos = pos | |
76 if isinstance(self, _StringRule): | |
77 state.max_rule = [self] | |
78 else: | |
79 state.max_rule = [] | |
80 elif pos == state.max_pos: | |
81 if isinstance(self, _StringRule): | |
82 state.max_rule.append(self) | |
83 | |
84 if _logger.isEnabledFor(logging.DEBUG): | |
85 # Used for debugging | |
86 _logger.debug('Try: pos=%s char=%s rule=%s' % \ | |
87 (pos, state.text[pos:pos + 1], self)) | |
88 | |
89 # Delegate the matching logic to the the specialized function. | |
90 res = self._match_impl(state, pos) | |
91 | |
92 if not state.is_whitespace_mode \ | |
93 and _logger.isEnabledFor(logging.DEBUG): | |
94 # More debugging information | |
95 (nextPos, ast) = res | |
96 if nextPos is not None: | |
97 _logger.debug('Match! pos=%s char=%s rule=%s' % \ | |
98 (pos, state.text[pos:pos + 1], self)) | |
99 else: | |
100 _logger.debug('Fail. pos=%s char=%s rule=%s' % \ | |
101 (pos, state.text[pos:pos + 1], self)) | |
102 | |
103 return res | |
104 | |
105 | |
106 def _compile(rule): | |
107 """Recursively compiles user-defined rules into parser rules. | |
108 Compilation is performed by converting strings, regular expressions, lists | |
109 and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule | |
110 (respectively). Memoization is used to avoid infinite recursion as rules | |
111 may refer to each other.""" | |
112 if rule is None: | |
113 raise RuntimeError('None is not a valid rule') | |
114 elif isinstance(rule, str): | |
115 return _StringRule(rule) | |
116 elif isinstance(rule, _regex_type): | |
117 return _RegExpRule(rule) | |
118 elif isinstance(rule, _list_type): | |
119 return SEQUENCE(*rule) | |
120 elif isinstance(rule, _function_type): | |
121 # Memoize compiled functions to avoid infinite compliation loops. | |
122 if rule in _compiled_functions_memory: | |
123 return _compiled_functions_memory[rule] | |
124 else: | |
125 compiled_function = _FunctionRule(rule) | |
126 _compiled_functions_memory[rule] = compiled_function | |
127 compiled_function._sub_rule = _compile(rule()) | |
128 return compiled_function | |
129 elif isinstance(rule, _PegParserRule): | |
130 return rule | |
131 else: | |
132 raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule)) | |
133 | |
134 | |
135 def _skip_whitespace(state, pos): | |
136 """Returns the next non-whitespace position. | |
137 This is done by matching the optional whitespace_rule with the current | |
138 text.""" | |
139 if not state.whitespace_rule: | |
140 return pos | |
141 state.is_whitespace_mode = True | |
142 nextPos = pos | |
143 while nextPos is not None: | |
144 pos = nextPos | |
145 (nextPos, ast) = state.whitespace_rule.match(state, pos) | |
146 state.is_whitespace_mode = False | |
147 return pos | |
148 | |
149 | |
150 class _StringRule(_PegParserRule): | |
151 """This rule tries to match a whole string.""" | |
152 | |
153 def __init__(self, string): | |
154 """Constructor. | |
155 Args: | |
156 string -- string to match. | |
157 """ | |
158 _PegParserRule.__init__(self) | |
159 self._string = string | |
160 | |
161 def __str__(self): | |
162 return '"%s"' % self._string | |
163 | |
164 def _match_impl(self, state, pos): | |
165 """Tries to match the string at the current position""" | |
166 if state.text.startswith(self._string, pos): | |
167 nextPos = pos + len(self._string) | |
168 if state.strings_are_tokens: | |
169 return (nextPos, None) | |
170 else: | |
171 return (nextPos, self._string) | |
172 return (None, None) | |
173 | |
174 | |
175 class _RegExpRule(_PegParserRule): | |
176 """This rule tries to matches a regular expression.""" | |
177 | |
178 def __init__(self, reg_exp): | |
179 """Constructor. | |
180 Args: | |
181 reg_exp -- a regular expression used in matching. | |
182 """ | |
183 _PegParserRule.__init__(self) | |
184 self.reg_exp = reg_exp | |
185 | |
186 def __str__(self): | |
187 return 'regexp' | |
188 | |
189 def _match_impl(self, state, pos): | |
190 """Tries to match the regular expression with current text""" | |
191 matchObj = self.reg_exp.match(state.text, pos) | |
192 if matchObj: | |
193 matchStr = matchObj.group() | |
194 return (pos + len(matchStr), matchStr) | |
195 return (None, None) | |
196 | |
197 | |
198 class _FunctionRule(_PegParserRule): | |
199 """Function rule wraps a rule defined via a Python function. | |
200 | |
201 Defining rules via functions helps break the grammar into parts, labeling | |
202 the ast, and supporting recursive definitions in the grammar | |
203 | |
204 Usage Example: | |
205 def Func(): return ['function', TOKEN('('), TOKEN(')')] | |
206 def Var(): return OR('x', 'y') | |
207 def Program(): return OR(Func, Var) | |
208 | |
209 When matched with 'function()', will return the tuple: | |
210 ('Program', ('Func', 'function')) | |
211 When matched with 'x', will return the tuple: | |
212 ('Program', ('Var', 'x')) | |
213 | |
214 Functions who's name begins with '_' will not be labelled. This is useful | |
215 for creating utility rules. Extending the example above: | |
216 | |
217 def _Program(): return OR(Func, Var) | |
218 | |
219 When matched with 'function()', will return the tuple: | |
220 ('Func', 'function') | |
221 """ | |
222 | |
223 def __init__(self, func): | |
224 """Constructor. | |
225 Args: | |
226 func -- the original function will be used for labeling output. | |
227 """ | |
228 _PegParserRule.__init__(self) | |
229 self._func = func | |
230 # Sub-rule is compiled by _compile to avoid infinite recursion. | |
231 self._sub_rule = None | |
232 | |
233 def __str__(self): | |
234 return self._func.__name__ | |
235 | |
236 def _match_impl(self, state, pos): | |
237 """Simply invokes the sub rule""" | |
238 (nextPos, ast) = self._sub_rule.match(state, pos) | |
239 if nextPos is not None: | |
240 if not self._func.__name__.startswith('_'): | |
241 ast = (self._func.__name__, ast) | |
242 return (nextPos, ast) | |
243 return (None, None) | |
244 | |
245 | |
246 class SEQUENCE(_PegParserRule): | |
247 """This rule expects all given rules to match in sequence. | |
248 Note that SEQUENCE is equivalent to a rule composed of a Python list of | |
249 rules. | |
250 Usage example: SEQUENCE('A', 'B', 'C') | |
251 or: ['A', 'B', 'C'] | |
252 Will match 'ABC' but not 'A', 'B' or ''. | |
253 """ | |
254 def __init__(self, *rules): | |
255 """Constructor. | |
256 Args: | |
257 rules -- one or more rules to match. | |
258 """ | |
259 _PegParserRule.__init__(self) | |
260 self._sub_rules = [] | |
261 for rule in rules: | |
262 self._sub_rules.append(_compile(rule)) | |
263 | |
264 def _match_impl(self, state, pos): | |
265 """Tries to match all the sub rules""" | |
266 sequence = [] | |
267 for rule in self._sub_rules: | |
268 (nextPos, ast) = rule.match(state, pos) | |
269 if nextPos is not None: | |
270 if ast: | |
271 if isinstance(ast, _list_type): | |
272 sequence.extend(ast) | |
273 else: | |
274 sequence.append(ast) | |
275 pos = nextPos | |
276 else: | |
277 return (None, None) | |
278 return (pos, sequence) | |
279 | |
280 | |
281 class OR(_PegParserRule): | |
282 """This rule matches one and only one of multiple sub-rules. | |
283 Usage example: OR('A', 'B', 'C') | |
284 Will match 'A', 'B' or 'C'. | |
285 """ | |
286 def __init__(self, *rules): | |
287 """Constructor. | |
288 Args: | |
289 rules -- rules to choose from. | |
290 """ | |
291 _PegParserRule.__init__(self) | |
292 self._sub_rules = [] | |
293 for rule in rules: | |
294 self._sub_rules.append(_compile(rule)) | |
295 | |
296 def _match_impl(self, state, pos): | |
297 """Tries to match at leat one of the sub rules""" | |
298 for rule in self._sub_rules: | |
299 (nextPos, ast) = rule.match(state, pos) | |
300 if nextPos is not None: | |
301 return (nextPos, ast) | |
302 return (None, None) | |
303 | |
304 | |
305 class MAYBE(_PegParserRule): | |
306 """Will try to match the given rule, tolerating absence. | |
307 Usage example: MAYBE('A') | |
308 Will match 'A' but also ''. | |
309 """ | |
310 def __init__(self, rule): | |
311 """Constructor. | |
312 Args: | |
313 rule -- the rule that may be absent. | |
314 """ | |
315 _PegParserRule.__init__(self) | |
316 self._sub_rule = _compile(rule) | |
317 | |
318 def _match_impl(self, state, pos): | |
319 """Tries to match at leat one of the sub rules""" | |
320 (nextPos, ast) = self._sub_rule.match(state, pos) | |
321 if nextPos is not None: | |
322 return (nextPos, ast) | |
323 return (pos, None) | |
324 | |
325 | |
326 class MANY(_PegParserRule): | |
327 """Will try to match the given rule one or more times. | |
328 Usage example 1: MANY('A') | |
329 Will match 'A', 'AAAAA' but not ''. | |
330 Usage example 2: MANY('A', separator=',') | |
331 Will match 'A', 'A,A' but not 'AA'. | |
332 """ | |
333 | |
334 def __init__(self, rule, separator=None): | |
335 """Constructor. | |
336 Args: | |
337 rule -- the rule to match multiple times. | |
338 separator -- this optional rule is used to match separators. | |
339 """ | |
340 _PegParserRule.__init__(self) | |
341 self._sub_rule = _compile(rule) | |
342 self._separator = _compile(separator) if separator else None | |
343 | |
344 def _match_impl(self, state, pos): | |
345 res = [] | |
346 count = 0 | |
347 while True: | |
348 if count > 0 and self._separator: | |
349 (nextPos, ast) = self._separator.match(state, pos) | |
350 if nextPos is not None: | |
351 pos = nextPos | |
352 if ast: | |
353 res.append(ast) | |
354 else: | |
355 break | |
356 (nextPos, ast) = self._sub_rule.match(state, pos) | |
357 if nextPos is None: | |
358 break | |
359 count += 1 | |
360 pos = nextPos | |
361 res.append(ast) | |
362 if count > 0: | |
363 return (pos, res) | |
364 return (None, None) | |
365 | |
366 | |
367 class TOKEN(_PegParserRule): | |
368 """The matched rule will not appear in the the output. | |
369 Usage example: ['A', TOKEN('.'), 'B'] | |
370 When matching 'A.B', will return the sequence ['A', 'B']. | |
371 """ | |
372 | |
373 def __init__(self, rule): | |
374 """Constructor. | |
375 Args: | |
376 rule -- the rule to match. | |
377 """ | |
378 _PegParserRule.__init__(self) | |
379 self._sub_rule = _compile(rule) | |
380 | |
381 def _match_impl(self, state, pos): | |
382 (nextPos, ast) = self._sub_rule.match(state, pos) | |
383 if nextPos is not None: | |
384 return (nextPos, None) | |
385 return (None, None) | |
386 | |
387 | |
388 class LABEL(_PegParserRule): | |
389 """The matched rule will appear in the output with the given label. | |
390 Usage example: LABEL('number', re.compile(r'[0-9]+')) | |
391 When matched with '1234', will return ('number', '1234'). | |
392 | |
393 Keyword arguments: | |
394 label -- a string. | |
395 rule -- the rule to match. | |
396 """ | |
397 | |
398 def __init__(self, label, rule): | |
399 """Constructor. | |
400 Args: | |
401 rule -- the rule to match. | |
402 """ | |
403 _PegParserRule.__init__(self) | |
404 self._label = label | |
405 self._sub_rule = _compile(rule) | |
406 | |
407 def _match_impl(self, state, pos): | |
408 (nextPos, ast) = self._sub_rule.match(state, pos) | |
409 if nextPos is not None: | |
410 return (nextPos, (self._label, ast)) | |
411 return (None, None) | |
412 | |
413 | |
414 class RAISE(_PegParserRule): | |
415 """Raises a SyntaxError with a user-provided message. | |
416 Usage example: ['A','B', RAISE('should have not gotten here')] | |
417 Will not match 'A' but will raise an exception for 'AB'. | |
418 This rule is useful mostly for debugging grammars. | |
419 """ | |
420 def __init__(self, message): | |
421 """Constructor. | |
422 Args: | |
423 message -- the message for the raised exception. | |
424 """ | |
425 _PegParserRule.__init__(self) | |
426 self._message = message | |
427 | |
428 def _match_impl(self, state, pos): | |
429 raise RuntimeError(self._message) | |
430 | |
431 | |
432 class PegParser(object): | |
433 """PegParser class. | |
434 This generic parser can be configured with rules to parse a wide | |
435 range of inputs. | |
436 """ | |
437 | |
438 def __init__(self, root_rule, whitespace_rule=None, | |
439 strings_are_tokens=False): | |
440 """Initializes a PegParser with rules and parsing options. | |
441 | |
442 Args: | |
443 root_rule -- the top level rule to start matching at. Rule can be | |
444 a regular expression, a string, or one of the special rules | |
445 such as SEQUENCE, MANY, OR, etc. | |
446 whitespace_rule -- used to identify and strip whitespace. Default | |
447 isNone, configuring the parser to not tolerate whitespace. | |
448 strings_are_tokens -- by default string rules are not treated as | |
449 tokens. In many programming languages, strings are tokens, | |
450 so this should be set to True. | |
451 """ | |
452 self._strings_are_tokens = strings_are_tokens | |
453 self._root_rule = _compile(root_rule) | |
454 if whitespace_rule is None: | |
455 self._whitespace_rule = None | |
456 else: | |
457 self._whitespace_rule = _compile(whitespace_rule) | |
458 | |
459 def parse(self, text, start_pos=0): | |
460 """Parses the given text input | |
461 Args: | |
462 text -- data to parse. | |
463 start_pos -- the offset to start parsing at. | |
464 | |
465 Returns: | |
466 An abstract syntax tree, with nodes being pairs of the format | |
467 (label, value), where label is a string or a function, and value | |
468 is a string, a pair or a list of pairs. | |
469 """ | |
470 | |
471 def calculate_line_number_and_offset(globalOffset): | |
472 """Calculates the line number and in-line offset""" | |
473 i = 0 | |
474 lineNumber = 1 | |
475 lineOffset = 0 | |
476 lineData = [] | |
477 while i < globalOffset and i < len(text): | |
478 if text[i] == '\n': | |
479 lineNumber += 1 | |
480 lineOffset = 0 | |
481 lineData = [] | |
482 else: | |
483 lineData.append(text[i]) | |
484 lineOffset += 1 | |
485 i += 1 | |
486 while i < len(text) and text[i] != '\n': | |
487 lineData.append(text[i]) | |
488 i += 1 | |
489 return (lineNumber, lineOffset, ''.join(lineData)) | |
490 | |
491 def analyze_result(state, pos, ast): | |
492 """Analyze match output""" | |
493 if pos is not None: | |
494 # Its possible that matching is successful but trailing | |
495 # whitespace remains, so skip it. | |
496 pos = _skip_whitespace(state, pos) | |
497 if pos == len(state.text): | |
498 # End of intput reached. Success! | |
499 return ast | |
500 | |
501 # Failure - analyze and raise an error. | |
502 (lineNumber, lineOffset, lineData) = \ | |
503 calculate_line_number_and_offset(state.max_pos) | |
504 message = 'unexpected error' | |
505 if state.max_rule: | |
506 set = {} | |
507 map(set.__setitem__, state.max_rule, []) | |
508 | |
509 def to_str(item): | |
510 return item.__str__() | |
511 | |
512 expected = ' or '.join(map(to_str, set.keys())) | |
513 found = state.text[state.max_pos:state.max_pos + 1] | |
514 message = 'Expected %s but "%s" found: "%s"' % \ | |
515 (expected, found, lineData) | |
516 raise SyntaxError( | |
517 'At line %s offset %s: %s' % \ | |
518 (lineNumber, lineOffset, message)) | |
519 | |
520 # Initialize state | |
521 state = _PegParserState(text, | |
522 whitespace_rule=self._whitespace_rule, | |
523 strings_are_tokens=self._strings_are_tokens) | |
524 | |
525 # Match and analyze result | |
526 (pos, ast) = self._root_rule.match(state, start_pos) | |
527 return analyze_result(state, pos, ast) | |
OLD | NEW |