| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/python | |
| 2 # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
| 3 # for details. All rights reserved. Use of this source code is governed by a | |
| 4 # BSD-style license that can be found in the LICENSE file. | |
| 5 | |
| 6 import logging | |
| 7 import re | |
| 8 import weakref | |
| 9 | |
| 10 _logger = logging.getLogger('pegparser') | |
| 11 | |
| 12 # functions can refer to each other, hence creating infinite loops. The | |
| 13 # following hashmap is used to memoize functions that were already compiled. | |
| 14 _compiled_functions_memory = weakref.WeakKeyDictionary() | |
| 15 | |
| 16 _regex_type = type(re.compile(r'')) | |
| 17 _list_type = type([]) | |
| 18 _function_type = type(lambda func: 0) | |
| 19 | |
| 20 | |
| 21 class _PegParserState(object): | |
| 22 """Object for storing parsing state variables and options""" | |
| 23 | |
| 24 def __init__(self, text, whitespace_rule, strings_are_tokens): | |
| 25 # Parsing state: | |
| 26 self.text = text | |
| 27 self.is_whitespace_mode = False | |
| 28 | |
| 29 # Error message helpers: | |
| 30 self.max_pos = None | |
| 31 self.max_rule = None | |
| 32 | |
| 33 # Parsing options: | |
| 34 self.whitespace_rule = whitespace_rule | |
| 35 self.strings_are_tokens = strings_are_tokens | |
| 36 | |
| 37 | |
| 38 class _PegParserRule(object): | |
| 39 """Base class for all rules""" | |
| 40 | |
| 41 def __init__(self): | |
| 42 return | |
| 43 | |
| 44 def __str__(self): | |
| 45 return self.__class__.__name__ | |
| 46 | |
| 47 def _match_impl(self, state, pos): | |
| 48 """Default implementation of the matching algorithm. | |
| 49 Should be overwritten by sub-classes. | |
| 50 """ | |
| 51 raise RuntimeError('_match_impl not implemented') | |
| 52 | |
| 53 def match(self, state, pos): | |
| 54 """Matches the rule against the text in the given position. | |
| 55 | |
| 56 The actual rule evaluation is delegated to _match_impl, | |
| 57 while this function deals mostly with support tasks such as | |
| 58 skipping whitespace, debug information and data for exception. | |
| 59 | |
| 60 Args: | |
| 61 state -- the current parsing state and options. | |
| 62 pos -- the current offset in the text. | |
| 63 | |
| 64 Returns: | |
| 65 (next position, value) if the rule matches, or | |
| 66 (None, None) if it doesn't. | |
| 67 """ | |
| 68 if not state.is_whitespace_mode: | |
| 69 # Skip whitespace | |
| 70 pos = _skip_whitespace(state, pos) | |
| 71 | |
| 72 # Track position for possible error messaging | |
| 73 if pos > state.max_pos: | |
| 74 # Store position and the rule. | |
| 75 state.max_pos = pos | |
| 76 if isinstance(self, _StringRule): | |
| 77 state.max_rule = [self] | |
| 78 else: | |
| 79 state.max_rule = [] | |
| 80 elif pos == state.max_pos: | |
| 81 if isinstance(self, _StringRule): | |
| 82 state.max_rule.append(self) | |
| 83 | |
| 84 if _logger.isEnabledFor(logging.DEBUG): | |
| 85 # Used for debugging | |
| 86 _logger.debug('Try: pos=%s char=%s rule=%s' % \ | |
| 87 (pos, state.text[pos:pos + 1], self)) | |
| 88 | |
| 89 # Delegate the matching logic to the the specialized function. | |
| 90 res = self._match_impl(state, pos) | |
| 91 | |
| 92 if not state.is_whitespace_mode \ | |
| 93 and _logger.isEnabledFor(logging.DEBUG): | |
| 94 # More debugging information | |
| 95 (nextPos, ast) = res | |
| 96 if nextPos is not None: | |
| 97 _logger.debug('Match! pos=%s char=%s rule=%s' % \ | |
| 98 (pos, state.text[pos:pos + 1], self)) | |
| 99 else: | |
| 100 _logger.debug('Fail. pos=%s char=%s rule=%s' % \ | |
| 101 (pos, state.text[pos:pos + 1], self)) | |
| 102 | |
| 103 return res | |
| 104 | |
| 105 | |
| 106 def _compile(rule): | |
| 107 """Recursively compiles user-defined rules into parser rules. | |
| 108 Compilation is performed by converting strings, regular expressions, lists | |
| 109 and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule | |
| 110 (respectively). Memoization is used to avoid infinite recursion as rules | |
| 111 may refer to each other.""" | |
| 112 if rule is None: | |
| 113 raise RuntimeError('None is not a valid rule') | |
| 114 elif isinstance(rule, str): | |
| 115 return _StringRule(rule) | |
| 116 elif isinstance(rule, _regex_type): | |
| 117 return _RegExpRule(rule) | |
| 118 elif isinstance(rule, _list_type): | |
| 119 return SEQUENCE(*rule) | |
| 120 elif isinstance(rule, _function_type): | |
| 121 # Memoize compiled functions to avoid infinite compliation loops. | |
| 122 if rule in _compiled_functions_memory: | |
| 123 return _compiled_functions_memory[rule] | |
| 124 else: | |
| 125 compiled_function = _FunctionRule(rule) | |
| 126 _compiled_functions_memory[rule] = compiled_function | |
| 127 compiled_function._sub_rule = _compile(rule()) | |
| 128 return compiled_function | |
| 129 elif isinstance(rule, _PegParserRule): | |
| 130 return rule | |
| 131 else: | |
| 132 raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule)) | |
| 133 | |
| 134 | |
| 135 def _skip_whitespace(state, pos): | |
| 136 """Returns the next non-whitespace position. | |
| 137 This is done by matching the optional whitespace_rule with the current | |
| 138 text.""" | |
| 139 if not state.whitespace_rule: | |
| 140 return pos | |
| 141 state.is_whitespace_mode = True | |
| 142 nextPos = pos | |
| 143 while nextPos is not None: | |
| 144 pos = nextPos | |
| 145 (nextPos, ast) = state.whitespace_rule.match(state, pos) | |
| 146 state.is_whitespace_mode = False | |
| 147 return pos | |
| 148 | |
| 149 | |
| 150 class _StringRule(_PegParserRule): | |
| 151 """This rule tries to match a whole string.""" | |
| 152 | |
| 153 def __init__(self, string): | |
| 154 """Constructor. | |
| 155 Args: | |
| 156 string -- string to match. | |
| 157 """ | |
| 158 _PegParserRule.__init__(self) | |
| 159 self._string = string | |
| 160 | |
| 161 def __str__(self): | |
| 162 return '"%s"' % self._string | |
| 163 | |
| 164 def _match_impl(self, state, pos): | |
| 165 """Tries to match the string at the current position""" | |
| 166 if state.text.startswith(self._string, pos): | |
| 167 nextPos = pos + len(self._string) | |
| 168 if state.strings_are_tokens: | |
| 169 return (nextPos, None) | |
| 170 else: | |
| 171 return (nextPos, self._string) | |
| 172 return (None, None) | |
| 173 | |
| 174 | |
| 175 class _RegExpRule(_PegParserRule): | |
| 176 """This rule tries to matches a regular expression.""" | |
| 177 | |
| 178 def __init__(self, reg_exp): | |
| 179 """Constructor. | |
| 180 Args: | |
| 181 reg_exp -- a regular expression used in matching. | |
| 182 """ | |
| 183 _PegParserRule.__init__(self) | |
| 184 self.reg_exp = reg_exp | |
| 185 | |
| 186 def __str__(self): | |
| 187 return 'regexp' | |
| 188 | |
| 189 def _match_impl(self, state, pos): | |
| 190 """Tries to match the regular expression with current text""" | |
| 191 matchObj = self.reg_exp.match(state.text, pos) | |
| 192 if matchObj: | |
| 193 matchStr = matchObj.group() | |
| 194 return (pos + len(matchStr), matchStr) | |
| 195 return (None, None) | |
| 196 | |
| 197 | |
| 198 class _FunctionRule(_PegParserRule): | |
| 199 """Function rule wraps a rule defined via a Python function. | |
| 200 | |
| 201 Defining rules via functions helps break the grammar into parts, labeling | |
| 202 the ast, and supporting recursive definitions in the grammar | |
| 203 | |
| 204 Usage Example: | |
| 205 def Func(): return ['function', TOKEN('('), TOKEN(')')] | |
| 206 def Var(): return OR('x', 'y') | |
| 207 def Program(): return OR(Func, Var) | |
| 208 | |
| 209 When matched with 'function()', will return the tuple: | |
| 210 ('Program', ('Func', 'function')) | |
| 211 When matched with 'x', will return the tuple: | |
| 212 ('Program', ('Var', 'x')) | |
| 213 | |
| 214 Functions who's name begins with '_' will not be labelled. This is useful | |
| 215 for creating utility rules. Extending the example above: | |
| 216 | |
| 217 def _Program(): return OR(Func, Var) | |
| 218 | |
| 219 When matched with 'function()', will return the tuple: | |
| 220 ('Func', 'function') | |
| 221 """ | |
| 222 | |
| 223 def __init__(self, func): | |
| 224 """Constructor. | |
| 225 Args: | |
| 226 func -- the original function will be used for labeling output. | |
| 227 """ | |
| 228 _PegParserRule.__init__(self) | |
| 229 self._func = func | |
| 230 # Sub-rule is compiled by _compile to avoid infinite recursion. | |
| 231 self._sub_rule = None | |
| 232 | |
| 233 def __str__(self): | |
| 234 return self._func.__name__ | |
| 235 | |
| 236 def _match_impl(self, state, pos): | |
| 237 """Simply invokes the sub rule""" | |
| 238 (nextPos, ast) = self._sub_rule.match(state, pos) | |
| 239 if nextPos is not None: | |
| 240 if not self._func.__name__.startswith('_'): | |
| 241 ast = (self._func.__name__, ast) | |
| 242 return (nextPos, ast) | |
| 243 return (None, None) | |
| 244 | |
| 245 | |
| 246 class SEQUENCE(_PegParserRule): | |
| 247 """This rule expects all given rules to match in sequence. | |
| 248 Note that SEQUENCE is equivalent to a rule composed of a Python list of | |
| 249 rules. | |
| 250 Usage example: SEQUENCE('A', 'B', 'C') | |
| 251 or: ['A', 'B', 'C'] | |
| 252 Will match 'ABC' but not 'A', 'B' or ''. | |
| 253 """ | |
| 254 def __init__(self, *rules): | |
| 255 """Constructor. | |
| 256 Args: | |
| 257 rules -- one or more rules to match. | |
| 258 """ | |
| 259 _PegParserRule.__init__(self) | |
| 260 self._sub_rules = [] | |
| 261 for rule in rules: | |
| 262 self._sub_rules.append(_compile(rule)) | |
| 263 | |
| 264 def _match_impl(self, state, pos): | |
| 265 """Tries to match all the sub rules""" | |
| 266 sequence = [] | |
| 267 for rule in self._sub_rules: | |
| 268 (nextPos, ast) = rule.match(state, pos) | |
| 269 if nextPos is not None: | |
| 270 if ast: | |
| 271 if isinstance(ast, _list_type): | |
| 272 sequence.extend(ast) | |
| 273 else: | |
| 274 sequence.append(ast) | |
| 275 pos = nextPos | |
| 276 else: | |
| 277 return (None, None) | |
| 278 return (pos, sequence) | |
| 279 | |
| 280 | |
| 281 class OR(_PegParserRule): | |
| 282 """This rule matches one and only one of multiple sub-rules. | |
| 283 Usage example: OR('A', 'B', 'C') | |
| 284 Will match 'A', 'B' or 'C'. | |
| 285 """ | |
| 286 def __init__(self, *rules): | |
| 287 """Constructor. | |
| 288 Args: | |
| 289 rules -- rules to choose from. | |
| 290 """ | |
| 291 _PegParserRule.__init__(self) | |
| 292 self._sub_rules = [] | |
| 293 for rule in rules: | |
| 294 self._sub_rules.append(_compile(rule)) | |
| 295 | |
| 296 def _match_impl(self, state, pos): | |
| 297 """Tries to match at leat one of the sub rules""" | |
| 298 for rule in self._sub_rules: | |
| 299 (nextPos, ast) = rule.match(state, pos) | |
| 300 if nextPos is not None: | |
| 301 return (nextPos, ast) | |
| 302 return (None, None) | |
| 303 | |
| 304 | |
| 305 class MAYBE(_PegParserRule): | |
| 306 """Will try to match the given rule, tolerating absence. | |
| 307 Usage example: MAYBE('A') | |
| 308 Will match 'A' but also ''. | |
| 309 """ | |
| 310 def __init__(self, rule): | |
| 311 """Constructor. | |
| 312 Args: | |
| 313 rule -- the rule that may be absent. | |
| 314 """ | |
| 315 _PegParserRule.__init__(self) | |
| 316 self._sub_rule = _compile(rule) | |
| 317 | |
| 318 def _match_impl(self, state, pos): | |
| 319 """Tries to match at leat one of the sub rules""" | |
| 320 (nextPos, ast) = self._sub_rule.match(state, pos) | |
| 321 if nextPos is not None: | |
| 322 return (nextPos, ast) | |
| 323 return (pos, None) | |
| 324 | |
| 325 | |
| 326 class MANY(_PegParserRule): | |
| 327 """Will try to match the given rule one or more times. | |
| 328 Usage example 1: MANY('A') | |
| 329 Will match 'A', 'AAAAA' but not ''. | |
| 330 Usage example 2: MANY('A', separator=',') | |
| 331 Will match 'A', 'A,A' but not 'AA'. | |
| 332 """ | |
| 333 | |
| 334 def __init__(self, rule, separator=None): | |
| 335 """Constructor. | |
| 336 Args: | |
| 337 rule -- the rule to match multiple times. | |
| 338 separator -- this optional rule is used to match separators. | |
| 339 """ | |
| 340 _PegParserRule.__init__(self) | |
| 341 self._sub_rule = _compile(rule) | |
| 342 self._separator = _compile(separator) if separator else None | |
| 343 | |
| 344 def _match_impl(self, state, pos): | |
| 345 res = [] | |
| 346 count = 0 | |
| 347 while True: | |
| 348 if count > 0 and self._separator: | |
| 349 (nextPos, ast) = self._separator.match(state, pos) | |
| 350 if nextPos is not None: | |
| 351 pos = nextPos | |
| 352 if ast: | |
| 353 res.append(ast) | |
| 354 else: | |
| 355 break | |
| 356 (nextPos, ast) = self._sub_rule.match(state, pos) | |
| 357 if nextPos is None: | |
| 358 break | |
| 359 count += 1 | |
| 360 pos = nextPos | |
| 361 res.append(ast) | |
| 362 if count > 0: | |
| 363 return (pos, res) | |
| 364 return (None, None) | |
| 365 | |
| 366 | |
| 367 class TOKEN(_PegParserRule): | |
| 368 """The matched rule will not appear in the the output. | |
| 369 Usage example: ['A', TOKEN('.'), 'B'] | |
| 370 When matching 'A.B', will return the sequence ['A', 'B']. | |
| 371 """ | |
| 372 | |
| 373 def __init__(self, rule): | |
| 374 """Constructor. | |
| 375 Args: | |
| 376 rule -- the rule to match. | |
| 377 """ | |
| 378 _PegParserRule.__init__(self) | |
| 379 self._sub_rule = _compile(rule) | |
| 380 | |
| 381 def _match_impl(self, state, pos): | |
| 382 (nextPos, ast) = self._sub_rule.match(state, pos) | |
| 383 if nextPos is not None: | |
| 384 return (nextPos, None) | |
| 385 return (None, None) | |
| 386 | |
| 387 | |
| 388 class LABEL(_PegParserRule): | |
| 389 """The matched rule will appear in the output with the given label. | |
| 390 Usage example: LABEL('number', re.compile(r'[0-9]+')) | |
| 391 When matched with '1234', will return ('number', '1234'). | |
| 392 | |
| 393 Keyword arguments: | |
| 394 label -- a string. | |
| 395 rule -- the rule to match. | |
| 396 """ | |
| 397 | |
| 398 def __init__(self, label, rule): | |
| 399 """Constructor. | |
| 400 Args: | |
| 401 rule -- the rule to match. | |
| 402 """ | |
| 403 _PegParserRule.__init__(self) | |
| 404 self._label = label | |
| 405 self._sub_rule = _compile(rule) | |
| 406 | |
| 407 def _match_impl(self, state, pos): | |
| 408 (nextPos, ast) = self._sub_rule.match(state, pos) | |
| 409 if nextPos is not None: | |
| 410 return (nextPos, (self._label, ast)) | |
| 411 return (None, None) | |
| 412 | |
| 413 | |
| 414 class RAISE(_PegParserRule): | |
| 415 """Raises a SyntaxError with a user-provided message. | |
| 416 Usage example: ['A','B', RAISE('should have not gotten here')] | |
| 417 Will not match 'A' but will raise an exception for 'AB'. | |
| 418 This rule is useful mostly for debugging grammars. | |
| 419 """ | |
| 420 def __init__(self, message): | |
| 421 """Constructor. | |
| 422 Args: | |
| 423 message -- the message for the raised exception. | |
| 424 """ | |
| 425 _PegParserRule.__init__(self) | |
| 426 self._message = message | |
| 427 | |
| 428 def _match_impl(self, state, pos): | |
| 429 raise RuntimeError(self._message) | |
| 430 | |
| 431 | |
| 432 class PegParser(object): | |
| 433 """PegParser class. | |
| 434 This generic parser can be configured with rules to parse a wide | |
| 435 range of inputs. | |
| 436 """ | |
| 437 | |
| 438 def __init__(self, root_rule, whitespace_rule=None, | |
| 439 strings_are_tokens=False): | |
| 440 """Initializes a PegParser with rules and parsing options. | |
| 441 | |
| 442 Args: | |
| 443 root_rule -- the top level rule to start matching at. Rule can be | |
| 444 a regular expression, a string, or one of the special rules | |
| 445 such as SEQUENCE, MANY, OR, etc. | |
| 446 whitespace_rule -- used to identify and strip whitespace. Default | |
| 447 isNone, configuring the parser to not tolerate whitespace. | |
| 448 strings_are_tokens -- by default string rules are not treated as | |
| 449 tokens. In many programming languages, strings are tokens, | |
| 450 so this should be set to True. | |
| 451 """ | |
| 452 self._strings_are_tokens = strings_are_tokens | |
| 453 self._root_rule = _compile(root_rule) | |
| 454 if whitespace_rule is None: | |
| 455 self._whitespace_rule = None | |
| 456 else: | |
| 457 self._whitespace_rule = _compile(whitespace_rule) | |
| 458 | |
| 459 def parse(self, text, start_pos=0): | |
| 460 """Parses the given text input | |
| 461 Args: | |
| 462 text -- data to parse. | |
| 463 start_pos -- the offset to start parsing at. | |
| 464 | |
| 465 Returns: | |
| 466 An abstract syntax tree, with nodes being pairs of the format | |
| 467 (label, value), where label is a string or a function, and value | |
| 468 is a string, a pair or a list of pairs. | |
| 469 """ | |
| 470 | |
| 471 def calculate_line_number_and_offset(globalOffset): | |
| 472 """Calculates the line number and in-line offset""" | |
| 473 i = 0 | |
| 474 lineNumber = 1 | |
| 475 lineOffset = 0 | |
| 476 lineData = [] | |
| 477 while i < globalOffset and i < len(text): | |
| 478 if text[i] == '\n': | |
| 479 lineNumber += 1 | |
| 480 lineOffset = 0 | |
| 481 lineData = [] | |
| 482 else: | |
| 483 lineData.append(text[i]) | |
| 484 lineOffset += 1 | |
| 485 i += 1 | |
| 486 while i < len(text) and text[i] != '\n': | |
| 487 lineData.append(text[i]) | |
| 488 i += 1 | |
| 489 return (lineNumber, lineOffset, ''.join(lineData)) | |
| 490 | |
| 491 def analyze_result(state, pos, ast): | |
| 492 """Analyze match output""" | |
| 493 if pos is not None: | |
| 494 # Its possible that matching is successful but trailing | |
| 495 # whitespace remains, so skip it. | |
| 496 pos = _skip_whitespace(state, pos) | |
| 497 if pos == len(state.text): | |
| 498 # End of intput reached. Success! | |
| 499 return ast | |
| 500 | |
| 501 # Failure - analyze and raise an error. | |
| 502 (lineNumber, lineOffset, lineData) = \ | |
| 503 calculate_line_number_and_offset(state.max_pos) | |
| 504 message = 'unexpected error' | |
| 505 if state.max_rule: | |
| 506 set = {} | |
| 507 map(set.__setitem__, state.max_rule, []) | |
| 508 | |
| 509 def to_str(item): | |
| 510 return item.__str__() | |
| 511 | |
| 512 expected = ' or '.join(map(to_str, set.keys())) | |
| 513 found = state.text[state.max_pos:state.max_pos + 1] | |
| 514 message = 'Expected %s but "%s" found: "%s"' % \ | |
| 515 (expected, found, lineData) | |
| 516 raise SyntaxError( | |
| 517 'At line %s offset %s: %s' % \ | |
| 518 (lineNumber, lineOffset, message)) | |
| 519 | |
| 520 # Initialize state | |
| 521 state = _PegParserState(text, | |
| 522 whitespace_rule=self._whitespace_rule, | |
| 523 strings_are_tokens=self._strings_are_tokens) | |
| 524 | |
| 525 # Match and analyze result | |
| 526 (pos, ast) = self._root_rule.match(state, start_pos) | |
| 527 return analyze_result(state, pos, ast) | |
| OLD | NEW |