OLD | NEW |
| (Empty) |
1 # Copyright (c) 2001-2007 Twisted Matrix Laboratories. | |
2 # See LICENSE for details. | |
3 | |
4 # DO NOT EDIT xpathparser.py! | |
5 # | |
6 # It is generated from xpathparser.g using Yapps. Make needed changes there. | |
7 # This also means that the generated Python may not conform to Twisted's coding | |
8 # standards. | |
9 | |
10 # HOWTO Generate me: | |
11 # | |
12 # 1.) Grab a copy of yapps2, version 2.1.1: | |
13 # http://theory.stanford.edu/~amitp/Yapps/ | |
14 # | |
15 # Note: Do NOT use the package in debian/ubuntu as it has incompatible | |
16 # modifications. | |
17 # | |
18 # 2.) Generate the grammar: | |
19 # | |
20 # yapps2 xpathparser.g xpathparser.py.proto | |
21 # | |
22 # 3.) Edit the output to depend on the embedded runtime, not yappsrt. | |
23 # | |
24 # sed -e '/^import yapps/d' -e '/^[^#]/s/yappsrt\.//g' \ | |
25 # xpathparser.py.proto > xpathparser.py | |
26 | |
27 """ | |
28 XPath Parser. | |
29 | |
30 Besides the parser code produced by Yapps, this module also defines the | |
31 parse-time exception classes, a scanner class, a base class for parsers | |
32 produced by Yapps, and a context class that keeps track of the parse stack. | |
33 These have been copied from the Yapps runtime. | |
34 """ | |
35 | |
36 import sys, re | |
37 | |
class SyntaxError(Exception):
    """
    Parse-time error raised when an unexpected token is encountered.

    @ivar charpos: character offset associated with the error; a negative
        value means no position is known.
    @ivar msg: human-readable description of the problem.
    @ivar context: the L{Context} active when the error was raised, or
        C{None}.
    """

    def __init__(self, charpos=-1, msg="Bad Token", context=None):
        Exception.__init__(self)
        self.context = context
        self.msg = msg
        self.charpos = charpos

    def __str__(self):
        # Without a usable position there is nothing more specific to show.
        if self.charpos < 0:
            return 'SyntaxError'
        return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)
49 | |
class NoMoreTokens(Exception):
    """
    Raised when a token is requested but the scanner has none left to give.
    """
53 | |
class Scanner:
    """
    Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes.  The token(i) method is used to retrieve the
    i-th token.  It takes a restrict set that limits the set of tokens
    it is allowed to return.  In context sensitive mode, this restrict
    set guides the scanner.  In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).

    @ivar tokens: tokens scanned so far, as
        C{(begin char pos, end char pos, token name, matched text)} tuples.
    @ivar restrictions: for each entry of C{tokens}, the restrict set that
        was in force when that token was scanned.
    @ivar input: the full text being scanned.
    @ivar pos: offset into C{input} of the first character not yet consumed.
    @ivar ignore: names of terminals that are consumed but never reported.
    @ivar first_line_number: line number assigned to the first input line.
    """

    def __init__(self, patterns, ignore, input):
        """
        Initialize the scanner.

        @param patterns: [(terminal, uncompiled regex), ...] or C{None}
        @param ignore: [terminal,...]
        @param input: string

        If patterns is C{None}, we assume that the subclass has defined
        C{self.patterns} : [(terminal, compiled regex), ...]. Note that the
        patterns parameter expects uncompiled regexes, whereas the
        C{self.patterns} field expects compiled regexes.
        """
        self.tokens = []
        self.restrictions = []
        self.input = input
        self.pos = 0
        self.ignore = ignore
        self.first_line_number = 1

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append((terminal, re.compile(regex)))

    def get_token_pos(self):
        """Get the current token position (number of tokens scanned)."""
        return len(self.tokens)

    def get_char_pos(self):
        """Get the current char position in the input text."""
        return self.pos

    def get_prev_char_pos(self, i=None):
        """
        Get the start position of a previously scanned token.

        @param i: index into C{self.tokens}; C{None} selects the most
            recently scanned token.
        @return: the begin char position of that token, or C{0} when no
            token has been recorded yet.
        """
        # self.pos can be non-zero with no recorded token (only ignored
        # input has been consumed so far); indexing self.tokens would then
        # raise IndexError, so treat that the same as the very start.
        if self.pos == 0 or not self.tokens:
            return 0
        if i is None:
            i = -1
        return self.tokens[i][0]

    def get_line_number(self):
        """Get the line number of the current position in the input text."""
        # TODO: make this work at any token/char position
        return self.first_line_number + self.get_input_scanned().count('\n')

    def get_column_number(self):
        """Get the column number of the current position in the input text."""
        s = self.get_input_scanned()
        i = s.rfind('\n')  # may be -1, but that's okay in this case
        return len(s) - (i + 1)

    def get_input_scanned(self):
        """Get the portion of the input that has been tokenized."""
        return self.input[:self.pos]

    def get_input_unscanned(self):
        """Get the portion of the input that has not yet been tokenized."""
        return self.input[self.pos:]

    def token(self, i, restrict=None):
        """
        Get the i'th token in the input.

        If C{i} is one past the end, then scan for another token.

        @param i: token index

        @param restrict: [token, ...] or C{None}; if restrict is
            C{None}, then any token is allowed.  You may call
            token(i) more than once.  However, the restrict set
            may never be larger than what was passed in on the
            first call to token(i).

        @raise NotImplementedError: if C{restrict} names a terminal that was
            not in the restrict set recorded for token C{i}.
        @raise NoMoreTokens: if no token with index C{i} is available.
        """
        if i == len(self.tokens):
            self.scan(restrict)
        if i < len(self.tokens):
            # Make sure the restriction is more restricted.  This
            # invariant is needed to avoid ruining tokenization at
            # position i+1 and higher.
            if restrict and self.restrictions[i]:
                for r in restrict:
                    if r not in self.restrictions[i]:
                        raise NotImplementedError("Unimplemented: restriction set changed")
            return self.tokens[i]
        raise NoMoreTokens()

    def __repr__(self):
        """Return a listing of the last 10 tokens that have been scanned in."""
        output = ''
        for t in self.tokens[-10:]:
            output = '%s\n (@%s) %s = %s' % (output, t[0], t[2], repr(t[3]))
        return output

    def scan(self, restrict):
        """
        Scan another token and add it to the list, self.tokens,
        and add the restriction to self.restrictions.

        @param restrict: terminals allowed at this position, or a false
            value to allow every terminal.
        @raise SyntaxError: if no allowed pattern matches at C{self.pos}.
        """
        # Keep looking for a token, ignoring any in self.ignore
        while 1:
            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and len(m.group(0)) > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = len(m.group(0))

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of ' + ', '.join(restrict)
                raise SyntaxError(self.pos, msg)

            # If we found something that isn't to be ignored, return it
            if best_pat not in self.ignore:
                # Create a token with this data
                token = (self.pos, self.pos + best_match, best_pat,
                         self.input[self.pos:self.pos + best_match])
                self.pos = self.pos + best_match
                # Only add this token if it's not in the list
                # (to prevent looping)
                if not self.tokens or token != self.tokens[-1]:
                    self.tokens.append(token)
                    self.restrictions.append(restrict)
                return
            else:
                # This token should be ignored ..
                self.pos = self.pos + best_match
198 | |
class Parser:
    """
    Base class for Yapps-generated parsers.

    @ivar _scanner: the scanner that supplies tokens.
    @ivar _pos: index of the next token to be consumed.
    """

    def __init__(self, scanner):
        self._pos = 0
        self._scanner = scanner

    def _peek(self, *types):
        """
        Return the type of the lookahead token without consuming it.

        Any positional arguments are passed to the scanner as the set of
        token types allowed at this position.
        """
        return self._scanner.token(self._pos, types)[2]

    def _scan(self, type):
        """
        Consume the next token, which must be of the given type, and return
        its matched text.

        @raise SyntaxError: if the token at the current position has a
            different type.
        """
        tok = self._scanner.token(self._pos, [type])
        if tok[2] == type:
            self._pos = 1 + self._pos
            return tok[3]
        allowed = ' ,'.join(self._scanner.restrictions[self._pos])
        raise SyntaxError(tok[0], 'Trying to find '+type+' :'+ allowed)
221 | |
class Context:
    """
    One frame of the parser's call stack.

    Each rule invocation builds a Context that points at the Context of the
    rule that called it.  The resulting chain can be used for debugging.
    """

    def __init__(self, parent, scanner, tokenpos, rule, args=()):
        """
        Create a new context.

        @param parent: Context object or C{None}
        @param scanner: Scanner object
        @param tokenpos: scanner token position
        @type tokenpos: L{int}
        @param rule: name of the rule
        @type rule: L{str}
        @param args: tuple listing parameters to the rule
        """
        self.args = args
        self.rule = rule
        self.tokenpos = tokenpos
        self.scanner = scanner
        self.parent = parent

    def __str__(self):
        # Render the chain outermost-first: "grandparent > parent > rule".
        if self.parent:
            prefix = str(self.parent) + ' > '
        else:
            prefix = ''
        return prefix + self.rule
253 | |
def print_line_with_pointer(text, p):
    """
    Print the line of 'text' that includes position 'p',
    along with a second line with a single caret (^) at position p.

    Both lines are written to C{sys.stderr}, each prefixed with C{'>  '}.

    @param text: the complete input text.
    @param p: character offset into C{text} to point at.
    """

    # TODO: separate out the logic for determining the line/character
    # location from the logic for determining how to display an
    # 80-column line to stderr.

    # Only look at a window of at most 80 characters either side of p
    start = max(p - 80, 0)
    text = text[start:p + 80]
    p = p - start

    # Strip to the left: drop everything up to and including the LAST line
    # break before p.  Taking the later of the two rfind() results matters
    # for "\r\n" input, where choosing the earlier "\r" would leave the
    # "\n" inside the printed line and shift the caret.
    i = max(text.rfind('\n', 0, p), text.rfind('\r', 0, p))
    if 0 <= i < p:
        p = p - i - 1
        text = text[i + 1:]

    # Strip to the right: cut at the FIRST line break at or after p
    i = text.find('\n', p)
    j = text.find('\r', p)
    if i < 0 or (0 <= j < i):
        i = j
    if i >= 0:
        text = text[:i]

    # Now shorten the text
    while len(text) > 70 and p > 60:
        # Cut off 10 chars, put back a 3-char ellipsis: net shift of 7
        text = "..." + text[10:]
        p = p - 7

    # Now print the string, along with an indicator
    sys.stderr.write('>  %s\n' % (text,))
    sys.stderr.write('>  %s^\n' % (' ' * p,))
290 | |
def print_error(input, err, scanner):
    """
    Print error messages, the parser stack, and the input text -- for
    human-readable error messages.

    Everything is written to C{sys.stderr}.

    @param input: the text that was being parsed.
    @param err: the L{SyntaxError} to report; its C{msg}, C{charpos} and
        C{context} attributes are read.
    @param scanner: scanner used to obtain the current line and column.
    """
    # NOTE: this function assumes 80 columns :-(
    # Figure out the line number
    line_number = scanner.get_line_number()
    column_number = scanner.get_column_number()
    sys.stderr.write('%d:%d: %s\n' % (line_number, column_number, err.msg))

    context = err.context
    if not context:
        # No rule stack available: just point at the error position.
        print_line_with_pointer(input, err.charpos)

    # Walk the rule stack from the innermost rule outwards.
    while context:
        # TODO: add line number
        sys.stderr.write('while parsing %s%s:\n' % (context.rule, tuple(context.args)))
        print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
        context = context.parent
308 | |
def wrap_error_reporter(parser, rule):
    """
    Invoke a grammar rule on a parser, reporting parse failures on stderr.

    @param parser: parser object exposing the rule as a method and its
        scanner as C{_scanner}.
    @param rule: name of the rule method to call.
    @return: whatever the rule method returns.  If a L{SyntaxError} or
        L{NoMoreTokens} is raised, a report is written to C{sys.stderr} and
        C{None} is returned instead.
    """
    try:
        return getattr(parser, rule)()
    except SyntaxError as e:
        input = parser._scanner.input
        print_error(input, e, parser._scanner)
    except NoMoreTokens:
        sys.stderr.write('Could not complete parsing; stopped around here:\n')
        # Formatting the scanner lists the most recently scanned tokens.
        sys.stderr.write('%s\n' % (parser._scanner,))
318 | |
319 | |
320 from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue | |
321 from twisted.words.xish.xpath import Function, IndexValue, LiteralValue | |
322 from twisted.words.xish.xpath import _AnyLocation, _Location | |
323 | |
324 | |
325 # Begin -- grammar generated by Yapps | |
326 import sys, re | |
327 | |
class XPathParserScanner(Scanner):
    """
    Scanner preloaded with the terminals of the XPath grammar.

    Whitespace (the C{\\s+} terminal) is the only ignored terminal.
    """

    # (terminal name, compiled regex) pairs; order is significant because
    # the scanner prefers earlier entries among equally long matches.
    patterns = [
        (r'","', re.compile(r',')),
        (r'"@"', re.compile(r'@')),
        (r'"\)"', re.compile(r'\)')),
        (r'"\("', re.compile(r'\(')),
        (r'"\]"', re.compile(r'\]')),
        (r'"\["', re.compile(r'\[')),
        (r'"//"', re.compile(r'//')),
        (r'"/"', re.compile(r'/')),
        (r'\s+', re.compile(r'\s+')),
        (r'INDEX', re.compile(r'[0-9]+')),
        (r'WILDCARD', re.compile(r'\*')),
        (r'IDENTIFIER', re.compile(r'[a-zA-Z][a-zA-Z0-9_\-]*')),
        (r'ATTRIBUTE', re.compile(r'\@[a-zA-Z][a-zA-Z0-9_\-]*')),
        (r'FUNCNAME', re.compile(r'[a-zA-Z][a-zA-Z0-9_]*')),
        (r'CMP_EQ', re.compile(r'\=')),
        (r'CMP_NE', re.compile(r'\!\=')),
        (r'STR_DQ', re.compile(r'"([^"]|(\"))*?"')),
        (r'STR_SQ', re.compile(r"'([^']|(\'))*?'")),
        (r'OP_AND', re.compile(r'and')),
        (r'OP_OR', re.compile(r'or')),
        (r'END', re.compile(r'$')),
    ]

    def __init__(self, str):
        # patterns=None: the base class uses the precompiled class-level
        # table above instead of compiling a list passed in.
        Scanner.__init__(self, None, [r'\s+'], str)
354 | |
class XPathParser(Parser):
    """
    Recursive-descent parser for the XPath grammar; one method per rule.

    Every rule method accepts the calling rule's context (or C{None}) as
    C{_parent} and returns the object built for that rule.
    """
    Context = Context

    def XPATH(self, _parent=None):
        """
        Parse one or more PATH steps followed by END.

        Returns the first step; every later step is stored as the
        C{childLocation} of the step before it.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'XPATH', [])
        first = self.PATH(ctx)
        last = first
        while self._peek('END', '"/"', '"//"') != 'END':
            step = self.PATH(ctx)
            last.childLocation = step
            last = last.childLocation
        followers = ['END', '"/"', '"//"']
        if self._peek() not in followers:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(),
                              context=ctx,
                              msg='Need one of ' + ', '.join(followers))
        self._scan('END')
        return first

    def PATH(self, _parent=None):
        """
        Parse a single step: C{/} or C{//}, then an identifier or wildcard,
        then any number of bracketed predicates.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'PATH', [])
        separator = self._peek('"/"', '"//"')
        if separator == '"/"':
            self._scan('"/"')
            result = _Location()
        else:
            # separator == '"//"'
            self._scan('"//"')
            result = _AnyLocation()
        name_kind = self._peek('IDENTIFIER', 'WILDCARD')
        if name_kind == 'IDENTIFIER':
            identifier = self._scan('IDENTIFIER')
            result.elementName = identifier
        else:
            # name_kind == 'WILDCARD': consumed, but no element name is kept
            self._scan('WILDCARD')
            result.elementName = None
        while self._peek('"\\["', 'END', '"/"', '"//"') == '"\\["':
            self._scan('"\\["')
            predicate = self.PREDICATE(ctx)
            result.predicates.append(predicate)
            self._scan('"\\]"')
        followers = ['"\\["', 'END', '"/"', '"//"']
        if self._peek() not in followers:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(),
                              context=ctx,
                              msg='Need one of ' + ', '.join(followers))
        return result

    def PREDICATE(self, _parent=None):
        """
        Parse the inside of C{[...]}: either a bare INDEX or an expression.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'PREDICATE', [])
        kind = self._peek('INDEX', '"\\("', '"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if kind == 'INDEX':
            index = self._scan('INDEX')
            return IndexValue(index)
        expr = self.EXPR(ctx)
        return expr

    def EXPR(self, _parent=None):
        """
        Parse factors joined by C{and} / C{or}, combining them left to
        right into L{BooleanValue} objects.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'EXPR', [])
        expr = self.FACTOR(ctx)
        while self._peek('OP_AND', 'OP_OR', '"\\)"', '"\\]"') in ['OP_AND', 'OP_OR']:
            operator = self.BOOLOP(ctx)
            operand = self.FACTOR(ctx)
            expr = BooleanValue(expr, operator, operand)
        followers = ['OP_AND', 'OP_OR', '"\\)"', '"\\]"']
        if self._peek() not in followers:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(),
                              context=ctx,
                              msg='Need one of ' + ', '.join(followers))
        return expr

    def BOOLOP(self, _parent=None):
        """
        Parse a boolean operator and return its matched text.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'BOOLOP', [])
        if self._peek('OP_AND', 'OP_OR') == 'OP_AND':
            return self._scan('OP_AND')
        return self._scan('OP_OR')

    def FACTOR(self, _parent=None):
        """
        Parse either a parenthesised expression or a plain term.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'FACTOR', [])
        kind = self._peek('"\\("', '"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if kind == '"\\("':
            self._scan('"\\("')
            inner = self.EXPR(ctx)
            self._scan('"\\)"')
            return inner
        term = self.TERM(ctx)
        return term

    def TERM(self, _parent=None):
        """
        Parse a value, optionally followed by a comparison operator and a
        second value (yielding a L{CompareValue}).
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'TERM', [])
        term = self.VALUE(ctx)
        lookahead = self._peek('CMP_EQ', 'CMP_NE', 'OP_AND', 'OP_OR', '"\\)"', '"\\]"')
        if lookahead in ['CMP_EQ', 'CMP_NE']:
            comparator = self.CMP(ctx)
            right = self.VALUE(ctx)
            term = CompareValue(term, comparator, right)
        return term

    def VALUE(self, _parent=None):
        """
        Parse an attribute reference, a function call with comma-separated
        arguments, or a quoted string literal.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'VALUE', [])
        kind = self._peek('"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if kind == '"@"':
            self._scan('"@"')
            identifier = self._scan('IDENTIFIER')
            return AttribValue(identifier)
        if kind == 'FUNCNAME':
            funcname = self._scan('FUNCNAME')
            f = Function(funcname)
            args = []
            self._scan('"\\("')
            # Optional first argument
            if self._peek('"\\)"', '"@"', 'FUNCNAME', '","', 'STR_DQ', 'STR_SQ') not in ['"\\)"', '","']:
                argument = self.VALUE(ctx)
                args.append(argument)
            # Any further arguments, each introduced by a comma
            while self._peek('","', '"\\)"') == '","':
                self._scan('","')
                argument = self.VALUE(ctx)
                args.append(argument)
            followers = ['","', '"\\)"']
            if self._peek() not in followers:
                raise SyntaxError(charpos=self._scanner.get_prev_char_pos(),
                                  context=ctx,
                                  msg='Need one of ' + ', '.join(followers))
            self._scan('"\\)"')
            f.setParams(*args)
            return f
        # kind in ['STR_DQ', 'STR_SQ']: drop the surrounding quote characters
        quoted = self.STR(ctx)
        return LiteralValue(quoted[1:len(quoted)-1])

    def CMP(self, _parent=None):
        """
        Parse a comparison operator and return its matched text.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'CMP', [])
        if self._peek('CMP_EQ', 'CMP_NE') == 'CMP_EQ':
            return self._scan('CMP_EQ')
        return self._scan('CMP_NE')

    def STR(self, _parent=None):
        """
        Parse a double- or single-quoted string and return its matched text,
        quotes included.
        """
        ctx = self.Context(_parent, self._scanner, self._pos, 'STR', [])
        if self._peek('STR_DQ', 'STR_SQ') == 'STR_DQ':
            return self._scan('STR_DQ')
        return self._scan('STR_SQ')
493 | |
494 | |
def parse(rule, text):
    """
    Parse C{text} starting from the grammar rule named C{rule}.

    @param rule: name of the L{XPathParser} rule method to start from.
    @param text: the string to parse.
    @return: the result of L{wrap_error_reporter} for that rule.
    """
    scanner = XPathParserScanner(text)
    parser = XPathParser(scanner)
    return wrap_error_reporter(parser, rule)
498 | |
if __name__ == '__main__':
    # Command-line entry point: parse a file (or stdin) starting from the
    # rule named by the first argument and print the result.
    from sys import argv, stdin
    if len(argv) >= 2:
        if len(argv) >= 3:
            # Close the file as soon as its contents have been read.
            with open(argv[2], 'r') as f:
                text = f.read()
        else:
            text = stdin.read()
        sys.stdout.write('%s\n' % (parse(argv[1], text),))
    else:
        sys.stderr.write('Args: <rule> [<filename>]\n')
508 # End -- grammar generated by Yapps | |
OLD | NEW |