# Copyright (c) 2001-2007 Twisted Matrix Laboratories.
# See LICENSE for details.

# DO NOT EDIT xpathparser.py!
#
# It is generated from xpathparser.g using Yapps. Make needed changes there.
# This also means that the generated Python may not conform to Twisted's coding
# standards.

# HOWTO Generate me:
#
# 1.) Grab a copy of yapps2, version 2.1.1:
#         http://theory.stanford.edu/~amitp/Yapps/
#
#     Note: Do NOT use the package in debian/ubuntu as it has incompatible
#     modifications.
#
# 2.) Generate the grammar:
#
#         yapps2 xpathparser.g xpathparser.py.proto
#
# 3.) Edit the output to depend on the embedded runtime, not yappsrt:
#
#         sed -e '/^import yapps/d' -e '/^[^#]/s/yappsrt\.//g' \
#             xpathparser.py.proto > xpathparser.py

"""
XPath Parser.

Besides the parser code produced by Yapps, this module also defines the
parse-time exception classes, a scanner class, a base class for parsers
produced by Yapps, and a context class that keeps track of the parse stack.
These have been copied from the Yapps runtime.
"""

import sys, re
class SyntaxError(Exception):
    """When we run into an unexpected token, this is the exception to use"""
    def __init__(self, charpos=-1, msg="Bad Token", context=None):
        Exception.__init__(self)
        self.charpos = charpos
        self.msg = msg
        self.context = context

    def __str__(self):
        if self.charpos < 0: return 'SyntaxError'
        else: return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)

class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""
    pass
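# For example (illustrative only): str(SyntaxError(5, 'Bad Token')) yields
# 'SyntaxError@char5(Bad Token)', while the default charpos of -1 yields
# just 'SyntaxError'.
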
class Scanner:
    """Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes. The token(i) method is used to retrieve the
    i-th token. It takes a restrict set that limits the set of tokens
    it is allowed to return. In context sensitive mode, this restrict
    set guides the scanner. In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).

    """

    def __init__(self, patterns, ignore, input):
        """Initialize the scanner.

        @param patterns: [(terminal, uncompiled regex), ...] or C{None}
        @param ignore: [terminal,...]
        @param input: string

        If patterns is C{None}, we assume that the subclass has defined
        C{self.patterns} : [(terminal, compiled regex), ...]. Note that the
        patterns parameter expects uncompiled regexes, whereas the
        C{self.patterns} field expects compiled regexes.
        """
        self.tokens = [] # [(begin char pos, end char pos, token name, matched text), ...]
        self.restrictions = []
        self.input = input
        self.pos = 0
        self.ignore = ignore
        self.first_line_number = 1

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append( (terminal, re.compile(regex)) )

    def get_token_pos(self):
        """Get the current token position in the input text."""
        return len(self.tokens)

    def get_char_pos(self):
        """Get the current char position in the input text."""
        return self.pos

    def get_prev_char_pos(self, i=None):
        """Get the previous position (one token back) in the input text."""
        if self.pos == 0: return 0
        if i is None: i = -1
        return self.tokens[i][0]

    def get_line_number(self):
        """Get the line number of the current position in the input text."""
        # TODO: make this work at any token/char position
        return self.first_line_number + self.get_input_scanned().count('\n')

    def get_column_number(self):
        """Get the column number of the current position in the input text."""
        s = self.get_input_scanned()
        i = s.rfind('\n') # may be -1, but that's okay in this case
        return len(s) - (i+1)

    def get_input_scanned(self):
        """Get the portion of the input that has been tokenized."""
        return self.input[:self.pos]

    def get_input_unscanned(self):
        """Get the portion of the input that has not yet been tokenized."""
        return self.input[self.pos:]

    def token(self, i, restrict=None):
        """Get the i'th token in the input.

        If C{i} is one past the end, then scan for another token.

        @param i: token index

        @param restrict: [token, ...] or C{None}; if restrict is
            C{None}, then any token is allowed. You may call
            token(i) more than once. However, the restrict set
            may never be larger than what was passed in on the
            first call to token(i).
        """
        if i == len(self.tokens):
            self.scan(restrict)
        if i < len(self.tokens):
            # Make sure the restriction is more restricted. This
            # invariant is needed to avoid ruining tokenization at
            # position i+1 and higher.
            if restrict and self.restrictions[i]:
                for r in restrict:
                    if r not in self.restrictions[i]:
                        raise NotImplementedError("Unimplemented: restriction set changed")
            return self.tokens[i]
        raise NoMoreTokens()

    def __repr__(self):
        """Print the last 10 tokens that have been scanned in"""
        output = ''
        for t in self.tokens[-10:]:
            output = '%s\n (@%s) %s = %s' % (output, t[0], t[2], repr(t[3]))
        return output

    def scan(self, restrict):
        """Should scan another token and add it to the list, self.tokens,
        and add the restriction to self.restrictions"""
        # Keep looking for a token, ignoring any in self.ignore
        while 1:
            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and len(m.group(0)) > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = len(m.group(0))

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of ' + ', '.join(restrict)
                raise SyntaxError(self.pos, msg)

            # If we found something that isn't to be ignored, return it
            if best_pat not in self.ignore:
                # Create a token with this data
                token = (self.pos, self.pos + best_match, best_pat,
                         self.input[self.pos:self.pos + best_match])
                self.pos = self.pos + best_match
                # Only add this token if it's not in the list
                # (to prevent looping)
                if not self.tokens or token != self.tokens[-1]:
                    self.tokens.append(token)
                    self.restrictions.append(restrict)
                return
            else:
                # This token should be ignored ..
                self.pos = self.pos + best_match

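# A minimal usage sketch of the Scanner (illustrative only; the token names,
# patterns, and input below are hypothetical, not part of this module).
# Whitespace is declared as a pattern and listed in ignore, so scan() skips it:
#
#     patterns = [('space', r'\s+'), ('NUM', '[0-9]+'), ('PLUS', r'\+')]
#     s = Scanner(patterns, ['space'], '1 + 2')
#     s.token(0)            # -> (0, 1, 'NUM', '1')
#     s.token(1, ['PLUS'])  # restrict set: only PLUS (plus ignored tokens) may match
#     s.token(2, ['NUM'])   # -> (4, 5, 'NUM', '2')
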
class Parser:
    """Base class for Yapps-generated parsers.

    """

    def __init__(self, scanner):
        self._scanner = scanner
        self._pos = 0

    def _peek(self, *types):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        tok = self._scanner.token(self._pos, types)
        return tok[2]

    def _scan(self, type):
        """Returns the matched text, and moves to the next token"""
        tok = self._scanner.token(self._pos, [type])
        if tok[2] != type:
            raise SyntaxError(tok[0], 'Trying to find '+type+' :'+ ' ,'.join(self._scanner.restrictions[self._pos]))
        self._pos = 1 + self._pos
        return tok[3]

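# In the generated parser, each grammar rule below becomes a method built on
# _peek and _scan. A hypothetical sketch of what Yapps emits for a simple
# alternation rule (names are illustrative; see the real output in
# xpathparser.py):
#
#     def BOOLOP(self):
#         _token = self._peek('OP_AND', 'OP_OR')
#         if _token == 'OP_AND':
#             OP_AND = self._scan('OP_AND')
#             return OP_AND
#         else:  # == 'OP_OR'
#             OP_OR = self._scan('OP_OR')
#             return OP_OR
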
class Context:
    """Class to represent the parser's call stack.

    Every rule creates a Context that links to its parent rule. The
    contexts can be used for debugging.

    """

    def __init__(self, parent, scanner, tokenpos, rule, args=()):
        """Create a new context.

        @param parent: Context object or C{None}
        @param scanner: Scanner object
        @param tokenpos: scanner token position
        @type tokenpos: L{int}
        @param rule: name of the rule
        @type rule: L{str}
        @param args: tuple listing parameters to the rule

        """
        self.parent = parent
        self.scanner = scanner
        self.tokenpos = tokenpos
        self.rule = rule
        self.args = args

    def __str__(self):
        output = ''
        if self.parent: output = str(self.parent) + ' > '
        output += self.rule
        return output

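# __str__ renders the stack as a chain of rule names, so a context created
# while parsing a predicate would print roughly as 'XPATH > PATH > PREDICATE'
# (rule names from the grammar below; the exact chain depends on the input).
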
def print_line_with_pointer(text, p):
    """Print the line of 'text' that includes position 'p',
    along with a second line with a single caret (^) at position p"""

    # TODO: separate out the logic for determining the line/character
    # location from the logic for determining how to display an
    # 80-column line to stderr.

    # Now try printing part of the line
    text = text[max(p-80, 0):p+80]
    p = p - max(p-80, 0)

    # Strip to the left
    i = text[:p].rfind('\n')
    j = text[:p].rfind('\r')
    if i < 0 or (0 <= j < i): i = j
    if 0 <= i < p:
        p = p - i - 1
        text = text[i+1:]

    # Strip to the right
    i = text.find('\n', p)
    j = text.find('\r', p)
    if i < 0 or (0 <= j < i): i = j
    if i >= 0:
        text = text[:i]

    # Now shorten the text
    while len(text) > 70 and p > 60:
        # Cut off 10 chars
        text = "..." + text[10:]
        p = p - 7

    # Now print the string, along with an indicator
    print >>sys.stderr, '> ', text
    print >>sys.stderr, '> ', ' '*p + '^'

def print_error(input, err, scanner):
    """Print the error message, the parser stack, and the input text --
    for human-readable error reporting."""
    # NOTE: this function assumes 80 columns :-(
    # Figure out the line number
    line_number = scanner.get_line_number()
    column_number = scanner.get_column_number()
    print >>sys.stderr, '%d:%d: %s' % (line_number, column_number, err.msg)

    context = err.context
    if not context:
        print_line_with_pointer(input, err.charpos)

    while context:
        # TODO: add line number
        print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args))
        print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
        context = context.parent

def wrap_error_reporter(parser, rule):
    try:
        return getattr(parser, rule)()
    except SyntaxError, e:
        input = parser._scanner.input
        print_error(input, e, parser._scanner)
    except NoMoreTokens:
        print >>sys.stderr, 'Could not complete parsing; stopped around here:'
        print >>sys.stderr, parser._scanner

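# Hypothetical usage of the error-reporting wrapper (a sketch, assuming the
# generated scanner class is named XPathParserScanner, which Yapps derives
# from the 'parser XPathParser' declaration below):
#
#     parser = XPathParser(XPathParserScanner("/foo[@bar]"))
#     result = wrap_error_reporter(parser, 'XPATH')
#
# On a syntax error this prints a 'line:column: message' diagnostic plus the
# parse stack to stderr instead of propagating the exception.
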
from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
from twisted.words.xish.xpath import _AnyLocation, _Location

%%
parser XPathParser:
    ignore: "\\s+"
    token INDEX: "[0-9]+"
    token WILDCARD: "\*"
    token IDENTIFIER: "[a-zA-Z][a-zA-Z0-9_\-]*"
    token ATTRIBUTE: "\@[a-zA-Z][a-zA-Z0-9_\-]*"
    token FUNCNAME: "[a-zA-Z][a-zA-Z0-9_]*"
    token CMP_EQ: "\="
    token CMP_NE: "\!\="
    token STR_DQ: '"([^"]|(\\"))*?"'
    token STR_SQ: "'([^']|(\\'))*?'"
    token OP_AND: "and"
    token OP_OR: "or"
    token END: "$"

    rule XPATH: PATH {{ result = PATH; current = result }}
                ( PATH {{ current.childLocation = PATH; current = current.childLocation }} )* END
                {{ return result }}

    rule PATH: ("/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
               ( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
               ( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]")*
               {{ return result }}

    rule PREDICATE: EXPR {{ return EXPR }} |
                    INDEX {{ return IndexValue(INDEX) }}

    rule EXPR: FACTOR {{ e = FACTOR }}
               ( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
               {{ return e }}

    rule BOOLOP: ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )

    rule FACTOR: TERM {{ return TERM }}
                 | "\(" EXPR "\)" {{ return EXPR }}

    rule TERM: VALUE {{ t = VALUE }}
               [ CMP VALUE {{ t = CompareValue(t, CMP, VALUE) }} ]
               {{ return t }}

    rule VALUE: "@" IDENTIFIER {{ return AttribValue(IDENTIFIER) }} |
                FUNCNAME {{ f = Function(FUNCNAME); args = [] }}
                "\(" [ VALUE {{ args.append(VALUE) }}
                       (
                           "," VALUE {{ args.append(VALUE) }}
                       )*
                     ] "\)" {{ f.setParams(*args); return f }} |
                STR {{ return LiteralValue(STR[1:len(STR)-1]) }}

    rule CMP: (CMP_EQ {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }})
    rule STR: (STR_DQ {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }})
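
# Some expressions this grammar should accept (illustrative, not exhaustive):
#
#     /foo
#     //bar/*
#     /foo/bar[3]
#     /foo[@attrib="value"]/bar
#     /foo[@a="1" and @b="2"]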