Chromium Code Reviews

Side by Side Diff: tools/idl_parser/idl_lexer.py

Issue 13498002: Add WebIDL compliant parser plus tests (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Clean up IDLNode, make members private Created 7 years, 8 months ago
1 #!/usr/bin/env python
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """ Lexer for PPAPI IDL
7
8 The lexer is uses the PLY to build a tokenizer which understands WebIDL
sehr 2013/04/08 16:41:20 s/is uses/uses/
noelallen1 2013/04/11 21:52:09 Done.
9 tokens. The type of the tokens that match WebIDL will be:
10 1- The grammar name exactly: float, integer, string, identifier
11 2- The grammer for single characters operators: '?', ':', '{', ...
sehr 2013/04/08 16:41:20 grammar
noelallen1 2013/04/11 21:52:09 Done.
12 3- The uppercase version of the multicharacter string for keywords
13 ANY, ATTRIBUTE, BOOLEAN, ...
14
15 In addition, there are Pepper specific tokens for comments and inline blocks,
16 and multicharacter operators such as >>, <<, and ...
17
18 WebIDL, and WebIDL regular expressions can be found at:
19 http://www.w3.org/TR/2012/CR-WebIDL-20120419/
20 PLY can be found at:
21 http://www.dabeaz.com/ply/
22 """
23
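As a concrete illustration of the naming scheme above, a small driver (a sketch only, assuming this module is importable as idl_lexer and that PLY is on the path) prints the token stream for a snippet:

    from idl_lexer import TextToTokens

    # Keywords come back with their uppercase type ('interface' -> INTERFACE),
    # names come back as type 'identifier', and single characters such as '{'
    # and ';' are literal tokens whose type equals their value.
    for tok in TextToTokens('interface Foo { attribute long bar; };'):
      print tok.type, tok.value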
24 import optparse
25 import os.path
26 import re
27 import sys
28
29 #
30 # Try to load the ply module; if that fails, assume it is in the third_party
31 # directory two levels above this file's directory
32 #
33 try:
34 # pylint: disable=F0401
35 from ply import lex
36 except:
37 module_path, module_name = os.path.split(__file__)
38 third_party = os.path.join(module_path, '..', '..', 'third_party')
39 sys.path.append(third_party)
40 # pylint: disable=F0401
41 from ply import lex
42
43 #
44 # IDL Lexer
45 #
46 class IDLLexer(object):
47 # 'tokens' is a value required by lex which specifies the complete list
48 # of valid token types.
49 tokens = [
50 # Data types
51 'float',
52 'integer',
53 'string',
54
55 # Operators
56 'ELLIPSIS',
57 'LSHIFT',
58 'RSHIFT',
59
60 # Symbol and keywords types
61 'COMMENT',
62 'identifier',
63
64 # Pepper Extras
65 'INLINE',
66 ]
67
68 # 'keywords' is a map of string to token type. All tokens matching
69 # KEYWORD_OR_SYMBOL are matched against the keywords dictionary to determine
70 # if the token is actually a keyword.
71 keywords = {
72 'any' : 'ANY',
73 'attribute' : 'ATTRIBUTE',
74 'boolean' : 'BOOLEAN',
75 'byte' : 'BYTE',
76 'callback' : 'CALLBACK',
77 'const' : 'CONST',
78 'creator' : 'CREATOR',
79 'Date' : 'DATE',
80 'deleter' : 'DELETER',
81 'dictionary' : 'DICTIONARY',
82 'DOMString' : 'DOMSTRING',
83 'double' : 'DOUBLE',
84 'enum' : 'ENUM',
85 'false' : 'FALSE',
86 'float' : 'FLOAT',
87 'exception' : 'EXCEPTION',
88 'getter': 'GETTER',
89 'implements' : 'IMPLEMENTS',
90 'Infinity' : 'INFINITY',
91 'inherit' : 'INHERIT',
92 'interface' : 'INTERFACE',
93 'label' : 'LABEL',
94 'legacycaller' : 'LEGACYCALLER',
95 'long' : 'LONG',
96 'namespace' : 'NAMESPACE',
97 'NaN' : 'NAN',
98 'null' : 'NULL',
99 'object' : 'OBJECT',
100 'octet' : 'OCTET',
101 'optional' : 'OPTIONAL',
102 'or' : 'OR',
103 'partial' : 'PARTIAL',
104 'readonly' : 'READONLY',
105 'sequence' : 'SEQUENCE',
106 'setter': 'SETTER',
107 'short' : 'SHORT',
108 'static' : 'STATIC',
109 'stringifier' : 'STRINGIFIER',
110 'struct' : 'STRUCT',
111 'typedef' : 'TYPEDEF',
112 'true' : 'TRUE',
113 'unsigned' : 'UNSIGNED',
114 'unrestricted' : 'UNRESTRICTED',
115 'void' : 'VOID'
116 }
117
118 # Add keywords
119 for key in keywords:
120 tokens.append(keywords[key])
121
122 # 'literals' is a value expected by lex which specifies a list of valid
123 # literal tokens, meaning the token type and token value are identical.
124 literals = '"*.(){}[],;:=+-/~|&^?<>'
125
126 # Token definitions
127 #
128 # Lex assumes any value or function in the form of 't_<TYPE>' represents a
129 # regular expression where a match will emit a token of type <TYPE>. In the
130 # case of a function, the function is called when a match is made. These
131 # definitions come from WebIDL.
132
133 # 't_ignore' is a special match of items to ignore
134 t_ignore = ' \t'
135
136 # Constant values
137 t_integer = r'-?(0[Xx][0-9A-Fa-f]+)|([0-9]+[uU]?)'
138 t_float = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
139
140 # Special multi-character operators
141 t_ELLIPSIS = r'\.\.\.'
142 t_LSHIFT = r'<<'
143 t_RSHIFT = r'>>'
144
145 # A line ending '\n'; we use this to increment the line number
146 def t_LINE_END(self, t):
147 r'\n+'
148 self.AddLines(len(t.value))
149
150 # We do not process escapes in the IDL strings. Strings are exclusively
151 # used for attributes and enums, and not used as typical 'C' constants.
152 def t_string(self, t):
153 r'"[^"]*"'
154 t.value = t.value[1:-1]
155 self.AddLines(t.value.count('\n'))
156 return t
157
158 # A C or C++ style comment: /* xxx */ or //
159 def t_COMMENT(self, t):
160 r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
sehr 2013/04/08 16:41:20 what is the "?" there for?
noelallen1 2013/04/11 21:52:09 I will add a new test to the lexer to vet comments
161 self.AddLines(t.value.count('\n'))
162 return t
163
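The '?' in the block-comment pattern makes the repetition non-greedy, so the match stops at the first '*/' instead of running on to the last one in the file. A standalone sketch of the difference, using plain re on made-up sample text:

    import re

    non_greedy = re.compile(r'/\*(.|\n)*?\*/')
    greedy = re.compile(r'/\*(.|\n)*\*/')

    text = '/* first */ interface Foo; /* second */'
    print non_greedy.match(text).group(0)  # matches only '/* first */'
    print greedy.match(text).group(0)      # runs through the final '*/'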
164 # Return a "preprocessor" inline block
165 def t_INLINE(self, t):
166 r'\#inline (.|\n)*?\#endinl.*'
167 self.AddLines(t.value.count('\n'))
168 return t
169
170 # A symbol or keyword.
171 def t_KEYWORD_OR_SYMBOL(self, t):
172 r'_?[A-Za-z][A-Za-z_0-9]*'
173
174 # All non-keywords are assumed to be symbols
175 t.type = self.keywords.get(t.value, 'identifier')
176
177 # We strip leading underscores so that you can specify symbols with the same
178 # value as a keyword (e.g. a dictionary named 'interface').
179 if t.value[0] == '_':
180 t.value = t.value[1:]
181 return t
182
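For example, escaping with a leading underscore lets a reserved word be reused as a name; a sketch built on TextToTokens from later in this file:

    toks = TextToTokens('dictionary _interface { };')
    # The first token is the DICTIONARY keyword; the second is an identifier
    # whose leading underscore has been stripped, leaving the value 'interface'.
    print [(t.type, t.value) for t in toks[:2]]
    # [('DICTIONARY', 'dictionary'), ('identifier', 'interface')]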
183 def t_ANY_error(self, t):
184 msg = 'Unrecognized input'
185 line = self.lexobj.lineno
186
187 # If that line has not been accounted for, then we must have hit
188 # EoF, so compute the beginning of the line that caused the problem.
189 if line >= len(self.index):
190 # Find the offset in the line of the first word causing the issue
191 word = t.value.split()[0]
192 offs = self.lines[line - 1].find(word)
193 # Add the computed line's starting position
194 self.index.append(self.lexobj.lexpos - offs)
195 msg = 'Unexpected EoF reached after'
196
197 pos = self.lexobj.lexpos - self.index[line]
198 out = self.ErrorMessage(line, pos, msg)
199 sys.stderr.write(out + '\n')
200 self.lex_errors += 1
201
202
203 def AddLines(self, count):
204 # Set the lexer position for the beginning of the next line. In the case
205 # of multiple lines, tokens can not exist on any of the lines except the
206 # last one, so the recorded value for previous lines are unused. We still
207 # fill the array however, to make sure the line count is correct.
208 self.lexobj.lineno += count
209 # pylint: disable=W0612
210 for i in range(count):
211 self.index.append(self.lexobj.lexpos)
212
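A worked example of the bookkeeping with made-up offsets: if the line containing an error starts at absolute position 40 and the offending token sits at self.lexobj.lexpos == 47, then t_ANY_error above reports an offset of 47 - 40 = 7 into that line.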
213 def FileLineMsg(self, line, msg):
214 # Generate a message containing the file and line number of a token.
215 filename = self.lexobj.filename
216 if filename:
217 return "%s(%d) : %s" % (filename, line + 1, msg)
218 return "<BuiltIn> : %s" % msg
219
220 def SourceLine(self, line, pos):
221 # Create a source line marker
222 caret = '\t^'.expandtabs(pos)
sehr 2013/04/08 16:41:20 This is a "cute" way of generating pos * ' '. Is
noelallen1 2013/04/11 21:52:09 ' ' * pos + '^' would also work. So would using w
223 # We decrement the line number since the array is 0 based while the
224 # line numbers are 1 based.
225 return "%s\n%s" % (self.lines[line - 1], caret)
226
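For reference, the expandtabs() idiom pads the leading tab out to pos columns (a tab at column zero expands to exactly pos spaces), so it is equivalent to ' ' * pos + '^':

    print repr('\t^'.expandtabs(8))  # "'        ^'" -- eight spaces, then the caret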
227 def ErrorMessage(self, line, pos, msg):
228 return "\n%s\n%s" % (
229 self.FileLineMsg(line, msg),
230 self.SourceLine(line, pos))
231
232 def GetTokens(self):
233 outlist = []
234 while 1:
235 t = self.lexobj.token()
236 if t is None:
237 break
238 outlist.append(t)
239 return outlist
240
241 def __init__(self, filename, data):
242 self.index = [0]
243 self.lex_errors = 0
244 self.lines = data.split('\n')
245 self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
246 self.lexobj.filename = filename
247 self.lexobj.input(data)
248
249
250 #
251 # FileToTokens
252 #
253 # From a source file generate a list of tokens.
254 #
255 def FileToTokens(filename):
256 with open(filename, 'rb') as srcfile:
257 lexer = IDLLexer(filename, srcfile.read())
258 return lexer.GetTokens()
259
260
261 #
262 # TextToTokens
263 #
264 # From a text string generate a list of tokens.
265 #
266 def TextToTokens(text):
267 lexer = IDLLexer(None, text)
268 return lexer.GetTokens()
269
270
271 #
272 # TestSameText
273 #
274 # From a set of tokens, generate a new source text by joining with a
275 # single space. The new source is then tokenized and compared against the
sehr 2013/04/08 16:41:20 s/space/newline/ ?
noelallen1 2013/04/11 21:52:09 Done.
276 # old set.
277 #
278 def TestSameText(values):
279 tokens1 = values
280 tokens2 = [tok.value for tok in TextToTokens('\n'.join(values))]
281
282 count1 = len(tokens1)
283 count2 = len(tokens2)
284 if count1 != count2:
285 print "Size mismatch original %d vs %d\n" % (count1, count2)
286 if count1 > count2:
287 count1 = count2
288
289 failed = 0
290 for i in range(count1):
291 if tokens1[i] != tokens2[i]:
292 print "%d >>%s<< >>%s<<" % (i, tokens1[i], tokens2[i])
293 failed = failed + 1
294
295 return failed
296
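The round-trip idea can be exercised directly on token values (a sketch using TextToTokens only; whitespace between tokens is not significant to the lexer, and no string literals are involved, so values survive unchanged):

    values1 = [t.value for t in TextToTokens('interface Foo { };')]
    values2 = [t.value for t in TextToTokens('\n'.join(values1))]
    assert values1 == values2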
297
298 #
299 # TestExpect
300 #
301 # From a set of token pairs, verify the type field of the second matches
302 # the value of the first, so that:
303 # integer 123 float 1.1
304 # will generate a passing test, where the first token's value names the
305 # expected type ('integer') and the second token has the type 'integer'
306 # and the value 123.
307 #
308 def TestExpect(tokens):
309 count = len(tokens)
310 index = 0
311 errors = 0
312 while index < count:
313 expect_type = tokens[index].value
314 actual_type = tokens[index + 1].type
315 index += 2
316
317 if expect_type != actual_type:
318 sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
319 (expect_type, actual_type, tokens[index - 1].value))
320 errors += 1
321
322 return errors
323
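The pair format this consumes would look like the following (hypothetical contents of a file such as test_lexer/values.in):

    integer 123
    float 1.1

Each even-positioned token's value names the type expected of the token that follows it, so a quick self-check is:

    assert TestExpect(TextToTokens('integer 123 float 1.1')) == 0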
324
325 def Main(args):
326 parser = optparse.OptionParser()
327 parser.add_option('--test', help='Run tests.', action='store_true')
328
329 # If no arguments are provided, run tests.
330 if len(args) == 0:
331 args = ['--test', 'test_lexer/values.in', 'test_lexer/keywords.in']
sehr 2013/04/08 16:41:20 This is sort of ugly, baking in test input paths.
noelallen1 2013/04/11 21:52:09 This is fixed in a different CL On 2013/04/08 16:4
332
333 global options
334 options, filenames = parser.parse_args(args)
335
336 if not filenames:
337 parser.error('No files specified.')
338
339 for filename in filenames:
340 try:
341 tokens = FileToTokens(filename)
342 values = [tok.value for tok in tokens]
343 errors = 0
344
345 if options.test:
346 if TestSameText(values):
347 sys.stderr.write('Failed text match on %s.\n' % filename)
348 return -1
349 if TestExpect(tokens):
350 sys.stderr.write('Failed expected type match on %s.\n' % filename)
351 return -1
352
353 except lex.LexError as le:
354 sys.stderr.write('%s\n' % str(le))
355
356 return 0
357
358
359 if __name__ == '__main__':
360 sys.exit(Main(sys.argv[1:]))
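A typical invocation, assuming the test_lexer inputs referenced above exist relative to the working directory:

    python idl_lexer.py --test test_lexer/values.in test_lexer/keywords.in

Running the script with no arguments falls back to the same two test files.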