Chromium Code Reviews

#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses PLY to build a tokenizer which understands WebIDL
sehr
2013/04/08 16:41:20
s/is uses/uses/
noelallen1
2013/04/11 21:52:09
Done.
tokens. The type of the tokens that match WebIDL will be:
  1- The grammar name exactly: float, integer, string, identifier
  2- The grammar for single-character operators: '?', ':', '{', ...
sehr
2013/04/08 16:41:20
grammar
noelallen1
2013/04/11 21:52:09
Done.
  3- The uppercase version of the keyword's multicharacter string:
     ANY, ATTRIBUTE, BOOLEAN, ...

In addition, there are Pepper specific tokens for comments and inline blocks,
and multicharacter operators such as '>>', '<<', and '...'.

WebIDL, and the WebIDL regular expressions, can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import optparse
import os.path
import re
import sys

#
# Try to load the ply module; if not found, assume it is in the third_party
# directory, relative to ppapi.
#
try:
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Operators
    'ELLIPSIS',
    'LSHIFT',
    'RSHIFT',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Pepper Extras
    'INLINE',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are matched against this dictionary to determine
  # if the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'label' : 'LABEL',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'namespace' : 'NAMESPACE',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Add keywords
  for key in keywords:
    tokens.append(keywords[key])

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?<>'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_integer = r'-?(0[Xx][0-9A-Fa-f]+)|([0-9]+[uU]?)'
  t_float = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
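  # (For example, '0x7F' and '123u' match t_integer, while '1.5', '.5e-3'
  # and '2E+10' match t_float.)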

  # Special multi-character operators
  t_ELLIPSIS = r'\.\.\.'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending ('\n'); used to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
sehr
2013/04/08 16:41:20
what is the "?" there for?
noelallen1
2013/04/11 21:52:09
I will add a new test to the lexer to vet comments
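    # (The '?' in '(.|\n)*?' makes the match non-greedy, so a block comment
    # ends at the first '*/' rather than the last one in the file.)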
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.lexobj.lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the values recorded for the previous lines are unused. We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    # pylint: disable=W0612
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.lexobj.filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = '\t^'.expandtabs(pos)
sehr
2013/04/08 16:41:20
This is a "cute" way of generating pos * ' '. Is
noelallen1
2013/04/11 21:52:09
' ' * pos + '^' would also work.
So would using w
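    # ('\t^'.expandtabs(pos) turns the tab into pos spaces, placing the
    # caret under column pos; ' ' * pos + '^' is an equivalent spelling.)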
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
      self.FileLineMsg(line, msg),
      self.SourceLine(line, pos))

  def GetTokens(self):
    outlist = []
    while True:
      t = self.lexobj.token()
      if t is None:
        break
      outlist.append(t)
    return outlist

  def __init__(self, filename, data):
    self.index = [0]
    self.lex_errors = 0
    self.lines = data.split('\n')
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
    self.lexobj.filename = filename
    self.lexobj.input(data)


#
# FileToTokens
#
# From a source file generate a list of tokens.
#
def FileToTokens(filename):
  with open(filename, 'rb') as srcfile:
    lexer = IDLLexer(filename, srcfile.read())
    return lexer.GetTokens()


#
# TextToTokens
#
# From a text string generate a list of tokens.
#
def TextToTokens(text):
  lexer = IDLLexer(None, text)
  return lexer.GetTokens()

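
# A usage sketch (illustrative only; DemoTokenTypes is not part of this CL):
# tokenizing a small fragment shows the naming scheme from the module
# docstring. Keywords come back as their uppercase names ('interface' ->
# INTERFACE), single characters as themselves, and other matches as
# 'identifier', 'integer', 'float' or 'string'. A leading underscore is
# stripped, so '_interface' is an identifier whose value is 'interface'.
def DemoTokenTypes():
  for tok in TextToTokens('interface _interface { attribute long x123; };'):
    print "%s >>%s<<" % (tok.type, tok.value)
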

#
# TestSameText
#
# From a list of token values, generate a new source text by joining with a
# single newline. The new source is then tokenized and compared against the
sehr
2013/04/08 16:41:20
s/space/newline/ ?
noelallen1
2013/04/11 21:52:09
Done.
# old set.
#
def TestSameText(values):
  tokens2 = TextToTokens('\n'.join(values))
  values2 = [tok.value for tok in tokens2]

  count1 = len(values)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2:
      count1 = count2

  failed = 0
  for i in range(count1):
    if values[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values[i], values2[i])
      failed = failed + 1

  return failed


#
# TestExpect
#
# From a list of token pairs, verify that the type of the second token in
# each pair matches the value of the first, so that the input:
#   integer 123 float 1.1
# generates a passing test: 'integer' names the expected type of the
# token '123', and 'float' names the expected type of the token '1.1'.
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    expect_type = tokens[index].value
    actual_type = tokens[index + 1].type
    index += 2

    if expect_type != actual_type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (expect_type, actual_type, tokens[index - 1].value))
      errors += 1

  return errors

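
# A sketch of the expected-type input format (illustrative only; DemoExpect
# is not part of this CL, though the test_lexer/*.in files used by Main
# presumably follow the same pairing convention): each token is preceded
# by a token naming its expected type.
def DemoExpect():
  # Returns 0: 'identifier' and 'integer' correctly name the types of the
  # tokens 'foo' and '123' that follow them.
  return TestExpect(TextToTokens('identifier foo integer 123'))
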

def Main(args):
  parser = optparse.OptionParser()
  parser.add_option('--test', help='Run tests.', action='store_true')

  # If no arguments are provided, run tests.
  if len(args) == 0:
    args = ['--test', 'test_lexer/values.in', 'test_lexer/keywords.in']
sehr
2013/04/08 16:41:20
This is sort of ugly, baking in test input paths.
noelallen1
2013/04/11 21:52:09
This is fixed in a different CL
On 2013/04/08 16:4

  global options
  options, filenames = parser.parse_args(args)

  if not filenames:
    parser.error('No files specified.')

  for filename in filenames:
    try:
      tokens = FileToTokens(filename)
      values = [tok.value for tok in tokens]
      errors = 0

      if options.test:
        if TestSameText(values):
          sys.stderr.write('Failed text match on %s.\n' % filename)
          return -1
        if TestExpect(tokens):
          sys.stderr.write('Failed expected type match on %s.\n' % filename)
          return -1

    except lex.LexError as le:
      sys.stderr.write('%s\n' % str(le))

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
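
# Example invocation (a sketch; the file name 'idl_lexer.py' is assumed
# here, and the test_lexer/*.in paths are the defaults baked into Main):
#   python idl_lexer.py --test test_lexer/values.in test_lexer/keywords.in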