#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based lexer."""

__author__ = ('robbyw@google.com (Robert Walker)',
              'ajp@google.com (Andy Perelson)')

from closure_linter.common import tokens

# Shorthand
Type = tokens.TokenType


class Tokenizer(object):
  """General purpose tokenizer.

  Attributes:
    mode: The latest mode of the tokenizer. This allows patterns to
        distinguish if they are mid-comment, mid-parameter list, etc.
    matchers: Dictionary of modes to sequences of matchers that define the
        patterns to check at any given time.
    default_types: Dictionary of modes to types, defining what type to give
        non-matched text when in the given mode. Defaults to Type.NORMAL.
  """

  def __init__(self, starting_mode, matchers, default_types):
    """Initialize the tokenizer.

    Args:
      starting_mode: Mode to start in.
      matchers: Dictionary of modes to sequences of matchers that define the
          patterns to check at any given time.
      default_types: Dictionary of modes to types, defining what type to give
          non-matched text when in the given mode. Defaults to Type.NORMAL.
    """
    self.__starting_mode = starting_mode
    self.matchers = matchers
    self.default_types = default_types

  def TokenizeFile(self, file):
    """Tokenizes the given file.

    Args:
      file: An iterable that yields one line of the file at a time.

    Returns:
      The first token in the file, or None if the file is empty.
    """
    # The current mode.
    self.mode = self.__starting_mode
    # The first token in the stream.
    self.__first_token = None
    # The last token added to the token stream.
    self.__last_token = None
    # The current line number.
    self.__line_number = 0

    for line in file:
      self.__line_number += 1
      self.__TokenizeLine(line)

    return self.__first_token

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new Token object (or subclass).

    Args:
      string: The string of input the token represents.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token. For instance, a
          function declaration may have a value called 'name' which captures
          the name of the function.

    Returns:
      The newly created Token object.
    """
    return tokens.Token(string, token_type, line, line_number, values,
                        line_number)

  def __TokenizeLine(self, line):
    """Tokenizes the given line.

    Args:
      line: The contents of the line.
    """
    string = line.rstrip('\n\r\f')
    line_number = self.__line_number
    self.__start_index = 0

    if not string:
      self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
      return

    normal_token = ''
    index = 0
    while index < len(string):
      for matcher in self.matchers[self.mode]:
        if matcher.line_start and index > 0:
          continue

        match = matcher.regex.match(string, index)

        if match:
          if normal_token:
            self.__AddToken(
                self.__CreateNormalToken(self.mode, normal_token, line,
                                         line_number))
            normal_token = ''

          # Add the match.
          self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
                                            line_number, match.groupdict()))

          # Change the mode to the correct one for after this match.
          self.mode = matcher.result_mode or self.mode

          # Advance the index past the matched text.
          index = match.end()

          break

      else:
        # The for loop finished without a match (no break executed), so add
        # the current character to the run of consecutive non-matching
        # characters; these runs become NORMAL tokens.
        if string:
          normal_token += string[index]
          index += 1

    if normal_token:
      self.__AddToken(
          self.__CreateNormalToken(self.mode, normal_token, line, line_number))

  def __CreateNormalToken(self, mode, string, line, line_number):
    """Creates a normal token.

    Args:
      mode: The current mode.
      string: The string to tokenize.
      line: The line of text.
      line_number: The line number within the file.

    Returns:
      A Token object, of the default type for the current mode.
    """
    token_type = Type.NORMAL
    if mode in self.default_types:
      token_type = self.default_types[mode]
    return self._CreateToken(string, token_type, line, line_number)

  def __AddToken(self, token):
    """Add the given token to the token stream.

    Args:
      token: The token to add.
    """
    # Store the first token, or point the previous token to this one.
    if not self.__first_token:
      self.__first_token = token
    else:
      self.__last_token.next = token

    # Establish the doubly linked list.
    token.previous = self.__last_token
    self.__last_token = token

    # Compute the character indices.
    token.start_index = self.__start_index
    self.__start_index += token.length
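

# Example usage, as a minimal sketch: the matcher objects are assumed to be
# closure_linter.common.matcher.Matcher instances, which provide the regex,
# type, result_mode, and line_start attributes read in __TokenizeLine; the
# mode name 'normal' and the input file name are illustrative only.
#
#   import re
#   from closure_linter.common import matcher
#   from closure_linter.common import tokenizer
#   from closure_linter.common import tokens
#
#   matchers = {
#       'normal': [matcher.Matcher(re.compile(r'\s+'),
#                                  tokens.TokenType.WHITESPACE)],
#   }
#   t = tokenizer.Tokenizer('normal', matchers, {})
#   token = t.TokenizeFile(open('example.js'))
#   while token:  # Walk the doubly linked token list.
#     print token.line_number, token.type, repr(token.string)
#     token = token.next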