Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: src/lexer/lexer.re

Issue 26531003: Minimal push mode scanner around the experimental rules. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: moar fixes Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /*!re2c 1 #include <fcntl.h>
2 2 #include <stdio.h>
3 3 #include <stddef.h>
4 re2c:define:YYCTYPE = "uint8_t"; 4 #include <stdlib.h>
5 re2c:define:YYCURSOR = p; 5 #include <string.h>
6 re2c:yyfill:enable = 0; 6
7 re2c:yych:conversion = 0; 7 /*!types:re2c */
8 re2c:indent:top = 1; 8
9 9 #if defined(WIN32)
10 10
11 eof = [\000]; 11 typedef signed char int8_t;
12 any = [\000-\377]; 12 typedef signed short int16_t;
13 whitespace_char = [ \h\t\v\f\r]; 13 typedef signed int int32_t;
14 whitespace = whitespace_char+; 14
15 identifier_start = [$_\\a-zA-z]; 15 typedef unsigned char uint8_t;
16 identifier_char = [$_\\a-zA-z0-9]; 16 typedef unsigned short uint16_t;
17 number_start = [0-9]; 17 typedef unsigned int uint32_t;
18 line_terminator = [\n\r]+; 18
19 19 #else
20 20
21 <NORMAL> "(" { PUSH_T(LPAREN); } 21 #include <stdint.h>
22 <NORMAL> ")" { PUSH_T(RPAREN); } 22 #include <unistd.h>
23 <NORMAL> "[" { PUSH_T(LBRACK); } 23
24 <NORMAL> "]" { PUSH_T(RBRACK); } 24 #ifndef O_BINARY
25 <NORMAL> "{" { PUSH_T(LBRACE); } 25 #define O_BINARY 0
26 <NORMAL> "}" { PUSH_T(RBRACE); } 26 #endif
27 <NORMAL> ":" { PUSH_T(COLON); } 27
28 <NORMAL> ";" { PUSH_T(SEMICOLON); } 28 #endif
29 <NORMAL> "." { PUSH_T(PERIOD); } 29
30 <NORMAL> "?" { PUSH_T(CONDITIONAL); } 30 // ----------------------------------------------------------------------
31 <NORMAL> "++" { PUSH_T(INC); } 31 #define PUSH_EOS(T) { printf("got eos\n"); }
32 <NORMAL> "--" { PUSH_T(DEC); } 32 #define PUSH_T(T) { printf("got token %d\n", T); SKIP(); }
33 33 #define PUSH_STRING() { printf("got string\n"); SKIP(); }
34 34 #define PUSH_NUMBER() { printf("got number\n"); SKIP(); }
35 <NORMAL> "=" { PUSH_T(ASSIGN); } 35 #define PUSH_IDENTIFIER() { \
36 <NORMAL> "|=" { PUSH_T(ASSIGN_BIT_OR); } 36 printf("got identifier: "); \
37 <NORMAL> "^=" { PUSH_T(ASSIGN_BIT_XOR); } 37 size_t tokenSize = cursor-start; \
38 <NORMAL> "&=" { PUSH_T(ASSIGN_BIT_AND); } 38 fwrite(start, tokenSize, 1, stdout); \
39 <NORMAL> "<<=" { PUSH_T(ASSIGN_SHL); } 39 printf("\n"); \
40 <NORMAL> ">>=" { PUSH_T(ASSIGN_SAR); } 40 SKIP(); }
41 <NORMAL> ">>>=" { PUSH_T(ASSIGN_SHR); } 41 #define PUSH_LINE_TERMINATOR() { printf("got line terminator\n"); SKIP();}
42 <NORMAL> "+=" { PUSH_T(ASSIGN_ADD); } 42 #define TERMINATE_ILLEGAL() { return 1; }
43 <NORMAL> "-=" { PUSH_T(ASSIGN_SUB); } 43
44 <NORMAL> "*=" { PUSH_T(ASSIGN_MUL); } 44 #define TOKENS \
45 <NORMAL> "/=" { PUSH_T(ASSIGN_DIV); } 45 TOK(EOS) \
46 <NORMAL> "%=" { PUSH_T(ASSIGN_MOD); } 46 TOK(LPAREN) \
47 47 TOK(RPAREN) \
48 48 TOK(LBRACK) \
49 <NORMAL> "," { PUSH_T(COMMA); } 49 TOK(RBRACK) \
50 <NORMAL> "||" { PUSH_T(OR); } 50 TOK(LBRACE) \
51 <NORMAL> "&&" { PUSH_T(AND); } 51 TOK(RBRACE) \
52 <NORMAL> "|" { PUSH_T(BIT_OR); } 52 TOK(COLON) \
53 <NORMAL> "^" { PUSH_T(BIT_XOR); } 53 TOK(SEMICOLON) \
54 <NORMAL> "&" { PUSH_T(BIT_AND); } 54 TOK(PERIOD) \
55 <NORMAL> "<<" { PUSH_T(SHL); } 55 TOK(CONDITIONAL) \
56 <NORMAL> ">>" { PUSH_T(SAR); } 56 TOK(INC) \
57 <NORMAL> "+" { PUSH_T(ADD); } 57 TOK(DEC) \
58 <NORMAL> "-" { PUSH_T(SUB); } 58 TOK(ASSIGN) \
59 <NORMAL> "*" { PUSH_T(MUL); } 59 TOK(ASSIGN_BIT_OR) \
60 <NORMAL> "/" { PUSH_T(DIV); } 60 TOK(ASSIGN_BIT_XOR) \
61 <NORMAL> "%" { PUSH_T(MOD); } 61 TOK(ASSIGN_BIT_AND) \
62 62 TOK(ASSIGN_SHL) \
63 63 TOK(ASSIGN_SAR) \
64 <NORMAL> "==" { PUSH_T(EQ); } 64 TOK(ASSIGN_SHR) \
65 <NORMAL> "!=" { PUSH_T(NE); } 65 TOK(ASSIGN_ADD) \
66 <NORMAL> "===" { PUSH_T(EQ_STRICT); } 66 TOK(ASSIGN_SUB) \
67 <NORMAL> "!==" { PUSH_T(NE_STRICT); } 67 TOK(ASSIGN_MUL) \
68 <NORMAL> "<" { PUSH_T(LT); } 68 TOK(ASSIGN_DIV) \
69 <NORMAL> ">" { PUSH_T(GT); } 69 TOK(ASSIGN_MOD) \
70 <NORMAL> "<=" { PUSH_T(LTE); } 70 TOK(COMMA) \
71 <NORMAL> ">=" { PUSH_T(GTE); } 71 TOK(OR) \
72 72 TOK(AND) \
73 73 TOK(BIT_OR) \
74 <NORMAL> "!" { PUSH_T(NOT); } 74 TOK(BIT_XOR) \
75 <NORMAL> "~" { PUSH_T(BIT_NOT); } 75 TOK(BIT_AND) \
76 76 TOK(SHL) \
77 <NORMAL> line_terminator+ { PUSH_LINE_TERMINATOR(); } 77 TOK(SAR) \
78 78 TOK(ADD) \
79 <NORMAL> whitespace {} 79 TOK(SUB) \
80 80 TOK(MUL) \
81 81 TOK(DIV) \
82 <NORMAL> "//" :=> SINGLE_LINE_COMMENT 82 TOK(MOD) \
83 <NORMAL> "/*" :=> MULTILINE_COMMENT 83 TOK(EQ) \
84 <NORMAL> "<!--" :=> HTML_COMMENT 84 TOK(NE) \
85 85 TOK(EQ_STRICT) \
86 86 TOK(NE_STRICT) \
87 <NORMAL> ["] :=> STRING 87 TOK(LT) \
88 <NORMAL> ['] :=> SINGLE_QUOTE_STRING 88 TOK(GT) \
89 89 TOK(LTE) \
90 90 TOK(GTE) \
91 <NORMAL> identifier_start :=> IDENTIFIER 91 TOK(NOT) \
92 92 TOK(BIT_NOT) \
93 <NORMAL> number_start :=> NUMBER 93
94 94 // ----------------------------------------------------------------------
95 95 static const char *tokenNames[] =
96 <NORMAL> eof { PUSH_T(EOS); } 96 {
97 <NORMAL> any { TERMINATE_ILLEGAL(); } 97 #define TOK(x) #x,
98 98 TOKENS
99 99 #undef TOK
100 100 };
101 <STRING> "\\\"" {} 101
102 <STRING> ["] { PUSH_STRING(); TRANSITION(NORMAL); } 102 // ----------------------------------------------------------------------
103 <STRING> any {} 103 class PushScanner
104 104 {
105 105 public:
106 <SINGLE_QUOTE_STRING> "\\'" {} 106
107 <SINGLE_QUOTE_STRING> "'" { PUSH_STRING(); TRANSITION(NORMAL); } 107 enum Token
108 <SINGLE_QUOTE_STRING> any {} 108 {
109 109 #define TOK(x) x,
110 110 TOKENS
111 111 #undef TOK
112 <IDENTIFIER> identifier_char+ {} 112 };
113 <IDENTIFIER> any { PUSH_IDENTIFIER(); TRANSITION(NORMAL); } 113
114 114 private:
115 115
116 116 bool eof;
117 <SINGLE_LINE_COMMENT> line_terminator 117 int32_t state;
118 { PUSH_LINE_TERMINATOR(); TRANSITION(NORMAL); } 118 int32_t condition;
119 119
120 <SINGLE_LINE_COMMENT> any+ {} 120 uint8_t *limit;
121 121 uint8_t *start;
122 122 uint8_t *cursor;
123 123 uint8_t *marker;
124 <MULTILINE_COMMENT> [*][//] { PUSH_LINE_TERMINATOR(); TRANSITION(NORMAL); } 124
125 <MULTILINE_COMMENT> eof { TERMINATE_ILLEGAL(); } 125 uint8_t *buffer;
126 <MULTILINE_COMMENT> any+ {} 126 uint8_t *bufferEnd;
127 127
128 */ 128 uint8_t yych;
129 uint32_t yyaccept;
130
131 public:
132
133 // ----------------------------------------------------------------------
134 PushScanner()
135 {
136 limit = 0;
137 start = 0;
138 state = -1;
139 condition = EConditionNormal;
140 cursor = 0;
141 marker = 0;
142 buffer = 0;
143 eof = false;
144 bufferEnd = 0;
145 }
146
147 // ----------------------------------------------------------------------
148 ~PushScanner()
149 {
150 }
151
152 // ----------------------------------------------------------------------
153 void send(
154 Token token
155 )
156 {
157 size_t tokenSize = cursor-start;
158 const char *tokenName = tokenNames[token];
159 printf(
160 "scanner is pushing out a token of type %d (%s)",
161 token,
162 tokenName
163 );
164
165 if(token==EOS) putchar('\n');
166 else
167 {
168 size_t tokenNameSize = strlen(tokenNames[token]);
169 size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize);
170 for(size_t i=0; i<padSize; ++i) putchar(' ');
171 printf(" : ---->");
172
173 fwrite(
174 start,
175 tokenSize,
176 1,
177 stdout
178 );
179
180 printf("<----\n");
181 }
182 }
183
184 // ----------------------------------------------------------------------
185 uint32_t push(
186 const void *input,
187 ssize_t inputSize
188 )
189 {
190 printf(
191 "scanner is receiving a new data batch of length %ld\n"
192 "scanner continues with saved state = %d\n",
193 inputSize,
194 state
195 );
196
197 /*
198 * Data source is signaling end of file when batch size
199 * is less than maxFill. This is slightly annoying because
200 * maxFill is a value that can only be known after re2c does
201 * its thing. Practically though, maxFill is never bigger than
202 * the longest keyword, so given our grammar, 32 is a safe bet.
203 */
204 uint8_t null[64];
205 const ssize_t maxFill = 32;
206 if(inputSize<maxFill) // FIXME: do something about this!!!
207 {
208 eof = true;
209 input = null;
210 inputSize = sizeof(null);
211 memset(null, 0, sizeof(null));
212 }
213
214 /*
215 * When we get here, we have a partially
216 * consumed buffer which is in the following state:
217 * last v alid char last valid buffer spot
218 * v v
219 * +-------------------+-------------+---------------+-------------+---- ------------------+
220 * ^ ^ ^ ^ ^ ^
221 * buffer start marker cursor limit bufferEnd
222 *
223 * We need to stretch the buffer and concatenate the new chunk of input to it
224 *
225 */
226 size_t used = limit-buffer;
227 size_t needed = used+inputSize;
228 size_t allocated = bufferEnd-buffer;
229 if(allocated<needed)
230 {
231 size_t limitOffset = limit-buffer;
232 size_t startOffset = start-buffer;
233 size_t markerOffset = marker-buffer;
234 size_t cursorOffset = cursor-buffer;
235
236 buffer = (uint8_t*)realloc(buffer, needed);
237 bufferEnd = needed+buffer;
238
239 marker = markerOffset + buffer;
240 cursor = cursorOffset + buffer;
241 start = buffer + startOffset;
242 limit = limitOffset + buffer;
243 }
244 memcpy(limit, input, inputSize);
245 limit += inputSize;
246
247 // The scanner starts here
248 #define YYLIMIT limit
249 #define YYCURSOR cursor
250 #define YYMARKER marker
251 #define YYCTYPE uint8_t
252
253 #define SKIP() { start = cursor; YYSETCONDITION(EConditionNorma l); goto yy0; }
254 #define SEND(x) { send(x); SKIP(); }
255 #define YYFILL(n) { goto fill; }
256
257 #define YYGETSTATE() state
258 #define YYSETSTATE(x) { state = (x); }
259
260 #define YYGETCONDITION() condition
261 #define YYSETCONDITION(x) { condition = (x); }
262
263 start:
264
265 printf("Starting a round; state: %d, condition: %d\n", state, condition) ;
266
267 /*!re2c
268 re2c:indent:top = 1;
269 re2c:yych:conversion = 0;
270 re2c:condenumprefix = ECondition;
271 re2c:define:YYCONDTYPE = Condition;
272
273 eof = "\000";
274 any = [\000-\377];
275 whitespace_char = [ \t\v\f\r];
276 whitespace = whitespace_char+;
277 identifier_start = [$_\\a-zA-z];
278 identifier_char = [$_\\a-zA-z0-9];
279 number_start = [0-9];
280 number_char = [0-9\.e];
281 line_terminator = [\n\r]+;
282
283 <Normal> "(" { PUSH_T(LPAREN); }
284 <Normal> ")" { PUSH_T(RPAREN); }
285 <Normal> "[" { PUSH_T(LBRACK); }
286 <Normal> "]" { PUSH_T(RBRACK); }
287 <Normal> "{" { PUSH_T(LBRACE); }
288 <Normal> "}" { PUSH_T(RBRACE); }
289 <Normal> ":" { PUSH_T(COLON); }
290 <Normal> ";" { PUSH_T(SEMICOLON); }
291 <Normal> "." { PUSH_T(PERIOD); }
292 <Normal> "?" { PUSH_T(CONDITIONAL); }
293 <Normal> "++" { PUSH_T(INC); }
294 <Normal> "--" { PUSH_T(DEC); }
295
296 <Normal> "|=" { PUSH_T(ASSIGN_BIT_OR); }
297 <Normal> "^=" { PUSH_T(ASSIGN_BIT_XOR); }
298 <Normal> "&=" { PUSH_T(ASSIGN_BIT_AND); }
299 <Normal> "<<=" { PUSH_T(ASSIGN_SHL); }
300 <Normal> ">>=" { PUSH_T(ASSIGN_SAR); }
301 <Normal> ">>>=" { PUSH_T(ASSIGN_SHR); }
302 <Normal> "+=" { PUSH_T(ASSIGN_ADD); }
303 <Normal> "-=" { PUSH_T(ASSIGN_SUB); }
304 <Normal> "*=" { PUSH_T(ASSIGN_MUL); }
305 <Normal> "/=" { PUSH_T(ASSIGN_DIV); }
306 <Normal> "%=" { PUSH_T(ASSIGN_MOD); }
307
308 <Normal> "," { PUSH_T(COMMA); }
309 <Normal> "||" { PUSH_T(OR); }
310 <Normal> "&&" { PUSH_T(AND); }
311 <Normal> "|" { PUSH_T(BIT_OR); }
312 <Normal> "^" { PUSH_T(BIT_XOR); }
313 <Normal> "&" { PUSH_T(BIT_AND); }
314 <Normal> "<<" { PUSH_T(SHL); }
315 <Normal> ">>" { PUSH_T(SAR); }
316 <Normal> "+" { PUSH_T(ADD); }
317 <Normal> "-" { PUSH_T(SUB); }
318 <Normal> "*" { PUSH_T(MUL); }
319 <Normal> "/" { PUSH_T(DIV); }
320 <Normal> "%" { PUSH_T(MOD); }
321
322 <Normal> "===" { PUSH_T(EQ_STRICT); }
323 <Normal> "==" { PUSH_T(EQ); }
324 <Normal> "!==" { PUSH_T(NE_STRICT); }
325 <Normal> "!=" { PUSH_T(NE); }
326 <Normal> "<=" { PUSH_T(LTE); }
327 <Normal> ">=" { PUSH_T(GTE); }
328 <Normal> "<" { PUSH_T(LT); }
329 <Normal> ">" { PUSH_T(GT); }
330
331 <Normal> "=" { PUSH_T(ASSIGN); }
332
333 <Normal> "!" { PUSH_T(NOT); }
334 <Normal> "~" { PUSH_T(BIT_NOT); }
335
336 <Normal> line_terminator+ { PUSH_LINE_TERMINATOR(); }
337 <Normal> whitespace { SKIP();}
338
339 <Normal> "//" :=> SingleLineComment
340 <Normal> "/*" :=> MultiLineComment
341 <Normal> "<!--" :=> HtmlComment
342
343 <Normal> ["] :=> DoubleQuoteString
344 <Normal> ['] :=> SingleQuoteString
345
346 <Normal> identifier_start :=> Identifier
347 <Normal> number_start :=> Number
348
349 <Normal> eof { PUSH_EOS(); return 1; }
350 <Normal> any { TERMINATE_ILLEGAL(); }
351
352 <DoubleQuoteString> "\\\"" {}
353 <DoubleQuoteString> ["] { PUSH_STRING();}
354 <DoubleQuoteString> any {}
355
356 <SingleQuoteString> "\\'" {}
357 <SingleQuoteString> "'" { PUSH_STRING();}
358 <SingleQuoteString> any {}
359
360 <Identifier> identifier_char+ {}
361 <Identifier> any { PUSH_IDENTIFIER(); }
362
363 <SingleLineComment> line_terminator
364 { PUSH_LINE_TERMINATOR();}
365
366 <SingleLineComment> any+ {}
367
368 <MultiLineComment> [*][//] { PUSH_LINE_TERMINATOR();}
369 <MultiLineComment> eof { TERMINATE_ILLEGAL(); }
370 <MultiLineComment> any+ {}
371
372 <HtmlComment> any+ {}
373 <HtmlComment> eof { TERMINATE_ILLEGAL(); }
374 <HtmlComment> "-->" { }
375
376 <Number> number_char+ { }
377 <Number> any { PUSH_NUMBER(); }
378
379 */
380
381 fill:
382 ssize_t unfinishedSize = cursor-start;
383 printf(
384 "scanner needs a refill. Exiting for now with:\n"
385 " saved fill state = %d\n"
386 " unfinished token size = %ld\n",
387 state,
388 unfinishedSize
389 );
390
391 if(0<unfinishedSize && start<limit)
392 {
393 printf(" unfinished token is: ");
394 fwrite(start, 1, cursor-start, stdout);
395 putchar('\n');
396 }
397 putchar('\n');
398
399 /*
400 * Once we get here, we can get rid of
401 * everything before start and after limit.
402 */
403 if(eof==true) goto start;
404 if(buffer<start)
405 {
406 size_t startOffset = start-buffer;
407 memmove(buffer, start, limit-start);
408 marker -= startOffset;
409 cursor -= startOffset;
410 limit -= startOffset;
411 start -= startOffset;
412 }
413 return 0;
414 }
415 };
416
417 // ----------------------------------------------------------------------
418 int main(
419 int argc,
420 char **argv
421 )
422 {
423 // Parse cmd line
424 int input = 0;
425 if(1<argc)
426 {
427 input = open(argv[1], O_RDONLY | O_BINARY);
428 if(input<0)
429 {
430 fprintf(
431 stderr,
432 "could not open file %s\n",
433 argv[1]
434 );
435 exit(1);
436 }
437 }
438
439 /*
440 * Tokenize input file by pushing batches
441 * of data one by one into the scanner.
442 */
443 const size_t batchSize = 256;
444 uint8_t buffer[batchSize];
445 PushScanner scanner;
446 while(1)
447 {
448 ssize_t n = read(input, buffer, batchSize);
449 if (scanner.push(buffer, n)) {
450 printf("Scanner: illegal data\n");
451 return 1;
452 }
453 if(n<batchSize) break;
454 }
455 scanner.push(0, -1);
456 close(input);
457
458 // Done
459 return 0;
460 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698