OLD | NEW |
| (Empty) |
1 /* | |
2 ****************************************************************************** | |
3 * Copyright (C) 2005-2007, International Business Machines Corporation and * | |
4 * others. All Rights Reserved. * | |
5 ****************************************************************************** | |
6 */ | |
7 | |
8 #include <stdio.h> | |
9 #include <string.h> | |
10 #include <stdlib.h> | |
11 #include <time.h> | |
12 | |
13 #include "wbnf.h" | |
14 | |
15 // Most of this code is meant to test the test code. It's a self test. | |
16 // Normally this isn't run. | |
17 #define TEST_WBNF_TEST 0 | |
18 | |
19 /////////////////////////////////////////////////////////// | |
20 // | |
21 // Constants and the most basic helper classes | |
22 // | |
23 | |
24 static const char DIGIT_CHAR[] = "0123456789"; | |
25 static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0}; | |
26 static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV
WXYZ"; | |
27 static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; | |
28 | |
29 static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){ | |
30 const char * p = list; | |
31 for (;*p != 0 && *p != c; p++); | |
32 return *p?TRUE:FALSE; | |
33 } | |
34 static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);} | |
35 static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);} | |
36 static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);} | |
37 static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);} | |
38 | |
39 | |
40 | |
41 /////////////////////////////////////////////////////////// | |
42 // | |
43 // Helper classes | |
44 // | |
45 | |
46 class Buffer_byte{ | |
47 // Utility class, can be treated as an auto expanded array. no boundary check. | |
48 | |
49 typedef char byte; | |
50 byte * start; | |
51 byte * current; | |
52 int buffer_size; // size unit is byte | |
53 public: | |
54 inline int content_size(){return current - start;} // size unit is byte | |
55 | |
56 private: | |
57 inline void expand(int add_size = 100){ // size unit is byte | |
58 int new_size = buffer_size + add_size; | |
59 | |
60 int cs_snap = content_size(); | |
61 start = (byte *) realloc(start, new_size); // may change the value of
start | |
62 current = start + cs_snap; | |
63 | |
64 memset(current, 0, add_size); | |
65 buffer_size = new_size; | |
66 } | |
67 | |
68 inline void expand_to(int size){ | |
69 int r = size - buffer_size; | |
70 if (r > 0) { | |
71 expand(r); // simply expand, no block alignment | |
72 } | |
73 } | |
74 Buffer_byte(const Buffer_byte &); | |
75 Buffer_byte & operator = (const Buffer_byte &); | |
76 public: | |
77 Buffer_byte():start(NULL),current(start),buffer_size(0){ | |
78 expand(); | |
79 } | |
80 ~Buffer_byte(){ | |
81 free(start); | |
82 } | |
83 | |
84 inline void reset(){ | |
85 start != NULL ? memset(start, 0, buffer_size) : 0; | |
86 current = start; | |
87 } | |
88 | |
89 // Using memory copy method to append a C array to buffer, | |
90 inline void append(const void * c, int size){ // size unit is byte | |
91 expand_to(content_size() + size) ; | |
92 memcpy(current, c, size); | |
93 current = current + size; | |
94 } | |
95 | |
96 byte * buffer(){ | |
97 return start; | |
98 } | |
99 }; | |
100 | |
101 /* | |
102 The class(es) try to work as bulid-in array, so it overloads these two operato
rs | |
103 operator type *(); | |
104 type & operator[]; | |
105 The first is used to auto type convert, the latter is used to select member. | |
106 | |
107 A small trick is the class does not overload the address-of operator. This | |
108 behavior is different from bulid-in array, but it give us the opportunity | |
109 to get the address of the class itself. | |
110 */ | |
111 //template<typename type> | |
112 // class BUFFER{ | |
113 // typedef BUFFER name; | |
114 #define BUFFER(type, name)\ | |
115 class name {\ | |
116 private:\ | |
117 Buffer_byte buf;\ | |
118 public:\ | |
119 name & reset() {buf.reset(); return *this;}\ | |
120 name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\ | |
121 name & append_array(const type * p, int size) {buf.append(p, sizeof(type
)*size); return *this;}\ | |
122 type & operator [] (int i) { return ((type *) buf.buffer())[i];}\ | |
123 operator type *(){return (type *) buf.buffer();} \ | |
124 int content_size(){return buf.content_size() / sizeof(type);}\ | |
125 } | |
126 | |
127 | |
128 class Pick{ | |
129 /* The Pick is the basic language generator element*/ | |
130 public: | |
131 // generate a string accroding the syntax | |
132 // Return a null-terminated c-string. The buffer is owned by callee. | |
133 virtual const char* next() = 0; | |
134 virtual ~Pick(){}; | |
135 }; | |
136 | |
137 //typedef BUFFER<char> Buffer_char; | |
138 //typedef BUFFER<int> Buffer_int; | |
139 //typedef BUFFER<Pick *> Buffer_pPick; | |
140 BUFFER(char, Buffer_char); | |
141 BUFFER(int, Buffer_int); | |
142 BUFFER(Pick *, Buffer_pPick); | |
143 | |
144 class SymbolTable{ | |
145 /* Helper class. | |
146 * It's a mapping table between 'variable name' and its 'active Pick object' | |
147 */ | |
148 private: | |
149 Buffer_char name_buffer; // var names storage space | |
150 | |
151 Buffer_int names; // points to name (offset in name_buffer) | |
152 Buffer_pPick refs; // points to Pick | |
153 | |
154 int get_index(const char *const var_name){ | |
155 int len = names.content_size(); | |
156 for (int i=0; i< len; i++){ | |
157 if (strcmp(var_name, name_buffer + names[i]) == 0){ | |
158 return i; | |
159 } | |
160 } | |
161 return -1; | |
162 } | |
163 | |
164 public: | |
165 enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF}; | |
166 | |
167 RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NUL
L /*[out] Pick* */){ | |
168 if (!var_name) return EMPTY; // NULL name | |
169 | |
170 int i = get_index(var_name); | |
171 if (i == -1){ | |
172 return NO_VAR; // new name | |
173 } | |
174 if (!refs[i]){ // exist name, no ref | |
175 return NO_REF; | |
176 } else { | |
177 if (ref) { | |
178 *ref = refs[i]; | |
179 } | |
180 return HAS_REF; // exist name, has ref | |
181 } | |
182 } | |
183 | |
184 void put(const char *const var_name, Pick *const var_ref = NULL){ | |
185 int i = get_index(var_name); | |
186 switch(find(var_name)){ | |
187 case EMPTY: // NULL name | |
188 break; | |
189 case NO_VAR: // new name | |
190 int offset; | |
191 offset = name_buffer.content_size(); | |
192 name_buffer.append_array(var_name, strlen(var_name) + 1); | |
193 names.append(offset); | |
194 refs.append(var_ref); | |
195 break; | |
196 case NO_REF: // exist name, no ref | |
197 refs[i] = var_ref; // link definition with variable | |
198 break; | |
199 case HAS_REF: // exist name, has ref | |
200 if (var_ref){ | |
201 refs[i] = var_ref; | |
202 } | |
203 break; | |
204 default: | |
205 ; // ASSERT(FALSE); | |
206 } | |
207 return; | |
208 } | |
209 | |
210 UBool is_complete(){ | |
211 int n = names.content_size(); | |
212 for (int i=0; i<n; ++i){ | |
213 if (refs[i] == NULL){ | |
214 return FALSE; | |
215 } | |
216 } | |
217 return TRUE; | |
218 } | |
219 | |
220 void reset(){ | |
221 names.reset(); | |
222 name_buffer.reset(); | |
223 | |
224 // release memory here | |
225 int s = refs.content_size(); | |
226 for (int i=0; i < s; i++){ | |
227 delete refs[i]; // TOFIX: point alias/recursion problem | |
228 } | |
229 refs.reset(); | |
230 } | |
231 | |
232 ~SymbolTable(){ | |
233 reset(); | |
234 } | |
235 }; | |
236 | |
237 | |
238 /* | |
239 // Document of class Escaper | |
240 // | |
241 // ATTENTION: | |
242 // From http://icu-project.org/userguide/Collate_Customization.html. | |
243 // We get the precedence of escape/quote operations | |
244 // | |
245 // (highest) 1. backslash \ | |
246 // 2. two single quotes '' | |
247 // 3. quoting ' ' | |
248 // | |
249 // ICU Collation should accept following as the same string. | |
250 // | |
251 // 1) 'ab'c _ | |
252 // 2) a\bc \ | |
253 // 3) a'b'\c |- They are equal. | |
254 // 4) abc _/ | |
255 // | |
256 // From "two single quotes", we have following deductions | |
257 // D1. empty quoting is illgal. (obviously) | |
258 // D2. no contact operation between two quotings | |
259 // '.''.' is not .. it is .'. | |
260 // D3. "two single quotes" cannot contact two quoting simultaneously | |
261 // '..''''.' is not ..'. it is ..''. | |
262 // NOTICE: | |
263 // "two single quotes" can contact before one quoting | |
264 // '''.' is '. | |
265 // "two single quotes" can literally contact after one quoting | |
266 // But, from syntax, it's one quoting including a "two single quotes" | |
267 // '.''' is .' | |
268 // D4. "two single quotes" cannot solely be included in quoting | |
269 // '''' is not ' it is '' | |
270 // NOTICE: These are legal | |
271 // '.''.' is .'. | |
272 // '.''' is .' | |
273 // | |
274 // dicision | |
275 // /\ | |
276 // /__\ | |
277 // output buffer input buffer | |
278 // | |
279 // To make our dicision (within an atom operation) without caring input and outp
ut buffer, | |
280 // following calling pattern (within an atom operation) shall be avoided | |
281 // | |
282 // P1 open_quoting() then close_quoting() (direct violation) D1 | |
283 // P2 close_quoting() then open_quoting() (direct violation) D2 | |
284 // P3 empty open_quoting() (indirect violation) D1, D4 | |
285 // P4 empty close_quoting() (indirect violation) D2, D3 | |
286 // P5 open_quoting() then two single quotes (indirect violation) D4 | |
287 // P6 close_quoting() then two single quotes (indirect violation) D3 | |
288 // | |
289 // two single quotes escaping will not open_ or close_ quoting() | |
290 // The choice will not lose some quoing forms. | |
291 // | |
292 // For open_quoting(), | |
293 // we may get this form quoting ''' P5 | |
294 // It may raise a bug ''''x | |
295 // If we expect | |
296 // '''.' let the next char open the quoting | |
297 // '.''.' the quoting is already opened by preceding char | |
298 // | |
299 // For close_quoting() | |
300 // we will get this form quoting '.''' P6 | |
301 // It may raise a bug '.''''.' | |
302 // If we expect | |
303 // '.'''\. let the next char close the quoting | |
304 // '.''''.' the expectation is wrong! using '.'\''.' instead | |
305 // | |
306 // It's a hard work to re-adjust generation opportunity for various escaping for
m. | |
307 // We just simply ignore it. | |
308 */ | |
309 class Escaper{ | |
310 public: | |
311 enum CHOICE {YES, NO, RAND}; | |
312 enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC}; | |
313 private: | |
314 class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class | |
315 private: | |
316 const CHOICE tag; | |
317 public: | |
318 Bool(CHOICE flag=RAND):tag(flag){} | |
319 operator UBool() { // conversion operator | |
320 return tag == RAND ? rand()%2 : tag == YES; | |
321 //if (tag == RAND){ | |
322 // return rand()%2 == 1; | |
323 //} else { | |
324 // return tag == YES ? TRUE : FALSE; | |
325 //} | |
326 } | |
327 }; | |
328 public: | |
329 Escaper(CHOICE escapeLiteral = RAND, | |
330 CHOICE twoQuotesEscape = RAND, | |
331 ESCAPE_FORM escapeForm = RAND_ESC): | |
332 escape_form(escapeForm), | |
333 escape_literal(escapeLiteral), | |
334 two_quotes_escape(twoQuotesEscape), | |
335 is_quoting(FALSE){} | |
336 private: | |
337 Buffer_char str; | |
338 ESCAPE_FORM escape_form; | |
339 Bool escape_literal; | |
340 Bool two_quotes_escape; | |
341 UBool quote_escape; | |
342 UBool bslash_escape; | |
343 UBool is_quoting; | |
344 | |
345 void set_options(){ | |
346 ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : esc
ape_form; | |
347 switch (t){ | |
348 case BSLASH_ONLY : | |
349 bslash_escape = TRUE; quote_escape = FALSE; break; | |
350 case QUOTE_ONLY: | |
351 bslash_escape = FALSE;quote_escape = TRUE; break; | |
352 case QUOTE_AND_BSLAH: | |
353 bslash_escape = TRUE; quote_escape = TRUE; break; | |
354 default: | |
355 ;// error | |
356 } | |
357 } | |
358 | |
359 void reset(){ | |
360 str.reset(); | |
361 is_quoting = FALSE; | |
362 } | |
363 | |
364 inline void open_quoting(){ | |
365 if(is_quoting){ | |
366 // do nothing | |
367 } else { | |
368 str.append('\''); | |
369 is_quoting = TRUE; | |
370 } | |
371 } | |
372 inline void close_quoting(){ | |
373 if(is_quoting){ | |
374 str.append('\''); | |
375 is_quoting = FALSE; | |
376 } else { | |
377 // do nothing | |
378 } | |
379 } | |
380 | |
381 // str [in] null-terminated c-string | |
382 void append(const char * strToAppend){ | |
383 for(;*strToAppend != 0; strToAppend++){ | |
384 append(*strToAppend); | |
385 } | |
386 } | |
387 | |
388 inline void append(const char c){ | |
389 set_options(); | |
390 | |
391 if (c == '\\'){ | |
392 quote_escape ? open_quoting() : close_quoting(); | |
393 //bslash_escape always true here | |
394 str.append('\\'); | |
395 str.append('\\'); | |
396 } else if (c == '\''){ | |
397 if (two_quotes_escape){ // quoted using two single quotes | |
398 // See documents in anonymous.design | |
399 str.append('\''); | |
400 str.append('\''); | |
401 } else{ | |
402 quote_escape ? open_quoting() : close_quoting(); | |
403 //bslash_escape always true here | |
404 str.append('\\'); | |
405 str.append('\''); | |
406 } | |
407 } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){ | |
408 quote_escape ? open_quoting() : close_quoting(); | |
409 if (bslash_escape) str.append('\\'); | |
410 str.append(c); | |
411 } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as
literal | |
412 if (escape_literal){ | |
413 quote_escape ? open_quoting() : close_quoting(); | |
414 if (bslash_escape) str.append('\\'); | |
415 str.append(c); | |
416 } else { | |
417 close_quoting(); | |
418 str.append(c); | |
419 } | |
420 } | |
421 } | |
422 | |
423 public: | |
424 // Return a null-terminate c-string. The buffer is owned by callee. | |
425 char * operator()(const char * literal /*c-string*/){ | |
426 str.reset(); | |
427 for(;*literal != 0; literal++){ | |
428 append(*literal); | |
429 } | |
430 close_quoting(); // P4 exception, to close whole quoting | |
431 return str; | |
432 } | |
433 }; | |
434 | |
435 class WeightedRand{ | |
436 // Return a random number in [0, size) | |
437 // Every number has different chance (aka weight) to be selected. | |
438 private: | |
439 Buffer_int weights; | |
440 double total; | |
441 WeightedRand(const WeightedRand &); | |
442 WeightedRand & operator = (const WeightedRand &); | |
443 public: | |
444 WeightedRand(Buffer_int * weight_list = NULL, int size = 0){ | |
445 if ( weight_list == NULL){ | |
446 for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT); | |
447 } else { | |
448 int s = weight_list->content_size(); | |
449 if (s < size){ | |
450 weights.append_array( (*weight_list),s); | |
451 for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT); | |
452 } else { // s >= size | |
453 weights.append_array( (*weight_list),size); | |
454 } | |
455 } | |
456 total = 0; | |
457 int c = weights.content_size(); | |
458 for (int i=0; i<c; ++i){ | |
459 total += weights[i]; | |
460 } | |
461 } | |
462 | |
463 void append(int weight){ | |
464 weights.append(weight); | |
465 total += weight; | |
466 } | |
467 | |
468 // Give a random number with the consideration of weight. | |
469 // Every random number is associated with a weight. | |
470 // It identifies the chance to be selected, | |
471 // larger weight has more chance to be selected. | |
472 // | |
473 // | |
474 // ______________________ every slot has equal chance | |
475 // | |
476 // [____][_][___][______] each item has different slots, hence different
chance | |
477 // | |
478 // | |
479 // The algorithms to generate the number is illustrated by preceding figure
. | |
480 // First, a slot is selected by rand(). Then we translate the slot to corre
sponding item. | |
481 // | |
482 int next(){ | |
483 // get a random in [0,1] | |
484 double reference_mark = (double)rand() / (double)RAND_MAX; | |
485 | |
486 // get the slot's index, 0 <= mark <= total; | |
487 double mark = total * reference_mark; | |
488 | |
489 // translate the slot to corresponding item | |
490 int i=0; | |
491 for (;;){ | |
492 mark -= weights[i]; // 0 <= mark <= total | |
493 if (mark <= 0) | |
494 break; | |
495 i++; | |
496 } | |
497 return i; | |
498 } | |
499 }; | |
500 | |
501 /////////////////////////////////////////////////////////// | |
502 // | |
503 // The parser result nodes | |
504 // | |
505 | |
506 class Literal : public Pick { | |
507 public: | |
508 virtual const char* next(){ | |
509 return str; | |
510 } | |
511 Literal(const char * s /*c-string*/){ | |
512 str.append_array(s, strlen(s) + 1); | |
513 } | |
514 private: | |
515 Buffer_char str; //null-terminated c-string | |
516 }; | |
517 | |
518 class Variable : public Pick { | |
519 public: | |
520 Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){ | |
521 this->var_name.append_array(varName, strlen(varName) + 1); | |
522 if ((symbol_table = symbols)){ | |
523 symbol_table->put(varName, varRef); | |
524 } | |
525 } | |
526 | |
527 operator const char *(){ | |
528 return var_name; | |
529 } | |
530 | |
531 virtual const char* next(){ | |
532 if (symbol_table){ | |
533 Pick * var_ref = NULL; | |
534 symbol_table->find(var_name, &var_ref); | |
535 if (var_ref) { | |
536 return var_ref->next(); | |
537 } | |
538 } | |
539 return ""; // dumb string | |
540 } | |
541 private: | |
542 Buffer_char var_name; | |
543 SymbolTable * symbol_table; | |
544 }; | |
545 | |
546 class Quote : public Pick{ | |
547 public: | |
548 Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ON
LY){ | |
549 } | |
550 virtual const char* next(){ | |
551 return e(item.next()); | |
552 } | |
553 private: | |
554 Pick & item; | |
555 Buffer_char str; | |
556 Escaper e; | |
557 }; | |
558 | |
559 | |
560 class Morph : public Pick{ | |
561 /* | |
562 The difference between morph and an arbitrary random string is that | |
563 a morph changes slowly. When we build collation rules, for example, | |
564 it is a much better test if the strings we use are all in the same | |
565 'neighborhood'; they share many common characters. | |
566 */ | |
567 public: | |
568 Morph(Pick & base):item(base){} | |
569 | |
570 virtual const char* next(){ | |
571 current.reset(); | |
572 const char * s = item.next(); | |
573 current.append_array(s, strlen(s) + 1); | |
574 if (last.content_size() == 0) { | |
575 str.reset(); | |
576 last.reset(); | |
577 str.append_array(current, current.content_size()); | |
578 last.append_array(current, current.content_size()); | |
579 } else { | |
580 morph(); | |
581 } | |
582 return str; | |
583 } | |
584 private: | |
585 Pick & item; | |
586 Buffer_char str; | |
587 Buffer_char last; | |
588 Buffer_char current; | |
589 | |
590 char * p_last; | |
591 char * p_curr; | |
592 | |
593 void copy_curr(){ | |
594 if (*p_curr) { | |
595 str.append(*p_curr); | |
596 p_curr++; | |
597 } | |
598 } | |
599 | |
600 void copy_last(){ | |
601 if (*p_last) { | |
602 str.append(*p_last); | |
603 p_last++; | |
604 } | |
605 } | |
606 | |
607 // copy 0, 1, or 2 character(s) to str | |
608 void copy(){ | |
609 static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5); | |
610 | |
611 switch (wr.next()){ | |
612 case 0: // copy last -- has 10 times chance than others | |
613 copy_last(); | |
614 break; | |
615 case 1: // copy both | |
616 copy_curr(); | |
617 copy_last(); | |
618 break; | |
619 case 2: // copy both | |
620 copy_last(); | |
621 copy_curr(); | |
622 break; | |
623 case 3: | |
624 copy_curr(); | |
625 break; | |
626 case 4: // copy nothing | |
627 break; | |
628 default: | |
629 // ASSERT(FALSE); | |
630 ; | |
631 } | |
632 } | |
633 | |
634 void morph(void){ | |
635 int min = strlen(last); | |
636 int max = strlen(current); | |
637 if (min > max){ | |
638 int temp = min; | |
639 min = max; | |
640 max = temp; | |
641 } | |
642 | |
643 int len = min + rand()%(max - min + 1); // min + [0, diff] | |
644 p_curr = current; | |
645 p_last = last; | |
646 str.reset(); | |
647 | |
648 for (; str.content_size()<len && *p_curr && *p_last;){ | |
649 copy(); // copy 0, 1, or 2 character(s) to str | |
650 } | |
651 | |
652 if (str.content_size() == len) { | |
653 str.append(0); | |
654 final(); | |
655 return; | |
656 } | |
657 | |
658 if (str.content_size() > len) { // if the last copy copied two character
s | |
659 str[len]=0; | |
660 final(); | |
661 return; | |
662 } | |
663 | |
664 // str.content_size() < len | |
665 if (*p_last) { | |
666 for (; str.content_size() < len; copy_last()); | |
667 } else if (*p_curr){ | |
668 for (; str.content_size() < len; copy_curr()); | |
669 } | |
670 | |
671 int last_len = last.content_size(); | |
672 for (;str.content_size() < len;){ | |
673 str.append(last[rand()%last_len]); | |
674 } | |
675 str.append(0); | |
676 final(); | |
677 } | |
678 | |
679 void final(){ | |
680 last.reset(); | |
681 last.append_array(current, current.content_size()); | |
682 } | |
683 }; | |
684 | |
685 class Sequence : public Pick { | |
686 public: | |
687 virtual const char* next(){ | |
688 str.reset(); | |
689 int s = items.content_size(); | |
690 for(int i=0; i < s; i++){ | |
691 const char * t = items[i]->next(); | |
692 str.append_array(t, strlen(t)); | |
693 } | |
694 str.append(0); // terminal null | |
695 return str; | |
696 } | |
697 | |
698 void append (Pick * node){ | |
699 items.append(node); | |
700 } | |
701 | |
702 virtual ~Sequence(){ | |
703 int s = items.content_size(); | |
704 for(int i=0; i < s; i++){ | |
705 //How can assure the item is got from heap? | |
706 //Let's assume it. | |
707 delete items[i]; // TOFIX: point alias/recursion problem | |
708 items[i] = NULL; | |
709 } | |
710 } | |
711 private: | |
712 Buffer_pPick items; | |
713 Buffer_char str; //null-terminated c-string | |
714 }; | |
715 | |
716 class Repeat : public Pick { | |
717 private: | |
718 Pick * item; | |
719 Buffer_char str; | |
720 WeightedRand wr; | |
721 int min; | |
722 int max; | |
723 int select_a_count(){ | |
724 return min + wr.next(); | |
725 } | |
726 public: | |
727 virtual const char* next(){ | |
728 str.reset(); | |
729 int c = select_a_count(); | |
730 for(int i=0; i< c; i++){ | |
731 const char * t = item->next(); | |
732 str.append_array(t, strlen(t)); | |
733 } | |
734 str.append(0); | |
735 return str; | |
736 } | |
737 | |
738 Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights
= NULL): | |
739 wr(weights, maxCount-minCount +1) { | |
740 this->item = base; | |
741 this->min = minCount; | |
742 this->max = maxCount; | |
743 } | |
744 virtual ~Repeat(){ | |
745 delete item; // TOFIX: point alias/recursion problem | |
746 item = NULL; | |
747 } | |
748 }; | |
749 | |
750 | |
751 class Alternation : public Pick { | |
752 public: | |
753 virtual const char* next(){ | |
754 str.reset(); | |
755 int i = wr.next(); | |
756 const char * t = items[i]->next(); | |
757 str.append_array(t, strlen(t) + 1); | |
758 return str; | |
759 } | |
760 virtual ~Alternation(){ | |
761 int s = items.content_size(); | |
762 for(int i=0; i < s; i++){ | |
763 delete items[i]; // TOFIX: point alias/recursion problem | |
764 items[i] = NULL; | |
765 } | |
766 } | |
767 | |
768 Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){ | |
769 items.append(node); | |
770 wr.append(weight); | |
771 return *this; | |
772 } | |
773 private: | |
774 Buffer_pPick items; | |
775 Buffer_char str; // null-terminated c-string | |
776 WeightedRand wr; | |
777 }; | |
778 | |
779 /////////////////////////////////////////////////////////// | |
780 // | |
781 // The parser | |
782 // | |
783 | |
784 enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LB
RACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT}; | |
785 | |
786 class Scanner{ | |
787 friend int DumpScanner(Scanner & s, UBool dumb); | |
788 private: | |
789 const char * source; | |
790 const char * working; | |
791 const char * history; // for debug | |
792 enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLA
SH, IN_BSLASH, IN_STRING, DONE}; | |
793 StateType state; | |
794 void terminated(TokenType t){ | |
795 working--; // return the peeked character | |
796 tokenType = t; | |
797 token.append(0); // close buffer | |
798 state = DONE; | |
799 } | |
800 public: | |
801 // the buffer of "source" is owned by caller | |
802 Scanner(const char *src/*[in] c-string*/ = NULL):source(src){ | |
803 working = src; | |
804 history = working; | |
805 state = DONE; | |
806 tokenType = ERROR; | |
807 } | |
808 | |
809 //void setSource(const char *const src /*[in] c-string*/){ | |
810 // *(&const_cast<const char *>(source)) = src; | |
811 //} | |
812 | |
813 Buffer_char token; | |
814 TokenType tokenType; | |
815 | |
816 TokenType getNextToken(){ | |
817 token.reset(); | |
818 state = START; | |
819 history = working; // for debug | |
820 while (state != DONE){ | |
821 char c = *working++; | |
822 if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE,
IN_ESCAPE | |
823 terminated(ERROR); | |
824 break; // while | |
825 } | |
826 switch(state){ | |
827 case START: | |
828 tokenType = ERROR; | |
829 switch(c){ | |
830 case '?' : tokenType = QUESTION; break; | |
831 case '*' : tokenType = STAR; break; | |
832 case '+' : tokenType = PLUS; break; | |
833 case '{' : tokenType = LBRACE; break; | |
834 case '}' : tokenType = RBRACE; break; | |
835 case '(' : tokenType = LPAR; break; | |
836 case ')' : tokenType = RPAR; break; | |
837 case ';' : tokenType = SEMI; break; | |
838 case '=' : tokenType = EQ; break; | |
839 case ',' : tokenType = COMMA; break; | |
840 case '|' : tokenType = BAR; break; | |
841 case '@' : tokenType = AT; break; | |
842 case '~' : tokenType = WAVE; break; | |
843 case '%' : tokenType = PERCENT; break; | |
844 case 0 : tokenType = STREAM_END; working-- /*avoid bu
ffer overflow*/; break; | |
845 } | |
846 if (tokenType != ERROR){ | |
847 token.append(c); | |
848 token.append(0); | |
849 state = DONE; | |
850 break; // START | |
851 } | |
852 switch(c){ | |
853 case '$' : state = IN_VAR_FIRST; token.append(c); break
; | |
854 case '\'' : state = IN_QUOTE; break; | |
855 case '\\' : state = IN_BSLASH; break; | |
856 default: | |
857 if (isWhiteSpace(c)){ // state = START; //do no
thing | |
858 } else if (isDigit(c)){ state = IN_NUM; token
.append(c); | |
859 } else if (isAlphabet(c)){ state = IN_STRING; token
.append(c); | |
860 } else {terminated(ERROR);} | |
861 } | |
862 break;//START | |
863 case IN_NUM: | |
864 if (isDigit(c)){ | |
865 token.append(c); | |
866 } else { | |
867 terminated(NUMBER); | |
868 } | |
869 break;//IN_NUM | |
870 case IN_VAR_FIRST: | |
871 if (isAlphabet(c)){ | |
872 token.append(c); | |
873 state = IN_VAR; | |
874 } else { | |
875 terminated(ERROR); | |
876 } | |
877 break; // IN_VAR_FISRT | |
878 case IN_VAR: | |
879 if (isAlphabet(c) || isDigit(c)){ | |
880 token.append(c); | |
881 } else { | |
882 terminated(VAR); | |
883 } | |
884 break;//IN_VAR | |
885 case IN_STRING: | |
886 // About the scanner's behavior for STRING, AT, and ESCAPE: | |
887 // All of them can be contacted with each other. | |
888 // This means the scanner will eat up as much as possible st
rings | |
889 // (STRING, AT, and ESCAPE) at one time, with no regard of
their | |
890 // combining sequence. | |
891 // | |
892 if (c == '\''){ | |
893 state = IN_QUOTE; // the first time we see single quote | |
894 } else if (c =='\\'){ // back slash character | |
895 state = IN_BSLASH; | |
896 } else if (isAlphabet(c) || isDigit(c)){ | |
897 token.append(c); | |
898 } else{ | |
899 terminated(STRING); | |
900 } | |
901 break;//IN_STRING | |
902 case IN_QUOTE: | |
903 if (c == '\''){ // the second time we see single quote | |
904 state = IN_STRING; // see document in IN_STRING | |
905 } else if ( c== '\\') { // backslah escape in quote | |
906 state = IN_QUOTE_BSLASH; | |
907 } else { | |
908 token.append(c); // eat up everything, includes back sl
ash | |
909 } | |
910 break;//IN_QUOTE | |
911 case IN_QUOTE_BSLASH: | |
912 case IN_BSLASH: | |
913 switch (c){ | |
914 case 'n' : token.append('\n'); break; | |
915 case 'r' : token.append('\r'); break; | |
916 case 't' : token.append('\t'); break; | |
917 case '\'' : token.append('\''); break; | |
918 case '\\' : token.append('\\'); break; | |
919 default: token.append(c); // unknown escaping, treat it
as literal | |
920 } | |
921 if (state == IN_BSLASH){ | |
922 state = IN_STRING; // see document in IN_STRING | |
923 } else { // state == IN_QUOTE_BSLASH | |
924 state = IN_QUOTE; | |
925 } | |
926 break;//IN_BSLASH | |
927 case DONE: /* should never happen */ | |
928 default: | |
929 working--; | |
930 tokenType = ERROR; | |
931 state = DONE; | |
932 break; | |
933 }//switch(state) | |
934 }//while (state != DONE) | |
935 | |
936 return tokenType; | |
937 } | |
938 };//class Scanner | |
939 | |
940 class Parser{ | |
941 friend UBool TestParser(); | |
942 friend class TestParserT; | |
943 friend class LanguageGenerator_impl; | |
944 private: | |
945 Scanner s; | |
946 TokenType & token; | |
947 int min_max; // for the evil infinite | |
948 | |
949 UBool match(TokenType expected){ | |
950 if (token == expected) { | |
951 token = s.getNextToken(); | |
952 return TRUE; | |
953 } else { | |
954 //s.dumpCurrentPoint(); | |
955 return FALSE; | |
956 } | |
957 } | |
958 | |
959 UBool weight(int & value){ | |
960 if (token == NUMBER){ | |
961 int temp = atoi(s.token); | |
962 match(NUMBER); | |
963 if (match(PERCENT)){ | |
964 value = temp; | |
965 return TRUE; | |
966 } | |
967 } | |
968 return FALSE; | |
969 } | |
970 | |
971 UBool repeat (Pick* &node /*in,out*/){ | |
972 if (node == NULL) return FALSE; | |
973 | |
974 int count = -2; | |
975 int min = -2; | |
976 int max = -2; | |
977 UBool question = FALSE; | |
978 switch (token){ | |
979 case QUESTION: | |
980 match(QUESTION); | |
981 min = 0; | |
982 max = 1; | |
983 count = 2; | |
984 question = TRUE; | |
985 break; | |
986 case STAR: | |
987 match(STAR); | |
988 min = 0; | |
989 max = -1; | |
990 count = -1; | |
991 break; | |
992 case PLUS: | |
993 match(PLUS); | |
994 min = 1; | |
995 max = -1; | |
996 count = -1; | |
997 break; | |
998 case LBRACE: | |
999 match(LBRACE); | |
1000 if (token != NUMBER){ | |
1001 return FALSE; | |
1002 }else { | |
1003 min = atoi(s.token); | |
1004 match(NUMBER); | |
1005 if (token == RBRACE){ | |
1006 match(RBRACE); | |
1007 max = min; | |
1008 count = 1; | |
1009 } else if (token == COMMA) { | |
1010 match(COMMA); | |
1011 if (token == RBRACE){ | |
1012 match(RBRACE); | |
1013 max = -1; | |
1014 count = -1; | |
1015 } else if (token == NUMBER) { | |
1016 max = atoi(s.token); | |
1017 match(NUMBER); | |
1018 count = max - min + 1; | |
1019 if (!match(RBRACE)) { | |
1020 return FALSE; | |
1021 } | |
1022 } else { | |
1023 return FALSE; | |
1024 } | |
1025 } else { | |
1026 return FALSE; | |
1027 } | |
1028 } | |
1029 break; | |
1030 default: | |
1031 return FALSE; | |
1032 } | |
1033 | |
1034 if (count == -2 || min == -2 || max == -2){ | |
1035 //ASSERT(FALSE); | |
1036 return FALSE; | |
1037 } | |
1038 | |
1039 // eat up following weights | |
1040 Buffer_int weights; | |
1041 int w; | |
1042 while (weight(w)){ | |
1043 weights.append(w); | |
1044 } | |
1045 | |
1046 // for the evil infinite | |
1047 min_max = min_max > min ? min_max : min; | |
1048 min_max = min_max > max ? min_max : max; | |
1049 if (min_max > PSEUDO_INFINIT){ | |
1050 return FALSE; // PSEUDO_INFINIT is less than the real maximum | |
1051 } | |
1052 if (max == -1){ // the evil infinite | |
1053 max = PSEUDO_INFINIT; | |
1054 } | |
1055 // for the strange question mark | |
1056 if (question && weights.content_size() > 0){ | |
1057 Buffer_int w2; | |
1058 w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]); | |
1059 node = new Repeat(node,min,max,&w2); | |
1060 return TRUE; | |
1061 } | |
1062 node = new Repeat(node,min,max,&weights); | |
1063 return TRUE; | |
1064 } | |
1065 | |
1066 UBool core(Pick* &node /*out*/){ | |
1067 if (node != NULL) return FALSE; //assert node == NULL | |
1068 | |
1069 switch(token){ | |
1070 case LPAR: | |
1071 match(LPAR); | |
1072 if(defination(node) && match(RPAR)){ | |
1073 return TRUE; | |
1074 } | |
1075 return FALSE; | |
1076 case VAR: | |
1077 node = new Variable(&symbols, s.token); | |
1078 match(VAR); | |
1079 return TRUE; | |
1080 case STRING: | |
1081 node = new Literal(s.token); | |
1082 match(STRING); | |
1083 return TRUE; | |
1084 default: | |
1085 return FALSE; | |
1086 } | |
1087 } | |
1088 UBool modified(Pick* &node /*out*/){ | |
1089 if (node != NULL) return FALSE; //assert node == NULL | |
1090 | |
1091 if (!core(node)) { | |
1092 return FALSE; | |
1093 } | |
1094 | |
1095 for (;;){ | |
1096 switch(token){ | |
1097 case WAVE: | |
1098 match(WAVE); | |
1099 node = new Morph(*node); | |
1100 break; | |
1101 case AT: | |
1102 match(AT); | |
1103 node = new Quote(*node); | |
1104 break; | |
1105 case QUESTION: | |
1106 case STAR: | |
1107 case PLUS: | |
1108 case LBRACE: | |
1109 if (!repeat(node)) return FALSE; | |
1110 break; | |
1111 case SEMI: // rule definiation closed | |
1112 case RPAR: // within parenthesis (core closed) | |
1113 case BAR: // in alternation | |
1114 case NUMBER: // in alternation, with weight | |
1115 case LPAR: // in sequence | |
1116 case VAR: // in sequence | |
1117 case STRING: // in sequence | |
1118 return TRUE; | |
1119 default: | |
1120 return FALSE; | |
1121 } | |
1122 } | |
1123 } | |
1124 | |
1125 | |
1126 UBool sequence_list(Pick* &node /*in,out*/){ | |
1127 if (node == NULL) return FALSE; // assert node != NULL | |
1128 | |
1129 Sequence* seq = new Sequence(); | |
1130 Pick * n = node; | |
1131 | |
1132 while (token == VAR || token == STRING || token == LPAR){ | |
1133 seq->append(n); | |
1134 n = NULL; | |
1135 if (modified(n)){ | |
1136 // go on | |
1137 } else { | |
1138 goto FAIL; | |
1139 } | |
1140 } | |
1141 | |
1142 if (token == SEMI || token == RPAR || token == BAR){ | |
1143 seq->append(n); | |
1144 node = seq; | |
1145 return TRUE; | |
1146 } | |
1147 FAIL: | |
1148 delete seq; | |
1149 return FALSE; | |
1150 | |
1151 } | |
1152 | |
1153 UBool sequence(Pick* &node /*out*/){ | |
1154 if (node != NULL) return FALSE; //assert node == NULL | |
1155 | |
1156 if (!modified(node)) { | |
1157 return FALSE; | |
1158 } | |
1159 | |
1160 if (token == VAR || token == STRING || token == LPAR){ | |
1161 return sequence_list(node); | |
1162 } else { | |
1163 return TRUE; // just a modified | |
1164 } | |
1165 } | |
1166 | |
1167 UBool alternation_list(Pick* &node /*in,out*/){ | |
1168 if (node == NULL) return FALSE; // assert node != NULL | |
1169 | |
1170 Alternation * alt = new Alternation(); | |
1171 Pick * n = node; | |
1172 int w = DEFAULT_WEIGHT; | |
1173 | |
1174 while (token == NUMBER || token == BAR){ | |
1175 if(token == NUMBER) { | |
1176 if (weight(w)){ | |
1177 if (token == BAR){ | |
1178 // the middle item, go on | |
1179 } else { | |
1180 // the last item or encounter error | |
1181 break; //while | |
1182 } | |
1183 } else { | |
1184 goto FAIL; | |
1185 } | |
1186 } // else token == BAR | |
1187 match(BAR); | |
1188 alt->append(n,w); | |
1189 | |
1190 n = NULL; | |
1191 w = DEFAULT_WEIGHT; | |
1192 if (sequence(n)){ | |
1193 // go on | |
1194 } else { | |
1195 goto FAIL; | |
1196 } | |
1197 } | |
1198 | |
1199 if (token == SEMI || token == RPAR) { | |
1200 alt->append(n,w); | |
1201 node = alt; | |
1202 return TRUE; | |
1203 } | |
1204 FAIL: | |
1205 delete alt; | |
1206 return FALSE; | |
1207 } | |
1208 | |
1209 UBool alternation(Pick* &node /*out*/){ | |
1210 if (node != NULL) return FALSE; //assert node == NULL | |
1211 | |
1212 // 'sequence' has higher precedence than 'alternation' | |
1213 if (!sequence(node)){ | |
1214 return FALSE; | |
1215 } | |
1216 | |
1217 if (token == BAR || token == NUMBER){ // find a real alternation1, creat
e it. | |
1218 return alternation_list(node); | |
1219 } else { | |
1220 return TRUE; // just a sequence_old | |
1221 } | |
1222 } | |
1223 | |
1224 | |
1225 UBool defination(Pick* &node /*out*/){ | |
1226 if (node != NULL) return FALSE; //assert node == NULL | |
1227 return alternation(node); | |
1228 } | |
1229 | |
1230 UBool rule(){ | |
1231 if (token == VAR){ | |
1232 Buffer_char name; | |
1233 name.append_array(s.token, strlen(s.token) + 1); | |
1234 match(VAR); | |
1235 | |
1236 if (match(EQ)){ | |
1237 Pick * t = NULL; | |
1238 if(defination(t)){ | |
1239 symbols.put(name, t); | |
1240 return match(SEMI); | |
1241 } | |
1242 } | |
1243 } | |
1244 return FALSE; | |
1245 } | |
1246 public: | |
1247 UBool rules(){ | |
1248 symbols.reset(); | |
1249 token = s.getNextToken(); | |
1250 while (rule()){ | |
1251 } | |
1252 if (token == STREAM_END){ | |
1253 return TRUE; | |
1254 } else { | |
1255 //s.dumpCurrentPoint(); | |
1256 return FALSE; | |
1257 } | |
1258 } | |
1259 | |
1260 public: | |
1261 SymbolTable symbols; | |
1262 | |
1263 Parser(const char *const source):s(source), token(s.tokenType){ | |
1264 min_max = -2; | |
1265 } | |
1266 UBool parse(){ | |
1267 return rules(); | |
1268 } | |
1269 | |
1270 }; // class Parser | |
1271 | |
1272 | |
1273 /////////////////////////////////////////////////////////// | |
1274 // | |
1275 // | |
1276 // | |
1277 | |
1278 int DumpScanner(Scanner & s, UBool dump = TRUE){ | |
1279 int len = strlen(s.source); | |
1280 int error_start_offset = s.history - s.source; | |
1281 if (dump){ | |
1282 printf("\n=================== DumpScanner ================\n"); | |
1283 fwrite(s.source, len, 1, stdout); | |
1284 printf("\n-----parsed-------------------------------------\n"); | |
1285 fwrite(s.source, s.history - s.source, 1, stdout); | |
1286 printf("\n-----current------------------------------------\n"); | |
1287 fwrite(s.history, s.working - s.history, 1, stdout); | |
1288 printf("\n-----unparsed-----------------------------------\n"); | |
1289 fwrite(s.working, (s.source + len - s.working), 1, stdout); | |
1290 printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); | |
1291 } | |
1292 return error_start_offset; | |
1293 } | |
1294 | |
1295 class LanguageGenerator_impl{ | |
1296 public: | |
1297 LanguageGenerator_impl(const char *const bnf_definition, const char *const t
op_node) | |
1298 :par(bnf_definition), top_node_name(top_node){ | |
1299 srand((unsigned)time( NULL )); | |
1300 } | |
1301 | |
1302 LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){ | |
1303 if (par.parse()){ | |
1304 if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::H
AS_REF) { | |
1305 if (par.symbols.is_complete()) { | |
1306 return LanguageGenerator::OK; | |
1307 } else { | |
1308 if (debug) printf("The bnf definition is incomplete.\n"); | |
1309 return LanguageGenerator::INCOMPLETE; | |
1310 } | |
1311 } else { | |
1312 if (debug) printf("No top node is found.\n"); | |
1313 return LanguageGenerator::NO_TOP_NODE; | |
1314 } | |
1315 } else { | |
1316 if(debug) { | |
1317 printf("The bnf definition is wrong\n"); | |
1318 DumpScanner(par.s, TRUE); | |
1319 } | |
1320 return LanguageGenerator::BNF_DEF_WRONG; | |
1321 } | |
1322 } | |
1323 const char * next(){ | |
1324 return top_node_ref->next(); | |
1325 } | |
1326 | |
1327 private: | |
1328 Parser par; | |
1329 const char *const top_node_name; | |
1330 Pick * top_node_ref; | |
1331 }; | |
1332 | |
1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL){ | |
1334 } | |
1335 | |
1336 LanguageGenerator::~LanguageGenerator(){ | |
1337 delete lang_gen; | |
1338 } | |
1339 | |
1340 LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bn
f_definition /*in*/, const char *const top_node/*in*/, UBool debug){ | |
1341 if (lang_gen){ | |
1342 delete lang_gen; | |
1343 } | |
1344 lang_gen = new LanguageGenerator_impl(bnf_definition, top_node); | |
1345 PARSE_RESULT r = lang_gen->parseBNF(debug); | |
1346 if (r != OK){ | |
1347 delete lang_gen; | |
1348 lang_gen = NULL; | |
1349 return r; | |
1350 } else { | |
1351 return r; | |
1352 } | |
1353 } | |
1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The
buffer is owned by callee. | |
1355 if (lang_gen){ | |
1356 return lang_gen->next(); | |
1357 }else { | |
1358 return ""; | |
1359 } | |
1360 } | |
1361 | |
1362 /////////////////////////////////////////////////////////// | |
1363 // | |
1364 // The test code for WBNF | |
1365 // | |
1366 | |
1367 #define CALL(fun) \ | |
1368 if (fun()){ \ | |
1369 printf("Pass: " #fun "\n");\ | |
1370 } else { \ | |
1371 printf("FAILED: !!! " #fun " !!!\n"); \ | |
1372 } | |
1373 | |
1374 #define DUMP_R(fun, var, times) \ | |
1375 {printf("\n========= " #fun " =============\n"); \ | |
1376 for (int i=0; i<times; i++) { \ | |
1377 const char * t = var.next();\ | |
1378 fwrite(t,strlen(t),1,stdout); \ | |
1379 printf("\n"); \ | |
1380 } \ | |
1381 printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");} | |
1382 | |
1383 | |
1384 | |
1385 #if TEST_WBNF_TEST | |
1386 static UBool TestQuote(){ | |
1387 const char *const str = "This ' A !,z| qq [] .new\tline"; | |
1388 //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline"; | |
1389 //// | |
1390 //// :( we must quote our string to following C syntax | |
1391 //// cannot type the literal here, it makes our code rather human unread
able | |
1392 //// very very unconformable! | |
1393 //// | |
1394 ///* | |
1395 //*/ | |
1396 | |
1397 //const char *const s1 = "ab'c"; | |
1398 //const char (* s1_r1) [] = { "ab''c", // ab''c | |
1399 // "ab\\'c", // ab\'c | |
1400 // };// | |
1401 ///* | |
1402 // . '.' \. | |
1403 // .. \.\. '.'\. '.'\. '..' // '.''.' wrong | |
1404 //*/ | |
1405 | |
1406 //const char *const s2 = "a..'.b"; // a..'.b | |
1407 //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b | |
1408 // ,"a'..\\'.'b" // a'..\'.'b | |
1409 // ,"a'..'\\''.'b" // a'..'\''.'b | |
1410 // };// | |
1411 | |
1412 //const char *const s3 = "a..\\.b"; // a..\.b | |
1413 //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b | |
1414 // ,"a'..'\\\\'.'b" // a'..'\\'.'b | |
1415 // };// | |
1416 | |
1417 // // no catact operation, no choice, must be com
pact | |
1418 | |
1419 srand((unsigned)time( NULL )); | |
1420 | |
1421 //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC); | |
1422 Pick *p = new Literal(str); | |
1423 Quote q(*p); | |
1424 | |
1425 DUMP_R(TestQuote, (*p), 1); | |
1426 DUMP_R(TestQuote, q, 20); | |
1427 return FALSE; | |
1428 } | |
1429 static UBool TestLiteral(){ | |
1430 const char * s = "test string99."; | |
1431 Literal n(s); | |
1432 const char * r = n.next(); | |
1433 return strcmp(s,r) == 0; | |
1434 } | |
1435 | |
1436 static UBool TestSequence(){ | |
1437 Sequence seq; | |
1438 seq.append(new Literal("abc ")); | |
1439 seq.append(new Literal(", s")); | |
1440 | |
1441 return strcmp(seq.next(), "abc , s") == 0; | |
1442 } | |
1443 static UBool TestAlternation(){ | |
1444 srand((unsigned)time( NULL )); | |
1445 Alternation alt; | |
1446 alt.append(new Literal("aaa_10%"),10); | |
1447 alt.append(new Literal("bbb_0%"),0); | |
1448 alt.append(new Literal("ccc_10%"),10); | |
1449 alt.append(new Literal("ddddddd_50%"),50); | |
1450 | |
1451 DUMP_R(TestAlternation, alt, 50); | |
1452 | |
1453 return FALSE; | |
1454 } | |
1455 | |
1456 static UBool TestBuffer(){ | |
1457 Buffer_int t; | |
1458 t.append(1).append(0).append(5); | |
1459 int s = t.content_size(); | |
1460 for (int i=0; i<s; ++i){ | |
1461 printf("%d\n", t[i]); | |
1462 } | |
1463 return FALSE; | |
1464 } | |
1465 | |
1466 static UBool TestWeightedRand(){ | |
1467 srand((unsigned)time( NULL )); | |
1468 Buffer_int t; | |
1469 t.append(1).append(0).append(5); | |
1470 WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4); | |
1471 // WeightedRand wr(&t,3); | |
1472 for (int i=0; i< 50; ++i){ | |
1473 printf("%d\n", wr.next()); | |
1474 } | |
1475 return FALSE; | |
1476 } | |
1477 | |
1478 static UBool TestRepeat(){ | |
1479 srand((unsigned)time( NULL )); | |
1480 Repeat rep(new Literal("aaa1-5 "), 1, 5); | |
1481 DUMP_R(TestRepeat, rep, 50); | |
1482 | |
1483 Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append
(0).append(5)); | |
1484 DUMP_R(TestRepeat, r2, 50); | |
1485 | |
1486 Repeat r3(new Literal("aaa5-5 "), 5, 5); | |
1487 DUMP_R(TestRepeat, r3, 50); | |
1488 | |
1489 return FALSE; | |
1490 } | |
1491 | |
1492 static UBool TestVariable(){ | |
1493 SymbolTable tab; | |
1494 Pick * value = new Literal("string1"); | |
1495 Variable var1(&tab, "x", value); | |
1496 | |
1497 Variable var2(&tab, "y"); | |
1498 // tab.put(var2, value); // TOFIX: point alias/recursion problem | |
1499 Pick * value2 = new Literal("string2"); | |
1500 tab.put(var2, value2); | |
1501 | |
1502 Pick * value3 = new Literal("string3"); | |
1503 Variable var3(&tab, "z"); | |
1504 tab.put("z", value3); | |
1505 | |
1506 UBool pass; | |
1507 pass = strcmp(var1.next(), value->next()) == 0; | |
1508 pass = pass && strcmp(var2.next(), value2->next()) == 0; | |
1509 pass = pass && strcmp(var3.next(), value3->next()) == 0; | |
1510 return pass; | |
1511 } | |
1512 | |
1513 static UBool TestSymbolTable(){ | |
1514 Literal * n1 = new Literal("string1"); | |
1515 Literal * n2 = new Literal("string2"); | |
1516 SymbolTable t; | |
1517 t.put("abc", n1); | |
1518 t.put("$aaa", n2); | |
1519 // t.put("alias", n1); // TOFIX: point alias/recursion problem | |
1520 t.put("bbb"); | |
1521 | |
1522 UBool pass; | |
1523 pass = t.find(NULL) == SymbolTable::EMPTY; | |
1524 pass = pass && t.find("ccc") == SymbolTable::NO_VAR; | |
1525 pass = pass && t.find("bbb") == SymbolTable::NO_REF; | |
1526 pass = pass && t.find("abc") == SymbolTable::HAS_REF; | |
1527 pass = pass && t.find("$aaa") == SymbolTable::HAS_REF; | |
1528 | |
1529 t.reset(); | |
1530 pass = pass && t.find("abc") == SymbolTable::NO_VAR; | |
1531 return pass; | |
1532 } | |
1533 | |
1534 | |
1535 static UBool TestScanner(void){ | |
1536 //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};"; | |
1537 //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "
}", | |
1538 // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"}; | |
1539 | |
1540 const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;"; | |
1541 const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")"
, "?", "25", "%", ";"}; | |
1542 | |
1543 const char *str = str2; | |
1544 const char (*str_r)[20] = str2_r; | |
1545 int tokenNum = sizeof(str2_r)/sizeof(char[20]); | |
1546 | |
1547 Scanner t(str); | |
1548 UBool pass = TRUE; | |
1549 t.getNextToken(); | |
1550 int i = 0; | |
1551 while (pass){ | |
1552 if (t.tokenType == STREAM_END){ | |
1553 pass = pass? i == tokenNum : FALSE; | |
1554 break;//while | |
1555 } else if (t.tokenType == ERROR){ | |
1556 pass = FALSE; | |
1557 break;//while | |
1558 } else { | |
1559 pass = strcmp( &(t.token[0]), str_r[i++]) == 0; | |
1560 t.getNextToken(); | |
1561 } | |
1562 } | |
1563 | |
1564 //const char ts[] = "$commandList = '['" | |
1565 //" ( alternate ' ' $alternateOptions" | |
1566 //" | backwards ' 2'" | |
1567 //" | normalization ' ' $onoff " | |
1568 //" | caseLevel ' ' $onoff " | |
1569 //" | hiraganaQ ' ' $onoff" | |
1570 //" | caseFirst ' ' $caseFirstOptions" | |
1571 //" | strength ' ' $strengthOptions" | |
1572 //" ) ']';" ; | |
1573 | |
1574 //Scanner t2(ts); | |
1575 //pass = TRUE; | |
1576 //do { | |
1577 // t2.getNextToken(); | |
1578 // if (t2.tokenType == ERROR){ | |
1579 // DumpScanner(t2); | |
1580 // return FALSE; | |
1581 // } | |
1582 //}while (t.tokenType != STREAM_END); | |
1583 | |
1584 return pass; | |
1585 } | |
1586 | |
1587 class TestParserT { | |
1588 public: | |
1589 UBool operator () (const char *const str, const int exp_error_offset = -1, const
UBool dump = TRUE){ | |
1590 Parser par(str); | |
1591 if (par.rules()){ | |
1592 if ( exp_error_offset == -1){ | |
1593 return TRUE; | |
1594 }else { | |
1595 DumpScanner(par.s,dump); | |
1596 return FALSE; | |
1597 } | |
1598 }else { | |
1599 return DumpScanner(par.s, dump) == exp_error_offset; | |
1600 } | |
1601 } | |
1602 }; | |
1603 | |
1604 UBool TestParser(){ | |
1605 TestParserT test; | |
1606 | |
1607 UBool pass = TRUE; | |
1608 pass = pass && test ("$s = ' ' ? 50%;"); | |
1609 pass = pass && test("$x = ($var {1,2}) 3%;"); // legal | |
1610 pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal | |
1611 pass = pass && test("$x = $var {1,2} 3%;"); // legal | |
1612 pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal | |
1613 pass = pass && test("$a = b ? 2% | c 5%;"); // legal | |
1614 pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5% | |
1615 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc | |
1616 pass = pass && test("$x = (b 5%) (c 6%);"); // legal | |
1617 pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6% | |
1618 pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%) | |
1619 pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6% | |
1620 pass = pass && test("$x = b 5%;"); // legal | |
1621 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc | |
1622 pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal | |
1623 pass = pass && test("$s = ' ' ? 50% abc;"); // legal | |
1624 pass = pass && test("$s = a | c d | e f;"); // legal | |
1625 pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to
check parsed tree?? | |
1626 | |
1627 pass = pass && test("$s = ' ' ? 50%;"); | |
1628 pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"
); | |
1629 pass = pass && test("$p1 = ($string $s '|' $s)? 25%;"); | |
1630 pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;"); | |
1631 pass = pass && test("$rel2 = $p1 $string $s $p2;"); | |
1632 pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;"); | |
1633 pass = pass && test("$command = $commandList $crlf;"); | |
1634 pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 10
0% | $string 10%) $crlf;"); | |
1635 pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;")
; | |
1636 pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};"); | |
1637 | |
1638 const char collationBNF[] = | |
1639 "$s = ' '? 50%;" | |
1640 "$crlf = '\r\n';" | |
1641 | |
1642 "$alternateOptions = non'-'ignorable | shifted;" | |
1643 "$onoff = on | off;" | |
1644 "$caseFirstOptions = off | upper | lower;" | |
1645 "$strengthOptions = '1' | '2' | '3' | '4' | 'I';" | |
1646 "$commandList = '['" | |
1647 " ( alternate ' ' $alternateOptions" | |
1648 " | backwards ' 2'" | |
1649 " | normalization ' ' $onoff " | |
1650 " | caseLevel ' ' $onoff " | |
1651 " | hiraganaQ ' ' $onoff" | |
1652 " | caseFirst ' ' $caseFirstOptions" | |
1653 " | strength ' ' $strengthOptions" | |
1654 " ) ']';" | |
1655 "$command = $commandList $crlf;" | |
1656 | |
1657 "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;" | |
1658 "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;" | |
1659 "$positionList = '[' (first | last) ' ' $allTypes ']';" | |
1660 | |
1661 "$beforeList = '[before ' ('1' | '2' | '3') ']';" | |
1662 | |
1663 "$relationList = (" | |
1664 " '<'" | |
1665 " | '<<'" | |
1666 " | ';'" | |
1667 " | '<<<'" | |
1668 " | ','" | |
1669 " | '='" | |
1670 ");" | |
1671 "$string = $magic;" | |
1672 "$rel1 = '[variable top]' $s;" | |
1673 "$p1 = ($string $s '|' $s)? 25%;" | |
1674 "$p2 = (\\\\ $s $string $s)? 25%;" | |
1675 "$rel2 = $p1 $string $s $p2;" | |
1676 "$relation = $relationList $s ($rel1 | $rel2) $crlf;" | |
1677 | |
1678 "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crl
f;" | |
1679 "$mostRules = $command 1% | $reset 5% | $relation 25%;" | |
1680 "$root = $command{0,5} $reset $mostRules{1,20};" | |
1681 ; | |
1682 | |
1683 pass = pass && test(collationBNF); | |
1684 | |
1685 | |
1686 return pass; | |
1687 } | |
1688 | |
1689 static UBool TestMorph(){ | |
1690 srand((unsigned)time( NULL )); | |
1691 | |
1692 Alternation * alt = new Alternation(); | |
1693 | |
1694 (*alt) | |
1695 .append(new Literal("a")).append(new Literal("b")).append(new Literal("c")) | |
1696 .append(new Literal("d")).append(new Literal("e")).append(new Literal("f")) | |
1697 .append(new Literal("g")).append(new Literal("h")).append(new Literal("i")) | |
1698 .append(new Literal("j")).append(new Literal("k")).append(new Literal("l")) | |
1699 .append(new Literal("m")).append(new Literal("n")).append(new Literal("o")) | |
1700 ; | |
1701 | |
1702 Repeat * rep = new Repeat( alt ,5,5 ); | |
1703 Morph m( *rep); | |
1704 | |
1705 // DUMP_R(TestMorph,(*rep),20); | |
1706 DUMP_R(TestMorph,m,100); | |
1707 | |
1708 return FALSE; | |
1709 } | |
1710 | |
1711 #endif | |
1712 | |
1713 static UBool TestLanguageGenerator(){ | |
1714 //LanguageGenerator g; | |
1715 //const char *const s = "$s = p 0% | q 1%;"; | |
1716 //g.parseBNF(s, "$s"); | |
1717 UBool pass; | |
1718 //= strcmp("q", g.next()) == 0; | |
1719 | |
1720 const char *const def = | |
1721 //"$a = $b;" | |
1722 //"$b = $c;" | |
1723 //"$c = $t;" | |
1724 //"$t = abc $z{1,2};" | |
1725 //"$k = a | b | c | d | e | f | g ;" | |
1726 //"$z = q 0% | p 1% | r 1%;" | |
1727 "$x = a ? 0%;" | |
1728 ; // end of string | |
1729 // const char * s = "abczz"; | |
1730 // | |
1731 // | |
1732 LanguageGenerator g; | |
1733 pass = g.parseBNF(def, "$x",TRUE); | |
1734 //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode()); | |
1735 // | |
1736 if (pass != LanguageGenerator::OK) return FALSE; | |
1737 | |
1738 DUMP_R(TestLanguageGenerator, g, 20); | |
1739 return pass; | |
1740 | |
1741 ////UBool pass = strcmp(s,r) == 0; | |
1742 | |
1743 //if (pass){ | |
1744 // printf("TestRandomLanguageGenerator passed.\n"); | |
1745 //} else { | |
1746 // printf("TestRandomLanguageGenerator FAILED!!!\n"); | |
1747 //} | |
1748 //return pass; | |
1749 } | |
1750 | |
1751 void TestWbnf(void){ | |
1752 srand((unsigned)time( NULL )); | |
1753 | |
1754 //CALL(TestLiteral); | |
1755 //CALL(TestSequence); | |
1756 //CALL(TestSymbolTable); | |
1757 //CALL(TestVariable); | |
1758 | |
1759 //TestRepeat(); | |
1760 //TestAlternation(); | |
1761 //TestMorph(); | |
1762 | |
1763 //TestQuote(); | |
1764 //TestBuffer(); | |
1765 //TestWeightedRand(); | |
1766 | |
1767 //CALL(TestScanner); | |
1768 //CALL(TestParser); | |
1769 CALL(TestLanguageGenerator); | |
1770 } | |
1771 | |
OLD | NEW |