OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * Copyright (C) 2005-2007, International Business Machines Corporation and * |
| 4 * others. All Rights Reserved. * |
| 5 ****************************************************************************** |
| 6 */ |
| 7 |
| 8 #include <stdio.h> |
| 9 #include <string.h> |
| 10 #include <stdlib.h> |
| 11 #include <time.h> |
| 12 |
| 13 #include "wbnf.h" |
| 14 |
| 15 // Most of this code is meant to test the test code. It's a self test. |
| 16 // Normally this isn't run. |
| 17 #define TEST_WBNF_TEST 0 |
| 18 |
| 19 /////////////////////////////////////////////////////////// |
| 20 // |
| 21 // Constants and the most basic helper classes |
| 22 // |
| 23 |
| 24 static const char DIGIT_CHAR[] = "0123456789"; |
| 25 static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0}; |
| 26 static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV
WXYZ"; |
| 27 static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; |
| 28 |
| 29 static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){ |
| 30 const char * p = list; |
| 31 for (;*p != 0 && *p != c; p++); |
| 32 return *p?TRUE:FALSE; |
| 33 } |
| 34 static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);} |
| 35 static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);} |
| 36 static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);} |
| 37 static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);} |
| 38 |
| 39 |
| 40 |
| 41 /////////////////////////////////////////////////////////// |
| 42 // |
| 43 // Helper classes |
| 44 // |
| 45 |
| 46 class Buffer_byte{ |
| 47 // Utility class, can be treated as an auto expanded array. no boundary check. |
| 48 |
| 49 typedef char byte; |
| 50 byte * start; |
| 51 byte * current; |
| 52 int buffer_size; // size unit is byte |
| 53 public: |
| 54 inline int content_size(){return current - start;} // size unit is byte |
| 55 |
| 56 private: |
| 57 inline void expand(int add_size = 100){ // size unit is byte |
| 58 int new_size = buffer_size + add_size; |
| 59 |
| 60 int cs_snap = content_size(); |
| 61 start = (byte *) realloc(start, new_size); // may change the value of
start |
| 62 current = start + cs_snap; |
| 63 |
| 64 memset(current, 0, add_size); |
| 65 buffer_size = new_size; |
| 66 } |
| 67 |
| 68 inline void expand_to(int size){ |
| 69 int r = size - buffer_size; |
| 70 if (r > 0) { |
| 71 expand(r); // simply expand, no block alignment |
| 72 } |
| 73 } |
| 74 Buffer_byte(const Buffer_byte &); |
| 75 Buffer_byte & operator = (const Buffer_byte &); |
| 76 public: |
| 77 Buffer_byte():start(NULL),current(start),buffer_size(0){ |
| 78 expand(); |
| 79 } |
| 80 ~Buffer_byte(){ |
| 81 free(start); |
| 82 } |
| 83 |
| 84 inline void reset(){ |
| 85 start != NULL ? memset(start, 0, buffer_size) : 0; |
| 86 current = start; |
| 87 } |
| 88 |
| 89 // Using memory copy method to append a C array to buffer, |
| 90 inline void append(const void * c, int size){ // size unit is byte |
| 91 expand_to(content_size() + size) ; |
| 92 memcpy(current, c, size); |
| 93 current = current + size; |
| 94 } |
| 95 |
| 96 byte * buffer(){ |
| 97 return start; |
| 98 } |
| 99 }; |
| 100 |
| 101 /* |
| 102 The class(es) try to work as bulid-in array, so it overloads these two operato
rs |
| 103 operator type *(); |
| 104 type & operator[]; |
| 105 The first is used to auto type convert, the latter is used to select member. |
| 106 |
| 107 A small trick is the class does not overload the address-of operator. This |
| 108 behavior is different from bulid-in array, but it give us the opportunity |
| 109 to get the address of the class itself. |
| 110 */ |
| 111 //template<typename type> |
| 112 // class BUFFER{ |
| 113 // typedef BUFFER name; |
| 114 #define BUFFER(type, name)\ |
| 115 class name {\ |
| 116 private:\ |
| 117 Buffer_byte buf;\ |
| 118 public:\ |
| 119 name & reset() {buf.reset(); return *this;}\ |
| 120 name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\ |
| 121 name & append_array(const type * p, int size) {buf.append(p, sizeof(type
)*size); return *this;}\ |
| 122 type & operator [] (int i) { return ((type *) buf.buffer())[i];}\ |
| 123 operator type *(){return (type *) buf.buffer();} \ |
| 124 int content_size(){return buf.content_size() / sizeof(type);}\ |
| 125 } |
| 126 |
| 127 |
| 128 class Pick{ |
| 129 /* The Pick is the basic language generator element*/ |
| 130 public: |
| 131 // generate a string accroding the syntax |
| 132 // Return a null-terminated c-string. The buffer is owned by callee. |
| 133 virtual const char* next() = 0; |
| 134 virtual ~Pick(){}; |
| 135 }; |
| 136 |
| 137 //typedef BUFFER<char> Buffer_char; |
| 138 //typedef BUFFER<int> Buffer_int; |
| 139 //typedef BUFFER<Pick *> Buffer_pPick; |
| 140 BUFFER(char, Buffer_char); |
| 141 BUFFER(int, Buffer_int); |
| 142 BUFFER(Pick *, Buffer_pPick); |
| 143 |
| 144 class SymbolTable{ |
| 145 /* Helper class. |
| 146 * It's a mapping table between 'variable name' and its 'active Pick object' |
| 147 */ |
| 148 private: |
| 149 Buffer_char name_buffer; // var names storage space |
| 150 |
| 151 Buffer_int names; // points to name (offset in name_buffer) |
| 152 Buffer_pPick refs; // points to Pick |
| 153 |
| 154 int get_index(const char *const var_name){ |
| 155 int len = names.content_size(); |
| 156 for (int i=0; i< len; i++){ |
| 157 if (strcmp(var_name, name_buffer + names[i]) == 0){ |
| 158 return i; |
| 159 } |
| 160 } |
| 161 return -1; |
| 162 } |
| 163 |
| 164 public: |
| 165 enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF}; |
| 166 |
| 167 RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NUL
L /*[out] Pick* */){ |
| 168 if (!var_name) return EMPTY; // NULL name |
| 169 |
| 170 int i = get_index(var_name); |
| 171 if (i == -1){ |
| 172 return NO_VAR; // new name |
| 173 } |
| 174 if (!refs[i]){ // exist name, no ref |
| 175 return NO_REF; |
| 176 } else { |
| 177 if (ref) { |
| 178 *ref = refs[i]; |
| 179 } |
| 180 return HAS_REF; // exist name, has ref |
| 181 } |
| 182 } |
| 183 |
| 184 void put(const char *const var_name, Pick *const var_ref = NULL){ |
| 185 int i = get_index(var_name); |
| 186 switch(find(var_name)){ |
| 187 case EMPTY: // NULL name |
| 188 break; |
| 189 case NO_VAR: // new name |
| 190 int offset; |
| 191 offset = name_buffer.content_size(); |
| 192 name_buffer.append_array(var_name, strlen(var_name) + 1); |
| 193 names.append(offset); |
| 194 refs.append(var_ref); |
| 195 break; |
| 196 case NO_REF: // exist name, no ref |
| 197 refs[i] = var_ref; // link definition with variable |
| 198 break; |
| 199 case HAS_REF: // exist name, has ref |
| 200 if (var_ref){ |
| 201 refs[i] = var_ref; |
| 202 } |
| 203 break; |
| 204 default: |
| 205 ; // ASSERT(FALSE); |
| 206 } |
| 207 return; |
| 208 } |
| 209 |
| 210 UBool is_complete(){ |
| 211 int n = names.content_size(); |
| 212 for (int i=0; i<n; ++i){ |
| 213 if (refs[i] == NULL){ |
| 214 return FALSE; |
| 215 } |
| 216 } |
| 217 return TRUE; |
| 218 } |
| 219 |
| 220 void reset(){ |
| 221 names.reset(); |
| 222 name_buffer.reset(); |
| 223 |
| 224 // release memory here |
| 225 int s = refs.content_size(); |
| 226 for (int i=0; i < s; i++){ |
| 227 delete refs[i]; // TOFIX: point alias/recursion problem |
| 228 } |
| 229 refs.reset(); |
| 230 } |
| 231 |
| 232 ~SymbolTable(){ |
| 233 reset(); |
| 234 } |
| 235 }; |
| 236 |
| 237 |
| 238 /* |
| 239 // Document of class Escaper |
| 240 // |
| 241 // ATTENTION: |
| 242 // From http://icu-project.org/userguide/Collate_Customization.html. |
| 243 // We get the precedence of escape/quote operations |
| 244 // |
| 245 // (highest) 1. backslash \ |
| 246 // 2. two single quotes '' |
| 247 // 3. quoting ' ' |
| 248 // |
| 249 // ICU Collation should accept following as the same string. |
| 250 // |
| 251 // 1) 'ab'c _ |
| 252 // 2) a\bc \ |
| 253 // 3) a'b'\c |- They are equal. |
| 254 // 4) abc _/ |
| 255 // |
| 256 // From "two single quotes", we have following deductions |
| 257 // D1. empty quoting is illgal. (obviously) |
| 258 // D2. no contact operation between two quotings |
| 259 // '.''.' is not .. it is .'. |
| 260 // D3. "two single quotes" cannot contact two quoting simultaneously |
| 261 // '..''''.' is not ..'. it is ..''. |
| 262 // NOTICE: |
| 263 // "two single quotes" can contact before one quoting |
| 264 // '''.' is '. |
| 265 // "two single quotes" can literally contact after one quoting |
| 266 // But, from syntax, it's one quoting including a "two single quotes" |
| 267 // '.''' is .' |
| 268 // D4. "two single quotes" cannot solely be included in quoting |
| 269 // '''' is not ' it is '' |
| 270 // NOTICE: These are legal |
| 271 // '.''.' is .'. |
| 272 // '.''' is .' |
| 273 // |
| 274 // dicision |
| 275 // /\ |
| 276 // /__\ |
| 277 // output buffer input buffer |
| 278 // |
| 279 // To make our dicision (within an atom operation) without caring input and outp
ut buffer, |
| 280 // following calling pattern (within an atom operation) shall be avoided |
| 281 // |
| 282 // P1 open_quoting() then close_quoting() (direct violation) D1 |
| 283 // P2 close_quoting() then open_quoting() (direct violation) D2 |
| 284 // P3 empty open_quoting() (indirect violation) D1, D4 |
| 285 // P4 empty close_quoting() (indirect violation) D2, D3 |
| 286 // P5 open_quoting() then two single quotes (indirect violation) D4 |
| 287 // P6 close_quoting() then two single quotes (indirect violation) D3 |
| 288 // |
| 289 // two single quotes escaping will not open_ or close_ quoting() |
| 290 // The choice will not lose some quoing forms. |
| 291 // |
| 292 // For open_quoting(), |
| 293 // we may get this form quoting ''' P5 |
| 294 // It may raise a bug ''''x |
| 295 // If we expect |
| 296 // '''.' let the next char open the quoting |
| 297 // '.''.' the quoting is already opened by preceding char |
| 298 // |
| 299 // For close_quoting() |
| 300 // we will get this form quoting '.''' P6 |
| 301 // It may raise a bug '.''''.' |
| 302 // If we expect |
| 303 // '.'''\. let the next char close the quoting |
| 304 // '.''''.' the expectation is wrong! using '.'\''.' instead |
| 305 // |
| 306 // It's a hard work to re-adjust generation opportunity for various escaping for
m. |
| 307 // We just simply ignore it. |
| 308 */ |
| 309 class Escaper{ |
| 310 public: |
| 311 enum CHOICE {YES, NO, RAND}; |
| 312 enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC}; |
| 313 private: |
| 314 class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class |
| 315 private: |
| 316 const CHOICE tag; |
| 317 public: |
| 318 Bool(CHOICE flag=RAND):tag(flag){} |
| 319 operator UBool() { // conversion operator |
| 320 return tag == RAND ? rand()%2 : tag == YES; |
| 321 //if (tag == RAND){ |
| 322 // return rand()%2 == 1; |
| 323 //} else { |
| 324 // return tag == YES ? TRUE : FALSE; |
| 325 //} |
| 326 } |
| 327 }; |
| 328 public: |
| 329 Escaper(CHOICE escapeLiteral = RAND, |
| 330 CHOICE twoQuotesEscape = RAND, |
| 331 ESCAPE_FORM escapeForm = RAND_ESC): |
| 332 escape_form(escapeForm), |
| 333 escape_literal(escapeLiteral), |
| 334 two_quotes_escape(twoQuotesEscape), |
| 335 is_quoting(FALSE){} |
| 336 private: |
| 337 Buffer_char str; |
| 338 ESCAPE_FORM escape_form; |
| 339 Bool escape_literal; |
| 340 Bool two_quotes_escape; |
| 341 UBool quote_escape; |
| 342 UBool bslash_escape; |
| 343 UBool is_quoting; |
| 344 |
| 345 void set_options(){ |
| 346 ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : esc
ape_form; |
| 347 switch (t){ |
| 348 case BSLASH_ONLY : |
| 349 bslash_escape = TRUE; quote_escape = FALSE; break; |
| 350 case QUOTE_ONLY: |
| 351 bslash_escape = FALSE;quote_escape = TRUE; break; |
| 352 case QUOTE_AND_BSLAH: |
| 353 bslash_escape = TRUE; quote_escape = TRUE; break; |
| 354 default: |
| 355 ;// error |
| 356 } |
| 357 } |
| 358 |
| 359 void reset(){ |
| 360 str.reset(); |
| 361 is_quoting = FALSE; |
| 362 } |
| 363 |
| 364 inline void open_quoting(){ |
| 365 if(is_quoting){ |
| 366 // do nothing |
| 367 } else { |
| 368 str.append('\''); |
| 369 is_quoting = TRUE; |
| 370 } |
| 371 } |
| 372 inline void close_quoting(){ |
| 373 if(is_quoting){ |
| 374 str.append('\''); |
| 375 is_quoting = FALSE; |
| 376 } else { |
| 377 // do nothing |
| 378 } |
| 379 } |
| 380 |
| 381 // str [in] null-terminated c-string |
| 382 void append(const char * strToAppend){ |
| 383 for(;*strToAppend != 0; strToAppend++){ |
| 384 append(*strToAppend); |
| 385 } |
| 386 } |
| 387 |
| 388 inline void append(const char c){ |
| 389 set_options(); |
| 390 |
| 391 if (c == '\\'){ |
| 392 quote_escape ? open_quoting() : close_quoting(); |
| 393 //bslash_escape always true here |
| 394 str.append('\\'); |
| 395 str.append('\\'); |
| 396 } else if (c == '\''){ |
| 397 if (two_quotes_escape){ // quoted using two single quotes |
| 398 // See documents in anonymous.design |
| 399 str.append('\''); |
| 400 str.append('\''); |
| 401 } else{ |
| 402 quote_escape ? open_quoting() : close_quoting(); |
| 403 //bslash_escape always true here |
| 404 str.append('\\'); |
| 405 str.append('\''); |
| 406 } |
| 407 } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){ |
| 408 quote_escape ? open_quoting() : close_quoting(); |
| 409 if (bslash_escape) str.append('\\'); |
| 410 str.append(c); |
| 411 } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as
literal |
| 412 if (escape_literal){ |
| 413 quote_escape ? open_quoting() : close_quoting(); |
| 414 if (bslash_escape) str.append('\\'); |
| 415 str.append(c); |
| 416 } else { |
| 417 close_quoting(); |
| 418 str.append(c); |
| 419 } |
| 420 } |
| 421 } |
| 422 |
| 423 public: |
| 424 // Return a null-terminate c-string. The buffer is owned by callee. |
| 425 char * operator()(const char * literal /*c-string*/){ |
| 426 str.reset(); |
| 427 for(;*literal != 0; literal++){ |
| 428 append(*literal); |
| 429 } |
| 430 close_quoting(); // P4 exception, to close whole quoting |
| 431 return str; |
| 432 } |
| 433 }; |
| 434 |
| 435 class WeightedRand{ |
| 436 // Return a random number in [0, size) |
| 437 // Every number has different chance (aka weight) to be selected. |
| 438 private: |
| 439 Buffer_int weights; |
| 440 double total; |
| 441 WeightedRand(const WeightedRand &); |
| 442 WeightedRand & operator = (const WeightedRand &); |
| 443 public: |
| 444 WeightedRand(Buffer_int * weight_list = NULL, int size = 0){ |
| 445 if ( weight_list == NULL){ |
| 446 for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT); |
| 447 } else { |
| 448 int s = weight_list->content_size(); |
| 449 if (s < size){ |
| 450 weights.append_array( (*weight_list),s); |
| 451 for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT); |
| 452 } else { // s >= size |
| 453 weights.append_array( (*weight_list),size); |
| 454 } |
| 455 } |
| 456 total = 0; |
| 457 int c = weights.content_size(); |
| 458 for (int i=0; i<c; ++i){ |
| 459 total += weights[i]; |
| 460 } |
| 461 } |
| 462 |
| 463 void append(int weight){ |
| 464 weights.append(weight); |
| 465 total += weight; |
| 466 } |
| 467 |
| 468 // Give a random number with the consideration of weight. |
| 469 // Every random number is associated with a weight. |
| 470 // It identifies the chance to be selected, |
| 471 // larger weight has more chance to be selected. |
| 472 // |
| 473 // |
| 474 // ______________________ every slot has equal chance |
| 475 // |
| 476 // [____][_][___][______] each item has different slots, hence different
chance |
| 477 // |
| 478 // |
| 479 // The algorithms to generate the number is illustrated by preceding figure
. |
| 480 // First, a slot is selected by rand(). Then we translate the slot to corre
sponding item. |
| 481 // |
| 482 int next(){ |
| 483 // get a random in [0,1] |
| 484 double reference_mark = (double)rand() / (double)RAND_MAX; |
| 485 |
| 486 // get the slot's index, 0 <= mark <= total; |
| 487 double mark = total * reference_mark; |
| 488 |
| 489 // translate the slot to corresponding item |
| 490 int i=0; |
| 491 for (;;){ |
| 492 mark -= weights[i]; // 0 <= mark <= total |
| 493 if (mark <= 0) |
| 494 break; |
| 495 i++; |
| 496 } |
| 497 return i; |
| 498 } |
| 499 }; |
| 500 |
| 501 /////////////////////////////////////////////////////////// |
| 502 // |
| 503 // The parser result nodes |
| 504 // |
| 505 |
| 506 class Literal : public Pick { |
| 507 public: |
| 508 virtual const char* next(){ |
| 509 return str; |
| 510 } |
| 511 Literal(const char * s /*c-string*/){ |
| 512 str.append_array(s, strlen(s) + 1); |
| 513 } |
| 514 private: |
| 515 Buffer_char str; //null-terminated c-string |
| 516 }; |
| 517 |
| 518 class Variable : public Pick { |
| 519 public: |
| 520 Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){ |
| 521 this->var_name.append_array(varName, strlen(varName) + 1); |
| 522 if ((symbol_table = symbols)){ |
| 523 symbol_table->put(varName, varRef); |
| 524 } |
| 525 } |
| 526 |
| 527 operator const char *(){ |
| 528 return var_name; |
| 529 } |
| 530 |
| 531 virtual const char* next(){ |
| 532 if (symbol_table){ |
| 533 Pick * var_ref = NULL; |
| 534 symbol_table->find(var_name, &var_ref); |
| 535 if (var_ref) { |
| 536 return var_ref->next(); |
| 537 } |
| 538 } |
| 539 return ""; // dumb string |
| 540 } |
| 541 private: |
| 542 Buffer_char var_name; |
| 543 SymbolTable * symbol_table; |
| 544 }; |
| 545 |
| 546 class Quote : public Pick{ |
| 547 public: |
| 548 Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ON
LY){ |
| 549 } |
| 550 virtual const char* next(){ |
| 551 return e(item.next()); |
| 552 } |
| 553 private: |
| 554 Pick & item; |
| 555 Buffer_char str; |
| 556 Escaper e; |
| 557 }; |
| 558 |
| 559 |
| 560 class Morph : public Pick{ |
| 561 /* |
| 562 The difference between morph and an arbitrary random string is that |
| 563 a morph changes slowly. When we build collation rules, for example, |
| 564 it is a much better test if the strings we use are all in the same |
| 565 'neighborhood'; they share many common characters. |
| 566 */ |
| 567 public: |
| 568 Morph(Pick & base):item(base){} |
| 569 |
| 570 virtual const char* next(){ |
| 571 current.reset(); |
| 572 const char * s = item.next(); |
| 573 current.append_array(s, strlen(s) + 1); |
| 574 if (last.content_size() == 0) { |
| 575 str.reset(); |
| 576 last.reset(); |
| 577 str.append_array(current, current.content_size()); |
| 578 last.append_array(current, current.content_size()); |
| 579 } else { |
| 580 morph(); |
| 581 } |
| 582 return str; |
| 583 } |
| 584 private: |
| 585 Pick & item; |
| 586 Buffer_char str; |
| 587 Buffer_char last; |
| 588 Buffer_char current; |
| 589 |
| 590 char * p_last; |
| 591 char * p_curr; |
| 592 |
| 593 void copy_curr(){ |
| 594 if (*p_curr) { |
| 595 str.append(*p_curr); |
| 596 p_curr++; |
| 597 } |
| 598 } |
| 599 |
| 600 void copy_last(){ |
| 601 if (*p_last) { |
| 602 str.append(*p_last); |
| 603 p_last++; |
| 604 } |
| 605 } |
| 606 |
| 607 // copy 0, 1, or 2 character(s) to str |
| 608 void copy(){ |
| 609 static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5); |
| 610 |
| 611 switch (wr.next()){ |
| 612 case 0: // copy last -- has 10 times chance than others |
| 613 copy_last(); |
| 614 break; |
| 615 case 1: // copy both |
| 616 copy_curr(); |
| 617 copy_last(); |
| 618 break; |
| 619 case 2: // copy both |
| 620 copy_last(); |
| 621 copy_curr(); |
| 622 break; |
| 623 case 3: |
| 624 copy_curr(); |
| 625 break; |
| 626 case 4: // copy nothing |
| 627 break; |
| 628 default: |
| 629 // ASSERT(FALSE); |
| 630 ; |
| 631 } |
| 632 } |
| 633 |
| 634 void morph(void){ |
| 635 int min = strlen(last); |
| 636 int max = strlen(current); |
| 637 if (min > max){ |
| 638 int temp = min; |
| 639 min = max; |
| 640 max = temp; |
| 641 } |
| 642 |
| 643 int len = min + rand()%(max - min + 1); // min + [0, diff] |
| 644 p_curr = current; |
| 645 p_last = last; |
| 646 str.reset(); |
| 647 |
| 648 for (; str.content_size()<len && *p_curr && *p_last;){ |
| 649 copy(); // copy 0, 1, or 2 character(s) to str |
| 650 } |
| 651 |
| 652 if (str.content_size() == len) { |
| 653 str.append(0); |
| 654 final(); |
| 655 return; |
| 656 } |
| 657 |
| 658 if (str.content_size() > len) { // if the last copy copied two character
s |
| 659 str[len]=0; |
| 660 final(); |
| 661 return; |
| 662 } |
| 663 |
| 664 // str.content_size() < len |
| 665 if (*p_last) { |
| 666 for (; str.content_size() < len; copy_last()); |
| 667 } else if (*p_curr){ |
| 668 for (; str.content_size() < len; copy_curr()); |
| 669 } |
| 670 |
| 671 int last_len = last.content_size(); |
| 672 for (;str.content_size() < len;){ |
| 673 str.append(last[rand()%last_len]); |
| 674 } |
| 675 str.append(0); |
| 676 final(); |
| 677 } |
| 678 |
| 679 void final(){ |
| 680 last.reset(); |
| 681 last.append_array(current, current.content_size()); |
| 682 } |
| 683 }; |
| 684 |
| 685 class Sequence : public Pick { |
| 686 public: |
| 687 virtual const char* next(){ |
| 688 str.reset(); |
| 689 int s = items.content_size(); |
| 690 for(int i=0; i < s; i++){ |
| 691 const char * t = items[i]->next(); |
| 692 str.append_array(t, strlen(t)); |
| 693 } |
| 694 str.append(0); // terminal null |
| 695 return str; |
| 696 } |
| 697 |
| 698 void append (Pick * node){ |
| 699 items.append(node); |
| 700 } |
| 701 |
| 702 virtual ~Sequence(){ |
| 703 int s = items.content_size(); |
| 704 for(int i=0; i < s; i++){ |
| 705 //How can assure the item is got from heap? |
| 706 //Let's assume it. |
| 707 delete items[i]; // TOFIX: point alias/recursion problem |
| 708 items[i] = NULL; |
| 709 } |
| 710 } |
| 711 private: |
| 712 Buffer_pPick items; |
| 713 Buffer_char str; //null-terminated c-string |
| 714 }; |
| 715 |
| 716 class Repeat : public Pick { |
| 717 private: |
| 718 Pick * item; |
| 719 Buffer_char str; |
| 720 WeightedRand wr; |
| 721 int min; |
| 722 int max; |
| 723 int select_a_count(){ |
| 724 return min + wr.next(); |
| 725 } |
| 726 public: |
| 727 virtual const char* next(){ |
| 728 str.reset(); |
| 729 int c = select_a_count(); |
| 730 for(int i=0; i< c; i++){ |
| 731 const char * t = item->next(); |
| 732 str.append_array(t, strlen(t)); |
| 733 } |
| 734 str.append(0); |
| 735 return str; |
| 736 } |
| 737 |
| 738 Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights
= NULL): |
| 739 wr(weights, maxCount-minCount +1) { |
| 740 this->item = base; |
| 741 this->min = minCount; |
| 742 this->max = maxCount; |
| 743 } |
| 744 virtual ~Repeat(){ |
| 745 delete item; // TOFIX: point alias/recursion problem |
| 746 item = NULL; |
| 747 } |
| 748 }; |
| 749 |
| 750 |
| 751 class Alternation : public Pick { |
| 752 public: |
| 753 virtual const char* next(){ |
| 754 str.reset(); |
| 755 int i = wr.next(); |
| 756 const char * t = items[i]->next(); |
| 757 str.append_array(t, strlen(t) + 1); |
| 758 return str; |
| 759 } |
| 760 virtual ~Alternation(){ |
| 761 int s = items.content_size(); |
| 762 for(int i=0; i < s; i++){ |
| 763 delete items[i]; // TOFIX: point alias/recursion problem |
| 764 items[i] = NULL; |
| 765 } |
| 766 } |
| 767 |
| 768 Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){ |
| 769 items.append(node); |
| 770 wr.append(weight); |
| 771 return *this; |
| 772 } |
| 773 private: |
| 774 Buffer_pPick items; |
| 775 Buffer_char str; // null-terminated c-string |
| 776 WeightedRand wr; |
| 777 }; |
| 778 |
| 779 /////////////////////////////////////////////////////////// |
| 780 // |
| 781 // The parser |
| 782 // |
| 783 |
| 784 enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LB
RACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT}; |
| 785 |
| 786 class Scanner{ |
| 787 friend int DumpScanner(Scanner & s, UBool dumb); |
| 788 private: |
| 789 const char * source; |
| 790 const char * working; |
| 791 const char * history; // for debug |
| 792 enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLA
SH, IN_BSLASH, IN_STRING, DONE}; |
| 793 StateType state; |
| 794 void terminated(TokenType t){ |
| 795 working--; // return the peeked character |
| 796 tokenType = t; |
| 797 token.append(0); // close buffer |
| 798 state = DONE; |
| 799 } |
| 800 public: |
| 801 // the buffer of "source" is owned by caller |
| 802 Scanner(const char *src/*[in] c-string*/ = NULL):source(src){ |
| 803 working = src; |
| 804 history = working; |
| 805 state = DONE; |
| 806 tokenType = ERROR; |
| 807 } |
| 808 |
| 809 //void setSource(const char *const src /*[in] c-string*/){ |
| 810 // *(&const_cast<const char *>(source)) = src; |
| 811 //} |
| 812 |
| 813 Buffer_char token; |
| 814 TokenType tokenType; |
| 815 |
| 816 TokenType getNextToken(){ |
| 817 token.reset(); |
| 818 state = START; |
| 819 history = working; // for debug |
| 820 while (state != DONE){ |
| 821 char c = *working++; |
| 822 if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE,
IN_ESCAPE |
| 823 terminated(ERROR); |
| 824 break; // while |
| 825 } |
| 826 switch(state){ |
| 827 case START: |
| 828 tokenType = ERROR; |
| 829 switch(c){ |
| 830 case '?' : tokenType = QUESTION; break; |
| 831 case '*' : tokenType = STAR; break; |
| 832 case '+' : tokenType = PLUS; break; |
| 833 case '{' : tokenType = LBRACE; break; |
| 834 case '}' : tokenType = RBRACE; break; |
| 835 case '(' : tokenType = LPAR; break; |
| 836 case ')' : tokenType = RPAR; break; |
| 837 case ';' : tokenType = SEMI; break; |
| 838 case '=' : tokenType = EQ; break; |
| 839 case ',' : tokenType = COMMA; break; |
| 840 case '|' : tokenType = BAR; break; |
| 841 case '@' : tokenType = AT; break; |
| 842 case '~' : tokenType = WAVE; break; |
| 843 case '%' : tokenType = PERCENT; break; |
| 844 case 0 : tokenType = STREAM_END; working-- /*avoid bu
ffer overflow*/; break; |
| 845 } |
| 846 if (tokenType != ERROR){ |
| 847 token.append(c); |
| 848 token.append(0); |
| 849 state = DONE; |
| 850 break; // START |
| 851 } |
| 852 switch(c){ |
| 853 case '$' : state = IN_VAR_FIRST; token.append(c); break
; |
| 854 case '\'' : state = IN_QUOTE; break; |
| 855 case '\\' : state = IN_BSLASH; break; |
| 856 default: |
| 857 if (isWhiteSpace(c)){ // state = START; //do no
thing |
| 858 } else if (isDigit(c)){ state = IN_NUM; token
.append(c); |
| 859 } else if (isAlphabet(c)){ state = IN_STRING; token
.append(c); |
| 860 } else {terminated(ERROR);} |
| 861 } |
| 862 break;//START |
| 863 case IN_NUM: |
| 864 if (isDigit(c)){ |
| 865 token.append(c); |
| 866 } else { |
| 867 terminated(NUMBER); |
| 868 } |
| 869 break;//IN_NUM |
| 870 case IN_VAR_FIRST: |
| 871 if (isAlphabet(c)){ |
| 872 token.append(c); |
| 873 state = IN_VAR; |
| 874 } else { |
| 875 terminated(ERROR); |
| 876 } |
| 877 break; // IN_VAR_FISRT |
| 878 case IN_VAR: |
| 879 if (isAlphabet(c) || isDigit(c)){ |
| 880 token.append(c); |
| 881 } else { |
| 882 terminated(VAR); |
| 883 } |
| 884 break;//IN_VAR |
| 885 case IN_STRING: |
| 886 // About the scanner's behavior for STRING, AT, and ESCAPE: |
| 887 // All of them can be contacted with each other. |
| 888 // This means the scanner will eat up as much as possible st
rings |
| 889 // (STRING, AT, and ESCAPE) at one time, with no regard of
their |
| 890 // combining sequence. |
| 891 // |
| 892 if (c == '\''){ |
| 893 state = IN_QUOTE; // the first time we see single quote |
| 894 } else if (c =='\\'){ // back slash character |
| 895 state = IN_BSLASH; |
| 896 } else if (isAlphabet(c) || isDigit(c)){ |
| 897 token.append(c); |
| 898 } else{ |
| 899 terminated(STRING); |
| 900 } |
| 901 break;//IN_STRING |
| 902 case IN_QUOTE: |
| 903 if (c == '\''){ // the second time we see single quote |
| 904 state = IN_STRING; // see document in IN_STRING |
| 905 } else if ( c== '\\') { // backslah escape in quote |
| 906 state = IN_QUOTE_BSLASH; |
| 907 } else { |
| 908 token.append(c); // eat up everything, includes back sl
ash |
| 909 } |
| 910 break;//IN_QUOTE |
| 911 case IN_QUOTE_BSLASH: |
| 912 case IN_BSLASH: |
| 913 switch (c){ |
| 914 case 'n' : token.append('\n'); break; |
| 915 case 'r' : token.append('\r'); break; |
| 916 case 't' : token.append('\t'); break; |
| 917 case '\'' : token.append('\''); break; |
| 918 case '\\' : token.append('\\'); break; |
| 919 default: token.append(c); // unknown escaping, treat it
as literal |
| 920 } |
| 921 if (state == IN_BSLASH){ |
| 922 state = IN_STRING; // see document in IN_STRING |
| 923 } else { // state == IN_QUOTE_BSLASH |
| 924 state = IN_QUOTE; |
| 925 } |
| 926 break;//IN_BSLASH |
| 927 case DONE: /* should never happen */ |
| 928 default: |
| 929 working--; |
| 930 tokenType = ERROR; |
| 931 state = DONE; |
| 932 break; |
| 933 }//switch(state) |
| 934 }//while (state != DONE) |
| 935 |
| 936 return tokenType; |
| 937 } |
| 938 };//class Scanner |
| 939 |
| 940 class Parser{ |
| 941 friend UBool TestParser(); |
| 942 friend class TestParserT; |
| 943 friend class LanguageGenerator_impl; |
| 944 private: |
| 945 Scanner s; |
| 946 TokenType & token; |
| 947 int min_max; // for the evil infinite |
| 948 |
| 949 UBool match(TokenType expected){ |
| 950 if (token == expected) { |
| 951 token = s.getNextToken(); |
| 952 return TRUE; |
| 953 } else { |
| 954 //s.dumpCurrentPoint(); |
| 955 return FALSE; |
| 956 } |
| 957 } |
| 958 |
| 959 UBool weight(int & value){ |
| 960 if (token == NUMBER){ |
| 961 int temp = atoi(s.token); |
| 962 match(NUMBER); |
| 963 if (match(PERCENT)){ |
| 964 value = temp; |
| 965 return TRUE; |
| 966 } |
| 967 } |
| 968 return FALSE; |
| 969 } |
| 970 |
| 971 UBool repeat (Pick* &node /*in,out*/){ |
| 972 if (node == NULL) return FALSE; |
| 973 |
| 974 int count = -2; |
| 975 int min = -2; |
| 976 int max = -2; |
| 977 UBool question = FALSE; |
| 978 switch (token){ |
| 979 case QUESTION: |
| 980 match(QUESTION); |
| 981 min = 0; |
| 982 max = 1; |
| 983 count = 2; |
| 984 question = TRUE; |
| 985 break; |
| 986 case STAR: |
| 987 match(STAR); |
| 988 min = 0; |
| 989 max = -1; |
| 990 count = -1; |
| 991 break; |
| 992 case PLUS: |
| 993 match(PLUS); |
| 994 min = 1; |
| 995 max = -1; |
| 996 count = -1; |
| 997 break; |
| 998 case LBRACE: |
| 999 match(LBRACE); |
| 1000 if (token != NUMBER){ |
| 1001 return FALSE; |
| 1002 }else { |
| 1003 min = atoi(s.token); |
| 1004 match(NUMBER); |
| 1005 if (token == RBRACE){ |
| 1006 match(RBRACE); |
| 1007 max = min; |
| 1008 count = 1; |
| 1009 } else if (token == COMMA) { |
| 1010 match(COMMA); |
| 1011 if (token == RBRACE){ |
| 1012 match(RBRACE); |
| 1013 max = -1; |
| 1014 count = -1; |
| 1015 } else if (token == NUMBER) { |
| 1016 max = atoi(s.token); |
| 1017 match(NUMBER); |
| 1018 count = max - min + 1; |
| 1019 if (!match(RBRACE)) { |
| 1020 return FALSE; |
| 1021 } |
| 1022 } else { |
| 1023 return FALSE; |
| 1024 } |
| 1025 } else { |
| 1026 return FALSE; |
| 1027 } |
| 1028 } |
| 1029 break; |
| 1030 default: |
| 1031 return FALSE; |
| 1032 } |
| 1033 |
| 1034 if (count == -2 || min == -2 || max == -2){ |
| 1035 //ASSERT(FALSE); |
| 1036 return FALSE; |
| 1037 } |
| 1038 |
| 1039 // eat up following weights |
| 1040 Buffer_int weights; |
| 1041 int w; |
| 1042 while (weight(w)){ |
| 1043 weights.append(w); |
| 1044 } |
| 1045 |
| 1046 // for the evil infinite |
| 1047 min_max = min_max > min ? min_max : min; |
| 1048 min_max = min_max > max ? min_max : max; |
| 1049 if (min_max > PSEUDO_INFINIT){ |
| 1050 return FALSE; // PSEUDO_INFINIT is less than the real maximum |
| 1051 } |
| 1052 if (max == -1){ // the evil infinite |
| 1053 max = PSEUDO_INFINIT; |
| 1054 } |
| 1055 // for the strange question mark |
| 1056 if (question && weights.content_size() > 0){ |
| 1057 Buffer_int w2; |
| 1058 w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]); |
| 1059 node = new Repeat(node,min,max,&w2); |
| 1060 return TRUE; |
| 1061 } |
| 1062 node = new Repeat(node,min,max,&weights); |
| 1063 return TRUE; |
| 1064 } |
| 1065 |
| 1066 UBool core(Pick* &node /*out*/){ |
| 1067 if (node != NULL) return FALSE; //assert node == NULL |
| 1068 |
| 1069 switch(token){ |
| 1070 case LPAR: |
| 1071 match(LPAR); |
| 1072 if(defination(node) && match(RPAR)){ |
| 1073 return TRUE; |
| 1074 } |
| 1075 return FALSE; |
| 1076 case VAR: |
| 1077 node = new Variable(&symbols, s.token); |
| 1078 match(VAR); |
| 1079 return TRUE; |
| 1080 case STRING: |
| 1081 node = new Literal(s.token); |
| 1082 match(STRING); |
| 1083 return TRUE; |
| 1084 default: |
| 1085 return FALSE; |
| 1086 } |
| 1087 } |
| 1088 UBool modified(Pick* &node /*out*/){ |
| 1089 if (node != NULL) return FALSE; //assert node == NULL |
| 1090 |
| 1091 if (!core(node)) { |
| 1092 return FALSE; |
| 1093 } |
| 1094 |
| 1095 for (;;){ |
| 1096 switch(token){ |
| 1097 case WAVE: |
| 1098 match(WAVE); |
| 1099 node = new Morph(*node); |
| 1100 break; |
| 1101 case AT: |
| 1102 match(AT); |
| 1103 node = new Quote(*node); |
| 1104 break; |
| 1105 case QUESTION: |
| 1106 case STAR: |
| 1107 case PLUS: |
| 1108 case LBRACE: |
| 1109 if (!repeat(node)) return FALSE; |
| 1110 break; |
| 1111 case SEMI: // rule definiation closed |
| 1112 case RPAR: // within parenthesis (core closed) |
| 1113 case BAR: // in alternation |
| 1114 case NUMBER: // in alternation, with weight |
| 1115 case LPAR: // in sequence |
| 1116 case VAR: // in sequence |
| 1117 case STRING: // in sequence |
| 1118 return TRUE; |
| 1119 default: |
| 1120 return FALSE; |
| 1121 } |
| 1122 } |
| 1123 } |
| 1124 |
| 1125 |
| 1126 UBool sequence_list(Pick* &node /*in,out*/){ |
| 1127 if (node == NULL) return FALSE; // assert node != NULL |
| 1128 |
| 1129 Sequence* seq = new Sequence(); |
| 1130 Pick * n = node; |
| 1131 |
| 1132 while (token == VAR || token == STRING || token == LPAR){ |
| 1133 seq->append(n); |
| 1134 n = NULL; |
| 1135 if (modified(n)){ |
| 1136 // go on |
| 1137 } else { |
| 1138 goto FAIL; |
| 1139 } |
| 1140 } |
| 1141 |
| 1142 if (token == SEMI || token == RPAR || token == BAR){ |
| 1143 seq->append(n); |
| 1144 node = seq; |
| 1145 return TRUE; |
| 1146 } |
| 1147 FAIL: |
| 1148 delete seq; |
| 1149 return FALSE; |
| 1150 |
| 1151 } |
| 1152 |
| 1153 UBool sequence(Pick* &node /*out*/){ |
| 1154 if (node != NULL) return FALSE; //assert node == NULL |
| 1155 |
| 1156 if (!modified(node)) { |
| 1157 return FALSE; |
| 1158 } |
| 1159 |
| 1160 if (token == VAR || token == STRING || token == LPAR){ |
| 1161 return sequence_list(node); |
| 1162 } else { |
| 1163 return TRUE; // just a modified |
| 1164 } |
| 1165 } |
| 1166 |
| 1167 UBool alternation_list(Pick* &node /*in,out*/){ |
| 1168 if (node == NULL) return FALSE; // assert node != NULL |
| 1169 |
| 1170 Alternation * alt = new Alternation(); |
| 1171 Pick * n = node; |
| 1172 int w = DEFAULT_WEIGHT; |
| 1173 |
| 1174 while (token == NUMBER || token == BAR){ |
| 1175 if(token == NUMBER) { |
| 1176 if (weight(w)){ |
| 1177 if (token == BAR){ |
| 1178 // the middle item, go on |
| 1179 } else { |
| 1180 // the last item or encounter error |
| 1181 break; //while |
| 1182 } |
| 1183 } else { |
| 1184 goto FAIL; |
| 1185 } |
| 1186 } // else token == BAR |
| 1187 match(BAR); |
| 1188 alt->append(n,w); |
| 1189 |
| 1190 n = NULL; |
| 1191 w = DEFAULT_WEIGHT; |
| 1192 if (sequence(n)){ |
| 1193 // go on |
| 1194 } else { |
| 1195 goto FAIL; |
| 1196 } |
| 1197 } |
| 1198 |
| 1199 if (token == SEMI || token == RPAR) { |
| 1200 alt->append(n,w); |
| 1201 node = alt; |
| 1202 return TRUE; |
| 1203 } |
| 1204 FAIL: |
| 1205 delete alt; |
| 1206 return FALSE; |
| 1207 } |
| 1208 |
| 1209 UBool alternation(Pick* &node /*out*/){ |
| 1210 if (node != NULL) return FALSE; //assert node == NULL |
| 1211 |
| 1212 // 'sequence' has higher precedence than 'alternation' |
| 1213 if (!sequence(node)){ |
| 1214 return FALSE; |
| 1215 } |
| 1216 |
| 1217 if (token == BAR || token == NUMBER){ // find a real alternation1, creat
e it. |
| 1218 return alternation_list(node); |
| 1219 } else { |
| 1220 return TRUE; // just a sequence_old |
| 1221 } |
| 1222 } |
| 1223 |
| 1224 |
| 1225 UBool defination(Pick* &node /*out*/){ |
| 1226 if (node != NULL) return FALSE; //assert node == NULL |
| 1227 return alternation(node); |
| 1228 } |
| 1229 |
| 1230 UBool rule(){ |
| 1231 if (token == VAR){ |
| 1232 Buffer_char name; |
| 1233 name.append_array(s.token, strlen(s.token) + 1); |
| 1234 match(VAR); |
| 1235 |
| 1236 if (match(EQ)){ |
| 1237 Pick * t = NULL; |
| 1238 if(defination(t)){ |
| 1239 symbols.put(name, t); |
| 1240 return match(SEMI); |
| 1241 } |
| 1242 } |
| 1243 } |
| 1244 return FALSE; |
| 1245 } |
| 1246 public: |
| 1247 UBool rules(){ |
| 1248 symbols.reset(); |
| 1249 token = s.getNextToken(); |
| 1250 while (rule()){ |
| 1251 } |
| 1252 if (token == STREAM_END){ |
| 1253 return TRUE; |
| 1254 } else { |
| 1255 //s.dumpCurrentPoint(); |
| 1256 return FALSE; |
| 1257 } |
| 1258 } |
| 1259 |
| 1260 public: |
| 1261 SymbolTable symbols; |
| 1262 |
| 1263 Parser(const char *const source):s(source), token(s.tokenType){ |
| 1264 min_max = -2; |
| 1265 } |
| 1266 UBool parse(){ |
| 1267 return rules(); |
| 1268 } |
| 1269 |
| 1270 }; // class Parser |
| 1271 |
| 1272 |
| 1273 /////////////////////////////////////////////////////////// |
| 1274 // |
| 1275 // |
| 1276 // |
| 1277 |
| 1278 int DumpScanner(Scanner & s, UBool dump = TRUE){ |
| 1279 int len = strlen(s.source); |
| 1280 int error_start_offset = s.history - s.source; |
| 1281 if (dump){ |
| 1282 printf("\n=================== DumpScanner ================\n"); |
| 1283 fwrite(s.source, len, 1, stdout); |
| 1284 printf("\n-----parsed-------------------------------------\n"); |
| 1285 fwrite(s.source, s.history - s.source, 1, stdout); |
| 1286 printf("\n-----current------------------------------------\n"); |
| 1287 fwrite(s.history, s.working - s.history, 1, stdout); |
| 1288 printf("\n-----unparsed-----------------------------------\n"); |
| 1289 fwrite(s.working, (s.source + len - s.working), 1, stdout); |
| 1290 printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); |
| 1291 } |
| 1292 return error_start_offset; |
| 1293 } |
| 1294 |
| 1295 class LanguageGenerator_impl{ |
| 1296 public: |
| 1297 LanguageGenerator_impl(const char *const bnf_definition, const char *const t
op_node) |
| 1298 :par(bnf_definition), top_node_name(top_node){ |
| 1299 srand((unsigned)time( NULL )); |
| 1300 } |
| 1301 |
| 1302 LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){ |
| 1303 if (par.parse()){ |
| 1304 if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::H
AS_REF) { |
| 1305 if (par.symbols.is_complete()) { |
| 1306 return LanguageGenerator::OK; |
| 1307 } else { |
| 1308 if (debug) printf("The bnf definition is incomplete.\n"); |
| 1309 return LanguageGenerator::INCOMPLETE; |
| 1310 } |
| 1311 } else { |
| 1312 if (debug) printf("No top node is found.\n"); |
| 1313 return LanguageGenerator::NO_TOP_NODE; |
| 1314 } |
| 1315 } else { |
| 1316 if(debug) { |
| 1317 printf("The bnf definition is wrong\n"); |
| 1318 DumpScanner(par.s, TRUE); |
| 1319 } |
| 1320 return LanguageGenerator::BNF_DEF_WRONG; |
| 1321 } |
| 1322 } |
| 1323 const char * next(){ |
| 1324 return top_node_ref->next(); |
| 1325 } |
| 1326 |
| 1327 private: |
| 1328 Parser par; |
| 1329 const char *const top_node_name; |
| 1330 Pick * top_node_ref; |
| 1331 }; |
| 1332 |
| 1333 LanguageGenerator::LanguageGenerator():lang_gen(NULL){ |
| 1334 } |
| 1335 |
| 1336 LanguageGenerator::~LanguageGenerator(){ |
| 1337 delete lang_gen; |
| 1338 } |
| 1339 |
| 1340 LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bn
f_definition /*in*/, const char *const top_node/*in*/, UBool debug){ |
| 1341 if (lang_gen){ |
| 1342 delete lang_gen; |
| 1343 } |
| 1344 lang_gen = new LanguageGenerator_impl(bnf_definition, top_node); |
| 1345 PARSE_RESULT r = lang_gen->parseBNF(debug); |
| 1346 if (r != OK){ |
| 1347 delete lang_gen; |
| 1348 lang_gen = NULL; |
| 1349 return r; |
| 1350 } else { |
| 1351 return r; |
| 1352 } |
| 1353 } |
| 1354 const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The
buffer is owned by callee. |
| 1355 if (lang_gen){ |
| 1356 return lang_gen->next(); |
| 1357 }else { |
| 1358 return ""; |
| 1359 } |
| 1360 } |
| 1361 |
| 1362 /////////////////////////////////////////////////////////// |
| 1363 // |
| 1364 // The test code for WBNF |
| 1365 // |
| 1366 |
| 1367 #define CALL(fun) \ |
| 1368 if (fun()){ \ |
| 1369 printf("Pass: " #fun "\n");\ |
| 1370 } else { \ |
| 1371 printf("FAILED: !!! " #fun " !!!\n"); \ |
| 1372 } |
| 1373 |
| 1374 #define DUMP_R(fun, var, times) \ |
| 1375 {printf("\n========= " #fun " =============\n"); \ |
| 1376 for (int i=0; i<times; i++) { \ |
| 1377 const char * t = var.next();\ |
| 1378 fwrite(t,strlen(t),1,stdout); \ |
| 1379 printf("\n"); \ |
| 1380 } \ |
| 1381 printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");} |
| 1382 |
| 1383 |
| 1384 |
| 1385 #if TEST_WBNF_TEST |
| 1386 static UBool TestQuote(){ |
| 1387 const char *const str = "This ' A !,z| qq [] .new\tline"; |
| 1388 //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline"; |
| 1389 //// |
| 1390 //// :( we must quote our string to following C syntax |
| 1391 //// cannot type the literal here, it makes our code rather human unread
able |
| 1392 //// very very unconformable! |
| 1393 //// |
| 1394 ///* |
| 1395 //*/ |
| 1396 |
| 1397 //const char *const s1 = "ab'c"; |
| 1398 //const char (* s1_r1) [] = { "ab''c", // ab''c |
| 1399 // "ab\\'c", // ab\'c |
| 1400 // };// |
| 1401 ///* |
| 1402 // . '.' \. |
| 1403 // .. \.\. '.'\. '.'\. '..' // '.''.' wrong |
| 1404 //*/ |
| 1405 |
| 1406 //const char *const s2 = "a..'.b"; // a..'.b |
| 1407 //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b |
| 1408 // ,"a'..\\'.'b" // a'..\'.'b |
| 1409 // ,"a'..'\\''.'b" // a'..'\''.'b |
| 1410 // };// |
| 1411 |
| 1412 //const char *const s3 = "a..\\.b"; // a..\.b |
| 1413 //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b |
| 1414 // ,"a'..'\\\\'.'b" // a'..'\\'.'b |
| 1415 // };// |
| 1416 |
| 1417 // // no catact operation, no choice, must be com
pact |
| 1418 |
| 1419 srand((unsigned)time( NULL )); |
| 1420 |
| 1421 //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC); |
| 1422 Pick *p = new Literal(str); |
| 1423 Quote q(*p); |
| 1424 |
| 1425 DUMP_R(TestQuote, (*p), 1); |
| 1426 DUMP_R(TestQuote, q, 20); |
| 1427 return FALSE; |
| 1428 } |
| 1429 static UBool TestLiteral(){ |
| 1430 const char * s = "test string99."; |
| 1431 Literal n(s); |
| 1432 const char * r = n.next(); |
| 1433 return strcmp(s,r) == 0; |
| 1434 } |
| 1435 |
| 1436 static UBool TestSequence(){ |
| 1437 Sequence seq; |
| 1438 seq.append(new Literal("abc ")); |
| 1439 seq.append(new Literal(", s")); |
| 1440 |
| 1441 return strcmp(seq.next(), "abc , s") == 0; |
| 1442 } |
| 1443 static UBool TestAlternation(){ |
| 1444 srand((unsigned)time( NULL )); |
| 1445 Alternation alt; |
| 1446 alt.append(new Literal("aaa_10%"),10); |
| 1447 alt.append(new Literal("bbb_0%"),0); |
| 1448 alt.append(new Literal("ccc_10%"),10); |
| 1449 alt.append(new Literal("ddddddd_50%"),50); |
| 1450 |
| 1451 DUMP_R(TestAlternation, alt, 50); |
| 1452 |
| 1453 return FALSE; |
| 1454 } |
| 1455 |
| 1456 static UBool TestBuffer(){ |
| 1457 Buffer_int t; |
| 1458 t.append(1).append(0).append(5); |
| 1459 int s = t.content_size(); |
| 1460 for (int i=0; i<s; ++i){ |
| 1461 printf("%d\n", t[i]); |
| 1462 } |
| 1463 return FALSE; |
| 1464 } |
| 1465 |
| 1466 static UBool TestWeightedRand(){ |
| 1467 srand((unsigned)time( NULL )); |
| 1468 Buffer_int t; |
| 1469 t.append(1).append(0).append(5); |
| 1470 WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4); |
| 1471 // WeightedRand wr(&t,3); |
| 1472 for (int i=0; i< 50; ++i){ |
| 1473 printf("%d\n", wr.next()); |
| 1474 } |
| 1475 return FALSE; |
| 1476 } |
| 1477 |
| 1478 static UBool TestRepeat(){ |
| 1479 srand((unsigned)time( NULL )); |
| 1480 Repeat rep(new Literal("aaa1-5 "), 1, 5); |
| 1481 DUMP_R(TestRepeat, rep, 50); |
| 1482 |
| 1483 Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append
(0).append(5)); |
| 1484 DUMP_R(TestRepeat, r2, 50); |
| 1485 |
| 1486 Repeat r3(new Literal("aaa5-5 "), 5, 5); |
| 1487 DUMP_R(TestRepeat, r3, 50); |
| 1488 |
| 1489 return FALSE; |
| 1490 } |
| 1491 |
| 1492 static UBool TestVariable(){ |
| 1493 SymbolTable tab; |
| 1494 Pick * value = new Literal("string1"); |
| 1495 Variable var1(&tab, "x", value); |
| 1496 |
| 1497 Variable var2(&tab, "y"); |
| 1498 // tab.put(var2, value); // TOFIX: point alias/recursion problem |
| 1499 Pick * value2 = new Literal("string2"); |
| 1500 tab.put(var2, value2); |
| 1501 |
| 1502 Pick * value3 = new Literal("string3"); |
| 1503 Variable var3(&tab, "z"); |
| 1504 tab.put("z", value3); |
| 1505 |
| 1506 UBool pass; |
| 1507 pass = strcmp(var1.next(), value->next()) == 0; |
| 1508 pass = pass && strcmp(var2.next(), value2->next()) == 0; |
| 1509 pass = pass && strcmp(var3.next(), value3->next()) == 0; |
| 1510 return pass; |
| 1511 } |
| 1512 |
| 1513 static UBool TestSymbolTable(){ |
| 1514 Literal * n1 = new Literal("string1"); |
| 1515 Literal * n2 = new Literal("string2"); |
| 1516 SymbolTable t; |
| 1517 t.put("abc", n1); |
| 1518 t.put("$aaa", n2); |
| 1519 // t.put("alias", n1); // TOFIX: point alias/recursion problem |
| 1520 t.put("bbb"); |
| 1521 |
| 1522 UBool pass; |
| 1523 pass = t.find(NULL) == SymbolTable::EMPTY; |
| 1524 pass = pass && t.find("ccc") == SymbolTable::NO_VAR; |
| 1525 pass = pass && t.find("bbb") == SymbolTable::NO_REF; |
| 1526 pass = pass && t.find("abc") == SymbolTable::HAS_REF; |
| 1527 pass = pass && t.find("$aaa") == SymbolTable::HAS_REF; |
| 1528 |
| 1529 t.reset(); |
| 1530 pass = pass && t.find("abc") == SymbolTable::NO_VAR; |
| 1531 return pass; |
| 1532 } |
| 1533 |
| 1534 |
| 1535 static UBool TestScanner(void){ |
| 1536 //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};"; |
| 1537 //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "
}", |
| 1538 // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"}; |
| 1539 |
| 1540 const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;"; |
| 1541 const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")"
, "?", "25", "%", ";"}; |
| 1542 |
| 1543 const char *str = str2; |
| 1544 const char (*str_r)[20] = str2_r; |
| 1545 int tokenNum = sizeof(str2_r)/sizeof(char[20]); |
| 1546 |
| 1547 Scanner t(str); |
| 1548 UBool pass = TRUE; |
| 1549 t.getNextToken(); |
| 1550 int i = 0; |
| 1551 while (pass){ |
| 1552 if (t.tokenType == STREAM_END){ |
| 1553 pass = pass? i == tokenNum : FALSE; |
| 1554 break;//while |
| 1555 } else if (t.tokenType == ERROR){ |
| 1556 pass = FALSE; |
| 1557 break;//while |
| 1558 } else { |
| 1559 pass = strcmp( &(t.token[0]), str_r[i++]) == 0; |
| 1560 t.getNextToken(); |
| 1561 } |
| 1562 } |
| 1563 |
| 1564 //const char ts[] = "$commandList = '['" |
| 1565 //" ( alternate ' ' $alternateOptions" |
| 1566 //" | backwards ' 2'" |
| 1567 //" | normalization ' ' $onoff " |
| 1568 //" | caseLevel ' ' $onoff " |
| 1569 //" | hiraganaQ ' ' $onoff" |
| 1570 //" | caseFirst ' ' $caseFirstOptions" |
| 1571 //" | strength ' ' $strengthOptions" |
| 1572 //" ) ']';" ; |
| 1573 |
| 1574 //Scanner t2(ts); |
| 1575 //pass = TRUE; |
| 1576 //do { |
| 1577 // t2.getNextToken(); |
| 1578 // if (t2.tokenType == ERROR){ |
| 1579 // DumpScanner(t2); |
| 1580 // return FALSE; |
| 1581 // } |
| 1582 //}while (t.tokenType != STREAM_END); |
| 1583 |
| 1584 return pass; |
| 1585 } |
| 1586 |
| 1587 class TestParserT { |
| 1588 public: |
| 1589 UBool operator () (const char *const str, const int exp_error_offset = -1, const
UBool dump = TRUE){ |
| 1590 Parser par(str); |
| 1591 if (par.rules()){ |
| 1592 if ( exp_error_offset == -1){ |
| 1593 return TRUE; |
| 1594 }else { |
| 1595 DumpScanner(par.s,dump); |
| 1596 return FALSE; |
| 1597 } |
| 1598 }else { |
| 1599 return DumpScanner(par.s, dump) == exp_error_offset; |
| 1600 } |
| 1601 } |
| 1602 }; |
| 1603 |
| 1604 UBool TestParser(){ |
| 1605 TestParserT test; |
| 1606 |
| 1607 UBool pass = TRUE; |
| 1608 pass = pass && test ("$s = ' ' ? 50%;"); |
| 1609 pass = pass && test("$x = ($var {1,2}) 3%;"); // legal |
| 1610 pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal |
| 1611 pass = pass && test("$x = $var {1,2} 3%;"); // legal |
| 1612 pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal |
| 1613 pass = pass && test("$a = b ? 2% | c 5%;"); // legal |
| 1614 pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5% |
| 1615 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc |
| 1616 pass = pass && test("$x = (b 5%) (c 6%);"); // legal |
| 1617 pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6% |
| 1618 pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%) |
| 1619 pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6% |
| 1620 pass = pass && test("$x = b 5%;"); // legal |
| 1621 pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc |
| 1622 pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal |
| 1623 pass = pass && test("$s = ' ' ? 50% abc;"); // legal |
| 1624 pass = pass && test("$s = a | c d | e f;"); // legal |
| 1625 pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to
check parsed tree?? |
| 1626 |
| 1627 pass = pass && test("$s = ' ' ? 50%;"); |
| 1628 pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"
); |
| 1629 pass = pass && test("$p1 = ($string $s '|' $s)? 25%;"); |
| 1630 pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;"); |
| 1631 pass = pass && test("$rel2 = $p1 $string $s $p2;"); |
| 1632 pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;"); |
| 1633 pass = pass && test("$command = $commandList $crlf;"); |
| 1634 pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 10
0% | $string 10%) $crlf;"); |
| 1635 pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;")
; |
| 1636 pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};"); |
| 1637 |
| 1638 const char collationBNF[] = |
| 1639 "$s = ' '? 50%;" |
| 1640 "$crlf = '\r\n';" |
| 1641 |
| 1642 "$alternateOptions = non'-'ignorable | shifted;" |
| 1643 "$onoff = on | off;" |
| 1644 "$caseFirstOptions = off | upper | lower;" |
| 1645 "$strengthOptions = '1' | '2' | '3' | '4' | 'I';" |
| 1646 "$commandList = '['" |
| 1647 " ( alternate ' ' $alternateOptions" |
| 1648 " | backwards ' 2'" |
| 1649 " | normalization ' ' $onoff " |
| 1650 " | caseLevel ' ' $onoff " |
| 1651 " | hiraganaQ ' ' $onoff" |
| 1652 " | caseFirst ' ' $caseFirstOptions" |
| 1653 " | strength ' ' $strengthOptions" |
| 1654 " ) ']';" |
| 1655 "$command = $commandList $crlf;" |
| 1656 |
| 1657 "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;" |
| 1658 "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;" |
| 1659 "$positionList = '[' (first | last) ' ' $allTypes ']';" |
| 1660 |
| 1661 "$beforeList = '[before ' ('1' | '2' | '3') ']';" |
| 1662 |
| 1663 "$relationList = (" |
| 1664 " '<'" |
| 1665 " | '<<'" |
| 1666 " | ';'" |
| 1667 " | '<<<'" |
| 1668 " | ','" |
| 1669 " | '='" |
| 1670 ");" |
| 1671 "$string = $magic;" |
| 1672 "$rel1 = '[variable top]' $s;" |
| 1673 "$p1 = ($string $s '|' $s)? 25%;" |
| 1674 "$p2 = (\\\\ $s $string $s)? 25%;" |
| 1675 "$rel2 = $p1 $string $s $p2;" |
| 1676 "$relation = $relationList $s ($rel1 | $rel2) $crlf;" |
| 1677 |
| 1678 "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crl
f;" |
| 1679 "$mostRules = $command 1% | $reset 5% | $relation 25%;" |
| 1680 "$root = $command{0,5} $reset $mostRules{1,20};" |
| 1681 ; |
| 1682 |
| 1683 pass = pass && test(collationBNF); |
| 1684 |
| 1685 |
| 1686 return pass; |
| 1687 } |
| 1688 |
| 1689 static UBool TestMorph(){ |
| 1690 srand((unsigned)time( NULL )); |
| 1691 |
| 1692 Alternation * alt = new Alternation(); |
| 1693 |
| 1694 (*alt) |
| 1695 .append(new Literal("a")).append(new Literal("b")).append(new Literal("c")) |
| 1696 .append(new Literal("d")).append(new Literal("e")).append(new Literal("f")) |
| 1697 .append(new Literal("g")).append(new Literal("h")).append(new Literal("i")) |
| 1698 .append(new Literal("j")).append(new Literal("k")).append(new Literal("l")) |
| 1699 .append(new Literal("m")).append(new Literal("n")).append(new Literal("o")) |
| 1700 ; |
| 1701 |
| 1702 Repeat * rep = new Repeat( alt ,5,5 ); |
| 1703 Morph m( *rep); |
| 1704 |
| 1705 // DUMP_R(TestMorph,(*rep),20); |
| 1706 DUMP_R(TestMorph,m,100); |
| 1707 |
| 1708 return FALSE; |
| 1709 } |
| 1710 |
| 1711 #endif |
| 1712 |
| 1713 static UBool TestLanguageGenerator(){ |
| 1714 //LanguageGenerator g; |
| 1715 //const char *const s = "$s = p 0% | q 1%;"; |
| 1716 //g.parseBNF(s, "$s"); |
| 1717 UBool pass; |
| 1718 //= strcmp("q", g.next()) == 0; |
| 1719 |
| 1720 const char *const def = |
| 1721 //"$a = $b;" |
| 1722 //"$b = $c;" |
| 1723 //"$c = $t;" |
| 1724 //"$t = abc $z{1,2};" |
| 1725 //"$k = a | b | c | d | e | f | g ;" |
| 1726 //"$z = q 0% | p 1% | r 1%;" |
| 1727 "$x = a ? 0%;" |
| 1728 ; // end of string |
| 1729 // const char * s = "abczz"; |
| 1730 // |
| 1731 // |
| 1732 LanguageGenerator g; |
| 1733 pass = g.parseBNF(def, "$x",TRUE); |
| 1734 //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode()); |
| 1735 // |
| 1736 if (pass != LanguageGenerator::OK) return FALSE; |
| 1737 |
| 1738 DUMP_R(TestLanguageGenerator, g, 20); |
| 1739 return pass; |
| 1740 |
| 1741 ////UBool pass = strcmp(s,r) == 0; |
| 1742 |
| 1743 //if (pass){ |
| 1744 // printf("TestRandomLanguageGenerator passed.\n"); |
| 1745 //} else { |
| 1746 // printf("TestRandomLanguageGenerator FAILED!!!\n"); |
| 1747 //} |
| 1748 //return pass; |
| 1749 } |
| 1750 |
| 1751 void TestWbnf(void){ |
| 1752 srand((unsigned)time( NULL )); |
| 1753 |
| 1754 //CALL(TestLiteral); |
| 1755 //CALL(TestSequence); |
| 1756 //CALL(TestSymbolTable); |
| 1757 //CALL(TestVariable); |
| 1758 |
| 1759 //TestRepeat(); |
| 1760 //TestAlternation(); |
| 1761 //TestMorph(); |
| 1762 |
| 1763 //TestQuote(); |
| 1764 //TestBuffer(); |
| 1765 //TestWeightedRand(); |
| 1766 |
| 1767 //CALL(TestScanner); |
| 1768 //CALL(TestParser); |
| 1769 CALL(TestLanguageGenerator); |
| 1770 } |
| 1771 |
OLD | NEW |