| OLD | NEW |
| 1 --- source/common/brkeng.cpp» 2007-09-11 20:53:13.000000000 -0700 | 1 --- source/common/brkeng.cpp» 2009-11-11 07:47:22.000000000 -0800 |
| 2 +++ source/common/brkeng.cpp» 2009-07-29 12:57:49.973382000 -0700 | 2 +++ source/common/brkeng.cpp» 2011-01-21 14:12:45.479922000 -0800 |
| 3 @@ -24,6 +24,7 @@ | 3 @@ -226,6 +226,30 @@ |
| 4 #include "umutex.h" | |
| 5 #include "uresimp.h" | |
| 6 #include "ubrkimpl.h" | |
| 7 +#include <stdio.h> | |
| 8 | |
| 9 U_NAMESPACE_BEGIN | |
| 10 | |
| 11 @@ -226,6 +227,30 @@ | |
| 12 case USCRIPT_THAI: | 4 case USCRIPT_THAI: |
| 13 engine = new ThaiBreakEngine(dict, status); | 5 engine = new ThaiBreakEngine(dict, status); |
| 14 break; | 6 break; |
| 15 + | 7 + |
| 16 + case USCRIPT_HANGUL: | 8 + case USCRIPT_HANGUL: |
| 17 + engine = new CjkBreakEngine(dict, kKorean, status); | 9 + engine = new CjkBreakEngine(dict, kKorean, status); |
| 18 + break; | 10 + break; |
| 19 + | 11 + |
| 20 + // use same BreakEngine and dictionary for both Chinese and Japanes
e | 12 + // use same BreakEngine and dictionary for both Chinese and Japanes
e |
| 21 + case USCRIPT_HIRAGANA: | 13 + case USCRIPT_HIRAGANA: |
| (...skipping 10 matching lines...) Expand all Loading... |
| 32 + { | 24 + { |
| 33 + UBlockCode block = ublock_getCode(code); | 25 + UBlockCode block = ublock_getCode(code); |
| 34 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) | 26 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
| 35 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); | 27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
| 36 + break; | 28 + break; |
| 37 + } | 29 + } |
| 38 +#endif | 30 +#endif |
| 39 default: | 31 default: |
| 40 break; | 32 break; |
| 41 } | 33 } |
| 42 @@ -281,6 +306,13 @@ | 34 @@ -281,6 +305,13 @@ |
| 43 dict = NULL; | 35 dict = NULL; |
| 44 } | 36 } |
| 45 return dict; | 37 return dict; |
| 46 + } else if (dictfname != NULL){ | 38 + } else if (dictfname != NULL){ |
| 47 + //create dummy dict if dictionary filename not valid | 39 + //create dummy dict if dictionary filename not valid |
| 48 + UChar c = 0x0020; | 40 + UChar c = 0x0020; |
| 49 + status = U_ZERO_ERROR; | 41 + status = U_ZERO_ERROR; |
| 50 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE)
; | 42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE)
; |
| 51 + mtd->addWord(&c, 1, status, 1); | 43 + mtd->addWord(&c, 1, status, 1); |
| 52 + return new CompactTrieDictionary(*mtd, status); | 44 + return new CompactTrieDictionary(*mtd, status); |
| 53 } | 45 } |
| 54 return NULL; | 46 return NULL; |
| 55 } | 47 } |
| 56 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 | 48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 |
| 57 +++ source/common/dictbe.cpp» 2009-11-11 12:58:40.199829000 -0800 | 49 +++ source/common/dictbe.cpp» 2011-01-21 14:12:45.468928000 -0800 |
| 58 @@ -16,6 +16,11 @@ | 50 @@ -16,6 +16,9 @@ |
| 59 #include "unicode/ubrk.h" | 51 #include "unicode/ubrk.h" |
| 60 #include "uvector.h" | 52 #include "uvector.h" |
| 61 #include "triedict.h" | 53 #include "triedict.h" |
| 62 +#include "uassert.h" | 54 +#include "uassert.h" |
| 63 +#include "unicode/normlzr.h" | 55 +#include "unicode/normlzr.h" |
| 64 +#include "cmemory.h" | 56 +#include "cmemory.h" |
| 65 + | |
| 66 +#include <stdio.h> | |
| 67 | 57 |
| 68 U_NAMESPACE_BEGIN | 58 U_NAMESPACE_BEGIN |
| 69 | 59 |
| 70 @@ -422,6 +427,294 @@ | 60 @@ -422,6 +425,294 @@ |
| 71 return wordsFound; | 61 return wordsFound; |
| 72 } | 62 } |
| 73 | 63 |
| 74 +/* | 64 +/* |
| 75 + ****************************************************************** | 65 + ****************************************************************** |
| 76 + * CjkBreakEngine | 66 + * CjkBreakEngine |
| 77 + */ | 67 + */ |
| 78 +static const uint32_t kuint32max = 0xFFFFFFFF; | 68 +static const uint32_t kuint32max = 0xFFFFFFFF; |
| 79 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu
ageType type, UErrorCode &status) | 69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu
ageType type, UErrorCode &status) |
| 80 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ | 70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ |
| (...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 356 + } | 346 + } |
| 357 + | 347 + |
| 358 + utext_close(&normalizedText); | 348 + utext_close(&normalizedText); |
| 359 + return numBreaks; | 349 + return numBreaks; |
| 360 +} | 350 +} |
| 361 + | 351 + |
| 362 U_NAMESPACE_END | 352 U_NAMESPACE_END |
| 363 | 353 |
| 364 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 365 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 | 355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 |
| 366 +++ source/common/dictbe.h» 2009-07-27 13:01:17.704415000 -0700 | 356 +++ source/common/dictbe.h» 2011-01-21 14:12:45.492920000 -0800 |
| 367 @@ -1,8 +1,8 @@ | 357 @@ -1,8 +1,8 @@ |
| 368 /** | 358 /** |
| 369 - ******************************************************************************
* | 359 - ******************************************************************************
* |
| 370 - * Copyright (C) 2006, International Business Machines Corporation and others.
* | 360 - * Copyright (C) 2006, International Business Machines Corporation and others.
* |
| 371 - * All Rights Reserved.
* | 361 - * All Rights Reserved.
* |
| 372 - ******************************************************************************
* | 362 - ******************************************************************************
* |
| 373 + ******************************************************************************
**** | 363 + ******************************************************************************
**** |
| 374 + * Copyright (C) 2006,2007, International Business Machines Corporation and oth
ers. | 364 + * Copyright (C) 2006-2010, International Business Machines Corporation and oth
ers. |
| 375 + * All Rights Reserved. | 365 + * All Rights Reserved. |
| 376 + ******************************************************************************
**** | 366 + ******************************************************************************
**** |
| 377 */ | 367 */ |
| 378 | 368 |
| 379 #ifndef DICTBE_H | 369 #ifndef DICTBE_H |
| 380 @@ -65,37 +65,37 @@ | 370 @@ -65,31 +65,31 @@ |
| 381 */ | 371 */ |
| 382 virtual ~DictionaryBreakEngine(); | 372 virtual ~DictionaryBreakEngine(); |
| 383 | 373 |
| 384 - /** | 374 - /** |
| 385 - * <p>Indicate whether this engine handles a particular character for | 375 - * <p>Indicate whether this engine handles a particular character for |
| 386 - * a particular kind of break.</p> | 376 - * a particular kind of break.</p> |
| 387 - * | 377 - * |
| 388 - * @param c A character which begins a run that the engine might handle | 378 - * @param c A character which begins a run that the engine might handle |
| 389 - * @param breakType The type of text break which the caller wants to determine | 379 - * @param breakType The type of text break which the caller wants to determine |
| 390 - * @return TRUE if this engine handles the particular character and break | 380 - * @return TRUE if this engine handles the particular character and break |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 423 + * that starts from the first (or last) character in the range. | 413 + * that starts from the first (or last) character in the range. |
| 424 + * @param startPos The start of the run within the supplied text. | 414 + * @param startPos The start of the run within the supplied text. |
| 425 + * @param endPos The end of the run within the supplied text. | 415 + * @param endPos The end of the run within the supplied text. |
| 426 + * @param reverse Whether the caller is looking for breaks in a reverse | 416 + * @param reverse Whether the caller is looking for breaks in a reverse |
| 427 + * direction. | 417 + * direction. |
| 428 + * @param breakType The type of break desired, or -1. | 418 + * @param breakType The type of break desired, or -1. |
| 429 + * @param foundBreaks An allocated C array of the breaks found, if any | 419 + * @param foundBreaks An allocated C array of the breaks found, if any |
| 430 + * @return The number of breaks found. | 420 + * @return The number of breaks found. |
| 431 + */ | 421 + */ |
| 432 virtual int32_t findBreaks( UText *text, | 422 virtual int32_t findBreaks( UText *text, |
| 433 - int32_t startPos, | 423 int32_t startPos, |
| 434 - int32_t endPos, | 424 int32_t endPos, |
| 435 - UBool reverse, | |
| 436 - int32_t breakType, | |
| 437 - UStack &foundBreaks ) const; | |
| 438 + int32_t startPos, | |
| 439 + int32_t endPos, | |
| 440 + UBool reverse, | |
| 441 + int32_t breakType, | |
| 442 + UStack &foundBreaks ) const; | |
| 443 | |
| 444 protected: | |
| 445 | |
| 446 @@ -114,7 +114,7 @@ | 425 @@ -114,7 +114,7 @@ |
| 447 // virtual void setBreakTypes( uint32_t breakTypes ); | 426 // virtual void setBreakTypes( uint32_t breakTypes ); |
| 448 | 427 |
| 449 /** | 428 /** |
| 450 - * <p>Divide up a range of known dictionary characters.</p> | 429 - * <p>Divide up a range of known dictionary characters.</p> |
| 451 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> | 430 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> |
| 452 * | 431 * |
| 453 * @param text A UText representing the text | 432 * @param text A UText representing the text |
| 454 * @param rangeStart The start of the range of dictionary characters | 433 * @param rangeStart The start of the range of dictionary characters |
| 455 @@ -171,7 +171,7 @@ | 434 @@ -171,7 +171,7 @@ |
| 456 | 435 |
| 457 protected: | 436 protected: |
| 458 /** | 437 /** |
| 459 - * <p>Divide up a range of known dictionary characters.</p> | 438 - * <p>Divide up a range of known dictionary characters.</p> |
| 460 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> | 439 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> |
| 461 * | 440 * |
| 462 * @param text A UText representing the text | 441 * @param text A UText representing the text |
| 463 * @param rangeStart The start of the range of dictionary characters | 442 * @param rangeStart The start of the range of dictionary characters |
| 464 @@ -180,12 +180,72 @@ | 443 @@ -186,6 +186,66 @@ |
| 465 * @return The number of breaks found | |
| 466 */ | |
| 467 virtual int32_t divideUpDictionaryRange( UText *text, | |
| 468 - int32_t rangeStart, | |
| 469 - int32_t rangeEnd, | |
| 470 - UStack &foundBreaks ) const; | |
| 471 + int32_t rangeStart, | |
| 472 + int32_t rangeEnd, | |
| 473 + UStack &foundBreaks ) const; | |
| 474 | 444 |
| 475 }; | 445 }; |
| 476 | 446 |
| 477 +/******************************************************************* | 447 +/******************************************************************* |
| 478 + * CjkBreakEngine | 448 + * CjkBreakEngine |
| 479 + */ | 449 + */ |
| 480 + | 450 + |
| 481 +//indicates language/script that the CjkBreakEngine will handle | 451 +//indicates language/script that the CjkBreakEngine will handle |
| 482 +enum LanguageType { | 452 +enum LanguageType { |
| 483 + kKorean, | 453 + kKorean, |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 530 + */ | 500 + */ |
| 531 + virtual int32_t divideUpDictionaryRange( UText *text, | 501 + virtual int32_t divideUpDictionaryRange( UText *text, |
| 532 + int32_t rangeStart, | 502 + int32_t rangeStart, |
| 533 + int32_t rangeEnd, | 503 + int32_t rangeEnd, |
| 534 + UStack &foundBreaks ) const; | 504 + UStack &foundBreaks ) const; |
| 535 + | 505 + |
| 536 +}; | 506 +}; |
| 537 | 507 |
| 538 U_NAMESPACE_END | 508 U_NAMESPACE_END |
| 539 | 509 |
| 540 --- source/common/rbbi.cpp» 2008-09-24 22:48:27.000000000 -0700 | 510 --- source/common/rbbi.cpp» 2010-07-22 17:15:37.000000000 -0700 |
| 541 +++ source/common/rbbi.cpp» 2009-07-27 13:01:17.710416000 -0700 | 511 +++ source/common/rbbi.cpp» 2011-01-21 14:12:45.457938000 -0800 |
| 542 @@ -29,6 +29,7 @@ | 512 @@ -1555,10 +1555,12 @@ |
| 543 | |
| 544 #include "uassert.h" | |
| 545 #include "uvector.h" | |
| 546 +#include <stdio.h> | |
| 547 | |
| 548 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be incl
uded. | |
| 549 #if U_LOCAL_SERVICE_HOOK | |
| 550 @@ -1552,10 +1553,14 @@ | |
| 551 int32_t endPos, | 513 int32_t endPos, |
| 552 UBool reverse) { | 514 UBool reverse) { |
| 553 // Reset the old break cache first. | 515 // Reset the old break cache first. |
| 554 - uint32_t dictionaryCount = fDictionaryCharCount; | 516 - uint32_t dictionaryCount = fDictionaryCharCount; |
| 555 +// uint32_t dictionaryCount = fDictionaryCharCount; | |
| 556 reset(); | 517 reset(); |
| 557 | 518 |
| 558 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { | 519 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { |
| 559 + // note: code segment below assumes that dictionary chars are in the | 520 + // note: code segment below assumes that dictionary chars are in the |
| 560 + // startPos-endPos range | 521 + // startPos-endPos range |
| 561 + // value returned should be next character in sequence | 522 + // value returned should be next character in sequence |
| 562 +// if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { | |
| 563 + if ((endPos - startPos) <= 1) { | 523 + if ((endPos - startPos) <= 1) { |
| 564 return (reverse ? startPos : endPos); | 524 return (reverse ? startPos : endPos); |
| 565 } | 525 } |
| 566 | 526 |
| 567 @@ -1684,7 +1689,7 @@ | 527 @@ -1711,7 +1713,7 @@ |
| 568 // proposed break by one of the breaks we found. Use following() an
d | 528 // proposed break by one of the breaks we found. Use following() an
d |
| 569 // preceding() to do the work. They should never recurse in this ca
se. | 529 // preceding() to do the work. They should never recurse in this ca
se. |
| 570 if (reverse) { | 530 if (reverse) { |
| 571 - return preceding(endPos - 1); | 531 - return preceding(endPos - 1); |
| 572 + return preceding(endPos); | 532 + return preceding(endPos); |
| 573 } | 533 } |
| 574 else { | 534 else { |
| 575 return following(startPos); | 535 return following(startPos); |
| 576 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 | 536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 |
| 577 +++ source/common/triedict.cpp» 2009-07-27 13:01:17.718409000 -0700 | 537 +++ source/common/triedict.cpp» 2011-01-21 14:12:45.271006000 -0800 |
| 578 @@ -20,6 +20,7 @@ | 538 @@ -20,6 +20,7 @@ |
| 579 #include "uvector.h" | 539 #include "uvector.h" |
| 580 #include "uvectr32.h" | 540 #include "uvectr32.h" |
| 581 #include "uarrsort.h" | 541 #include "uarrsort.h" |
| 582 +#include "hash.h" | 542 +#include "hash.h" |
| 583 | 543 |
| 584 //#define DEBUG_TRIE_DICT 1 | 544 //#define DEBUG_TRIE_DICT 1 |
| 585 | 545 |
| 586 @@ -27,6 +28,11 @@ | 546 @@ -27,6 +28,11 @@ |
| 587 #include <sys/times.h> | 547 #include <sys/times.h> |
| (...skipping 18 matching lines...) Expand all Loading... |
| 606 + | 566 + |
| 607 // Node structure for the ternary, uncompressed trie | 567 // Node structure for the ternary, uncompressed trie |
| 608 struct TernaryNode : public UMemory { | 568 struct TernaryNode : public UMemory { |
| 609 UChar ch; // UTF-16 code unit | 569 UChar ch; // UTF-16 code unit |
| 610 @@ -77,7 +88,8 @@ | 570 @@ -77,7 +88,8 @@ |
| 611 delete high; | 571 delete high; |
| 612 } | 572 } |
| 613 | 573 |
| 614 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status
) { | 574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status
) { |
| 615 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, | 575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, |
| 616 + UBool containsValue /* = FALSE */ ) { | 576 + UBool containsValue /* = FALSE */
) { |
| 617 // Start the trie off with something. Having the root node already present | 577 // Start the trie off with something. Having the root node already present |
| 618 // cuts a special case out of the search/insertion functions. | 578 // cuts a special case out of the search/insertion functions. |
| 619 // Making it a median character cuts the worse case for searches from | 579 // Making it a median character cuts the worse case for searches from |
| 620 @@ -91,14 +103,19 @@ | 580 @@ -91,14 +103,19 @@ |
| 621 if (U_SUCCESS(status) && fIter == NULL) { | 581 if (U_SUCCESS(status) && fIter == NULL) { |
| 622 status = U_MEMORY_ALLOCATION_ERROR; | 582 status = U_MEMORY_ALLOCATION_ERROR; |
| 623 } | 583 } |
| 624 + | 584 + |
| 625 + fValued = containsValue; | 585 + fValued = containsValue; |
| 626 } | 586 } |
| 627 | 587 |
| 628 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { | 588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { |
| 629 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, | 589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, |
| 630 + UBool containsValue /* = false */ ) { | 590 + UBool containsValue /* = false */
) { |
| 631 fTrie = NULL; | 591 fTrie = NULL; |
| 632 fIter = utext_openUChars(NULL, NULL, 0, &status); | 592 fIter = utext_openUChars(NULL, NULL, 0, &status); |
| 633 if (U_SUCCESS(status) && fIter == NULL) { | 593 if (U_SUCCESS(status) && fIter == NULL) { |
| 634 status = U_MEMORY_ALLOCATION_ERROR; | 594 status = U_MEMORY_ALLOCATION_ERROR; |
| 635 } | 595 } |
| 636 + | 596 + |
| 637 + fValued = containsValue; | 597 + fValued = containsValue; |
| 638 } | 598 } |
| 639 | 599 |
| 640 MutableTrieDictionary::~MutableTrieDictionary() { | 600 MutableTrieDictionary::~MutableTrieDictionary() { |
| 641 @@ -113,7 +130,8 @@ | 601 @@ -108,12 +125,13 @@ |
| 642 int &count, | 602 |
| 643 int limit, | 603 int32_t |
| 644 TernaryNode *&parent, | 604 MutableTrieDictionary::search( UText *text, |
| 605 - int32_t maxLength, |
| 606 - int32_t *lengths, |
| 607 - int &count, |
| 608 - int limit, |
| 609 - TernaryNode *&parent, |
| 645 - UBool &pMatched ) const { | 610 - UBool &pMatched ) const { |
| 646 + UBool &pMatched, | 611 + int32_t maxLength, |
| 647 + uint16_t *values /*=NULL*/) const { | 612 + int32_t *lengths, |
| 613 + int &count, |
| 614 + int limit, |
| 615 + TernaryNode *&parent, |
| 616 + UBool &pMatched, |
| 617 + uint16_t *values /*=NULL*/) const { |
| 648 // TODO: current implementation works in UTF-16 space | 618 // TODO: current implementation works in UTF-16 space |
| 649 const TernaryNode *up = NULL; | 619 const TernaryNode *up = NULL; |
| 650 const TernaryNode *p = fTrie; | 620 const TernaryNode *p = fTrie; |
| 651 @@ -121,6 +139,10 @@ | 621 @@ -121,6 +139,10 @@ |
| 652 pMatched = TRUE; | 622 pMatched = TRUE; |
| 653 int i; | 623 int i; |
| 654 | 624 |
| 655 + if (!fValued) { | 625 + if (!fValued) { |
| 656 + values = NULL; | 626 + values = NULL; |
| 657 + } | 627 + } |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 693 int count; | 663 int count; |
| 694 @@ -177,7 +204,7 @@ | 664 @@ -177,7 +204,7 @@ |
| 695 matched = search(fIter, length, NULL, count, 0, parent, pMatched); | 665 matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
| 696 | 666 |
| 697 while (matched++ < length) { | 667 while (matched++ < length) { |
| 698 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? | 668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? |
| 699 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? | 669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? |
| 700 U_ASSERT(uc != U_SENTINEL); | 670 U_ASSERT(uc != U_SENTINEL); |
| 701 TernaryNode *newNode = new TernaryNode(uc); | 671 TernaryNode *newNode = new TernaryNode(uc); |
| 702 if (newNode == NULL) { | 672 if (newNode == NULL) { |
| 703 @@ -199,7 +226,11 @@ | 673 @@ -199,30 +226,23 @@ |
| 704 parent = newNode; | 674 parent = newNode; |
| 705 } | 675 } |
| 706 | 676 |
| 707 - parent->flags |= kEndsWord; | 677 - parent->flags |= kEndsWord; |
| 678 -} |
| 679 - |
| 680 -#if 0 |
| 681 -void |
| 682 -MutableTrieDictionary::addWords( UEnumeration *words, |
| 683 - UErrorCode &status ) { |
| 684 - int32_t length; |
| 685 - const UChar *word; |
| 686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status))
{ |
| 687 - addWord(word, length, status); |
| 708 + if(fValued && value > 0){ | 688 + if(fValued && value > 0){ |
| 709 + parent->flags = value; | 689 + parent->flags = value; |
| 710 + } else { | 690 + } else { |
| 711 + parent->flags |= kEndsWord; | 691 + parent->flags |= kEndsWord; |
| 712 + } | 692 } |
| 713 } | 693 } |
| 694 -#endif |
| 714 | 695 |
| 715 #if 0 | 696 int32_t |
| 716 @@ -219,10 +250,11 @@ | 697 MutableTrieDictionary::matches( UText *text, |
| 717 int32_t maxLength, | 698 int32_t maxLength, |
| 718 int32_t *lengths, | 699 int32_t *lengths, |
| 719 int &count, | 700 int &count, |
| 720 - int limit ) const { | 701 - int limit ) const { |
| 721 + int limit, | 702 + int limit, |
| 722 + uint16_t *values /*=NULL*/) const { | 703 + uint16_t *values /*=NULL*/) const { |
| 723 TernaryNode *parent; | 704 TernaryNode *parent; |
| 724 UBool pMatched; | 705 UBool pMatched; |
| 725 - return search(text, maxLength, lengths, count, limit, parent, pMatched); | 706 - return search(text, maxLength, lengths, count, limit, parent, pMatched); |
| 726 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val
ues); | 707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val
ues); |
| 727 } | 708 } |
| 728 | 709 |
| 729 // Implementation of iteration for MutableTrieDictionary | 710 // Implementation of iteration for MutableTrieDictionary |
| 730 @@ -277,7 +309,7 @@ | 711 @@ -277,7 +297,7 @@ |
| 731 break; | 712 break; |
| 732 } | 713 } |
| 733 case kEqual: | 714 case kEqual: |
| 734 - emit = (node->flags & kEndsWord) != 0; | 715 - emit = (node->flags & kEndsWord) != 0; |
| 735 + emit = node->flags > 0; | 716 + emit = node->flags > 0; |
| 736 equal = (node->equal != NULL); | 717 equal = (node->equal != NULL); |
| 737 // If this node should be part of the next emitted string, appe
nd | 718 // If this node should be part of the next emitted string, appe
nd |
| 738 // the UChar to the string, and make sure we pop it when we com
e | 719 // the UChar to the string, and make sure we pop it when we com
e |
| 739 @@ -299,7 +331,7 @@ | 720 @@ -299,7 +319,7 @@ |
| 740 } | 721 } |
| 741 case kGreaterThan: | 722 case kGreaterThan: |
| 742 // If this node's character is in the string, remove it. | 723 // If this node's character is in the string, remove it. |
| 743 - if (node->equal != NULL || (node->flags & kEndsWord)) { | 724 - if (node->equal != NULL || (node->flags & kEndsWord)) { |
| 744 + if (node->equal != NULL || node->flags > 0) { | 725 + if (node->equal != NULL || node->flags > 0) { |
| 745 unistr.truncate(unistr.length()-1); | 726 unistr.truncate(unistr.length()-1); |
| 746 } | 727 } |
| 747 if (node->high != NULL) { | 728 if (node->high != NULL) { |
| 748 @@ -354,12 +386,74 @@ | 729 @@ -354,12 +374,75 @@ |
| 749 * CompactTrieDictionary | 730 * CompactTrieDictionary |
| 750 */ | 731 */ |
| 751 | 732 |
| 752 +//TODO if time permits: minimise size of trie with logprobs by storing values | 733 +//TODO further optimization: |
| 734 +// minimise size of trie with logprobs by storing values |
| 753 +// for terminal nodes directly in offsets[] | 735 +// for terminal nodes directly in offsets[] |
| 754 +// --> calculating from next offset *might* be simpler, but would have to add | 736 +// --> calculating from next offset *might* be simpler, but would have to add |
| 755 +// one last offset for logprob of last node | 737 +// one last offset for logprob of last node |
| 756 +// --> if calculate from current offset, need to factor in possible overflow | 738 +// --> if calculate from current offset, need to factor in possible overflow |
| 757 +// as well. | 739 +// as well. |
| 758 +// idea: store in offset, set first bit to indicate logprob storage-->won't | 740 +// idea: store in offset, set first bit to indicate logprob storage-->won't |
| 759 +// have to access additional node | 741 +// have to access additional node |
| 760 + | 742 + |
| 761 +// {'Dic', 1}, version 1: uses old header, no values | 743 +// {'Dic', 1}, version 1: uses old header, no values |
| 762 +#define COMPACT_TRIE_MAGIC_1 0x44696301 | 744 +#define COMPACT_TRIE_MAGIC_1 0x44696301 |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 814 + offsets = &(header->offsets[0]); | 796 + offsets = &(header->offsets[0]); |
| 815 + address = (uint8_t *)header; | 797 + address = (uint8_t *)header; |
| 816 + } | 798 + } |
| 817 + } | 799 + } |
| 818 + } | 800 + } |
| 819 + | 801 + |
| 820 + ~CompactTrieInfo(){} | 802 + ~CompactTrieInfo(){} |
| 821 }; | 803 }; |
| 822 | 804 |
| 823 // Note that to avoid platform-specific alignment issues, all members of the no
de | 805 // Note that to avoid platform-specific alignment issues, all members of the no
de |
| 824 @@ -375,10 +469,14 @@ | 806 @@ -375,10 +458,14 @@ |
| 825 enum CompactTrieNodeFlags { | 807 enum CompactTrieNodeFlags { |
| 826 kVerticalNode = 0x1000, // This is a vertical node | 808 kVerticalNode = 0x1000, // This is a vertical node |
| 827 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word | 809 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word |
| 828 - kReservedFlag1 = 0x4000, | 810 - kReservedFlag1 = 0x4000, |
| 829 - kReservedFlag2 = 0x8000, | 811 - kReservedFlag2 = 0x8000, |
| 830 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR
eservedFlag1 | 812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR
eservedFlag1 |
| 831 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved
Flag2 | 813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved
Flag2 |
| 832 kCountMask = 0x0FFF, // The count portion of flagscount | 814 kCountMask = 0x0FFF, // The count portion of flagscount |
| 833 - kFlagMask = 0xF000 // The flags portion of flagscount | 815 - kFlagMask = 0xF000 // The flags portion of flagscount |
| 834 + kFlagMask = 0xF000, // The flags portion of flagscount | 816 + kFlagMask = 0xF000, // The flags portion of flagscount |
| 835 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r
oot node | 817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r
oot node |
| 836 + | 818 + |
| 837 + //offset flags: | 819 + //offset flags: |
| 838 + //kOffsetContainsValue = 0x80000000 // Offset contains value for pare
nt node | 820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for pare
nt node |
| 839 }; | 821 }; |
| 840 | 822 |
| 841 // The two node types are distinguished by the kVerticalNode flag. | 823 // The two node types are distinguished by the kVerticalNode flag. |
| 842 @@ -402,63 +500,177 @@ | 824 @@ -402,63 +489,177 @@ |
| 843 uint16_t chars[1]; // Code units | 825 uint16_t chars[1]; // Code units |
| 844 }; | 826 }; |
| 845 | 827 |
| 846 -// {'Dic', 1}, version 1 | 828 -// {'Dic', 1}, version 1 |
| 847 -#define COMPACT_TRIE_MAGIC_1 0x44696301 | 829 -#define COMPACT_TRIE_MAGIC_1 0x44696301 |
| 848 - | 830 - |
| 849 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, | 831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
| 850 UErrorCode &status ) | 832 UErrorCode &status ) |
| 851 : fUData(dataObj) | 833 : fUData(dataObj) |
| 852 { | 834 { |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 907 uint32_t | 889 uint32_t |
| 908 CompactTrieDictionary::dataSize() const { | 890 CompactTrieDictionary::dataSize() const { |
| 909 - return fData->size; | 891 - return fData->size; |
| 910 + return fInfo->size; | 892 + return fInfo->size; |
| 911 } | 893 } |
| 912 | 894 |
| 913 const void * | 895 const void * |
| 914 CompactTrieDictionary::data() const { | 896 CompactTrieDictionary::data() const { |
| 915 - return fData; | 897 - return fData; |
| 916 + return fInfo->address; | 898 + return fInfo->address; |
| 917 } | 899 +} |
| 918 | 900 + |
| 919 -// This function finds the address of a node for us, given its node ID | |
| 920 +//This function finds the address of a node for us, given its node ID | 901 +//This function finds the address of a node for us, given its node ID |
| 921 static inline const CompactTrieNode * | 902 +static inline const CompactTrieNode * |
| 922 -getCompactNode(const CompactTrieHeader *header, uint16_t node) { | |
| 923 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[
node]); | |
| 924 +getCompactNode(const CompactTrieInfo *info, uint32_t node) { | 903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) { |
| 925 + if(node < info->root-1) { | 904 + if(node < info->root-1) { |
| 926 + return (const CompactTrieNode *)(&info->offsets[node]); | 905 + return (const CompactTrieNode *)(&info->offsets[node]); |
| 927 + } else { | 906 + } else { |
| 928 + return (const CompactTrieNode *)(info->address + info->offsets[node]); | 907 + return (const CompactTrieNode *)(info->address + info->offsets[node]); |
| 929 + } | 908 + } |
| 930 +} | 909 } |
| 931 + | 910 |
| 911 -// This function finds the address of a node for us, given its node ID |
| 932 +//this version of getCompactNode is currently only used in compactMutableTrieDi
ctionary() | 912 +//this version of getCompactNode is currently only used in compactMutableTrieDi
ctionary() |
| 933 +static inline const CompactTrieNode * | 913 static inline const CompactTrieNode * |
| 914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) { |
| 915 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[
node]); |
| 934 +getCompactNode(const CompactTrieHeader *header, uint32_t node) { | 916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) { |
| 935 + if(node < header->root-1) { | 917 + if(node < header->root-1) { |
| 936 + return (const CompactTrieNode *)(&header->offsets[node]); | 918 + return (const CompactTrieNode *)(&header->offsets[node]); |
| 937 + } else { | 919 + } else { |
| 938 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs
ets[node]); | 920 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs
ets[node]); |
| 939 + } | 921 + } |
| 940 +} | 922 +} |
| 941 + | 923 + |
| 942 + | 924 + |
| 943 +/** | 925 +/** |
| (...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1030 + } | 1012 + } |
| 1031 + else { | 1013 + else { |
| 1032 + low = middle+1; | 1014 + low = middle+1; |
| 1033 + } | 1015 + } |
| 1034 + } | 1016 + } |
| 1035 + | 1017 + |
| 1036 + return -1; | 1018 + return -1; |
| 1037 } | 1019 } |
| 1038 | 1020 |
| 1039 int32_t | 1021 int32_t |
| 1040 @@ -466,17 +678,38 @@ | 1022 @@ -466,17 +667,38 @@ |
| 1041 int32_t maxLength, | 1023 int32_t maxLength, |
| 1042 int32_t *lengths, | 1024 int32_t *lengths, |
| 1043 int &count, | 1025 int &count, |
| 1044 - int limit ) const { | 1026 - int limit ) const { |
| 1045 + int limit, | 1027 + int limit, |
| 1046 + uint16_t *values /*= NULL*/) const { | 1028 + uint16_t *values /*= NULL*/) const { |
| 1047 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) | 1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) |
| 1048 + values = NULL; | 1030 + values = NULL; |
| 1049 + | 1031 + |
| 1050 // TODO: current implementation works in UTF-16 space | 1032 // TODO: current implementation works in UTF-16 space |
| (...skipping 20 matching lines...) Expand all Loading... |
| 1071 + | 1053 + |
| 1072 while (node != NULL) { | 1054 while (node != NULL) { |
| 1073 // Check if the node we just exited ends a word | 1055 // Check if the node we just exited ends a word |
| 1074 if (limit > 0 && (node->flagscount & kParentEndsWord)) { | 1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
| 1075 + if(values != NULL){ | 1057 + if(values != NULL){ |
| 1076 + values[mycount] = getValue(node); | 1058 + values[mycount] = getValue(node); |
| 1077 + } | 1059 + } |
| 1078 lengths[mycount++] = i; | 1060 lengths[mycount++] = i; |
| 1079 --limit; | 1061 --limit; |
| 1080 } | 1062 } |
| 1081 @@ -487,7 +720,7 @@ | 1063 @@ -487,7 +709,7 @@ |
| 1082 break; | 1064 break; |
| 1083 } | 1065 } |
| 1084 | 1066 |
| 1085 - int nodeCount = (node->flagscount & kCountMask); | 1067 - int nodeCount = (node->flagscount & kCountMask); |
| 1086 + int nodeCount = getCount(node); | 1068 + int nodeCount = getCount(node); |
| 1087 if (nodeCount == 0) { | 1069 if (nodeCount == 0) { |
| 1088 // Special terminal node; return now | 1070 // Special terminal node; return now |
| 1089 break; | 1071 break; |
| 1090 @@ -507,35 +740,27 @@ | 1072 @@ -507,35 +729,27 @@ |
| 1091 // To get here we must have come through the whole list successfull
y; | 1073 // To get here we must have come through the whole list successfull
y; |
| 1092 // go on to the next node. Note that a word cannot end in the middl
e | 1074 // go on to the next node. Note that a word cannot end in the middl
e |
| 1093 // of a vertical node. | 1075 // of a vertical node. |
| 1094 - node = getCompactNode(fData, vnode->equal); | 1076 - node = getCompactNode(fData, vnode->equal); |
| 1095 + node = getCompactNode(fInfo, calcEqualLink(vnode)); | 1077 + node = getCompactNode(fInfo, calcEqualLink(vnode)); |
| 1096 } | 1078 } |
| 1097 else { | 1079 else { |
| 1098 // Horizontal node; do binary search | 1080 // Horizontal node; do binary search |
| 1099 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont
alNode *)node; | 1081 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont
alNode *)node; |
| 1100 - int low = 0; | 1082 - int low = 0; |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1130 + }else{ | 1112 + }else{ |
| 1131 + node = NULL; // If we don't find a match, we'll fall out of
the loop | 1113 + node = NULL; // If we don't find a match, we'll fall out of
the loop |
| 1132 } | 1114 } |
| 1133 } | 1115 } |
| 1134 } | 1116 } |
| 1135 -exit: | 1117 -exit: |
| 1136 + exit: | 1118 + exit: |
| 1137 count = mycount; | 1119 count = mycount; |
| 1138 return i; | 1120 return i; |
| 1139 } | 1121 } |
| 1140 @@ -545,16 +770,16 @@ | 1122 @@ -545,16 +759,16 @@ |
| 1141 private: | 1123 private: |
| 1142 UVector32 fNodeStack; // Stack of nodes to process | 1124 UVector32 fNodeStack; // Stack of nodes to process |
| 1143 UVector32 fIndexStack; // Stack of where in node we are | 1125 UVector32 fIndexStack; // Stack of where in node we are |
| 1144 - const CompactTrieHeader *fHeader; // Trie data | 1126 - const CompactTrieHeader *fHeader; // Trie data |
| 1145 + const CompactTrieInfo *fInfo; // Trie data | 1127 + const CompactTrieInfo *fInfo; // Trie data |
| 1146 | 1128 |
| 1147 public: | 1129 public: |
| 1148 static UClassID U_EXPORT2 getStaticClassID(void); | 1130 static UClassID U_EXPORT2 getStaticClassID(void); |
| 1149 virtual UClassID getDynamicClassID(void) const; | 1131 virtual UClassID getDynamicClassID(void) const; |
| 1150 public: | 1132 public: |
| 1151 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
| 1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
|
| 1152 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) | 1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) |
| 1153 : fNodeStack(status), fIndexStack(status) { | 1135 : fNodeStack(status), fIndexStack(status) { |
| 1154 - fHeader = header; | 1136 - fHeader = header; |
| 1155 - fNodeStack.push(header->root, status); | 1137 - fNodeStack.push(header->root, status); |
| 1156 + fInfo = info; | 1138 + fInfo = info; |
| 1157 + fNodeStack.push(info->root, status); | 1139 + fNodeStack.push(info->root, status); |
| 1158 fIndexStack.push(0, status); | 1140 fIndexStack.push(0, status); |
| 1159 unistr.remove(); | 1141 unistr.remove(); |
| 1160 } | 1142 } |
| 1161 @@ -564,14 +789,14 @@ | 1143 @@ -564,14 +778,14 @@ |
| 1162 | 1144 |
| 1163 virtual StringEnumeration *clone() const { | 1145 virtual StringEnumeration *clone() const { |
| 1164 UErrorCode status = U_ZERO_ERROR; | 1146 UErrorCode status = U_ZERO_ERROR; |
| 1165 - return new CompactTrieEnumeration(fHeader, status); | 1147 - return new CompactTrieEnumeration(fHeader, status); |
| 1166 + return new CompactTrieEnumeration(fInfo, status); | 1148 + return new CompactTrieEnumeration(fInfo, status); |
| 1167 } | 1149 } |
| 1168 | 1150 |
| 1169 virtual const UnicodeString * snext(UErrorCode &status); | 1151 virtual const UnicodeString * snext(UErrorCode &status); |
| 1170 | 1152 |
| 1171 // Very expensive, but this should never be used. | 1153 // Very expensive, but this should never be used. |
| 1172 virtual int32_t count(UErrorCode &status) const { | 1154 virtual int32_t count(UErrorCode &status) const { |
| 1173 - CompactTrieEnumeration counter(fHeader, status); | 1155 - CompactTrieEnumeration counter(fHeader, status); |
| 1174 + CompactTrieEnumeration counter(fInfo, status); | 1156 + CompactTrieEnumeration counter(fInfo, status); |
| 1175 int32_t result = 0; | 1157 int32_t result = 0; |
| 1176 while (counter.snext(status) != NULL && U_SUCCESS(status)) { | 1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
| 1177 ++result; | 1159 ++result; |
| 1178 @@ -582,7 +807,7 @@ | 1160 @@ -582,7 +796,7 @@ |
| 1179 virtual void reset(UErrorCode &status) { | 1161 virtual void reset(UErrorCode &status) { |
| 1180 fNodeStack.removeAllElements(); | 1162 fNodeStack.removeAllElements(); |
| 1181 fIndexStack.removeAllElements(); | 1163 fIndexStack.removeAllElements(); |
| 1182 - fNodeStack.push(fHeader->root, status); | 1164 - fNodeStack.push(fHeader->root, status); |
| 1183 + fNodeStack.push(fInfo->root, status); | 1165 + fNodeStack.push(fInfo->root, status); |
| 1184 fIndexStack.push(0, status); | 1166 fIndexStack.push(0, status); |
| 1185 unistr.remove(); | 1167 unistr.remove(); |
| 1186 } | 1168 } |
| 1187 @@ -595,26 +820,34 @@ | 1169 @@ -595,26 +809,34 @@ |
| 1188 if (fNodeStack.empty() || U_FAILURE(status)) { | 1170 if (fNodeStack.empty() || U_FAILURE(status)) { |
| 1189 return NULL; | 1171 return NULL; |
| 1190 } | 1172 } |
| 1191 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); | 1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 1192 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); | 1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 1193 int where = fIndexStack.peeki(); | 1175 int where = fIndexStack.peeki(); |
| 1194 while (!fNodeStack.empty() && U_SUCCESS(status)) { | 1176 while (!fNodeStack.empty() && U_SUCCESS(status)) { |
| 1195 - int nodeCount = (node->flagscount & kCountMask); | 1177 - int nodeCount = (node->flagscount & kCountMask); |
| 1196 + int nodeCount; | 1178 + int nodeCount; |
| 1197 + | 1179 + |
| (...skipping 20 matching lines...) Expand all Loading... |
| 1218 if (where == 0) { | 1200 if (where == 0) { |
| 1219 // Going down | 1201 // Going down |
| 1220 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount)
; | 1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount)
; |
| 1221 + unistr.append((const UChar *)vnode->chars, nodeCount); | 1203 + unistr.append((const UChar *)vnode->chars, nodeCount); |
| 1222 fIndexStack.setElementAt(1, fIndexStack.size()-1); | 1204 fIndexStack.setElementAt(1, fIndexStack.size()-1); |
| 1223 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st
atus)); | 1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st
atus)); |
| 1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod
e), status)); | 1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod
e), status)); |
| 1225 where = fIndexStack.push(0, status); | 1207 where = fIndexStack.push(0, status); |
| 1226 goingDown = TRUE; | 1208 goingDown = TRUE; |
| 1227 } | 1209 } |
| 1228 @@ -623,7 +856,7 @@ | 1210 @@ -623,7 +845,7 @@ |
| 1229 unistr.truncate(unistr.length()-nodeCount); | 1211 unistr.truncate(unistr.length()-nodeCount); |
| 1230 fNodeStack.popi(); | 1212 fNodeStack.popi(); |
| 1231 fIndexStack.popi(); | 1213 fIndexStack.popi(); |
| 1232 - node = getCompactNode(fHeader, fNodeStack.peeki()); | 1214 - node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 1233 + node = getCompactNode(fInfo, fNodeStack.peeki()); | 1215 + node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 1234 where = fIndexStack.peeki(); | 1216 where = fIndexStack.peeki(); |
| 1235 } | 1217 } |
| 1236 } | 1218 } |
| 1237 @@ -638,7 +871,7 @@ | 1219 @@ -638,7 +860,7 @@ |
| 1238 // Push on next node | 1220 // Push on next node |
| 1239 unistr.append((UChar)hnode->entries[where].ch); | 1221 unistr.append((UChar)hnode->entries[where].ch); |
| 1240 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); | 1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
| 1241 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w
here].equal, status)); | 1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w
here].equal, status)); |
| 1242 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod
e, where, nodeCount), status)); | 1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod
e, where, nodeCount), status)); |
| 1243 where = fIndexStack.push(0, status); | 1225 where = fIndexStack.push(0, status); |
| 1244 goingDown = TRUE; | 1226 goingDown = TRUE; |
| 1245 } | 1227 } |
| 1246 @@ -646,12 +879,14 @@ | 1228 @@ -646,12 +868,14 @@ |
| 1247 // Going up | 1229 // Going up |
| 1248 fNodeStack.popi(); | 1230 fNodeStack.popi(); |
| 1249 fIndexStack.popi(); | 1231 fIndexStack.popi(); |
| 1250 - node = getCompactNode(fHeader, fNodeStack.peeki()); | 1232 - node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 1251 + node = getCompactNode(fInfo, fNodeStack.peeki()); | 1233 + node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 1252 where = fIndexStack.peeki(); | 1234 where = fIndexStack.peeki(); |
| 1253 } | 1235 } |
| 1254 } | 1236 } |
| 1255 + | 1237 + |
| 1256 // Check if the parent of the node we've just gone down to ends a | 1238 // Check if the parent of the node we've just gone down to ends a |
| 1257 // word. If so, return it. | 1239 // word. If so, return it. |
| 1258 + // The root node should never end up here. | 1240 + // The root node should never end up here. |
| 1259 if (goingDown && (node->flagscount & kParentEndsWord)) { | 1241 if (goingDown && (node->flagscount & kParentEndsWord)) { |
| 1260 return &unistr; | 1242 return &unistr; |
| 1261 } | 1243 } |
| 1262 @@ -664,7 +899,7 @@ | 1244 @@ -664,7 +888,7 @@ |
| 1263 if (U_FAILURE(status)) { | 1245 if (U_FAILURE(status)) { |
| 1264 return NULL; | 1246 return NULL; |
| 1265 } | 1247 } |
| 1266 - return new CompactTrieEnumeration(fData, status); | 1248 - return new CompactTrieEnumeration(fData, status); |
| 1267 + return new CompactTrieEnumeration(fInfo, status); | 1249 + return new CompactTrieEnumeration(fInfo, status); |
| 1268 } | 1250 } |
| 1269 | 1251 |
| 1270 // | 1252 // |
| 1271 @@ -672,21 +907,36 @@ | 1253 @@ -672,21 +896,36 @@ |
| 1272 // and back again | 1254 // and back again |
| 1273 // | 1255 // |
| 1274 | 1256 |
| 1275 -// Helper classes to construct the compact trie | 1257 -// Helper classes to construct the compact trie |
| 1276 +enum CompactTrieNodeType { | 1258 +enum CompactTrieNodeType { |
| 1277 + kHorizontalType = 0, | 1259 + kHorizontalType = 0, |
| 1278 + kVerticalType = 1, | 1260 + kVerticalType = 1, |
| 1279 + kValueType = 2 | 1261 + kValueType = 2 |
| 1280 +}; | 1262 +}; |
| 1281 + | 1263 + |
| (...skipping 22 matching lines...) Expand all Loading... |
| 1304 fParentEndsWord = parentEndsWord; | 1286 fParentEndsWord = parentEndsWord; |
| 1305 fHasDuplicate = FALSE; | 1287 fHasDuplicate = FALSE; |
| 1306 - fVertical = vertical; | 1288 - fVertical = vertical; |
| 1307 + fNodeType = nodeType; | 1289 + fNodeType = nodeType; |
| 1308 + fEqualOverflows = FALSE; | 1290 + fEqualOverflows = FALSE; |
| 1309 fNodeID = nodes.size(); | 1291 fNodeID = nodes.size(); |
| 1310 + fValue = parentEndsWord? value : 0; | 1292 + fValue = parentEndsWord? value : 0; |
| 1311 nodes.push(this, status); | 1293 nodes.push(this, status); |
| 1312 } | 1294 } |
| 1313 | 1295 |
| 1314 @@ -694,87 +944,225 @@ | 1296 @@ -694,87 +933,225 @@ |
| 1315 } | 1297 } |
| 1316 | 1298 |
| 1317 virtual uint32_t size() { | 1299 virtual uint32_t size() { |
| 1318 - return sizeof(uint16_t); | 1300 - return sizeof(uint16_t); |
| 1319 + if(fValue > 0) | 1301 + if(fValue > 0) |
| 1320 + return sizeof(uint16_t) * 2; | 1302 + return sizeof(uint16_t) * 2; |
| 1321 + else | 1303 + else |
| 1322 + return sizeof(uint16_t); | 1304 + return sizeof(uint16_t); |
| 1323 } | 1305 } |
| 1324 | 1306 |
| (...skipping 221 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1546 + // append 16 bits of to end for equal node if fEqualOverflows | 1528 + // append 16 bits of to end for equal node if fEqualOverflows |
| 1547 + if (fEqualOverflows) { | 1529 + if (fEqualOverflows) { |
| 1548 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode
ID) >> 16); | 1530 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode
ID) >> 16); |
| 1549 + offset += sizeof(uint16_t); | 1531 + offset += sizeof(uint16_t); |
| 1550 + } | 1532 + } |
| 1551 + | 1533 + |
| 1552 + BuildCompactTrieNode::writeValue(bytes, offset); | 1534 + BuildCompactTrieNode::writeValue(bytes, offset); |
| 1553 } | 1535 } |
| 1554 | 1536 |
| 1555 void addChar(UChar ch) { | 1537 void addChar(UChar ch) { |
| 1556 @@ -784,60 +1172,85 @@ | 1538 @@ -784,60 +1161,85 @@ |
| 1557 void setLink(BuildCompactTrieNode *node) { | 1539 void setLink(BuildCompactTrieNode *node) { |
| 1558 fEqual = node; | 1540 fEqual = node; |
| 1559 } | 1541 } |
| 1560 + | 1542 + |
| 1561 }; | 1543 }; |
| 1562 | 1544 |
| 1563 // Forward declaration | 1545 // Forward declaration |
| 1564 static void walkHorizontal(const TernaryNode *node, | 1546 static void walkHorizontal(const TernaryNode *node, |
| 1565 BuildCompactTrieHorizontalNode *building, | 1547 BuildCompactTrieHorizontalNode *building, |
| 1566 UStack &nodes, | 1548 UStack &nodes, |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1644 + } else { | 1626 + } else { |
| 1645 vResult->setLink((BuildCompactTrieNode *)nodes[1]); | 1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
| 1646 } | 1628 } |
| 1647 } | 1629 } |
| 1648 else { | 1630 else { |
| 1649 - vResult->setLink(compactOneNode(node, endsWord, nodes, status))
; | 1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status))
; |
| 1650 + vResult->setLink(compactOneNode(node, endsWord, nodes, status,
values, value)); | 1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status,
values, value)); |
| 1651 } | 1633 } |
| 1652 result = vResult; | 1634 result = vResult; |
| 1653 } | 1635 } |
| 1654 @@ -849,19 +1262,28 @@ | 1636 @@ -849,19 +1251,28 @@ |
| 1655 // Uses recursion. | 1637 // Uses recursion. |
| 1656 | 1638 |
| 1657 static void walkHorizontal(const TernaryNode *node, | 1639 static void walkHorizontal(const TernaryNode *node, |
| 1658 - BuildCompactTrieHorizontalNode *building, | 1640 - BuildCompactTrieHorizontalNode *building, |
| 1659 - UStack &nodes, | 1641 - UStack &nodes, |
| 1660 - UErrorCode &status) { | 1642 - UErrorCode &status) { |
| 1661 + BuildCompactTrieHorizontalNode *building, | 1643 + BuildCompactTrieHorizontalNode *building, |
| 1662 + UStack &nodes, | 1644 + UStack &nodes, |
| 1663 + UErrorCode &status, Hashtable *values = NULL) { | 1645 + UErrorCode &status, Hashtable *values = NULL) { |
| 1664 while (U_SUCCESS(status) && node != NULL) { | 1646 while (U_SUCCESS(status) && node != NULL) { |
| 1665 if (node->low != NULL) { | 1647 if (node->low != NULL) { |
| 1666 - walkHorizontal(node->low, building, nodes, status); | 1648 - walkHorizontal(node->low, building, nodes, status); |
| 1667 + walkHorizontal(node->low, building, nodes, status, values); | 1649 + walkHorizontal(node->low, building, nodes, status, values); |
| 1668 } | 1650 } |
| 1669 BuildCompactTrieNode *link = NULL; | 1651 BuildCompactTrieNode *link = NULL; |
| 1670 if (node->equal != NULL) { | 1652 if (node->equal != NULL) { |
| 1671 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0,
nodes, status); | 1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0,
nodes, status); |
| 1672 + link = compactOneNode(node->equal, node->flags > 0, nodes, status,
values, node->flags); | 1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status,
values, node->flags); |
| 1673 } | 1655 } |
| 1674 - else if (node->flags & kEndsWord) { | 1656 - else if (node->flags & kEndsWord) { |
| 1675 - link = (BuildCompactTrieNode *)nodes[1]; | 1657 - link = (BuildCompactTrieNode *)nodes[1]; |
| 1676 + else if (node->flags > 0) { | 1658 + else if (node->flags > 0) { |
| 1677 + if(values != NULL) { | 1659 + if(values != NULL) { |
| 1678 + UnicodeString key(node->flags); //store value as a single-char
UnicodeString | 1660 + UnicodeString key(node->flags); //store value as a single-char
UnicodeString |
| 1679 + link = (BuildCompactTrieValueNode *) values->get(key); | 1661 + link = (BuildCompactTrieValueNode *) values->get(key); |
| 1680 + if(link == NULL) { | 1662 + if(link == NULL) { |
| 1681 + link = new BuildCompactTrieValueNode(nodes, status, node->f
lags); //take out nodes? | 1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f
lags); //take out nodes? |
| 1682 + values->put(key, link, status); | 1664 + values->put(key, link, status); |
| 1683 + } | 1665 + } |
| 1684 + } else { | 1666 + } else { |
| 1685 + link = (BuildCompactTrieNode *)nodes[1]; | 1667 + link = (BuildCompactTrieNode *)nodes[1]; |
| 1686 + } | 1668 + } |
| 1687 } | 1669 } |
| 1688 if (U_SUCCESS(status) && link != NULL) { | 1670 if (U_SUCCESS(status) && link != NULL) { |
| 1689 building->addNode(node->ch, link, status); | 1671 building->addNode(node->ch, link, status); |
| 1690 @@ -881,13 +1303,15 @@ | 1672 @@ -881,13 +1292,15 @@ |
| 1691 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ | 1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ |
| 1692 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; | 1674 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
| 1693 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; | 1675 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
| 1694 + | 1676 + |
| 1695 // Check for comparing a node to itself, to avoid spurious duplicates | 1677 // Check for comparing a node to itself, to avoid spurious duplicates |
| 1696 if (left == right) { | 1678 if (left == right) { |
| 1697 return 0; | 1679 return 0; |
| 1698 } | 1680 } |
| 1699 + | 1681 + |
| 1700 // Most significant is type of node. Can never coalesce. | 1682 // Most significant is type of node. Can never coalesce. |
| 1701 - if (left->fVertical != right->fVertical) { | 1683 - if (left->fVertical != right->fVertical) { |
| 1702 - return left->fVertical - right->fVertical; | 1684 - return left->fVertical - right->fVertical; |
| 1703 + if (left->fNodeType != right->fNodeType) { | 1685 + if (left->fNodeType != right->fNodeType) { |
| 1704 + return left->fNodeType - right->fNodeType; | 1686 + return left->fNodeType - right->fNodeType; |
| 1705 } | 1687 } |
| 1706 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. | 1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
| 1707 if (left->fParentEndsWord != right->fParentEndsWord) { | 1689 if (left->fParentEndsWord != right->fParentEndsWord) { |
| 1708 @@ -898,12 +1322,19 @@ | 1690 @@ -898,12 +1311,19 @@ |
| 1709 if (result != 0) { | 1691 if (result != 0) { |
| 1710 return result; | 1692 return result; |
| 1711 } | 1693 } |
| 1712 + | 1694 + |
| 1713 + // If the node value differs, we should not coalesce. | 1695 + // If the node value differs, we should not coalesce. |
| 1714 + // If values aren't stored, all fValues should be 0. | 1696 + // If values aren't stored, all fValues should be 0. |
| 1715 + if (left->fValue != right->fValue) { | 1697 + if (left->fValue != right->fValue) { |
| 1716 + return left->fValue - right->fValue; | 1698 + return left->fValue - right->fValue; |
| 1717 + } | 1699 + } |
| 1718 + | 1700 + |
| 1719 // We know they're both the same node type, so branch for the two cases. | 1701 // We know they're both the same node type, so branch for the two cases. |
| 1720 - if (left->fVertical) { | 1702 - if (left->fVertical) { |
| 1721 + if (left->fNodeType == kVerticalType) { | 1703 + if (left->fNodeType == kVerticalType) { |
| 1722 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID | 1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
| 1723 - - ((BuildCompactTrieVerticalNode *)right)->fEqual->
fNodeID; | 1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual->
fNodeID; |
| 1724 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; | 1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
| 1725 } | 1707 } |
| 1726 - else { | 1708 - else { |
| 1727 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ | 1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ |
| 1728 // We need to compare the links vectors. They should be the | 1710 // We need to compare the links vectors. They should be the |
| 1729 // same size because the strings were equal. | 1711 // same size because the strings were equal. |
| 1730 // We compare the node IDs instead of the pointers, to handle | 1712 // We compare the node IDs instead of the pointers, to handle |
| 1731 @@ -914,9 +1345,10 @@ | 1713 @@ -914,9 +1334,10 @@ |
| 1732 int32_t count = hleft->fLinks.size(); | 1714 int32_t count = hleft->fLinks.size(); |
| 1733 for (int32_t i = 0; i < count && result == 0; ++i) { | 1715 for (int32_t i = 0; i < count && result == 0; ++i) { |
| 1734 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - | 1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
| 1735 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | 1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| 1736 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | 1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| 1737 } | 1719 } |
| 1738 } | 1720 } |
| 1739 + | 1721 + |
| 1740 // If they are equal to each other, mark them (speeds coalescing) | 1722 // If they are equal to each other, mark them (speeds coalescing) |
| 1741 if (result == 0) { | 1723 if (result == 0) { |
| 1742 left->fHasDuplicate = TRUE; | 1724 left->fHasDuplicate = TRUE; |
| 1743 @@ -1031,20 +1463,25 @@ | 1725 @@ -1031,20 +1452,25 @@ |
| 1744 // Add node 0, used as the NULL pointer/sentinel. | 1726 // Add node 0, used as the NULL pointer/sentinel. |
| 1745 nodes.addElement((int32_t)0, status); | 1727 nodes.addElement((int32_t)0, status); |
| 1746 | 1728 |
| 1747 + Hashtable *values = NULL; // Index of (unique) va
lues | 1729 + Hashtable *values = NULL; // Index of (unique) va
lues |
| 1748 + if (dict.fValued) { | 1730 + if (dict.fValued) { |
| 1749 + values = new Hashtable(status); | 1731 + values = new Hashtable(status); |
| 1750 + } | 1732 + } |
| 1751 + | 1733 + |
| 1752 // Start by creating the special empty node we use to indicate that the par
ent | 1734 // Start by creating the special empty node we use to indicate that the par
ent |
| 1753 // terminates a word. This must be node 1, because the builder assumes | 1735 // terminates a word. This must be node 1, because the builder assumes |
| 1754 - // that. | 1736 - // that. |
| 1755 + // that. This node will never be used for tries storing numerical values. | 1737 + // that. This node will never be used for tries storing numerical values. |
| 1756 if (U_FAILURE(status)) { | 1738 if (U_FAILURE(status)) { |
| 1757 return NULL; | 1739 return NULL; |
| 1758 } | 1740 } |
| 1759 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node
s, status); | 1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node
s, status); |
| 1760 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal
Type, nodes, status); | 1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal
Type, nodes, status); |
| 1761 if (terminal == NULL) { | 1743 if (terminal == NULL) { |
| 1762 status = U_MEMORY_ALLOCATION_ERROR; | 1744 status = U_MEMORY_ALLOCATION_ERROR; |
| 1763 } | 1745 } |
| 1764 | 1746 |
| 1765 // This call does all the work of building the new trie structure. The root | 1747 // This call does all the work of building the new trie structure. The root |
| 1766 - // will be node 2. | 1748 - // will be node 2. |
| 1767 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s); | 1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s); |
| 1768 + // will have node ID 2 before writing to memory. | 1750 + // will have node ID 2 before writing to memory. |
| 1769 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s, values); | 1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s, values); |
| 1770 #ifdef DEBUG_TRIE_DICT | 1752 #ifdef DEBUG_TRIE_DICT |
| 1771 (void) ::times(&timing); | 1753 (void) ::times(&timing); |
| 1772 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", | 1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
| 1773 @@ -1077,21 +1514,37 @@ | 1755 @@ -1077,21 +1503,37 @@ |
| 1774 return NULL; | 1756 return NULL; |
| 1775 } | 1757 } |
| 1776 | 1758 |
| 1777 + //map terminal value nodes | 1759 + //map terminal value nodes |
| 1778 + int valueCount = 0; | 1760 + int valueCount = 0; |
| 1779 + UVector valueNodes(status); | 1761 + UVector valueNodes(status); |
| 1780 + if(values != NULL) { | 1762 + if(values != NULL) { |
| 1781 + valueCount = values->count(); //number of unique terminal value nodes | 1763 + valueCount = values->count(); //number of unique terminal value nodes |
| 1782 + } | 1764 + } |
| 1783 + | 1765 + |
| (...skipping 23 matching lines...) Expand all Loading... |
| 1807 } | 1789 } |
| 1808 - | 1790 - |
| 1809 - // Check for overflowing 16 bits worth of nodes. | 1791 - // Check for overflowing 16 bits worth of nodes. |
| 1810 - if (nodeCount > 0x10000) { | 1792 - if (nodeCount > 0x10000) { |
| 1811 + | 1793 + |
| 1812 + // Check for overflowing 20 bits worth of nodes. | 1794 + // Check for overflowing 20 bits worth of nodes. |
| 1813 + if (nodeCount > 0x100000) { | 1795 + if (nodeCount > 0x100000) { |
| 1814 status = U_ILLEGAL_ARGUMENT_ERROR; | 1796 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1815 return NULL; | 1797 return NULL; |
| 1816 } | 1798 } |
| 1817 @@ -1111,9 +1564,14 @@ | 1799 @@ -1111,9 +1553,14 @@ |
| 1818 status = U_MEMORY_ALLOCATION_ERROR; | 1800 status = U_MEMORY_ALLOCATION_ERROR; |
| 1819 return NULL; | 1801 return NULL; |
| 1820 } | 1802 } |
| 1821 - | 1803 - |
| 1822 + | 1804 + |
| 1823 CompactTrieHeader *header = (CompactTrieHeader *)bytes; | 1805 CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
| 1824 - header->size = totalSize; | 1806 - header->size = totalSize; |
| 1825 + //header->size = totalSize; | 1807 + //header->size = totalSize; |
| 1826 + if(dict.fValued){ | 1808 + if(dict.fValued){ |
| 1827 + header->magic = COMPACT_TRIE_MAGIC_3; | 1809 + header->magic = COMPACT_TRIE_MAGIC_3; |
| 1828 + } else { | 1810 + } else { |
| 1829 + header->magic = COMPACT_TRIE_MAGIC_2; | 1811 + header->magic = COMPACT_TRIE_MAGIC_2; |
| 1830 + } | 1812 + } |
| 1831 header->nodeCount = nodeCount; | 1813 header->nodeCount = nodeCount; |
| 1832 header->offsets[0] = 0; // Sentinel | 1814 header->offsets[0] = 0; // Sentinel |
| 1833 header->root = translate.elementAti(root->fNodeID); | 1815 header->root = translate.elementAti(root->fNodeID); |
| 1834 @@ -1123,23 +1581,40 @@ | 1816 @@ -1123,23 +1570,40 @@ |
| 1835 } | 1817 } |
| 1836 #endif | 1818 #endif |
| 1837 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin
t32_t)); | 1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin
t32_t)); |
| 1838 - nodeCount = 1; | 1820 - nodeCount = 1; |
| 1839 + nodeCount = valueCount + 1; | 1821 + nodeCount = valueCount + 1; |
| 1840 + | 1822 + |
| 1841 + // Write terminal value nodes to memory | 1823 + // Write terminal value nodes to memory |
| 1842 + for (i=0; i < valueNodes.size(); i++) { | 1824 + for (i=0; i < valueNodes.size(); i++) { |
| 1843 + //header->offsets[i + 1] = offset; | 1825 + //header->offsets[i + 1] = offset; |
| 1844 + uint32_t tmpOffset = 0; | 1826 + uint32_t tmpOffset = 0; |
| (...skipping 23 matching lines...) Expand all Loading... |
| 1868 fprintf(stderr, "Trie built, time user %f system %f\n", | 1850 fprintf(stderr, "Trie built, time user %f system %f\n", |
| 1869 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1870 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1871 previous = timing; | 1853 previous = timing; |
| 1872 fprintf(stderr, "Final offset is %d\n", offset); | 1854 fprintf(stderr, "Final offset is %d\n", offset); |
| 1873 - | 1855 - |
| 1874 + | 1856 + |
| 1875 // Collect statistics on node types and sizes | 1857 // Collect statistics on node types and sizes |
| 1876 int hCount = 0; | 1858 int hCount = 0; |
| 1877 int vCount = 0; | 1859 int vCount = 0; |
| 1878 @@ -1148,68 +1623,85 @@ | 1860 @@ -1148,68 +1612,85 @@ |
| 1879 size_t hItemCount = 0; | 1861 size_t hItemCount = 0; |
| 1880 size_t vItemCount = 0; | 1862 size_t vItemCount = 0; |
| 1881 uint32_t previousOff = offset; | 1863 uint32_t previousOff = offset; |
| 1882 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | 1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| 1883 + uint32_t numOverflow = 0; | 1865 + uint32_t numOverflow = 0; |
| 1884 + uint32_t valueSpace = 0; | 1866 + uint32_t valueSpace = 0; |
| 1885 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | 1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| 1886 const CompactTrieNode *node = getCompactNode(header, nodeIdx); | 1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
| 1887 - if (node->flagscount & kVerticalNode) { | 1869 - if (node->flagscount & kVerticalNode) { |
| 1888 + int itemCount; | 1870 + int itemCount; |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1974 | 1956 |
| 1975 // Convert one compact trie node into a ternary subtrie | 1957 // Convert one compact trie node into a ternary subtrie |
| 1976 static TernaryNode * | 1958 static TernaryNode * |
| 1977 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE
rrorCode &status ) { | 1959 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE
rrorCode &status ) { |
| 1978 - int nodeCount = (node->flagscount & kCountMask); | 1960 - int nodeCount = (node->flagscount & kCountMask); |
| 1979 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError
Code &status ) { | 1961 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError
Code &status ) { |
| 1980 + int nodeCount = getCount(node); | 1962 + int nodeCount = getCount(node); |
| 1981 if (nodeCount == 0 || U_FAILURE(status)) { | 1963 if (nodeCount == 0 || U_FAILURE(status)) { |
| 1982 // Failure, or terminal node | 1964 // Failure, or terminal node |
| 1983 return NULL; | 1965 return NULL; |
| 1984 @@ -1234,29 +1726,41 @@ | 1966 @@ -1234,29 +1715,41 @@ |
| 1985 previous = latest; | 1967 previous = latest; |
| 1986 } | 1968 } |
| 1987 if (latest != NULL) { | 1969 if (latest != NULL) { |
| 1988 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal)
; | 1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal)
; |
| 1989 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v
node)); | 1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v
node)); |
| 1990 if (equal->flagscount & kParentEndsWord) { | 1972 if (equal->flagscount & kParentEndsWord) { |
| 1991 - latest->flags |= kEndsWord; | 1973 - latest->flags |= kEndsWord; |
| 1992 + if(info->magic == COMPACT_TRIE_MAGIC_3){ | 1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| 1993 + latest->flags = getValue(equal); | 1975 + latest->flags = getValue(equal); |
| 1994 + } else { | 1976 + } else { |
| (...skipping 27 matching lines...) Expand all Loading... |
| 2022 + // because only kEqualOverflows flag should be checked in root's flagscount | 2004 + // because only kEqualOverflows flag should be checked in root's flagscount |
| 2023 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *
) | 2005 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *
) |
| 2024 + getCompactNode(fInfo, fInfo->root); | 2006 + getCompactNode(fInfo, fInfo->root); |
| 2025 + uint16_t nodeCount = hnode->flagscount & kRootCountMask; | 2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask; |
| 2026 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, | 2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, |
| 2027 + nodeCount, status); | 2009 + nodeCount, status); |
| 2028 + | 2010 + |
| 2029 if (U_FAILURE(status)) { | 2011 if (U_FAILURE(status)) { |
| 2030 delete root; // Clean up | 2012 delete root; // Clean up |
| 2031 delete result; | 2013 delete result; |
| 2032 @@ -1270,8 +1774,8 @@ | 2014 @@ -1270,8 +1763,8 @@ |
| 2033 | 2015 |
| 2034 U_CAPI int32_t U_EXPORT2 | 2016 U_CAPI int32_t U_EXPORT2 |
| 2035 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void
*outData, | 2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void
*outData, |
| 2036 - UErrorCode *status) { | 2018 - UErrorCode *status) { |
| 2037 - | 2019 - |
| 2038 + UErrorCode *status) { | 2020 + UErrorCode *status) { |
| 2039 + | 2021 + |
| 2040 if (status == NULL || U_FAILURE(*status)) { | 2022 if (status == NULL || U_FAILURE(*status)) { |
| 2041 return 0; | 2023 return 0; |
| 2042 } | 2024 } |
| 2043 @@ -1286,14 +1790,14 @@ | 2025 @@ -1286,14 +1779,14 @@ |
| 2044 // | 2026 // |
| 2045 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); | 2027 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
| 2046 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ | 2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
| 2047 - pInfo->dataFormat[1]==0x72 && | 2029 - pInfo->dataFormat[1]==0x72 && |
| 2048 - pInfo->dataFormat[2]==0x44 && | 2030 - pInfo->dataFormat[2]==0x44 && |
| 2049 - pInfo->dataFormat[3]==0x63 && | 2031 - pInfo->dataFormat[3]==0x63 && |
| 2050 - pInfo->formatVersion[0]==1 )) { | 2032 - pInfo->formatVersion[0]==1 )) { |
| 2051 + pInfo->dataFormat[1]==0x72 && | 2033 + pInfo->dataFormat[1]==0x72 && |
| 2052 + pInfo->dataFormat[2]==0x44 && | 2034 + pInfo->dataFormat[2]==0x44 && |
| 2053 + pInfo->dataFormat[3]==0x63 && | 2035 + pInfo->dataFormat[3]==0x63 && |
| 2054 + pInfo->formatVersion[0]==1 )) { | 2036 + pInfo->formatVersion[0]==1 )) { |
| 2055 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x
(format version %02x) is not recognized\n", | 2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x
(format version %02x) is not recognized\n", |
| 2056 - pInfo->dataFormat[0], pInfo->dataFormat[1], | 2038 - pInfo->dataFormat[0], pInfo->dataFormat[1], |
| 2057 - pInfo->dataFormat[2], pInfo->dataFormat[3], | 2039 - pInfo->dataFormat[2], pInfo->dataFormat[3], |
| 2058 - pInfo->formatVersion[0]); | 2040 - pInfo->formatVersion[0]); |
| 2059 + pInfo->dataFormat[0], pInfo->dataFormat[1], | 2041 + pInfo->dataFormat[0], pInfo->dataFormat[1], |
| 2060 + pInfo->dataFormat[2], pInfo->dataFormat[3], | 2042 + pInfo->dataFormat[2], pInfo->dataFormat[3], |
| 2061 + pInfo->formatVersion[0]); | 2043 + pInfo->formatVersion[0]); |
| 2062 *status=U_UNSUPPORTED_ERROR; | 2044 *status=U_UNSUPPORTED_ERROR; |
| 2063 return 0; | 2045 return 0; |
| 2064 } | 2046 } |
| 2065 @@ -1311,8 +1815,10 @@ | 2047 @@ -1311,8 +1804,10 @@ |
| 2066 // | 2048 // |
| 2067 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | 2049 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
| 2068 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; | 2050 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
| 2069 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 | 2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 |
| 2070 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) | 2052 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) |
| 2071 + uint32_t magic = ds->readUInt32(header->magic); | 2053 + uint32_t magic = ds->readUInt32(header->magic); |
| 2072 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic
!= COMPACT_TRIE_MAGIC_3 | 2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic
!= COMPACT_TRIE_MAGIC_3 |
| 2073 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeaderV1) | 2055 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeaderV1) |
| 2074 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeader)) | 2056 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeader)) |
| 2075 { | 2057 { |
| 2076 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"
); | 2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"
); |
| 2077 *status=U_UNSUPPORTED_ERROR; | 2059 *status=U_UNSUPPORTED_ERROR; |
| 2078 @@ -1333,10 +1839,10 @@ | 2060 @@ -1333,10 +1828,10 @@ |
| 2079 // | 2061 // |
| 2080 if (length < sizeWithUData) { | 2062 if (length < sizeWithUData) { |
| 2081 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", | 2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", |
| 2082 - totalSize); | 2064 - totalSize); |
| 2083 + totalSize); | 2065 + totalSize); |
| 2084 *status=U_INDEX_OUTOFBOUNDS_ERROR; | 2066 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
| 2085 return 0; | 2067 return 0; |
| 2086 - } | 2068 - } |
| 2087 + } | 2069 + } |
| 2088 | 2070 |
| 2089 // | 2071 // |
| 2090 // Swap the Data. Do the data itself first, then the CompactTrieHeader, be
cause | 2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, be
cause |
| 2091 @@ -1355,20 +1861,38 @@ | 2073 @@ -1355,20 +1850,38 @@ |
| 2092 } | 2074 } |
| 2093 | 2075 |
| 2094 // We need to loop through all the nodes in the offset table, and swap each
one. | 2076 // We need to loop through all the nodes in the offset table, and swap each
one. |
| 2095 - uint16_t nodeCount = ds->readUInt16(header->nodeCount); | 2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
| 2096 + uint32_t nodeCount, rootId; | 2078 + uint32_t nodeCount, rootId; |
| 2097 + if(header->magic == COMPACT_TRIE_MAGIC_1) { | 2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) { |
| 2098 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); | 2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); |
| 2099 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); | 2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); |
| 2100 + } else { | 2082 + } else { |
| 2101 + nodeCount = ds->readUInt32(header->nodeCount); | 2083 + nodeCount = ds->readUInt32(header->nodeCount); |
| (...skipping 24 matching lines...) Expand all Loading... |
| 2126 + overflow += 1; | 2108 + overflow += 1; |
| 2127 + } | 2109 + } |
| 2128 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), | 2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), |
| 2129 - itemCount*sizeof(uint16_t), | 2111 - itemCount*sizeof(uint16_t), |
| 2130 - outBytes+nodeOff+offsetof(CompactTrieVertic
alNode,chars), status); | 2112 - outBytes+nodeOff+offsetof(CompactTrieVertic
alNode,chars), status); |
| 2131 + (itemCount + overflow)*sizeof(uint16_t), | 2113 + (itemCount + overflow)*sizeof(uint16_t), |
| 2132 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars
), status); | 2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars
), status); |
| 2133 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac
tTrieVerticalNode,equal); | 2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac
tTrieVerticalNode,equal); |
| 2134 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo
de,equal)); | 2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo
de,equal)); |
| 2135 } | 2117 } |
| 2136 @@ -1381,26 +1905,62 @@ | 2118 @@ -1381,26 +1894,62 @@ |
| 2137 word = ds->readUInt16(inHNode->entries[j].equal); | 2119 word = ds->readUInt16(inHNode->entries[j].equal); |
| 2138 ds->writeUInt16(&outHNode->entries[j].equal, word); | 2120 ds->writeUInt16(&outHNode->entries[j].equal, word); |
| 2139 } | 2121 } |
| 2140 + | 2122 + |
| 2141 + // swap overflow/value information | 2123 + // swap overflow/value information |
| 2142 + if(flagscount & kEqualOverflows){ | 2124 + if(flagscount & kEqualOverflows){ |
| 2143 + overflow += (itemCount + 3) / 4; | 2125 + overflow += (itemCount + 3) / 4; |
| 2144 + } | 2126 + } |
| 2145 + | 2127 + |
| 2146 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla
gscount & kEndsParentWord) { | 2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla
gscount & kEndsParentWord) { |
| (...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2202 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff
, status); | 2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff
, status); |
| 2203 + | 2185 + |
| 2204 + //swap offsets | 2186 + //swap offsets |
| 2205 + ds->swapArray32(ds, inBytes+offsetPos, | 2187 + ds->swapArray32(ds, inBytes+offsetPos, |
| 2206 + sizeof(uint32_t)*(uint32_t)nodeCount, | 2188 + sizeof(uint32_t)*(uint32_t)nodeCount, |
| 2207 + outBytes+offsetPos, status); | 2189 + outBytes+offsetPos, status); |
| 2208 | 2190 |
| 2209 return sizeWithUData; | 2191 return sizeWithUData; |
| 2210 } | 2192 } |
| 2211 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 | 2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 |
| 2212 +++ source/common/triedict.h» 2009-07-27 13:01:17.723390000 -0700 | 2194 +++ source/common/triedict.h» 2011-01-21 14:12:45.496927000 -0800 |
| 2213 @@ -47,7 +47,6 @@ | 2195 @@ -47,7 +47,6 @@ |
| 2214 U_NAMESPACE_BEGIN | 2196 U_NAMESPACE_BEGIN |
| 2215 | 2197 |
| 2216 class StringEnumeration; | 2198 class StringEnumeration; |
| 2217 -struct CompactTrieHeader; | 2199 -struct CompactTrieHeader; |
| 2218 | 2200 |
| 2219 /******************************************************************* | 2201 /******************************************************************* |
| 2220 * TrieWordDictionary | 2202 * TrieWordDictionary |
| 2221 @@ -72,23 +71,29 @@ | 2203 @@ -72,23 +71,29 @@ |
| 2222 */ | 2204 */ |
| (...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2441 * | 2423 * |
| 2442 * @return The data for the compact dictionary, suitable for passing to the | 2424 * @return The data for the compact dictionary, suitable for passing to the |
| 2443 * constructor. | 2425 * constructor. |
| 2444 @@ -342,5 +386,5 @@ | 2426 @@ -342,5 +386,5 @@ |
| 2445 | 2427 |
| 2446 U_NAMESPACE_END | 2428 U_NAMESPACE_END |
| 2447 | 2429 |
| 2448 - /* TRIEDICT_H */ | 2430 - /* TRIEDICT_H */ |
| 2449 +/* TRIEDICT_H */ | 2431 +/* TRIEDICT_H */ |
| 2450 #endif | 2432 #endif |
| 2451 --- source/data/brkitr/brkfiles.mk» 2009-04-21 15:42:37.000000000 -0700 | 2433 --- source/data/Makefile.in» 2010-10-29 13:21:33.000000000 -0700 |
| 2452 +++ source/data/brkitr/brkfiles.mk» 2009-07-27 13:01:17.730379000 -0700 | 2434 +++ source/data/Makefile.in» 2011-01-26 16:24:24.856798000 -0800 |
| 2453 @@ -34,13 +34,12 @@ | 2435 @@ -509,8 +520,9 @@ |
| 2436 #################################################### CTD |
| 2437 # CTD FILES |
| 2454 | 2438 |
| 2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_
FILES) |
| 2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< |
| 2441 +# .ctd file now generated regardless of whether dictionary file exists |
| 2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) |
| 2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F
).txt |
| 2455 | 2444 |
| 2456 # List of compact trie dictionary files (ctd). | 2445 #################################################### CFU |
| 2457 -BRK_CTD_SOURCE = thaidict.txt | 2446 # CFU FILES |
| 2458 +BRK_CTD_SOURCE = thaidict.txt cjdict.txt | 2447 --- source/data/brkitr/root.txt»2010-07-28 17:18:28.000000000 -0700 |
| 2459 | 2448 +++ source/data/brkitr/root.txt»2011-01-21 14:12:45.653922000 -0800 |
| 2460 | |
| 2461 # List of break iterator files (brk). | |
| 2462 -BRK_SOURCE = word_POSIX.txt word_ja.txt sent_el.txt char_th.txt char.txt word.t
xt line.txt sent.txt title.txt | |
| 2463 +BRK_SOURCE = word_POSIX.txt sent_el.txt char_th.txt char.txt word.txt line.txt
sent.txt title.txt | |
| 2464 | |
| 2465 | |
| 2466 # Ordinary resources | |
| 2467 -BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt ja.txt th.txt | |
| 2468 - | |
| 2469 +BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt th.txt | |
| 2470 --- source/data/brkitr/root.txt»2009-06-24 14:06:38.000000000 -0700 | |
| 2471 +++ source/data/brkitr/root.txt»2009-07-27 13:01:17.733382000 -0700 | |
| 2472 @@ -17,5 +17,8 @@ | 2449 @@ -17,5 +17,8 @@ |
| 2473 } | 2450 } |
| 2474 dictionaries{ | 2451 dictionaries{ |
| 2475 Thai:process(dependency){"thaidict.ctd"} | 2452 Thai:process(dependency){"thaidict.ctd"} |
| 2476 + Hani:process(dependency){"cjdict.ctd"} | 2453 + Hani:process(dependency){"cjdict.ctd"} |
| 2477 + Hira:process(dependency){"cjdict.ctd"} | 2454 + Hira:process(dependency){"cjdict.ctd"} |
| 2478 + Kata:process(dependency){"cjdict.ctd"} | 2455 + Kata:process(dependency){"cjdict.ctd"} |
| 2479 } | 2456 } |
| 2480 } | 2457 } |
| 2481 --- source/data/brkitr/word.txt»2009-06-24 14:06:38.000000000 -0700 | 2458 --- source/data/xml/brkitr/root.xml» 2010-03-01 15:13:18.000000000 -0800 |
| 2482 +++ source/data/brkitr/word.txt»2010-08-27 16:24:25.969372000 -0700 | 2459 +++ source/data/xml/brkitr/root.xml» 2011-01-21 14:12:45.735922000 -0800 |
| 2483 @@ -29,29 +29,49 @@ | |
| 2484 $Newline = [\p{Word_Break = Newline}]; | |
| 2485 $Extend = [\p{Word_Break = Extend}]; | |
| 2486 $Format = [\p{Word_Break = Format}]; | |
| 2487 +$Hiragana = [:Hiragana:]; | |
| 2488 $Katakana = [\p{Word_Break = Katakana}]; | |
| 2489 +$Han = [:Han:]; | |
| 2490 $ALetter = [\p{Word_Break = ALetter}]; | |
| 2491 -$MidNumLet = [\p{Word_Break = MidNumLet}]; | |
| 2492 +# Remove two full stop characters from $MidNumLet and add them to $MidNum | |
| 2493 +# to break a hostname into its components at the cost of breaking | |
| 2494 +# 'e.g.' and 'i.e.' as well. | |
| 2495 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. | |
| 2496 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected | |
| 2497 +# while rules 6/7 are reverted to the old behavior we want. | |
| 2498 +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; | |
| 2499 $MidLetter = [\p{Word_Break = MidLetter}]; | |
| 2500 -$MidNum = [\p{Word_Break = MidNum}]; | |
| 2501 -$Numeric = [\p{Word_Break = Numeric}]; | |
| 2502 +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; | |
| 2503 +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth
digits | |
| 2504 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
| 2505 | |
| 2506 +# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'. | |
| 2507 +$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]]; | |
| 2508 +# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by | |
| 2509 +# the current rule 6/7. | |
| 2510 +$HebrewMidLet = [\u0022]; | |
| 2511 | |
| 2512 # Dictionary character set, for triggering language-based break engines. Curr
ently | |
| 2513 -# limited to LineBreak=Complex_Context. Note that this set only works in Unic
ode | |
| 2514 -# 5.0 or later as the definition of Complex_Context was corrected to include
all | |
| 2515 +# limited to LineBreak=Complex_Context and CJK. Note that this set only works | |
| 2516 +# in Unicode 5.0 or later as the definition of Complex_Context was corrected
to include all | |
| 2517 # characters requiring dictionary break. | |
| 2518 | |
| 2519 -$dictionary = [:LineBreak = Complex_Context:]; | |
| 2520 $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
| 2521 -$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default A
Letter does not | |
| 2522 - # include the dic
tionary characters. | |
| 2523 +$HangulSyllable = [\uac00-\ud7a3]; | |
| 2524 +$ComplexContext = [:LineBreak = Complex_Context:]; | |
| 2525 +$KanaKanji = [$Han $Hiragana $Katakana]; | |
| 2526 +$dictionaryCJK = [$KanaKanji $HangulSyllable]; | |
| 2527 +$dictionary = [$ComplexContext $dictionaryCJK]; | |
| 2528 + | |
| 2529 +# leave CJK scripts out of ALetterPlus | |
| 2530 +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; | |
| 2531 + | |
| 2532 | |
| 2533 # | |
| 2534 # Rules 4 Ignore Format and Extend characters, | |
| 2535 # except when they appear at the beginning of a region of text. | |
| 2536 # | |
| 2537 +# TODO: check if handling of katakana in dictionary makes rules incorrect/void. | |
| 2538 $KatakanaEx = $Katakana ($Extend | $Format)*; | |
| 2539 $ALetterEx = $ALetterPlus ($Extend | $Format)*; | |
| 2540 $MidNumLetEx = $MidNumLet ($Extend | $Format)*; | |
| 2541 @@ -59,8 +79,8 @@ | |
| 2542 $MidNumEx = $MidNum ($Extend | $Format)*; | |
| 2543 $NumericEx = $Numeric ($Extend | $Format)*; | |
| 2544 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | |
| 2545 +$HebrewLetEx = $HebrewLet ($Extend | $Format)*; | |
| 2546 | |
| 2547 -$Hiragana = [\p{script=Hiragana}]; | |
| 2548 $Ideographic = [\p{Ideographic}]; | |
| 2549 $HiraganaEx = $Hiragana ($Extend | $Format)*; | |
| 2550 $IdeographicEx = $Ideographic ($Extend | $Format)*; | |
| 2551 @@ -79,12 +99,14 @@ | |
| 2552 # begins with a group of Format chars, or with a "word" consisting of
a single | |
| 2553 # char that is not in any of the listed word break categories followed
by | |
| 2554 # format char(s). | |
| 2555 -[^$CR $LF $Newline]? ($Extend | $Format)+; | |
| 2556 + # format char(s), or is not a CJK dictionary character. | |
| 2557 +[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+; | |
| 2558 | |
| 2559 $NumericEx {100}; | |
| 2560 $ALetterEx {200}; | |
| 2561 -$KatakanaEx {300}; # note: these status values override those from rule
5 | |
| 2562 -$HiraganaEx {300}; # by virtual of being numerically larger. | |
| 2563 +$HangulSyllable {200}; | |
| 2564 +$KatakanaEx {400}; #originally 300 | |
| 2565 +$HiraganaEx {400}; #originally 300 | |
| 2566 $IdeographicEx {400}; # | |
| 2567 | |
| 2568 # | |
| 2569 @@ -96,6 +118,9 @@ | |
| 2570 # rule 6 and 7 | |
| 2571 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; | |
| 2572 | |
| 2573 +# Chrome addition | |
| 2574 +$HebrewLetEx $HebrewMidLet $HebrewLetEx {200}; | |
| 2575 + | |
| 2576 # rule 8 | |
| 2577 | |
| 2578 $NumericEx $NumericEx {100}; | |
| 2579 @@ -114,19 +139,25 @@ | |
| 2580 | |
| 2581 # rule 13 | |
| 2582 | |
| 2583 -$KatakanaEx $KatakanaEx {300}; | |
| 2584 +# To be consistent with '$KanaKanji $KanaKanji', changed | |
| 2585 +# from 300 to 400. | |
| 2586 +# See also TestRuleStatus in intltest/rbbiapts.cpp | |
| 2587 +$KatakanaEx $KatakanaEx {400}; | |
| 2588 | |
| 2589 # rule 13a/b | |
| 2590 | |
| 2591 $ALetterEx $ExtendNumLetEx {200}; # (13a) | |
| 2592 $NumericEx $ExtendNumLetEx {100}; # (13a) | |
| 2593 -$KatakanaEx $ExtendNumLetEx {300}; # (13a) | |
| 2594 +$KatakanaEx $ExtendNumLetEx {400}; # (13a) | |
| 2595 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | |
| 2596 | |
| 2597 $ExtendNumLetEx $ALetterEx {200}; # (13b) | |
| 2598 $ExtendNumLetEx $NumericEx {100}; # (13b) | |
| 2599 -$ExtendNumLetEx $KatakanaEx {300}; # (13b) | |
| 2600 - | |
| 2601 +$ExtendNumLetEx $KatakanaEx {400}; # (13b) | |
| 2602 + | |
| 2603 +# special handling for CJK characters: chain for later dictionary segmentation | |
| 2604 +$HangulSyllable $HangulSyllable {200}; | |
| 2605 +$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana foun
d | |
| 2606 | |
| 2607 | |
| 2608 ## ------------------------------------------------- | |
| 2609 @@ -139,13 +170,15 @@ | |
| 2610 $BackMidNumEx = ($Format | $Extend)* $MidNum; | |
| 2611 $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | |
| 2612 $BackKatakanaEx = ($Format | $Extend)* $Katakana; | |
| 2613 +$BackHiraganaEx = ($Extend | $Format)* $Hiragana; | |
| 2614 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; | |
| 2615 +$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet; | |
| 2616 | |
| 2617 # rule 3 | |
| 2618 $LF $CR; | |
| 2619 | |
| 2620 # rule 4 | |
| 2621 -($Format | $Extend)* [^$CR $LF $Newline]?; | |
| 2622 +($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?; | |
| 2623 | |
| 2624 # rule 5 | |
| 2625 | |
| 2626 @@ -155,6 +188,8 @@ | |
| 2627 | |
| 2628 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx; | |
| 2629 | |
| 2630 +# Chrome addition | |
| 2631 +$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx; | |
| 2632 | |
| 2633 # rule 8 | |
| 2634 | |
| 2635 @@ -181,6 +216,10 @@ | |
| 2636 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackE
xtendNumLetEx); | |
| 2637 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; | |
| 2638 | |
| 2639 +# special handling for CJK characters: chain for later dictionary segmentation | |
| 2640 +$HangulSyllable $HangulSyllable; | |
| 2641 +$KanaKanji $KanaKanji; #different rule status if both kanji and kana found | |
| 2642 + | |
| 2643 ## ------------------------------------------------- | |
| 2644 | |
| 2645 !!safe_reverse; | |
| 2646 --- source/data/xml/brkitr/root.xml» 2007-08-28 23:10:43.000000000 -0700 | |
| 2647 +++ source/data/xml/brkitr/root.xml» 2009-07-27 13:01:17.746367000 -0700 | |
| 2648 @@ -25,6 +25,9 @@ | 2460 @@ -25,6 +25,9 @@ |
| 2649 </icu:boundaries> | 2461 </icu:boundaries> |
| 2650 <icu:dictionaries> | 2462 <icu:dictionaries> |
| 2651 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> | 2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> |
| 2652 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> | 2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> |
| 2653 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> | 2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> |
| 2654 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> | 2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> |
| 2655 </icu:dictionaries> | 2467 </icu:dictionaries> |
| 2656 </icu:breakIteratorData> | 2468 </icu:breakIteratorData> |
| 2657 </special> | 2469 </special> |
| 2658 --- source/test/cintltst/creststn.c» 2009-06-26 09:49:55.000000000 -0700 | 2470 --- source/test/cintltst/creststn.c» 2010-10-28 10:44:02.000000000 -0700 |
| 2659 +++ source/test/cintltst/creststn.c» 2009-07-29 12:46:05.997405000 -0700 | 2471 +++ source/test/cintltst/creststn.c» 2011-01-21 14:12:44.995020000 -0800 |
| 2660 @@ -2181,21 +2181,21 @@ | 2472 @@ -2188,21 +2188,21 @@ |
| 2661 | 2473 |
| 2662 | 2474 |
| 2663 { | 2475 { |
| 2664 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); | 2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); |
| 2665 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); | 2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); |
| 2666 const UChar *got = NULL, *exp=NULL; | 2478 const UChar *got = NULL, *exp=NULL; |
| 2667 int32_t gotLen = 0, expLen=0; | 2479 int32_t gotLen = 0, expLen=0; |
| 2668 - ja = ures_getByKey(ja, "boundaries", ja, &status); | 2480 - ja = ures_getByKey(ja, "boundaries", ja, &status); |
| 2669 - exp = tres_getString(ja, -1, "word", &expLen, &status); | 2481 - exp = tres_getString(ja, -1, "word", &expLen, &status); |
| 2670 + th = ures_getByKey(th, "boundaries", th, &status); | 2482 + th = ures_getByKey(th, "boundaries", th, &status); |
| 2671 + exp = tres_getString(th, -1, "grapheme", &expLen, &status); | 2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status); |
| 2672 | 2484 |
| 2673 tb = ures_getByKey(aliasB, "boundaries", tb, &status); | 2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status); |
| 2674 - got = tres_getString(tb, -1, "word", &gotLen, &status); | 2486 - got = tres_getString(tb, -1, "word", &gotLen, &status); |
| 2675 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); | 2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); |
| 2676 | 2488 |
| 2677 if(U_FAILURE(status)) { | 2489 if(U_FAILURE(status)) { |
| 2678 log_err("%s trying to read str boundaries\n", u_errorName(statu
s)); | 2490 log_err("%s trying to read str boundaries\n", u_errorName(statu
s)); |
| 2679 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { | 2491 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { |
| 2680 log_err("Referencing alias didn't get the right data\n"); | 2492 log_err("Referencing alias didn't get the right data\n"); |
| 2681 } | 2493 } |
| 2682 - ures_close(ja); | 2494 - ures_close(ja); |
| 2683 + ures_close(th); | 2495 + ures_close(th); |
| 2684 status = U_ZERO_ERROR; | 2496 status = U_ZERO_ERROR; |
| 2685 } | 2497 } |
| 2686 /* simple alias */ | 2498 /* simple alias */ |
| 2687 @@ -3024,4 +3024,3 @@ | 2499 --- source/test/intltest/rbbiapts.cpp» 2010-07-12 11:03:29.000000000 -0700 |
| 2688 } | 2500 +++ source/test/intltest/rbbiapts.cpp» 2011-01-21 14:12:45.033014000 -0800 |
| 2689 | |
| 2690 } | |
| 2691 - | |
| 2692 --- source/test/intltest/rbbiapts.cpp» 2009-06-26 09:49:55.000000000 -0700 | |
| 2693 +++ source/test/intltest/rbbiapts.cpp» 2009-07-28 13:56:30.208042000 -0700 | |
| 2694 @@ -156,9 +156,13 @@ | 2501 @@ -156,9 +156,13 @@ |
| 2695 if(*a!=*b){ | 2502 if(*a!=*b){ |
| 2696 errln("Failed: boilerplate method operator!= does not return correct re
sults"); | 2503 errln("Failed: boilerplate method operator!= does not return correct re
sults"); |
| 2697 } | 2504 } |
| 2698 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); | 2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); |
| 2699 - if(a && c){ | 2506 - if(a && c){ |
| 2700 - if(*c==*a){ | 2507 - if(*c==*a){ |
| 2701 + // Japanese word break iteratos is identical to root with | 2508 + // Japanese word break iteratos is identical to root with |
| 2702 + // a dictionary-based break iterator, but Thai character break iterator | 2509 + // a dictionary-based break iterator, but Thai character break iterator |
| 2703 + // is still different from Root. | 2510 + // is still different from Root. |
| 2704 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat
us); | 2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat
us); |
| 2705 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat
us); | 2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat
us); |
| 2706 + if(c && d){ | 2513 + if(c && d){ |
| 2707 + if(*c==*d){ | 2514 + if(*c==*d){ |
| 2708 errln("Failed: boilerplate method opertator== does not return corre
ct results"); | 2515 errln("Failed: boilerplate method opertator== does not return corre
ct results"); |
| 2709 } | 2516 } |
| 2710 }else{ | 2517 }else{ |
| 2711 @@ -167,6 +171,7 @@ | 2518 @@ -167,6 +171,7 @@ |
| 2712 delete a; | 2519 delete a; |
| 2713 delete b; | 2520 delete b; |
| 2714 delete c; | 2521 delete c; |
| 2715 + delete d; | 2522 + delete d; |
| 2716 } | 2523 } |
| 2717 | 2524 |
| 2718 void RBBIAPITest::TestgetRules() | 2525 void RBBIAPITest::TestgetRules() |
| 2719 @@ -643,21 +648,21 @@ | 2526 @@ -635,21 +640,21 @@ |
| 2720 // | 2527 // |
| 2721 void RBBIAPITest::TestRuleStatus() { | 2528 void RBBIAPITest::TestRuleStatus() { |
| 2722 UChar str[30]; | 2529 UChar str[30]; |
| 2723 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094
", | 2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094
", |
| 2724 - // 012345678901234567 8 9 0 1 2 3 4 5
6 | 2531 - // 012345678901234567 8 9 0 1 2 3 4 5
6 |
| 2725 - // Ideographic Katakana Hiragana | 2532 - // Ideographic Katakana Hiragana |
| 2726 + //no longer test Han or hiragana breaking here: ruleStatusVec would return
nothing | 2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return
nothing |
| 2727 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO | 2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO |
| 2728 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", | 2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", |
| 2729 + // 012345678901234567 8 9 0 | 2536 + // 012345678901234567 8 9 0 |
| 2730 + // Katakana | 2537 + // Katakana |
| 2731 str, 30); | 2538 str, 30); |
| 2732 UnicodeString testString1(str); | 2539 UnicodeString testString1(str); |
| 2733 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; | 2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; |
| 2734 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; | 2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; |
| 2735 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE,
UBRK_WORD_LETTER, | 2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE,
UBRK_WORD_LETTER, |
| 2736 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, | 2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, |
| 2737 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, | 2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, |
| 2738 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA,
UBRK_WORD_KANA}; | 2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA,
UBRK_WORD_KANA}; |
| 2739 + UBRK_WORD_IDEO, UBRK_WORD_NONE}; | 2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE}; |
| 2740 | 2547 |
| 2741 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO
RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, | 2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO
RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, |
| 2742 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO
RD_NONE_LIMIT, | 2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO
RD_NONE_LIMIT, |
| 2743 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO
RD_NONE_LIMIT, | 2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO
RD_NONE_LIMIT, |
| 2744 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO
RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; | 2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO
RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; |
| 2745 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; | 2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; |
| 2746 | 2553 |
| 2747 UErrorCode status=U_ZERO_ERROR; | 2554 UErrorCode status=U_ZERO_ERROR; |
| 2748 | 2555 |
| 2749 @@ -896,9 +901,11 @@ | 2556 @@ -888,9 +893,11 @@ |
| 2750 | 2557 |
| 2751 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD
, status); | 2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD
, status); |
| 2752 { | 2559 { |
| 2753 +#if 0 // With dictionary-based word breaking, ja_word is identical to root. | 2560 +#if 0 // With dictionary-based word breaking, ja_word is identical to root. |
| 2754 if (ja_word && *ja_word == *root_word) { | 2561 if (ja_word && *ja_word == *root_word) { |
| 2755 errln("japan not different from root"); | 2562 errln("japan not different from root"); |
| 2756 } | 2563 } |
| 2757 +#endif | 2564 +#endif |
| 2758 } | 2565 } |
| 2759 | 2566 |
| 2760 { | 2567 { |
| 2761 --- source/test/intltest/rbbitst.cpp» 2009-06-26 09:49:55.000000000 -0700 | 2568 --- source/test/intltest/rbbitst.cpp» 2010-10-08 18:23:28.000000000 -0700 |
| 2762 +++ source/test/intltest/rbbitst.cpp» 2009-07-28 15:35:18.933226000 -0700 | 2569 +++ source/test/intltest/rbbitst.cpp» 2011-01-21 14:12:45.180030000 -0800 |
| 2763 @@ -33,6 +33,8 @@ | 2570 @@ -35,6 +35,8 @@ |
| 2764 #include <string.h> | 2571 #include <string.h> |
| 2765 #include <stdio.h> | 2572 #include <stdio.h> |
| 2766 #include <stdlib.h> | 2573 #include <stdlib.h> |
| 2767 +#include "unicode/numfmt.h" | 2574 +#include "unicode/numfmt.h" |
| 2768 +#include "unicode/uscript.h" | 2575 +#include "unicode/uscript.h" |
| 2769 | 2576 |
| 2770 #define TEST_ASSERT(x) {if (!(x)) { \ | 2577 #define TEST_ASSERT(x) {if (!(x)) { \ |
| 2771 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} | 2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} |
| 2772 @@ -108,6 +110,8 @@ | 2579 @@ -138,11 +140,13 @@ |
| 2773 if (exec) TestThaiBreaks(); break; | 2580 if (exec) TestThaiBreaks(); break; |
| 2774 case 23: name = "TestTailoredBreaks"; | 2581 case 23: name = "TestTailoredBreaks"; |
| 2775 if (exec) TestTailoredBreaks(); break; | 2582 if (exec) TestTailoredBreaks(); break; |
| 2776 + case 24: name = "TestTrieDictWithValue"; | 2583 + case 24: name = "TestTrieDictWithValue"; |
| 2777 + if(exec) TestTrieDictWithValue(); break; | 2584 + if(exec) TestTrieDictWithValue(); break; |
| 2778 | 2585 #else |
| 2779 default: name = ""; break; //needed to end loop | 2586 - case 21: case 22: case 23: name = "skip"; |
| 2780 } | 2587 + case 21: case 22: case 23: case 24: name = "skip"; |
| 2781 @@ -570,6 +574,8 @@ | 2588 break; |
| 2589 #endif |
| 2590 - case 24: name = "TestDictRules"; |
| 2591 + case 25: name = "TestDictRules"; |
| 2592 if (exec) TestDictRules(); break; |
| 2593 case 25: name = "TestBug5532"; |
| 2594 if (exec) TestBug5532(); break; |
| 2595 @@ -607,6 +611,8 @@ |
| 2782 | 2596 |
| 2783 | 2597 |
| 2784 void RBBITest::TestJapaneseWordBreak() { | 2598 void RBBITest::TestJapaneseWordBreak() { |
| 2785 +// TODO: Rewrite this test for dictionary-based word breaking. | 2599 +// TODO: Rewrite this test for dictionary-based word breaking. |
| 2786 +#if 0 | 2600 +#if 0 |
| 2787 UErrorCode status = U_ZERO_ERROR; | 2601 UErrorCode status = U_ZERO_ERROR; |
| 2788 BITestData japaneseWordSelection(status); | 2602 BITestData japaneseWordSelection(status); |
| 2789 | 2603 |
| 2790 @@ -591,6 +597,7 @@ | 2604 @@ -628,6 +634,7 @@ |
| 2791 | 2605 |
| 2792 generalIteratorTest(*e, japaneseWordSelection); | 2606 generalIteratorTest(*e, japaneseWordSelection); |
| 2793 delete e; | 2607 delete e; |
| 2794 +#endif | 2608 +#endif |
| 2795 } | 2609 } |
| 2796 | 2610 |
| 2797 void RBBITest::TestTrieDict() { | 2611 void RBBITest::TestTrieDict() { |
| 2798 @@ -812,6 +819,372 @@ | 2612 @@ -849,6 +856,372 @@ |
| 2799 delete compact2; | 2613 delete compact2; |
| 2800 } | 2614 } |
| 2801 | 2615 |
| 2802 +/*TODO: delete later*/ | 2616 +/*TODO: delete later*/ |
| 2803 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ | 2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ |
| 2804 + UErrorCode status = U_ZERO_ERROR; | 2618 + UErrorCode status = U_ZERO_ERROR; |
| 2805 + FILE *outfile = fopen(filename,"w"); | 2619 + FILE *outfile = fopen(filename,"w"); |
| 2806 + UConverter *cvt = ucnv_open("UTF-8", &status); | 2620 + UConverter *cvt = ucnv_open("UTF-8", &status); |
| 2807 + if (U_FAILURE(status)) | 2621 + if (U_FAILURE(status)) |
| 2808 + return; | 2622 + return; |
| (...skipping 352 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3161 + delete cloneEnum; | 2975 + delete cloneEnum; |
| 3162 + delete compact2; | 2976 + delete compact2; |
| 3163 + utext_close(originalText); | 2977 + utext_close(originalText); |
| 3164 + utext_close(cloneText); | 2978 + utext_close(cloneText); |
| 3165 + | 2979 + |
| 3166 + | 2980 + |
| 3167 +} | 2981 +} |
| 3168 | 2982 |
| 3169 //---------------------------------------------------------------------------- | 2983 //---------------------------------------------------------------------------- |
| 3170 // | 2984 // |
| 3171 @@ -1832,8 +2205,15 @@ | 2985 @@ -1870,8 +2243,15 @@ |
| 3172 // Don't break in runs of hiragana or runs of ideograph, where the latter inclu
des \u3005 \u3007 \u303B (cldrbug #2009). | 2986 // Don't break in runs of hiragana or runs of ideograph, where the latter inclu
des \u3005 \u3007 \u303B (cldrbug #2009). |
| 3173 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u
3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" | 2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u
3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" |
| 3174 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u
3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; | 2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u
3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; |
| 3175 +#if 0 | 2989 +#if 0 |
| 3176 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1
7, 18, 20, 21, 24, 27, 28 }; | 2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1
7, 18, 20, 21, 24, 27, 28 }; |
| 3177 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1
7, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; | 2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1
7, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; |
| 3178 +#endif | 2992 +#endif |
| 3179 +// There's no separate Japanese word break iterator. Root is the same as Japane
se. | 2993 +// There's no separate Japanese word break iterator. Root is the same as Japane
se. |
| 3180 +// Our dictionary-based iterator has to be tweaked to better handle U+3005, | 2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005, |
| 3181 +// U+3007, U+303B and some other cases. | 2995 +// U+3007, U+303B and some other cases. |
| 3182 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; | 2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
| 3183 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; | 2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
| 3184 | 2998 |
| 3185 // UBreakIteratorType UBRK_SENTENCE, Locale "el" | 2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el" |
| 3186 // Add break after Greek question mark (cldrbug #2069). | 3000 // Add break after Greek question mark (cldrbug #2069). |
| 3187 @@ -2580,6 +2960,8 @@ | 3001 @@ -2672,6 +3052,8 @@ |
| 3188 UnicodeSet *fNewlineSet; | 3002 UnicodeSet *fNewlineSet; |
| 3189 UnicodeSet *fKatakanaSet; | 3003 UnicodeSet *fKatakanaSet; |
| 3190 UnicodeSet *fALetterSet; | 3004 UnicodeSet *fALetterSet; |
| 3191 + // TODO(jungshik): Do we still need this change? | 3005 + // TODO(jungshik): Do we still need this change? |
| 3192 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt | 3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt |
| 3193 UnicodeSet *fMidNumLetSet; | 3007 UnicodeSet *fMidNumLetSet; |
| 3194 UnicodeSet *fMidLetterSet; | 3008 UnicodeSet *fMidLetterSet; |
| 3195 UnicodeSet *fMidNumSet; | 3009 UnicodeSet *fMidNumSet; |
| 3196 @@ -2588,6 +2970,7 @@ | 3010 @@ -2680,6 +3062,7 @@ |
| 3197 UnicodeSet *fOtherSet; | 3011 UnicodeSet *fOtherSet; |
| 3198 UnicodeSet *fExtendSet; | 3012 UnicodeSet *fExtendSet; |
| 3199 UnicodeSet *fExtendNumLetSet; | 3013 UnicodeSet *fExtendNumLetSet; |
| 3200 + UnicodeSet *fDictionaryCjkSet; | 3014 + UnicodeSet *fDictionaryCjkSet; |
| 3201 | 3015 |
| 3202 RegexMatcher *fMatcher; | 3016 RegexMatcher *fMatcher; |
| 3203 | 3017 |
| 3204 @@ -2604,12 +2987,24 @@ | 3018 @@ -2696,12 +3079,24 @@ |
| 3205 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
CR}]"), status); | 3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
CR}]"), status); |
| 3206 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
LF}]"), status); | 3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
LF}]"), status); |
| 3207 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Newline}]"), status); | 3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Newline}]"), status); |
| 3208 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); | 3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); |
| 3209 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]",
status); | 3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]",
status); |
| 3210 + // Exclude Hangul syllables from ALetterSet during testing. | 3024 + // Exclude Hangul syllables from ALetterSet during testing. |
| 3211 + // Leave CJK dictionary characters out from the monkey tests! | 3025 + // Leave CJK dictionary characters out from the monkey tests! |
| 3212 +#if 0 | 3026 +#if 0 |
| 3213 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" | 3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" |
| 3214 + "[\\p{Line_Break = Complex_Context}" | 3028 + "[\\p{Line_Break = Complex_Context}" |
| 3215 + "-\\p{Grapheme_Cluster_Break = Extend}" | 3029 + "-\\p{Grapheme_Cluster_Break = Extend}" |
| 3216 + "-\\p{Grapheme_Cluster_Break = Control}" | 3030 + "-\\p{Grapheme_Cluster_Break = Control}" |
| 3217 + "]]", | 3031 + "]]", |
| 3218 + status); | 3032 + status); |
| 3219 +#endif | 3033 +#endif |
| 3220 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); | 3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); |
| 3221 + fALetterSet->removeAll(*fDictionaryCjkSet); | 3035 + fALetterSet->removeAll(*fDictionaryCjkSet); |
| 3222 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Katakana}]"), status); | 3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Katakana}]"), status); |
| 3223 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNumLet}]"), status); | 3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNumLet}]"), status); |
| 3224 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidLetter}]"), status); | 3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidLetter}]"), status); |
| 3225 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNum}]"), status); | 3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNum}]"), status); |
| 3226 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}]"), status); | 3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}]"), status); |
| 3227 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}[\\uff10-\\uff19]]"), status); | 3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}[\\uff10-\\uff19]]"), status); |
| 3228 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Format}]"), status); | 3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Format}]"), status); |
| 3229 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ExtendNumLet}]"), status); | 3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ExtendNumLet}]"), status); |
| 3230 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Extend}]"), status); | 3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Extend}]"), status); |
| 3231 @@ -2633,13 +3028,14 @@ | 3045 @@ -2725,13 +3120,14 @@ |
| 3232 fOtherSet->removeAll(*fFormatSet); | 3046 fOtherSet->removeAll(*fFormatSet); |
| 3233 fOtherSet->removeAll(*fExtendSet); | 3047 fOtherSet->removeAll(*fExtendSet); |
| 3234 // Inhibit dictionary characters from being tested at all. | 3048 // Inhibit dictionary characters from being tested at all. |
| 3235 + fOtherSet->removeAll(*fDictionaryCjkSet); | 3049 + fOtherSet->removeAll(*fDictionaryCjkSet); |
| 3236 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com
plex_Context}]"), status)); | 3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com
plex_Context}]"), status)); |
| 3237 | 3051 |
| 3238 fSets->addElement(fCRSet, status); | 3052 fSets->addElement(fCRSet, status); |
| 3239 fSets->addElement(fLFSet, status); | 3053 fSets->addElement(fLFSet, status); |
| 3240 fSets->addElement(fNewlineSet, status); | 3054 fSets->addElement(fNewlineSet, status); |
| 3241 fSets->addElement(fALetterSet, status); | 3055 fSets->addElement(fALetterSet, status); |
| 3242 - fSets->addElement(fKatakanaSet, status); | 3056 - fSets->addElement(fKatakanaSet, status); |
| 3243 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test ka
takana | 3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test ka
takana |
| 3244 fSets->addElement(fMidLetterSet, status); | 3058 fSets->addElement(fMidLetterSet, status); |
| 3245 fSets->addElement(fMidNumLetSet, status); | 3059 fSets->addElement(fMidNumLetSet, status); |
| 3246 fSets->addElement(fMidNumSet, status); | 3060 fSets->addElement(fMidNumSet, status); |
| 3247 @@ -3871,6 +4267,7 @@ | 3061 @@ -3978,6 +4374,7 @@ |
| 3248 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { | 3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { |
| 3249 count --; | 3063 count --; |
| 3250 if (forward[count] != i) { | 3064 if (forward[count] != i) { |
| 3251 + printStringBreaks(ustr, expected, expectedcount); | 3065 + printStringBreaks(ustr, expected, expectedcount); |
| 3252 test->errln("happy break test previous() failed: expected %d but go
t %d", | 3066 test->errln("happy break test previous() failed: expected %d but go
t %d", |
| 3253 forward[count], i); | 3067 forward[count], i); |
| 3254 break; | 3068 break; |
| 3255 @@ -3904,23 +4301,25 @@ | 3069 @@ -4011,23 +4408,25 @@ |
| 3256 UErrorCode status = U_ZERO_ERROR; | 3070 UErrorCode status = U_ZERO_ERROR; |
| 3257 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat
us); | 3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat
us); |
| 3258 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); | 3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); |
| 3259 + // Replaced any C+J characters in a row with a random sequence of character
s | 3073 + // Replaced any C+J characters in a row with a random sequence of character
s |
| 3260 + // of the same length to make our C+J segmentation not get in the way. | 3074 + // of the same length to make our C+J segmentation not get in the way. |
| 3261 static const char *strlist[] = | 3075 static const char *strlist[] = |
| 3262 { | 3076 { |
| 3263 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", | 3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", |
| 3264 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004
0\\u003b", | 3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004
0\\u003b", |
| 3265 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004
0\\u003b", | 3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004
0\\u003b", |
| 3266 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000
e0061\\u003a", | 3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000
e0061\\u003a", |
| 3267 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", | 3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", |
| 3268 - "\\u90ca\\u3588\\u009c\\u0953\\u194b", | 3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b", |
| 3269 + "\\uac00\\u3588\\u009c\\u0953\\u194b", | 3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b", |
| 3270 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", | 3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", |
| 3271 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e
", | 3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e
", |
| 3272 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", | 3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", |
| 3273 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", | 3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", |
| 3274 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", | 3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", |
| 3275 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", | 3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", |
| 3276 "\\u2027\\U000e0067\\u0a47\\u00b7", | 3090 "\\u2027\\U000e0067\\u0a47\\u00b7", |
| 3277 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", | 3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", |
| 3278 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", | 3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", |
| 3279 "\\u0589\\U000e006e\\u0a42\\U000104a5", | 3093 "\\u0589\\U000e006e\\u0a42\\U000104a5", |
| 3280 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", | 3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", |
| 3281 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", | 3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", |
| 3282 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", | 3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", |
| 3283 "\\u0027\\u11af\\U000e0057\\u0602", | 3097 "\\u0027\\u11af\\U000e0057\\u0602", |
| 3284 "\\U0001d7f2\\U000e007\\u0004\\u0589", | 3098 "\\U0001d7f2\\U000e007\\u0004\\u0589", |
| 3285 @@ -3932,7 +4331,7 @@ | 3099 @@ -4039,7 +4438,7 @@ |
| 3286 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | 3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
| 3287 "\\u0233\\U000e0020\\u0a69\\u0d6a", | 3101 "\\u0233\\U000e0020\\u0a69\\u0d6a", |
| 3288 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | 3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
| 3289 - "\\u58f4\\U000e0049\\u20e7\\u2027", | 3103 - "\\u58f4\\U000e0049\\u20e7\\u2027", |
| 3290 + "\\u18f4\\U000e0049\\u20e7\\u2027", | 3104 + "\\u18f4\\U000e0049\\u20e7\\u2027", |
| 3291 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | 3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| 3292 "\\ua183\\u102d\\u0bec\\u003a", | 3106 "\\ua183\\u102d\\u0bec\\u003a", |
| 3293 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | 3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
| 3294 @@ -3942,7 +4341,7 @@ | 3108 @@ -4049,7 +4448,7 @@ |
| 3295 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", | 3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", |
| 3296 "\\u003a\\u0664\\u00b7\\u1fba", | 3110 "\\u003a\\u0664\\u00b7\\u1fba", |
| 3297 "\\u003b\\u0027\\u00b7\\u47a3", | 3111 "\\u003b\\u0027\\u00b7\\u47a3", |
| 3298 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", | 3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", |
| 3299 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", | 3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", |
| 3300 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\
u0e51\\u1058\\U000e0058\\u00b7\\u0673", | 3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\
u0e51\\u1058\\U000e0058\\u00b7\\u0673", |
| 3301 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", | 3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", |
| 3302 }; | 3116 }; |
| 3303 @@ -3997,12 +4396,12 @@ | 3117 @@ -4104,12 +4503,12 @@ |
| 3304 "\\U0001d7f2\\U000e007d\\u0004\\u0589", | 3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589", |
| 3305 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", | 3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", |
| 3306 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", | 3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", |
| 3307 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", | 3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", |
| 3308 + "\\U000e0065\\u302c\\u09ee\\U000e0068", | 3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068", |
| 3309 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | 3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
| 3310 "\\u0233\\U000e0020\\u0a69\\u0d6a", | 3124 "\\u0233\\U000e0020\\u0a69\\u0d6a", |
| 3311 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | 3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
| 3312 "\\u58f4\\U000e0049\\u20e7\\u2027", | 3126 "\\u58f4\\U000e0049\\u20e7\\u2027", |
| 3313 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | 3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| 3314 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | 3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| 3315 "\\ua183\\u102d\\u0bec\\u003a", | 3129 "\\ua183\\u102d\\u0bec\\u003a", |
| 3316 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | 3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
| 3317 "\\u003a\\u0e57\\u0fad\\u002e", | 3131 "\\u003a\\u0e57\\u0fad\\u002e", |
| 3318 --- source/test/intltest/rbbitst.h» 2009-04-22 00:53:50.000000000 -0700 | 3132 --- source/test/intltest/rbbitst.h» 2010-07-22 17:15:37.000000000 -0700 |
| 3319 +++ source/test/intltest/rbbitst.h» 2009-07-27 13:01:17.767342000 -0700 | 3133 +++ source/test/intltest/rbbitst.h» 2011-01-21 14:12:45.152007000 -0800 |
| 3320 @@ -70,6 +70,7 @@ | 3134 @@ -70,6 +70,7 @@ |
| 3321 void TestBug5775(); | 3135 void TestBug5775(); |
| 3322 void TestThaiBreaks(); | 3136 void TestThaiBreaks(); |
| 3323 void TestTailoredBreaks(); | 3137 void TestTailoredBreaks(); |
| 3324 + void TestTrieDictWithValue(); | 3138 + void TestTrieDictWithValue(); |
| 3139 void TestDictRules(); |
| 3140 void TestBug5532(); |
| 3325 | 3141 |
| 3326 void TestDebug(); | 3142 --- source/test/testdata/rbbitst.txt» 2010-07-28 17:18:28.000000000 -0700 |
| 3327 | 3143 +++ source/test/testdata/rbbitst.txt» 2011-01-21 14:12:45.221011000 -0800 |
| 3328 --- source/test/testdata/rbbitst.txt» 2009-06-24 14:06:38.000000000 -0700 | 3144 @@ -161,7 +161,23 @@ |
| 3329 +++ source/test/testdata/rbbitst.txt» 2009-07-29 12:56:31.483710000 -0700 | |
| 3330 @@ -162,7 +162,23 @@ | |
| 3331 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> | 3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> |
| 3332 | 3146 |
| 3333 # Hiragana & Katakana stay together, but separates from each other and Latin. | 3147 # Hiragana & Katakana stay together, but separates from each other and Latin. |
| 3334 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI
NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}
\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<300>def<200>#•</data> | 3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI
NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}
\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<300>def<200>#•</data> |
| 3335 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent | 3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent |
| 3336 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN
ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A
}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN
A LETTER N}<300>def<200>#•</data> | 3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN
ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A
}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN
A LETTER N}<300>def<200>#•</data> |
| 3337 + | 3151 + |
| 3338 +# test normalization/dictionary handling of halfwidth katakana: same dictionary
phrase in fullwidth and halfwidth | 3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary
phrase in fullwidth and halfwidth |
| 3339 +<data>•芽キャベツ<400>芽キャベツ<400></data> | 3153 +<data>•芽キャベツ<400>芽キャベツ<400></data> |
| 3340 + | 3154 + |
| 3341 +# more Japanese tests | 3155 +# more Japanese tests |
| 3342 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana | 3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana |
| 3343 +# and the Katakana block are not treated correctly. Enable this later. | 3157 +# and the Katakana block are not treated correctly. Enable this later. |
| 3344 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400
>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | 3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400
>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> |
| 3345 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で
も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | 3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で
も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> |
| 3346 + | 3160 + |
| 3347 +# Testing of word boundary for dictionary word containing both kanji and kana | 3161 +# Testing of word boundary for dictionary word containing both kanji and kana |
| 3348 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> | 3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> |
| 3349 + | 3163 + |
| 3350 +# Testing of Chinese segmentation (taken from a Chinese news article) | 3164 +# Testing of Chinese segmentation (taken from a Chinese news article) |
| 3351 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400
>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>
的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>
属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d
ata> | 3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400
>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>
的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>
属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d
ata> |
| 3352 | 3166 |
| 3353 # Words with interior formatting characters | 3167 # Words with interior formatting characters |
| 3354 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat
a> | 3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat
a> |
| 3355 @@ -170,6 +186,8 @@ | 3169 @@ -169,6 +185,8 @@ |
| 3356 # to test for bug #4097779 | 3170 # to test for bug #4097779 |
| 3357 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> | 3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> |
| 3358 | 3172 |
| 3359 +# fullwidth numeric, midletter characters etc should be treated like their half
width counterparts | 3173 +# fullwidth numeric, midletter characters etc should be treated like their half
width counterparts |
| 3360 +<data>•ISN'T<200> •19<100>日<400></data> | 3174 +<data>•ISN'T<200> •19<100>日<400></data> |
| 3361 | 3175 |
| 3362 # to test for bug #4098467 | 3176 # to test for bug #4098467 |
| 3363 # What follows is a string of Korean characters (I found it in the Yellow
Pages | 3177 # What follows is a string of Korean characters (I found it in the Yellow
Pages |
| 3364 @@ -179,9 +197,15 @@ | 3178 @@ -178,9 +196,15 @@ |
| 3365 # precomposed syllables... | 3179 # precomposed syllables... |
| 3366 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua
d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1
10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1
1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> | 3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua
d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1
10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1
1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> |
| 3367 | 3181 |
| 3368 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200>
•</data> | 3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200>
•</data> |
| 3369 +# more Korean tests (Jamo not tested here, not counted as dictionary characters
) | 3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters
) |
| 3370 +# Disable them now because we don't include a Korean dictionary. | 3184 +# Disable them now because we don't include a Korean dictionary. |
| 3371 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2
00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> | 3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2
00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> |
| 3372 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d
d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200
> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> | 3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d
d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200
> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> |
| 3373 + | 3187 + |
| 3374 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da
ta> | 3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da
ta> |
| 3375 + | 3189 + |
| 3376 +<data>•\u06c9<200>\uc799<200>\ufffa•</data> | 3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data> |
| 3377 | 3191 |
| 3378 -<data>•\u06c9\uc799\ufffa<200></data> | 3192 -<data>•\u06c9\uc799\ufffa<200></data> |
| 3379 | 3193 |
| 3380 # | 3194 # |
| 3381 # Try some words from other scripts. | 3195 # Try some words from other scripts. |
| 3382 @@ -492,8 +516,7 @@ | 3196 @@ -491,8 +515,7 @@ |
| 3383 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c
•</data> | 3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c
•</data> |
| 3384 | 3198 |
| 3385 # conjoining jamo... | 3199 # conjoining jamo... |
| 3386 -# TODO: rules update needed | 3200 -# TODO: rules update needed |
| 3387 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\
u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\
u1100\u116d•\u1112\u116c•</data> | 3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\
u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\
u1100\u116d•\u1112\u116c•</data> |
| 3388 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u
11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1
100\u116d•\u1112\u116c•</data> | 3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u
11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1
100\u116d•\u1112\u116c•</data> |
| 3389 | 3203 |
| 3390 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd | 3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd |
| 3391 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> | 3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> |
| 3392 --- source/test/testdata/testaliases.txt» 2009-06-24 14:06:38.000000000 -0
700 | 3206 --- source/test/testdata/testaliases.txt» 2009-11-12 13:53:42.000000000 -0
800 |
| 3393 +++ source/test/testdata/testaliases.txt» 2009-07-28 17:07:26.251120000 -0
700 | 3207 +++ source/test/testdata/testaliases.txt» 2011-01-21 14:12:45.204005000 -0
800 |
| 3394 @@ -28,7 +28,7 @@ | 3208 @@ -28,7 +28,7 @@ |
| 3395 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } | 3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } |
| 3396 | 3210 |
| 3397 // aliasing using position | 3211 // aliasing using position |
| 3398 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso
urce in another bundle | 3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso
urce in another bundle |
| 3399 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso
urce in another bundle | 3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso
urce in another bundle |
| 3400 | 3214 |
| 3401 // aliasing arrays | 3215 // aliasing arrays |
| 3402 zoneTests { | 3216 zoneTests { |
| 3403 --- source/tools/genctd/genctd.cpp» 2006-09-04 09:28:24.000000000 -0700 | 3217 --- source/tools/genctd/genctd.cpp» 2009-08-04 14:09:17.000000000 -0700 |
| 3404 +++ source/tools/genctd/genctd.cpp» 2009-07-27 13:01:17.776335000 -0700 | 3218 +++ source/tools/genctd/genctd.cpp» 2011-01-21 14:12:45.564923000 -0800 |
| 3405 @@ -1,6 +1,6 @@ | 3219 @@ -1,6 +1,6 @@ |
| 3406 /* | 3220 /* |
| 3407 ********************************************************************** | 3221 ********************************************************************** |
| 3408 -* Copyright (C) 2002-2006, International Business Machines | 3222 -* Copyright (C) 2002-2009, International Business Machines |
| 3409 +* Copyright (C) 2002-2006,2008, International Business Machines | 3223 +* Copyright (C) 2002-2010, International Business Machines |
| 3410 * Corporation and others. All Rights Reserved. | 3224 * Corporation and others. All Rights Reserved. |
| 3411 ********************************************************************** | 3225 ********************************************************************** |
| 3412 * | 3226 * |
| 3413 @@ -34,12 +34,15 @@ | 3227 @@ -34,12 +34,15 @@ |
| 3414 #include "unicode/udata.h" | 3228 #include "unicode/udata.h" |
| 3415 #include "unicode/putil.h" | 3229 #include "unicode/putil.h" |
| 3416 | 3230 |
| 3417 +//#include "unicode/ustdio.h" | 3231 +//#include "unicode/ustdio.h" |
| 3418 + | 3232 + |
| 3419 #include "uoptions.h" | 3233 #include "uoptions.h" |
| 3420 #include "unewdata.h" | 3234 #include "unewdata.h" |
| 3421 #include "ucmndata.h" | 3235 #include "ucmndata.h" |
| 3422 #include "rbbidata.h" | 3236 #include "rbbidata.h" |
| 3423 #include "triedict.h" | 3237 #include "triedict.h" |
| 3424 #include "cmemory.h" | 3238 #include "cmemory.h" |
| 3425 +#include "uassert.h" | 3239 +#include "uassert.h" |
| 3426 | 3240 |
| 3427 #include <stdio.h> | 3241 #include <stdio.h> |
| 3428 #include <stdlib.h> | 3242 #include <stdlib.h> |
| 3429 @@ -198,147 +201,191 @@ | 3243 @@ -199,147 +202,191 @@ |
| 3430 long wordFileSize; | 3244 long wordFileSize; |
| 3431 FILE *file; | 3245 FILE *file; |
| 3432 char *wordBufferC; | 3246 char *wordBufferC; |
| 3433 - | 3247 - |
| 3434 + MutableTrieDictionary *mtd = NULL; | 3248 + MutableTrieDictionary *mtd = NULL; |
| 3435 + | 3249 + |
| 3436 file = fopen(wordFileName, "rb"); | 3250 file = fopen(wordFileName, "rb"); |
| 3437 - if( file == 0 ) { | 3251 - if( file == 0 ) { |
| 3438 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); | 3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); |
| 3439 - exit(-1); | 3253 - exit(-1); |
| (...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3741 + // Get rid of the Unicode text buffer | 3555 + // Get rid of the Unicode text buffer |
| 3742 + delete[] wordSourceU; | 3556 + delete[] wordSourceU; |
| 3743 } | 3557 } |
| 3744 | 3558 |
| 3745 - // Get rid of the Unicode text buffer | 3559 - // Get rid of the Unicode text buffer |
| 3746 - delete[] wordSourceU; | 3560 - delete[] wordSourceU; |
| 3747 - | 3561 - |
| 3748 // Now, create a CompactTrieDictionary from the mutable dictionary | 3562 // Now, create a CompactTrieDictionary from the mutable dictionary |
| 3749 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); | 3563 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); |
| 3750 if (U_FAILURE(status)) { | 3564 if (U_FAILURE(status)) { |
| 3751 @@ -392,4 +439,3 @@ | 3565 @@ -393,4 +440,3 @@ |
| 3752 | 3566 |
| 3753 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 3754 } | 3568 } |
| 3755 - | 3569 - |
| 3756 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 | 3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 |
| 3757 +++ source/tools/genctd/Makefile.in» 2009-07-27 13:01:17.782326000 -0700 | 3571 +++ source/tools/genctd/Makefile.in» 2011-01-21 14:12:45.555920000 -0800 |
| 3758 @@ -23,13 +23,13 @@ | 3572 @@ -23,13 +23,13 @@ |
| 3759 ## Extra files to remove for 'make clean' | 3573 ## Extra files to remove for 'make clean' |
| 3760 CLEANFILES = *~ $(DEPS) $(MAN_FILES) | 3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES) |
| 3761 | 3575 |
| 3762 -## Target information | 3576 -## Target information |
| 3763 +## Target informationcd | 3577 +## Target informationcd |
| 3764 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) | 3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) |
| 3765 | 3579 |
| 3766 ifneq ($(top_builddir),$(top_srcdir)) | 3580 ifneq ($(top_builddir),$(top_srcdir)) |
| 3767 CPPFLAGS += -I$(top_builddir)/common | 3581 CPPFLAGS += -I$(top_builddir)/common |
| 3768 endif | 3582 endif |
| 3769 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil | 3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil |
| 3770 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n | 3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n |
| 3771 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) | 3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) |
| 3772 | 3586 |
| 3773 OBJECTS = genctd.o | 3587 OBJECTS = genctd.o |
| 3774 --- source/data/Makefile.in 2009-05-20 23:03:54.000000000 -0700 | |
| 3775 +++ source/data/Makefile.in 2009-10-21 15:43:18.235201000 -0700 | |
| 3776 @@ -452,8 +452,9 @@ | |
| 3777 #################################################### CTD | |
| 3778 # CTD FILES | |
| 3779 | |
| 3780 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_
FILES) | |
| 3781 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< | |
| 3782 +# .ctd file now generated regardless of whether dictionary file exists | |
| 3783 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) | |
| 3784 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F
).txt | |
| 3785 | |
| 3786 #################################################### CFU | |
| 3787 # CFU FILES | |
| OLD | NEW |