Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(179)

Side by Side Diff: icu46/patches/segmentation.patch

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | icu46/source/common/brkeng.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 --- source/common/brkeng.cpp» 2007-09-11 20:53:13.000000000 -0700 1 --- source/common/brkeng.cpp» 2009-11-11 07:47:22.000000000 -0800
2 +++ source/common/brkeng.cpp» 2009-07-29 12:57:49.973382000 -0700 2 +++ source/common/brkeng.cpp» 2011-01-21 14:12:45.479922000 -0800
3 @@ -24,6 +24,7 @@ 3 @@ -226,6 +226,30 @@
4 #include "umutex.h"
5 #include "uresimp.h"
6 #include "ubrkimpl.h"
7 +#include <stdio.h>
8
9 U_NAMESPACE_BEGIN
10
11 @@ -226,6 +227,30 @@
12 case USCRIPT_THAI: 4 case USCRIPT_THAI:
13 engine = new ThaiBreakEngine(dict, status); 5 engine = new ThaiBreakEngine(dict, status);
14 break; 6 break;
15 + 7 +
16 + case USCRIPT_HANGUL: 8 + case USCRIPT_HANGUL:
17 + engine = new CjkBreakEngine(dict, kKorean, status); 9 + engine = new CjkBreakEngine(dict, kKorean, status);
18 + break; 10 + break;
19 + 11 +
20 + // use same BreakEngine and dictionary for both Chinese and Japanese 12 + // use same BreakEngine and dictionary for both Chinese and Japanese
21 + case USCRIPT_HIRAGANA: 13 + case USCRIPT_HIRAGANA:
(...skipping 10 matching lines...) Expand all
32 + { 24 + {
33 + UBlockCode block = ublock_getCode(code); 25 + UBlockCode block = ublock_getCode(code);
34 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 26 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
35 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); 27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);
36 + break; 28 + break;
37 + } 29 + }
38 +#endif 30 +#endif
39 default: 31 default:
40 break; 32 break;
41 } 33 }
42 @@ -281,6 +306,13 @@ 34 @@ -281,6 +305,13 @@
43 dict = NULL; 35 dict = NULL;
44 } 36 }
45 return dict; 37 return dict;
46 + } else if (dictfname != NULL){ 38 + } else if (dictfname != NULL){
47 + //create dummy dict if dictionary filename not valid 39 + //create dummy dict if dictionary filename not valid
48 + UChar c = 0x0020; 40 + UChar c = 0x0020;
49 + status = U_ZERO_ERROR; 41 + status = U_ZERO_ERROR;
50 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE) ; 42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE) ;
51 + mtd->addWord(&c, 1, status, 1); 43 + mtd->addWord(&c, 1, status, 1);
52 + return new CompactTrieDictionary(*mtd, status); 44 + return new CompactTrieDictionary(*mtd, status);
53 } 45 }
54 return NULL; 46 return NULL;
55 } 47 }
56 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700
57 +++ source/common/dictbe.cpp» 2009-11-11 12:58:40.199829000 -0800 49 +++ source/common/dictbe.cpp» 2011-01-21 14:12:45.468928000 -0800
58 @@ -16,6 +16,11 @@ 50 @@ -16,6 +16,9 @@
59 #include "unicode/ubrk.h" 51 #include "unicode/ubrk.h"
60 #include "uvector.h" 52 #include "uvector.h"
61 #include "triedict.h" 53 #include "triedict.h"
62 +#include "uassert.h" 54 +#include "uassert.h"
63 +#include "unicode/normlzr.h" 55 +#include "unicode/normlzr.h"
64 +#include "cmemory.h" 56 +#include "cmemory.h"
65 +
66 +#include <stdio.h>
67 57
68 U_NAMESPACE_BEGIN 58 U_NAMESPACE_BEGIN
69 59
70 @@ -422,6 +427,294 @@ 60 @@ -422,6 +425,294 @@
71 return wordsFound; 61 return wordsFound;
72 } 62 }
73 63
74 +/* 64 +/*
75 + ****************************************************************** 65 + ******************************************************************
76 + * CjkBreakEngine 66 + * CjkBreakEngine
77 + */ 67 + */
78 +static const uint32_t kuint32max = 0xFFFFFFFF; 68 +static const uint32_t kuint32max = 0xFFFFFFFF;
79 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu ageType type, UErrorCode &status) 69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu ageType type, UErrorCode &status)
80 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ 70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
(...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after
356 + } 346 + }
357 + 347 +
358 + utext_close(&normalizedText); 348 + utext_close(&normalizedText);
359 + return numBreaks; 349 + return numBreaks;
360 +} 350 +}
361 + 351 +
362 U_NAMESPACE_END 352 U_NAMESPACE_END
363 353
364 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
365 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700
366 +++ source/common/dictbe.h» 2009-07-27 13:01:17.704415000 -0700 356 +++ source/common/dictbe.h» 2011-01-21 14:12:45.492920000 -0800
367 @@ -1,8 +1,8 @@ 357 @@ -1,8 +1,8 @@
368 /** 358 /**
369 - ****************************************************************************** * 359 - ****************************************************************************** *
370 - * Copyright (C) 2006, International Business Machines Corporation and others. * 360 - * Copyright (C) 2006, International Business Machines Corporation and others. *
371 - * All Rights Reserved. * 361 - * All Rights Reserved. *
372 - ****************************************************************************** * 362 - ****************************************************************************** *
373 + ****************************************************************************** **** 363 + ****************************************************************************** ****
374 + * Copyright (C) 2006,2007, International Business Machines Corporation and others. 364 + * Copyright (C) 2006-2010, International Business Machines Corporation and others.
375 + * All Rights Reserved. 365 + * All Rights Reserved.
376 + ****************************************************************************** **** 366 + ****************************************************************************** ****
377 */ 367 */
378 368
379 #ifndef DICTBE_H 369 #ifndef DICTBE_H
380 @@ -65,37 +65,37 @@ 370 @@ -65,31 +65,31 @@
381 */ 371 */
382 virtual ~DictionaryBreakEngine(); 372 virtual ~DictionaryBreakEngine();
383 373
384 - /** 374 - /**
385 - * <p>Indicate whether this engine handles a particular character for 375 - * <p>Indicate whether this engine handles a particular character for
386 - * a particular kind of break.</p> 376 - * a particular kind of break.</p>
387 - * 377 - *
388 - * @param c A character which begins a run that the engine might handle 378 - * @param c A character which begins a run that the engine might handle
389 - * @param breakType The type of text break which the caller wants to determine 379 - * @param breakType The type of text break which the caller wants to determine
390 - * @return TRUE if this engine handles the particular character and break 380 - * @return TRUE if this engine handles the particular character and break
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
423 + * that starts from the first (or last) character in the range. 413 + * that starts from the first (or last) character in the range.
424 + * @param startPos The start of the run within the supplied text. 414 + * @param startPos The start of the run within the supplied text.
425 + * @param endPos The end of the run within the supplied text. 415 + * @param endPos The end of the run within the supplied text.
426 + * @param reverse Whether the caller is looking for breaks in a reverse 416 + * @param reverse Whether the caller is looking for breaks in a reverse
427 + * direction. 417 + * direction.
428 + * @param breakType The type of break desired, or -1. 418 + * @param breakType The type of break desired, or -1.
429 + * @param foundBreaks An allocated C array of the breaks found, if any 419 + * @param foundBreaks An allocated C array of the breaks found, if any
430 + * @return The number of breaks found. 420 + * @return The number of breaks found.
431 + */ 421 + */
432 virtual int32_t findBreaks( UText *text, 422 virtual int32_t findBreaks( UText *text,
433 - int32_t startPos, 423 int32_t startPos,
434 - int32_t endPos, 424 int32_t endPos,
435 - UBool reverse,
436 - int32_t breakType,
437 - UStack &foundBreaks ) const;
438 + int32_t startPos,
439 + int32_t endPos,
440 + UBool reverse,
441 + int32_t breakType,
442 + UStack &foundBreaks ) const;
443
444 protected:
445
446 @@ -114,7 +114,7 @@ 425 @@ -114,7 +114,7 @@
447 // virtual void setBreakTypes( uint32_t breakTypes ); 426 // virtual void setBreakTypes( uint32_t breakTypes );
448 427
449 /** 428 /**
450 - * <p>Divide up a range of known dictionary characters.</p> 429 - * <p>Divide up a range of known dictionary characters.</p>
451 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 430 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
452 * 431 *
453 * @param text A UText representing the text 432 * @param text A UText representing the text
454 * @param rangeStart The start of the range of dictionary characters 433 * @param rangeStart The start of the range of dictionary characters
455 @@ -171,7 +171,7 @@ 434 @@ -171,7 +171,7 @@
456 435
457 protected: 436 protected:
458 /** 437 /**
459 - * <p>Divide up a range of known dictionary characters.</p> 438 - * <p>Divide up a range of known dictionary characters.</p>
460 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 439 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
461 * 440 *
462 * @param text A UText representing the text 441 * @param text A UText representing the text
463 * @param rangeStart The start of the range of dictionary characters 442 * @param rangeStart The start of the range of dictionary characters
464 @@ -180,12 +180,72 @@ 443 @@ -186,6 +186,66 @@
465 * @return The number of breaks found
466 */
467 virtual int32_t divideUpDictionaryRange( UText *text,
468 - int32_t rangeStart,
469 - int32_t rangeEnd,
470 - UStack &foundBreaks ) const;
471 + int32_t rangeStart,
472 + int32_t rangeEnd,
473 + UStack &foundBreaks ) const;
474 444
475 }; 445 };
476 446
477 +/******************************************************************* 447 +/*******************************************************************
478 + * CjkBreakEngine 448 + * CjkBreakEngine
479 + */ 449 + */
480 + 450 +
481 +//indicates language/script that the CjkBreakEngine will handle 451 +//indicates language/script that the CjkBreakEngine will handle
482 +enum LanguageType { 452 +enum LanguageType {
483 + kKorean, 453 + kKorean,
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
530 + */ 500 + */
531 + virtual int32_t divideUpDictionaryRange( UText *text, 501 + virtual int32_t divideUpDictionaryRange( UText *text,
532 + int32_t rangeStart, 502 + int32_t rangeStart,
533 + int32_t rangeEnd, 503 + int32_t rangeEnd,
534 + UStack &foundBreaks ) const; 504 + UStack &foundBreaks ) const;
535 + 505 +
536 +}; 506 +};
537 507
538 U_NAMESPACE_END 508 U_NAMESPACE_END
539 509
540 --- source/common/rbbi.cpp» 2008-09-24 22:48:27.000000000 -0700 510 --- source/common/rbbi.cpp» 2010-07-22 17:15:37.000000000 -0700
541 +++ source/common/rbbi.cpp» 2009-07-27 13:01:17.710416000 -0700 511 +++ source/common/rbbi.cpp» 2011-01-21 14:12:45.457938000 -0800
542 @@ -29,6 +29,7 @@ 512 @@ -1555,10 +1555,12 @@
543
544 #include "uassert.h"
545 #include "uvector.h"
546 +#include <stdio.h>
547
548 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
549 #if U_LOCAL_SERVICE_HOOK
550 @@ -1552,10 +1553,14 @@
551 int32_t endPos, 513 int32_t endPos,
552 UBool reverse) { 514 UBool reverse) {
553 // Reset the old break cache first. 515 // Reset the old break cache first.
554 - uint32_t dictionaryCount = fDictionaryCharCount; 516 - uint32_t dictionaryCount = fDictionaryCharCount;
555 +// uint32_t dictionaryCount = fDictionaryCharCount;
556 reset(); 517 reset();
557 518
558 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 519 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
559 + // note: code segment below assumes that dictionary chars are in the 520 + // note: code segment below assumes that dictionary chars are in the
560 + // startPos-endPos range 521 + // startPos-endPos range
561 + // value returned should be next character in sequence 522 + // value returned should be next character in sequence
562 +// if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
563 + if ((endPos - startPos) <= 1) { 523 + if ((endPos - startPos) <= 1) {
564 return (reverse ? startPos : endPos); 524 return (reverse ? startPos : endPos);
565 } 525 }
566 526
567 @@ -1684,7 +1689,7 @@ 527 @@ -1711,7 +1713,7 @@
568 // proposed break by one of the breaks we found. Use following() and 528 // proposed break by one of the breaks we found. Use following() and
569 // preceding() to do the work. They should never recurse in this case. 529 // preceding() to do the work. They should never recurse in this case.
570 if (reverse) { 530 if (reverse) {
571 - return preceding(endPos - 1); 531 - return preceding(endPos - 1);
572 + return preceding(endPos); 532 + return preceding(endPos);
573 } 533 }
574 else { 534 else {
575 return following(startPos); 535 return following(startPos);
576 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800
577 +++ source/common/triedict.cpp» 2009-07-27 13:01:17.718409000 -0700 537 +++ source/common/triedict.cpp» 2011-01-21 14:12:45.271006000 -0800
578 @@ -20,6 +20,7 @@ 538 @@ -20,6 +20,7 @@
579 #include "uvector.h" 539 #include "uvector.h"
580 #include "uvectr32.h" 540 #include "uvectr32.h"
581 #include "uarrsort.h" 541 #include "uarrsort.h"
582 +#include "hash.h" 542 +#include "hash.h"
583 543
584 //#define DEBUG_TRIE_DICT 1 544 //#define DEBUG_TRIE_DICT 1
585 545
586 @@ -27,6 +28,11 @@ 546 @@ -27,6 +28,11 @@
587 #include <sys/times.h> 547 #include <sys/times.h>
(...skipping 18 matching lines...) Expand all
606 + 566 +
607 // Node structure for the ternary, uncompressed trie 567 // Node structure for the ternary, uncompressed trie
608 struct TernaryNode : public UMemory { 568 struct TernaryNode : public UMemory {
609 UChar ch; // UTF-16 code unit 569 UChar ch; // UTF-16 code unit
610 @@ -77,7 +88,8 @@ 570 @@ -77,7 +88,8 @@
611 delete high; 571 delete high;
612 } 572 }
613 573
614 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { 574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
615 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, 575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
616 + UBool containsValue /* = FALSE */ ) { 576 + UBool containsValue /* = FALSE */ ) {
617 // Start the trie off with something. Having the root node already present 577 // Start the trie off with something. Having the root node already present
618 // cuts a special case out of the search/insertion functions. 578 // cuts a special case out of the search/insertion functions.
619 // Making it a median character cuts the worse case for searches from 579 // Making it a median character cuts the worse case for searches from
620 @@ -91,14 +103,19 @@ 580 @@ -91,14 +103,19 @@
621 if (U_SUCCESS(status) && fIter == NULL) { 581 if (U_SUCCESS(status) && fIter == NULL) {
622 status = U_MEMORY_ALLOCATION_ERROR; 582 status = U_MEMORY_ALLOCATION_ERROR;
623 } 583 }
624 + 584 +
625 + fValued = containsValue; 585 + fValued = containsValue;
626 } 586 }
627 587
628 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { 588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
629 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, 589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
630 + UBool containsValue /* = false */ ) { 590 + UBool containsValue /* = false */ ) {
631 fTrie = NULL; 591 fTrie = NULL;
632 fIter = utext_openUChars(NULL, NULL, 0, &status); 592 fIter = utext_openUChars(NULL, NULL, 0, &status);
633 if (U_SUCCESS(status) && fIter == NULL) { 593 if (U_SUCCESS(status) && fIter == NULL) {
634 status = U_MEMORY_ALLOCATION_ERROR; 594 status = U_MEMORY_ALLOCATION_ERROR;
635 } 595 }
636 + 596 +
637 + fValued = containsValue; 597 + fValued = containsValue;
638 } 598 }
639 599
640 MutableTrieDictionary::~MutableTrieDictionary() { 600 MutableTrieDictionary::~MutableTrieDictionary() {
641 @@ -113,7 +130,8 @@ 601 @@ -108,12 +125,13 @@
642 int &count, 602
643 int limit, 603 int32_t
644 TernaryNode *&parent, 604 MutableTrieDictionary::search( UText *text,
605 - int32_t maxLength,
606 - int32_t *lengths,
607 - int &count,
608 - int limit,
609 - TernaryNode *&parent,
645 - UBool &pMatched ) const { 610 - UBool &pMatched ) const {
646 + UBool &pMatched, 611 + int32_t maxLength,
647 + uint16_t *values /*=NULL*/) const { 612 + int32_t *lengths,
613 + int &count,
614 + int limit,
615 + TernaryNode *&parent,
616 + UBool &pMatched,
617 + uint16_t *values /*=NULL*/) const {
648 // TODO: current implementation works in UTF-16 space 618 // TODO: current implementation works in UTF-16 space
649 const TernaryNode *up = NULL; 619 const TernaryNode *up = NULL;
650 const TernaryNode *p = fTrie; 620 const TernaryNode *p = fTrie;
651 @@ -121,6 +139,10 @@ 621 @@ -121,6 +139,10 @@
652 pMatched = TRUE; 622 pMatched = TRUE;
653 int i; 623 int i;
654 624
655 + if (!fValued) { 625 + if (!fValued) {
656 + values = NULL; 626 + values = NULL;
657 + } 627 + }
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
693 int count; 663 int count;
694 @@ -177,7 +204,7 @@ 664 @@ -177,7 +204,7 @@
695 matched = search(fIter, length, NULL, count, 0, parent, pMatched); 665 matched = search(fIter, length, NULL, count, 0, parent, pMatched);
696 666
697 while (matched++ < length) { 667 while (matched++ < length) {
698 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? 668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?
699 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? 669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
700 U_ASSERT(uc != U_SENTINEL); 670 U_ASSERT(uc != U_SENTINEL);
701 TernaryNode *newNode = new TernaryNode(uc); 671 TernaryNode *newNode = new TernaryNode(uc);
702 if (newNode == NULL) { 672 if (newNode == NULL) {
703 @@ -199,7 +226,11 @@ 673 @@ -199,30 +226,23 @@
704 parent = newNode; 674 parent = newNode;
705 } 675 }
706 676
707 - parent->flags |= kEndsWord; 677 - parent->flags |= kEndsWord;
678 -}
679 -
680 -#if 0
681 -void
682 -MutableTrieDictionary::addWords( UEnumeration *words,
683 - UErrorCode &status ) {
684 - int32_t length;
685 - const UChar *word;
686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
687 - addWord(word, length, status);
708 + if(fValued && value > 0){ 688 + if(fValued && value > 0){
709 + parent->flags = value; 689 + parent->flags = value;
710 + } else { 690 + } else {
711 + parent->flags |= kEndsWord; 691 + parent->flags |= kEndsWord;
712 + } 692 }
713 } 693 }
694 -#endif
714 695
715 #if 0 696 int32_t
716 @@ -219,10 +250,11 @@ 697 MutableTrieDictionary::matches( UText *text,
717 int32_t maxLength, 698 int32_t maxLength,
718 int32_t *lengths, 699 int32_t *lengths,
719 int &count, 700 int &count,
720 - int limit ) const { 701 - int limit ) const {
721 + int limit, 702 + int limit,
722 + uint16_t *values /*=NULL*/) const { 703 + uint16_t *values /*=NULL*/) const {
723 TernaryNode *parent; 704 TernaryNode *parent;
724 UBool pMatched; 705 UBool pMatched;
725 - return search(text, maxLength, lengths, count, limit, parent, pMatched); 706 - return search(text, maxLength, lengths, count, limit, parent, pMatched);
726 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val ues); 707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val ues);
727 } 708 }
728 709
729 // Implementation of iteration for MutableTrieDictionary 710 // Implementation of iteration for MutableTrieDictionary
730 @@ -277,7 +309,7 @@ 711 @@ -277,7 +297,7 @@
731 break; 712 break;
732 } 713 }
733 case kEqual: 714 case kEqual:
734 - emit = (node->flags & kEndsWord) != 0; 715 - emit = (node->flags & kEndsWord) != 0;
735 + emit = node->flags > 0; 716 + emit = node->flags > 0;
736 equal = (node->equal != NULL); 717 equal = (node->equal != NULL);
737 // If this node should be part of the next emitted string, append 718 // If this node should be part of the next emitted string, append
738 // the UChar to the string, and make sure we pop it when we come 719 // the UChar to the string, and make sure we pop it when we come
739 @@ -299,7 +331,7 @@ 720 @@ -299,7 +319,7 @@
740 } 721 }
741 case kGreaterThan: 722 case kGreaterThan:
742 // If this node's character is in the string, remove it. 723 // If this node's character is in the string, remove it.
743 - if (node->equal != NULL || (node->flags & kEndsWord)) { 724 - if (node->equal != NULL || (node->flags & kEndsWord)) {
744 + if (node->equal != NULL || node->flags > 0) { 725 + if (node->equal != NULL || node->flags > 0) {
745 unistr.truncate(unistr.length()-1); 726 unistr.truncate(unistr.length()-1);
746 } 727 }
747 if (node->high != NULL) { 728 if (node->high != NULL) {
748 @@ -354,12 +386,74 @@ 729 @@ -354,12 +374,75 @@
749 * CompactTrieDictionary 730 * CompactTrieDictionary
750 */ 731 */
751 732
752 +//TODO if time permits: minimise size of trie with logprobs by storing values 733 +//TODO further optimization:
734 +// minimise size of trie with logprobs by storing values
753 +// for terminal nodes directly in offsets[] 735 +// for terminal nodes directly in offsets[]
754 +// --> calculating from next offset *might* be simpler, but would have to add 736 +// --> calculating from next offset *might* be simpler, but would have to add
755 +// one last offset for logprob of last node 737 +// one last offset for logprob of last node
756 +// --> if calculate from current offset, need to factor in possible overflow 738 +// --> if calculate from current offset, need to factor in possible overflow
757 +// as well. 739 +// as well.
758 +// idea: store in offset, set first bit to indicate logprob storage-->won't 740 +// idea: store in offset, set first bit to indicate logprob storage-->won't
759 +// have to access additional node 741 +// have to access additional node
760 + 742 +
761 +// {'Dic', 1}, version 1: uses old header, no values 743 +// {'Dic', 1}, version 1: uses old header, no values
762 +#define COMPACT_TRIE_MAGIC_1 0x44696301 744 +#define COMPACT_TRIE_MAGIC_1 0x44696301
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
814 + offsets = &(header->offsets[0]); 796 + offsets = &(header->offsets[0]);
815 + address = (uint8_t *)header; 797 + address = (uint8_t *)header;
816 + } 798 + }
817 + } 799 + }
818 + } 800 + }
819 + 801 +
820 + ~CompactTrieInfo(){} 802 + ~CompactTrieInfo(){}
821 }; 803 };
822 804
823 // Note that to avoid platform-specific alignment issues, all members of the node 805 // Note that to avoid platform-specific alignment issues, all members of the node
824 @@ -375,10 +469,14 @@ 806 @@ -375,10 +458,14 @@
825 enum CompactTrieNodeFlags { 807 enum CompactTrieNodeFlags {
826 kVerticalNode = 0x1000, // This is a vertical node 808 kVerticalNode = 0x1000, // This is a vertical node
827 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word 809 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
828 - kReservedFlag1 = 0x4000, 810 - kReservedFlag1 = 0x4000,
829 - kReservedFlag2 = 0x8000, 811 - kReservedFlag2 = 0x8000,
830 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR eservedFlag1 812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR eservedFlag1
831 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved Flag2 813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved Flag2
832 kCountMask = 0x0FFF, // The count portion of flagscount 814 kCountMask = 0x0FFF, // The count portion of flagscount
833 - kFlagMask = 0xF000 // The flags portion of flagscount 815 - kFlagMask = 0xF000 // The flags portion of flagscount
834 + kFlagMask = 0xF000, // The flags portion of flagscount 816 + kFlagMask = 0xF000, // The flags portion of flagscount
835 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r oot node 817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r oot node
836 + 818 +
837 + //offset flags: 819 + //offset flags:
838 + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node 820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node
839 }; 821 };
840 822
841 // The two node types are distinguished by the kVerticalNode flag. 823 // The two node types are distinguished by the kVerticalNode flag.
842 @@ -402,63 +500,177 @@ 824 @@ -402,63 +489,177 @@
843 uint16_t chars[1]; // Code units 825 uint16_t chars[1]; // Code units
844 }; 826 };
845 827
846 -// {'Dic', 1}, version 1 828 -// {'Dic', 1}, version 1
847 -#define COMPACT_TRIE_MAGIC_1 0x44696301 829 -#define COMPACT_TRIE_MAGIC_1 0x44696301
848 - 830 -
849 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, 831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
850 UErrorCode &status ) 832 UErrorCode &status )
851 : fUData(dataObj) 833 : fUData(dataObj)
852 { 834 {
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
907 uint32_t 889 uint32_t
908 CompactTrieDictionary::dataSize() const { 890 CompactTrieDictionary::dataSize() const {
909 - return fData->size; 891 - return fData->size;
910 + return fInfo->size; 892 + return fInfo->size;
911 } 893 }
912 894
913 const void * 895 const void *
914 CompactTrieDictionary::data() const { 896 CompactTrieDictionary::data() const {
915 - return fData; 897 - return fData;
916 + return fInfo->address; 898 + return fInfo->address;
917 } 899 +}
918 900 +
919 -// This function finds the address of a node for us, given its node ID
920 +//This function finds the address of a node for us, given its node ID 901 +//This function finds the address of a node for us, given its node ID
921 static inline const CompactTrieNode * 902 +static inline const CompactTrieNode *
922 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {
923 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[ node]);
924 +getCompactNode(const CompactTrieInfo *info, uint32_t node) { 903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {
925 + if(node < info->root-1) { 904 + if(node < info->root-1) {
926 + return (const CompactTrieNode *)(&info->offsets[node]); 905 + return (const CompactTrieNode *)(&info->offsets[node]);
927 + } else { 906 + } else {
928 + return (const CompactTrieNode *)(info->address + info->offsets[node]); 907 + return (const CompactTrieNode *)(info->address + info->offsets[node]);
929 + } 908 + }
930 +} 909 }
931 + 910
911 -// This function finds the address of a node for us, given its node ID
932 +//this version of getCompactNode is currently only used in compactMutableTrieDictionary() 912 +//this version of getCompactNode is currently only used in compactMutableTrieDictionary()
933 +static inline const CompactTrieNode * 913 static inline const CompactTrieNode *
914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {
915 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[ node]);
934 +getCompactNode(const CompactTrieHeader *header, uint32_t node) { 916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {
935 + if(node < header->root-1) { 917 + if(node < header->root-1) {
936 + return (const CompactTrieNode *)(&header->offsets[node]); 918 + return (const CompactTrieNode *)(&header->offsets[node]);
937 + } else { 919 + } else {
938 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs ets[node]); 920 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs ets[node]);
939 + } 921 + }
940 +} 922 +}
941 + 923 +
942 + 924 +
943 +/** 925 +/**
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after
1030 + } 1012 + }
1031 + else { 1013 + else {
1032 + low = middle+1; 1014 + low = middle+1;
1033 + } 1015 + }
1034 + } 1016 + }
1035 + 1017 +
1036 + return -1; 1018 + return -1;
1037 } 1019 }
1038 1020
1039 int32_t 1021 int32_t
1040 @@ -466,17 +678,38 @@ 1022 @@ -466,17 +667,38 @@
1041 int32_t maxLength, 1023 int32_t maxLength,
1042 int32_t *lengths, 1024 int32_t *lengths,
1043 int &count, 1025 int &count,
1044 - int limit ) const { 1026 - int limit ) const {
1045 + int limit, 1027 + int limit,
1046 + uint16_t *values /*= NULL*/) const { 1028 + uint16_t *values /*= NULL*/) const {
1047 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) 1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
1048 + values = NULL; 1030 + values = NULL;
1049 + 1031 +
1050 // TODO: current implementation works in UTF-16 space 1032 // TODO: current implementation works in UTF-16 space
(...skipping 20 matching lines...) Expand all
1071 + 1053 +
1072 while (node != NULL) { 1054 while (node != NULL) {
1073 // Check if the node we just exited ends a word 1055 // Check if the node we just exited ends a word
1074 if (limit > 0 && (node->flagscount & kParentEndsWord)) { 1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) {
1075 + if(values != NULL){ 1057 + if(values != NULL){
1076 + values[mycount] = getValue(node); 1058 + values[mycount] = getValue(node);
1077 + } 1059 + }
1078 lengths[mycount++] = i; 1060 lengths[mycount++] = i;
1079 --limit; 1061 --limit;
1080 } 1062 }
1081 @@ -487,7 +720,7 @@ 1063 @@ -487,7 +709,7 @@
1082 break; 1064 break;
1083 } 1065 }
1084 1066
1085 - int nodeCount = (node->flagscount & kCountMask); 1067 - int nodeCount = (node->flagscount & kCountMask);
1086 + int nodeCount = getCount(node); 1068 + int nodeCount = getCount(node);
1087 if (nodeCount == 0) { 1069 if (nodeCount == 0) {
1088 // Special terminal node; return now 1070 // Special terminal node; return now
1089 break; 1071 break;
1090 @@ -507,35 +740,27 @@ 1072 @@ -507,35 +729,27 @@
1091 // To get here we must have come through the whole list successfully; 1073 // To get here we must have come through the whole list successfully;
1092 // go on to the next node. Note that a word cannot end in the middle 1074 // go on to the next node. Note that a word cannot end in the middle
1093 // of a vertical node. 1075 // of a vertical node.
1094 - node = getCompactNode(fData, vnode->equal); 1076 - node = getCompactNode(fData, vnode->equal);
1095 + node = getCompactNode(fInfo, calcEqualLink(vnode)); 1077 + node = getCompactNode(fInfo, calcEqualLink(vnode));
1096 } 1078 }
1097 else { 1079 else {
1098 // Horizontal node; do binary search 1080 // Horizontal node; do binary search
1099 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont alNode *)node; 1081 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont alNode *)node;
1100 - int low = 0; 1082 - int low = 0;
(...skipping 29 matching lines...) Expand all
1130 + }else{ 1112 + }else{
1131 + node = NULL; // If we don't find a match, we'll fall out of the loop 1113 + node = NULL; // If we don't find a match, we'll fall out of the loop
1132 } 1114 }
1133 } 1115 }
1134 } 1116 }
1135 -exit: 1117 -exit:
1136 + exit: 1118 + exit:
1137 count = mycount; 1119 count = mycount;
1138 return i; 1120 return i;
1139 } 1121 }
1140 @@ -545,16 +770,16 @@ 1122 @@ -545,16 +759,16 @@
1141 private: 1123 private:
1142 UVector32 fNodeStack; // Stack of nodes to process 1124 UVector32 fNodeStack; // Stack of nodes to process
1143 UVector32 fIndexStack; // Stack of where in node we are 1125 UVector32 fIndexStack; // Stack of where in node we are
1144 - const CompactTrieHeader *fHeader; // Trie data 1126 - const CompactTrieHeader *fHeader; // Trie data
1145 + const CompactTrieInfo *fInfo; // Trie data 1127 + const CompactTrieInfo *fInfo; // Trie data
1146 1128
1147 public: 1129 public:
1148 static UClassID U_EXPORT2 getStaticClassID(void); 1130 static UClassID U_EXPORT2 getStaticClassID(void);
1149 virtual UClassID getDynamicClassID(void) const; 1131 virtual UClassID getDynamicClassID(void) const;
1150 public: 1132 public:
1151 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) 1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
1152 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) 1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
1153 : fNodeStack(status), fIndexStack(status) { 1135 : fNodeStack(status), fIndexStack(status) {
1154 - fHeader = header; 1136 - fHeader = header;
1155 - fNodeStack.push(header->root, status); 1137 - fNodeStack.push(header->root, status);
1156 + fInfo = info; 1138 + fInfo = info;
1157 + fNodeStack.push(info->root, status); 1139 + fNodeStack.push(info->root, status);
1158 fIndexStack.push(0, status); 1140 fIndexStack.push(0, status);
1159 unistr.remove(); 1141 unistr.remove();
1160 } 1142 }
1161 @@ -564,14 +789,14 @@ 1143 @@ -564,14 +778,14 @@
1162 1144
1163 virtual StringEnumeration *clone() const { 1145 virtual StringEnumeration *clone() const {
1164 UErrorCode status = U_ZERO_ERROR; 1146 UErrorCode status = U_ZERO_ERROR;
1165 - return new CompactTrieEnumeration(fHeader, status); 1147 - return new CompactTrieEnumeration(fHeader, status);
1166 + return new CompactTrieEnumeration(fInfo, status); 1148 + return new CompactTrieEnumeration(fInfo, status);
1167 } 1149 }
1168 1150
1169 virtual const UnicodeString * snext(UErrorCode &status); 1151 virtual const UnicodeString * snext(UErrorCode &status);
1170 1152
1171 // Very expensive, but this should never be used. 1153 // Very expensive, but this should never be used.
1172 virtual int32_t count(UErrorCode &status) const { 1154 virtual int32_t count(UErrorCode &status) const {
1173 - CompactTrieEnumeration counter(fHeader, status); 1155 - CompactTrieEnumeration counter(fHeader, status);
1174 + CompactTrieEnumeration counter(fInfo, status); 1156 + CompactTrieEnumeration counter(fInfo, status);
1175 int32_t result = 0; 1157 int32_t result = 0;
1176 while (counter.snext(status) != NULL && U_SUCCESS(status)) { 1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
1177 ++result; 1159 ++result;
1178 @@ -582,7 +807,7 @@ 1160 @@ -582,7 +796,7 @@
1179 virtual void reset(UErrorCode &status) { 1161 virtual void reset(UErrorCode &status) {
1180 fNodeStack.removeAllElements(); 1162 fNodeStack.removeAllElements();
1181 fIndexStack.removeAllElements(); 1163 fIndexStack.removeAllElements();
1182 - fNodeStack.push(fHeader->root, status); 1164 - fNodeStack.push(fHeader->root, status);
1183 + fNodeStack.push(fInfo->root, status); 1165 + fNodeStack.push(fInfo->root, status);
1184 fIndexStack.push(0, status); 1166 fIndexStack.push(0, status);
1185 unistr.remove(); 1167 unistr.remove();
1186 } 1168 }
1187 @@ -595,26 +820,34 @@ 1169 @@ -595,26 +809,34 @@
1188 if (fNodeStack.empty() || U_FAILURE(status)) { 1170 if (fNodeStack.empty() || U_FAILURE(status)) {
1189 return NULL; 1171 return NULL;
1190 } 1172 }
1191 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); 1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
1192 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); 1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
1193 int where = fIndexStack.peeki(); 1175 int where = fIndexStack.peeki();
1194 while (!fNodeStack.empty() && U_SUCCESS(status)) { 1176 while (!fNodeStack.empty() && U_SUCCESS(status)) {
1195 - int nodeCount = (node->flagscount & kCountMask); 1177 - int nodeCount = (node->flagscount & kCountMask);
1196 + int nodeCount; 1178 + int nodeCount;
1197 + 1179 +
(...skipping 20 matching lines...) Expand all
1218 if (where == 0) { 1200 if (where == 0) {
1219 // Going down 1201 // Going down
1220 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ; 1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ;
1221 + unistr.append((const UChar *)vnode->chars, nodeCount); 1203 + unistr.append((const UChar *)vnode->chars, nodeCount);
1222 fIndexStack.setElementAt(1, fIndexStack.size()-1); 1204 fIndexStack.setElementAt(1, fIndexStack.size()-1);
1223 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus)); 1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus));
1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status)); 1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status));
1225 where = fIndexStack.push(0, status); 1207 where = fIndexStack.push(0, status);
1226 goingDown = TRUE; 1208 goingDown = TRUE;
1227 } 1209 }
1228 @@ -623,7 +856,7 @@ 1210 @@ -623,7 +845,7 @@
1229 unistr.truncate(unistr.length()-nodeCount); 1211 unistr.truncate(unistr.length()-nodeCount);
1230 fNodeStack.popi(); 1212 fNodeStack.popi();
1231 fIndexStack.popi(); 1213 fIndexStack.popi();
1232 - node = getCompactNode(fHeader, fNodeStack.peeki()); 1214 - node = getCompactNode(fHeader, fNodeStack.peeki());
1233 + node = getCompactNode(fInfo, fNodeStack.peeki()); 1215 + node = getCompactNode(fInfo, fNodeStack.peeki());
1234 where = fIndexStack.peeki(); 1216 where = fIndexStack.peeki();
1235 } 1217 }
1236 } 1218 }
1237 @@ -638,7 +871,7 @@ 1219 @@ -638,7 +860,7 @@
1238 // Push on next node 1220 // Push on next node
1239 unistr.append((UChar)hnode->entries[where].ch); 1221 unistr.append((UChar)hnode->entries[where].ch);
1240 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); 1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
1241 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status)); 1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status));
1242 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status)); 1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status));
1243 where = fIndexStack.push(0, status); 1225 where = fIndexStack.push(0, status);
1244 goingDown = TRUE; 1226 goingDown = TRUE;
1245 } 1227 }
1246 @@ -646,12 +879,14 @@ 1228 @@ -646,12 +868,14 @@
1247 // Going up 1229 // Going up
1248 fNodeStack.popi(); 1230 fNodeStack.popi();
1249 fIndexStack.popi(); 1231 fIndexStack.popi();
1250 - node = getCompactNode(fHeader, fNodeStack.peeki()); 1232 - node = getCompactNode(fHeader, fNodeStack.peeki());
1251 + node = getCompactNode(fInfo, fNodeStack.peeki()); 1233 + node = getCompactNode(fInfo, fNodeStack.peeki());
1252 where = fIndexStack.peeki(); 1234 where = fIndexStack.peeki();
1253 } 1235 }
1254 } 1236 }
1255 + 1237 +
1256 // Check if the parent of the node we've just gone down to ends a 1238 // Check if the parent of the node we've just gone down to ends a
1257 // word. If so, return it. 1239 // word. If so, return it.
1258 + // The root node should never end up here. 1240 + // The root node should never end up here.
1259 if (goingDown && (node->flagscount & kParentEndsWord)) { 1241 if (goingDown && (node->flagscount & kParentEndsWord)) {
1260 return &unistr; 1242 return &unistr;
1261 } 1243 }
1262 @@ -664,7 +899,7 @@ 1244 @@ -664,7 +888,7 @@
1263 if (U_FAILURE(status)) { 1245 if (U_FAILURE(status)) {
1264 return NULL; 1246 return NULL;
1265 } 1247 }
1266 - return new CompactTrieEnumeration(fData, status); 1248 - return new CompactTrieEnumeration(fData, status);
1267 + return new CompactTrieEnumeration(fInfo, status); 1249 + return new CompactTrieEnumeration(fInfo, status);
1268 } 1250 }
1269 1251
1270 // 1252 //
1271 @@ -672,21 +907,36 @@ 1253 @@ -672,21 +896,36 @@
1272 // and back again 1254 // and back again
1273 // 1255 //
1274 1256
1275 -// Helper classes to construct the compact trie 1257 -// Helper classes to construct the compact trie
1276 +enum CompactTrieNodeType { 1258 +enum CompactTrieNodeType {
1277 + kHorizontalType = 0, 1259 + kHorizontalType = 0,
1278 + kVerticalType = 1, 1260 + kVerticalType = 1,
1279 + kValueType = 2 1261 + kValueType = 2
1280 +}; 1262 +};
1281 + 1263 +
(...skipping 22 matching lines...) Expand all
1304 fParentEndsWord = parentEndsWord; 1286 fParentEndsWord = parentEndsWord;
1305 fHasDuplicate = FALSE; 1287 fHasDuplicate = FALSE;
1306 - fVertical = vertical; 1288 - fVertical = vertical;
1307 + fNodeType = nodeType; 1289 + fNodeType = nodeType;
1308 + fEqualOverflows = FALSE; 1290 + fEqualOverflows = FALSE;
1309 fNodeID = nodes.size(); 1291 fNodeID = nodes.size();
1310 + fValue = parentEndsWord? value : 0; 1292 + fValue = parentEndsWord? value : 0;
1311 nodes.push(this, status); 1293 nodes.push(this, status);
1312 } 1294 }
1313 1295
1314 @@ -694,87 +944,225 @@ 1296 @@ -694,87 +933,225 @@
1315 } 1297 }
1316 1298
1317 virtual uint32_t size() { 1299 virtual uint32_t size() {
1318 - return sizeof(uint16_t); 1300 - return sizeof(uint16_t);
1319 + if(fValue > 0) 1301 + if(fValue > 0)
1320 + return sizeof(uint16_t) * 2; 1302 + return sizeof(uint16_t) * 2;
1321 + else 1303 + else
1322 + return sizeof(uint16_t); 1304 + return sizeof(uint16_t);
1323 } 1305 }
1324 1306
(...skipping 221 matching lines...) Expand 10 before | Expand all | Expand 10 after
1546 + // append 16 bits of to end for equal node if fEqualOverflows 1528 + // append 16 bits of to end for equal node if fEqualOverflows
1547 + if (fEqualOverflows) { 1529 + if (fEqualOverflows) {
1548 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16); 1530 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16);
1549 + offset += sizeof(uint16_t); 1531 + offset += sizeof(uint16_t);
1550 + } 1532 + }
1551 + 1533 +
1552 + BuildCompactTrieNode::writeValue(bytes, offset); 1534 + BuildCompactTrieNode::writeValue(bytes, offset);
1553 } 1535 }
1554 1536
1555 void addChar(UChar ch) { 1537 void addChar(UChar ch) {
1556 @@ -784,60 +1172,85 @@ 1538 @@ -784,60 +1161,85 @@
1557 void setLink(BuildCompactTrieNode *node) { 1539 void setLink(BuildCompactTrieNode *node) {
1558 fEqual = node; 1540 fEqual = node;
1559 } 1541 }
1560 + 1542 +
1561 }; 1543 };
1562 1544
1563 // Forward declaration 1545 // Forward declaration
1564 static void walkHorizontal(const TernaryNode *node, 1546 static void walkHorizontal(const TernaryNode *node,
1565 BuildCompactTrieHorizontalNode *building, 1547 BuildCompactTrieHorizontalNode *building,
1566 UStack &nodes, 1548 UStack &nodes,
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
1644 + } else { 1626 + } else {
1645 vResult->setLink((BuildCompactTrieNode *)nodes[1]); 1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]);
1646 } 1628 }
1647 } 1629 }
1648 else { 1630 else {
1649 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ; 1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ;
1650 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value)); 1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
1651 } 1633 }
1652 result = vResult; 1634 result = vResult;
1653 } 1635 }
1654 @@ -849,19 +1262,28 @@ 1636 @@ -849,19 +1251,28 @@
1655 // Uses recursion. 1637 // Uses recursion.
1656 1638
1657 static void walkHorizontal(const TernaryNode *node, 1639 static void walkHorizontal(const TernaryNode *node,
1658 - BuildCompactTrieHorizontalNode *building, 1640 - BuildCompactTrieHorizontalNode *building,
1659 - UStack &nodes, 1641 - UStack &nodes,
1660 - UErrorCode &status) { 1642 - UErrorCode &status) {
1661 + BuildCompactTrieHorizontalNode *building, 1643 + BuildCompactTrieHorizontalNode *building,
1662 + UStack &nodes, 1644 + UStack &nodes,
1663 + UErrorCode &status, Hashtable *values = NULL) { 1645 + UErrorCode &status, Hashtable *values = NULL) {
1664 while (U_SUCCESS(status) && node != NULL) { 1646 while (U_SUCCESS(status) && node != NULL) {
1665 if (node->low != NULL) { 1647 if (node->low != NULL) {
1666 - walkHorizontal(node->low, building, nodes, status); 1648 - walkHorizontal(node->low, building, nodes, status);
1667 + walkHorizontal(node->low, building, nodes, status, values); 1649 + walkHorizontal(node->low, building, nodes, status, values);
1668 } 1650 }
1669 BuildCompactTrieNode *link = NULL; 1651 BuildCompactTrieNode *link = NULL;
1670 if (node->equal != NULL) { 1652 if (node->equal != NULL) {
1671 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status); 1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
1672 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags); 1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
1673 } 1655 }
1674 - else if (node->flags & kEndsWord) { 1656 - else if (node->flags & kEndsWord) {
1675 - link = (BuildCompactTrieNode *)nodes[1]; 1657 - link = (BuildCompactTrieNode *)nodes[1];
1676 + else if (node->flags > 0) { 1658 + else if (node->flags > 0) {
1677 + if(values != NULL) { 1659 + if(values != NULL) {
1678 + UnicodeString key(node->flags); //store value as a single-char UnicodeString 1660 + UnicodeString key(node->flags); //store value as a single-char UnicodeString
1679 + link = (BuildCompactTrieValueNode *) values->get(key); 1661 + link = (BuildCompactTrieValueNode *) values->get(key);
1680 + if(link == NULL) { 1662 + if(link == NULL) {
1681 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes? 1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes?
1682 + values->put(key, link, status); 1664 + values->put(key, link, status);
1683 + } 1665 + }
1684 + } else { 1666 + } else {
1685 + link = (BuildCompactTrieNode *)nodes[1]; 1667 + link = (BuildCompactTrieNode *)nodes[1];
1686 + } 1668 + }
1687 } 1669 }
1688 if (U_SUCCESS(status) && link != NULL) { 1670 if (U_SUCCESS(status) && link != NULL) {
1689 building->addNode(node->ch, link, status); 1671 building->addNode(node->ch, link, status);
1690 @@ -881,13 +1303,15 @@ 1672 @@ -881,13 +1292,15 @@
1691 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { 1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
1692 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; 1674 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
1693 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; 1675 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1694 + 1676 +
1695 // Check for comparing a node to itself, to avoid spurious duplicates 1677 // Check for comparing a node to itself, to avoid spurious duplicates
1696 if (left == right) { 1678 if (left == right) {
1697 return 0; 1679 return 0;
1698 } 1680 }
1699 + 1681 +
1700 // Most significant is type of node. Can never coalesce. 1682 // Most significant is type of node. Can never coalesce.
1701 - if (left->fVertical != right->fVertical) { 1683 - if (left->fVertical != right->fVertical) {
1702 - return left->fVertical - right->fVertical; 1684 - return left->fVertical - right->fVertical;
1703 + if (left->fNodeType != right->fNodeType) { 1685 + if (left->fNodeType != right->fNodeType) {
1704 + return left->fNodeType - right->fNodeType; 1686 + return left->fNodeType - right->fNodeType;
1705 } 1687 }
1706 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. 1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
1707 if (left->fParentEndsWord != right->fParentEndsWord) { 1689 if (left->fParentEndsWord != right->fParentEndsWord) {
1708 @@ -898,12 +1322,19 @@ 1690 @@ -898,12 +1311,19 @@
1709 if (result != 0) { 1691 if (result != 0) {
1710 return result; 1692 return result;
1711 } 1693 }
1712 + 1694 +
1713 + // If the node value differs, we should not coalesce. 1695 + // If the node value differs, we should not coalesce.
1714 + // If values aren't stored, all fValues should be 0. 1696 + // If values aren't stored, all fValues should be 0.
1715 + if (left->fValue != right->fValue) { 1697 + if (left->fValue != right->fValue) {
1716 + return left->fValue - right->fValue; 1698 + return left->fValue - right->fValue;
1717 + } 1699 + }
1718 + 1700 +
1719 // We know they're both the same node type, so branch for the two cases. 1701 // We know they're both the same node type, so branch for the two cases.
1720 - if (left->fVertical) { 1702 - if (left->fVertical) {
1721 + if (left->fNodeType == kVerticalType) { 1703 + if (left->fNodeType == kVerticalType) {
1722 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID 1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
1723 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID; 1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID;
1724 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; 1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1725 } 1707 }
1726 - else { 1708 - else {
1727 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ 1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){
1728 // We need to compare the links vectors. They should be the 1710 // We need to compare the links vectors. They should be the
1729 // same size because the strings were equal. 1711 // same size because the strings were equal.
1730 // We compare the node IDs instead of the pointers, to handle 1712 // We compare the node IDs instead of the pointers, to handle
1731 @@ -914,9 +1345,10 @@ 1713 @@ -914,9 +1334,10 @@
1732 int32_t count = hleft->fLinks.size(); 1714 int32_t count = hleft->fLinks.size();
1733 for (int32_t i = 0; i < count && result == 0; ++i) { 1715 for (int32_t i = 0; i < count && result == 0; ++i) {
1734 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - 1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
1735 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1736 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1737 } 1719 }
1738 } 1720 }
1739 + 1721 +
1740 // If they are equal to each other, mark them (speeds coalescing) 1722 // If they are equal to each other, mark them (speeds coalescing)
1741 if (result == 0) { 1723 if (result == 0) {
1742 left->fHasDuplicate = TRUE; 1724 left->fHasDuplicate = TRUE;
1743 @@ -1031,20 +1463,25 @@ 1725 @@ -1031,20 +1452,25 @@
1744 // Add node 0, used as the NULL pointer/sentinel. 1726 // Add node 0, used as the NULL pointer/sentinel.
1745 nodes.addElement((int32_t)0, status); 1727 nodes.addElement((int32_t)0, status);
1746 1728
1747 + Hashtable *values = NULL; // Index of (unique) values 1729 + Hashtable *values = NULL; // Index of (unique) values
1748 + if (dict.fValued) { 1730 + if (dict.fValued) {
1749 + values = new Hashtable(status); 1731 + values = new Hashtable(status);
1750 + } 1732 + }
1751 + 1733 +
1752 // Start by creating the special empty node we use to indicate that the parent 1734 // Start by creating the special empty node we use to indicate that the parent
1753 // terminates a word. This must be node 1, because the builder assumes 1735 // terminates a word. This must be node 1, because the builder assumes
1754 - // that. 1736 - // that.
1755 + // that. This node will never be used for tries storing numerical values. 1737 + // that. This node will never be used for tries storing numerical values.
1756 if (U_FAILURE(status)) { 1738 if (U_FAILURE(status)) {
1757 return NULL; 1739 return NULL;
1758 } 1740 }
1759 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status); 1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status);
1760 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status); 1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status);
1761 if (terminal == NULL) { 1743 if (terminal == NULL) {
1762 status = U_MEMORY_ALLOCATION_ERROR; 1744 status = U_MEMORY_ALLOCATION_ERROR;
1763 } 1745 }
1764 1746
1765 // This call does all the work of building the new trie structure. The root 1747 // This call does all the work of building the new trie structure. The root
1766 - // will be node 2. 1748 - // will be node 2.
1767 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s); 1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s);
1768 + // will have node ID 2 before writing to memory. 1750 + // will have node ID 2 before writing to memory.
1769 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values); 1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values);
1770 #ifdef DEBUG_TRIE_DICT 1752 #ifdef DEBUG_TRIE_DICT
1771 (void) ::times(&timing); 1753 (void) ::times(&timing);
1772 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", 1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1773 @@ -1077,21 +1514,37 @@ 1755 @@ -1077,21 +1503,37 @@
1774 return NULL; 1756 return NULL;
1775 } 1757 }
1776 1758
1777 + //map terminal value nodes 1759 + //map terminal value nodes
1778 + int valueCount = 0; 1760 + int valueCount = 0;
1779 + UVector valueNodes(status); 1761 + UVector valueNodes(status);
1780 + if(values != NULL) { 1762 + if(values != NULL) {
1781 + valueCount = values->count(); //number of unique terminal value nodes 1763 + valueCount = values->count(); //number of unique terminal value nodes
1782 + } 1764 + }
1783 + 1765 +
(...skipping 23 matching lines...) Expand all
1807 } 1789 }
1808 - 1790 -
1809 - // Check for overflowing 16 bits worth of nodes. 1791 - // Check for overflowing 16 bits worth of nodes.
1810 - if (nodeCount > 0x10000) { 1792 - if (nodeCount > 0x10000) {
1811 + 1793 +
1812 + // Check for overflowing 20 bits worth of nodes. 1794 + // Check for overflowing 20 bits worth of nodes.
1813 + if (nodeCount > 0x100000) { 1795 + if (nodeCount > 0x100000) {
1814 status = U_ILLEGAL_ARGUMENT_ERROR; 1796 status = U_ILLEGAL_ARGUMENT_ERROR;
1815 return NULL; 1797 return NULL;
1816 } 1798 }
1817 @@ -1111,9 +1564,14 @@ 1799 @@ -1111,9 +1553,14 @@
1818 status = U_MEMORY_ALLOCATION_ERROR; 1800 status = U_MEMORY_ALLOCATION_ERROR;
1819 return NULL; 1801 return NULL;
1820 } 1802 }
1821 - 1803 -
1822 + 1804 +
1823 CompactTrieHeader *header = (CompactTrieHeader *)bytes; 1805 CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1824 - header->size = totalSize; 1806 - header->size = totalSize;
1825 + //header->size = totalSize; 1807 + //header->size = totalSize;
1826 + if(dict.fValued){ 1808 + if(dict.fValued){
1827 + header->magic = COMPACT_TRIE_MAGIC_3; 1809 + header->magic = COMPACT_TRIE_MAGIC_3;
1828 + } else { 1810 + } else {
1829 + header->magic = COMPACT_TRIE_MAGIC_2; 1811 + header->magic = COMPACT_TRIE_MAGIC_2;
1830 + } 1812 + }
1831 header->nodeCount = nodeCount; 1813 header->nodeCount = nodeCount;
1832 header->offsets[0] = 0; // Sentinel 1814 header->offsets[0] = 0; // Sentinel
1833 header->root = translate.elementAti(root->fNodeID); 1815 header->root = translate.elementAti(root->fNodeID);
1834 @@ -1123,23 +1581,40 @@ 1816 @@ -1123,23 +1570,40 @@
1835 } 1817 }
1836 #endif 1818 #endif
1837 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t)); 1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t));
1838 - nodeCount = 1; 1820 - nodeCount = 1;
1839 + nodeCount = valueCount + 1; 1821 + nodeCount = valueCount + 1;
1840 + 1822 +
1841 + // Write terminal value nodes to memory 1823 + // Write terminal value nodes to memory
1842 + for (i=0; i < valueNodes.size(); i++) { 1824 + for (i=0; i < valueNodes.size(); i++) {
1843 + //header->offsets[i + 1] = offset; 1825 + //header->offsets[i + 1] = offset;
1844 + uint32_t tmpOffset = 0; 1826 + uint32_t tmpOffset = 0;
(...skipping 23 matching lines...) Expand all
1868 fprintf(stderr, "Trie built, time user %f system %f\n", 1850 fprintf(stderr, "Trie built, time user %f system %f\n",
1869 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1870 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1871 previous = timing; 1853 previous = timing;
1872 fprintf(stderr, "Final offset is %d\n", offset); 1854 fprintf(stderr, "Final offset is %d\n", offset);
1873 - 1855 -
1874 + 1856 +
1875 // Collect statistics on node types and sizes 1857 // Collect statistics on node types and sizes
1876 int hCount = 0; 1858 int hCount = 0;
1877 int vCount = 0; 1859 int vCount = 0;
1878 @@ -1148,68 +1623,85 @@ 1860 @@ -1148,68 +1612,85 @@
1879 size_t hItemCount = 0; 1861 size_t hItemCount = 0;
1880 size_t vItemCount = 0; 1862 size_t vItemCount = 0;
1881 uint32_t previousOff = offset; 1863 uint32_t previousOff = offset;
1882 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1883 + uint32_t numOverflow = 0; 1865 + uint32_t numOverflow = 0;
1884 + uint32_t valueSpace = 0; 1866 + uint32_t valueSpace = 0;
1885 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1886 const CompactTrieNode *node = getCompactNode(header, nodeIdx); 1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1887 - if (node->flagscount & kVerticalNode) { 1869 - if (node->flagscount & kVerticalNode) {
1888 + int itemCount; 1870 + int itemCount;
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
1974 1956
1975 // Convert one compact trie node into a ternary subtrie 1957 // Convert one compact trie node into a ternary subtrie
1976 static TernaryNode * 1958 static TernaryNode *
1977 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE rrorCode &status ) { 1959 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE rrorCode &status ) {
1978 - int nodeCount = (node->flagscount & kCountMask); 1960 - int nodeCount = (node->flagscount & kCountMask);
1979 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError Code &status ) { 1961 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError Code &status ) {
1980 + int nodeCount = getCount(node); 1962 + int nodeCount = getCount(node);
1981 if (nodeCount == 0 || U_FAILURE(status)) { 1963 if (nodeCount == 0 || U_FAILURE(status)) {
1982 // Failure, or terminal node 1964 // Failure, or terminal node
1983 return NULL; 1965 return NULL;
1984 @@ -1234,29 +1726,41 @@ 1966 @@ -1234,29 +1715,41 @@
1985 previous = latest; 1967 previous = latest;
1986 } 1968 }
1987 if (latest != NULL) { 1969 if (latest != NULL) {
1988 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ; 1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ;
1989 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node)); 1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node));
1990 if (equal->flagscount & kParentEndsWord) { 1972 if (equal->flagscount & kParentEndsWord) {
1991 - latest->flags |= kEndsWord; 1973 - latest->flags |= kEndsWord;
1992 + if(info->magic == COMPACT_TRIE_MAGIC_3){ 1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){
1993 + latest->flags = getValue(equal); 1975 + latest->flags = getValue(equal);
1994 + } else { 1976 + } else {
(...skipping 27 matching lines...) Expand all
2022 + // because only kEqualOverflows flag should be checked in root's flagscount 2004 + // because only kEqualOverflows flag should be checked in root's flagscount
2023 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode * ) 2005 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode * )
2024 + getCompactNode(fInfo, fInfo->root); 2006 + getCompactNode(fInfo, fInfo->root);
2025 + uint16_t nodeCount = hnode->flagscount & kRootCountMask; 2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask;
2026 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, 2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
2027 + nodeCount, status); 2009 + nodeCount, status);
2028 + 2010 +
2029 if (U_FAILURE(status)) { 2011 if (U_FAILURE(status)) {
2030 delete root; // Clean up 2012 delete root; // Clean up
2031 delete result; 2013 delete result;
2032 @@ -1270,8 +1774,8 @@ 2014 @@ -1270,8 +1763,8 @@
2033 2015
2034 U_CAPI int32_t U_EXPORT2 2016 U_CAPI int32_t U_EXPORT2
2035 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
2036 - UErrorCode *status) { 2018 - UErrorCode *status) {
2037 - 2019 -
2038 + UErrorCode *status) { 2020 + UErrorCode *status) {
2039 + 2021 +
2040 if (status == NULL || U_FAILURE(*status)) { 2022 if (status == NULL || U_FAILURE(*status)) {
2041 return 0; 2023 return 0;
2042 } 2024 }
2043 @@ -1286,14 +1790,14 @@ 2025 @@ -1286,14 +1779,14 @@
2044 // 2026 //
2045 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); 2027 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
2046 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ 2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
2047 - pInfo->dataFormat[1]==0x72 && 2029 - pInfo->dataFormat[1]==0x72 &&
2048 - pInfo->dataFormat[2]==0x44 && 2030 - pInfo->dataFormat[2]==0x44 &&
2049 - pInfo->dataFormat[3]==0x63 && 2031 - pInfo->dataFormat[3]==0x63 &&
2050 - pInfo->formatVersion[0]==1 )) { 2032 - pInfo->formatVersion[0]==1 )) {
2051 + pInfo->dataFormat[1]==0x72 && 2033 + pInfo->dataFormat[1]==0x72 &&
2052 + pInfo->dataFormat[2]==0x44 && 2034 + pInfo->dataFormat[2]==0x44 &&
2053 + pInfo->dataFormat[3]==0x63 && 2035 + pInfo->dataFormat[3]==0x63 &&
2054 + pInfo->formatVersion[0]==1 )) { 2036 + pInfo->formatVersion[0]==1 )) {
2055 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
2056 - pInfo->dataFormat[0], pInfo->dataFormat[1], 2038 - pInfo->dataFormat[0], pInfo->dataFormat[1],
2057 - pInfo->dataFormat[2], pInfo->dataFormat[3], 2039 - pInfo->dataFormat[2], pInfo->dataFormat[3],
2058 - pInfo->formatVersion[0]); 2040 - pInfo->formatVersion[0]);
2059 + pInfo->dataFormat[0], pInfo->dataFormat[1], 2041 + pInfo->dataFormat[0], pInfo->dataFormat[1],
2060 + pInfo->dataFormat[2], pInfo->dataFormat[3], 2042 + pInfo->dataFormat[2], pInfo->dataFormat[3],
2061 + pInfo->formatVersion[0]); 2043 + pInfo->formatVersion[0]);
2062 *status=U_UNSUPPORTED_ERROR; 2044 *status=U_UNSUPPORTED_ERROR;
2063 return 0; 2045 return 0;
2064 } 2046 }
2065 @@ -1311,8 +1815,10 @@ 2047 @@ -1311,8 +1804,10 @@
2066 // 2048 //
2067 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 2049 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
2068 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; 2050 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
2069 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
2070 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 2052 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2071 + uint32_t magic = ds->readUInt32(header->magic); 2053 + uint32_t magic = ds->readUInt32(header->magic);
2072 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3 2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
2073 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1) 2055 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
2074 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 2056 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2075 { 2057 {
2076 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" ); 2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" );
2077 *status=U_UNSUPPORTED_ERROR; 2059 *status=U_UNSUPPORTED_ERROR;
2078 @@ -1333,10 +1839,10 @@ 2060 @@ -1333,10 +1828,10 @@
2079 // 2061 //
2080 if (length < sizeWithUData) { 2062 if (length < sizeWithUData) {
2081 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", 2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
2082 - totalSize); 2064 - totalSize);
2083 + totalSize); 2065 + totalSize);
2084 *status=U_INDEX_OUTOFBOUNDS_ERROR; 2066 *status=U_INDEX_OUTOFBOUNDS_ERROR;
2085 return 0; 2067 return 0;
2086 - } 2068 - }
2087 + } 2069 + }
2088 2070
2089 // 2071 //
2090 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because 2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
2091 @@ -1355,20 +1861,38 @@ 2073 @@ -1355,20 +1850,38 @@
2092 } 2074 }
2093 2075
2094 // We need to loop through all the nodes in the offset table, and swap each one. 2076 // We need to loop through all the nodes in the offset table, and swap each one.
2095 - uint16_t nodeCount = ds->readUInt16(header->nodeCount); 2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);
2096 + uint32_t nodeCount, rootId; 2078 + uint32_t nodeCount, rootId;
2097 + if(header->magic == COMPACT_TRIE_MAGIC_1) { 2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) {
2098 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); 2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
2099 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); 2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
2100 + } else { 2082 + } else {
2101 + nodeCount = ds->readUInt32(header->nodeCount); 2083 + nodeCount = ds->readUInt32(header->nodeCount);
(...skipping 24 matching lines...) Expand all
2126 + overflow += 1; 2108 + overflow += 1;
2127 + } 2109 + }
2128 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars), 2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars),
2129 - itemCount*sizeof(uint16_t), 2111 - itemCount*sizeof(uint16_t),
2130 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status); 2112 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status);
2131 + (itemCount + overflow)*sizeof(uint16_t), 2113 + (itemCount + overflow)*sizeof(uint16_t),
2132 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status); 2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status);
2133 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal); 2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal);
2134 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal)); 2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal));
2135 } 2117 }
2136 @@ -1381,26 +1905,62 @@ 2118 @@ -1381,26 +1894,62 @@
2137 word = ds->readUInt16(inHNode->entries[j].equal); 2119 word = ds->readUInt16(inHNode->entries[j].equal);
2138 ds->writeUInt16(&outHNode->entries[j].equal, word); 2120 ds->writeUInt16(&outHNode->entries[j].equal, word);
2139 } 2121 }
2140 + 2122 +
2141 + // swap overflow/value information 2123 + // swap overflow/value information
2142 + if(flagscount & kEqualOverflows){ 2124 + if(flagscount & kEqualOverflows){
2143 + overflow += (itemCount + 3) / 4; 2125 + overflow += (itemCount + 3) / 4;
2144 + } 2126 + }
2145 + 2127 +
2146 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) { 2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) {
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
2202 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status); 2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);
2203 + 2185 +
2204 + //swap offsets 2186 + //swap offsets
2205 + ds->swapArray32(ds, inBytes+offsetPos, 2187 + ds->swapArray32(ds, inBytes+offsetPos,
2206 + sizeof(uint32_t)*(uint32_t)nodeCount, 2188 + sizeof(uint32_t)*(uint32_t)nodeCount,
2207 + outBytes+offsetPos, status); 2189 + outBytes+offsetPos, status);
2208 2190
2209 return sizeWithUData; 2191 return sizeWithUData;
2210 } 2192 }
2211 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700
2212 +++ source/common/triedict.h» 2009-07-27 13:01:17.723390000 -0700 2194 +++ source/common/triedict.h» 2011-01-21 14:12:45.496927000 -0800
2213 @@ -47,7 +47,6 @@ 2195 @@ -47,7 +47,6 @@
2214 U_NAMESPACE_BEGIN 2196 U_NAMESPACE_BEGIN
2215 2197
2216 class StringEnumeration; 2198 class StringEnumeration;
2217 -struct CompactTrieHeader; 2199 -struct CompactTrieHeader;
2218 2200
2219 /******************************************************************* 2201 /*******************************************************************
2220 * TrieWordDictionary 2202 * TrieWordDictionary
2221 @@ -72,23 +71,29 @@ 2203 @@ -72,23 +71,29 @@
2222 */ 2204 */
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after
2441 * 2423 *
2442 * @return The data for the compact dictionary, suitable for passing to the 2424 * @return The data for the compact dictionary, suitable for passing to the
2443 * constructor. 2425 * constructor.
2444 @@ -342,5 +386,5 @@ 2426 @@ -342,5 +386,5 @@
2445 2427
2446 U_NAMESPACE_END 2428 U_NAMESPACE_END
2447 2429
2448 - /* TRIEDICT_H */ 2430 - /* TRIEDICT_H */
2449 +/* TRIEDICT_H */ 2431 +/* TRIEDICT_H */
2450 #endif 2432 #endif
2451 --- source/data/brkitr/brkfiles.mk» 2009-04-21 15:42:37.000000000 -0700 2433 --- source/data/Makefile.in» 2010-10-29 13:21:33.000000000 -0700
2452 +++ source/data/brkitr/brkfiles.mk» 2009-07-27 13:01:17.730379000 -0700 2434 +++ source/data/Makefile.in» 2011-01-26 16:24:24.856798000 -0800
2453 @@ -34,13 +34,12 @@ 2435 @@ -509,8 +520,9 @@
2436 #################################################### CTD
2437 # CTD FILES
2454 2438
2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)
2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
2441 +# .ctd file now generated regardless of whether dictionary file exists
2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt
2455 2444
2456 # List of compact trie dictionary files (ctd). 2445 #################################################### CFU
2457 -BRK_CTD_SOURCE = thaidict.txt 2446 # CFU FILES
2458 +BRK_CTD_SOURCE = thaidict.txt cjdict.txt 2447 --- source/data/brkitr/root.txt»2010-07-28 17:18:28.000000000 -0700
2459 2448 +++ source/data/brkitr/root.txt»2011-01-21 14:12:45.653922000 -0800
2460
2461 # List of break iterator files (brk).
2462 -BRK_SOURCE = word_POSIX.txt word_ja.txt sent_el.txt char_th.txt char.txt word.t xt line.txt sent.txt title.txt
2463 +BRK_SOURCE = word_POSIX.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt
2464
2465
2466 # Ordinary resources
2467 -BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt ja.txt th.txt
2468 -
2469 +BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt th.txt
2470 --- source/data/brkitr/root.txt»2009-06-24 14:06:38.000000000 -0700
2471 +++ source/data/brkitr/root.txt»2009-07-27 13:01:17.733382000 -0700
2472 @@ -17,5 +17,8 @@ 2449 @@ -17,5 +17,8 @@
2473 } 2450 }
2474 dictionaries{ 2451 dictionaries{
2475 Thai:process(dependency){"thaidict.ctd"} 2452 Thai:process(dependency){"thaidict.ctd"}
2476 + Hani:process(dependency){"cjdict.ctd"} 2453 + Hani:process(dependency){"cjdict.ctd"}
2477 + Hira:process(dependency){"cjdict.ctd"} 2454 + Hira:process(dependency){"cjdict.ctd"}
2478 + Kata:process(dependency){"cjdict.ctd"} 2455 + Kata:process(dependency){"cjdict.ctd"}
2479 } 2456 }
2480 } 2457 }
2481 --- source/data/brkitr/word.txt»2009-06-24 14:06:38.000000000 -0700 2458 --- source/data/xml/brkitr/root.xml» 2010-03-01 15:13:18.000000000 -0800
2482 +++ source/data/brkitr/word.txt»2010-08-27 16:24:25.969372000 -0700 2459 +++ source/data/xml/brkitr/root.xml» 2011-01-21 14:12:45.735922000 -0800
2483 @@ -29,29 +29,49 @@
2484 $Newline = [\p{Word_Break = Newline}];
2485 $Extend = [\p{Word_Break = Extend}];
2486 $Format = [\p{Word_Break = Format}];
2487 +$Hiragana = [:Hiragana:];
2488 $Katakana = [\p{Word_Break = Katakana}];
2489 +$Han = [:Han:];
2490 $ALetter = [\p{Word_Break = ALetter}];
2491 -$MidNumLet = [\p{Word_Break = MidNumLet}];
2492 +# Remove two full stop characters from $MidNumLet and add them to $MidNum
2493 +# to break a hostname into its components at the cost of breaking
2494 +# 'e.g.' and 'i.e.' as well.
2495 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
2496 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
2497 +# while rules 6/7 are reverted to the old behavior we want.
2498 +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
2499 $MidLetter = [\p{Word_Break = MidLetter}];
2500 -$MidNum = [\p{Word_Break = MidNum}];
2501 -$Numeric = [\p{Word_Break = Numeric}];
2502 +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
2503 +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
2504 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
2505
2506 +# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.
2507 +$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];
2508 +# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by
2509 +# the current rule 6/7.
2510 +$HebrewMidLet = [\u0022];
2511
2512 # Dictionary character set, for triggering language-based break engines. Currently
2513 -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
2514 -# 5.0 or later as the definition of Complex_Context was corrected to include all
2515 +# limited to LineBreak=Complex_Context and CJK. Note that this set only works
2516 +# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all
2517 # characters requiring dictionary break.
2518
2519 -$dictionary = [:LineBreak = Complex_Context:];
2520 $Control = [\p{Grapheme_Cluster_Break = Control}];
2521 -$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default A Letter does not
2522 - # include the dic tionary characters.
2523 +$HangulSyllable = [\uac00-\ud7a3];
2524 +$ComplexContext = [:LineBreak = Complex_Context:];
2525 +$KanaKanji = [$Han $Hiragana $Katakana];
2526 +$dictionaryCJK = [$KanaKanji $HangulSyllable];
2527 +$dictionary = [$ComplexContext $dictionaryCJK];
2528 +
2529 +# leave CJK scripts out of ALetterPlus
2530 +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
2531 +
2532
2533 #
2534 # Rules 4 Ignore Format and Extend characters,
2535 # except when they appear at the beginning of a region of text.
2536 #
2537 +# TODO: check if handling of katakana in dictionary makes rules incorrect/void.
2538 $KatakanaEx = $Katakana ($Extend | $Format)*;
2539 $ALetterEx = $ALetterPlus ($Extend | $Format)*;
2540 $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
2541 @@ -59,8 +79,8 @@
2542 $MidNumEx = $MidNum ($Extend | $Format)*;
2543 $NumericEx = $Numeric ($Extend | $Format)*;
2544 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
2545 +$HebrewLetEx = $HebrewLet ($Extend | $Format)*;
2546
2547 -$Hiragana = [\p{script=Hiragana}];
2548 $Ideographic = [\p{Ideographic}];
2549 $HiraganaEx = $Hiragana ($Extend | $Format)*;
2550 $IdeographicEx = $Ideographic ($Extend | $Format)*;
2551 @@ -79,12 +99,14 @@
2552 # begins with a group of Format chars, or with a "word" consisting of a single
2553 # char that is not in any of the listed word break categories followed by
2554 # format char(s).
2555 -[^$CR $LF $Newline]? ($Extend | $Format)+;
2556 + # format char(s), or is not a CJK dictionary character.
2557 +[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;
2558
2559 $NumericEx {100};
2560 $ALetterEx {200};
2561 -$KatakanaEx {300}; # note: these status values override those from rule 5
2562 -$HiraganaEx {300}; # by virtual of being numerically larger.
2563 +$HangulSyllable {200};
2564 +$KatakanaEx {400}; #originally 300
2565 +$HiraganaEx {400}; #originally 300
2566 $IdeographicEx {400}; #
2567
2568 #
2569 @@ -96,6 +118,9 @@
2570 # rule 6 and 7
2571 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
2572
2573 +# Chrome addition
2574 +$HebrewLetEx $HebrewMidLet $HebrewLetEx {200};
2575 +
2576 # rule 8
2577
2578 $NumericEx $NumericEx {100};
2579 @@ -114,19 +139,25 @@
2580
2581 # rule 13
2582
2583 -$KatakanaEx $KatakanaEx {300};
2584 +# To be consistent with '$KanaKanji $KanaKanji', changed
2585 +# from 300 to 400.
2586 +# See also TestRuleStatus in intltest/rbbiapts.cpp
2587 +$KatakanaEx $KatakanaEx {400};
2588
2589 # rule 13a/b
2590
2591 $ALetterEx $ExtendNumLetEx {200}; # (13a)
2592 $NumericEx $ExtendNumLetEx {100}; # (13a)
2593 -$KatakanaEx $ExtendNumLetEx {300}; # (13a)
2594 +$KatakanaEx $ExtendNumLetEx {400}; # (13a)
2595 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
2596
2597 $ExtendNumLetEx $ALetterEx {200}; # (13b)
2598 $ExtendNumLetEx $NumericEx {100}; # (13b)
2599 -$ExtendNumLetEx $KatakanaEx {300}; # (13b)
2600 -
2601 +$ExtendNumLetEx $KatakanaEx {400}; # (13b)
2602 +
2603 +# special handling for CJK characters: chain for later dictionary segmentation
2604 +$HangulSyllable $HangulSyllable {200};
2605 +$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana foun d
2606
2607
2608 ## -------------------------------------------------
2609 @@ -139,13 +170,15 @@
2610 $BackMidNumEx = ($Format | $Extend)* $MidNum;
2611 $BackMidLetterEx = ($Format | $Extend)* $MidLetter;
2612 $BackKatakanaEx = ($Format | $Extend)* $Katakana;
2613 +$BackHiraganaEx = ($Extend | $Format)* $Hiragana;
2614 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
2615 +$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet;
2616
2617 # rule 3
2618 $LF $CR;
2619
2620 # rule 4
2621 -($Format | $Extend)* [^$CR $LF $Newline]?;
2622 +($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;
2623
2624 # rule 5
2625
2626 @@ -155,6 +188,8 @@
2627
2628 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
2629
2630 +# Chrome addition
2631 +$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;
2632
2633 # rule 8
2634
2635 @@ -181,6 +216,10 @@
2636 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackE xtendNumLetEx);
2637 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
2638
2639 +# special handling for CJK characters: chain for later dictionary segmentation
2640 +$HangulSyllable $HangulSyllable;
2641 +$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
2642 +
2643 ## -------------------------------------------------
2644
2645 !!safe_reverse;
2646 --- source/data/xml/brkitr/root.xml» 2007-08-28 23:10:43.000000000 -0700
2647 +++ source/data/xml/brkitr/root.xml» 2009-07-27 13:01:17.746367000 -0700
2648 @@ -25,6 +25,9 @@ 2460 @@ -25,6 +25,9 @@
2649 </icu:boundaries> 2461 </icu:boundaries>
2650 <icu:dictionaries> 2462 <icu:dictionaries>
2651 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> 2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
2652 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> 2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>
2653 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> 2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>
2654 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> 2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>
2655 </icu:dictionaries> 2467 </icu:dictionaries>
2656 </icu:breakIteratorData> 2468 </icu:breakIteratorData>
2657 </special> 2469 </special>
2658 --- source/test/cintltst/creststn.c» 2009-06-26 09:49:55.000000000 -0700 2470 --- source/test/cintltst/creststn.c» 2010-10-28 10:44:02.000000000 -0700
2659 +++ source/test/cintltst/creststn.c» 2009-07-29 12:46:05.997405000 -0700 2471 +++ source/test/cintltst/creststn.c» 2011-01-21 14:12:44.995020000 -0800
2660 @@ -2181,21 +2181,21 @@ 2472 @@ -2188,21 +2188,21 @@
2661 2473
2662 2474
2663 { 2475 {
2664 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); 2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
2665 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); 2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);
2666 const UChar *got = NULL, *exp=NULL; 2478 const UChar *got = NULL, *exp=NULL;
2667 int32_t gotLen = 0, expLen=0; 2479 int32_t gotLen = 0, expLen=0;
2668 - ja = ures_getByKey(ja, "boundaries", ja, &status); 2480 - ja = ures_getByKey(ja, "boundaries", ja, &status);
2669 - exp = tres_getString(ja, -1, "word", &expLen, &status); 2481 - exp = tres_getString(ja, -1, "word", &expLen, &status);
2670 + th = ures_getByKey(th, "boundaries", th, &status); 2482 + th = ures_getByKey(th, "boundaries", th, &status);
2671 + exp = tres_getString(th, -1, "grapheme", &expLen, &status); 2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status);
2672 2484
2673 tb = ures_getByKey(aliasB, "boundaries", tb, &status); 2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status);
2674 - got = tres_getString(tb, -1, "word", &gotLen, &status); 2486 - got = tres_getString(tb, -1, "word", &gotLen, &status);
2675 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); 2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status);
2676 2488
2677 if(U_FAILURE(status)) { 2489 if(U_FAILURE(status)) {
2678 log_err("%s trying to read str boundaries\n", u_errorName(statu s)); 2490 log_err("%s trying to read str boundaries\n", u_errorName(statu s));
2679 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { 2491 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
2680 log_err("Referencing alias didn't get the right data\n"); 2492 log_err("Referencing alias didn't get the right data\n");
2681 } 2493 }
2682 - ures_close(ja); 2494 - ures_close(ja);
2683 + ures_close(th); 2495 + ures_close(th);
2684 status = U_ZERO_ERROR; 2496 status = U_ZERO_ERROR;
2685 } 2497 }
2686 /* simple alias */ 2498 /* simple alias */
2687 @@ -3024,4 +3024,3 @@ 2499 --- source/test/intltest/rbbiapts.cpp» 2010-07-12 11:03:29.000000000 -0700
2688 } 2500 +++ source/test/intltest/rbbiapts.cpp» 2011-01-21 14:12:45.033014000 -0800
2689
2690 }
2691 -
2692 --- source/test/intltest/rbbiapts.cpp» 2009-06-26 09:49:55.000000000 -0700
2693 +++ source/test/intltest/rbbiapts.cpp» 2009-07-28 13:56:30.208042000 -0700
2694 @@ -156,9 +156,13 @@ 2501 @@ -156,9 +156,13 @@
2695 if(*a!=*b){ 2502 if(*a!=*b){
2696 errln("Failed: boilerplate method operator!= does not return correct re sults"); 2503 errln("Failed: boilerplate method operator!= does not return correct re sults");
2697 } 2504 }
2698 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); 2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
2699 - if(a && c){ 2506 - if(a && c){
2700 - if(*c==*a){ 2507 - if(*c==*a){
2701 + // Japanese word break iterator is identical to root with 2508 + // Japanese word break iterator is identical to root with
2702 + // a dictionary-based break iterator, but Thai character break iterator 2509 + // a dictionary-based break iterator, but Thai character break iterator
2703 + // is still different from Root. 2510 + // is still different from Root.
2704 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us); 2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us);
2705 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us); 2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us);
2706 + if(c && d){ 2513 + if(c && d){
2707 + if(*c==*d){ 2514 + if(*c==*d){
2708 errln("Failed: boilerplate method opertator== does not return corre ct results"); 2515 errln("Failed: boilerplate method opertator== does not return corre ct results");
2709 } 2516 }
2710 }else{ 2517 }else{
2711 @@ -167,6 +171,7 @@ 2518 @@ -167,6 +171,7 @@
2712 delete a; 2519 delete a;
2713 delete b; 2520 delete b;
2714 delete c; 2521 delete c;
2715 + delete d; 2522 + delete d;
2716 } 2523 }
2717 2524
2718 void RBBIAPITest::TestgetRules() 2525 void RBBIAPITest::TestgetRules()
2719 @@ -643,21 +648,21 @@ 2526 @@ -635,21 +640,21 @@
2720 // 2527 //
2721 void RBBIAPITest::TestRuleStatus() { 2528 void RBBIAPITest::TestRuleStatus() {
2722 UChar str[30]; 2529 UChar str[30];
2723 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ", 2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ",
2724 - // 012345678901234567 8 9 0 1 2 3 4 5 6 2531 - // 012345678901234567 8 9 0 1 2 3 4 5 6
2725 - // Ideographic Katakana Hiragana 2532 - // Ideographic Katakana Hiragana
2726 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing 2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
2727 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO 2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
2728 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", 2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
2729 + // 012345678901234567 8 9 0 2536 + // 012345678901234567 8 9 0
2730 + // Katakana 2537 + // Katakana
2731 str, 30); 2538 str, 30);
2732 UnicodeString testString1(str); 2539 UnicodeString testString1(str);
2733 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; 2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
2734 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; 2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
2735 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, 2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
2736 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, 2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
2737 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, 2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
2738 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; 2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
2739 + UBRK_WORD_IDEO, UBRK_WORD_NONE}; 2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE};
2740 2547
2741 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, 2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
2742 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT, 2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT,
2743 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT, 2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT,
2744 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; 2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
2745 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; 2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
2746 2553
2747 UErrorCode status=U_ZERO_ERROR; 2554 UErrorCode status=U_ZERO_ERROR;
2748 2555
2749 @@ -896,9 +901,11 @@ 2556 @@ -888,9 +893,11 @@
2750 2557
2751 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status); 2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status);
2752 { 2559 {
2753 +#if 0 // With a dictionary based word breaking, ja_word is identical to root. 2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.
2754 if (ja_word && *ja_word == *root_word) { 2561 if (ja_word && *ja_word == *root_word) {
2755 errln("japan not different from root"); 2562 errln("japan not different from root");
2756 } 2563 }
2757 +#endif 2564 +#endif
2758 } 2565 }
2759 2566
2760 { 2567 {
2761 --- source/test/intltest/rbbitst.cpp» 2009-06-26 09:49:55.000000000 -0700 2568 --- source/test/intltest/rbbitst.cpp» 2010-10-08 18:23:28.000000000 -0700
2762 +++ source/test/intltest/rbbitst.cpp» 2009-07-28 15:35:18.933226000 -0700 2569 +++ source/test/intltest/rbbitst.cpp» 2011-01-21 14:12:45.180030000 -0800
2763 @@ -33,6 +33,8 @@ 2570 @@ -35,6 +35,8 @@
2764 #include <string.h> 2571 #include <string.h>
2765 #include <stdio.h> 2572 #include <stdio.h>
2766 #include <stdlib.h> 2573 #include <stdlib.h>
2767 +#include "unicode/numfmt.h" 2574 +#include "unicode/numfmt.h"
2768 +#include "unicode/uscript.h" 2575 +#include "unicode/uscript.h"
2769 2576
2770 #define TEST_ASSERT(x) {if (!(x)) { \ 2577 #define TEST_ASSERT(x) {if (!(x)) { \
2771 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
2772 @@ -108,6 +110,8 @@ 2579 @@ -138,11 +140,13 @@
2773 if (exec) TestThaiBreaks(); break; 2580 if (exec) TestThaiBreaks(); break;
2774 case 23: name = "TestTailoredBreaks"; 2581 case 23: name = "TestTailoredBreaks";
2775 if (exec) TestTailoredBreaks(); break; 2582 if (exec) TestTailoredBreaks(); break;
2776 + case 24: name = "TestTrieDictWithValue"; 2583 + case 24: name = "TestTrieDictWithValue";
2777 + if(exec) TestTrieDictWithValue(); break; 2584 + if(exec) TestTrieDictWithValue(); break;
2778 2585 #else
2779 default: name = ""; break; //needed to end loop 2586 - case 21: case 22: case 23: name = "skip";
2780 } 2587 + case 21: case 22: case 23: case 24: name = "skip";
2781 @@ -570,6 +574,8 @@ 2588 break;
2589 #endif
2590 - case 24: name = "TestDictRules";
2591 + case 25: name = "TestDictRules";
2592 if (exec) TestDictRules(); break;
2593 case 25: name = "TestBug5532";
2594 if (exec) TestBug5532(); break;
2595 @@ -607,6 +611,8 @@
2782 2596
2783 2597
2784 void RBBITest::TestJapaneseWordBreak() { 2598 void RBBITest::TestJapaneseWordBreak() {
2785 +// TODO: Rewrite this test for a dictionary-based word breaking. 2599 +// TODO: Rewrite this test for a dictionary-based word breaking.
2786 +#if 0 2600 +#if 0
2787 UErrorCode status = U_ZERO_ERROR; 2601 UErrorCode status = U_ZERO_ERROR;
2788 BITestData japaneseWordSelection(status); 2602 BITestData japaneseWordSelection(status);
2789 2603
2790 @@ -591,6 +597,7 @@ 2604 @@ -628,6 +634,7 @@
2791 2605
2792 generalIteratorTest(*e, japaneseWordSelection); 2606 generalIteratorTest(*e, japaneseWordSelection);
2793 delete e; 2607 delete e;
2794 +#endif 2608 +#endif
2795 } 2609 }
2796 2610
2797 void RBBITest::TestTrieDict() { 2611 void RBBITest::TestTrieDict() {
2798 @@ -812,6 +819,372 @@ 2612 @@ -849,6 +856,372 @@
2799 delete compact2; 2613 delete compact2;
2800 } 2614 }
2801 2615
2802 +/*TODO: delete later*/ 2616 +/*TODO: delete later*/
2803 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ 2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
2804 + UErrorCode status = U_ZERO_ERROR; 2618 + UErrorCode status = U_ZERO_ERROR;
2805 + FILE *outfile = fopen(filename,"w"); 2619 + FILE *outfile = fopen(filename,"w");
2806 + UConverter *cvt = ucnv_open("UTF-8", &status); 2620 + UConverter *cvt = ucnv_open("UTF-8", &status);
2807 + if (U_FAILURE(status)) 2621 + if (U_FAILURE(status))
2808 + return; 2622 + return;
(...skipping 352 matching lines...) Expand 10 before | Expand all | Expand 10 after
3161 + delete cloneEnum; 2975 + delete cloneEnum;
3162 + delete compact2; 2976 + delete compact2;
3163 + utext_close(originalText); 2977 + utext_close(originalText);
3164 + utext_close(cloneText); 2978 + utext_close(cloneText);
3165 + 2979 +
3166 + 2980 +
3167 +} 2981 +}
3168 2982
3169 //---------------------------------------------------------------------------- 2983 //----------------------------------------------------------------------------
3170 // 2984 //
3171 @@ -1832,8 +2205,15 @@ 2985 @@ -1870,8 +2243,15 @@
3172 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 2986 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
3173 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" 2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
3174 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; 2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
3175 +#if 0 2989 +#if 0
3176 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 }; 2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 };
3177 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; 2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
3178 +#endif 2992 +#endif
3179 +// There's no separate Japanese word break iterator. Root is the same as Japanese. 2993 +// There's no separate Japanese word break iterator. Root is the same as Japanese.
3180 +// Our dictionary-based iterator has to be tweaked to better handle U+3005, 2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,
3181 +// U+3007, U+300B and some other cases. 2995 +// U+3007, U+300B and some other cases.
3182 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; 2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
3183 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; 2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
3184 2998
3185 // UBreakIteratorType UBRK_SENTENCE, Locale "el" 2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
3186 // Add break after Greek question mark (cldrbug #2069). 3000 // Add break after Greek question mark (cldrbug #2069).
3187 @@ -2580,6 +2960,8 @@ 3001 @@ -2672,6 +3052,8 @@
3188 UnicodeSet *fNewlineSet; 3002 UnicodeSet *fNewlineSet;
3189 UnicodeSet *fKatakanaSet; 3003 UnicodeSet *fKatakanaSet;
3190 UnicodeSet *fALetterSet; 3004 UnicodeSet *fALetterSet;
3191 + // TODO(jungshik): Do we still need this change? 3005 + // TODO(jungshik): Do we still need this change?
3192 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
3193 UnicodeSet *fMidNumLetSet; 3007 UnicodeSet *fMidNumLetSet;
3194 UnicodeSet *fMidLetterSet; 3008 UnicodeSet *fMidLetterSet;
3195 UnicodeSet *fMidNumSet; 3009 UnicodeSet *fMidNumSet;
3196 @@ -2588,6 +2970,7 @@ 3010 @@ -2680,6 +3062,7 @@
3197 UnicodeSet *fOtherSet; 3011 UnicodeSet *fOtherSet;
3198 UnicodeSet *fExtendSet; 3012 UnicodeSet *fExtendSet;
3199 UnicodeSet *fExtendNumLetSet; 3013 UnicodeSet *fExtendNumLetSet;
3200 + UnicodeSet *fDictionaryCjkSet; 3014 + UnicodeSet *fDictionaryCjkSet;
3201 3015
3202 RegexMatcher *fMatcher; 3016 RegexMatcher *fMatcher;
3203 3017
3204 @@ -2604,12 +2987,24 @@ 3018 @@ -2696,12 +3079,24 @@
3205 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
3206 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
3207 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
3208 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3209 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); 3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
3210 + // Exclude Hangul syllables from ALetterSet during testing. 3024 + // Exclude Hangul syllables from ALetterSet during testing.
3211 + // Leave CJK dictionary characters out from the monkey tests! 3025 + // Leave CJK dictionary characters out from the monkey tests!
3212 +#if 0 3026 +#if 0
3213 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
3214 + "[\\p{Line_Break = Complex_Context}" 3028 + "[\\p{Line_Break = Complex_Context}"
3215 + "-\\p{Grapheme_Cluster_Break = Extend}" 3029 + "-\\p{Grapheme_Cluster_Break = Extend}"
3216 + "-\\p{Grapheme_Cluster_Break = Control}" 3030 + "-\\p{Grapheme_Cluster_Break = Control}"
3217 + "]]", 3031 + "]]",
3218 + status); 3032 + status);
3219 +#endif 3033 +#endif
3220 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3221 + fALetterSet->removeAll(*fDictionaryCjkSet); 3035 + fALetterSet->removeAll(*fDictionaryCjkSet);
3222 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
3223 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
3224 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
3225 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
3226 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
3227 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); 3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);
3228 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
3229 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
3230 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
3231 @@ -2633,13 +3028,14 @@ 3045 @@ -2725,13 +3120,14 @@
3232 fOtherSet->removeAll(*fFormatSet); 3046 fOtherSet->removeAll(*fFormatSet);
3233 fOtherSet->removeAll(*fExtendSet); 3047 fOtherSet->removeAll(*fExtendSet);
3234 // Inhibit dictionary characters from being tested at all. 3048 // Inhibit dictionary characters from being tested at all.
3235 + fOtherSet->removeAll(*fDictionaryCjkSet); 3049 + fOtherSet->removeAll(*fDictionaryCjkSet);
3236 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status)); 3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status));
3237 3051
3238 fSets->addElement(fCRSet, status); 3052 fSets->addElement(fCRSet, status);
3239 fSets->addElement(fLFSet, status); 3053 fSets->addElement(fLFSet, status);
3240 fSets->addElement(fNewlineSet, status); 3054 fSets->addElement(fNewlineSet, status);
3241 fSets->addElement(fALetterSet, status); 3055 fSets->addElement(fALetterSet, status);
3242 - fSets->addElement(fKatakanaSet, status); 3056 - fSets->addElement(fKatakanaSet, status);
3243 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test ka takana 3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test ka takana
3244 fSets->addElement(fMidLetterSet, status); 3058 fSets->addElement(fMidLetterSet, status);
3245 fSets->addElement(fMidNumLetSet, status); 3059 fSets->addElement(fMidNumLetSet, status);
3246 fSets->addElement(fMidNumSet, status); 3060 fSets->addElement(fMidNumSet, status);
3247 @@ -3871,6 +4267,7 @@ 3061 @@ -3978,6 +4374,7 @@
3248 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3249 count --; 3063 count --;
3250 if (forward[count] != i) { 3064 if (forward[count] != i) {
3251 + printStringBreaks(ustr, expected, expectedcount); 3065 + printStringBreaks(ustr, expected, expectedcount);
3252 test->errln("happy break test previous() failed: expected %d but go t %d", 3066 test->errln("happy break test previous() failed: expected %d but go t %d",
3253 forward[count], i); 3067 forward[count], i);
3254 break; 3068 break;
3255 @@ -3904,23 +4301,25 @@ 3069 @@ -4011,23 +4408,25 @@
3256 UErrorCode status = U_ZERO_ERROR; 3070 UErrorCode status = U_ZERO_ERROR;
3257 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us); 3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);
3258 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3259 + // Replaced any C+J characters in a row with a random sequence of character s 3073 + // Replaced any C+J characters in a row with a random sequence of character s
3260 + // of the same length to make our C+J segmentation not get in the way. 3074 + // of the same length to make our C+J segmentation not get in the way.
3261 static const char *strlist[] = 3075 static const char *strlist[] =
3262 { 3076 {
3263 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3264 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b", 3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b",
3265 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b", 3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b",
3266 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a", 3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a",
3267 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3268 - "\\u90ca\\u3588\\u009c\\u0953\\u194b", 3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3269 + "\\uac00\\u3588\\u009c\\u0953\\u194b", 3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b",
3270 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3271 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ", 3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ",
3272 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3273 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3274 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3275 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3276 "\\u2027\\U000e0067\\u0a47\\u00b7", 3090 "\\u2027\\U000e0067\\u0a47\\u00b7",
3277 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3278 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3279 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3093 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3280 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3281 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3282 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3283 "\\u0027\\u11af\\U000e0057\\u0602", 3097 "\\u0027\\u11af\\U000e0057\\u0602",
3284 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3098 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3285 @@ -3932,7 +4331,7 @@ 3099 @@ -4039,7 +4438,7 @@
3286 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3287 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3101 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3288 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3289 - "\\u58f4\\U000e0049\\u20e7\\u2027", 3103 - "\\u58f4\\U000e0049\\u20e7\\u2027",
3290 + "\\u18f4\\U000e0049\\u20e7\\u2027", 3104 + "\\u18f4\\U000e0049\\u20e7\\u2027",
3291 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3292 "\\ua183\\u102d\\u0bec\\u003a", 3106 "\\ua183\\u102d\\u0bec\\u003a",
3293 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3294 @@ -3942,7 +4341,7 @@ 3108 @@ -4049,7 +4448,7 @@
3295 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3296 "\\u003a\\u0664\\u00b7\\u1fba", 3110 "\\u003a\\u0664\\u00b7\\u1fba",
3297 "\\u003b\\u0027\\u00b7\\u47a3", 3111 "\\u003b\\u0027\\u00b7\\u47a3",
3298 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3299 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3300 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3301 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3302 }; 3116 };
3303 @@ -3997,12 +4396,12 @@ 3117 @@ -4104,12 +4503,12 @@
3304 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3305 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3306 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3307 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3308 + "\\U000e0065\\u302c\\u09ee\\U000e0068", 3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068",
3309 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3310 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3124 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3311 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3312 "\\u58f4\\U000e0049\\u20e7\\u2027", 3126 "\\u58f4\\U000e0049\\u20e7\\u2027",
3313 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3314 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3315 "\\ua183\\u102d\\u0bec\\u003a", 3129 "\\ua183\\u102d\\u0bec\\u003a",
3316 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3317 "\\u003a\\u0e57\\u0fad\\u002e", 3131 "\\u003a\\u0e57\\u0fad\\u002e",
3318 --- source/test/intltest/rbbitst.h» 2009-04-22 00:53:50.000000000 -0700 3132 --- source/test/intltest/rbbitst.h» 2010-07-22 17:15:37.000000000 -0700
3319 +++ source/test/intltest/rbbitst.h» 2009-07-27 13:01:17.767342000 -0700 3133 +++ source/test/intltest/rbbitst.h» 2011-01-21 14:12:45.152007000 -0800
3320 @@ -70,6 +70,7 @@ 3134 @@ -70,6 +70,7 @@
3321 void TestBug5775(); 3135 void TestBug5775();
3322 void TestThaiBreaks(); 3136 void TestThaiBreaks();
3323 void TestTailoredBreaks(); 3137 void TestTailoredBreaks();
3324 + void TestTrieDictWithValue(); 3138 + void TestTrieDictWithValue();
3139 void TestDictRules();
3140 void TestBug5532();
3325 3141
3326 void TestDebug(); 3142 --- source/test/testdata/rbbitst.txt» 2010-07-28 17:18:28.000000000 -0700
3327 3143 +++ source/test/testdata/rbbitst.txt» 2011-01-21 14:12:45.221011000 -0800
3328 --- source/test/testdata/rbbitst.txt» 2009-06-24 14:06:38.000000000 -0700 3144 @@ -161,7 +161,23 @@
3329 +++ source/test/testdata/rbbitst.txt» 2009-07-29 12:56:31.483710000 -0700
3330 @@ -162,7 +162,23 @@
3331 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> 3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
3332 3146
3333 # Hiragana & Katakana stay together, but separates from each other and Latin. 3147 # Hiragana & Katakana stay together, but separates from each other and Latin.
3334 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> 3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
3335 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent 3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent
3336 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data> 3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data>
3337 + 3151 +
3338 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth 3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
3339 +<data>•芽キャベツ<400>芽キャベツ<400></data> 3153 +<data>•芽キャベツ<400>芽キャベツ<400></data>
3340 + 3154 +
3341 +# more Japanese tests 3155 +# more Japanese tests
3342 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana 3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
3343 +# and the Katakana block are not treated correctly. Enable this later. 3157 +# and the Katakana block are not treated correctly. Enable this later.
3344 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3345 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3346 + 3160 +
3347 +# Testing of word boundary for dictionary word containing both kanji and kana 3161 +# Testing of word boundary for dictionary word containing both kanji and kana
3348 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> 3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
3349 + 3163 +
3350 +# Testing of Chinese segmentation (taken from a Chinese news article) 3164 +# Testing of Chinese segmentation (taken from a Chinese news article)
3351 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata> 3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata>
3352 3166
3353 # Words with interior formatting characters 3167 # Words with interior formatting characters
3354 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a> 3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a>
3355 @@ -170,6 +186,8 @@ 3169 @@ -169,6 +185,8 @@
3356 # to test for bug #4097779 3170 # to test for bug #4097779
3357 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> 3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
3358 3172
3359 +# fullwidth numeric, midletter characters etc should be treated like their half width counterparts 3173 +# fullwidth numeric, midletter characters etc should be treated like their half width counterparts
3360 +<data>•ISN'T<200> •19<100>日<400></data> 3174 +<data>•ISN'T<200> •19<100>日<400></data>
3361 3175
3362 # to test for bug #4098467 3176 # to test for bug #4098467
3363 # What follows is a string of Korean characters (I found it in the Yellow Pages 3177 # What follows is a string of Korean characters (I found it in the Yellow Pages
3364 @@ -179,9 +197,15 @@ 3178 @@ -178,9 +196,15 @@
3365 # precomposed syllables... 3179 # precomposed syllables...
3366 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> 3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
3367 3181
3368 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data> 3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
3369 +# more Korean tests (Jamo not tested here, not counted as dictionary characters ) 3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters )
3370 +# Disable them now because we don't include a Korean dictionary. 3184 +# Disable them now because we don't include a Korean dictionary.
3371 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> 3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
3372 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> 3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
3373 + 3187 +
3374 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta> 3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta>
3375 + 3189 +
3376 +<data>•\u06c9<200>\uc799<200>\ufffa•</data> 3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data>
3377 3191
3378 -<data>•\u06c9\uc799\ufffa<200></data> 3192 -<data>•\u06c9\uc799\ufffa<200></data>
3379 3193
3380 # 3194 #
3381 # Try some words from other scripts. 3195 # Try some words from other scripts.
3382 @@ -492,8 +516,7 @@ 3196 @@ -491,8 +515,7 @@
3383 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data> 3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data>
3384 3198
3385 # conjoining jamo... 3199 # conjoining jamo...
3386 -# TODO: rules update needed 3200 -# TODO: rules update needed
3387 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data> 3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data>
3388 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data> 3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data>
3389 3203
3390 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd 3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
3391 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> 3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
3392 --- source/test/testdata/testaliases.txt» 2009-06-24 14:06:38.000000000 -0 700 3206 --- source/test/testdata/testaliases.txt» 2009-11-12 13:53:42.000000000 -0 800
3393 +++ source/test/testdata/testaliases.txt» 2009-07-28 17:07:26.251120000 -0 700 3207 +++ source/test/testdata/testaliases.txt» 2011-01-21 14:12:45.204005000 -0 800
3394 @@ -28,7 +28,7 @@ 3208 @@ -28,7 +28,7 @@
3395 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } 3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
3396 3210
3397 // aliasing using position 3211 // aliasing using position
3398 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle 3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle
3399 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle 3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle
3400 3214
3401 // aliasing arrays 3215 // aliasing arrays
3402 zoneTests { 3216 zoneTests {
3403 --- source/tools/genctd/genctd.cpp» 2006-09-04 09:28:24.000000000 -0700 3217 --- source/tools/genctd/genctd.cpp» 2009-08-04 14:09:17.000000000 -0700
3404 +++ source/tools/genctd/genctd.cpp» 2009-07-27 13:01:17.776335000 -0700 3218 +++ source/tools/genctd/genctd.cpp» 2011-01-21 14:12:45.564923000 -0800
3405 @@ -1,6 +1,6 @@ 3219 @@ -1,6 +1,6 @@
3406 /* 3220 /*
3407 ********************************************************************** 3221 **********************************************************************
3408 -* Copyright (C) 2002-2006, International Business Machines 3222 -* Copyright (C) 2002-2009, International Business Machines
3409 +* Copyright (C) 2002-2006,2008, International Business Machines 3223 +* Copyright (C) 2002-2010, International Business Machines
3410 * Corporation and others. All Rights Reserved. 3224 * Corporation and others. All Rights Reserved.
3411 ********************************************************************** 3225 **********************************************************************
3412 * 3226 *
3413 @@ -34,12 +34,15 @@ 3227 @@ -34,12 +34,15 @@
3414 #include "unicode/udata.h" 3228 #include "unicode/udata.h"
3415 #include "unicode/putil.h" 3229 #include "unicode/putil.h"
3416 3230
3417 +//#include "unicode/ustdio.h" 3231 +//#include "unicode/ustdio.h"
3418 + 3232 +
3419 #include "uoptions.h" 3233 #include "uoptions.h"
3420 #include "unewdata.h" 3234 #include "unewdata.h"
3421 #include "ucmndata.h" 3235 #include "ucmndata.h"
3422 #include "rbbidata.h" 3236 #include "rbbidata.h"
3423 #include "triedict.h" 3237 #include "triedict.h"
3424 #include "cmemory.h" 3238 #include "cmemory.h"
3425 +#include "uassert.h" 3239 +#include "uassert.h"
3426 3240
3427 #include <stdio.h> 3241 #include <stdio.h>
3428 #include <stdlib.h> 3242 #include <stdlib.h>
3429 @@ -198,147 +201,191 @@ 3243 @@ -199,147 +202,191 @@
3430 long wordFileSize; 3244 long wordFileSize;
3431 FILE *file; 3245 FILE *file;
3432 char *wordBufferC; 3246 char *wordBufferC;
3433 - 3247 -
3434 + MutableTrieDictionary *mtd = NULL; 3248 + MutableTrieDictionary *mtd = NULL;
3435 + 3249 +
3436 file = fopen(wordFileName, "rb"); 3250 file = fopen(wordFileName, "rb");
3437 - if( file == 0 ) { 3251 - if( file == 0 ) {
3438 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); 3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
3439 - exit(-1); 3253 - exit(-1);
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after
3741 + // Get rid of the Unicode text buffer 3555 + // Get rid of the Unicode text buffer
3742 + delete[] wordSourceU; 3556 + delete[] wordSourceU;
3743 } 3557 }
3744 3558
3745 - // Get rid of the Unicode text buffer 3559 - // Get rid of the Unicode text buffer
3746 - delete[] wordSourceU; 3560 - delete[] wordSourceU;
3747 - 3561 -
3748 // Now, create a CompactTrieDictionary from the mutable dictionary 3562 // Now, create a CompactTrieDictionary from the mutable dictionary
3749 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); 3563 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
3750 if (U_FAILURE(status)) { 3564 if (U_FAILURE(status)) {
3751 @@ -392,4 +439,3 @@ 3565 @@ -393,4 +440,3 @@
3752 3566
3753 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
3754 } 3568 }
3755 - 3569 -
3756 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800
3757 +++ source/tools/genctd/Makefile.in» 2009-07-27 13:01:17.782326000 -0700 3571 +++ source/tools/genctd/Makefile.in» 2011-01-21 14:12:45.555920000 -0800
3758 @@ -23,13 +23,13 @@ 3572 @@ -23,13 +23,13 @@
3759 ## Extra files to remove for 'make clean' 3573 ## Extra files to remove for 'make clean'
3760 CLEANFILES = *~ $(DEPS) $(MAN_FILES) 3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES)
3761 3575
3762 -## Target information 3576 -## Target information
3763 +## Target informationcd 3577 +## Target informationcd
3764 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) 3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
3765 3579
3766 ifneq ($(top_builddir),$(top_srcdir)) 3580 ifneq ($(top_builddir),$(top_srcdir))
3767 CPPFLAGS += -I$(top_builddir)/common 3581 CPPFLAGS += -I$(top_builddir)/common
3768 endif 3582 endif
3769 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil 3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
3770 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n 3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n
3771 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) 3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
3772 3586
3773 OBJECTS = genctd.o 3587 OBJECTS = genctd.o
3774 --- source/data/Makefile.in 2009-05-20 23:03:54.000000000 -0700
3775 +++ source/data/Makefile.in 2009-10-21 15:43:18.235201000 -0700
3776 @@ -452,8 +452,9 @@
3777 #################################################### CTD
3778 # CTD FILES
3779
3780 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)
3781 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
3782 +# .ctd file now generated regardless of whether dictionary file exists
3783 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
3784 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt
3785
3786 #################################################### CFU
3787 # CFU FILES
OLDNEW
« no previous file with comments | « no previous file | icu46/source/common/brkeng.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698