icu46/patches/segmentation.patch - Issue 6370014: CJK segmentation patch for ICU 4.6...

Side by Side Diff: icu46/patches/segmentation.patch

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 9 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 --- source/common/brkeng.cpp» 2007-09-11 20:53:13.000000000 -0700	1 --- source/common/brkeng.cpp» 2009-11-11 07:47:22.000000000 -0800

2 +++ source/common/brkeng.cpp» 2009-07-29 12:57:49.973382000 -0700	2 +++ source/common/brkeng.cpp» 2011-01-21 14:12:45.479922000 -0800

3 @@ -24,6 +24,7 @@	3 @@ -226,6 +226,30 @@

4 #include "umutex.h"

5 #include "uresimp.h"

6 #include "ubrkimpl.h"

7 +#include <stdio.h>

8

9 U_NAMESPACE_BEGIN

10

11 @@ -226,6 +227,30 @@

12 case USCRIPT_THAI:	4 case USCRIPT_THAI:

13 engine = new ThaiBreakEngine(dict, status);	5 engine = new ThaiBreakEngine(dict, status);

14 break;	6 break;

15 +	7 +

16 + case USCRIPT_HANGUL:	8 + case USCRIPT_HANGUL:

17 + engine = new CjkBreakEngine(dict, kKorean, status);	9 + engine = new CjkBreakEngine(dict, kKorean, status);

18 + break;	10 + break;

19 +	11 +

20 + // use same BreakEngine and dictionary for both Chinese and Japanese	12 + // use same BreakEngine and dictionary for both Chinese and Japanese

21 + case USCRIPT_HIRAGANA:	13 + case USCRIPT_HIRAGANA:

(...skipping 10 matching lines...) Expand all Loading...
32 + {	24 + {

33 + UBlockCode block = ublock_getCode(code);	25 + UBlockCode block = ublock_getCode(code);

34 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)	26 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)

35 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);	27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);

36 + break;	28 + break;

37 + }	29 + }

38 +#endif	30 +#endif

39 default:	31 default:

40 break;	32 break;

41 }	33 }

42 @@ -281,6 +306,13 @@	34 @@ -281,6 +305,13 @@

43 dict = NULL;	35 dict = NULL;

44 }	36 }

45 return dict;	37 return dict;

46 + } else if (dictfname != NULL){	38 + } else if (dictfname != NULL){

47 + //create dummy dict if dictionary filename not valid	39 + //create dummy dict if dictionary filename not valid

48 + UChar c = 0x0020;	40 + UChar c = 0x0020;

49 + status = U_ZERO_ERROR;	41 + status = U_ZERO_ERROR;

50 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE) ;	42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE) ;

51 + mtd->addWord(&c, 1, status, 1);	43 + mtd->addWord(&c, 1, status, 1);

52 + return new CompactTrieDictionary(*mtd, status);	44 + return new CompactTrieDictionary(*mtd, status);

53 }	45 }

54 return NULL;	46 return NULL;

55 }	47 }

56 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700	48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700

57 +++ source/common/dictbe.cpp» 2009-11-11 12:58:40.199829000 -0800	49 +++ source/common/dictbe.cpp» 2011-01-21 14:12:45.468928000 -0800

58 @@ -16,6 +16,11 @@	50 @@ -16,6 +16,9 @@

59 #include "unicode/ubrk.h"	51 #include "unicode/ubrk.h"

60 #include "uvector.h"	52 #include "uvector.h"

61 #include "triedict.h"	53 #include "triedict.h"

62 +#include "uassert.h"	54 +#include "uassert.h"

63 +#include "unicode/normlzr.h"	55 +#include "unicode/normlzr.h"

64 +#include "cmemory.h"	56 +#include "cmemory.h"

65 +

66 +#include <stdio.h>

67	57

68 U_NAMESPACE_BEGIN	58 U_NAMESPACE_BEGIN

69	59

70 @@ -422,6 +427,294 @@	60 @@ -422,6 +425,294 @@

71 return wordsFound;	61 return wordsFound;

72 }	62 }

73	63

74 +/*	64 +/*

75 + ******************************************************************	65 + ******************************************************************

76 + * CjkBreakEngine	66 + * CjkBreakEngine

77 + */	67 + */

78 +static const uint32_t kuint32max = 0xFFFFFFFF;	68 +static const uint32_t kuint32max = 0xFFFFFFFF;

79 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)	69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)

80 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){	70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){

(...skipping 275 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
356 + }	346 + }

357 +	347 +

358 + utext_close(&normalizedText);	348 + utext_close(&normalizedText);

359 + return numBreaks;	349 + return numBreaks;

360 +}	350 +}

361 +	351 +

362 U_NAMESPACE_END	352 U_NAMESPACE_END

363	353

364 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */	354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

365 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700	355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700

366 +++ source/common/dictbe.h» 2009-07-27 13:01:17.704415000 -0700	356 +++ source/common/dictbe.h» 2011-01-21 14:12:45.492920000 -0800

367 @@ -1,8 +1,8 @@	357 @@ -1,8 +1,8 @@

368 /**	358 /**

369 - ****************************************************************************** *	359 - ****************************************************************************** *

370 - * Copyright (C) 2006, International Business Machines Corporation and others. *	360 - * Copyright (C) 2006, International Business Machines Corporation and others. *

371 - * All Rights Reserved. *	361 - * All Rights Reserved. *

372 - ****************************************************************************** *	362 - ****************************************************************************** *

373 + **************************************************************************** **	363 + **************************************************************************** **

374 + * Copyright (C) 2006,2007, International Business Machines Corporation and others.	364 + * Copyright (C) 2006-2010, International Business Machines Corporation and others.

375 + * All Rights Reserved.	365 + * All Rights Reserved.

376 + **************************************************************************** **	366 + **************************************************************************** **

377 */	367 */

378	368

379 #ifndef DICTBE_H	369 #ifndef DICTBE_H

380 @@ -65,37 +65,37 @@	370 @@ -65,31 +65,31 @@

381 */	371 */

382 virtual ~DictionaryBreakEngine();	372 virtual ~DictionaryBreakEngine();

383	373

384 - /**	374 - /**

385 - * <p>Indicate whether this engine handles a particular character for	375 - * <p>Indicate whether this engine handles a particular character for

386 - * a particular kind of break.</p>	376 - * a particular kind of break.</p>

387 - *	377 - *

388 - * @param c A character which begins a run that the engine might handle	378 - * @param c A character which begins a run that the engine might handle

389 - * @param breakType The type of text break which the caller wants to determine	379 - * @param breakType The type of text break which the caller wants to determine

390 - * @return TRUE if this engine handles the particular character and break	380 - * @return TRUE if this engine handles the particular character and break

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
423 + * that starts from the first (or last) character in the range.	413 + * that starts from the first (or last) character in the range.

424 + * @param startPos The start of the run within the supplied text.	414 + * @param startPos The start of the run within the supplied text.

425 + * @param endPos The end of the run within the supplied text.	415 + * @param endPos The end of the run within the supplied text.

426 + * @param reverse Whether the caller is looking for breaks in a reverse	416 + * @param reverse Whether the caller is looking for breaks in a reverse

427 + * direction.	417 + * direction.

428 + * @param breakType The type of break desired, or -1.	418 + * @param breakType The type of break desired, or -1.

429 + * @param foundBreaks An allocated C array of the breaks found, if any	419 + * @param foundBreaks An allocated C array of the breaks found, if any

430 + * @return The number of breaks found.	420 + * @return The number of breaks found.

431 + */	421 + */

432 virtual int32_t findBreaks( UText *text,	422 virtual int32_t findBreaks( UText *text,

433 - int32_t startPos,	423 int32_t startPos,

434 - int32_t endPos,	424 int32_t endPos,

435 - UBool reverse,

436 - int32_t breakType,

437 - UStack &foundBreaks ) const;

438 + int32_t startPos,

439 + int32_t endPos,

440 + UBool reverse,

441 + int32_t breakType,

442 + UStack &foundBreaks ) const;

443

444 protected:

445

446 @@ -114,7 +114,7 @@	425 @@ -114,7 +114,7 @@

447 // virtual void setBreakTypes( uint32_t breakTypes );	426 // virtual void setBreakTypes( uint32_t breakTypes );

448	427

449 /**	428 /**

450 - * <p>Divide up a range of known dictionary characters.</p>	429 - * <p>Divide up a range of known dictionary characters.</p>

451 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>	430 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>

452 *	431 *

453 * @param text A UText representing the text	432 * @param text A UText representing the text

454 * @param rangeStart The start of the range of dictionary characters	433 * @param rangeStart The start of the range of dictionary characters

455 @@ -171,7 +171,7 @@	434 @@ -171,7 +171,7 @@

456	435

457 protected:	436 protected:

458 /**	437 /**

459 - * <p>Divide up a range of known dictionary characters.</p>	438 - * <p>Divide up a range of known dictionary characters.</p>

460 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>	439 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>

461 *	440 *

462 * @param text A UText representing the text	441 * @param text A UText representing the text

463 * @param rangeStart The start of the range of dictionary characters	442 * @param rangeStart The start of the range of dictionary characters

464 @@ -180,12 +180,72 @@	443 @@ -186,6 +186,66 @@

465 * @return The number of breaks found

466 */

467 virtual int32_t divideUpDictionaryRange( UText *text,

468 - int32_t rangeStart,

469 - int32_t rangeEnd,

470 - UStack &foundBreaks ) const;

471 + int32_t rangeStart,

472 + int32_t rangeEnd,

473 + UStack &foundBreaks ) const;

474	444

475 };	445 };

476	446

477 +/*******************************************************************	447 +/*******************************************************************

478 + * CjkBreakEngine	448 + * CjkBreakEngine

479 + */	449 + */

480 +	450 +

481 +//indicates language/script that the CjkBreakEngine will handle	451 +//indicates language/script that the CjkBreakEngine will handle

482 +enum LanguageType {	452 +enum LanguageType {

483 + kKorean,	453 + kKorean,

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
530 + */	500 + */

531 + virtual int32_t divideUpDictionaryRange( UText *text,	501 + virtual int32_t divideUpDictionaryRange( UText *text,

532 + int32_t rangeStart,	502 + int32_t rangeStart,

533 + int32_t rangeEnd,	503 + int32_t rangeEnd,

534 + UStack &foundBreaks ) const;	504 + UStack &foundBreaks ) const;

535 +	505 +

536 +};	506 +};

537	507

538 U_NAMESPACE_END	508 U_NAMESPACE_END

539	509

540 --- source/common/rbbi.cpp» 2008-09-24 22:48:27.000000000 -0700	510 --- source/common/rbbi.cpp» 2010-07-22 17:15:37.000000000 -0700

541 +++ source/common/rbbi.cpp» 2009-07-27 13:01:17.710416000 -0700	511 +++ source/common/rbbi.cpp» 2011-01-21 14:12:45.457938000 -0800

542 @@ -29,6 +29,7 @@	512 @@ -1555,10 +1555,12 @@

543

544 #include "uassert.h"

545 #include "uvector.h"

546 +#include <stdio.h>

547

548 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.

549 #if U_LOCAL_SERVICE_HOOK

550 @@ -1552,10 +1553,14 @@

551 int32_t endPos,	513 int32_t endPos,

552 UBool reverse) {	514 UBool reverse) {

553 // Reset the old break cache first.	515 // Reset the old break cache first.

554 - uint32_t dictionaryCount = fDictionaryCharCount;	516 - uint32_t dictionaryCount = fDictionaryCharCount;

555 +// uint32_t dictionaryCount = fDictionaryCharCount;

556 reset();	517 reset();

557	518

558 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {	519 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {

559 + // note: code segment below assumes that dictionary chars are in the	520 + // note: code segment below assumes that dictionary chars are in the

560 + // startPos-endPos range	521 + // startPos-endPos range

561 + // value returned should be next character in sequence	522 + // value returned should be next character in sequence

562 +// if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {

563 + if ((endPos - startPos) <= 1) {	523 + if ((endPos - startPos) <= 1) {

564 return (reverse ? startPos : endPos);	524 return (reverse ? startPos : endPos);

565 }	525 }

566	526

567 @@ -1684,7 +1689,7 @@	527 @@ -1711,7 +1713,7 @@

568 // proposed break by one of the breaks we found. Use following() and	528 // proposed break by one of the breaks we found. Use following() and

569 // preceding() to do the work. They should never recurse in this case.	529 // preceding() to do the work. They should never recurse in this case.

570 if (reverse) {	530 if (reverse) {

571 - return preceding(endPos - 1);	531 - return preceding(endPos - 1);

572 + return preceding(endPos);	532 + return preceding(endPos);

573 }	533 }

574 else {	534 else {

575 return following(startPos);	535 return following(startPos);

576 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800	536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800

577 +++ source/common/triedict.cpp» 2009-07-27 13:01:17.718409000 -0700	537 +++ source/common/triedict.cpp» 2011-01-21 14:12:45.271006000 -0800

578 @@ -20,6 +20,7 @@	538 @@ -20,6 +20,7 @@

579 #include "uvector.h"	539 #include "uvector.h"

580 #include "uvectr32.h"	540 #include "uvectr32.h"

581 #include "uarrsort.h"	541 #include "uarrsort.h"

582 +#include "hash.h"	542 +#include "hash.h"

583	543

584 //#define DEBUG_TRIE_DICT 1	544 //#define DEBUG_TRIE_DICT 1

585	545

586 @@ -27,6 +28,11 @@	546 @@ -27,6 +28,11 @@

587 #include <sys/times.h>	547 #include <sys/times.h>

(...skipping 18 matching lines...) Expand all Loading...
606 +	566 +

607 // Node structure for the ternary, uncompressed trie	567 // Node structure for the ternary, uncompressed trie

608 struct TernaryNode : public UMemory {	568 struct TernaryNode : public UMemory {

609 UChar ch; // UTF-16 code unit	569 UChar ch; // UTF-16 code unit

610 @@ -77,7 +88,8 @@	570 @@ -77,7 +88,8 @@

611 delete high;	571 delete high;

612 }	572 }

613	573

614 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {	574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {

615 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,	575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,

616 + UBool containsValue /* = FALSE */ ) {	576 + UBool containsValue /* = FALSE */ ) {

617 // Start the trie off with something. Having the root node already present	577 // Start the trie off with something. Having the root node already present

618 // cuts a special case out of the search/insertion functions.	578 // cuts a special case out of the search/insertion functions.

619 // Making it a median character cuts the worse case for searches from	579 // Making it a median character cuts the worse case for searches from

620 @@ -91,14 +103,19 @@	580 @@ -91,14 +103,19 @@

621 if (U_SUCCESS(status) && fIter == NULL) {	581 if (U_SUCCESS(status) && fIter == NULL) {

622 status = U_MEMORY_ALLOCATION_ERROR;	582 status = U_MEMORY_ALLOCATION_ERROR;

623 }	583 }

624 +	584 +

625 + fValued = containsValue;	585 + fValued = containsValue;

626 }	586 }

627	587

628 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {	588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {

629 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,	589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,

630 + UBool containsValue /* = false */ ) {	590 + UBool containsValue /* = false */ ) {

631 fTrie = NULL;	591 fTrie = NULL;

632 fIter = utext_openUChars(NULL, NULL, 0, &status);	592 fIter = utext_openUChars(NULL, NULL, 0, &status);

633 if (U_SUCCESS(status) && fIter == NULL) {	593 if (U_SUCCESS(status) && fIter == NULL) {

634 status = U_MEMORY_ALLOCATION_ERROR;	594 status = U_MEMORY_ALLOCATION_ERROR;

635 }	595 }

636 +	596 +

637 + fValued = containsValue;	597 + fValued = containsValue;

638 }	598 }

639	599

640 MutableTrieDictionary::~MutableTrieDictionary() {	600 MutableTrieDictionary::~MutableTrieDictionary() {

641 @@ -113,7 +130,8 @@	601 @@ -108,12 +125,13 @@

642 int &count,	602

643 int limit,	603 int32_t

644 TernaryNode *&parent,	604 MutableTrieDictionary::search( UText *text,

	605 - int32_t maxLength,

	606 - int32_t *lengths,

	607 - int &count,

	608 - int limit,

	609 - TernaryNode *&parent,

645 - UBool &pMatched ) const {	610 - UBool &pMatched ) const {

646 + UBool &pMatched,	611 + int32_t maxLength,

647 + uint16_t *values /*=NULL*/) const {	612 + int32_t *lengths,

	613 + int &count,

	614 + int limit,

	615 + TernaryNode *&parent,

	616 + UBool &pMatched,

	617 + uint16_t *values /*=NULL*/) const {

648 // TODO: current implementation works in UTF-16 space	618 // TODO: current implementation works in UTF-16 space

649 const TernaryNode *up = NULL;	619 const TernaryNode *up = NULL;

650 const TernaryNode *p = fTrie;	620 const TernaryNode *p = fTrie;

651 @@ -121,6 +139,10 @@	621 @@ -121,6 +139,10 @@

652 pMatched = TRUE;	622 pMatched = TRUE;

653 int i;	623 int i;

654	624

655 + if (!fValued) {	625 + if (!fValued) {

656 + values = NULL;	626 + values = NULL;

657 + }	627 + }

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
693 int count;	663 int count;

694 @@ -177,7 +204,7 @@	664 @@ -177,7 +204,7 @@

695 matched = search(fIter, length, NULL, count, 0, parent, pMatched);	665 matched = search(fIter, length, NULL, count, 0, parent, pMatched);

696	666

697 while (matched++ < length) {	667 while (matched++ < length) {

698 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?	668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?

699 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support?	669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support?

700 U_ASSERT(uc != U_SENTINEL);	670 U_ASSERT(uc != U_SENTINEL);

701 TernaryNode *newNode = new TernaryNode(uc);	671 TernaryNode *newNode = new TernaryNode(uc);

702 if (newNode == NULL) {	672 if (newNode == NULL) {

703 @@ -199,7 +226,11 @@	673 @@ -199,30 +226,23 @@

704 parent = newNode;	674 parent = newNode;

705 }	675 }

706	676

707 - parent->flags \|= kEndsWord;	677 - parent->flags \|= kEndsWord;

	678 -}

	679 -

	680 -#if 0

	681 -void

	682 -MutableTrieDictionary::addWords( UEnumeration *words,

	683 - UErrorCode &status ) {

	684 - int32_t length;

	685 - const UChar *word;

	686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {

	687 - addWord(word, length, status);

708 + if(fValued && value > 0){	688 + if(fValued && value > 0){

709 + parent->flags = value;	689 + parent->flags = value;

710 + } else {	690 + } else {

711 + parent->flags |= kEndsWord;	691 + parent->flags |= kEndsWord;

712 + }	692 }

713 }	693 }

	694 -#endif

714	695

715 #if 0	696 int32_t

716 @@ -219,10 +250,11 @@	697 MutableTrieDictionary::matches( UText *text,

717 int32_t maxLength,	698 int32_t maxLength,

718 int32_t *lengths,	699 int32_t *lengths,

719 int &count,	700 int &count,

720 - int limit ) const {	701 - int limit ) const {

721 + int limit,	702 + int limit,

722 + uint16_t *values /*=NULL*/) const {	703 + uint16_t *values /*=NULL*/) const {

723 TernaryNode *parent;	704 TernaryNode *parent;

724 UBool pMatched;	705 UBool pMatched;

725 - return search(text, maxLength, lengths, count, limit, parent, pMatched);	706 - return search(text, maxLength, lengths, count, limit, parent, pMatched);

726 + return search(text, maxLength, lengths, count, limit, parent, pMatched, values);	707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, values);

727 }	708 }

728	709

729 // Implementation of iteration for MutableTrieDictionary	710 // Implementation of iteration for MutableTrieDictionary

730 @@ -277,7 +309,7 @@	711 @@ -277,7 +297,7 @@

731 break;	712 break;

732 }	713 }

733 case kEqual:	714 case kEqual:

734 - emit = (node->flags & kEndsWord) != 0;	715 - emit = (node->flags & kEndsWord) != 0;

735 + emit = node->flags > 0;	716 + emit = node->flags > 0;

736 equal = (node->equal != NULL);	717 equal = (node->equal != NULL);

737 // If this node should be part of the next emitted string, append	718 // If this node should be part of the next emitted string, append

738 // the UChar to the string, and make sure we pop it when we come	719 // the UChar to the string, and make sure we pop it when we come

739 @@ -299,7 +331,7 @@	720 @@ -299,7 +319,7 @@

740 }	721 }

741 case kGreaterThan:	722 case kGreaterThan:

742 // If this node's character is in the string, remove it.	723 // If this node's character is in the string, remove it.

743 - if (node->equal != NULL || (node->flags & kEndsWord)) {	724 - if (node->equal != NULL || (node->flags & kEndsWord)) {

744 + if (node->equal != NULL \|\| node->flags > 0) {	725 + if (node->equal != NULL \|\| node->flags > 0) {

745 unistr.truncate(unistr.length()-1);	726 unistr.truncate(unistr.length()-1);

746 }	727 }

747 if (node->high != NULL) {	728 if (node->high != NULL) {

748 @@ -354,12 +386,74 @@	729 @@ -354,12 +374,75 @@

749 * CompactTrieDictionary	730 * CompactTrieDictionary

750 */	731 */

751	732

752 +//TODO if time permits: minimise size of trie with logprobs by storing values	733 +//TODO further optimization:

	734 +// minimise size of trie with logprobs by storing values

753 +// for terminal nodes directly in offsets[]	735 +// for terminal nodes directly in offsets[]

754 +// --> calculating from next offset might be simpler, but would have to add	736 +// --> calculating from next offset might be simpler, but would have to add

755 +// one last offset for logprob of last node	737 +// one last offset for logprob of last node

756 +// --> if calculate from current offset, need to factor in possible overflow	738 +// --> if calculate from current offset, need to factor in possible overflow

757 +// as well.	739 +// as well.

758 +// idea: store in offset, set first bit to indicate logprob storage-->won't	740 +// idea: store in offset, set first bit to indicate logprob storage-->won't

759 +// have to access additional node	741 +// have to access additional node

760 +	742 +

761 +// {'Dic', 1}, version 1: uses old header, no values	743 +// {'Dic', 1}, version 1: uses old header, no values

762 +#define COMPACT_TRIE_MAGIC_1 0x44696301	744 +#define COMPACT_TRIE_MAGIC_1 0x44696301

(...skipping 51 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
814 + offsets = &(header->offsets[0]);	796 + offsets = &(header->offsets[0]);

815 + address = (uint8_t *)header;	797 + address = (uint8_t *)header;

816 + }	798 + }

817 + }	799 + }

818 + }	800 + }

819 +	801 +

820 + ~CompactTrieInfo(){}	802 + ~CompactTrieInfo(){}

821 };	803 };

822	804

823 // Note that to avoid platform-specific alignment issues, all members of the no de	805 // Note that to avoid platform-specific alignment issues, all members of the no de

824 @@ -375,10 +469,14 @@	806 @@ -375,10 +458,14 @@

825 enum CompactTrieNodeFlags {	807 enum CompactTrieNodeFlags {

826 kVerticalNode = 0x1000, // This is a vertical node	808 kVerticalNode = 0x1000, // This is a vertical node

827 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word	809 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word

828 - kReservedFlag1 = 0x4000,	810 - kReservedFlag1 = 0x4000,

829 - kReservedFlag2 = 0x8000,	811 - kReservedFlag2 = 0x8000,

830 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR eservedFlag1	812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR eservedFlag1

831 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved Flag2	813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved Flag2

832 kCountMask = 0x0FFF, // The count portion of flagscount	814 kCountMask = 0x0FFF, // The count portion of flagscount

833 - kFlagMask = 0xF000 // The flags portion of flagscount	815 - kFlagMask = 0xF000 // The flags portion of flagscount

834 + kFlagMask = 0xF000, // The flags portion of flagscount	816 + kFlagMask = 0xF000, // The flags portion of flagscount

835 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r oot node	817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r oot node

836 +	818 +

837 + //offset flags:	819 + //offset flags:

838 + //kOffsetContainsValue = 0x80000000 // Offset contains value for pare nt node	820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for pare nt node

839 };	821 };

840	822

841 // The two node types are distinguished by the kVerticalNode flag.	823 // The two node types are distinguished by the kVerticalNode flag.

842 @@ -402,63 +500,177 @@	824 @@ -402,63 +489,177 @@

843 uint16_t chars[1]; // Code units	825 uint16_t chars[1]; // Code units

844 };	826 };

845	827

846 -// {'Dic', 1}, version 1	828 -// {'Dic', 1}, version 1

847 -#define COMPACT_TRIE_MAGIC_1 0x44696301	829 -#define COMPACT_TRIE_MAGIC_1 0x44696301

848 -	830 -

849 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,	831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,

850 UErrorCode &status )	832 UErrorCode &status )

851 : fUData(dataObj)	833 : fUData(dataObj)

852 {	834 {

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
907 uint32_t	889 uint32_t

908 CompactTrieDictionary::dataSize() const {	890 CompactTrieDictionary::dataSize() const {

909 - return fData->size;	891 - return fData->size;

910 + return fInfo->size;	892 + return fInfo->size;

911 }	893 }

912	894

913 const void *	895 const void *

914 CompactTrieDictionary::data() const {	896 CompactTrieDictionary::data() const {

915 - return fData;	897 - return fData;

916 + return fInfo->address;	898 + return fInfo->address;

917 }	899 +}

918	900 +

919 -// This function finds the address of a node for us, given its node ID

920 +//This function finds the address of a node for us, given its node ID	901 +//This function finds the address of a node for us, given its node ID

921 static inline const CompactTrieNode *	902 +static inline const CompactTrieNode *

922 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {

923 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

924 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {	903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {

925 + if(node < info->root-1) {	904 + if(node < info->root-1) {

926 + return (const CompactTrieNode *)(&info->offsets[node]);	905 + return (const CompactTrieNode *)(&info->offsets[node]);

927 + } else {	906 + } else {

928 + return (const CompactTrieNode *)(info->address + info->offsets[node]);	907 + return (const CompactTrieNode *)(info->address + info->offsets[node]);

929 + }	908 + }

930 +}	909 }

931 +	910

	911 -// This function finds the address of a node for us, given its node ID

932 +//this version of getCompactNode is currently only used in compactMutableTrieDi ctionary()	912 +//this version of getCompactNode is currently only used in compactMutableTrieDi ctionary()

933 +static inline const CompactTrieNode *	913 static inline const CompactTrieNode *

	914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {

	915 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

934 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {	916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {

935 + if(node < header->root-1) {	917 + if(node < header->root-1) {

936 + return (const CompactTrieNode *)(&header->offsets[node]);	918 + return (const CompactTrieNode *)(&header->offsets[node]);

937 + } else {	919 + } else {

938 + return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);	920 + return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

939 + }	921 + }

940 +}	922 +}

941 +	923 +

942 +	924 +

943 +/**	925 +/**

(...skipping 86 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1030 + }	1012 + }

1031 + else {	1013 + else {

1032 + low = middle+1;	1014 + low = middle+1;

1033 + }	1015 + }

1034 + }	1016 + }

1035 +	1017 +

1036 + return -1;	1018 + return -1;

1037 }	1019 }

1038	1020

1039 int32_t	1021 int32_t

1040 @@ -466,17 +678,38 @@	1022 @@ -466,17 +667,38 @@

1041 int32_t maxLength,	1023 int32_t maxLength,

1042 int32_t *lengths,	1024 int32_t *lengths,

1043 int &count,	1025 int &count,

1044 - int limit ) const {	1026 - int limit ) const {

1045 + int limit,	1027 + int limit,

1046 + uint16_t *values /*= NULL*/) const {	1028 + uint16_t *values /*= NULL*/) const {

1047 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2)	1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2)

1048 + values = NULL;	1030 + values = NULL;

1049 +	1031 +

1050 // TODO: current implementation works in UTF-16 space	1032 // TODO: current implementation works in UTF-16 space

(...skipping 20 matching lines...) Expand all Loading...
1071 +	1053 +

1072 while (node != NULL) {	1054 while (node != NULL) {

1073 // Check if the node we just exited ends a word	1055 // Check if the node we just exited ends a word

1074 if (limit > 0 && (node->flagscount & kParentEndsWord)) {	1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) {

1075 + if(values != NULL){	1057 + if(values != NULL){

1076 + values[mycount] = getValue(node);	1058 + values[mycount] = getValue(node);

1077 + }	1059 + }

1078 lengths[mycount++] = i;	1060 lengths[mycount++] = i;

1079 --limit;	1061 --limit;

1080 }	1062 }

1081 @@ -487,7 +720,7 @@	1063 @@ -487,7 +709,7 @@

1082 break;	1064 break;

1083 }	1065 }

1084	1066

1085 - int nodeCount = (node->flagscount & kCountMask);	1067 - int nodeCount = (node->flagscount & kCountMask);

1086 + int nodeCount = getCount(node);	1068 + int nodeCount = getCount(node);

1087 if (nodeCount == 0) {	1069 if (nodeCount == 0) {

1088 // Special terminal node; return now	1070 // Special terminal node; return now

1089 break;	1071 break;

1090 @@ -507,35 +740,27 @@	1072 @@ -507,35 +729,27 @@

1091 // To get here we must have come through the whole list successfull y;	1073 // To get here we must have come through the whole list successfull y;

1092 // go on to the next node. Note that a word cannot end in the middl e	1074 // go on to the next node. Note that a word cannot end in the middl e

1093 // of a vertical node.	1075 // of a vertical node.

1094 - node = getCompactNode(fData, vnode->equal);	1076 - node = getCompactNode(fData, vnode->equal);

1095 + node = getCompactNode(fInfo, calcEqualLink(vnode));	1077 + node = getCompactNode(fInfo, calcEqualLink(vnode));

1096 }	1078 }

1097 else {	1079 else {

1098 // Horizontal node; do binary search	1080 // Horizontal node; do binary search

1099 const CompactTrieHorizontalNode hnode = (const CompactTrieHorizont alNode )node;	1081 const CompactTrieHorizontalNode hnode = (const CompactTrieHorizont alNode )node;

1100 - int low = 0;	1082 - int low = 0;

(...skipping 29 matching lines...) Expand all Loading...
1130 + }else{	1112 + }else{

1131 + node = NULL; // If we don't find a match, we'll fall out of the loop	1113 + node = NULL; // If we don't find a match, we'll fall out of the loop

1132 }	1114 }

1133 }	1115 }

1134 }	1116 }

1135 -exit:	1117 -exit:

1136 + exit:	1118 + exit:

1137 count = mycount;	1119 count = mycount;

1138 return i;	1120 return i;

1139 }	1121 }

1140 @@ -545,16 +770,16 @@	1122 @@ -545,16 +759,16 @@

1141 private:	1123 private:

1142 UVector32 fNodeStack; // Stack of nodes to process	1124 UVector32 fNodeStack; // Stack of nodes to process

1143 UVector32 fIndexStack; // Stack of where in node we are	1125 UVector32 fIndexStack; // Stack of where in node we are

1144 - const CompactTrieHeader *fHeader; // Trie data	1126 - const CompactTrieHeader *fHeader; // Trie data

1145 + const CompactTrieInfo *fInfo; // Trie data	1127 + const CompactTrieInfo *fInfo; // Trie data

1146	1128

1147 public:	1129 public:

1148 static UClassID U_EXPORT2 getStaticClassID(void);	1130 static UClassID U_EXPORT2 getStaticClassID(void);

1149 virtual UClassID getDynamicClassID(void) const;	1131 virtual UClassID getDynamicClassID(void) const;

1150 public:	1132 public:

1151 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)	1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)

1152 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)	1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)

1153 : fNodeStack(status), fIndexStack(status) {	1135 : fNodeStack(status), fIndexStack(status) {

1154 - fHeader = header;	1136 - fHeader = header;

1155 - fNodeStack.push(header->root, status);	1137 - fNodeStack.push(header->root, status);

1156 + fInfo = info;	1138 + fInfo = info;

1157 + fNodeStack.push(info->root, status);	1139 + fNodeStack.push(info->root, status);

1158 fIndexStack.push(0, status);	1140 fIndexStack.push(0, status);

1159 unistr.remove();	1141 unistr.remove();

1160 }	1142 }

1161 @@ -564,14 +789,14 @@	1143 @@ -564,14 +778,14 @@

1162	1144

1163 virtual StringEnumeration *clone() const {	1145 virtual StringEnumeration *clone() const {

1164 UErrorCode status = U_ZERO_ERROR;	1146 UErrorCode status = U_ZERO_ERROR;

1165 - return new CompactTrieEnumeration(fHeader, status);	1147 - return new CompactTrieEnumeration(fHeader, status);

1166 + return new CompactTrieEnumeration(fInfo, status);	1148 + return new CompactTrieEnumeration(fInfo, status);

1167 }	1149 }

1168	1150

1169 virtual const UnicodeString * snext(UErrorCode &status);	1151 virtual const UnicodeString * snext(UErrorCode &status);

1170	1152

1171 // Very expensive, but this should never be used.	1153 // Very expensive, but this should never be used.

1172 virtual int32_t count(UErrorCode &status) const {	1154 virtual int32_t count(UErrorCode &status) const {

1173 - CompactTrieEnumeration counter(fHeader, status);	1155 - CompactTrieEnumeration counter(fHeader, status);

1174 + CompactTrieEnumeration counter(fInfo, status);	1156 + CompactTrieEnumeration counter(fInfo, status);

1175 int32_t result = 0;	1157 int32_t result = 0;

1176 while (counter.snext(status) != NULL && U_SUCCESS(status)) {	1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) {

1177 ++result;	1159 ++result;

1178 @@ -582,7 +807,7 @@	1160 @@ -582,7 +796,7 @@

1179 virtual void reset(UErrorCode &status) {	1161 virtual void reset(UErrorCode &status) {

1180 fNodeStack.removeAllElements();	1162 fNodeStack.removeAllElements();

1181 fIndexStack.removeAllElements();	1163 fIndexStack.removeAllElements();

1182 - fNodeStack.push(fHeader->root, status);	1164 - fNodeStack.push(fHeader->root, status);

1183 + fNodeStack.push(fInfo->root, status);	1165 + fNodeStack.push(fInfo->root, status);

1184 fIndexStack.push(0, status);	1166 fIndexStack.push(0, status);

1185 unistr.remove();	1167 unistr.remove();

1186 }	1168 }

1187 @@ -595,26 +820,34 @@	1169 @@ -595,26 +809,34 @@

1188 if (fNodeStack.empty() \|\| U_FAILURE(status)) {	1170 if (fNodeStack.empty() \|\| U_FAILURE(status)) {

1189 return NULL;	1171 return NULL;

1190 }	1172 }

1191 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());	1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());

1192 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());	1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());

1193 int where = fIndexStack.peeki();	1175 int where = fIndexStack.peeki();

1194 while (!fNodeStack.empty() && U_SUCCESS(status)) {	1176 while (!fNodeStack.empty() && U_SUCCESS(status)) {

1195 - int nodeCount = (node->flagscount & kCountMask);	1177 - int nodeCount = (node->flagscount & kCountMask);

1196 + int nodeCount;	1178 + int nodeCount;

1197 +	1179 +

(...skipping 20 matching lines...) Expand all Loading...
1218 if (where == 0) {	1200 if (where == 0) {

1219 // Going down	1201 // Going down

1220 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ;	1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ;

1221 + unistr.append((const UChar *)vnode->chars, nodeCount);	1203 + unistr.append((const UChar *)vnode->chars, nodeCount);

1222 fIndexStack.setElementAt(1, fIndexStack.size()-1);	1204 fIndexStack.setElementAt(1, fIndexStack.size()-1);

1223 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus));	1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus));

1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status));	1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status));

1225 where = fIndexStack.push(0, status);	1207 where = fIndexStack.push(0, status);

1226 goingDown = TRUE;	1208 goingDown = TRUE;

1227 }	1209 }

1228 @@ -623,7 +856,7 @@	1210 @@ -623,7 +845,7 @@

1229 unistr.truncate(unistr.length()-nodeCount);	1211 unistr.truncate(unistr.length()-nodeCount);

1230 fNodeStack.popi();	1212 fNodeStack.popi();

1231 fIndexStack.popi();	1213 fIndexStack.popi();

1232 - node = getCompactNode(fHeader, fNodeStack.peeki());	1214 - node = getCompactNode(fHeader, fNodeStack.peeki());

1233 + node = getCompactNode(fInfo, fNodeStack.peeki());	1215 + node = getCompactNode(fInfo, fNodeStack.peeki());

1234 where = fIndexStack.peeki();	1216 where = fIndexStack.peeki();

1235 }	1217 }

1236 }	1218 }

1237 @@ -638,7 +871,7 @@	1219 @@ -638,7 +860,7 @@

1238 // Push on next node	1220 // Push on next node

1239 unistr.append((UChar)hnode->entries[where].ch);	1221 unistr.append((UChar)hnode->entries[where].ch);

1240 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);	1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);

1241 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status));	1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status));

1242 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status));	1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status));

1243 where = fIndexStack.push(0, status);	1225 where = fIndexStack.push(0, status);

1244 goingDown = TRUE;	1226 goingDown = TRUE;

1245 }	1227 }

1246 @@ -646,12 +879,14 @@	1228 @@ -646,12 +868,14 @@

1247 // Going up	1229 // Going up

1248 fNodeStack.popi();	1230 fNodeStack.popi();

1249 fIndexStack.popi();	1231 fIndexStack.popi();

1250 - node = getCompactNode(fHeader, fNodeStack.peeki());	1232 - node = getCompactNode(fHeader, fNodeStack.peeki());

1251 + node = getCompactNode(fInfo, fNodeStack.peeki());	1233 + node = getCompactNode(fInfo, fNodeStack.peeki());

1252 where = fIndexStack.peeki();	1234 where = fIndexStack.peeki();

1253 }	1235 }

1254 }	1236 }

1255 +	1237 +

1256 // Check if the parent of the node we've just gone down to ends a	1238 // Check if the parent of the node we've just gone down to ends a

1257 // word. If so, return it.	1239 // word. If so, return it.

1258 + // The root node should never end up here.	1240 + // The root node should never end up here.

1259 if (goingDown && (node->flagscount & kParentEndsWord)) {	1241 if (goingDown && (node->flagscount & kParentEndsWord)) {

1260 return &unistr;	1242 return &unistr;

1261 }	1243 }

1262 @@ -664,7 +899,7 @@	1244 @@ -664,7 +888,7 @@

1263 if (U_FAILURE(status)) {	1245 if (U_FAILURE(status)) {

1264 return NULL;	1246 return NULL;

1265 }	1247 }

1266 - return new CompactTrieEnumeration(fData, status);	1248 - return new CompactTrieEnumeration(fData, status);

1267 + return new CompactTrieEnumeration(fInfo, status);	1249 + return new CompactTrieEnumeration(fInfo, status);

1268 }	1250 }

1269	1251

1270 //	1252 //

1271 @@ -672,21 +907,36 @@	1253 @@ -672,21 +896,36 @@

1272 // and back again	1254 // and back again

1273 //	1255 //

1274	1256

1275 -// Helper classes to construct the compact trie	1257 -// Helper classes to construct the compact trie

1276 +enum CompactTrieNodeType {	1258 +enum CompactTrieNodeType {

1277 + kHorizontalType = 0,	1259 + kHorizontalType = 0,

1278 + kVerticalType = 1,	1260 + kVerticalType = 1,

1279 + kValueType = 2	1261 + kValueType = 2

1280 +};	1262 +};

1281 +	1263 +

(...skipping 22 matching lines...) Expand all Loading...
1304 fParentEndsWord = parentEndsWord;	1286 fParentEndsWord = parentEndsWord;

1305 fHasDuplicate = FALSE;	1287 fHasDuplicate = FALSE;

1306 - fVertical = vertical;	1288 - fVertical = vertical;

1307 + fNodeType = nodeType;	1289 + fNodeType = nodeType;

1308 + fEqualOverflows = FALSE;	1290 + fEqualOverflows = FALSE;

1309 fNodeID = nodes.size();	1291 fNodeID = nodes.size();

1310 + fValue = parentEndsWord? value : 0;	1292 + fValue = parentEndsWord? value : 0;

1311 nodes.push(this, status);	1293 nodes.push(this, status);

1312 }	1294 }

1313	1295

1314 @@ -694,87 +944,225 @@	1296 @@ -694,87 +933,225 @@

1315 }	1297 }

1316	1298

1317 virtual uint32_t size() {	1299 virtual uint32_t size() {

1318 - return sizeof(uint16_t);	1300 - return sizeof(uint16_t);

1319 + if(fValue > 0)	1301 + if(fValue > 0)

1320 + return sizeof(uint16_t) * 2;	1302 + return sizeof(uint16_t) * 2;

1321 + else	1303 + else

1322 + return sizeof(uint16_t);	1304 + return sizeof(uint16_t);

1323 }	1305 }

1324	1306

(...skipping 221 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1546 + // append 16 bits of to end for equal node if fEqualOverflows	1528 + // append 16 bits of to end for equal node if fEqualOverflows

1547 + if (fEqualOverflows) {	1529 + if (fEqualOverflows) {

1548 + ((uint16_t )(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16);	1530 + ((uint16_t )(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16);

1549 + offset += sizeof(uint16_t);	1531 + offset += sizeof(uint16_t);

1550 + }	1532 + }

1551 +	1533 +

1552 + BuildCompactTrieNode::writeValue(bytes, offset);	1534 + BuildCompactTrieNode::writeValue(bytes, offset);

1553 }	1535 }

1554	1536

1555 void addChar(UChar ch) {	1537 void addChar(UChar ch) {

1556 @@ -784,60 +1172,85 @@	1538 @@ -784,60 +1161,85 @@

1557 void setLink(BuildCompactTrieNode *node) {	1539 void setLink(BuildCompactTrieNode *node) {

1558 fEqual = node;	1540 fEqual = node;

1559 }	1541 }

1560 +	1542 +

1561 };	1543 };

1562	1544

1563 // Forward declaration	1545 // Forward declaration

1564 static void walkHorizontal(const TernaryNode *node,	1546 static void walkHorizontal(const TernaryNode *node,

1565 BuildCompactTrieHorizontalNode *building,	1547 BuildCompactTrieHorizontalNode *building,

1566 UStack &nodes,	1548 UStack &nodes,

(...skipping 77 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1644 + } else {	1626 + } else {

1645 vResult->setLink((BuildCompactTrieNode *)nodes[1]);	1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]);

1646 }	1628 }

1647 }	1629 }

1648 else {	1630 else {

1649 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ;	1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ;

1650 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));	1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));

1651 }	1633 }

1652 result = vResult;	1634 result = vResult;

1653 }	1635 }

1654 @@ -849,19 +1262,28 @@	1636 @@ -849,19 +1251,28 @@

1655 // Uses recursion.	1637 // Uses recursion.

1656	1638

1657 static void walkHorizontal(const TernaryNode *node,	1639 static void walkHorizontal(const TernaryNode *node,

1658 - BuildCompactTrieHorizontalNode *building,	1640 - BuildCompactTrieHorizontalNode *building,

1659 - UStack &nodes,	1641 - UStack &nodes,

1660 - UErrorCode &status) {	1642 - UErrorCode &status) {

1661 + BuildCompactTrieHorizontalNode *building,	1643 + BuildCompactTrieHorizontalNode *building,

1662 + UStack &nodes,	1644 + UStack &nodes,

1663 + UErrorCode &status, Hashtable *values = NULL) {	1645 + UErrorCode &status, Hashtable *values = NULL) {

1664 while (U_SUCCESS(status) && node != NULL) {	1646 while (U_SUCCESS(status) && node != NULL) {

1665 if (node->low != NULL) {	1647 if (node->low != NULL) {

1666 - walkHorizontal(node->low, building, nodes, status);	1648 - walkHorizontal(node->low, building, nodes, status);

1667 + walkHorizontal(node->low, building, nodes, status, values);	1649 + walkHorizontal(node->low, building, nodes, status, values);

1668 }	1650 }

1669 BuildCompactTrieNode *link = NULL;	1651 BuildCompactTrieNode *link = NULL;

1670 if (node->equal != NULL) {	1652 if (node->equal != NULL) {

1671 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);	1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);

1672 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);	1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);

1673 }	1655 }

1674 - else if (node->flags & kEndsWord) {	1656 - else if (node->flags & kEndsWord) {

1675 - link = (BuildCompactTrieNode *)nodes[1];	1657 - link = (BuildCompactTrieNode *)nodes[1];

1676 + else if (node->flags > 0) {	1658 + else if (node->flags > 0) {

1677 + if(values != NULL) {	1659 + if(values != NULL) {

1678 + UnicodeString key(node->flags); //store value as a single-char UnicodeString	1660 + UnicodeString key(node->flags); //store value as a single-char UnicodeString

1679 + link = (BuildCompactTrieValueNode *) values->get(key);	1661 + link = (BuildCompactTrieValueNode *) values->get(key);

1680 + if(link == NULL) {	1662 + if(link == NULL) {

1681 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes?	1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes?

1682 + values->put(key, link, status);	1664 + values->put(key, link, status);

1683 + }	1665 + }

1684 + } else {	1666 + } else {

1685 + link = (BuildCompactTrieNode *)nodes[1];	1667 + link = (BuildCompactTrieNode *)nodes[1];

1686 + }	1668 + }

1687 }	1669 }

1688 if (U_SUCCESS(status) && link != NULL) {	1670 if (U_SUCCESS(status) && link != NULL) {

1689 building->addNode(node->ch, link, status);	1671 building->addNode(node->ch, link, status);

1690 @@ -881,13 +1303,15 @@	1672 @@ -881,13 +1292,15 @@

1691 _sortBuildNodes(const void * /context/, const void voidl, const void voidr) {	1673 _sortBuildNodes(const void * /context/, const void voidl, const void voidr) {

1692 BuildCompactTrieNode left = (BuildCompactTrieNode **)voidl;	1674 BuildCompactTrieNode left = (BuildCompactTrieNode **)voidl;

1693 BuildCompactTrieNode right = (BuildCompactTrieNode **)voidr;	1675 BuildCompactTrieNode right = (BuildCompactTrieNode **)voidr;

1694 +	1676 +

1695 // Check for comparing a node to itself, to avoid spurious duplicates	1677 // Check for comparing a node to itself, to avoid spurious duplicates

1696 if (left == right) {	1678 if (left == right) {

1697 return 0;	1679 return 0;

1698 }	1680 }

1699 +	1681 +

1700 // Most significant is type of node. Can never coalesce.	1682 // Most significant is type of node. Can never coalesce.

1701 - if (left->fVertical != right->fVertical) {	1683 - if (left->fVertical != right->fVertical) {

1702 - return left->fVertical - right->fVertical;	1684 - return left->fVertical - right->fVertical;

1703 + if (left->fNodeType != right->fNodeType) {	1685 + if (left->fNodeType != right->fNodeType) {

1704 + return left->fNodeType - right->fNodeType;	1686 + return left->fNodeType - right->fNodeType;

1705 }	1687 }

1706 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.	1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.

1707 if (left->fParentEndsWord != right->fParentEndsWord) {	1689 if (left->fParentEndsWord != right->fParentEndsWord) {

1708 @@ -898,12 +1322,19 @@	1690 @@ -898,12 +1311,19 @@

1709 if (result != 0) {	1691 if (result != 0) {

1710 return result;	1692 return result;

1711 }	1693 }

1712 +	1694 +

1713 + // If the node value differs, we should not coalesce.	1695 + // If the node value differs, we should not coalesce.

1714 + // If values aren't stored, all fValues should be 0.	1696 + // If values aren't stored, all fValues should be 0.

1715 + if (left->fValue != right->fValue) {	1697 + if (left->fValue != right->fValue) {

1716 + return left->fValue - right->fValue;	1698 + return left->fValue - right->fValue;

1717 + }	1699 + }

1718 +	1700 +

1719 // We know they're both the same node type, so branch for the two cases.	1701 // We know they're both the same node type, so branch for the two cases.

1720 - if (left->fVertical) {	1702 - if (left->fVertical) {

1721 + if (left->fNodeType == kVerticalType) {	1703 + if (left->fNodeType == kVerticalType) {

1722 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID	1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID

1723 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID;	1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID;

1724 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;	1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;

1725 }	1707 }

1726 - else {	1708 - else {

1727 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){	1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){

1728 // We need to compare the links vectors. They should be the	1710 // We need to compare the links vectors. They should be the

1729 // same size because the strings were equal.	1711 // same size because the strings were equal.

1730 // We compare the node IDs instead of the pointers, to handle	1712 // We compare the node IDs instead of the pointers, to handle

1731 @@ -914,9 +1345,10 @@	1713 @@ -914,9 +1334,10 @@

1732 int32_t count = hleft->fLinks.size();	1714 int32_t count = hleft->fLinks.size();

1733 for (int32_t i = 0; i < count && result == 0; ++i) {	1715 for (int32_t i = 0; i < count && result == 0; ++i) {

1734 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -	1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -

1735 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;	1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

1736 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;	1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

1737 }	1719 }

1738 }	1720 }

1739 +	1721 +

1740 // If they are equal to each other, mark them (speeds coalescing)	1722 // If they are equal to each other, mark them (speeds coalescing)

1741 if (result == 0) {	1723 if (result == 0) {

1742 left->fHasDuplicate = TRUE;	1724 left->fHasDuplicate = TRUE;

1743 @@ -1031,20 +1463,25 @@	1725 @@ -1031,20 +1452,25 @@

1744 // Add node 0, used as the NULL pointer/sentinel.	1726 // Add node 0, used as the NULL pointer/sentinel.

1745 nodes.addElement((int32_t)0, status);	1727 nodes.addElement((int32_t)0, status);

1746	1728

1747 + Hashtable *values = NULL; // Index of (unique) values	1729 + Hashtable *values = NULL; // Index of (unique) values

1748 + if (dict.fValued) {	1730 + if (dict.fValued) {

1749 + values = new Hashtable(status);	1731 + values = new Hashtable(status);

1750 + }	1732 + }

1751 +	1733 +

1752 // Start by creating the special empty node we use to indicate that the parent	1734 // Start by creating the special empty node we use to indicate that the parent

1753 // terminates a word. This must be node 1, because the builder assumes	1735 // terminates a word. This must be node 1, because the builder assumes

1754 - // that.	1736 - // that.

1755 + // that. This node will never be used for tries storing numerical values.	1737 + // that. This node will never be used for tries storing numerical values.

1756 if (U_FAILURE(status)) {	1738 if (U_FAILURE(status)) {

1757 return NULL;	1739 return NULL;

1758 }	1740 }

1759 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status);	1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status);

1760 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status);	1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status);

1761 if (terminal == NULL) {	1743 if (terminal == NULL) {

1762 status = U_MEMORY_ALLOCATION_ERROR;	1744 status = U_MEMORY_ALLOCATION_ERROR;

1763 }	1745 }

1764	1746

1765 // This call does all the work of building the new trie structure. The root	1747 // This call does all the work of building the new trie structure. The root

1766 - // will be node 2.	1748 - // will be node 2.

1767 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s);	1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s);

1768 + // will have node ID 2 before writing to memory.	1750 + // will have node ID 2 before writing to memory.

1769 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values);	1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values);

1770 #ifdef DEBUG_TRIE_DICT	1752 #ifdef DEBUG_TRIE_DICT

1771 (void) ::times(&timing);	1753 (void) ::times(&timing);

1772 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",	1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",

1773 @@ -1077,21 +1514,37 @@	1755 @@ -1077,21 +1503,37 @@

1774 return NULL;	1756 return NULL;

1775 }	1757 }

1776	1758

1777 + //map terminal value nodes	1759 + //map terminal value nodes

1778 + int valueCount = 0;	1760 + int valueCount = 0;

1779 + UVector valueNodes(status);	1761 + UVector valueNodes(status);

1780 + if(values != NULL) {	1762 + if(values != NULL) {

1781 + valueCount = values->count(); //number of unique terminal value nodes	1763 + valueCount = values->count(); //number of unique terminal value nodes

1782 + }	1764 + }

1783 +	1765 +

(...skipping 23 matching lines...) Expand all Loading...
1807 }	1789 }

1808 -	1790 -

1809 - // Check for overflowing 16 bits worth of nodes.	1791 - // Check for overflowing 16 bits worth of nodes.

1810 - if (nodeCount > 0x10000) {	1792 - if (nodeCount > 0x10000) {

1811 +	1793 +

1812 + // Check for overflowing 20 bits worth of nodes.	1794 + // Check for overflowing 20 bits worth of nodes.

1813 + if (nodeCount > 0x100000) {	1795 + if (nodeCount > 0x100000) {

1814 status = U_ILLEGAL_ARGUMENT_ERROR;	1796 status = U_ILLEGAL_ARGUMENT_ERROR;

1815 return NULL;	1797 return NULL;

1816 }	1798 }

1817 @@ -1111,9 +1564,14 @@	1799 @@ -1111,9 +1553,14 @@

1818 status = U_MEMORY_ALLOCATION_ERROR;	1800 status = U_MEMORY_ALLOCATION_ERROR;

1819 return NULL;	1801 return NULL;

1820 }	1802 }

1821 -	1803 -

1822 +	1804 +

1823 CompactTrieHeader header = (CompactTrieHeader )bytes;	1805 CompactTrieHeader header = (CompactTrieHeader )bytes;

1824 - header->size = totalSize;	1806 - header->size = totalSize;

1825 + //header->size = totalSize;	1807 + //header->size = totalSize;

1826 + if(dict.fValued){	1808 + if(dict.fValued){

1827 + header->magic = COMPACT_TRIE_MAGIC_3;	1809 + header->magic = COMPACT_TRIE_MAGIC_3;

1828 + } else {	1810 + } else {

1829 + header->magic = COMPACT_TRIE_MAGIC_2;	1811 + header->magic = COMPACT_TRIE_MAGIC_2;

1830 + }	1812 + }

1831 header->nodeCount = nodeCount;	1813 header->nodeCount = nodeCount;

1832 header->offsets[0] = 0; // Sentinel	1814 header->offsets[0] = 0; // Sentinel

1833 header->root = translate.elementAti(root->fNodeID);	1815 header->root = translate.elementAti(root->fNodeID);

1834 @@ -1123,23 +1581,40 @@	1816 @@ -1123,23 +1570,40 @@

1835 }	1817 }

1836 #endif	1818 #endif

1837 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t));	1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t));

1838 - nodeCount = 1;	1820 - nodeCount = 1;

1839 + nodeCount = valueCount + 1;	1821 + nodeCount = valueCount + 1;

1840 +	1822 +

1841 + // Write terminal value nodes to memory	1823 + // Write terminal value nodes to memory

1842 + for (i=0; i < valueNodes.size(); i++) {	1824 + for (i=0; i < valueNodes.size(); i++) {

1843 + //header->offsets[i + 1] = offset;	1825 + //header->offsets[i + 1] = offset;

1844 + uint32_t tmpOffset = 0;	1826 + uint32_t tmpOffset = 0;

(...skipping 23 matching lines...) Expand all Loading...
1868 fprintf(stderr, "Trie built, time user %f system %f\n",	1850 fprintf(stderr, "Trie built, time user %f system %f\n",

1869 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,	1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,

1870 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);	1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);

1871 previous = timing;	1853 previous = timing;

1872 fprintf(stderr, "Final offset is %d\n", offset);	1854 fprintf(stderr, "Final offset is %d\n", offset);

1873 -	1855 -

1874 +	1856 +

1875 // Collect statistics on node types and sizes	1857 // Collect statistics on node types and sizes

1876 int hCount = 0;	1858 int hCount = 0;

1877 int vCount = 0;	1859 int vCount = 0;

1878 @@ -1148,68 +1623,85 @@	1860 @@ -1148,68 +1612,85 @@

1879 size_t hItemCount = 0;	1861 size_t hItemCount = 0;

1880 size_t vItemCount = 0;	1862 size_t vItemCount = 0;

1881 uint32_t previousOff = offset;	1863 uint32_t previousOff = offset;

1882 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {	1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

1883 + uint32_t numOverflow = 0;	1865 + uint32_t numOverflow = 0;

1884 + uint32_t valueSpace = 0;	1866 + uint32_t valueSpace = 0;

1885 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {	1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

1886 const CompactTrieNode *node = getCompactNode(header, nodeIdx);	1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx);

1887 - if (node->flagscount & kVerticalNode) {	1869 - if (node->flagscount & kVerticalNode) {

1888 + int itemCount;	1870 + int itemCount;

(...skipping 85 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1974	1956

1975 // Convert one compact trie node into a ternary subtrie	1957 // Convert one compact trie node into a ternary subtrie

1976 static TernaryNode *	1958 static TernaryNode *

1977 -unpackOneNode( const CompactTrieHeader header, const CompactTrieNode node, UE rrorCode &status ) {	1959 -unpackOneNode( const CompactTrieHeader header, const CompactTrieNode node, UE rrorCode &status ) {

1978 - int nodeCount = (node->flagscount & kCountMask);	1960 - int nodeCount = (node->flagscount & kCountMask);

1979 +unpackOneNode( const CompactTrieInfo info, const CompactTrieNode node, UError Code &status ) {	1961 +unpackOneNode( const CompactTrieInfo info, const CompactTrieNode node, UError Code &status ) {

1980 + int nodeCount = getCount(node);	1962 + int nodeCount = getCount(node);

1981 if (nodeCount == 0 \|\| U_FAILURE(status)) {	1963 if (nodeCount == 0 \|\| U_FAILURE(status)) {

1982 // Failure, or terminal node	1964 // Failure, or terminal node

1983 return NULL;	1965 return NULL;

1984 @@ -1234,29 +1726,41 @@	1966 @@ -1234,29 +1715,41 @@

1985 previous = latest;	1967 previous = latest;

1986 }	1968 }

1987 if (latest != NULL) {	1969 if (latest != NULL) {

1988 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ;	1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ;

1989 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node));	1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node));

1990 if (equal->flagscount & kParentEndsWord) {	1972 if (equal->flagscount & kParentEndsWord) {

1991 - latest->flags \|= kEndsWord;	1973 - latest->flags \|= kEndsWord;

1992 + if(info->magic == COMPACT_TRIE_MAGIC_3){	1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){

1993 + latest->flags = getValue(equal);	1975 + latest->flags = getValue(equal);

1994 + } else {	1976 + } else {

(...skipping 27 matching lines...) Expand all Loading...
2022 + // because only kEqualOverflows flag should be checked in root's flagscount	2004 + // because only kEqualOverflows flag should be checked in root's flagscount

2023 + const CompactTrieHorizontalNode hnode = (const CompactTrieHorizontalNode )	2005 + const CompactTrieHorizontalNode hnode = (const CompactTrieHorizontalNode )

2024 + getCompactNode(fInfo, fInfo->root);	2006 + getCompactNode(fInfo, fInfo->root);

2025 + uint16_t nodeCount = hnode->flagscount & kRootCountMask;	2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask;

2026 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,	2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,

2027 + nodeCount, status);	2009 + nodeCount, status);

2028 +	2010 +

2029 if (U_FAILURE(status)) {	2011 if (U_FAILURE(status)) {

2030 delete root; // Clean up	2012 delete root; // Clean up

2031 delete result;	2013 delete result;

2032 @@ -1270,8 +1774,8 @@	2014 @@ -1270,8 +1763,8 @@

2033	2015

2034 U_CAPI int32_t U_EXPORT2	2016 U_CAPI int32_t U_EXPORT2

2035 triedict_swap(const UDataSwapper ds, const void inData, int32_t length, void *outData,	2017 triedict_swap(const UDataSwapper ds, const void inData, int32_t length, void *outData,

2036 - UErrorCode *status) {	2018 - UErrorCode *status) {

2037 -	2019 -

2038 + UErrorCode *status) {	2020 + UErrorCode *status) {

2039 +	2021 +

2040 if (status == NULL \|\| U_FAILURE(*status)) {	2022 if (status == NULL \|\| U_FAILURE(*status)) {

2041 return 0;	2023 return 0;

2042 }	2024 }

2043 @@ -1286,14 +1790,14 @@	2025 @@ -1286,14 +1779,14 @@

2044 //	2026 //

2045 const UDataInfo pInfo = (const UDataInfo )((const uint8_t *)inData+4);	2027 const UDataInfo pInfo = (const UDataInfo )((const uint8_t *)inData+4);

2046 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */	2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */

2047 - pInfo->dataFormat[1]==0x72 &&	2029 - pInfo->dataFormat[1]==0x72 &&

2048 - pInfo->dataFormat[2]==0x44 &&	2030 - pInfo->dataFormat[2]==0x44 &&

2049 - pInfo->dataFormat[3]==0x63 &&	2031 - pInfo->dataFormat[3]==0x63 &&

2050 - pInfo->formatVersion[0]==1 )) {	2032 - pInfo->formatVersion[0]==1 )) {

2051 + pInfo->dataFormat[1]==0x72 &&	2033 + pInfo->dataFormat[1]==0x72 &&

2052 + pInfo->dataFormat[2]==0x44 &&	2034 + pInfo->dataFormat[2]==0x44 &&

2053 + pInfo->dataFormat[3]==0x63 &&	2035 + pInfo->dataFormat[3]==0x63 &&

2054 + pInfo->formatVersion[0]==1 )) {	2036 + pInfo->formatVersion[0]==1 )) {

2055 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",	2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",

2056 - pInfo->dataFormat[0], pInfo->dataFormat[1],	2038 - pInfo->dataFormat[0], pInfo->dataFormat[1],

2057 - pInfo->dataFormat[2], pInfo->dataFormat[3],	2039 - pInfo->dataFormat[2], pInfo->dataFormat[3],

2058 - pInfo->formatVersion[0]);	2040 - pInfo->formatVersion[0]);

2059 + pInfo->dataFormat[0], pInfo->dataFormat[1],	2041 + pInfo->dataFormat[0], pInfo->dataFormat[1],

2060 + pInfo->dataFormat[2], pInfo->dataFormat[3],	2042 + pInfo->dataFormat[2], pInfo->dataFormat[3],

2061 + pInfo->formatVersion[0]);	2043 + pInfo->formatVersion[0]);

2062 *status=U_UNSUPPORTED_ERROR;	2044 *status=U_UNSUPPORTED_ERROR;

2063 return 0;	2045 return 0;

2064 }	2046 }

2065 @@ -1311,8 +1815,10 @@	2047 @@ -1311,8 +1804,10 @@

2066 //	2048 //

2067 const uint8_t inBytes =(const uint8_t )inData+headerSize;	2049 const uint8_t inBytes =(const uint8_t )inData+headerSize;

2068 const CompactTrieHeader header = (const CompactTrieHeader )inBytes;	2050 const CompactTrieHeader header = (const CompactTrieHeader )inBytes;

2069 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1	2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1

2070 - \|\| ds->readUInt32(header->size) < sizeof(CompactTrieHeader))	2052 - \|\| ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

2071 + uint32_t magic = ds->readUInt32(header->magic);	2053 + uint32_t magic = ds->readUInt32(header->magic);

2072 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3	2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3

2073 + \|\| magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)	2055 + \|\| magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)

2074 + \|\| magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))	2056 + \|\| magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

2075 {	2057 {

2076 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" );	2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" );

2077 *status=U_UNSUPPORTED_ERROR;	2059 *status=U_UNSUPPORTED_ERROR;

2078 @@ -1333,10 +1839,10 @@	2060 @@ -1333,10 +1828,10 @@

2079 //	2061 //

2080 if (length < sizeWithUData) {	2062 if (length < sizeWithUData) {

2081 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",	2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",

2082 - totalSize);	2064 - totalSize);

2083 + totalSize);	2065 + totalSize);

2084 *status=U_INDEX_OUTOFBOUNDS_ERROR;	2066 *status=U_INDEX_OUTOFBOUNDS_ERROR;

2085 return 0;	2067 return 0;

2086 - }	2068 - }

2087 + }	2069 + }

2088	2070

2089 //	2071 //

2090 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because	2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because

2091 @@ -1355,20 +1861,38 @@	2073 @@ -1355,20 +1850,38 @@

2092 }	2074 }

2093	2075

2094 // We need to loop through all the nodes in the offset table, and swap each one.	2076 // We need to loop through all the nodes in the offset table, and swap each one.

2095 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);	2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);

2096 + uint32_t nodeCount, rootId;	2078 + uint32_t nodeCount, rootId;

2097 + if(header->magic == COMPACT_TRIE_MAGIC_1) {	2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) {

2098 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);	2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);

2099 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);	2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);

2100 + } else {	2082 + } else {

2101 + nodeCount = ds->readUInt32(header->nodeCount);	2083 + nodeCount = ds->readUInt32(header->nodeCount);

(...skipping 24 matching lines...) Expand all Loading...
2126 + overflow += 1;	2108 + overflow += 1;

2127 + }	2109 + }

2128 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars),	2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars),

2129 - itemCount*sizeof(uint16_t),	2111 - itemCount*sizeof(uint16_t),

2130 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status);	2112 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status);

2131 + (itemCount + overflow)*sizeof(uint16_t),	2113 + (itemCount + overflow)*sizeof(uint16_t),

2132 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status);	2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status);

2133 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal);	2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal);

2134 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal));	2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal));

2135 }	2117 }

2136 @@ -1381,26 +1905,62 @@	2118 @@ -1381,26 +1894,62 @@

2137 word = ds->readUInt16(inHNode->entries[j].equal);	2119 word = ds->readUInt16(inHNode->entries[j].equal);

2138 ds->writeUInt16(&outHNode->entries[j].equal, word);	2120 ds->writeUInt16(&outHNode->entries[j].equal, word);

2139 }	2121 }

2140 +	2122 +

2141 + // swap overflow/value information	2123 + // swap overflow/value information

2142 + if(flagscount & kEqualOverflows){	2124 + if(flagscount & kEqualOverflows){

2143 + overflow += (itemCount + 3) / 4;	2125 + overflow += (itemCount + 3) / 4;

2144 + }	2126 + }

2145 +	2127 +

2146 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) {	2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) {

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2202 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);	2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);

2203 +	2185 +

2204 + //swap offsets	2186 + //swap offsets

2205 + ds->swapArray32(ds, inBytes+offsetPos,	2187 + ds->swapArray32(ds, inBytes+offsetPos,

2206 + sizeof(uint32_t)*(uint32_t)nodeCount,	2188 + sizeof(uint32_t)*(uint32_t)nodeCount,

2207 + outBytes+offsetPos, status);	2189 + outBytes+offsetPos, status);

2208	2190

2209 return sizeWithUData;	2191 return sizeWithUData;

2210 }	2192 }

2211 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700	2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700

2212 +++ source/common/triedict.h» 2009-07-27 13:01:17.723390000 -0700	2194 +++ source/common/triedict.h» 2011-01-21 14:12:45.496927000 -0800

2213 @@ -47,7 +47,6 @@	2195 @@ -47,7 +47,6 @@

2214 U_NAMESPACE_BEGIN	2196 U_NAMESPACE_BEGIN

2215	2197

2216 class StringEnumeration;	2198 class StringEnumeration;

2217 -struct CompactTrieHeader;	2199 -struct CompactTrieHeader;

2218	2200

2219 /*******************************************************************	2201 /*******************************************************************

2220 * TrieWordDictionary	2202 * TrieWordDictionary

2221 @@ -72,23 +71,29 @@	2203 @@ -72,23 +71,29 @@

2222 */	2204 */

(...skipping 218 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2441 *	2423 *

2442 * @return The data for the compact dictionary, suitable for passing to the	2424 * @return The data for the compact dictionary, suitable for passing to the

2443 * constructor.	2425 * constructor.

2444 @@ -342,5 +386,5 @@	2426 @@ -342,5 +386,5 @@

2445	2427

2446 U_NAMESPACE_END	2428 U_NAMESPACE_END

2447	2429

2448 - /* TRIEDICT_H */	2430 - /* TRIEDICT_H */

2449 +/* TRIEDICT_H */	2431 +/* TRIEDICT_H */

2450 #endif	2432 #endif

2451 --- source/data/brkitr/brkfiles.mk» 2009-04-21 15:42:37.000000000 -0700	2433 --- source/data/Makefile.in» 2010-10-29 13:21:33.000000000 -0700

2452 +++ source/data/brkitr/brkfiles.mk» 2009-07-27 13:01:17.730379000 -0700	2434 +++ source/data/Makefile.in» 2011-01-26 16:24:24.856798000 -0800

2453 @@ -34,13 +34,12 @@	2435 @@ -509,8 +520,9 @@

	2436 #################################################### CTD

	2437 # CTD FILES

2454	2438

	2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)

	2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

	2441 +# .ctd file now generated regardless of whether dictionary file exists

	2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

	2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt

2455	2444

2456 # List of compact trie dictionary files (ctd).	2445 #################################################### CFU

2457 -BRK_CTD_SOURCE = thaidict.txt	2446 # CFU FILES

2458 +BRK_CTD_SOURCE = thaidict.txt cjdict.txt	2447 --- source/data/brkitr/root.txt»2010-07-28 17:18:28.000000000 -0700

2459	2448 +++ source/data/brkitr/root.txt»2011-01-21 14:12:45.653922000 -0800

2460

2461 # List of break iterator files (brk).

2462 -BRK_SOURCE = word_POSIX.txt word_ja.txt sent_el.txt char_th.txt char.txt word.t xt line.txt sent.txt title.txt

2463 +BRK_SOURCE = word_POSIX.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt

2464

2465

2466 # Ordinary resources

2467 -BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt ja.txt th.txt

2468 -

2469 +BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt th.txt

2470 --- source/data/brkitr/root.txt»2009-06-24 14:06:38.000000000 -0700

2471 +++ source/data/brkitr/root.txt»2009-07-27 13:01:17.733382000 -0700

2472 @@ -17,5 +17,8 @@	2449 @@ -17,5 +17,8 @@

2473 }	2450 }

2474 dictionaries{	2451 dictionaries{

2475 Thai:process(dependency){"thaidict.ctd"}	2452 Thai:process(dependency){"thaidict.ctd"}

2476 + Hani:process(dependency){"cjdict.ctd"}	2453 + Hani:process(dependency){"cjdict.ctd"}

2477 + Hira:process(dependency){"cjdict.ctd"}	2454 + Hira:process(dependency){"cjdict.ctd"}

2478 + Kata:process(dependency){"cjdict.ctd"}	2455 + Kata:process(dependency){"cjdict.ctd"}

2479 }	2456 }

2480 }	2457 }

2481 --- source/data/brkitr/word.txt»2009-06-24 14:06:38.000000000 -0700	2458 --- source/data/xml/brkitr/root.xml» 2010-03-01 15:13:18.000000000 -0800

2482 +++ source/data/brkitr/word.txt»2010-08-27 16:24:25.969372000 -0700	2459 +++ source/data/xml/brkitr/root.xml» 2011-01-21 14:12:45.735922000 -0800

2483 @@ -29,29 +29,49 @@

2484 $Newline = [\p{Word_Break = Newline}];

2485 $Extend = [\p{Word_Break = Extend}];

2486 $Format = [\p{Word_Break = Format}];

2487 +$Hiragana = [:Hiragana:];

2488 $Katakana = [\p{Word_Break = Katakana}];

2489 +$Han = [:Han:];

2490 $ALetter = [\p{Word_Break = ALetter}];

2491 -$MidNumLet = [\p{Word_Break = MidNumLet}];

2492 +# Remove two full stop characters from $MidNumLet and add them to $MidNum

2493 +# to break a hostname into its components at the cost of breaking

2494 +# 'e.g.' and 'i.e.' as well.

2495 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.

2496 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected

2497 +# while rules 6/7 are reverted to the old behavior we want.

2498 +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];

2499 $MidLetter = [\p{Word_Break = MidLetter}];

2500 -$MidNum = [\p{Word_Break = MidNum}];

2501 -$Numeric = [\p{Word_Break = Numeric}];

2502 +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];

2503 +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits

2504 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

2505

2506 +# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.

2507 +$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];

2508 +# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by

2509 +# the current rule 6/7.

2510 +$HebrewMidLet = [\u0022];

2511

2512 # Dictionary character set, for triggering language-based break engines. Currently

2513 -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode

2514 -# 5.0 or later as the definition of Complex_Context was corrected to include all

2515 +# limited to LineBreak=Complex_Context and CJK. Note that this set only works

2516 +# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all

2517 # characters requiring dictionary break.

2518

2519 -$dictionary = [:LineBreak = Complex_Context:];

2520 $Control = [\p{Grapheme_Cluster_Break = Control}];

2521 -$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default A Letter does not

2522 - # include the dic tionary characters.

2523 +$HangulSyllable = [\uac00-\ud7a3];

2524 +$ComplexContext = [:LineBreak = Complex_Context:];

2525 +$KanaKanji = [$Han $Hiragana $Katakana];

2526 +$dictionaryCJK = [$KanaKanji $HangulSyllable];

2527 +$dictionary = [$ComplexContext $dictionaryCJK];

2528 +

2529 +# leave CJK scripts out of ALetterPlus

2530 +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];

2531 +

2532

2533 #

2534 # Rules 4 Ignore Format and Extend characters,

2535 # except when they appear at the beginning of a region of text.

2536 #

2537 +# TODO: check if handling of katakana in dictionary makes rules incorrect/void.

2538 $KatakanaEx = $Katakana ($Extend \| $Format)*;

2539 $ALetterEx = $ALetterPlus ($Extend \| $Format)*;

2540 $MidNumLetEx = $MidNumLet ($Extend \| $Format)*;

2541 @@ -59,8 +79,8 @@

2542 $MidNumEx = $MidNum ($Extend \| $Format)*;

2543 $NumericEx = $Numeric ($Extend \| $Format)*;

2544 $ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;

2545 +$HebrewLetEx = $HebrewLet ($Extend \| $Format)*;

2546

2547 -$Hiragana = [\p{script=Hiragana}];

2548 $Ideographic = [\p{Ideographic}];

2549 $HiraganaEx = $Hiragana ($Extend \| $Format)*;

2550 $IdeographicEx = $Ideographic ($Extend \| $Format)*;

2551 @@ -79,12 +99,14 @@

2552 # begins with a group of Format chars, or with a "word" consisting of a single

2553 # char that is not in any of the listed word break categories followed by

2554 # format char(s).

2555 -[^$CR $LF $Newline]? ($Extend \| $Format)+;

2556 + # format char(s), or is not a CJK dictionary character.

2557 +[^$CR $LF $Newline $dictionaryCJK]? ($Extend \| $Format)+;

2558

2559 $NumericEx {100};

2560 $ALetterEx {200};

2561 -$KatakanaEx {300}; # note: these status values override those from rule 5

2562 -$HiraganaEx {300}; # by virtual of being numerically larger.

2563 +$HangulSyllable {200};

2564 +$KatakanaEx {400}; #originally 300

2565 +$HiraganaEx {400}; #originally 300

2566 $IdeographicEx {400}; #

2567

2568 #

2569 @@ -96,6 +118,9 @@

2570 # rule 6 and 7

2571 $ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};

2572

2573 +# Chrome addition

2574 +$HebrewLetEx $HebrewMidLet $HebrewLetEx {200};

2575 +

2576 # rule 8

2577

2578 $NumericEx $NumericEx {100};

2579 @@ -114,19 +139,25 @@

2580

2581 # rule 13

2582

2583 -$KatakanaEx $KatakanaEx {300};

2584 +# To be consistent with '$KanaKanji $KanaKanji', changed

2585 +# from 300 to 400.

2586 +# See also TestRuleStatus in intltest/rbbiapts.cpp

2587 +$KatakanaEx $KatakanaEx {400};

2588

2589 # rule 13a/b

2590

2591 $ALetterEx $ExtendNumLetEx {200}; # (13a)

2592 $NumericEx $ExtendNumLetEx {100}; # (13a)

2593 -$KatakanaEx $ExtendNumLetEx {300}; # (13a)

2594 +$KatakanaEx $ExtendNumLetEx {400}; # (13a)

2595 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)

2596

2597 $ExtendNumLetEx $ALetterEx {200}; # (13b)

2598 $ExtendNumLetEx $NumericEx {100}; # (13b)

2599 -$ExtendNumLetEx $KatakanaEx {300}; # (13b)

2600 -

2601 +$ExtendNumLetEx $KatakanaEx {400}; # (13b)

2602 +

2603 +# special handling for CJK characters: chain for later dictionary segmentation

2604 +$HangulSyllable $HangulSyllable {200};

2605 +$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found

2606

2607

2608 ## -------------------------------------------------

2609 @@ -139,13 +170,15 @@

2610 $BackMidNumEx = ($Format \| $Extend)* $MidNum;

2611 $BackMidLetterEx = ($Format \| $Extend)* $MidLetter;

2612 $BackKatakanaEx = ($Format \| $Extend)* $Katakana;

2613 +$BackHiraganaEx = ($Extend \| $Format)* $Hiragana;

2614 $BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;

2615 +$BackHebrewLetEx = ($Format \| $Extend)* $HebrewLet;

2616

2617 # rule 3

2618 $LF $CR;

2619

2620 # rule 4

2621 -($Format \| $Extend)* [^$CR $LF $Newline]?;

2622 +($Format \| $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;

2623

2624 # rule 5

2625

2626 @@ -155,6 +188,8 @@

2627

2628 $BackALetterEx ($BackMidLetterEx \| $BackMidNumLetEx) $BackALetterEx;

2629

2630 +# Chrome addition

2631 +$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;

2632

2633 # rule 8

2634

2635 @@ -181,6 +216,10 @@

2636 $BackExtendNumLetEx ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx \| $BackE xtendNumLetEx);

2637 ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx) $BackExtendNumLetEx;

2638

2639 +# special handling for CJK characters: chain for later dictionary segmentation

2640 +$HangulSyllable $HangulSyllable;

2641 +$KanaKanji $KanaKanji; #different rule status if both kanji and kana found

2642 +

2643 ## -------------------------------------------------

2644

2645 !!safe_reverse;

2646 --- source/data/xml/brkitr/root.xml» 2007-08-28 23:10:43.000000000 -0700

2647 +++ source/data/xml/brkitr/root.xml» 2009-07-27 13:01:17.746367000 -0700

2648 @@ -25,6 +25,9 @@	2460 @@ -25,6 +25,9 @@

2649 </icu:boundaries>	2461 </icu:boundaries>

2650 <icu:dictionaries>	2462 <icu:dictionaries>

2651 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>	2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>

2652 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>	2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>

2653 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>	2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>

2654 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>	2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>

2655 </icu:dictionaries>	2467 </icu:dictionaries>

2656 </icu:breakIteratorData>	2468 </icu:breakIteratorData>

2657 </special>	2469 </special>

2658 --- source/test/cintltst/creststn.c» 2009-06-26 09:49:55.000000000 -0700	2470 --- source/test/cintltst/creststn.c» 2010-10-28 10:44:02.000000000 -0700

2659 +++ source/test/cintltst/creststn.c» 2009-07-29 12:46:05.997405000 -0700	2471 +++ source/test/cintltst/creststn.c» 2011-01-21 14:12:44.995020000 -0800

2660 @@ -2181,21 +2181,21 @@	2472 @@ -2188,21 +2188,21 @@

2661	2473

2662	2474

2663 {	2475 {

2664 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);	2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);

2665 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);	2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);

2666 const UChar got = NULL, exp=NULL;	2478 const UChar got = NULL, exp=NULL;

2667 int32_t gotLen = 0, expLen=0;	2479 int32_t gotLen = 0, expLen=0;

2668 - ja = ures_getByKey(ja, "boundaries", ja, &status);	2480 - ja = ures_getByKey(ja, "boundaries", ja, &status);

2669 - exp = tres_getString(ja, -1, "word", &expLen, &status);	2481 - exp = tres_getString(ja, -1, "word", &expLen, &status);

2670 + th = ures_getByKey(th, "boundaries", th, &status);	2482 + th = ures_getByKey(th, "boundaries", th, &status);

2671 + exp = tres_getString(th, -1, "grapheme", &expLen, &status);	2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status);

2672	2484

2673 tb = ures_getByKey(aliasB, "boundaries", tb, &status);	2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status);

2674 - got = tres_getString(tb, -1, "word", &gotLen, &status);	2486 - got = tres_getString(tb, -1, "word", &gotLen, &status);

2675 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status);	2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status);

2676	2488

2677 if(U_FAILURE(status)) {	2489 if(U_FAILURE(status)) {

2678 log_err("%s trying to read str boundaries\n", u_errorName(statu s));	2490 log_err("%s trying to read str boundaries\n", u_errorName(statu s));

2679 } else if(gotLen != expLen \|\| u_strncmp(exp, got, gotLen) != 0) {	2491 } else if(gotLen != expLen \|\| u_strncmp(exp, got, gotLen) != 0) {

2680 log_err("Referencing alias didn't get the right data\n");	2492 log_err("Referencing alias didn't get the right data\n");

2681 }	2493 }

2682 - ures_close(ja);	2494 - ures_close(ja);

2683 + ures_close(th);	2495 + ures_close(th);

2684 status = U_ZERO_ERROR;	2496 status = U_ZERO_ERROR;

2685 }	2497 }

2686 /* simple alias */	2498 /* simple alias */

2687 @@ -3024,4 +3024,3 @@	2499 --- source/test/intltest/rbbiapts.cpp» 2010-07-12 11:03:29.000000000 -0700

2688 }	2500 +++ source/test/intltest/rbbiapts.cpp» 2011-01-21 14:12:45.033014000 -0800

2689

2690 }

2691 -

2692 --- source/test/intltest/rbbiapts.cpp» 2009-06-26 09:49:55.000000000 -0700

2693 +++ source/test/intltest/rbbiapts.cpp» 2009-07-28 13:56:30.208042000 -0700

2694 @@ -156,9 +156,13 @@	2501 @@ -156,9 +156,13 @@

2695 if(a!=b){	2502 if(a!=b){

2696 errln("Failed: boilerplate method operator!= does not return correct re sults");	2503 errln("Failed: boilerplate method operator!= does not return correct re sults");

2697 }	2504 }

2698 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);	2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);

2699 - if(a && c){	2506 - if(a && c){

2700 - if(c==a){	2507 - if(c==a){

2701 + // Japanese word break iterator is identical to root with	2508 + // Japanese word break iterator is identical to root with

2702 + // a dictionary-based break iterator, but Thai character break iterator	2509 + // a dictionary-based break iterator, but Thai character break iterator

2703 + // is still different from Root.	2510 + // is still different from Root.

2704 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us);	2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us);

2705 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us);	2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us);

2706 + if(c && d){	2513 + if(c && d){

2707 + if(c==d){	2514 + if(c==d){

2708 errln("Failed: boilerplate method opertator== does not return corre ct results");	2515 errln("Failed: boilerplate method opertator== does not return corre ct results");

2709 }	2516 }

2710 }else{	2517 }else{

2711 @@ -167,6 +171,7 @@	2518 @@ -167,6 +171,7 @@

2712 delete a;	2519 delete a;

2713 delete b;	2520 delete b;

2714 delete c;	2521 delete c;

2715 + delete d;	2522 + delete d;

2716 }	2523 }

2717	2524

2718 void RBBIAPITest::TestgetRules()	2525 void RBBIAPITest::TestgetRules()

2719 @@ -643,21 +648,21 @@	2526 @@ -635,21 +640,21 @@

2720 //	2527 //

2721 void RBBIAPITest::TestRuleStatus() {	2528 void RBBIAPITest::TestRuleStatus() {

2722 UChar str[30];	2529 UChar str[30];

2723 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ",	2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ",

2724 - // 012345678901234567 8 9 0 1 2 3 4 5 6	2531 - // 012345678901234567 8 9 0 1 2 3 4 5 6

2725 - // Ideographic Katakana Hiragana	2532 - // Ideographic Katakana Hiragana

2726 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing	2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing

2727 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO	2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO

2728 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ",	2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ",

2729 + // 012345678901234567 8 9 0	2536 + // 012345678901234567 8 9 0

2730 + // Katakana	2537 + // Katakana

2731 str, 30);	2538 str, 30);

2732 UnicodeString testString1(str);	2539 UnicodeString testString1(str);

2733 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};	2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};

2734 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};	2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};

2735 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,	2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,

2736 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,	2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,

2737 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,	2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,

2738 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};	2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};

2739 + UBRK_WORD_IDEO, UBRK_WORD_NONE};	2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE};

2740	2547

2741 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,	2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,

2742 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT,	2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT,

2743 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT,	2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT,

2744 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};	2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};

2745 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};	2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};

2746	2553

2747 UErrorCode status=U_ZERO_ERROR;	2554 UErrorCode status=U_ZERO_ERROR;

2748	2555

2749 @@ -896,9 +901,11 @@	2556 @@ -888,9 +893,11 @@

2750	2557

2751 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status);	2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status);

2752 {	2559 {

2753 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.	2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.

2754 if (ja_word && ja_word == root_word) {	2561 if (ja_word && ja_word == root_word) {

2755 errln("japan not different from root");	2562 errln("japan not different from root");

2756 }	2563 }

2757 +#endif	2564 +#endif

2758 }	2565 }

2759	2566

2760 {	2567 {

2761 --- source/test/intltest/rbbitst.cpp» 2009-06-26 09:49:55.000000000 -0700	2568 --- source/test/intltest/rbbitst.cpp» 2010-10-08 18:23:28.000000000 -0700

2762 +++ source/test/intltest/rbbitst.cpp» 2009-07-28 15:35:18.933226000 -0700	2569 +++ source/test/intltest/rbbitst.cpp» 2011-01-21 14:12:45.180030000 -0800

2763 @@ -33,6 +33,8 @@	2570 @@ -35,6 +35,8 @@

2764 #include <string.h>	2571 #include <string.h>

2765 #include <stdio.h>	2572 #include <stdio.h>

2766 #include <stdlib.h>	2573 #include <stdlib.h>

2767 +#include "unicode/numfmt.h"	2574 +#include "unicode/numfmt.h"

2768 +#include "unicode/uscript.h"	2575 +#include "unicode/uscript.h"

2769	2576

2770 #define TEST_ASSERT(x) {if (!(x)) { \	2577 #define TEST_ASSERT(x) {if (!(x)) { \

2771 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}	2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

2772 @@ -108,6 +110,8 @@	2579 @@ -138,11 +140,13 @@

2773 if (exec) TestThaiBreaks(); break;	2580 if (exec) TestThaiBreaks(); break;

2774 case 23: name = "TestTailoredBreaks";	2581 case 23: name = "TestTailoredBreaks";

2775 if (exec) TestTailoredBreaks(); break;	2582 if (exec) TestTailoredBreaks(); break;

2776 + case 24: name = "TestTrieDictWithValue";	2583 + case 24: name = "TestTrieDictWithValue";

2777 + if(exec) TestTrieDictWithValue(); break;	2584 + if(exec) TestTrieDictWithValue(); break;

2778	2585 #else

2779 default: name = ""; break; //needed to end loop	2586 - case 21: case 22: case 23: name = "skip";

2780 }	2587 + case 21: case 22: case 23: case 24: name = "skip";

2781 @@ -570,6 +574,8 @@	2588 break;

	2589 #endif

	2590 - case 24: name = "TestDictRules";

	2591 + case 25: name = "TestDictRules";

	2592 if (exec) TestDictRules(); break;

	2593 case 25: name = "TestBug5532";

	2594 if (exec) TestBug5532(); break;

	2595 @@ -607,6 +611,8 @@

2782	2596

2783	2597

2784 void RBBITest::TestJapaneseWordBreak() {	2598 void RBBITest::TestJapaneseWordBreak() {

2785 +// TODO: Rewrite this test for a dictionary-based word breaking.	2599 +// TODO: Rewrite this test for a dictionary-based word breaking.

2786 +#if 0	2600 +#if 0

2787 UErrorCode status = U_ZERO_ERROR;	2601 UErrorCode status = U_ZERO_ERROR;

2788 BITestData japaneseWordSelection(status);	2602 BITestData japaneseWordSelection(status);

2789	2603

2790 @@ -591,6 +597,7 @@	2604 @@ -628,6 +634,7 @@

2791	2605

2792 generalIteratorTest(*e, japaneseWordSelection);	2606 generalIteratorTest(*e, japaneseWordSelection);

2793 delete e;	2607 delete e;

2794 +#endif	2608 +#endif

2795 }	2609 }

2796	2610

2797 void RBBITest::TestTrieDict() {	2611 void RBBITest::TestTrieDict() {

2798 @@ -812,6 +819,372 @@	2612 @@ -849,6 +856,372 @@

2799 delete compact2;	2613 delete compact2;

2800 }	2614 }

2801	2615

2802 +/*TODO: delete later*/	2616 +/*TODO: delete later*/

2803 +inline void writeEnumerationToFile(StringEnumeration enumer, char filename){	2617 +inline void writeEnumerationToFile(StringEnumeration enumer, char filename){

2804 + UErrorCode status = U_ZERO_ERROR;	2618 + UErrorCode status = U_ZERO_ERROR;

2805 + FILE *outfile = fopen(filename,"w");	2619 + FILE *outfile = fopen(filename,"w");

2806 + UConverter *cvt = ucnv_open("UTF-8", &status);	2620 + UConverter *cvt = ucnv_open("UTF-8", &status);

2807 + if (U_FAILURE(status))	2621 + if (U_FAILURE(status))

2808 + return;	2622 + return;

(...skipping 352 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3161 + delete cloneEnum;	2975 + delete cloneEnum;

3162 + delete compact2;	2976 + delete compact2;

3163 + utext_close(originalText);	2977 + utext_close(originalText);

3164 + utext_close(cloneText);	2978 + utext_close(cloneText);

3165 +	2979 +

3166 +	2980 +

3167 +}	2981 +}

3168	2982

3169 //----------------------------------------------------------------------------	2983 //----------------------------------------------------------------------------

3170 //	2984 //

3171 @@ -1832,8 +2205,15 @@	2985 @@ -1870,8 +2243,15 @@

3172 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).	2986 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).

3173 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"	2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"

3174 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";	2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";

3175 +#if 0	2989 +#if 0

3176 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 };	2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 };

3177 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 };	2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 };

3178 +#endif	2992 +#endif

3179 +// There's no separate Japanese word break iterator. Root is the same as Japanese.	2993 +// There's no separate Japanese word break iterator. Root is the same as Japanese.

3180 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,	2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,

3181 +// U+3007, U+303B and some other cases.	2995 +// U+3007, U+303B and some other cases.

3182 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };	2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

3183 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };	2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

3184	2998

3185 // UBreakIteratorType UBRK_SENTENCE, Locale "el"	2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el"

3186 // Add break after Greek question mark (cldrbug #2069).	3000 // Add break after Greek question mark (cldrbug #2069).

3187 @@ -2580,6 +2960,8 @@	3001 @@ -2672,6 +3052,8 @@

3188 UnicodeSet *fNewlineSet;	3002 UnicodeSet *fNewlineSet;

3189 UnicodeSet *fKatakanaSet;	3003 UnicodeSet *fKatakanaSet;

3190 UnicodeSet *fALetterSet;	3004 UnicodeSet *fALetterSet;

3191 + // TODO(jungshik): Do we still need this change?	3005 + // TODO(jungshik): Do we still need this change?

3192 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt	3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt

3193 UnicodeSet *fMidNumLetSet;	3007 UnicodeSet *fMidNumLetSet;

3194 UnicodeSet *fMidLetterSet;	3008 UnicodeSet *fMidLetterSet;

3195 UnicodeSet *fMidNumSet;	3009 UnicodeSet *fMidNumSet;

3196 @@ -2588,6 +2970,7 @@	3010 @@ -2680,6 +3062,7 @@

3197 UnicodeSet *fOtherSet;	3011 UnicodeSet *fOtherSet;

3198 UnicodeSet *fExtendSet;	3012 UnicodeSet *fExtendSet;

3199 UnicodeSet *fExtendNumLetSet;	3013 UnicodeSet *fExtendNumLetSet;

3200 + UnicodeSet *fDictionaryCjkSet;	3014 + UnicodeSet *fDictionaryCjkSet;

3201	3015

3202 RegexMatcher *fMatcher;	3016 RegexMatcher *fMatcher;

3203	3017

3204 @@ -2604,12 +2987,24 @@	3018 @@ -2696,12 +3079,24 @@

3205 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);	3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);

3206 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);	3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);

3207 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);	3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);

3208 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);	3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

3209 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);	3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);

3210 + // Exclude Hangul syllables from ALetterSet during testing.	3024 + // Exclude Hangul syllables from ALetterSet during testing.

3211 + // Leave CJK dictionary characters out from the monkey tests!	3025 + // Leave CJK dictionary characters out from the monkey tests!

3212 +#if 0	3026 +#if 0

3213 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"	3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"

3214 + "[\\p{Line_Break = Complex_Context}"	3028 + "[\\p{Line_Break = Complex_Context}"

3215 + "-\\p{Grapheme_Cluster_Break = Extend}"	3029 + "-\\p{Grapheme_Cluster_Break = Extend}"

3216 + "-\\p{Grapheme_Cluster_Break = Control}"	3030 + "-\\p{Grapheme_Cluster_Break = Control}"

3217 + "]]",	3031 + "]]",

3218 + status);	3032 + status);

3219 +#endif	3033 +#endif

3220 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);	3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

3221 + fALetterSet->removeAll(*fDictionaryCjkSet);	3035 + fALetterSet->removeAll(*fDictionaryCjkSet);

3222 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);	3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);

3223 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);	3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);

3224 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);	3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);

3225 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);	3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);

3226 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);	3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);

3227 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);	3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);

3228 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);	3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);

3229 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);	3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);

3230 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);	3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);

3231 @@ -2633,13 +3028,14 @@	3045 @@ -2725,13 +3120,14 @@

3232 fOtherSet->removeAll(*fFormatSet);	3046 fOtherSet->removeAll(*fFormatSet);

3233 fOtherSet->removeAll(*fExtendSet);	3047 fOtherSet->removeAll(*fExtendSet);

3234 // Inhibit dictionary characters from being tested at all.	3048 // Inhibit dictionary characters from being tested at all.

3235 + fOtherSet->removeAll(*fDictionaryCjkSet);	3049 + fOtherSet->removeAll(*fDictionaryCjkSet);

3236 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status));	3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status));

3237	3051

3238 fSets->addElement(fCRSet, status);	3052 fSets->addElement(fCRSet, status);

3239 fSets->addElement(fLFSet, status);	3053 fSets->addElement(fLFSet, status);

3240 fSets->addElement(fNewlineSet, status);	3054 fSets->addElement(fNewlineSet, status);

3241 fSets->addElement(fALetterSet, status);	3055 fSets->addElement(fALetterSet, status);

3242 - fSets->addElement(fKatakanaSet, status);	3056 - fSets->addElement(fKatakanaSet, status);

3243 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana	3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana

3244 fSets->addElement(fMidLetterSet, status);	3058 fSets->addElement(fMidLetterSet, status);

3245 fSets->addElement(fMidNumLetSet, status);	3059 fSets->addElement(fMidNumLetSet, status);

3246 fSets->addElement(fMidNumSet, status);	3060 fSets->addElement(fMidNumSet, status);

3247 @@ -3871,6 +4267,7 @@	3061 @@ -3978,6 +4374,7 @@

3248 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {	3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

3249 count --;	3063 count --;

3250 if (forward[count] != i) {	3064 if (forward[count] != i) {

3251 + printStringBreaks(ustr, expected, expectedcount);	3065 + printStringBreaks(ustr, expected, expectedcount);

3252 test->errln("happy break test previous() failed: expected %d but go t %d",	3066 test->errln("happy break test previous() failed: expected %d but go t %d",

3253 forward[count], i);	3067 forward[count], i);

3254 break;	3068 break;

3255 @@ -3904,23 +4301,25 @@	3069 @@ -4011,23 +4408,25 @@

3256 UErrorCode status = U_ZERO_ERROR;	3070 UErrorCode status = U_ZERO_ERROR;

3257 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);	3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);

3258 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);	3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

3259 + // Replaced any C+J characters in a row with a random sequence of characters	3073 + // Replaced any C+J characters in a row with a random sequence of characters

3260 + // of the same length to make our C+J segmentation not get in the way.	3074 + // of the same length to make our C+J segmentation not get in the way.

3261 static const char *strlist[] =	3075 static const char *strlist[] =

3262 {	3076 {

3263 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",	3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",

3264 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b",	3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b",

3265 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b",	3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b",

3266 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a",	3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a",

3267 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",	3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",

3268 - "\\u90ca\\u3588\\u009c\\u0953\\u194b",	3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b",

3269 + "\\uac00\\u3588\\u009c\\u0953\\u194b",	3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b",

3270 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",	3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

3271 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ",	3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ",

3272 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",	3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",

3273 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",	3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",

3274 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",	3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

3275 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",	3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

3276 "\\u2027\\U000e0067\\u0a47\\u00b7",	3090 "\\u2027\\U000e0067\\u0a47\\u00b7",

3277 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",	3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

3278 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",	3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

3279 "\\u0589\\U000e006e\\u0a42\\U000104a5",	3093 "\\u0589\\U000e006e\\u0a42\\U000104a5",

3280 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",	3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

3281 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",	3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",

3282 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",	3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

3283 "\\u0027\\u11af\\U000e0057\\u0602",	3097 "\\u0027\\u11af\\U000e0057\\u0602",

3284 "\\U0001d7f2\\U000e007\\u0004\\u0589",	3098 "\\U0001d7f2\\U000e007\\u0004\\u0589",

3285 @@ -3932,7 +4331,7 @@	3099 @@ -4039,7 +4438,7 @@

3286 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",	3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3287 "\\u0233\\U000e0020\\u0a69\\u0d6a",	3101 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3288 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",	3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3289 - "\\u58f4\\U000e0049\\u20e7\\u2027",	3103 - "\\u58f4\\U000e0049\\u20e7\\u2027",

3290 + "\\u18f4\\U000e0049\\u20e7\\u2027",	3104 + "\\u18f4\\U000e0049\\u20e7\\u2027",

3291 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",	3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3292 "\\ua183\\u102d\\u0bec\\u003a",	3106 "\\ua183\\u102d\\u0bec\\u003a",

3293 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",	3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3294 @@ -3942,7 +4341,7 @@	3108 @@ -4049,7 +4448,7 @@

3295 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",	3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",

3296 "\\u003a\\u0664\\u00b7\\u1fba",	3110 "\\u003a\\u0664\\u00b7\\u1fba",

3297 "\\u003b\\u0027\\u00b7\\u47a3",	3111 "\\u003b\\u0027\\u00b7\\u47a3",

3298 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",	3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",

3299 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",	3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",

3300 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673",	3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673",

3301 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",	3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

3302 };	3116 };

3303 @@ -3997,12 +4396,12 @@	3117 @@ -4104,12 +4503,12 @@

3304 "\\U0001d7f2\\U000e007d\\u0004\\u0589",	3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

3305 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",	3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

3306 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",	3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

3307 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",	3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

3308 + "\\U000e0065\\u302c\\u09ee\\U000e0068",	3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068",

3309 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",	3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3310 "\\u0233\\U000e0020\\u0a69\\u0d6a",	3124 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3311 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",	3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3312 "\\u58f4\\U000e0049\\u20e7\\u2027",	3126 "\\u58f4\\U000e0049\\u20e7\\u2027",

3313 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",	3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3314 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",	3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3315 "\\ua183\\u102d\\u0bec\\u003a",	3129 "\\ua183\\u102d\\u0bec\\u003a",

3316 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",	3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3317 "\\u003a\\u0e57\\u0fad\\u002e",	3131 "\\u003a\\u0e57\\u0fad\\u002e",

3318 --- source/test/intltest/rbbitst.h» 2009-04-22 00:53:50.000000000 -0700	3132 --- source/test/intltest/rbbitst.h» 2010-07-22 17:15:37.000000000 -0700

3319 +++ source/test/intltest/rbbitst.h» 2009-07-27 13:01:17.767342000 -0700	3133 +++ source/test/intltest/rbbitst.h» 2011-01-21 14:12:45.152007000 -0800

3320 @@ -70,6 +70,7 @@	3134 @@ -70,6 +70,7 @@

3321 void TestBug5775();	3135 void TestBug5775();

3322 void TestThaiBreaks();	3136 void TestThaiBreaks();

3323 void TestTailoredBreaks();	3137 void TestTailoredBreaks();

3324 + void TestTrieDictWithValue();	3138 + void TestTrieDictWithValue();

	3139 void TestDictRules();

	3140 void TestBug5532();

3325	3141

3326 void TestDebug();	3142 --- source/test/testdata/rbbitst.txt» 2010-07-28 17:18:28.000000000 -0700

3327	3143 +++ source/test/testdata/rbbitst.txt» 2011-01-21 14:12:45.221011000 -0800

3328 --- source/test/testdata/rbbitst.txt» 2009-06-24 14:06:38.000000000 -0700	3144 @@ -161,7 +161,23 @@

3329 +++ source/test/testdata/rbbitst.txt» 2009-07-29 12:56:31.483710000 -0700

3330 @@ -162,7 +162,23 @@

3331 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>	3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>

3332	3146

3333 # Hiragana & Katakana stay together, but separates from each other and Latin.	3147 # Hiragana & Katakana stay together, but separates from each other and Latin.

3334 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>	3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>

3335 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent	3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent

3336 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data>	3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data>

3337 +	3151 +

3338 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth	3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth

3339 +<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>	3153 +<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>

3340 +	3154 +

3341 +# more Japanese tests	3155 +# more Japanese tests

3342 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana	3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana

3343 +# and the Katakana block are not treated correctly. Enable this later.	3157 +# and the Katakana block are not treated correctly. Enable this later.

3344 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>	3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

3345 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>	3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

3346 +	3160 +

3347 +# Testing of word boundary for dictionary word containing both kanji and kana	3161 +# Testing of word boundary for dictionary word containing both kanji and kana

3348 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>	3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>

3349 +	3163 +

3350 +# Testing of Chinese segmentation (taken from a Chinese news article)	3164 +# Testing of Chinese segmentation (taken from a Chinese news article)

3351 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•，•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>，•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata>	3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•，•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>，•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata>

3352	3166

3353 # Words with interior formatting characters	3167 # Words with interior formatting characters

3354 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a>	3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a>

3355 @@ -170,6 +186,8 @@	3169 @@ -169,6 +185,8 @@

3356 # to test for bug #4097779	3170 # to test for bug #4097779

3357 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>	3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>

3358	3172

3359 +# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts	3173 +# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts

3360 +<data>•ＩＳＮ'Ｔ<200> •１９<100>日<400></data>	3174 +<data>•ＩＳＮ'Ｔ<200> •１９<100>日<400></data>

3361	3175

3362 # to test for bug #4098467	3176 # to test for bug #4098467

3363 # What follows is a string of Korean characters (I found it in the Yellow Pages	3177 # What follows is a string of Korean characters (I found it in the Yellow Pages

3364 @@ -179,9 +197,15 @@	3178 @@ -178,9 +196,15 @@

3365 # precomposed syllables...	3179 # precomposed syllables...

3366 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>	3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>

3367	3181

3368 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>	3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>

3369 +# more Korean tests (Jamo not tested here, not counted as dictionary characters)	3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters)

3370 +# Disable them now because we don't include a Korean dictionary.	3184 +# Disable them now because we don't include a Korean dictionary.

3371 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>	3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>

3372 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>	3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>

3373 +	3187 +

3374 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta>	3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta>

3375 +	3189 +

3376 +<data>•\u06c9<200>\uc799<200>\ufffa•</data>	3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data>

3377	3191

3378 -<data>•\u06c9\uc799\ufffa<200></data>	3192 -<data>•\u06c9\uc799\ufffa<200></data>

3379	3193

3380 #	3194 #

3381 # Try some words from other scripts.	3195 # Try some words from other scripts.

3382 @@ -492,8 +516,7 @@	3196 @@ -491,8 +515,7 @@

3383 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data>	3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data>

3384	3198

3385 # conjoining jamo...	3199 # conjoining jamo...

3386 -# TODO: rules update needed	3200 -# TODO: rules update needed

3387 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data>	3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data>

3388 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data>	3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data>

3389	3203

3390 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd	3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd

3391 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>	3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>

3392 --- source/test/testdata/testaliases.txt» 2009-06-24 14:06:38.000000000 -0 700	3206 --- source/test/testdata/testaliases.txt» 2009-11-12 13:53:42.000000000 -0 800

3393 +++ source/test/testdata/testaliases.txt» 2009-07-28 17:07:26.251120000 -0 700	3207 +++ source/test/testdata/testaliases.txt» 2011-01-21 14:12:45.204005000 -0 800

3394 @@ -28,7 +28,7 @@	3208 @@ -28,7 +28,7 @@

3395 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }	3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }

3396	3210

3397 // aliasing using position	3211 // aliasing using position

3398 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle	3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle

3399 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle	3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle

3400	3214

3401 // aliasing arrays	3215 // aliasing arrays

3402 zoneTests {	3216 zoneTests {

3403 --- source/tools/genctd/genctd.cpp» 2006-09-04 09:28:24.000000000 -0700	3217 --- source/tools/genctd/genctd.cpp» 2009-08-04 14:09:17.000000000 -0700

3404 +++ source/tools/genctd/genctd.cpp» 2009-07-27 13:01:17.776335000 -0700	3218 +++ source/tools/genctd/genctd.cpp» 2011-01-21 14:12:45.564923000 -0800

3405 @@ -1,6 +1,6 @@	3219 @@ -1,6 +1,6 @@

3406 /*	3220 /*

3407 **********************************************************************	3221 **********************************************************************

3408 -* Copyright (C) 2002-2006, International Business Machines	3222 -* Copyright (C) 2002-2009, International Business Machines

3409 +* Copyright (C) 2002-2006,2008, International Business Machines	3223 +* Copyright (C) 2002-2010, International Business Machines

3410 * Corporation and others. All Rights Reserved.	3224 * Corporation and others. All Rights Reserved.

3411 **********************************************************************	3225 **********************************************************************

3412 *	3226 *

3413 @@ -34,12 +34,15 @@	3227 @@ -34,12 +34,15 @@

3414 #include "unicode/udata.h"	3228 #include "unicode/udata.h"

3415 #include "unicode/putil.h"	3229 #include "unicode/putil.h"

3416	3230

3417 +//#include "unicode/ustdio.h"	3231 +//#include "unicode/ustdio.h"

3418 +	3232 +

3419 #include "uoptions.h"	3233 #include "uoptions.h"

3420 #include "unewdata.h"	3234 #include "unewdata.h"

3421 #include "ucmndata.h"	3235 #include "ucmndata.h"

3422 #include "rbbidata.h"	3236 #include "rbbidata.h"

3423 #include "triedict.h"	3237 #include "triedict.h"

3424 #include "cmemory.h"	3238 #include "cmemory.h"

3425 +#include "uassert.h"	3239 +#include "uassert.h"

3426	3240

3427 #include <stdio.h>	3241 #include <stdio.h>

3428 #include <stdlib.h>	3242 #include <stdlib.h>

3429 @@ -198,147 +201,191 @@	3243 @@ -199,147 +202,191 @@

3430 long wordFileSize;	3244 long wordFileSize;

3431 FILE *file;	3245 FILE *file;

3432 char *wordBufferC;	3246 char *wordBufferC;

3433 -	3247 -

3434 + MutableTrieDictionary *mtd = NULL;	3248 + MutableTrieDictionary *mtd = NULL;

3435 +	3249 +

3436 file = fopen(wordFileName, "rb");	3250 file = fopen(wordFileName, "rb");

3437 - if( file == 0 ) {	3251 - if( file == 0 ) {

3438 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);	3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);

3439 - exit(-1);	3253 - exit(-1);

(...skipping 301 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3741 + // Get rid of the Unicode text buffer	3555 + // Get rid of the Unicode text buffer

3742 + delete[] wordSourceU;	3556 + delete[] wordSourceU;

3743 }	3557 }

3744	3558

3745 - // Get rid of the Unicode text buffer	3559 - // Get rid of the Unicode text buffer

3746 - delete[] wordSourceU;	3560 - delete[] wordSourceU;

3747 -	3561 -

3748 // Now, create a CompactTrieDictionary from the mutable dictionary	3562 // Now, create a CompactTrieDictionary from the mutable dictionary

3749 CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);	3563 CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);

3750 if (U_FAILURE(status)) {	3564 if (U_FAILURE(status)) {

3751 @@ -392,4 +439,3 @@	3565 @@ -393,4 +440,3 @@

3752	3566

3753 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */	3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

3754 }	3568 }

3755 -	3569 -

3756 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800	3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800

3757 +++ source/tools/genctd/Makefile.in» 2009-07-27 13:01:17.782326000 -0700	3571 +++ source/tools/genctd/Makefile.in» 2011-01-21 14:12:45.555920000 -0800

3758 @@ -23,13 +23,13 @@	3572 @@ -23,13 +23,13 @@

3759 ## Extra files to remove for 'make clean'	3573 ## Extra files to remove for 'make clean'

3760 CLEANFILES = *~ $(DEPS) $(MAN_FILES)	3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES)

3761	3575

3762 -## Target information	3576 -## Target information

3763 +## Target informationcd	3577 +## Target informationcd

3764 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)	3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)

3765	3579

3766 ifneq ($(top_builddir),$(top_srcdir))	3580 ifneq ($(top_builddir),$(top_srcdir))

3767 CPPFLAGS += -I$(top_builddir)/common	3581 CPPFLAGS += -I$(top_builddir)/common

3768 endif	3582 endif

3769 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil	3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil

3770 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n	3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n

3771 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)	3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

3772	3586

3773 OBJECTS = genctd.o	3587 OBJECTS = genctd.o

3774 --- source/data/Makefile.in 2009-05-20 23:03:54.000000000 -0700

3775 +++ source/data/Makefile.in 2009-10-21 15:43:18.235201000 -0700

3776 @@ -452,8 +452,9 @@

3777 #################################################### CTD

3778 # CTD FILES

3779

3780 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)

3781 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

3782 +# .ctd file now generated regardless of whether dictionary file exists

3783 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

3784 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt

3785

3786 #################################################### CFU

3787 # CFU FILES

OLD	NEW

« no previous file with comments | « no previous file | icu46/source/common/brkeng.cpp » ('j') | no next file with comments »