icu52/patches/segmentation.patch - Issue 224943002: icu local change part1

Unified Diff: icu52/patches/segmentation.patch

Issue 224943002: icu local change part1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/

Patch Set: function indentation changed Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: icu52/patches/segmentation.patch

===================================================================

--- icu52/patches/segmentation.patch (revision 261238)

+++ icu52/patches/segmentation.patch (working copy)

@@ -1,3587 +0,0 @@

---- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800

-+++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800

-@@ -226,6 +226,30 @@

- case USCRIPT_THAI:

- engine = new ThaiBreakEngine(dict, status);

- break;

-+ case USCRIPT_HANGUL:

-+ engine = new CjkBreakEngine(dict, kKorean, status);

-+ break;

-+ // use same BreakEngine and dictionary for both Chinese and Japanese

-+ case USCRIPT_HIRAGANA:

-+ case USCRIPT_KATAKANA:

-+ case USCRIPT_HAN:

-+ engine = new CjkBreakEngine(dict, kChineseJapanese, status);

-+ break;

-+#if 0

-+ // TODO: Have to get some characters with script=common handled

-+ // by CjkBreakEngine (e.g. U+309B). Simply subjecting

-+ // them to CjkBreakEngine does not work. The engine has to

-+ // special-case them.

-+ case USCRIPT_COMMON:

-+ {

-+ UBlockCode block = ublock_getCode(code);

-+ if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)

-+ engine = new CjkBreakEngine(dict, kChineseJapanese, status);

-+ break;

-+ }

-+#endif

- default:

- break;

- }

-@@ -281,6 +305,13 @@

- dict = NULL;

- }

- return dict;

-+ } else if (dictfname != NULL){

-+ //create dummy dict if dictionary filename not valid

-+ UChar c = 0x0020;

-+ status = U_ZERO_ERROR;

-+ MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);

-+ mtd->addWord(&c, 1, status, 1);

-+ return new CompactTrieDictionary(*mtd, status);

- }

- return NULL;

- }

---- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700

-+++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800

-@@ -16,6 +16,9 @@

- #include "unicode/ubrk.h"

- #include "uvector.h"

- #include "triedict.h"

-+#include "uassert.h"

-+#include "unicode/normlzr.h"

-+#include "cmemory.h"

- U_NAMESPACE_BEGIN

-@@ -422,6 +425,294 @@

- return wordsFound;

- }

-+/*

-+ ******************************************************************

-+ * CjkBreakEngine

-+ */

-+static const uint32_t kuint32max = 0xFFFFFFFF;

-+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)

-+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){

-+ if (!adoptDictionary->getValued()) {

-+ status = U_ILLEGAL_ARGUMENT_ERROR;

-+ return;

-+ }

-+ // Korean dictionary only includes Hangul syllables

-+ fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);

-+ fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);

-+ fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);

-+ fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

-+ if (U_SUCCESS(status)) {

-+ // handle Korean and Japanese/Chinese using different dictionaries

-+ if (type == kKorean) {

-+ setCharacters(fHangulWordSet);

-+ } else { //Chinese and Japanese

-+ UnicodeSet cjSet;

-+ cjSet.addAll(fHanWordSet);

-+ cjSet.addAll(fKatakanaWordSet);

-+ cjSet.addAll(fHiraganaWordSet);

-+ cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));

-+ setCharacters(cjSet);

-+ }

-+}

-+CjkBreakEngine::~CjkBreakEngine(){

-+ delete fDictionary;

-+}

-+// The katakanaCost values below are based on the length frequencies of all

-+// katakana phrases in the dictionary

-+static const int kMaxKatakanaLength = 8;

-+static const int kMaxKatakanaGroupLength = 20;

-+static const uint32_t maxSnlp = 255;

-+static inline uint32_t getKatakanaCost(int wordLength){

-+ //TODO: fill array with actual values from dictionary!

-+ static const uint32_t katakanaCost[kMaxKatakanaLength + 1]

-+ = {8192, 984, 408, 240, 204, 252, 300, 372, 480};

-+ return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];

-+}

-+static inline bool isKatakana(uint16_t value) {

-+ return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||

-+ (value >= 0xFF66u && value <= 0xFF9fu);

-+}

-+// A very simple helper class to streamline the buffer handling in

-+// divideUpDictionaryRange.

-+template<class T, size_t N>

-+class AutoBuffer {

-+ public:

-+ AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {

-+ if (size > N) {

-+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));

-+ capacity = size;

-+ }

-+ ~AutoBuffer() {

-+ if (buffer != stackBuffer)

-+ uprv_free(buffer);

-+ }

-+#if 0

-+ T* operator& () {

-+ return buffer;

-+ }

-+#endif

-+ T* elems() {

-+ return buffer;

-+ }

-+ const T& operator[] (size_t i) const {

-+ return buffer[i];

-+ }

-+ T& operator[] (size_t i) {

-+ return buffer[i];

-+ }

-+ // resize without copy

-+ void resize(size_t size) {

-+ if (size <= capacity)

-+ return;

-+ if (buffer != stackBuffer)

-+ uprv_free(buffer);

-+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));

-+ capacity = size;

-+ }

-+ private:

-+ T stackBuffer[N];

-+ T* buffer;

-+ AutoBuffer();

-+ size_t capacity;

-+};

-+/*

-+ * @param text A UText representing the text

-+ * @param rangeStart The start of the range of dictionary characters

-+ * @param rangeEnd The end of the range of dictionary characters

-+ * @param foundBreaks Output of C array of int32_t break positions, or 0

-+ * @return The number of breaks found

-+ */

-+int32_t

-+CjkBreakEngine::divideUpDictionaryRange( UText *text,

-+ int32_t rangeStart,

-+ int32_t rangeEnd,

-+ UStack &foundBreaks ) const {

-+ if (rangeStart >= rangeEnd) {

-+ return 0;

-+ }

-+ const size_t defaultInputLength = 80;

-+ size_t inputLength = rangeEnd - rangeStart;

-+ AutoBuffer<UChar, defaultInputLength> charString(inputLength);

-+ // Normalize the input string and put it in normalizedText.

-+ // The map from the indices of the normalized input to the raw

-+ // input is kept in charPositions.

-+ UErrorCode status = U_ZERO_ERROR;

-+ utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);

-+ if (U_FAILURE(status))

-+ return 0;

-+ UnicodeString inputString(charString.elems(), inputLength);

-+ UNormalizationMode norm_mode = UNORM_NFKC;

-+ UBool isNormalized =

-+ Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||

-+ Normalizer::isNormalized(inputString, norm_mode, status);

-+ AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);

-+ int numChars = 0;

-+ UText normalizedText = UTEXT_INITIALIZER;

-+ // Needs to be declared here because normalizedText holds onto its buffer.

-+ UnicodeString normalizedString;

-+ if (isNormalized) {

-+ int32_t index = 0;

-+ charPositions[0] = 0;

-+ while(index < inputString.length()) {

-+ index = inputString.moveIndex32(index, 1);

-+ charPositions[++numChars] = index;

-+ }

-+ utext_openUnicodeString(&normalizedText, &inputString, &status);

-+ }

-+ else {

-+ Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);

-+ if (U_FAILURE(status))

-+ return 0;

-+ charPositions.resize(normalizedString.length() + 1);

-+ Normalizer normalizer(charString.elems(), inputLength, norm_mode);

-+ int32_t index = 0;

-+ charPositions[0] = 0;

-+ while(index < normalizer.endIndex()){

-+ UChar32 uc = normalizer.next();

-+ charPositions[++numChars] = index = normalizer.getIndex();

-+ }

-+ utext_openUnicodeString(&normalizedText, &normalizedString, &status);

-+ }

-+ if (U_FAILURE(status))

-+ return 0;

-+ // From this point on, all the indices refer to the indices of

-+ // the normalized input string.

-+ // bestSnlp[i] is the snlp of the best segmentation of the first i

-+ // characters in the range to be matched.

-+ AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);

-+ bestSnlp[0] = 0;

-+ for(int i=1; i<=numChars; i++){

-+ bestSnlp[i] = kuint32max;

-+ }

-+ // prev[i] is the index of the last CJK character in the previous word in

-+ // the best segmentation of the first i characters.

-+ AutoBuffer<int, defaultInputLength> prev(numChars + 1);

-+ for(int i=0; i<=numChars; i++){

-+ prev[i] = -1;

-+ }

-+ const size_t maxWordSize = 20;

-+ AutoBuffer<uint16_t, maxWordSize> values(numChars);

-+ AutoBuffer<int32_t, maxWordSize> lengths(numChars);

-+ // Dynamic programming to find the best segmentation.

-+ bool is_prev_katakana = false;

-+ for (int i = 0; i < numChars; ++i) {

-+ //utext_setNativeIndex(text, rangeStart + i);

-+ utext_setNativeIndex(&normalizedText, i);

-+ if (bestSnlp[i] == kuint32max)

-+ continue;

-+ int count;

-+ // limit maximum word length matched to size of current substring

-+ int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;

-+ fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

-+ // if there are no single character matches found in the dictionary

-+ // starting with this character, treat character as a 1-character word

-+ // with the highest value possible, i.e. the least likely to occur.

-+ // Exclude Korean characters from this treatment, as they should be left

-+ // together by default.

-+ if((count == 0 || lengths[0] != 1) &&

-+ !fHangulWordSet.contains(utext_current32(&normalizedText))){

-+ values[count] = maxSnlp;

-+ lengths[count++] = 1;

-+ }

-+ for (int j = 0; j < count; j++){

-+ //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);

-+ uint32_t newSnlp = bestSnlp[i] + values[j];

-+ if (newSnlp < bestSnlp[lengths[j] + i]) {

-+ bestSnlp[lengths[j] + i] = newSnlp;

-+ prev[lengths[j] + i] = i;

-+ }

-+ // In Japanese,

-+ // a single-character Katakana word is pretty rare. So we apply

-+ // the following heuristic to Katakana: any continuous run of Katakana

-+ // characters is considered a candidate word with a default cost

-+ // specified in the katakanaCost table according to its length.

-+ //utext_setNativeIndex(text, rangeStart + i);

-+ utext_setNativeIndex(&normalizedText, i);

-+ bool is_katakana = isKatakana(utext_current32(&normalizedText));

-+ if (!is_prev_katakana && is_katakana) {

-+ int j = i + 1;

-+ utext_next32(&normalizedText);

-+ // Find the end of the continuous run of Katakana characters

-+ while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&

-+ isKatakana(utext_current32(&normalizedText))) {

-+ utext_next32(&normalizedText);

-+ ++j;

-+ }

-+ if ((j - i) < kMaxKatakanaGroupLength) {

-+ uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);

-+ if (newSnlp < bestSnlp[j]) {

-+ bestSnlp[j] = newSnlp;

-+ prev[j] = i;

-+ }

-+ is_prev_katakana = is_katakana;

-+ }

-+ // Start pushing the optimal offset index into t_boundary (t for tentative).

-+ // prev[numChars] is guaranteed to be meaningful.

-+ // We'll first push in the reverse order, i.e.,

-+ // t_boundary[0] = numChars, and afterwards do a swap.

-+ AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

-+ int numBreaks = 0;

-+ // No segmentation found, set boundary to end of range

-+ if (bestSnlp[numChars] == kuint32max) {

-+ t_boundary[numBreaks++] = numChars;

-+ } else {

-+ for (int i = numChars; i > 0; i = prev[i]){

-+ t_boundary[numBreaks++] = i;

-+ }

-+ U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);

-+ }

-+ // Reverse offset index in t_boundary.

-+ // Don't add a break for the start of the dictionary range if there is one

-+ // there already.

-+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {

-+ t_boundary[numBreaks++] = 0;

-+ }

-+ // Now that we're done, convert positions in t_boundary[] (indices in

-+ // the normalized input string) back to indices in the raw input string

-+ // while reversing t_boundary and pushing values to foundBreaks.

-+ for (int i = numBreaks-1; i >= 0; i--) {

-+ foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);

-+ }

-+ utext_close(&normalizedText);

-+ return numBreaks;

-+}

- U_NAMESPACE_END

- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

---- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700

-+++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800

-@@ -1,8 +1,8 @@

- /**

-- *******************************************************************************

-+ **********************************************************************************

- */

- #ifndef DICTBE_H

-@@ -65,31 +65,31 @@

- */

- virtual ~DictionaryBreakEngine();

-- /**

-- * Indicate whether this engine handles a particular character for

-- * a particular kind of break.

-- *

-- * @param c A character which begins a run that the engine might handle

-- * @param breakType The type of text break which the caller wants to determine

-- * @return TRUE if this engine handles the particular character and break

-- * type.

-- */

-+ /**

-+ * Indicate whether this engine handles a particular character for

-+ * a particular kind of break.

-+ *

-+ * @param c A character which begins a run that the engine might handle

-+ * @param breakType The type of text break which the caller wants to determine

-+ * @return TRUE if this engine handles the particular character and break

-+ * type.

-+ */

- virtual UBool handles( UChar32 c, int32_t breakType ) const;

-- /**

-- * Find any breaks within a run in the supplied text.

-- *

-- * @param text A UText representing the text. The

-- * iterator is left at the end of the run of characters which the engine

-- * is capable of handling.

-- * @param startPos The start of the run within the supplied text.

-- * @param endPos The end of the run within the supplied text.

-- * @param reverse Whether the caller is looking for breaks in a reverse

-- * direction.

-- * @param breakType The type of break desired, or -1.

-- * @param foundBreaks An allocated C array of the breaks found, if any

-- * @return The number of breaks found.

-- */

-+ /**

-+ * Find any breaks within a run in the supplied text.

-+ *

-+ * @param text A UText representing the text. The iterator is left at

-+ * the end of the run of characters which the engine is capable of handling

-+ * that starts from the first (or last) character in the range.

-+ * @param startPos The start of the run within the supplied text.

-+ * @param endPos The end of the run within the supplied text.

-+ * @param reverse Whether the caller is looking for breaks in a reverse

-+ * direction.

-+ * @param breakType The type of break desired, or -1.

-+ * @param foundBreaks An allocated C array of the breaks found, if any

-+ * @return The number of breaks found.

-+ */

- virtual int32_t findBreaks( UText *text,

- int32_t startPos,

- int32_t endPos,

-@@ -114,7 +114,7 @@

- // virtual void setBreakTypes( uint32_t breakTypes );

- /**

-- * Divide up a range of known dictionary characters.

-+ * Divide up a range of known dictionary characters handled by this break engine.

- *

- * @param text A UText representing the text

- * @param rangeStart The start of the range of dictionary characters

-@@ -171,7 +171,7 @@

- protected:

- /**

-- * Divide up a range of known dictionary characters.

-+ * Divide up a range of known dictionary characters handled by this break engine.

- *

- * @param text A UText representing the text

- * @param rangeStart The start of the range of dictionary characters

-@@ -186,6 +186,66 @@

- };

-+/*******************************************************************

-+ * CjkBreakEngine

-+ */

-+//indicates language/script that the CjkBreakEngine will handle

-+enum LanguageType {

-+ kKorean,

-+ kChineseJapanese

-+};

-+/**

-+ * CjkBreakEngine is a kind of DictionaryBreakEngine that uses a

-+ * TrieWordDictionary with costs associated with each word and

-+ * Viterbi decoding to determine CJK-specific breaks.

-+ */

-+class CjkBreakEngine : public DictionaryBreakEngine {

-+ protected:

-+ /**

-+ * The set of characters handled by this engine

-+ * @internal

-+ */

-+ UnicodeSet fHangulWordSet;

-+ UnicodeSet fHanWordSet;

-+ UnicodeSet fKatakanaWordSet;

-+ UnicodeSet fHiraganaWordSet;

-+ const TrieWordDictionary *fDictionary;

-+ public:

-+ /**

-+ * Default constructor.

-+ *

-+ * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the

-+ * engine is deleted. The TrieWordDictionary must contain costs for each word

-+ * in order for the dictionary to work properly.

-+ */

-+ CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);

-+ /**

-+ * Virtual destructor.

-+ */

-+ virtual ~CjkBreakEngine();

-+ protected:

-+ /**

-+ * Divide up a range of known dictionary characters handled by this break engine.

-+ *

-+ * @param text A UText representing the text

-+ * @param rangeStart The start of the range of dictionary characters

-+ * @param rangeEnd The end of the range of dictionary characters

-+ * @param foundBreaks Output of C array of int32_t break positions, or 0

-+ * @return The number of breaks found

-+ */

-+ virtual int32_t divideUpDictionaryRange( UText *text,

-+ int32_t rangeStart,

-+ int32_t rangeEnd,

-+ UStack &foundBreaks ) const;

-+};

- U_NAMESPACE_END

---- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700

-+++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800

-@@ -1555,10 +1555,12 @@

- int32_t endPos,

- UBool reverse) {

- // Reset the old break cache first.

-- uint32_t dictionaryCount = fDictionaryCharCount;

- reset();

-- if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {

-+ // note: code segment below assumes that dictionary chars are in the

-+ // startPos-endPos range

-+ // value returned should be next character in sequence

-+ if ((endPos - startPos) <= 1) {

- return (reverse ? startPos : endPos);

- }

-@@ -1711,7 +1713,7 @@

- // proposed break by one of the breaks we found. Use following() and

- // preceding() to do the work. They should never recurse in this case.

- if (reverse) {

-- return preceding(endPos - 1);

-+ return preceding(endPos);

- }

- else {

- return following(startPos);

---- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800

-+++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800

-@@ -20,6 +20,7 @@

- #include "uvector.h"

- #include "uvectr32.h"

- #include "uarrsort.h"

-+#include "hash.h"

- //#define DEBUG_TRIE_DICT 1

-@@ -27,6 +28,11 @@

- #include <sys/times.h>

- #include <limits.h>

- #include <stdio.h>

-+#include <time.h>

-+#ifndef CLK_TCK

-+#define CLK_TCK CLOCKS_PER_SEC

-+#endif

- #endif

- U_NAMESPACE_BEGIN

-@@ -45,6 +51,11 @@

- * MutableTrieDictionary

- */

-+//#define MAX_VALUE 65535

-+// forward declaration

-+inline uint16_t scaleLogProbabilities(double logprob);

- // Node structure for the ternary, uncompressed trie

- struct TernaryNode : public UMemory {

- UChar ch; // UTF-16 code unit

-@@ -77,7 +88,8 @@

- delete high;

- }

--MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {

-+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,

-+ UBool containsValue /* = FALSE */ ) {

- // Start the trie off with something. Having the root node already present

- // cuts a special case out of the search/insertion functions.

- // Making it a median character cuts the worse case for searches from

-@@ -91,14 +103,19 @@

- if (U_SUCCESS(status) && fIter == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- }

-+ fValued = containsValue;

- }

--MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {

-+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,

-+ UBool containsValue /* = false */ ) {

- fTrie = NULL;

- fIter = utext_openUChars(NULL, NULL, 0, &status);

- if (U_SUCCESS(status) && fIter == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- }

-+ fValued = containsValue;

- }

- MutableTrieDictionary::~MutableTrieDictionary() {

-@@ -108,12 +125,13 @@

- int32_t

- MutableTrieDictionary::search( UText *text,

-- int32_t maxLength,

-- int32_t *lengths,

-- int &count,

-- int limit,

-- TernaryNode *&parent,

-- UBool &pMatched ) const {

-+ int32_t maxLength,

-+ int32_t *lengths,

-+ int &count,

-+ int limit,

-+ TernaryNode *&parent,

-+ UBool &pMatched,

-+ uint16_t *values /*=NULL*/) const {

- // TODO: current implementation works in UTF-16 space

- const TernaryNode *up = NULL;

- const TernaryNode *p = fTrie;

-@@ -121,6 +139,10 @@

- pMatched = TRUE;

- int i;

-+ if (!fValued) {

-+ values = NULL;

-+ }

- UChar uc = utext_current32(text);

- for (i = 0; i < maxLength && p != NULL; ++i) {

- while (p != NULL) {

-@@ -141,7 +163,11 @@

- break;

- }

- // Must be equal to get here

-- if (limit > 0 && (p->flags & kEndsWord)) {

-+ if (limit > 0 && (p->flags > 0)) {

-+ //is there a more efficient way to add values? ie. remove if stmt

-+ if(values != NULL) {

-+ values[mycount] = p->flags;

-+ }

- lengths[mycount++] = i+1;

- --limit;

- }

-@@ -161,13 +187,14 @@

- void

- MutableTrieDictionary::addWord( const UChar *word,

- int32_t length,

-- UErrorCode &status ) {

--#if 0

-- if (length <= 0) {

-+ UErrorCode &status,

-+ uint16_t value /* = 0 */ ) {

-+ // dictionary cannot store zero values, would interfere with flags

-+ if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {

- status = U_ILLEGAL_ARGUMENT_ERROR;

- return;

- }

--#endif

- TernaryNode *parent;

- UBool pMatched;

- int count;

-@@ -177,7 +204,7 @@

- matched = search(fIter, length, NULL, count, 0, parent, pMatched);

- while (matched++ < length) {

-- UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?

-+ UChar32 uc = utext_next32(fIter); // TODO: supplementary support?

- U_ASSERT(uc != U_SENTINEL);

- TernaryNode *newNode = new TernaryNode(uc);

- if (newNode == NULL) {

-@@ -199,30 +226,23 @@

- parent = newNode;

- }

-- parent->flags |= kEndsWord;

--}

--#if 0

--void

--MutableTrieDictionary::addWords( UEnumeration *words,

-- UErrorCode &status ) {

-- int32_t length;

-- const UChar *word;

-- while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {

-- addWord(word, length, status);

-+ if(fValued && value > 0){

-+ parent->flags = value;

-+ } else {

-+ parent->flags |= kEndsWord;

- }

--#endif

- int32_t

- MutableTrieDictionary::matches( UText *text,

- int32_t maxLength,

- int32_t *lengths,

- int &count,

-- int limit ) const {

-+ int limit,

-+ uint16_t *values /*=NULL*/) const {

- TernaryNode *parent;

- UBool pMatched;

-- return search(text, maxLength, lengths, count, limit, parent, pMatched);

-+ return search(text, maxLength, lengths, count, limit, parent, pMatched, values);

- }

- // Implementation of iteration for MutableTrieDictionary

-@@ -277,7 +297,7 @@

- break;

- }

- case kEqual:

-- emit = (node->flags & kEndsWord) != 0;

-+ emit = node->flags > 0;

- equal = (node->equal != NULL);

- // If this node should be part of the next emitted string, append

- // the UChar to the string, and make sure we pop it when we come

-@@ -299,7 +319,7 @@

- }

- case kGreaterThan:

- // If this node's character is in the string, remove it.

-- if (node->equal != NULL || (node->flags & kEndsWord)) {

-+ if (node->equal != NULL || node->flags > 0) {

- unistr.truncate(unistr.length()-1);

- }

- if (node->high != NULL) {

-@@ -354,12 +374,75 @@

- * CompactTrieDictionary

- */

-+//TODO further optimization:

-+// minimise size of trie with logprobs by storing values

-+// for terminal nodes directly in offsets[]

-+// --> calculating from next offset *might* be simpler, but would have to add

-+// one last offset for logprob of last node

-+// --> if calculate from current offset, need to factor in possible overflow

-+// as well.

-+// idea: store in offset, set first bit to indicate logprob storage-->won't

-+// have to access additional node

-+// {'Dic', 1}, version 1: uses old header, no values

-+#define COMPACT_TRIE_MAGIC_1 0x44696301

-+// version 2: uses new header (more than 2^16 nodes), no values

-+#define COMPACT_TRIE_MAGIC_2 0x44696302

-+// version 3: uses new header, includes values

-+#define COMPACT_TRIE_MAGIC_3 0x44696303

- struct CompactTrieHeader {

- uint32_t size; // Size of the data in bytes

- uint32_t magic; // Magic number (including version)

-+ uint32_t nodeCount; // Number of entries in offsets[]

-+ uint32_t root; // Node number of the root node

-+ uint32_t offsets[1]; // Offsets to nodes from start of data

-+};

-+// old version of CompactTrieHeader kept for backwards compatibility

-+struct CompactTrieHeaderV1 {

-+ uint32_t size; // Size of the data in bytes

-+ uint32_t magic; // Magic number (including version)

- uint16_t nodeCount; // Number of entries in offsets[]

- uint16_t root; // Node number of the root node

-- uint32_t offsets[1]; // Offsets to nodes from start of data

-+ uint32_t offsets[1]; // Offsets to nodes from start of data

-+};

-+// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1

-+struct CompactTrieInfo {

-+ uint32_t size; // Size of the data in bytes

-+ uint32_t magic; // Magic number (including version)

-+ uint32_t nodeCount; // Number of entries in offsets[]

-+ uint32_t root; // Node number of the root node

-+ uint32_t *offsets; // Offsets to nodes from start of data

-+ uint8_t *address; // pointer to header bytes in memory

-+ CompactTrieInfo(const void *data, UErrorCode &status){

-+ CompactTrieHeader *header = (CompactTrieHeader *) data;

-+ if (header->magic != COMPACT_TRIE_MAGIC_1 &&

-+ header->magic != COMPACT_TRIE_MAGIC_2 &&

-+ header->magic != COMPACT_TRIE_MAGIC_3) {

-+ status = U_ILLEGAL_ARGUMENT_ERROR;

-+ } else {

-+ size = header->size;

-+ magic = header->magic;

-+ if (header->magic == COMPACT_TRIE_MAGIC_1) {

-+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;

-+ nodeCount = headerV1->nodeCount;

-+ root = headerV1->root;

-+ offsets = &(headerV1->offsets[0]);

-+ address = (uint8_t *)headerV1;

-+ } else {

-+ nodeCount = header->nodeCount;

-+ root = header->root;

-+ offsets = &(header->offsets[0]);

-+ address = (uint8_t *)header;

-+ }

-+ ~CompactTrieInfo(){}

- };

- // Note that to avoid platform-specific alignment issues, all members of the node

-@@ -375,10 +458,14 @@

- enum CompactTrieNodeFlags {

- kVerticalNode = 0x1000, // This is a vertical node

- kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word

-- kReservedFlag1 = 0x4000,

-- kReservedFlag2 = 0x8000,

-+ kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1

-+ kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2

- kCountMask = 0x0FFF, // The count portion of flagscount

-- kFlagMask = 0xF000 // The flags portion of flagscount

-+ kFlagMask = 0xF000, // The flags portion of flagscount

-+ kRootCountMask = 0x7FFF // The count portion of flagscount in the root node

-+ //offset flags:

-+ //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node

- };

- // The two node types are distinguished by the kVerticalNode flag.

-@@ -402,63 +489,177 @@

- uint16_t chars[1]; // Code units

- };

--// {'Dic', 1}, version 1

--#define COMPACT_TRIE_MAGIC_1 0x44696301

- CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,

- UErrorCode &status )

- : fUData(dataObj)

- {

-- fData = (const CompactTrieHeader *) udata_getMemory(dataObj);

-+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

-+ *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);

- fOwnData = FALSE;

-- if (fData->magic != COMPACT_TRIE_MAGIC_1) {

-- status = U_ILLEGAL_ARGUMENT_ERROR;

-- fData = NULL;

-- }

- }

- CompactTrieDictionary::CompactTrieDictionary( const void *data,

- UErrorCode &status )

- : fUData(NULL)

- {

-- fData = (const CompactTrieHeader *) data;

-+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

-+ *fInfo = CompactTrieInfo(data, status);

- fOwnData = FALSE;

-- if (fData->magic != COMPACT_TRIE_MAGIC_1) {

-- status = U_ILLEGAL_ARGUMENT_ERROR;

-- fData = NULL;

-- }

- }

- CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,

- UErrorCode &status )

- : fUData(NULL)

- {

-- fData = compactMutableTrieDictionary(dict, status);

-+ const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);

-+ if (U_SUCCESS(status)) {

-+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

-+ *fInfo = CompactTrieInfo(header, status);

-+ }

- fOwnData = !U_FAILURE(status);

- }

- CompactTrieDictionary::~CompactTrieDictionary() {

- if (fOwnData) {

-- uprv_free((void *)fData);

-+ uprv_free((void *)(fInfo->address));

- }

-+ uprv_free((void *)fInfo);

- if (fUData) {

- udata_close(fUData);

- }

-+UBool CompactTrieDictionary::getValued() const{

-+ return fInfo->magic == COMPACT_TRIE_MAGIC_3;

-+}

- uint32_t

- CompactTrieDictionary::dataSize() const {

-- return fData->size;

-+ return fInfo->size;

- }

- const void *

- CompactTrieDictionary::data() const {

-- return fData;

-+ return fInfo->address;

-+}

-+//This function finds the address of a node for us, given its node ID

-+static inline const CompactTrieNode *

-+getCompactNode(const CompactTrieInfo *info, uint32_t node) {

-+ if(node < info->root-1) {

-+ return (const CompactTrieNode *)(&info->offsets[node]);

-+ } else {

-+ return (const CompactTrieNode *)(info->address + info->offsets[node]);

-+ }

- }

--// This function finds the address of a node for us, given its node ID

-+//this version of getCompactNode is currently only used in compactMutableTrieDictionary()

- static inline const CompactTrieNode *

--getCompactNode(const CompactTrieHeader *header, uint16_t node) {

-- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

-+getCompactNode(const CompactTrieHeader *header, uint32_t node) {

-+ if(node < header->root-1) {

-+ return (const CompactTrieNode *)(&header->offsets[node]);

-+ } else {

-+ return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

-+ }

-+}

-+/**

-+ * Calculates the number of links in a node

-+ * @node The specified node

-+ */

-+static inline const uint16_t

-+getCount(const CompactTrieNode *node){

-+ return (node->flagscount & kCountMask);

-+ //use the code below if number of links ever exceed 4096

-+ //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);

-+}

-+/**

-+ * calculates an equal link node ID of a vertical node

-+ * @param vnode The vertical node containing the equal link

-+ */

-+static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){

-+ if(vnode->flagscount & kEqualOverflows){

-+ // treat overflow bits as an extension of chars[]

-+ uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];

-+ return vnode->equal + (((uint32_t)*overflow) << 16);

-+ }else{

-+ return vnode->equal;

-+ }

-+}

-+/**

-+ * calculates an equal link node ID of a horizontal node

-+ * @param hnode The horizontal node containing the equal link

-+ * @param index The index into hnode->entries[]

-+ * @param nodeCount The length of hnode->entries[]

-+ */

-+static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){

-+ if(hnode->flagscount & kEqualOverflows){

-+ //set overflow to point to the uint16_t containing the overflow bits

-+ uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];

-+ overflow += index/4;

-+ uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;

-+ return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);

-+ } else {

-+ return hnode->entries[index].equal;

-+ }

-+}

-+/**

-+ * Returns the value stored in the specified node which is associated with its

-+ * parent node.

-+ * TODO: how to tell that value is stored in node or in offset? check whether

-+ * node ID < fInfo->root!

-+ */

-+static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){

-+ uint16_t count = getCount((CompactTrieNode *)hnode);

-+ uint16_t overflowSize = 0; //size of node ID overflow storage in bytes

-+ if(hnode->flagscount & kEqualOverflows)

-+ overflowSize = (count + 3) / 4 * sizeof(uint16_t);

-+ return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));

-+}

-+static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){

-+ // calculate size of total node ID overflow storage in bytes

-+ uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;

-+ return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));

-+}

-+static inline uint16_t getValue(const CompactTrieNode *node){

-+ if(node->flagscount & kVerticalNode)

-+ return getValue((const CompactTrieVerticalNode *)node);

-+ else

-+ return getValue((const CompactTrieHorizontalNode *)node);

-+}

-+//returns index of match in CompactTrieHorizontalNode.entries[] using binary search

-+inline int16_t

-+searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,

-+ UChar uc, uint16_t nodeCount){

-+ int low = 0;

-+ int high = nodeCount-1;

-+ int middle;

-+ while (high >= low) {

-+ middle = (high+low)/2;

-+ if (uc == entries[middle].ch) {

-+ return middle;

-+ }

-+ else if (uc < entries[middle].ch) {

-+ high = middle-1;

-+ }

-+ else {

-+ low = middle+1;

-+ }

-+ return -1;

- }

- int32_t

-@@ -466,17 +667,38 @@

- int32_t maxLength,

- int32_t *lengths,

- int &count,

-- int limit ) const {

-+ int limit,

-+ uint16_t *values /*= NULL*/) const {

-+ if (fInfo->magic == COMPACT_TRIE_MAGIC_2)

-+ values = NULL;

- // TODO: current implementation works in UTF-16 space

-- const CompactTrieNode *node = getCompactNode(fData, fData->root);

-+ const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);

- int mycount = 0;

- UChar uc = utext_current32(text);

- int i = 0;

-+ // handle root node with only kEqualOverflows flag: assume horizontal node without parent

-+ if(node != NULL){

-+ const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;

-+ int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);

-+ if(index > -1){

-+ node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));

-+ utext_next32(text);

-+ uc = utext_current32(text);

-+ ++i;

-+ }else{

-+ node = NULL;

-+ }

- while (node != NULL) {

- // Check if the node we just exited ends a word

- if (limit > 0 && (node->flagscount & kParentEndsWord)) {

-+ if(values != NULL){

-+ values[mycount] = getValue(node);

-+ }

- lengths[mycount++] = i;

- --limit;

- }

-@@ -487,7 +709,7 @@

- break;

- }

-- int nodeCount = (node->flagscount & kCountMask);

-+ int nodeCount = getCount(node);

- if (nodeCount == 0) {

- // Special terminal node; return now

- break;

-@@ -507,35 +729,27 @@

- // To get here we must have come through the whole list successfully;

- // go on to the next node. Note that a word cannot end in the middle

- // of a vertical node.

-- node = getCompactNode(fData, vnode->equal);

-+ node = getCompactNode(fInfo, calcEqualLink(vnode));

- }

- else {

- // Horizontal node; do binary search

- const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;

-- int low = 0;

-- int high = nodeCount-1;

-- int middle;

-- node = NULL; // If we don't find a match, we'll fall out of the loop

-- while (high >= low) {

-- middle = (high+low)/2;

-- if (uc == hnode->entries[middle].ch) {

-- // We hit a match; get the next node and next character

-- node = getCompactNode(fData, hnode->entries[middle].equal);

-- utext_next32(text);

-- uc = utext_current32(text);

-- ++i;

-- break;

-- }

-- else if (uc < hnode->entries[middle].ch) {

-- high = middle-1;

-- }

-- else {

-- low = middle+1;

-- }

-+ const CompactTrieHorizontalEntry *entries;

-+ entries = hnode->entries;

-+ int index = searchHorizontalEntries(entries, uc, nodeCount);

-+ if(index > -1){ //

-+ // We hit a match; get the next node and next character

-+ node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));

-+ utext_next32(text);

-+ uc = utext_current32(text);

-+ ++i;

-+ }else{

-+ node = NULL; // If we don't find a match, we'll fall out of the loop

- }

--exit:

-+ exit:

- count = mycount;

- return i;

- }

-@@ -545,16 +759,16 @@

- private:

- UVector32 fNodeStack; // Stack of nodes to process

- UVector32 fIndexStack; // Stack of where in node we are

-- const CompactTrieHeader *fHeader; // Trie data

-+ const CompactTrieInfo *fInfo; // Trie data

- public:

- static UClassID U_EXPORT2 getStaticClassID(void);

- virtual UClassID getDynamicClassID(void) const;

- public:

-- CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)

-+ CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)

- : fNodeStack(status), fIndexStack(status) {

-- fHeader = header;

-- fNodeStack.push(header->root, status);

-+ fInfo = info;

-+ fNodeStack.push(info->root, status);

- fIndexStack.push(0, status);

- unistr.remove();

- }

-@@ -564,14 +778,14 @@

- virtual StringEnumeration *clone() const {

- UErrorCode status = U_ZERO_ERROR;

-- return new CompactTrieEnumeration(fHeader, status);

-+ return new CompactTrieEnumeration(fInfo, status);

- }

- virtual const UnicodeString * snext(UErrorCode &status);

- // Very expensive, but this should never be used.

- virtual int32_t count(UErrorCode &status) const {

-- CompactTrieEnumeration counter(fHeader, status);

-+ CompactTrieEnumeration counter(fInfo, status);

- int32_t result = 0;

- while (counter.snext(status) != NULL && U_SUCCESS(status)) {

- ++result;

-@@ -582,7 +796,7 @@

- virtual void reset(UErrorCode &status) {

- fNodeStack.removeAllElements();

- fIndexStack.removeAllElements();

-- fNodeStack.push(fHeader->root, status);

-+ fNodeStack.push(fInfo->root, status);

- fIndexStack.push(0, status);

- unistr.remove();

- }

-@@ -595,26 +809,34 @@

- if (fNodeStack.empty() || U_FAILURE(status)) {

- return NULL;

- }

-- const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());

-+ const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());

- int where = fIndexStack.peeki();

- while (!fNodeStack.empty() && U_SUCCESS(status)) {

-- int nodeCount = (node->flagscount & kCountMask);

-+ int nodeCount;

-+ bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);

-+ if(isRoot){

-+ nodeCount = node->flagscount & kRootCountMask;

-+ } else {

-+ nodeCount = getCount(node);

-+ }

- UBool goingDown = FALSE;

- if (nodeCount == 0) {

- // Terminal node; go up immediately

- fNodeStack.popi();

- fIndexStack.popi();

-- node = getCompactNode(fHeader, fNodeStack.peeki());

-+ node = getCompactNode(fInfo, fNodeStack.peeki());

- where = fIndexStack.peeki();

- }

-- else if (node->flagscount & kVerticalNode) {

-+ else if ((node->flagscount & kVerticalNode) && !isRoot) {

- // Vertical node

- const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;

- if (where == 0) {

- // Going down

-- unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);

-+ unistr.append((const UChar *)vnode->chars, nodeCount);

- fIndexStack.setElementAt(1, fIndexStack.size()-1);

-- node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));

-+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));

- where = fIndexStack.push(0, status);

- goingDown = TRUE;

- }

-@@ -623,7 +845,7 @@

- unistr.truncate(unistr.length()-nodeCount);

- fNodeStack.popi();

- fIndexStack.popi();

-- node = getCompactNode(fHeader, fNodeStack.peeki());

-+ node = getCompactNode(fInfo, fNodeStack.peeki());

- where = fIndexStack.peeki();

- }

-@@ -638,7 +860,7 @@

- // Push on next node

- unistr.append((UChar)hnode->entries[where].ch);

- fIndexStack.setElementAt(where+1, fIndexStack.size()-1);

-- node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));

-+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));

- where = fIndexStack.push(0, status);

- goingDown = TRUE;

- }

-@@ -646,12 +868,14 @@

- // Going up

- fNodeStack.popi();

- fIndexStack.popi();

-- node = getCompactNode(fHeader, fNodeStack.peeki());

-+ node = getCompactNode(fInfo, fNodeStack.peeki());

- where = fIndexStack.peeki();

- }

- // Check if the parent of the node we've just gone down to ends a

- // word. If so, return it.

-+ // The root node should never end up here.

- if (goingDown && (node->flagscount & kParentEndsWord)) {

- return &unistr;

- }

-@@ -664,7 +888,7 @@

- if (U_FAILURE(status)) {

- return NULL;

- }

-- return new CompactTrieEnumeration(fData, status);

-+ return new CompactTrieEnumeration(fInfo, status);

- }

- //

-@@ -672,21 +896,36 @@

- // and back again

- //

--// Helper classes to construct the compact trie

-+enum CompactTrieNodeType {

-+ kHorizontalType = 0,

-+ kVerticalType = 1,

-+ kValueType = 2

-+};

-+/**

-+ * The following classes (i.e. BuildCompactTrie*Node) are helper classes to

-+ * construct the compact trie by storing information for each node and later

-+ * writing the node to memory in a sequential format.

-+ */

- class BuildCompactTrieNode: public UMemory {

-- public:

-+public:

- UBool fParentEndsWord;

-- UBool fVertical;

-+ CompactTrieNodeType fNodeType;

- UBool fHasDuplicate;

-+ UBool fEqualOverflows;

- int32_t fNodeID;

- UnicodeString fChars;

-+ uint16_t fValue;

-- public:

-- BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {

-+public:

-+ BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,

-+ UStack &nodes, UErrorCode &status, uint16_t value = 0) {

- fParentEndsWord = parentEndsWord;

- fHasDuplicate = FALSE;

-- fVertical = vertical;

-+ fNodeType = nodeType;

-+ fEqualOverflows = FALSE;

- fNodeID = nodes.size();

-+ fValue = parentEndsWord? value : 0;

- nodes.push(this, status);

- }

-@@ -694,87 +933,225 @@

- }

- virtual uint32_t size() {

-- return sizeof(uint16_t);

-+ if(fValue > 0)

-+ return sizeof(uint16_t) * 2;

-+ else

-+ return sizeof(uint16_t);

- }

- virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {

- // Write flag/count

-- *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)

-- | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );

-+ // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be

-+ // used as a 5th MSB.

-+ U_ASSERT(fChars.length() < 4096 || fNodeID == 2);

-+ *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |

-+ ((fNodeID == 2)? (fChars.length() & kRootCountMask):

-+ (

-+ (fChars.length() & kCountMask) |

-+ //((fChars.length() << 2) & kExceedsCount) |

-+ (fNodeType == kVerticalType ? kVerticalNode : 0) |

-+ (fParentEndsWord ? kParentEndsWord : 0 )

-+ )

-+ );

- offset += sizeof(uint16_t);

- }

-+ virtual void writeValue(uint8_t *bytes, uint32_t &offset) {

-+ if(fValue > 0){

-+ *((uint16_t *)(bytes+offset)) = fValue;

-+ offset += sizeof(uint16_t);

-+ }

-+};

-+/**

-+ * Stores value of parent terminating nodes that have no more subtries.

-+ */

-+class BuildCompactTrieValueNode: public BuildCompactTrieNode {

-+public:

-+ BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)

-+ : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){

-+ }

-+ virtual ~BuildCompactTrieValueNode(){

-+ }

-+ virtual uint32_t size() {

-+ return sizeof(uint16_t) * 2;

-+ }

-+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {

-+ // don't write value directly to memory but store it in offset to be written later

-+ //offset = fValue & kOffsetContainsValue;

-+ BuildCompactTrieNode::write(bytes, offset, translate);

-+ BuildCompactTrieNode::writeValue(bytes, offset);

-+ }

- };

- class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {

- public:

- UStack fLinks;

-+ UBool fMayOverflow; //intermediate value for fEqualOverflows

- public:

-- BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)

-- : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {

-+ BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)

-+ : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {

-+ fMayOverflow = FALSE;

- }

- virtual ~BuildCompactTrieHorizontalNode() {

- }

-+ // It is impossible to know beforehand exactly how much space the node will

-+ // need in memory before being written, because the node IDs in the equal

-+ // links may or may not overflow after node coalescing. Therefore, this method

-+ // returns the maximum size possible for the node.

- virtual uint32_t size() {

-- return offsetof(CompactTrieHorizontalNode,entries) +

-- (fChars.length()*sizeof(CompactTrieHorizontalEntry));

-+ uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +

-+ (fChars.length()*sizeof(CompactTrieHorizontalEntry));

-+ if(fValue > 0)

-+ estimatedSize += sizeof(uint16_t);

-+ //estimate extra space needed to store overflow for node ID links

-+ //may be more than what is actually needed

-+ for(int i=0; i < fChars.length(); i++){

-+ if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){

-+ fMayOverflow = TRUE;

-+ break;

-+ }

-+ if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t)

-+ estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;

-+ return estimatedSize;

- }

- virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {

-- BuildCompactTrieNode::write(bytes, offset, translate);

- int32_t count = fChars.length();

-+ //if any translated node ID does not fit in 16 bits (> 0xFFFF), set the overflow flag

-+ //large node IDs are more likely to be at the back of the array

-+ for (int32_t i = count-1; i >= 0; --i) {

-+ if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){

-+ fEqualOverflows = TRUE;

-+ break;

-+ }

-+ BuildCompactTrieNode::write(bytes, offset, translate);

-+ // write entries[] to memory

- for (int32_t i = 0; i < count; ++i) {

- CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);

- entry->ch = fChars[i];

- entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);

- #ifdef DEBUG_TRIE_DICT

-- if (entry->equal == 0) {

-+ if ((entry->equal == 0) && !fEqualOverflows) {

- fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",

- i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);

- }

- #endif

- offset += sizeof(CompactTrieHorizontalEntry);

- }

-+ // append extra bits of equal nodes to end if fEqualOverflows

-+ if (fEqualOverflows) {

-+ uint16_t leftmostBits = 0;

-+ for (int16_t i = 0; i < count; i++) {

-+ leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);

-+ // write filled uint16_t to memory

-+ if(i % 4 == 3){

-+ *((uint16_t *)(bytes+offset)) = leftmostBits;

-+ leftmostBits = 0;

-+ offset += sizeof(uint16_t);

-+ }

-+ // pad last uint16_t with zeroes if necessary

-+ int remainder = count % 4;

-+ if (remainder > 0) {

-+ *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));

-+ offset += sizeof(uint16_t);

-+ }

-+ BuildCompactTrieNode::writeValue(bytes, offset);

-+ }

-+ // returns leftmost bits of physical node link

-+ uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){

-+ uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);

-+#ifdef DEBUG_TRIE_DICT

-+ if (leftmostBits > 0xF) {

-+ fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",

-+ i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);

-+ }

-+#endif

-+ return leftmostBits;

- }

- void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {

- fChars.append(ch);

- fLinks.push(link, status);

- }

- };

- class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {

-- public:

-+public:

- BuildCompactTrieNode *fEqual;

-- public:

-- BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)

-- : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {

-+public:

-+ BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)

-+ : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {

- fEqual = NULL;

- }

- virtual ~BuildCompactTrieVerticalNode() {

- }

-+ // Returns the maximum possible size of this node. See comment in

-+ // BuildCompactTrieHorizontalNode::size() for more information.

- virtual uint32_t size() {

-- return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));

-+ uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));

-+ if(fValue > 0){

-+ estimatedSize += sizeof(uint16_t);

-+ }

-+ if(fEqual->fNodeID > 0xFFFF){

-+ estimatedSize += sizeof(uint16_t);

-+ }

-+ return estimatedSize;

- }

- virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {

- CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);

-+ fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);

- BuildCompactTrieNode::write(bytes, offset, translate);

- node->equal = translate.elementAti(fEqual->fNodeID);

- offset += sizeof(node->equal);

- #ifdef DEBUG_TRIE_DICT

-- if (node->equal == 0) {

-+ if ((node->equal == 0) && !fEqualOverflows) {

- fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",

- fEqual->fNodeID);

- }

- #endif

- fChars.extract(0, fChars.length(), (UChar *)node->chars);

-- offset += sizeof(uint16_t)*fChars.length();

-+ offset += sizeof(UChar)*fChars.length();

-+ // append the upper 16 bits of the equal node ID to the end if fEqualOverflows

-+ if (fEqualOverflows) {

-+ *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);

-+ offset += sizeof(uint16_t);

-+ }

-+ BuildCompactTrieNode::writeValue(bytes, offset);

- }

- void addChar(UChar ch) {

-@@ -784,60 +1161,85 @@

- void setLink(BuildCompactTrieNode *node) {

- fEqual = node;

- }

- };

- // Forward declaration

- static void walkHorizontal(const TernaryNode *node,

- BuildCompactTrieHorizontalNode *building,

- UStack &nodes,

-- UErrorCode &status);

-+ UErrorCode &status,

-+ Hashtable *values);

--// Convert one node. Uses recursion.

-+// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.

- static BuildCompactTrieNode *

--compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {

-+compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,

-+ UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {

- if (U_FAILURE(status)) {

- return NULL;

- }

- BuildCompactTrieNode *result = NULL;

- UBool horizontal = (node->low != NULL || node->high != NULL);

- if (horizontal) {

-- BuildCompactTrieHorizontalNode *hResult =

-- new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);

-+ BuildCompactTrieHorizontalNode *hResult;

-+ if(values != NULL){

-+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);

-+ } else {

-+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);

-+ }

- if (hResult == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- return NULL;

- }

- if (U_SUCCESS(status)) {

-- walkHorizontal(node, hResult, nodes, status);

-+ walkHorizontal(node, hResult, nodes, status, values);

- result = hResult;

- }

- else {

-- BuildCompactTrieVerticalNode *vResult =

-- new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);

-+ BuildCompactTrieVerticalNode *vResult;

-+ if(values != NULL){

-+ vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);

-+ } else {

-+ vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);

-+ }

- if (vResult == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

-+ return NULL;

- }

- else if (U_SUCCESS(status)) {

-- UBool endsWord = FALSE;

-+ uint16_t value = 0;

-+ UBool endsWord = FALSE;

- // Take up nodes until we end a word, or hit a node with < or > links

- do {

- vResult->addChar(node->ch);

-- endsWord = (node->flags & kEndsWord) != 0;

-+ value = node->flags;

-+ endsWord = value > 0;

- node = node->equal;

- }

- while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);

- if (node == NULL) {

- if (!endsWord) {

- status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie

- }

-- else {

-+ else if(values != NULL){

-+ UnicodeString key(value); //store value as a single-char UnicodeString

-+ BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);

-+ if(link == NULL){

-+ link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?

-+ values->put(key, link, status);

-+ }

-+ vResult->setLink(link);

-+ } else {

- vResult->setLink((BuildCompactTrieNode *)nodes[1]);

- }

- else {

-- vResult->setLink(compactOneNode(node, endsWord, nodes, status));

-+ vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));

- }

- result = vResult;

- }

-@@ -849,19 +1251,28 @@

- // Uses recursion.

- static void walkHorizontal(const TernaryNode *node,

-- BuildCompactTrieHorizontalNode *building,

-- UStack &nodes,

-- UErrorCode &status) {

-+ BuildCompactTrieHorizontalNode *building,

-+ UStack &nodes,

-+ UErrorCode &status, Hashtable *values = NULL) {

- while (U_SUCCESS(status) && node != NULL) {

- if (node->low != NULL) {

-- walkHorizontal(node->low, building, nodes, status);

-+ walkHorizontal(node->low, building, nodes, status, values);

- }

- BuildCompactTrieNode *link = NULL;

- if (node->equal != NULL) {

-- link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);

-+ link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);

- }

-- else if (node->flags & kEndsWord) {

-- link = (BuildCompactTrieNode *)nodes[1];

-+ else if (node->flags > 0) {

-+ if(values != NULL) {

-+ UnicodeString key(node->flags); //store value as a single-char UnicodeString

-+ link = (BuildCompactTrieValueNode *) values->get(key);

-+ if(link == NULL) {

-+ link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?

-+ values->put(key, link, status);

-+ }

-+ } else {

-+ link = (BuildCompactTrieNode *)nodes[1];

-+ }

- }

- if (U_SUCCESS(status) && link != NULL) {

- building->addNode(node->ch, link, status);

-@@ -881,13 +1292,15 @@

- _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {

- BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;

- BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;

- // Check for comparing a node to itself, to avoid spurious duplicates

- if (left == right) {

- return 0;

- }

- // Most significant is type of node. Can never coalesce.

-- if (left->fVertical != right->fVertical) {

-- return left->fVertical - right->fVertical;

-+ if (left->fNodeType != right->fNodeType) {

-+ return left->fNodeType - right->fNodeType;

- }

- // Next, the "parent ends word" flag. If that differs, we cannot coalesce.

- if (left->fParentEndsWord != right->fParentEndsWord) {

-@@ -898,12 +1311,19 @@

- if (result != 0) {

- return result;

- }

-+ // If the node value differs, we should not coalesce.

-+ // If values aren't stored, all fValues should be 0.

-+ if (left->fValue != right->fValue) {

-+ return left->fValue - right->fValue;

-+ }

- // We know they're both the same node type, so branch for the two cases.

-- if (left->fVertical) {

-+ if (left->fNodeType == kVerticalType) {

- result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID

-- - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;

-+ - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;

- }

-- else {

-+ else if(left->fChars.length() > 0 && right->fChars.length() > 0){

- // We need to compare the links vectors. They should be the

- // same size because the strings were equal.

- // We compare the node IDs instead of the pointers, to handle

-@@ -914,9 +1334,10 @@

- int32_t count = hleft->fLinks.size();

- for (int32_t i = 0; i < count && result == 0; ++i) {

- result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -

-- ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

-+ ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

- }

- // If they are equal to each other, mark them (speeds coalescing)

- if (result == 0) {

- left->fHasDuplicate = TRUE;

-@@ -1031,20 +1452,25 @@

- // Add node 0, used as the NULL pointer/sentinel.

- nodes.addElement((int32_t)0, status);

-+ Hashtable *values = NULL; // Index of (unique) values

-+ if (dict.fValued) {

-+ values = new Hashtable(status);

-+ }

- // Start by creating the special empty node we use to indicate that the parent

- // terminates a word. This must be node 1, because the builder assumes

-- // that.

-+ // that. This node will never be used for tries storing numerical values.

- if (U_FAILURE(status)) {

- return NULL;

- }

-- BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);

-+ BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);

- if (terminal == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- }

- // This call does all the work of building the new trie structure. The root

-- // will be node 2.

-- BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);

-+ // will have node ID 2 before writing to memory.

-+ BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);

- #ifdef DEBUG_TRIE_DICT

- (void) ::times(&timing);

- fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",

-@@ -1077,21 +1503,37 @@

- return NULL;

- }

-+ //map terminal value nodes

-+ int valueCount = 0;

-+ UVector valueNodes(status);

-+ if(values != NULL) {

-+ valueCount = values->count(); //number of unique terminal value nodes

-+ }

-+ // map non-terminal nodes

-+ int valuePos = 1;//, nodePos = valueCount + valuePos;

-+ nodeCount = valueCount + valuePos;

- for (i = 1; i < count; ++i) {

- node = (BuildCompactTrieNode *)nodes[i];

- if (node->fNodeID == i) {

- // Only one node out of each duplicate set is used

-- if (i >= translate.size()) {

-+ if (node->fNodeID >= translate.size()) {

- // Logically extend the mapping table

-- translate.setSize(i+1);

-+ translate.setSize(i + 1);

-+ }

-+ //translate.setElementAt(object, index)!

-+ if(node->fNodeType == kValueType) {

-+ valueNodes.addElement(node, status);

-+ translate.setElementAt(valuePos++, i);

-+ } else {

-+ translate.setElementAt(nodeCount++, i);

- }

-- translate.setElementAt(nodeCount++, i);

- totalSize += node->size();

- }

-- // Check for overflowing 16 bits worth of nodes.

-- if (nodeCount > 0x10000) {

-+ // Check for overflowing 20 bits worth of nodes.

-+ if (nodeCount > 0x100000) {

- status = U_ILLEGAL_ARGUMENT_ERROR;

- return NULL;

- }

-@@ -1111,9 +1553,14 @@

- status = U_MEMORY_ALLOCATION_ERROR;

- return NULL;

- }

- CompactTrieHeader *header = (CompactTrieHeader *)bytes;

-- header->size = totalSize;

-+ //header->size = totalSize;

-+ if(dict.fValued){

-+ header->magic = COMPACT_TRIE_MAGIC_3;

-+ } else {

-+ header->magic = COMPACT_TRIE_MAGIC_2;

-+ }

- header->nodeCount = nodeCount;

- header->offsets[0] = 0; // Sentinel

- header->root = translate.elementAti(root->fNodeID);

-@@ -1123,23 +1570,40 @@

- }

- #endif

- uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));

-- nodeCount = 1;

-+ nodeCount = valueCount + 1;

-+ // Write terminal value nodes to memory

-+ for (i=0; i < valueNodes.size(); i++) {

-+ //header->offsets[i + 1] = offset;

-+ uint32_t tmpOffset = 0;

-+ node = (BuildCompactTrieNode *) valueNodes.elementAt(i);

-+ //header->offsets[i + 1] = (uint32_t)node->fValue;

-+ node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);

-+ }

- // Now write the data

- for (i = 1; i < count; ++i) {

- node = (BuildCompactTrieNode *)nodes[i];

-- if (node->fNodeID == i) {

-+ if (node->fNodeID == i && node->fNodeType != kValueType) {

- header->offsets[nodeCount++] = offset;

- node->write(bytes, offset, translate);

- }

-+ //free all extra space

-+ uprv_realloc(bytes, offset);

-+ header->size = offset;

- #ifdef DEBUG_TRIE_DICT

-+ fprintf(stdout, "Space freed: %d\n", totalSize-offset);

- (void) ::times(&timing);

- fprintf(stderr, "Trie built, time user %f system %f\n",

- (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,

- (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);

- previous = timing;

- fprintf(stderr, "Final offset is %d\n", offset);

- // Collect statistics on node types and sizes

- int hCount = 0;

- int vCount = 0;

-@@ -1148,68 +1612,85 @@

- size_t hItemCount = 0;

- size_t vItemCount = 0;

- uint32_t previousOff = offset;

-- for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

-+ uint32_t numOverflow = 0;

-+ uint32_t valueSpace = 0;

-+ for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

- const CompactTrieNode *node = getCompactNode(header, nodeIdx);

-- if (node->flagscount & kVerticalNode) {

-+ int itemCount;

-+ if(nodeIdx == header->root)

-+ itemCount = node->flagscount & kRootCountMask;

-+ else

-+ itemCount = getCount(node);

-+ if(node->flagscount & kEqualOverflows){

-+ numOverflow++;

-+ }

-+ if (node->flagscount & kVerticalNode && nodeIdx != header->root) {

- vCount += 1;

-- vItemCount += (node->flagscount & kCountMask);

-+ vItemCount += itemCount;

- vSize += previousOff-header->offsets[nodeIdx];

- }

- else {

- hCount += 1;

-- hItemCount += (node->flagscount & kCountMask);

-- hSize += previousOff-header->offsets[nodeIdx];

-+ hItemCount += itemCount;

-+ if(nodeIdx >= header->root) {

-+ hSize += previousOff-header->offsets[nodeIdx];

-+ }

- }

-+ if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)

-+ valueSpace += sizeof(uint16_t);

- previousOff = header->offsets[nodeIdx];

- }

- fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,

- (double)hSize/hCount, (double)hItemCount/hCount);

- fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,

- (double)vSize/vCount, (double)vItemCount/vCount);

-+ fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);

-+ fprintf(stderr, "Space taken up by values: %d \n", valueSpace);

- #endif

- if (U_FAILURE(status)) {

- uprv_free(bytes);

- header = NULL;

- }

-- else {

-- header->magic = COMPACT_TRIE_MAGIC_1;

-- }

- return header;

- }

- // Forward declaration

- static TernaryNode *

--unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );

-+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );

- // Convert a horizontal node (or subarray thereof) into a ternary subtrie

- static TernaryNode *

--unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,

-- int low, int high, UErrorCode &status ) {

-+unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,

-+ int low, int high, int nodeCount, UErrorCode &status) {

- if (U_FAILURE(status) || low > high) {

- return NULL;

- }

- int middle = (low+high)/2;

-- TernaryNode *result = new TernaryNode(array[middle].ch);

-+ TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);

- if (result == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- return NULL;

- }

-- const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);

-+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));

- if (equal->flagscount & kParentEndsWord) {

-- result->flags |= kEndsWord;

-+ if(info->magic == COMPACT_TRIE_MAGIC_3){

-+ result->flags = getValue(equal);

-+ }else{

-+ result->flags |= kEndsWord;

-+ }

- }

-- result->low = unpackHorizontalArray(header, array, low, middle-1, status);

-- result->high = unpackHorizontalArray(header, array, middle+1, high, status);

-- result->equal = unpackOneNode(header, equal, status);

-+ result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);

-+ result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);

-+ result->equal = unpackOneNode(info, equal, status);

- return result;

- }

- // Convert one compact trie node into a ternary subtrie

- static TernaryNode *

--unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {

-- int nodeCount = (node->flagscount & kCountMask);

-+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {

-+ int nodeCount = getCount(node);

- if (nodeCount == 0 || U_FAILURE(status)) {

- // Failure, or terminal node

- return NULL;

-@@ -1234,29 +1715,41 @@

- previous = latest;

- }

- if (latest != NULL) {

-- const CompactTrieNode *equal = getCompactNode(header, vnode->equal);

-+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));

- if (equal->flagscount & kParentEndsWord) {

-- latest->flags |= kEndsWord;

-+ if(info->magic == COMPACT_TRIE_MAGIC_3){

-+ latest->flags = getValue(equal);

-+ } else {

-+ latest->flags |= kEndsWord;

-+ }

- }

-- latest->equal = unpackOneNode(header, equal, status);

-+ latest->equal = unpackOneNode(info, equal, status);

- }

- return head;

- }

- else {

- // Horizontal node

- const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;

-- return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);

-+ return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);

- }

-+// returns a MutableTrieDictionary generated from the CompactTrieDictionary

- MutableTrieDictionary *

- CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {

-- MutableTrieDictionary *result = new MutableTrieDictionary( status );

-+ MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );

- if (result == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- return NULL;

- }

-- TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);

-+ // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly

-+ // because only kEqualOverflows flag should be checked in root's flagscount

-+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)

-+ getCompactNode(fInfo, fInfo->root);

-+ uint16_t nodeCount = hnode->flagscount & kRootCountMask;

-+ TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,

-+ nodeCount, status);

- if (U_FAILURE(status)) {

- delete root; // Clean up

- delete result;

-@@ -1270,8 +1763,8 @@

- U_CAPI int32_t U_EXPORT2

- triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,

-- UErrorCode *status) {

-+ UErrorCode *status) {

- if (status == NULL || U_FAILURE(*status)) {

- return 0;

- }

-@@ -1286,14 +1779,14 @@

- //

- const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);

- if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */

-- pInfo->dataFormat[1]==0x72 &&

-- pInfo->dataFormat[2]==0x44 &&

-- pInfo->dataFormat[3]==0x63 &&

-- pInfo->formatVersion[0]==1 )) {

-+ pInfo->dataFormat[1]==0x72 &&

-+ pInfo->dataFormat[2]==0x44 &&

-+ pInfo->dataFormat[3]==0x63 &&

-+ pInfo->formatVersion[0]==1 )) {

- udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",

-- pInfo->dataFormat[0], pInfo->dataFormat[1],

-- pInfo->dataFormat[2], pInfo->dataFormat[3],

-- pInfo->formatVersion[0]);

-+ pInfo->dataFormat[0], pInfo->dataFormat[1],

-+ pInfo->dataFormat[2], pInfo->dataFormat[3],

-+ pInfo->formatVersion[0]);

- *status=U_UNSUPPORTED_ERROR;

- return 0;

- }

-@@ -1311,8 +1804,10 @@

- //

- const uint8_t *inBytes =(const uint8_t *)inData+headerSize;

- const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;

-- if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1

-- || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

-+ uint32_t magic = ds->readUInt32(header->magic);

-+ if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3

-+ || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)

-+ || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

- {

- udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");

- *status=U_UNSUPPORTED_ERROR;

-@@ -1333,10 +1828,10 @@

- //

- if (length < sizeWithUData) {

- udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",

-- totalSize);

-+ totalSize);

- *status=U_INDEX_OUTOFBOUNDS_ERROR;

- return 0;

-- }

-+ }

- //

- // Swap the Data. Do the data itself first, then the CompactTrieHeader, because

-@@ -1355,20 +1850,38 @@

- }

- // We need to loop through all the nodes in the offset table, and swap each one.

-- uint16_t nodeCount = ds->readUInt16(header->nodeCount);

-+ uint32_t nodeCount, rootId;

-+ if(header->magic == COMPACT_TRIE_MAGIC_1) {

-+ nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);

-+ rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);

-+ } else {

-+ nodeCount = ds->readUInt32(header->nodeCount);

-+ rootId = ds->readUInt32(header->root);

-+ }

- // Skip node 0, which should always be 0.

-- for (int i = 1; i < nodeCount; ++i) {

-+ for (uint32_t i = 1; i < nodeCount; ++i) {

- uint32_t nodeOff = ds->readUInt32(header->offsets[i]);

- const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);

- CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);

- uint16_t flagscount = ds->readUInt16(inNode->flagscount);

-- uint16_t itemCount = flagscount & kCountMask;

-+ uint16_t itemCount = getCount(inNode);

-+ //uint16_t itemCount = flagscount & kCountMask;

- ds->writeUInt16(&outNode->flagscount, flagscount);

- if (itemCount > 0) {

-- if (flagscount & kVerticalNode) {

-+ uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped

-+ if (flagscount & kVerticalNode && i != rootId) {

-+ if(flagscount & kEqualOverflows){

-+ // include overflow bits

-+ overflow += 1;

-+ }

-+ if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {

-+ //include values

-+ overflow += 1;

-+ }

- ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),

-- itemCount*sizeof(uint16_t),

-- outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);

-+ (itemCount + overflow)*sizeof(uint16_t),

-+ outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);

- uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);

- ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));

- }

-@@ -1381,26 +1894,62 @@

- word = ds->readUInt16(inHNode->entries[j].equal);

- ds->writeUInt16(&outHNode->entries[j].equal, word);

- }

-+ // swap overflow/value information

-+ if(flagscount & kEqualOverflows){

-+ overflow += (itemCount + 3) / 4;

-+ }

-+ if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {

-+ //include values

-+ overflow += 1;

-+ }

-+ uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];

-+ uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];

-+ for(int j = 0; j<overflow; j++){

-+ uint16_t extraInfo = ds->readUInt16(*inOverflow);

-+ ds->writeUInt16(outOverflow, extraInfo);

-+ inOverflow++;

-+ outOverflow++;

-+ }

- }

- #endif

-- // All the data in all the nodes consist of 16 bit items. Swap them all at once.

-- uint16_t nodeCount = ds->readUInt16(header->nodeCount);

-- uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));

-- ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);

- // Swap the header

- ds->writeUInt32(&outputHeader->size, totalSize);

-- uint32_t magic = ds->readUInt32(header->magic);

- ds->writeUInt32(&outputHeader->magic, magic);

-- ds->writeUInt16(&outputHeader->nodeCount, nodeCount);

-- uint16_t root = ds->readUInt16(header->root);

-- ds->writeUInt16(&outputHeader->root, root);

-- ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),

-- sizeof(uint32_t)*(int32_t)nodeCount,

-- outBytes+offsetof(CompactTrieHeader,offsets), status);

-+ uint32_t nodeCount;

-+ uint32_t offsetPos;

-+ if (header->magic == COMPACT_TRIE_MAGIC_1) {

-+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;

-+ CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;

-+ nodeCount = ds->readUInt16(headerV1->nodeCount);

-+ ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);

-+ uint16_t root = ds->readUInt16(headerV1->root);

-+ ds->writeUInt16(&outputHeaderV1->root, root);

-+ offsetPos = offsetof(CompactTrieHeaderV1,offsets);

-+ } else {

-+ nodeCount = ds->readUInt32(header->nodeCount);

-+ ds->writeUInt32(&outputHeader->nodeCount, nodeCount);

-+ uint32_t root = ds->readUInt32(header->root);

-+ ds->writeUInt32(&outputHeader->root, root);

-+ offsetPos = offsetof(CompactTrieHeader,offsets);

-+ }

-+ // All the data in all the nodes consist of 16 bit items. Swap them all at once.

-+ uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));

-+ ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);

-+ //swap offsets

-+ ds->swapArray32(ds, inBytes+offsetPos,

-+ sizeof(uint32_t)*(uint32_t)nodeCount,

-+ outBytes+offsetPos, status);

- return sizeWithUData;

- }

---- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700

-+++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800

-@@ -47,7 +47,6 @@

- U_NAMESPACE_BEGIN

- class StringEnumeration;

--struct CompactTrieHeader;

- /*******************************************************************

- * TrieWordDictionary

-@@ -72,23 +71,29 @@

- */

- virtual ~TrieWordDictionary();

-+ /**

-+ * Returns true if the dictionary contains values associated with each word.

-+ */

-+ virtual UBool getValued() const = 0;

- /**

- * Find dictionary words that match the text.

- *

- * @param text A UText representing the text. The

- * iterator is left after the longest prefix match in the dictionary.

-- * @param start The current position in text.

- * @param maxLength The maximum number of code units to match.

- * @param lengths An array that is filled with the lengths of words that matched.

- * @param count Filled with the number of elements output in lengths.

- * @param limit The size of the lengths array; this limits the number of words output.

-+ * @param values An array that is filled with the values associated with the matched words.

- * @return The number of characters in text that were matched.

- */

- virtual int32_t matches( UText *text,

- int32_t maxLength,

- int32_t *lengths,

- int &count,

-- int limit ) const = 0;

-+ int limit,

-+ uint16_t *values = NULL) const = 0;

- /**

- * Return a StringEnumeration for iterating all the words in the dictionary.

-@@ -128,6 +133,12 @@

- UText *fIter;

-+ /**

-+ * A UBool indicating whether the dictionary stores a value associated with each word

-+ * @internal

-+ */

-+ UBool fValued;

- friend class CompactTrieDictionary; // For fast conversion

- public:

-@@ -138,14 +149,29 @@

- * @param median A UChar around which to balance the trie. Ideally, it should

- * begin at least one word that is near the median of the set in the dictionary

- * @param status A status code recording the success of the call.

-+ * @param containsValue True if the dictionary stores values associated with each word.

- */

-- MutableTrieDictionary( UChar median, UErrorCode &status );

-+ MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );

- /**

- * Virtual destructor.

- */

- virtual ~MutableTrieDictionary();

-+ /**

-+ * Indicate whether the MutableTrieDictionary stores values associated with each word

-+ */

-+ void setValued(UBool valued){

-+ fValued = valued;

-+ }

-+ /**

-+ * Returns true if the dictionary contains values associated with each word.

-+ */

-+ virtual UBool getValued() const {

-+ return fValued;

-+ }

- /**

- * Find dictionary words that match the text.

- *

-@@ -155,13 +181,15 @@

- * @param lengths An array that is filled with the lengths of words that matched.

- * @param count Filled with the number of elements output in lengths.

- * @param limit The size of the lengths array; this limits the number of words output.

-+ * @param values An array that is filled with the values associated with the matched words.

- * @return The number of characters in text that were matched.

- */

- virtual int32_t matches( UText *text,

- int32_t maxLength,

- int32_t *lengths,

- int &count,

-- int limit ) const;

-+ int limit,

-+ uint16_t *values = NULL) const;

- /**

- * Return a StringEnumeration for iterating all the words in the dictionary.

-@@ -173,15 +201,17 @@

- virtual StringEnumeration *openWords( UErrorCode &status ) const;

- /**

-- * Add one word to the dictionary.

-+ * Add one word to the dictionary with an optional associated value.

- *

- * @param word A UChar buffer containing the word.

- * @param length The length of the word.

-- * @param status The resultant status

-+ * @param status The resultant status.

-+ * @param value The nonzero value associated with this word.

- */

- virtual void addWord( const UChar *word,

- int32_t length,

-- UErrorCode &status);

-+ UErrorCode &status,

-+ uint16_t value = 0);

- #if 0

- /**

-@@ -203,8 +233,9 @@

- * @param lengths An array that is filled with the lengths of words that matched.

- * @param count Filled with the number of elements output in lengths.

- * @param limit The size of the lengths array; this limits the number of words output.

-- * @param parent The parent of the current node

-- * @param pMatched The returned parent node matched the input

-+ * @param parent The parent of the current node.

-+ * @param pMatched The returned parent node matched the input.

-+ * @param values An array that is filled with the values associated with the matched words.

- * @return The number of characters in text that were matched.

- */

- virtual int32_t search( UText *text,

-@@ -213,40 +244,46 @@

- int &count,

- int limit,

- TernaryNode *&parent,

-- UBool &pMatched ) const;

-+ UBool &pMatched,

-+ uint16_t *values = NULL) const;

- private:

- /**

- * Private constructor. The root node it not allocated.

- *

- * @param status A status code recording the success of the call.

-+ * @param containsValues True if the dictionary will store a value associated

-+ * with each word added.

- */

-- MutableTrieDictionary( UErrorCode &status );

-+ MutableTrieDictionary( UErrorCode &status, UBool containsValues = false );

- };

- /*******************************************************************

- * CompactTrieDictionary

- */

-+//forward declarations

-+struct CompactTrieHeader;

-+struct CompactTrieInfo;

- /**

- * CompactTrieDictionary is a TrieWordDictionary that has been compacted

- * to save space.

- */

- class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {

- private:

-- /**

-- * The root node of the trie

-- */

-+ /**

-+ * The header of the CompactTrieDictionary which contains all info

-+ */

-- const CompactTrieHeader *fData;

-- /**

-- * A UBool indicating whether or not we own the fData.

-- */

-+ CompactTrieInfo *fInfo;

-+ /**

-+ * A UBool indicating whether or not we own the data.

-+ */

- UBool fOwnData;

-- UDataMemory *fUData;

-+ UDataMemory *fUData;

- public:

- /**

- * Construct a dictionary from a UDataMemory.

-@@ -277,6 +314,11 @@

- */

- virtual ~CompactTrieDictionary();

-+ /**

-+ * Returns true if the dictionary contains values associated with each word.

-+ */

-+ virtual UBool getValued() const;

- /**

- * Find dictionary words that match the text.

- *

-@@ -286,13 +328,15 @@

- * @param lengths An array that is filled with the lengths of words that matched.

- * @param count Filled with the number of elements output in lengths.

- * @param limit The size of the lengths array; this limits the number of words output.

-+ * @param values An array that is filled with the values associated with the matched words.

- * @return The number of characters in text that were matched.

- */

- virtual int32_t matches( UText *text,

-- int32_t rangeEnd,

-+ int32_t maxLength,

- int32_t *lengths,

- int &count,

-- int limit ) const;

-+ int limit,

-+ uint16_t *values = NULL) const;

- /**

- * Return a StringEnumeration for iterating all the words in the dictionary.

-@@ -311,7 +355,7 @@

- virtual uint32_t dataSize() const;

- /**

-- * Return a void * pointer to the compact data, platform-endian.

-+ * Return a void * pointer to the (unmanaged) compact data, platform-endian.

- *

- * @return The data for the compact dictionary, suitable for passing to the

- * constructor.

-@@ -342,5 +386,5 @@

- U_NAMESPACE_END

-- /* TRIEDICT_H */

-+/* TRIEDICT_H */

- #endif

---- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700

-+++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800

-@@ -509,8 +520,9 @@

- #################################################### CTD

- # CTD FILES

--$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

-- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

-+# .ctd file now generated regardless of whether dictionary file exists

-+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

-+ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt

- #################################################### CFU

- # CFU FILES

---- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700

-+++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800

-@@ -17,5 +17,8 @@

- }

- dictionaries{

- Thai:process(dependency){"thaidict.ctd"}

-+ Hani:process(dependency){"cjdict.ctd"}

-+ Hira:process(dependency){"cjdict.ctd"}

-+ Kata:process(dependency){"cjdict.ctd"}

- }

---- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800

-+++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800

-@@ -25,6 +25,9 @@

- </icu:boundaries>

- <icu:dictionaries>

- <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>

-+ <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>

-+ <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>

-+ <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>

- </icu:dictionaries>

- </icu:breakIteratorData>

- </special>

---- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700

-+++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800

-@@ -2188,21 +2188,21 @@

- {

-- UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);

-+ UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);

- const UChar *got = NULL, *exp=NULL;

- int32_t gotLen = 0, expLen=0;

-- ja = ures_getByKey(ja, "boundaries", ja, &status);

-- exp = tres_getString(ja, -1, "word", &expLen, &status);

-+ th = ures_getByKey(th, "boundaries", th, &status);

-+ exp = tres_getString(th, -1, "grapheme", &expLen, &status);

- tb = ures_getByKey(aliasB, "boundaries", tb, &status);

-- got = tres_getString(tb, -1, "word", &gotLen, &status);

-+ got = tres_getString(tb, -1, "grapheme", &gotLen, &status);

- if(U_FAILURE(status)) {

- log_err("%s trying to read str boundaries\n", u_errorName(status));

- } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {

- log_err("Referencing alias didn't get the right data\n");

- }

-- ures_close(ja);

-+ ures_close(th);

- status = U_ZERO_ERROR;

- }

- /* simple alias */

---- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700

-+++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800

-@@ -156,9 +156,13 @@

- if(*a!=*b){

- errln("Failed: boilerplate method operator!= does not return correct results");

- }

-- BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);

-- if(a && c){

-- if(*c==*a){

-+ // Japanese word break iterator is identical to root with

-+ // a dictionary-based break iterator, but Thai character break iterator

-+ // is still different from Root.

-+ BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);

-+ BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status);

-+ if(c && d){

-+ if(*c==*d){

- errln("Failed: boilerplate method opertator== does not return correct results");

- }

- }else{

-@@ -167,6 +171,7 @@

- delete a;

- delete b;

- delete c;

-+ delete d;

- }

- void RBBIAPITest::TestgetRules()

-@@ -635,21 +640,21 @@

- //

- void RBBIAPITest::TestRuleStatus() {

- UChar str[30];

-- u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",

-- // 012345678901234567 8 9 0 1 2 3 4 5 6

-- // Ideographic Katakana Hiragana

-+ //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing

-+ // changed UBRK_WORD_KANA to UBRK_WORD_IDEO

-+ u_unescape("plain word 123.45 \\u30a1\\u30a2 ",

-+ // 012345678901234567 8 9 0

-+ // Katakana

- str, 30);

- UnicodeString testString1(str);

-- int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};

-+ int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};

- int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,

- UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,

-- UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,

-- UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};

-+ UBRK_WORD_IDEO, UBRK_WORD_NONE};

- int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,

- UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,

-- UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,

-- UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};

-+ UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};

- UErrorCode status=U_ZERO_ERROR;

-@@ -888,9 +893,11 @@

- URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);

- {

-+#if 0 // With a dictionary based word breaking, ja_word is identical to root.

- if (ja_word && *ja_word == *root_word) {

- errln("japan not different from root");

- }

-+#endif

- }

- {

---- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700

-+++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800

-@@ -35,6 +35,8 @@

- #include <string.h>

- #include <stdio.h>

- #include <stdlib.h>

-+#include "unicode/numfmt.h"

-+#include "unicode/uscript.h"

- #define TEST_ASSERT(x) {if (!(x)) { \

- errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

-@@ -138,11 +140,13 @@

- if (exec) TestThaiBreaks(); break;

- case 23: name = "TestTailoredBreaks";

- if (exec) TestTailoredBreaks(); break;

-+ case 24: name = "TestTrieDictWithValue";

-+ if(exec) TestTrieDictWithValue(); break;

- #else

-- case 21: case 22: case 23: name = "skip";

-+ case 21: case 22: case 23: case 24: name = "skip";

- break;

- #endif

-- case 24: name = "TestDictRules";

-+ case 25: name = "TestDictRules";

- if (exec) TestDictRules(); break;

- case 25: name = "TestBug5532";

- if (exec) TestBug5532(); break;

-@@ -607,6 +611,8 @@

- void RBBITest::TestJapaneseWordBreak() {

-+// TODO: Rewrite this test for a dictionary-based word breaking.

-+#if 0

- UErrorCode status = U_ZERO_ERROR;

- BITestData japaneseWordSelection(status);

-@@ -628,6 +634,7 @@

- generalIteratorTest(*e, japaneseWordSelection);

- delete e;

-+#endif

- }

- void RBBITest::TestTrieDict() {

-@@ -849,6 +856,372 @@

- delete compact2;

- }

-+/*TODO: delete later*/

-+inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){

-+ UErrorCode status = U_ZERO_ERROR;

-+ FILE *outfile = fopen(filename,"w");

-+ UConverter *cvt = ucnv_open("UTF-8", &status);

-+ if (U_FAILURE(status))

-+ return;

-+ if(outfile != NULL){

-+ status = U_ZERO_ERROR;

-+ const UnicodeString *word = enumer->snext(status);

-+ while (word != NULL && U_SUCCESS(status)) {

-+ char u8word[500];

-+ status = U_ZERO_ERROR;

-+ ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),

-+ &status);

-+ fprintf(outfile,"%s\n", u8word);

-+ status = U_ZERO_ERROR;

-+ word = enumer->snext(status);

-+ }

-+ fclose(outfile);

-+ }

-+ ucnv_close(cvt);

-+}

-+// A very simple helper class to streamline the buffer handling in

-+// TestTrieDictWithValue

-+template<class T, size_t N>

-+class AutoBuffer {

-+ public:

-+ AutoBuffer(size_t size) : buffer(stackBuffer) {

-+ if (size > N)

-+ buffer = new T[size];

-+ }

-+ ~AutoBuffer() {

-+ if (buffer != stackBuffer)

-+ delete [] buffer;

-+ }

-+ T* elems() {

-+ return buffer;

-+ }

-+ const T& operator[] (size_t i) const {

-+ return buffer[i];

-+ }

-+ T& operator[] (size_t i) {

-+ return buffer[i];

-+ }

-+ private:

-+ T stackBuffer[N];

-+ T* buffer;

-+ AutoBuffer();

-+};

-+//----------------------------------------------------------------------------

-+//

-+// TestTrieDictWithValue Test trie dictionaries with logprob values and

-+// more than 2^16 nodes after compaction.

-+//

-+//----------------------------------------------------------------------------

-+void RBBITest::TestTrieDictWithValue() {

-+ UErrorCode status = U_ZERO_ERROR;

-+ //

-+ // Open and read the test data file.

-+ //

-+ const char *testDataDirectory = IntlTest::getSourceTestData(status);

-+ const char *filename = "cjdict-truncated.txt";

-+ char testFileName[1000];

-+ if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {

-+ errln("Can't open test data. Path too long.");

-+ return;

-+ }

-+ strcpy(testFileName, testDataDirectory);

-+ strcat(testFileName, filename);

-+ // Items needing deleting at the end

-+ MutableTrieDictionary *mutableDict = NULL;

-+ CompactTrieDictionary *compactDict = NULL;

-+ UnicodeSet *breaks = NULL;

-+ UChar *testFile = NULL;

-+ StringEnumeration *enumer1 = NULL;

-+ StringEnumeration *enumer2 = NULL;

-+ MutableTrieDictionary *mutable2 = NULL;

-+ StringEnumeration *cloneEnum = NULL;

-+ CompactTrieDictionary *compact2 = NULL;

-+ NumberFormat *nf = NULL;

-+ UText *originalText = NULL, *cloneText = NULL;

-+ const UnicodeString *originalWord = NULL;

-+ const UnicodeString *cloneWord = NULL;

-+ UChar *current;

-+ UChar *word;

-+ UChar uc;

-+ int32_t wordLen;

-+ int32_t wordCount;

-+ int32_t testCount;

-+ int32_t valueLen;

-+ int counter = 0;

-+ int len;

-+ testFile = ReadAndConvertFile(testFileName, len, NULL, status);

-+ if (U_FAILURE(status)) {

-+ goto cleanup; /* something went wrong, error already output */

-+ }

-+ mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);

-+ if (U_FAILURE(status)) {

-+ errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ breaks = new UnicodeSet;

-+ breaks->add(0x000A); // Line Feed

-+ breaks->add(0x000D); // Carriage Return

-+ breaks->add(0x2028); // Line Separator

-+ breaks->add(0x2029); // Paragraph Separator

-+ breaks->add(0x0009); // Tab character

-+ // Now add each non-comment line of the file as a word.

-+ current = testFile;

-+ word = current;

-+ uc = *current++;

-+ wordLen = 0;

-+ wordCount = 0;

-+ nf = NumberFormat::createInstance(status);

-+ while (uc) {

-+ UnicodeString ucharValue;

-+ valueLen = 0;

-+ if (uc == 0x0023) { // #comment line, skip

-+ while (uc && !breaks->contains(uc)) {

-+ uc = *current++;

-+ }

-+ else{

-+ while (uc && !breaks->contains(uc)) {

-+ ++wordLen;

-+ uc = *current++;

-+ }

-+ if(uc == 0x0009){ //separator is a tab char, read in num after tab

-+ uc = *current++;

-+ while (uc && !breaks->contains(uc)) {

-+ ucharValue.append(uc);

-+ uc = *current++;

-+ }

-+ if (wordLen > 0) {

-+ Formattable value((int32_t)0);

-+ nf->parse(ucharValue.getTerminatedBuffer(), value, status);

-+ if(U_FAILURE(status)){

-+ errln("parsing of value failed when reading in dictionary\n");

-+ goto cleanup;

-+ }

-+ mutableDict->addWord(word, wordLen, status, value.getLong());

-+ if (U_FAILURE(status)) {

-+ errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ wordCount += 1;

-+ }

-+ // Find beginning of next line

-+ while (uc && breaks->contains(uc)) {

-+ uc = *current++;

-+ }

-+ word = current-1;

-+ wordLen = 0;

-+ }

-+ if (wordCount < 50) {

-+ errln("Word count (%d) unreasonably small\n", wordCount);

-+ goto cleanup;

-+ }

-+ enumer1 = mutableDict->openWords(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ testCount = 0;

-+ if (wordCount != (testCount = enumer1->count(status))) {

-+ errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",

-+ testCount, wordCount, u_errorName(status));

-+ goto cleanup;

-+ }

-+ // Now compact it

-+ compactDict = new CompactTrieDictionary(*mutableDict, status);

-+ if (U_FAILURE(status)) {

-+ errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ enumer2 = compactDict->openWords(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ //delete later

-+// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");

-+// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");

-+ enumer1->reset(status);

-+ enumer2->reset(status);

-+ originalWord = enumer1->snext(status);

-+ cloneWord = enumer2->snext(status);

-+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {

-+ if (*originalWord != *cloneWord) {

-+ errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",

-+ counter, originalWord->length(), cloneWord->length());

-+ goto cleanup;

-+ }

-+ // check if attached values of the same word in both dictionaries tally

-+#if 0

-+ int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];

-+ uint16_t values1[originalWord->length()], values2[cloneWord->length()];

-+#endif

-+ AutoBuffer<int32_t, 20> lengths1(originalWord->length());

-+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length());

-+ AutoBuffer<uint16_t, 20> values1(originalWord->length());

-+ AutoBuffer<uint16_t, 20> values2(cloneWord->length());

-+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status);

-+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);

-+ int count1, count2;

-+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());

-+ compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());

-+ if(values1[count1-1] != values2[count2-1]){

-+ errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",

-+ counter, values1[count1-1], values2[count2-1]);

-+ goto cleanup;

-+ }

-+ counter++;

-+ originalWord = enumer1->snext(status);

-+ cloneWord = enumer2->snext(status);

-+ }

-+ if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {

-+ errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");

-+ }

-+ delete enumer1;

-+ enumer1 = NULL;

-+ delete enumer2;

-+ enumer2 = NULL;

-+ // Now un-compact it

-+ mutable2 = compactDict->cloneMutable(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ cloneEnum = mutable2->openWords(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ if (wordCount != (testCount = cloneEnum->count(status))) {

-+ errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",

-+ testCount, wordCount, u_errorName(status));

-+ goto cleanup;

-+ }

-+ // Compare original dictionary to clone. Note that we can only compare the same kind of

-+ // dictionary as the order of the enumerators is not guaranteed to be the same between

-+ // different kinds

-+ enumer1 = mutableDict->openWords(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ counter = 0;

-+ originalWord = enumer1->snext(status);

-+ cloneWord = cloneEnum->snext(status);

-+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {

-+ if (*originalWord != *cloneWord) {

-+ errln("Original and cloned MutableTrieDictionary word mismatch\n");

-+ goto cleanup;

-+ }

-+ // check if attached values of the same word in both dictionaries tally

-+ AutoBuffer<int32_t, 20> lengths1(originalWord->length());

-+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length());

-+ AutoBuffer<uint16_t, 20> values1(originalWord->length());

-+ AutoBuffer<uint16_t, 20> values2(cloneWord->length());

-+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status);

-+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);

-+ int count1, count2;

-+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());

-+ mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());

-+ if(values1[count1-1] != values2[count2-1]){

-+ errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",

-+ counter, values1[count1-1], values2[count2-1]);

-+ goto cleanup;

-+ }

-+ counter++;

-+ originalWord = enumer1->snext(status);

-+ cloneWord = cloneEnum->snext(status);

-+ }

-+ if (U_FAILURE(status)) {

-+ errln("Enumeration failed: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ if (originalWord != cloneWord) {

-+ errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");

-+ goto cleanup;

-+ }

-+ // Test the data copying constructor for CompactTrieDict, and the data access APIs.

-+ compact2 = new CompactTrieDictionary(compactDict->data(), status);

-+ if (U_FAILURE(status)) {

-+ errln("CompactTrieDictionary(const void *,...) failed\n");

-+ goto cleanup;

-+ }

-+ if (compact2->dataSize() == 0) {

-+ errln("CompactTrieDictionary->dataSize() == 0\n");

-+ goto cleanup;

-+ }

-+ // Now count the words via the second dictionary

-+ delete enumer1;

-+ enumer1 = compact2->openWords(status);

-+ if (U_FAILURE(status)) {

-+ errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));

-+ goto cleanup;

-+ }

-+ if (wordCount != (testCount = enumer1->count(status))) {

-+ errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",

-+ testCount, wordCount, u_errorName(status));

-+ goto cleanup;

-+ }

-+ cleanup:

-+ delete compactDict;

-+ delete mutableDict;

-+ delete breaks;

-+ delete[] testFile;

-+ delete enumer1;

-+ delete mutable2;

-+ delete cloneEnum;

-+ delete compact2;

-+ utext_close(originalText);

-+ utext_close(cloneText);

-+}

- //----------------------------------------------------------------------------

- //

-@@ -1870,8 +2243,15 @@

- // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).

- static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"

- "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";

-+#if 0

- static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };

- static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };

-+#endif

-+// There's no separate Japanese word break iterator. Root is the same as Japanese.

-+// Our dictionary-based iterator has to be tweaked to better handle U+3005,

-+// U+3007, U+303B and some other cases.

-+static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

-+static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

- // UBreakIteratorType UBRK_SENTENCE, Locale "el"

- // Add break after Greek question mark (cldrbug #2069).

-@@ -2672,6 +3052,8 @@

- UnicodeSet *fNewlineSet;

- UnicodeSet *fKatakanaSet;

- UnicodeSet *fALetterSet;

-+ // TODO(jungshik): Do we still need this change?

-+ // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt

- UnicodeSet *fMidNumLetSet;

- UnicodeSet *fMidLetterSet;

- UnicodeSet *fMidNumSet;

-@@ -2680,6 +3062,7 @@

- UnicodeSet *fOtherSet;

- UnicodeSet *fExtendSet;

- UnicodeSet *fExtendNumLetSet;

-+ UnicodeSet *fDictionaryCjkSet;

- RegexMatcher *fMatcher;

-@@ -2696,12 +3079,24 @@

- fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);

- fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);

- fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);

-- fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

-+ fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);

-+ // Exclude Hangul syllables from ALetterSet during testing.

-+ // Leave CJK dictionary characters out from the monkey tests!

-+#if 0

-+ fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"

-+ "[\\p{Line_Break = Complex_Context}"

-+ "-\\p{Grapheme_Cluster_Break = Extend}"

-+ "-\\p{Grapheme_Cluster_Break = Control}"

-+ "]]",

-+ status);

-+#endif

-+ fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

-+ fALetterSet->removeAll(*fDictionaryCjkSet);

- fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);

- fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);

- fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);

- fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);

-- fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);

-+ fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);

- fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);

- fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);

- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);

-@@ -2725,13 +3120,14 @@

- fOtherSet->removeAll(*fFormatSet);

- fOtherSet->removeAll(*fExtendSet);

- // Inhibit dictionary characters from being tested at all.

-+ fOtherSet->removeAll(*fDictionaryCjkSet);

- fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));

- fSets->addElement(fCRSet, status);

- fSets->addElement(fLFSet, status);

- fSets->addElement(fNewlineSet, status);

- fSets->addElement(fALetterSet, status);

-- fSets->addElement(fKatakanaSet, status);

-+ //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana

- fSets->addElement(fMidLetterSet, status);

- fSets->addElement(fMidNumLetSet, status);

- fSets->addElement(fMidNumSet, status);

-@@ -3978,6 +4374,7 @@

- for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

- count --;

- if (forward[count] != i) {

-+ printStringBreaks(ustr, expected, expectedcount);

- test->errln("happy break test previous() failed: expected %d but got %d",

- forward[count], i);

- break;

-@@ -4011,23 +4408,25 @@

- UErrorCode status = U_ZERO_ERROR;

- // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);

- BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

-+ // Replaced any C+J characters in a row with a random sequence of characters

-+ // of the same length to make our C+J segmentation not get in the way.

- static const char *strlist[] =

- {

- "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",

-- "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",

-+ "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",

- "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",

- "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",

-- "\\u90ca\\u3588\\u009c\\u0953\\u194b",

-+ "\\uac00\\u3588\\u009c\\u0953\\u194b",

- "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

- "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",

-- "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",

-+ "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",

- "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

- "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

- "\\u2027\\U000e0067\\u0a47\\u00b7",

- "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

- "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

- "\\u0589\\U000e006e\\u0a42\\U000104a5",

-- "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

-+ "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",

- "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

- "\\u0027\\u11af\\U000e0057\\u0602",

- "\\U0001d7f2\\U000e007\\u0004\\u0589",

-@@ -4039,7 +4438,7 @@

- "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

- "\\u0233\\U000e0020\\u0a69\\u0d6a",

- "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

-- "\\u58f4\\U000e0049\\u20e7\\u2027",

-+ "\\u18f4\\U000e0049\\u20e7\\u2027",

- "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

- "\\ua183\\u102d\\u0bec\\u003a",

- "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

-@@ -4049,7 +4448,7 @@

- "\\U000e005d\\u2044\\u0731\\u0650\\u0061",

- "\\u003a\\u0664\\u00b7\\u1fba",

- "\\u003b\\u0027\\u00b7\\u47a3",

-- "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",

-+ "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",

- "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",

- "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

- };

-@@ -4104,12 +4503,12 @@

- "\\U0001d7f2\\U000e007d\\u0004\\u0589",

- "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

- "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

-- "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

-+ "\\U000e0065\\u302c\\u09ee\\U000e0068",

- "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

- "\\u0233\\U000e0020\\u0a69\\u0d6a",

- "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

- "\\u58f4\\U000e0049\\u20e7\\u2027",

-- "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

-+ "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

- "\\ua183\\u102d\\u0bec\\u003a",

- "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

- "\\u003a\\u0e57\\u0fad\\u002e",

---- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700

-+++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800

-@@ -70,6 +70,7 @@

- void TestBug5775();

- void TestThaiBreaks();

- void TestTailoredBreaks();

-+ void TestTrieDictWithValue();

- void TestDictRules();

- void TestBug5532();

---- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700

-+++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800

-@@ -161,7 +161,23 @@

- <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>

- # Hiragana & Katakana stay together, but separates from each other and Latin.

--<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>

-+# *** what to do about theoretical combos of chars? i.e. hiragana + accent

-+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>

-+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth

-+<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>

-+# more Japanese tests

-+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana

-+# and the Katakana block are not treated correctly. Enable this later.

-+#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

-+<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

-+# Testing of word boundary for dictionary word containing both kanji and kana

-+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>

-+# Testing of Chinese segmentation (taken from a Chinese news article)

-+<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•，•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>，•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>

- # Words with interior formatting characters

- <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>

-@@ -169,6 +185,8 @@

- # to test for bug #4097779

- <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>

-+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts

-+<data>•ＩＳＮ'Ｔ<200> •１９<100>日<400></data>

- # to test for bug #4098467

- # What follows is a string of Korean characters (I found it in the Yellow Pages

-@@ -178,9 +196,15 @@

- # precomposed syllables...

- <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>

--<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>

-+# more Korean tests (Jamo not tested here, not counted as dictionary characters)

-+# Disable them now because we don't include a Korean dictionary.

-+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>

-+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>

-+<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>

-+<data>•\u06c9<200>\uc799<200>\ufffa•</data>

--<data>•\u06c9\uc799\ufffa<200></data>

- #

- # Try some words from other scripts.

-@@ -491,8 +515,7 @@

- <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>

- # conjoining jamo...

--# TODO: rules update needed

--#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>

-+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>

- # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd

- <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>

---- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800

-+++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800

-@@ -28,7 +28,7 @@

- LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }

- // aliasing using position

-- boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle

-+ boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle

- // aliasing arrays

- zoneTests {

---- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700

-+++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800

-@@ -1,6 +1,6 @@

- /*

- **********************************************************************

- *

-@@ -34,12 +34,15 @@

- #include "unicode/udata.h"

- #include "unicode/putil.h"

-+//#include "unicode/ustdio.h"

- #include "uoptions.h"

- #include "unewdata.h"

- #include "ucmndata.h"

- #include "rbbidata.h"

- #include "triedict.h"

- #include "cmemory.h"

-+#include "uassert.h"

- #include <stdio.h>

- #include <stdlib.h>

-@@ -199,147 +202,191 @@

- long wordFileSize;

- FILE *file;

- char *wordBufferC;

-+ MutableTrieDictionary *mtd = NULL;

- file = fopen(wordFileName, "rb");

-- if( file == 0 ) {

-- fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);

-- exit(-1);

-- }

-- fseek(file, 0, SEEK_END);

-- wordFileSize = ftell(file);

-- fseek(file, 0, SEEK_SET);

-- wordBufferC = new char[wordFileSize+10];

-- result = (long)fread(wordBufferC, 1, wordFileSize, file);

-- if (result != wordFileSize) {

-- fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);

-- exit (-1);

-- }

-- wordBufferC[wordFileSize]=0;

-- fclose(file);

-- //

-- // Look for a Unicode Signature (BOM) on the word file

-- //

-- int32_t signatureLength;

-- const char * wordSourceC = wordBufferC;

-- const char* encoding = ucnv_detectUnicodeSignature(

-- wordSourceC, wordFileSize, &signatureLength, &status);

-- if (U_FAILURE(status)) {

-- exit(status);

-- }

-- if(encoding!=NULL ){

-- wordSourceC += signatureLength;

-- wordFileSize -= signatureLength;

-- }

-- //

-- // Open a converter to take the rule file to UTF-16

-- //

-- UConverter* conv;

-- conv = ucnv_open(encoding, &status);

-- if (U_FAILURE(status)) {

-- fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));

-- exit(status);

-- }

-- //

-- // Convert the words to UChar.

-- // Preflight first to determine required buffer size.

-- //

-- uint32_t destCap = ucnv_toUChars(conv,

-- NULL, // dest,

-- 0, // destCapacity,

-- wordSourceC,

-- wordFileSize,

-- &status);

-- if (status != U_BUFFER_OVERFLOW_ERROR) {

-- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

-- exit(status);

-- };

-- status = U_ZERO_ERROR;

-- UChar *wordSourceU = new UChar[destCap+1];

-- ucnv_toUChars(conv,

-- wordSourceU, // dest,

-- destCap+1,

-- wordSourceC,

-- wordFileSize,

-- &status);

-- if (U_FAILURE(status)) {

-- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

-- exit(status);

-- };

-- ucnv_close(conv);

-- // Get rid of the original file buffer

-- delete[] wordBufferC;

-- // Create a MutableTrieDictionary, and loop through all the lines, inserting

-- // words.

-- // First, pick a median character.

-- UChar *current = wordSourceU + (destCap/2);

-- UChar uc = *current++;

-- UnicodeSet breaks;

-- breaks.add(0x000A); // Line Feed

-- breaks.add(0x000D); // Carriage Return

-- breaks.add(0x2028); // Line Separator

-- breaks.add(0x2029); // Paragraph Separator

-- do {

-- // Look for line break

-- while (uc && !breaks.contains(uc)) {

-- uc = *current++;

-- }

-- // Now skip to first non-line-break

-- while (uc && breaks.contains(uc)) {

-- uc = *current++;

-+ if( file == 0 ) { //cannot find file

-+ //create 1-line dummy file: ie 1 char, 1 value

-+ UNewDataMemory *pData;

-+ char msg[1024];

-+ /* write message with just the name */

-+ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);

-+ fprintf(stderr, "%s\n", msg);

-+ UChar c = 0x0020;

-+ mtd = new MutableTrieDictionary(c, status, TRUE);

-+ mtd->addWord(&c, 1, status, 1);

-+ } else { //read words in from input file

-+ fseek(file, 0, SEEK_END);

-+ wordFileSize = ftell(file);

-+ fseek(file, 0, SEEK_SET);

-+ wordBufferC = new char[wordFileSize+10];

-+ result = (long)fread(wordBufferC, 1, wordFileSize, file);

-+ if (result != wordFileSize) {

-+ fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);

-+ exit (-1);

- }

-- }

-- while (uc && (breaks.contains(uc) || u_isspace(uc)));

-- MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);

-+ wordBufferC[wordFileSize]=0;

-+ fclose(file);

-- if (U_FAILURE(status)) {

-- fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));

-- exit(status);

-- }

-+ //

-+ // Look for a Unicode Signature (BOM) on the word file

-+ //

-+ int32_t signatureLength;

-+ const char * wordSourceC = wordBufferC;

-+ const char* encoding = ucnv_detectUnicodeSignature(

-+ wordSourceC, wordFileSize, &signatureLength, &status);

-+ if (U_FAILURE(status)) {

-+ exit(status);

-+ }

-+ if(encoding!=NULL ){

-+ wordSourceC += signatureLength;

-+ wordFileSize -= signatureLength;

-+ }

-- // Now add the words. Words are non-space characters at the beginning of

-- // lines, and must be at least one UChar.

-- current = wordSourceU;

-- UChar *candidate = current;

-- uc = *current++;

-- int32_t length = 0;

-- while (uc) {

-- while (uc && !u_isspace(uc)) {

-- ++length;

-- uc = *current++;

-+ //

-+ // Open a converter to take the rule file to UTF-16

-+ //

-+ UConverter* conv;

-+ conv = ucnv_open(encoding, &status);

-+ if (U_FAILURE(status)) {

-+ fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));

-+ exit(status);

- }

-- if (length > 0) {

-- mtd->addWord(candidate, length, status);

-- if (U_FAILURE(status)) {

-- fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",

-- u_errorName(status));

-- exit(status);

-+ //

-+ // Convert the words to UChar.

-+ // Preflight first to determine required buffer size.

-+ //

-+ uint32_t destCap = ucnv_toUChars(conv,

-+ NULL, // dest,

-+ 0, // destCapacity,

-+ wordSourceC,

-+ wordFileSize,

-+ &status);

-+ if (status != U_BUFFER_OVERFLOW_ERROR) {

-+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

-+ exit(status);

-+ };

-+ status = U_ZERO_ERROR;

-+ UChar *wordSourceU = new UChar[destCap+1];

-+ ucnv_toUChars(conv,

-+ wordSourceU, // dest,

-+ destCap+1,

-+ wordSourceC,

-+ wordFileSize,

-+ &status);

-+ if (U_FAILURE(status)) {

-+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

-+ exit(status);

-+ };

-+ ucnv_close(conv);

-+ // Get rid of the original file buffer

-+ delete[] wordBufferC;

-+ // Create a MutableTrieDictionary, and loop through all the lines, inserting

-+ // words.

-+ // First, pick a median character.

-+ UChar *current = wordSourceU + (destCap/2);

-+ UChar uc = *current++;

-+ UnicodeSet breaks;

-+ breaks.add(0x000A); // Line Feed

-+ breaks.add(0x000D); // Carriage Return

-+ breaks.add(0x2028); // Line Separator

-+ breaks.add(0x2029); // Paragraph Separator

-+ do {

-+ // Look for line break

-+ while (uc && !breaks.contains(uc)) {

-+ uc = *current++;

-+ }

-+ // Now skip to first non-line-break

-+ while (uc && breaks.contains(uc)) {

-+ uc = *current++;

- }

-- // Find beginning of next line

-- while (uc && !breaks.contains(uc)) {

-- uc = *current++;

-+ while (uc && (breaks.contains(uc) || u_isspace(uc)));

-+ mtd = new MutableTrieDictionary(uc, status);

-+ if (U_FAILURE(status)) {

-+ fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));

-+ exit(status);

- }

-- while (uc && breaks.contains(uc)) {

-- uc = *current++;

-+ // Now add the words. Words are non-space characters at the beginning of

-+ // lines, and must be at least one UChar. If a word has an associated value,

-+ // the value should follow the word on the same line after a tab character.

-+ current = wordSourceU;

-+ UChar *candidate = current;

-+ uc = *current++;

-+ int32_t length = 0;

-+ int count = 0;

-+ while (uc) {

-+ while (uc && !u_isspace(uc)) {

-+ ++length;

-+ uc = *current++;

-+ }

-+ UnicodeString valueString;

-+ UChar candidateValue;

-+ if(uc == 0x0009){ //separator is a tab char, read in number after space

-+ while (uc && u_isspace(uc)) {

-+ uc = *current++;

-+ }

-+ while (uc && !u_isspace(uc)) {

-+ valueString.append(uc);

-+ uc = *current++;

-+ }

-+ if (length > 0) {

-+ count++;

-+ if(valueString.length() > 0){

-+ mtd->setValued(TRUE);

-+ uint32_t value = 0;

-+ char* s = new char[valueString.length()];

-+ valueString.extract(0,valueString.length(), s, valueString.length());

-+ int n = sscanf(s, "%ud", &value);

-+ U_ASSERT(n == 1);

-+ U_ASSERT(value >= 0);

-+ mtd->addWord(candidate, length, status, (uint16_t)value);

-+ delete[] s;

-+ } else {

-+ mtd->addWord(candidate, length, status);

-+ }

-+ if (U_FAILURE(status)) {

-+ fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",

-+ u_errorName(status), count);

-+ exit(status);

-+ }

-+ // Find beginning of next line

-+ while (uc && !breaks.contains(uc)) {

-+ uc = *current++;

-+ }

-+ // Find next non-line-breaking character

-+ while (uc && breaks.contains(uc)) {

-+ uc = *current++;

-+ }

-+ candidate = current-1;

-+ length = 0;

- }

-- candidate = current-1;

-- length = 0;

-+ // Get rid of the Unicode text buffer

-+ delete[] wordSourceU;

- }

-- // Get rid of the Unicode text buffer

-- delete[] wordSourceU;

- // Now, create a CompactTrieDictionary from the mutable dictionary

- CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);

- if (U_FAILURE(status)) {

-@@ -393,4 +440,3 @@

- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

- }

---- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800

-+++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800

-@@ -23,13 +23,13 @@

- ## Extra files to remove for 'make clean'

- CLEANFILES = *~ $(DEPS) $(MAN_FILES)

--## Target information

-+## Target informationcd

- TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)

- ifneq ($(top_builddir),$(top_srcdir))

- CPPFLAGS += -I$(top_builddir)/common

- endif

--CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil

-+CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n

- LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

- OBJECTS = genctd.o

« no previous file with comments | « icu52/patches/search_collation.patch ('k') | icu52/patches/si_value.undef.patch » ('j') | no next file with comments »