Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(194)

Side by Side Diff: icu52/patches/segmentation.patch

Issue 224943002: icu local change part1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/
Patch Set: function indentation changed Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu52/patches/search_collation.patch ('k') | icu52/patches/si_value.undef.patch » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 --- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800
2 +++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800
3 @@ -226,6 +226,30 @@
4 case USCRIPT_THAI:
5 engine = new ThaiBreakEngine(dict, status);
6 break;
7 +
8 + case USCRIPT_HANGUL:
9 + engine = new CjkBreakEngine(dict, kKorean, status);
10 + break;
11 +
12 + // use same BreakEngine and dictionary for both Chinese and Japanese
13 + case USCRIPT_HIRAGANA:
14 + case USCRIPT_KATAKANA:
15 + case USCRIPT_HAN:
16 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);
17 + break;
18 +#if 0
19 + // TODO: Have to get some characters with script=common handled
20 + // by CjkBreakEngine (e.g. U+309B). Simply subjecting
21 + // them to CjkBreakEngine does not work. The engine has to
22 + // special-case them.
23 + case USCRIPT_COMMON:
24 + {
25 + UBlockCode block = ublock_getCode(code);
26 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);
28 + break;
29 + }
30 +#endif
31 default:
32 break;
33 }
34 @@ -281,6 +305,13 @@
35 dict = NULL;
36 }
37 return dict;
38 + } else if (dictfname != NULL){
39 + //create dummy dict if dictionary filename not valid
40 + UChar c = 0x0020;
41 + status = U_ZERO_ERROR;
42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
43 + mtd->addWord(&c, 1, status, 1);
44 + return new CompactTrieDictionary(*mtd, status);
45 }
46 return NULL;
47 }
48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700
49 +++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800
50 @@ -16,6 +16,9 @@
51 #include "unicode/ubrk.h"
52 #include "uvector.h"
53 #include "triedict.h"
54 +#include "uassert.h"
55 +#include "unicode/normlzr.h"
56 +#include "cmemory.h"
57
58 U_NAMESPACE_BEGIN
59
60 @@ -422,6 +425,294 @@
61 return wordsFound;
62 }
63
64 +/*
65 + ******************************************************************
66 + * CjkBreakEngine
67 + */
68 +static const uint32_t kuint32max = 0xFFFFFFFF;
69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
71 + if (!adoptDictionary->getValued()) {
72 + status = U_ILLEGAL_ARGUMENT_ERROR;
73 + return;
74 + }
75 +
76 + // Korean dictionary only includes Hangul syllables
77 + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
78 + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
79 + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
80 + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
81 +
82 + if (U_SUCCESS(status)) {
83 + // handle Korean and Japanese/Chinese using different dictionaries
84 + if (type == kKorean) {
85 + setCharacters(fHangulWordSet);
86 + } else { //Chinese and Japanese
87 + UnicodeSet cjSet;
88 + cjSet.addAll(fHanWordSet);
89 + cjSet.addAll(fKatakanaWordSet);
90 + cjSet.addAll(fHiraganaWordSet);
91 + cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));
92 + setCharacters(cjSet);
93 + }
94 + }
95 +}
96 +
97 +CjkBreakEngine::~CjkBreakEngine(){
98 + delete fDictionary;
99 +}
100 +
101 +// The katakanaCost values below are based on the length frequencies of all
102 +// katakana phrases in the dictionary
103 +static const int kMaxKatakanaLength = 8;
104 +static const int kMaxKatakanaGroupLength = 20;
105 +static const uint32_t maxSnlp = 255;
106 +
107 +static inline uint32_t getKatakanaCost(int wordLength){
108 + //TODO: fill array with actual values from dictionary!
109 + static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
110 + = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
111 + return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
112 +}
113 +
114 +static inline bool isKatakana(uint16_t value) {
115 + return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
116 + (value >= 0xFF66u && value <= 0xFF9fu);
117 +}
118 +
119 +// A very simple helper class to streamline the buffer handling in
120 +// divideUpDictionaryRange.
121 +template<class T, size_t N>
122 +class AutoBuffer {
123 + public:
124 + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
125 + if (size > N) {
126 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
127 + capacity = size;
128 + }
129 + }
130 + ~AutoBuffer() {
131 + if (buffer != stackBuffer)
132 + uprv_free(buffer);
133 + }
134 +#if 0
135 + T* operator& () {
136 + return buffer;
137 + }
138 +#endif
139 + T* elems() {
140 + return buffer;
141 + }
142 + const T& operator[] (size_t i) const {
143 + return buffer[i];
144 + }
145 + T& operator[] (size_t i) {
146 + return buffer[i];
147 + }
148 +
149 + // resize without copy
150 + void resize(size_t size) {
151 + if (size <= capacity)
152 + return;
153 + if (buffer != stackBuffer)
154 + uprv_free(buffer);
155 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
156 + capacity = size;
157 + }
158 + private:
159 + T stackBuffer[N];
160 + T* buffer;
161 + AutoBuffer();
162 + size_t capacity;
163 +};
164 +
165 +
166 +/*
167 + * @param text A UText representing the text
168 + * @param rangeStart The start of the range of dictionary characters
169 + * @param rangeEnd The end of the range of dictionary characters
170 + * @param foundBreaks Output of C array of int32_t break positions, or 0
171 + * @return The number of breaks found
172 + */
173 +int32_t
174 +CjkBreakEngine::divideUpDictionaryRange( UText *text,
175 + int32_t rangeStart,
176 + int32_t rangeEnd,
177 + UStack &foundBreaks ) const {
178 + if (rangeStart >= rangeEnd) {
179 + return 0;
180 + }
181 +
182 + const size_t defaultInputLength = 80;
183 + size_t inputLength = rangeEnd - rangeStart;
184 + AutoBuffer<UChar, defaultInputLength> charString(inputLength);
185 +
186 + // Normalize the input string and put it in normalizedText.
187 + // The map from the indices of the normalized input to the raw
188 + // input is kept in charPositions.
189 + UErrorCode status = U_ZERO_ERROR;
190 + utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
191 + if (U_FAILURE(status))
192 + return 0;
193 +
194 + UnicodeString inputString(charString.elems(), inputLength);
195 + UNormalizationMode norm_mode = UNORM_NFKC;
196 + UBool isNormalized =
197 + Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
198 + Normalizer::isNormalized(inputString, norm_mode, status);
199 +
200 + AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
201 + int numChars = 0;
202 + UText normalizedText = UTEXT_INITIALIZER;
203 + // Needs to be declared here because normalizedText holds onto its buffer.
204 + UnicodeString normalizedString;
205 + if (isNormalized) {
206 + int32_t index = 0;
207 + charPositions[0] = 0;
208 + while(index < inputString.length()) {
209 + index = inputString.moveIndex32(index, 1);
210 + charPositions[++numChars] = index;
211 + }
212 + utext_openUnicodeString(&normalizedText, &inputString, &status);
213 + }
214 + else {
215 + Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
216 + if (U_FAILURE(status))
217 + return 0;
218 + charPositions.resize(normalizedString.length() + 1);
219 + Normalizer normalizer(charString.elems(), inputLength, norm_mode);
220 + int32_t index = 0;
221 + charPositions[0] = 0;
222 + while(index < normalizer.endIndex()){
223 + UChar32 uc = normalizer.next();
224 + charPositions[++numChars] = index = normalizer.getIndex();
225 + }
226 + utext_openUnicodeString(&normalizedText, &normalizedString, &status);
227 + }
228 +
229 + if (U_FAILURE(status))
230 + return 0;
231 +
232 + // From this point on, all the indices refer to the indices of
233 + // the normalized input string.
234 +
235 + // bestSnlp[i] is the snlp of the best segmentation of the first i
236 + // characters in the range to be matched.
237 + AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
238 + bestSnlp[0] = 0;
239 + for(int i=1; i<=numChars; i++){
240 + bestSnlp[i] = kuint32max;
241 + }
242 +
243 + // prev[i] is the index of the last CJK character in the previous word in
244 + // the best segmentation of the first i characters.
245 + AutoBuffer<int, defaultInputLength> prev(numChars + 1);
246 + for(int i=0; i<=numChars; i++){
247 + prev[i] = -1;
248 + }
249 +
250 + const size_t maxWordSize = 20;
251 + AutoBuffer<uint16_t, maxWordSize> values(numChars);
252 + AutoBuffer<int32_t, maxWordSize> lengths(numChars);
253 +
254 + // Dynamic programming to find the best segmentation.
255 + bool is_prev_katakana = false;
256 + for (int i = 0; i < numChars; ++i) {
257 + //utext_setNativeIndex(text, rangeStart + i);
258 + utext_setNativeIndex(&normalizedText, i);
259 + if (bestSnlp[i] == kuint32max)
260 + continue;
261 +
262 + int count;
263 + // limit maximum word length matched to size of current substring
264 + int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;
265 +
266 + fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
267 +
268 + // if there are no single character matches found in the dictionary
269 + // starting with this character, treat character as a 1-character word
270 + // with the highest value possible, i.e. the least likely to occur.
271 + // Exclude Korean characters from this treatment, as they should be left
272 + // together by default.
273 + if((count == 0 || lengths[0] != 1) &&
274 + !fHangulWordSet.contains(utext_current32(&normalizedText))){
275 + values[count] = maxSnlp;
276 + lengths[count++] = 1;
277 + }
278 +
279 + for (int j = 0; j < count; j++){
280 + //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
281 + uint32_t newSnlp = bestSnlp[i] + values[j];
282 + if (newSnlp < bestSnlp[lengths[j] + i]) {
283 + bestSnlp[lengths[j] + i] = newSnlp;
284 + prev[lengths[j] + i] = i;
285 + }
286 + }
287 +
288 + // In Japanese,
289 + // Katakana word in single character is pretty rare. So we apply
290 + // the following heuristic to Katakana: any continuous run of Katakana
291 + // characters is considered a candidate word with a default cost
292 + // specified in the katakanaCost table according to its length.
293 + //utext_setNativeIndex(text, rangeStart + i);
294 + utext_setNativeIndex(&normalizedText, i);
295 + bool is_katakana = isKatakana(utext_current32(&normalizedText));
296 + if (!is_prev_katakana && is_katakana) {
297 + int j = i + 1;
298 + utext_next32(&normalizedText);
299 + // Find the end of the continuous run of Katakana characters
300 + while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
301 + isKatakana(utext_current32(&normalizedText))) {
302 + utext_next32(&normalizedText);
303 + ++j;
304 + }
305 + if ((j - i) < kMaxKatakanaGroupLength) {
306 + uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
307 + if (newSnlp < bestSnlp[j]) {
308 + bestSnlp[j] = newSnlp;
309 + prev[j] = i;
310 + }
311 + }
312 + }
313 + is_prev_katakana = is_katakana;
314 + }
315 +
316 + // Start pushing the optimal offset index into t_boundary (t for tentative).
317 + // prev[numChars] is guaranteed to be meaningful.
318 + // We'll first push in the reverse order, i.e.,
319 + // t_boundary[0] = numChars, and afterwards do a swap.
320 + AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
321 +
322 + int numBreaks = 0;
323 + // No segmentation found, set boundary to end of range
324 + if (bestSnlp[numChars] == kuint32max) {
325 + t_boundary[numBreaks++] = numChars;
326 + } else {
327 + for (int i = numChars; i > 0; i = prev[i]){
328 + t_boundary[numBreaks++] = i;
329 +
330 + }
331 + U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
332 + }
333 +
334 + // Reverse offset index in t_boundary.
335 + // Don't add a break for the start of the dictionary range if there is one
336 + // there already.
337 + if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
338 + t_boundary[numBreaks++] = 0;
339 + }
340 +
341 + // Now that we're done, convert positions in t_bdry[] (indices in
342 + // the normalized input string) back to indices in the raw input string
343 + // while reversing t_bdry and pushing values to foundBreaks.
344 + for (int i = numBreaks-1; i >= 0; i--) {
345 + foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
346 + }
347 +
348 + utext_close(&normalizedText);
349 + return numBreaks;
350 +}
351 +
352 U_NAMESPACE_END
353
354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700
356 +++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800
357 @@ -1,8 +1,8 @@
358 /**
359 - *******************************************************************************
360 - * Copyright (C) 2006, International Business Machines Corporation and others. *
361 - * All Rights Reserved. *
362 - ****************************************************************************** *
363 + **********************************************************************************
364 + * Copyright (C) 2006-2010, International Business Machines Corporation and others.
365 + * All Rights Reserved.
366 + **********************************************************************************
367 */
368
369 #ifndef DICTBE_H
370 @@ -65,31 +65,31 @@
371 */
372 virtual ~DictionaryBreakEngine();
373
374 - /**
375 - * <p>Indicate whether this engine handles a particular character for
376 - * a particular kind of break.</p>
377 - *
378 - * @param c A character which begins a run that the engine might handle
379 - * @param breakType The type of text break which the caller wants to determine
380 - * @return TRUE if this engine handles the particular character and break
381 - * type.
382 - */
383 + /**
384 + * <p>Indicate whether this engine handles a particular character for
385 + * a particular kind of break.</p>
386 + *
387 + * @param c A character which begins a run that the engine might handle
388 + * @param breakType The type of text break which the caller wants to determine
389 + * @return TRUE if this engine handles the particular character and break
390 + * type.
391 + */
392 virtual UBool handles( UChar32 c, int32_t breakType ) const;
393
394 - /**
395 - * <p>Find any breaks within a run in the supplied text.</p>
396 - *
397 - * @param text A UText representing the text. The
398 - * iterator is left at the end of the run of characters which the engine
399 - * is capable of handling.
400 - * @param startPos The start of the run within the supplied text.
401 - * @param endPos The end of the run within the supplied text.
402 - * @param reverse Whether the caller is looking for breaks in a reverse
403 - * direction.
404 - * @param breakType The type of break desired, or -1.
405 - * @param foundBreaks An allocated C array of the breaks found, if any
406 - * @return The number of breaks found.
407 - */
408 + /**
409 + * <p>Find any breaks within a run in the supplied text.</p>
410 + *
411 + * @param text A UText representing the text. The iterator is left at
412 + * the end of the run of characters which the engine is capable of handling
413 + * that starts from the first (or last) character in the range.
414 + * @param startPos The start of the run within the supplied text.
415 + * @param endPos The end of the run within the supplied text.
416 + * @param reverse Whether the caller is looking for breaks in a reverse
417 + * direction.
418 + * @param breakType The type of break desired, or -1.
419 + * @param foundBreaks An allocated C array of the breaks found, if any
420 + * @return The number of breaks found.
421 + */
422 virtual int32_t findBreaks( UText *text,
423 int32_t startPos,
424 int32_t endPos,
425 @@ -114,7 +114,7 @@
426 // virtual void setBreakTypes( uint32_t breakTypes );
427
428 /**
429 - * <p>Divide up a range of known dictionary characters.</p>
430 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
431 *
432 * @param text A UText representing the text
433 * @param rangeStart The start of the range of dictionary characters
434 @@ -171,7 +171,7 @@
435
436 protected:
437 /**
438 - * <p>Divide up a range of known dictionary characters.</p>
439 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
440 *
441 * @param text A UText representing the text
442 * @param rangeStart The start of the range of dictionary characters
443 @@ -186,6 +186,66 @@
444
445 };
446
447 +/*******************************************************************
448 + * CjkBreakEngine
449 + */
450 +
451 +//indicates language/script that the CjkBreakEngine will handle
452 +enum LanguageType {
453 + kKorean,
454 + kChineseJapanese
455 +};
456 +
457 +/**
458 + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
459 + * TrieWordDictionary with costs associated with each word and
460 + * Viterbi decoding to determine CJK-specific breaks.</p>
461 + */
462 +class CjkBreakEngine : public DictionaryBreakEngine {
463 + protected:
464 + /**
465 + * The set of characters handled by this engine
466 + * @internal
467 + */
468 + UnicodeSet fHangulWordSet;
469 + UnicodeSet fHanWordSet;
470 + UnicodeSet fKatakanaWordSet;
471 + UnicodeSet fHiraganaWordSet;
472 +
473 + const TrieWordDictionary *fDictionary;
474 +
475 + public:
476 +
477 + /**
478 + * <p>Default constructor.</p>
479 + *
480 + * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
481 + * engine is deleted. The TrieWordDictionary must contain costs for each word
482 + * in order for the dictionary to work properly.
483 + */
484 + CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);
485 +
486 + /**
487 + * <p>Virtual destructor.</p>
488 + */
489 + virtual ~CjkBreakEngine();
490 +
491 + protected:
492 + /**
493 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
494 + *
495 + * @param text A UText representing the text
496 + * @param rangeStart The start of the range of dictionary characters
497 + * @param rangeEnd The end of the range of dictionary characters
498 + * @param foundBreaks Output of C array of int32_t break positions, or 0
499 + * @return The number of breaks found
500 + */
501 + virtual int32_t divideUpDictionaryRange( UText *text,
502 + int32_t rangeStart,
503 + int32_t rangeEnd,
504 + UStack &foundBreaks ) const;
505 +
506 +};
507
508 U_NAMESPACE_END
509
510 --- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700
511 +++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800
512 @@ -1555,10 +1555,12 @@
513 int32_t endPos,
514 UBool reverse) {
515 // Reset the old break cache first.
516 - uint32_t dictionaryCount = fDictionaryCharCount;
517 reset();
518
519 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
520 + // note: code segment below assumes that dictionary chars are in the
521 + // startPos-endPos range
522 + // value returned should be next character in sequence
523 + if ((endPos - startPos) <= 1) {
524 return (reverse ? startPos : endPos);
525 }
526
527 @@ -1711,7 +1713,7 @@
528 // proposed break by one of the breaks we found. Use following() and
529 // preceding() to do the work. They should never recurse in this case.
530 if (reverse) {
531 - return preceding(endPos - 1);
532 + return preceding(endPos);
533 }
534 else {
535 return following(startPos);
536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800
537 +++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800
538 @@ -20,6 +20,7 @@
539 #include "uvector.h"
540 #include "uvectr32.h"
541 #include "uarrsort.h"
542 +#include "hash.h"
543
544 //#define DEBUG_TRIE_DICT 1
545
546 @@ -27,6 +28,11 @@
547 #include <sys/times.h>
548 #include <limits.h>
549 #include <stdio.h>
550 +#include <time.h>
551 +#ifndef CLK_TCK
552 +#define CLK_TCK CLOCKS_PER_SEC
553 +#endif
554 +
555 #endif
556
557 U_NAMESPACE_BEGIN
558 @@ -45,6 +51,11 @@
559 * MutableTrieDictionary
560 */
561
562 +//#define MAX_VALUE 65535
563 +
564 +// forward declaration
565 +inline uint16_t scaleLogProbabilities(double logprob);
566 +
567 // Node structure for the ternary, uncompressed trie
568 struct TernaryNode : public UMemory {
569 UChar ch; // UTF-16 code unit
570 @@ -77,7 +88,8 @@
571 delete high;
572 }
573
574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
576 + UBool containsValue /* = FALSE */ ) {
577 // Start the trie off with something. Having the root node already present
578 // cuts a special case out of the search/insertion functions.
579 // Making it a median character cuts the worse case for searches from
580 @@ -91,14 +103,19 @@
581 if (U_SUCCESS(status) && fIter == NULL) {
582 status = U_MEMORY_ALLOCATION_ERROR;
583 }
584 +
585 + fValued = containsValue;
586 }
587
588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
590 + UBool containsValue /* = false */ ) {
591 fTrie = NULL;
592 fIter = utext_openUChars(NULL, NULL, 0, &status);
593 if (U_SUCCESS(status) && fIter == NULL) {
594 status = U_MEMORY_ALLOCATION_ERROR;
595 }
596 +
597 + fValued = containsValue;
598 }
599
600 MutableTrieDictionary::~MutableTrieDictionary() {
601 @@ -108,12 +125,13 @@
602
603 int32_t
604 MutableTrieDictionary::search( UText *text,
605 - int32_t maxLength,
606 - int32_t *lengths,
607 - int &count,
608 - int limit,
609 - TernaryNode *&parent,
610 - UBool &pMatched ) const {
611 + int32_t maxLength,
612 + int32_t *lengths,
613 + int &count,
614 + int limit,
615 + TernaryNode *&parent,
616 + UBool &pMatched,
617 + uint16_t *values /*=NULL*/) const {
618 // TODO: current implementation works in UTF-16 space
619 const TernaryNode *up = NULL;
620 const TernaryNode *p = fTrie;
621 @@ -121,6 +139,10 @@
622 pMatched = TRUE;
623 int i;
624
625 + if (!fValued) {
626 + values = NULL;
627 + }
628 +
629 UChar uc = utext_current32(text);
630 for (i = 0; i < maxLength && p != NULL; ++i) {
631 while (p != NULL) {
632 @@ -141,7 +163,11 @@
633 break;
634 }
635 // Must be equal to get here
636 - if (limit > 0 && (p->flags & kEndsWord)) {
637 + if (limit > 0 && (p->flags > 0)) {
638 + //is there a more efficient way to add values? ie. remove if stmt
639 + if(values != NULL) {
640 + values[mycount] = p->flags;
641 + }
642 lengths[mycount++] = i+1;
643 --limit;
644 }
645 @@ -161,13 +187,14 @@
646 void
647 MutableTrieDictionary::addWord( const UChar *word,
648 int32_t length,
649 - UErrorCode &status ) {
650 -#if 0
651 - if (length <= 0) {
652 + UErrorCode &status,
653 + uint16_t value /* = 0 */ ) {
654 + // dictionary cannot store zero values, would interfere with flags
655 + if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
656 status = U_ILLEGAL_ARGUMENT_ERROR;
657 return;
658 }
659 -#endif
660 +
661 TernaryNode *parent;
662 UBool pMatched;
663 int count;
664 @@ -177,7 +204,7 @@
665 matched = search(fIter, length, NULL, count, 0, parent, pMatched);
666
667 while (matched++ < length) {
668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?
669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
670 U_ASSERT(uc != U_SENTINEL);
671 TernaryNode *newNode = new TernaryNode(uc);
672 if (newNode == NULL) {
673 @@ -199,30 +226,23 @@
674 parent = newNode;
675 }
676
677 - parent->flags |= kEndsWord;
678 -}
679 -
680 -#if 0
681 -void
682 -MutableTrieDictionary::addWords( UEnumeration *words,
683 - UErrorCode &status ) {
684 - int32_t length;
685 - const UChar *word;
686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
687 - addWord(word, length, status);
688 + if(fValued && value > 0){
689 + parent->flags = value;
690 + } else {
691 + parent->flags |= kEndsWord;
692 }
693 }
694 -#endif
695
696 int32_t
697 MutableTrieDictionary::matches( UText *text,
698 int32_t maxLength,
699 int32_t *lengths,
700 int &count,
701 - int limit ) const {
702 + int limit,
703 + uint16_t *values /*=NULL*/) const {
704 TernaryNode *parent;
705 UBool pMatched;
706 - return search(text, maxLength, lengths, count, limit, parent, pMatched);
707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
708 }
709
710 // Implementation of iteration for MutableTrieDictionary
711 @@ -277,7 +297,7 @@
712 break;
713 }
714 case kEqual:
715 - emit = (node->flags & kEndsWord) != 0;
716 + emit = node->flags > 0;
717 equal = (node->equal != NULL);
718 // If this node should be part of the next emitted string, append
719 // the UChar to the string, and make sure we pop it when we come
720 @@ -299,7 +319,7 @@
721 }
722 case kGreaterThan:
723 // If this node's character is in the string, remove it.
724 - if (node->equal != NULL || (node->flags & kEndsWord)) {
725 + if (node->equal != NULL || node->flags > 0) {
726 unistr.truncate(unistr.length()-1);
727 }
728 if (node->high != NULL) {
729 @@ -354,12 +374,75 @@
730 * CompactTrieDictionary
731 */
732
733 +//TODO further optimization:
734 +// minimise size of trie with logprobs by storing values
735 +// for terminal nodes directly in offsets[]
736 +// --> calculating from next offset *might* be simpler, but would have to add
737 +// one last offset for logprob of last node
738 +// --> if calculate from current offset, need to factor in possible overflow
739 +// as well.
740 +// idea: store in offset, set first bit to indicate logprob storage-->won't
741 +// have to access additional node
742 +
743 +// {'Dic', 1}, version 1: uses old header, no values
744 +#define COMPACT_TRIE_MAGIC_1 0x44696301
745 +// version 2: uses new header (more than 2^16 nodes), no values
746 +#define COMPACT_TRIE_MAGIC_2 0x44696302
747 +// version 3: uses new header, includes values
748 +#define COMPACT_TRIE_MAGIC_3 0x44696303
749 +
750 struct CompactTrieHeader {
751 uint32_t size; // Size of the data in bytes
752 uint32_t magic; // Magic number (including version)
753 + uint32_t nodeCount; // Number of entries in offsets[]
754 + uint32_t root; // Node number of the root node
755 + uint32_t offsets[1]; // Offsets to nodes from start of data
756 +};
757 +
758 +// old version of CompactTrieHeader kept for backwards compatibility
759 +struct CompactTrieHeaderV1 {
760 + uint32_t size; // Size of the data in bytes
761 + uint32_t magic; // Magic number (including version)
762 uint16_t nodeCount; // Number of entries in offsets[]
763 uint16_t root; // Node number of the root node
764 - uint32_t offsets[1]; // Offsets to nodes from start of data
765 + uint32_t offsets[1]; // Offsets to nodes from start of data
766 +};
767 +
768 +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
769 +struct CompactTrieInfo {
770 + uint32_t size; // Size of the data in bytes
771 + uint32_t magic; // Magic number (including version)
772 + uint32_t nodeCount; // Number of entries in offsets[]
773 + uint32_t root; // Node number of the root node
774 + uint32_t *offsets; // Offsets to nodes from start of data
775 + uint8_t *address; // pointer to header bytes in memory
776 +
777 + CompactTrieInfo(const void *data, UErrorCode &status){
778 + CompactTrieHeader *header = (CompactTrieHeader *) data;
779 + if (header->magic != COMPACT_TRIE_MAGIC_1 &&
780 + header->magic != COMPACT_TRIE_MAGIC_2 &&
781 + header->magic != COMPACT_TRIE_MAGIC_3) {
782 + status = U_ILLEGAL_ARGUMENT_ERROR;
783 + } else {
784 + size = header->size;
785 + magic = header->magic;
786 +
787 + if (header->magic == COMPACT_TRIE_MAGIC_1) {
788 + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
789 + nodeCount = headerV1->nodeCount;
790 + root = headerV1->root;
791 + offsets = &(headerV1->offsets[0]);
792 + address = (uint8_t *)headerV1;
793 + } else {
794 + nodeCount = header->nodeCount;
795 + root = header->root;
796 + offsets = &(header->offsets[0]);
797 + address = (uint8_t *)header;
798 + }
799 + }
800 + }
801 +
802 + ~CompactTrieInfo(){}
803 };
804
805 // Note that to avoid platform-specific alignment issues, all members of the node
806 @@ -375,10 +458,14 @@
807 enum CompactTrieNodeFlags {
808 kVerticalNode = 0x1000, // This is a vertical node
809 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
810 - kReservedFlag1 = 0x4000,
811 - kReservedFlag2 = 0x8000,
812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1
813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2
814 kCountMask = 0x0FFF, // The count portion of flagscount
815 - kFlagMask = 0xF000 // The flags portion of flagscount
816 + kFlagMask = 0xF000, // The flags portion of flagscount
817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the root node
818 +
819 + //offset flags:
820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node
821 };
822
823 // The two node types are distinguished by the kVerticalNode flag.
824 @@ -402,63 +489,177 @@
825 uint16_t chars[1]; // Code units
826 };
827
828 -// {'Dic', 1}, version 1
829 -#define COMPACT_TRIE_MAGIC_1 0x44696301
830 -
831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
832 UErrorCode &status )
833 : fUData(dataObj)
834 {
835 - fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
836 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
837 + *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
838 fOwnData = FALSE;
839 - if (fData->magic != COMPACT_TRIE_MAGIC_1) {
840 - status = U_ILLEGAL_ARGUMENT_ERROR;
841 - fData = NULL;
842 - }
843 }
844 +
845 CompactTrieDictionary::CompactTrieDictionary( const void *data,
846 UErrorCode &status )
847 : fUData(NULL)
848 {
849 - fData = (const CompactTrieHeader *) data;
850 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
851 + *fInfo = CompactTrieInfo(data, status);
852 fOwnData = FALSE;
853 - if (fData->magic != COMPACT_TRIE_MAGIC_1) {
854 - status = U_ILLEGAL_ARGUMENT_ERROR;
855 - fData = NULL;
856 - }
857 }
858
859 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
860 UErrorCode &status )
861 : fUData(NULL)
862 {
863 - fData = compactMutableTrieDictionary(dict, status);
864 + const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
865 + if (U_SUCCESS(status)) {
866 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
867 + *fInfo = CompactTrieInfo(header, status);
868 + }
869 +
870 fOwnData = !U_FAILURE(status);
871 }
872
873 CompactTrieDictionary::~CompactTrieDictionary() {
874 if (fOwnData) {
875 - uprv_free((void *)fData);
876 + uprv_free((void *)(fInfo->address));
877 }
878 + uprv_free((void *)fInfo);
879 +
880 if (fUData) {
881 udata_close(fUData);
882 }
883 }
884
885 +UBool CompactTrieDictionary::getValued() const{
886 + return fInfo->magic == COMPACT_TRIE_MAGIC_3;
887 +}
888 +
889 uint32_t
890 CompactTrieDictionary::dataSize() const {
891 - return fData->size;
892 + return fInfo->size;
893 }
894
895 const void *
896 CompactTrieDictionary::data() const {
897 - return fData;
898 + return fInfo->address;
899 +}
900 +
901 +//This function finds the address of a node for us, given its node ID
902 +static inline const CompactTrieNode *
903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {
904 + if(node < info->root-1) {
905 + return (const CompactTrieNode *)(&info->offsets[node]);
906 + } else {
907 + return (const CompactTrieNode *)(info->address + info->offsets[node]);
908 + }
909 }
910
911 -// This function finds the address of a node for us, given its node ID
912 +//this version of getCompactNode is currently only used in compactMutableTrieDictionary()
913 static inline const CompactTrieNode *
914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {
915 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[ node]);
916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {
917 + if(node < header->root-1) {
918 + return (const CompactTrieNode *)(&header->offsets[node]);
919 + } else {
920 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs ets[node]);
921 + }
922 +}
923 +
924 +
925 +/**
926 + * Calculates the number of links in a node
927 + * @param node The specified node
928 + */
929 +static inline const uint16_t
930 +getCount(const CompactTrieNode *node){
931 + return (node->flagscount & kCountMask);
932 + //use the code below if number of links ever exceed 4096
933 + //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCou nt) >> 2);
934 +}
935 +
936 +/**
937 + * calculates the equal link node ID of a vertical node
938 + * @param vnode The vertical node containing the equal link; if its
939 + * kEqualOverflows flag is set, the uint16_t stored just past
940 + * vnode->chars[] supplies the bits above the low 16 of the node ID
941 + */
942 +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
943 + if(vnode->flagscount & kEqualOverflows){
944 + // treat overflow bits as an extension of chars[]
945 + uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNo de*)vnode)];
946 + return vnode->equal + (((uint32_t)*overflow) << 16);
947 + }else{
948 + return vnode->equal;
949 + }
950 +}
951 +
952 +/**
953 + * calculates an equal link node ID of a horizontal node
954 + * @param hnode The horizontal node containing the equal link
955 + * @param index The index into hnode->entries[]
956 + * @param nodeCount The length of hnode->entries[]
957 + */
958 +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, ui nt16_t index, uint16_t nodeCount){
959 + if(hnode->flagscount & kEqualOverflows){
960 + //set overflow to point to the uint16_t containing the overflow bits
961 + uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
962 + overflow += index/4;
963 + uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
964 + return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
965 + } else {
966 + return hnode->entries[index].equal;
967 + }
968 +}
969 +
970 +/**
971 + * Returns the value stored in the specified node which is associated with its
972 + * parent node.
973 + * TODO: how to tell that value is stored in node or in offset? check whether
974 + * node ID < fInfo->root!
975 + */
976 +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
977 + uint16_t count = getCount((CompactTrieNode *)hnode);
978 + uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
979 +
980 + if(hnode->flagscount & kEqualOverflows)
981 + overflowSize = (count + 3) / 4 * sizeof(uint16_t);
982 + return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
983 +}
984 +
985 +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
986 + // calculate size of total node ID overflow storage in bytes
987 + uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint1 6_t) : 0;
988 + return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *) vnode)] + overflowSize));
989 +}
990 +
991 +static inline uint16_t getValue(const CompactTrieNode *node){
992 + if(node->flagscount & kVerticalNode)
993 + return getValue((const CompactTrieVerticalNode *)node);
994 + else
995 + return getValue((const CompactTrieHorizontalNode *)node);
996 +}
997 +
998 +//returns index of match in CompactTrieHorizontalNode.entries[] using binary search
999 +inline int16_t
1000 +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
1001 + UChar uc, uint16_t nodeCount){
1002 + int low = 0;
1003 + int high = nodeCount-1;
1004 + int middle;
1005 + while (high >= low) {
1006 + middle = (high+low)/2;
1007 + if (uc == entries[middle].ch) {
1008 + return middle;
1009 + }
1010 + else if (uc < entries[middle].ch) {
1011 + high = middle-1;
1012 + }
1013 + else {
1014 + low = middle+1;
1015 + }
1016 + }
1017 +
1018 + return -1;
1019 }
1020
1021 int32_t
1022 @@ -466,17 +667,38 @@
1023 int32_t maxLength,
1024 int32_t *lengths,
1025 int &count,
1026 - int limit ) const {
1027 + int limit,
1028 + uint16_t *values /*= NULL*/) const {
1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
1030 + values = NULL;
1031 +
1032 // TODO: current implementation works in UTF-16 space
1033 - const CompactTrieNode *node = getCompactNode(fData, fData->root);
1034 + const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
1035 int mycount = 0;
1036
1037 UChar uc = utext_current32(text);
1038 int i = 0;
1039
1040 + // handle root node with only kEqualOverflows flag: assume horizontal node without parent
1041 + if(node != NULL){
1042 + const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNod e *) node;
1043 + int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
1044 + if(index > -1){
1045 + node = getCompactNode(fInfo, calcEqualLink(root, index, root->flags count & kRootCountMask));
1046 + utext_next32(text);
1047 + uc = utext_current32(text);
1048 + ++i;
1049 + }else{
1050 + node = NULL;
1051 + }
1052 + }
1053 +
1054 while (node != NULL) {
1055 // Check if the node we just exited ends a word
1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) {
1057 + if(values != NULL){
1058 + values[mycount] = getValue(node);
1059 + }
1060 lengths[mycount++] = i;
1061 --limit;
1062 }
1063 @@ -487,7 +709,7 @@
1064 break;
1065 }
1066
1067 - int nodeCount = (node->flagscount & kCountMask);
1068 + int nodeCount = getCount(node);
1069 if (nodeCount == 0) {
1070 // Special terminal node; return now
1071 break;
1072 @@ -507,35 +729,27 @@
1073 // To get here we must have come through the whole list successfully;
1074 // go on to the next node. Note that a word cannot end in the middle
1075 // of a vertical node.
1076 - node = getCompactNode(fData, vnode->equal);
1077 + node = getCompactNode(fInfo, calcEqualLink(vnode));
1078 }
1079 else {
1080 // Horizontal node; do binary search
1081 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont alNode *)node;
1082 - int low = 0;
1083 - int high = nodeCount-1;
1084 - int middle;
1085 - node = NULL; // If we don't find a match, we'll fall out of the loop
1086 - while (high >= low) {
1087 - middle = (high+low)/2;
1088 - if (uc == hnode->entries[middle].ch) {
1089 - // We hit a match; get the next node and next character
1090 - node = getCompactNode(fData, hnode->entries[middle].equal);
1091 - utext_next32(text);
1092 - uc = utext_current32(text);
1093 - ++i;
1094 - break;
1095 - }
1096 - else if (uc < hnode->entries[middle].ch) {
1097 - high = middle-1;
1098 - }
1099 - else {
1100 - low = middle+1;
1101 - }
1102 + const CompactTrieHorizontalEntry *entries;
1103 + entries = hnode->entries;
1104 +
1105 + int index = searchHorizontalEntries(entries, uc, nodeCount);
1106 + if(index > -1){ //
1107 + // We hit a match; get the next node and next character
1108 + node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCo unt));
1109 + utext_next32(text);
1110 + uc = utext_current32(text);
1111 + ++i;
1112 + }else{
1113 + node = NULL; // If we don't find a match, we'll fall out of the loop
1114 }
1115 }
1116 }
1117 -exit:
1118 + exit:
1119 count = mycount;
1120 return i;
1121 }
1122 @@ -545,16 +759,16 @@
1123 private:
1124 UVector32 fNodeStack; // Stack of nodes to process
1125 UVector32 fIndexStack; // Stack of where in node we are
1126 - const CompactTrieHeader *fHeader; // Trie data
1127 + const CompactTrieInfo *fInfo; // Trie data
1128
1129 public:
1130 static UClassID U_EXPORT2 getStaticClassID(void);
1131 virtual UClassID getDynamicClassID(void) const;
1132 public:
1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
1135 : fNodeStack(status), fIndexStack(status) {
1136 - fHeader = header;
1137 - fNodeStack.push(header->root, status);
1138 + fInfo = info;
1139 + fNodeStack.push(info->root, status);
1140 fIndexStack.push(0, status);
1141 unistr.remove();
1142 }
1143 @@ -564,14 +778,14 @@
1144
1145 virtual StringEnumeration *clone() const {
1146 UErrorCode status = U_ZERO_ERROR;
1147 - return new CompactTrieEnumeration(fHeader, status);
1148 + return new CompactTrieEnumeration(fInfo, status);
1149 }
1150
1151 virtual const UnicodeString * snext(UErrorCode &status);
1152
1153 // Very expensive, but this should never be used.
1154 virtual int32_t count(UErrorCode &status) const {
1155 - CompactTrieEnumeration counter(fHeader, status);
1156 + CompactTrieEnumeration counter(fInfo, status);
1157 int32_t result = 0;
1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
1159 ++result;
1160 @@ -582,7 +796,7 @@
1161 virtual void reset(UErrorCode &status) {
1162 fNodeStack.removeAllElements();
1163 fIndexStack.removeAllElements();
1164 - fNodeStack.push(fHeader->root, status);
1165 + fNodeStack.push(fInfo->root, status);
1166 fIndexStack.push(0, status);
1167 unistr.remove();
1168 }
1169 @@ -595,26 +809,34 @@
1170 if (fNodeStack.empty() || U_FAILURE(status)) {
1171 return NULL;
1172 }
1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
1175 int where = fIndexStack.peeki();
1176 while (!fNodeStack.empty() && U_SUCCESS(status)) {
1177 - int nodeCount = (node->flagscount & kCountMask);
1178 + int nodeCount;
1179 +
1180 + bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
1181 + if(isRoot){
1182 + nodeCount = node->flagscount & kRootCountMask;
1183 + } else {
1184 + nodeCount = getCount(node);
1185 + }
1186 +
1187 UBool goingDown = FALSE;
1188 if (nodeCount == 0) {
1189 // Terminal node; go up immediately
1190 fNodeStack.popi();
1191 fIndexStack.popi();
1192 - node = getCompactNode(fHeader, fNodeStack.peeki());
1193 + node = getCompactNode(fInfo, fNodeStack.peeki());
1194 where = fIndexStack.peeki();
1195 }
1196 - else if (node->flagscount & kVerticalNode) {
1197 + else if ((node->flagscount & kVerticalNode) && !isRoot) {
1198 // Vertical node
1199 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNo de *)node;
1200 if (where == 0) {
1201 // Going down
1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ;
1203 + unistr.append((const UChar *)vnode->chars, nodeCount);
1204 fIndexStack.setElementAt(1, fIndexStack.size()-1);
1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus));
1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status));
1207 where = fIndexStack.push(0, status);
1208 goingDown = TRUE;
1209 }
1210 @@ -623,7 +845,7 @@
1211 unistr.truncate(unistr.length()-nodeCount);
1212 fNodeStack.popi();
1213 fIndexStack.popi();
1214 - node = getCompactNode(fHeader, fNodeStack.peeki());
1215 + node = getCompactNode(fInfo, fNodeStack.peeki());
1216 where = fIndexStack.peeki();
1217 }
1218 }
1219 @@ -638,7 +860,7 @@
1220 // Push on next node
1221 unistr.append((UChar)hnode->entries[where].ch);
1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status));
1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status));
1225 where = fIndexStack.push(0, status);
1226 goingDown = TRUE;
1227 }
1228 @@ -646,12 +868,14 @@
1229 // Going up
1230 fNodeStack.popi();
1231 fIndexStack.popi();
1232 - node = getCompactNode(fHeader, fNodeStack.peeki());
1233 + node = getCompactNode(fInfo, fNodeStack.peeki());
1234 where = fIndexStack.peeki();
1235 }
1236 }
1237 +
1238 // Check if the parent of the node we've just gone down to ends a
1239 // word. If so, return it.
1240 + // The root node should never end up here.
1241 if (goingDown && (node->flagscount & kParentEndsWord)) {
1242 return &unistr;
1243 }
1244 @@ -664,7 +888,7 @@
1245 if (U_FAILURE(status)) {
1246 return NULL;
1247 }
1248 - return new CompactTrieEnumeration(fData, status);
1249 + return new CompactTrieEnumeration(fInfo, status);
1250 }
1251
1252 //
1253 @@ -672,21 +896,36 @@
1254 // and back again
1255 //
1256
1257 -// Helper classes to construct the compact trie
1258 +enum CompactTrieNodeType {
1259 + kHorizontalType = 0,
1260 + kVerticalType = 1,
1261 + kValueType = 2
1262 +};
1263 +
1264 +/**
1265 + * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
1266 + * construct the compact trie by storing information for each node and later
1267 + * writing the node to memory in a sequential format.
1268 + */
1269 class BuildCompactTrieNode: public UMemory {
1270 - public:
1271 +public:
1272 UBool fParentEndsWord;
1273 - UBool fVertical;
1274 + CompactTrieNodeType fNodeType;
1275 UBool fHasDuplicate;
1276 + UBool fEqualOverflows;
1277 int32_t fNodeID;
1278 UnicodeString fChars;
1279 + uint16_t fValue;
1280
1281 - public:
1282 - BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, U ErrorCode &status) {
1283 +public:
1284 + BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
1285 + UStack &nodes, UErrorCode &status, uint16_t value = 0) {
1286 fParentEndsWord = parentEndsWord;
1287 fHasDuplicate = FALSE;
1288 - fVertical = vertical;
1289 + fNodeType = nodeType;
1290 + fEqualOverflows = FALSE;
1291 fNodeID = nodes.size();
1292 + fValue = parentEndsWord? value : 0;
1293 nodes.push(this, status);
1294 }
1295
1296 @@ -694,87 +933,225 @@
1297 }
1298
1299 virtual uint32_t size() {
1300 - return sizeof(uint16_t);
1301 + if(fValue > 0)
1302 + return sizeof(uint16_t) * 2;
1303 + else
1304 + return sizeof(uint16_t);
1305 }
1306
1307 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tra nslate*/) {
1308 // Write flag/count
1309 - *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
1310 - | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsW ord : 0 );
1311 +
1312 + // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
1313 + // used as a 5th MSB.
1314 + U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
1315 +
1316 + *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
1317 + ((fNodeID == 2)? (fChars.length() & kRootCountMask):
1318 + (
1319 + (fChars.length() & kCountMask) |
1320 + //((fChars.length() << 2) & kExceedsCount) |
1321 + (fNodeType == kVerticalType ? kVerticalNode : 0) |
1322 + (fParentEndsWord ? kParentEndsWord : 0 )
1323 + )
1324 + );
1325 offset += sizeof(uint16_t);
1326 }
1327 +
1328 + virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
1329 + if(fValue > 0){
1330 + *((uint16_t *)(bytes+offset)) = fValue;
1331 + offset += sizeof(uint16_t);
1332 + }
1333 + }
1334 +
1335 +};
1336 +
1337 +/**
1338 + * Stores value of parent terminating nodes that have no more subtries.
1339 + */
1340 +class BuildCompactTrieValueNode: public BuildCompactTrieNode {
1341 +public:
1342 + BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value )
1343 + : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
1344 + }
1345 +
1346 + virtual ~BuildCompactTrieValueNode(){
1347 + }
1348 +
1349 + virtual uint32_t size() {
1350 + return sizeof(uint16_t) * 2;
1351 + }
1352 +
1353 + virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {
1354 + // don't write value directly to memory but store it in offset to be written later
1355 + //offset = fValue & kOffsetContainsValue;
1356 + BuildCompactTrieNode::write(bytes, offset, translate);
1357 + BuildCompactTrieNode::writeValue(bytes, offset);
1358 + }
1359 };
1360
1361 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
1362 public:
1363 UStack fLinks;
1364 + UBool fMayOverflow; //intermediate value for fEqualOverflows
1365
1366 public:
1367 - BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC ode &status)
1368 - : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(st atus) {
1369 + BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC ode &status, uint16_t value = 0)
1370 + : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, valu e), fLinks(status) {
1371 + fMayOverflow = FALSE;
1372 }
1373
1374 virtual ~BuildCompactTrieHorizontalNode() {
1375 }
1376
1377 + // It is impossible to know beforehand exactly how much space the node will
1378 + // need in memory before being written, because the node IDs in the equal
1379 + // links may or may not overflow after node coalescing. Therefore, this method
1380 + // returns the maximum size possible for the node.
1381 virtual uint32_t size() {
1382 - return offsetof(CompactTrieHorizontalNode,entries) +
1383 - (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1384 + uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
1385 + (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1386 +
1387 + if(fValue > 0)
1388 + estimatedSize += sizeof(uint16_t);
1389 +
1390 + //estimate extra space needed to store overflow for node ID links
1391 + //may be more than what is actually needed
1392 + for(int i=0; i < fChars.length(); i++){
1393 + if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
1394 + fMayOverflow = TRUE;
1395 + break;
1396 + }
1397 + }
1398 + if(fMayOverflow) // added space for overflow should be same as ceil(fCh ars.length()/4) * sizeof(uint16_t)
1399 + estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
1400 +
1401 + return estimatedSize;
1402 }
1403
1404 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {
1405 - BuildCompactTrieNode::write(bytes, offset, translate);
1406 int32_t count = fChars.length();
1407 +
1408 + //if largest nodeID > 2^16, set flag
1409 + //large node IDs are more likely to be at the back of the array
1410 + for (int32_t i = count-1; i >= 0; --i) {
1411 + if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeI D) > 0xFFFF){
1412 + fEqualOverflows = TRUE;
1413 + break;
1414 + }
1415 + }
1416 +
1417 + BuildCompactTrieNode::write(bytes, offset, translate);
1418 +
1419 + // write entries[] to memory
1420 for (int32_t i = 0; i < count; ++i) {
1421 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)( bytes+offset);
1422 entry->ch = fChars[i];
1423 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks [i])->fNodeID);
1424 #ifdef DEBUG_TRIE_DICT
1425 - if (entry->equal == 0) {
1426 +
1427 + if ((entry->equal == 0) && !fEqualOverflows) {
1428 fprintf(stderr, "ERROR: horizontal link %d, logical node %d map s to physical node zero\n",
1429 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1430 }
1431 #endif
1432 offset += sizeof(CompactTrieHorizontalEntry);
1433 }
1434 +
1435 + // append extra bits of equal nodes to end if fEqualOverflows
1436 + if (fEqualOverflows) {
1437 + uint16_t leftmostBits = 0;
1438 + for (int16_t i = 0; i < count; i++) {
1439 + leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
1440 +
1441 + // write filled uint16_t to memory
1442 + if(i % 4 == 3){
1443 + *((uint16_t *)(bytes+offset)) = leftmostBits;
1444 + leftmostBits = 0;
1445 + offset += sizeof(uint16_t);
1446 + }
1447 + }
1448 +
1449 + // pad last uint16_t with zeroes if necessary
1450 + int remainder = count % 4;
1451 + if (remainder > 0) {
1452 + *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * rema inder));
1453 + offset += sizeof(uint16_t);
1454 + }
1455 + }
1456 +
1457 + BuildCompactTrieNode::writeValue(bytes, offset);
1458 + }
1459 +
1460 + // returns leftmost bits of physical node link
1461 + uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
1462 + uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompact TrieNode *)fLinks[i])->fNodeID) >> 16);
1463 +#ifdef DEBUG_TRIE_DICT
1464 + if (leftmostBits > 0xF) {
1465 + fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
1466 + i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1467 + }
1468 +#endif
1469 + return leftmostBits;
1470 }
1471
1472 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
1473 fChars.append(ch);
1474 fLinks.push(link, status);
1475 }
1476 +
1477 };
1478
1479 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
1480 - public:
1481 +public:
1482 BuildCompactTrieNode *fEqual;
1483
1484 - public:
1485 - BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod e &status)
1486 - : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
1487 +public:
1488 + BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod e &status, uint16_t value = 0)
1489 + : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
1490 fEqual = NULL;
1491 }
1492
1493 virtual ~BuildCompactTrieVerticalNode() {
1494 }
1495
1496 + // Returns the maximum possible size of this node. See comment in
1497 + // BuildCompactTrieHorizontalNode::size() for more information.
1498 virtual uint32_t size() {
1499 - return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeo f(uint16_t));
1500 + uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCh ars.length()*sizeof(uint16_t));
1501 + if(fValue > 0){
1502 + estimatedSize += sizeof(uint16_t);
1503 + }
1504 +
1505 + if(fEqual->fNodeID > 0xFFFF){
1506 + estimatedSize += sizeof(uint16_t);
1507 + }
1508 + return estimatedSize;
1509 }
1510
1511 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {
1512 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offse t);
1513 + fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
1514 BuildCompactTrieNode::write(bytes, offset, translate);
1515 node->equal = translate.elementAti(fEqual->fNodeID);
1516 offset += sizeof(node->equal);
1517 #ifdef DEBUG_TRIE_DICT
1518 - if (node->equal == 0) {
1519 + if ((node->equal == 0) && !fEqualOverflows) {
1520 fprintf(stderr, "ERROR: vertical link, logical node %d maps to phys ical node zero\n",
1521 fEqual->fNodeID);
1522 }
1523 #endif
1524 fChars.extract(0, fChars.length(), (UChar *)node->chars);
1525 - offset += sizeof(uint16_t)*fChars.length();
1526 + offset += sizeof(UChar)*fChars.length();
1527 +
1528 + // append the upper 16 bits of the equal node ID to the end if fEqualOverflows
1529 + if (fEqualOverflows) {
1530 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16);
1531 + offset += sizeof(uint16_t);
1532 + }
1533 +
1534 + BuildCompactTrieNode::writeValue(bytes, offset);
1535 }
1536
1537 void addChar(UChar ch) {
1538 @@ -784,60 +1161,85 @@
1539 void setLink(BuildCompactTrieNode *node) {
1540 fEqual = node;
1541 }
1542 +
1543 };
1544
1545 // Forward declaration
1546 static void walkHorizontal(const TernaryNode *node,
1547 BuildCompactTrieHorizontalNode *building,
1548 UStack &nodes,
1549 - UErrorCode &status);
1550 + UErrorCode &status,
1551 + Hashtable *values);
1552
1553 -// Convert one node. Uses recursion.
1554 +// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
1555
1556 static BuildCompactTrieNode *
1557 -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UE rrorCode &status) {
1558 +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
1559 + UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
1560 if (U_FAILURE(status)) {
1561 return NULL;
1562 }
1563 BuildCompactTrieNode *result = NULL;
1564 UBool horizontal = (node->low != NULL || node->high != NULL);
1565 if (horizontal) {
1566 - BuildCompactTrieHorizontalNode *hResult =
1567 - new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, statu s);
1568 + BuildCompactTrieHorizontalNode *hResult;
1569 + if(values != NULL){
1570 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
1571 + } else {
1572 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1573 + }
1574 +
1575 if (hResult == NULL) {
1576 status = U_MEMORY_ALLOCATION_ERROR;
1577 return NULL;
1578 }
1579 if (U_SUCCESS(status)) {
1580 - walkHorizontal(node, hResult, nodes, status);
1581 + walkHorizontal(node, hResult, nodes, status, values);
1582 result = hResult;
1583 }
1584 }
1585 else {
1586 - BuildCompactTrieVerticalNode *vResult =
1587 - new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status) ;
1588 + BuildCompactTrieVerticalNode *vResult;
1589 + if(values != NULL){
1590 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s tatus, parentValue);
1591 + } else {
1592 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s tatus);
1593 + }
1594 +
1595 if (vResult == NULL) {
1596 status = U_MEMORY_ALLOCATION_ERROR;
1597 + return NULL;
1598 }
1599 else if (U_SUCCESS(status)) {
1600 - UBool endsWord = FALSE;
1601 + uint16_t value = 0;
1602 + UBool endsWord = FALSE;
1603 // Take up nodes until we end a word, or hit a node with < or > links
1604 do {
1605 vResult->addChar(node->ch);
1606 - endsWord = (node->flags & kEndsWord) != 0;
1607 + value = node->flags;
1608 + endsWord = value > 0;
1609 node = node->equal;
1610 }
1611 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
1612 +
1613 if (node == NULL) {
1614 if (!endsWord) {
1615 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie
1616 }
1617 - else {
1618 + else if(values != NULL){
1619 + UnicodeString key(value); //store value as a single-char Un icodeString
1620 + BuildCompactTrieValueNode *link = (BuildCompactTrieValueNod e *) values->get(key);
1621 + if(link == NULL){
1622 + link = new BuildCompactTrieValueNode(nodes, status, val ue); //take out nodes?
1623 + values->put(key, link, status);
1624 + }
1625 + vResult->setLink(link);
1626 + } else {
1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]);
1628 }
1629 }
1630 else {
1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ;
1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
1633 }
1634 result = vResult;
1635 }
1636 @@ -849,19 +1251,28 @@
1637 // Uses recursion.
1638
1639 static void walkHorizontal(const TernaryNode *node,
1640 - BuildCompactTrieHorizontalNode *building,
1641 - UStack &nodes,
1642 - UErrorCode &status) {
1643 + BuildCompactTrieHorizontalNode *building,
1644 + UStack &nodes,
1645 + UErrorCode &status, Hashtable *values = NULL) {
1646 while (U_SUCCESS(status) && node != NULL) {
1647 if (node->low != NULL) {
1648 - walkHorizontal(node->low, building, nodes, status);
1649 + walkHorizontal(node->low, building, nodes, status, values);
1650 }
1651 BuildCompactTrieNode *link = NULL;
1652 if (node->equal != NULL) {
1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
1655 }
1656 - else if (node->flags & kEndsWord) {
1657 - link = (BuildCompactTrieNode *)nodes[1];
1658 + else if (node->flags > 0) {
1659 + if(values != NULL) {
1660 + UnicodeString key(node->flags); //store value as a single-char UnicodeString
1661 + link = (BuildCompactTrieValueNode *) values->get(key);
1662 + if(link == NULL) {
1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes?
1664 + values->put(key, link, status);
1665 + }
1666 + } else {
1667 + link = (BuildCompactTrieNode *)nodes[1];
1668 + }
1669 }
1670 if (U_SUCCESS(status) && link != NULL) {
1671 building->addNode(node->ch, link, status);
1672 @@ -881,13 +1292,15 @@
1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
1674 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
1675 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1676 +
1677 // Check for comparing a node to itself, to avoid spurious duplicates
1678 if (left == right) {
1679 return 0;
1680 }
1681 +
1682 // Most significant is type of node. Can never coalesce.
1683 - if (left->fVertical != right->fVertical) {
1684 - return left->fVertical - right->fVertical;
1685 + if (left->fNodeType != right->fNodeType) {
1686 + return left->fNodeType - right->fNodeType;
1687 }
1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
1689 if (left->fParentEndsWord != right->fParentEndsWord) {
1690 @@ -898,12 +1311,19 @@
1691 if (result != 0) {
1692 return result;
1693 }
1694 +
1695 + // If the node value differs, we should not coalesce.
1696 + // If values aren't stored, all fValues should be 0.
1697 + if (left->fValue != right->fValue) {
1698 + return left->fValue - right->fValue;
1699 + }
1700 +
1701 // We know they're both the same node type, so branch for the two cases.
1702 - if (left->fVertical) {
1703 + if (left->fNodeType == kVerticalType) {
1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID;
1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1707 }
1708 - else {
1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){
1710 // We need to compare the links vectors. They should be the
1711 // same size because the strings were equal.
1712 // We compare the node IDs instead of the pointers, to handle
1713 @@ -914,9 +1334,10 @@
1714 int32_t count = hleft->fLinks.size();
1715 for (int32_t i = 0; i < count && result == 0; ++i) {
1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1719 }
1720 }
1721 +
1722 // If they are equal to each other, mark them (speeds coalescing)
1723 if (result == 0) {
1724 left->fHasDuplicate = TRUE;
1725 @@ -1031,20 +1452,25 @@
1726 // Add node 0, used as the NULL pointer/sentinel.
1727 nodes.addElement((int32_t)0, status);
1728
1729 + Hashtable *values = NULL; // Index of (unique) va lues
1730 + if (dict.fValued) {
1731 + values = new Hashtable(status);
1732 + }
1733 +
1734 // Start by creating the special empty node we use to indicate that the parent
1735 // terminates a word. This must be node 1, because the builder assumes
1736 - // that.
1737 + // that. This node will never be used for tries storing numerical values.
1738 if (U_FAILURE(status)) {
1739 return NULL;
1740 }
1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status);
1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status);
1743 if (terminal == NULL) {
1744 status = U_MEMORY_ALLOCATION_ERROR;
1745 }
1746
1747 // This call does all the work of building the new trie structure. The root
1748 - // will be node 2.
1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s);
1750 + // will have node ID 2 before writing to memory.
1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values);
1752 #ifdef DEBUG_TRIE_DICT
1753 (void) ::times(&timing);
1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1755 @@ -1077,21 +1503,37 @@
1756 return NULL;
1757 }
1758
1759 + //map terminal value nodes
1760 + int valueCount = 0;
1761 + UVector valueNodes(status);
1762 + if(values != NULL) {
1763 + valueCount = values->count(); //number of unique terminal value nodes
1764 + }
1765 +
1766 + // map non-terminal nodes
1767 + int valuePos = 1;//, nodePos = valueCount + valuePos;
1768 + nodeCount = valueCount + valuePos;
1769 for (i = 1; i < count; ++i) {
1770 node = (BuildCompactTrieNode *)nodes[i];
1771 if (node->fNodeID == i) {
1772 // Only one node out of each duplicate set is used
1773 - if (i >= translate.size()) {
1774 + if (node->fNodeID >= translate.size()) {
1775 // Logically extend the mapping table
1776 - translate.setSize(i+1);
1777 + translate.setSize(i + 1);
1778 + }
1779 + //translate.setElementAt(object, index)!
1780 + if(node->fNodeType == kValueType) {
1781 + valueNodes.addElement(node, status);
1782 + translate.setElementAt(valuePos++, i);
1783 + } else {
1784 + translate.setElementAt(nodeCount++, i);
1785 }
1786 - translate.setElementAt(nodeCount++, i);
1787 totalSize += node->size();
1788 }
1789 }
1790 -
1791 - // Check for overflowing 16 bits worth of nodes.
1792 - if (nodeCount > 0x10000) {
1793 +
1794 + // Check for overflowing 20 bits worth of nodes.
1795 + if (nodeCount > 0x100000) {
1796 status = U_ILLEGAL_ARGUMENT_ERROR;
1797 return NULL;
1798 }
1799 @@ -1111,9 +1553,14 @@
1800 status = U_MEMORY_ALLOCATION_ERROR;
1801 return NULL;
1802 }
1803 -
1804 +
1805 CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1806 - header->size = totalSize;
1807 + //header->size = totalSize;
1808 + if(dict.fValued){
1809 + header->magic = COMPACT_TRIE_MAGIC_3;
1810 + } else {
1811 + header->magic = COMPACT_TRIE_MAGIC_2;
1812 + }
1813 header->nodeCount = nodeCount;
1814 header->offsets[0] = 0; // Sentinel
1815 header->root = translate.elementAti(root->fNodeID);
1816 @@ -1123,23 +1570,40 @@
1817 }
1818 #endif
1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t));
1820 - nodeCount = 1;
1821 + nodeCount = valueCount + 1;
1822 +
1823 + // Write terminal value nodes to memory
1824 + for (i=0; i < valueNodes.size(); i++) {
1825 + //header->offsets[i + 1] = offset;
1826 + uint32_t tmpOffset = 0;
1827 + node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
1828 + //header->offsets[i + 1] = (uint32_t)node->fValue;
1829 + node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
1830 + }
1831 +
1832 // Now write the data
1833 for (i = 1; i < count; ++i) {
1834 node = (BuildCompactTrieNode *)nodes[i];
1835 - if (node->fNodeID == i) {
1836 + if (node->fNodeID == i && node->fNodeType != kValueType) {
1837 header->offsets[nodeCount++] = offset;
1838 node->write(bytes, offset, translate);
1839 }
1840 }
1841 +
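1842 + //free all extra space. NOTE(review): the uprv_realloc result is discarded; if the block is relocated, bytes and header would dangle - confirm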
1843 + uprv_realloc(bytes, offset);
1844 + header->size = offset;
1845 +
1846 #ifdef DEBUG_TRIE_DICT
1847 + fprintf(stdout, "Space freed: %d\n", totalSize-offset);
1848 +
1849 (void) ::times(&timing);
1850 fprintf(stderr, "Trie built, time user %f system %f\n",
1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1853 previous = timing;
1854 fprintf(stderr, "Final offset is %d\n", offset);
1855 -
1856 +
1857 // Collect statistics on node types and sizes
1858 int hCount = 0;
1859 int vCount = 0;
1860 @@ -1148,68 +1612,85 @@
1861 size_t hItemCount = 0;
1862 size_t vItemCount = 0;
1863 uint32_t previousOff = offset;
1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1865 + uint32_t numOverflow = 0;
1866 + uint32_t valueSpace = 0;
1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1869 - if (node->flagscount & kVerticalNode) {
1870 + int itemCount;
1871 + if(nodeIdx == header->root)
1872 + itemCount = node->flagscount & kRootCountMask;
1873 + else
1874 + itemCount = getCount(node);
1875 + if(node->flagscount & kEqualOverflows){
1876 + numOverflow++;
1877 + }
1878 + if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
1879 vCount += 1;
1880 - vItemCount += (node->flagscount & kCountMask);
1881 + vItemCount += itemCount;
1882 vSize += previousOff-header->offsets[nodeIdx];
1883 }
1884 else {
1885 hCount += 1;
1886 - hItemCount += (node->flagscount & kCountMask);
1887 - hSize += previousOff-header->offsets[nodeIdx];
1888 + hItemCount += itemCount;
1889 + if(nodeIdx >= header->root) {
1890 + hSize += previousOff-header->offsets[nodeIdx];
1891 + }
1892 }
1893 +
1894 + if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentE ndsWord)
1895 + valueSpace += sizeof(uint16_t);
1896 previousOff = header->offsets[nodeIdx];
1897 }
1898 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items \n", hCount,
1899 (double)hSize/hCount, (double)hItemCount/hCount);
1900 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n ", vCount,
1901 (double)vSize/vCount, (double)vItemCount/vCount);
1902 + fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverf low);
1903 + fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
1904 #endif
1905
1906 if (U_FAILURE(status)) {
1907 uprv_free(bytes);
1908 header = NULL;
1909 }
1910 - else {
1911 - header->magic = COMPACT_TRIE_MAGIC_1;
1912 - }
1913 return header;
1914 }
1915
1916 // Forward declaration
1917 static TernaryNode *
1918 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE rrorCode &status );
1919 -
1920 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError Code &status );
1921
1922 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1923 static TernaryNode *
1924 -unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizo ntalEntry *array,
1925 - int low, int high, UErrorCode &status ) {
1926 +unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontal Node *hnode,
1927 + int low, int high, int nodeCount, UErrorCode &status) {
1928 if (U_FAILURE(status) || low > high) {
1929 return NULL;
1930 }
1931 int middle = (low+high)/2;
1932 - TernaryNode *result = new TernaryNode(array[middle].ch);
1933 + TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
1934 if (result == NULL) {
1935 status = U_MEMORY_ALLOCATION_ERROR;
1936 return NULL;
1937 }
1938 - const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
1939 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mi ddle, nodeCount));
1940 if (equal->flagscount & kParentEndsWord) {
1941 - result->flags |= kEndsWord;
1942 + if(info->magic == COMPACT_TRIE_MAGIC_3){
1943 + result->flags = getValue(equal);
1944 + }else{
1945 + result->flags |= kEndsWord;
1946 + }
1947 }
1948 - result->low = unpackHorizontalArray(header, array, low, middle-1, status);
1949 - result->high = unpackHorizontalArray(header, array, middle+1, high, status) ;
1950 - result->equal = unpackOneNode(header, equal, status);
1951 + result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
1952 + result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount , status);
1953 + result->equal = unpackOneNode(info, equal, status);
1954 return result;
1955 }
1956
1957 // Convert one compact trie node into a ternary subtrie
1958 static TernaryNode *
1959 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE rrorCode &status ) {
1960 - int nodeCount = (node->flagscount & kCountMask);
1961 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError Code &status ) {
1962 + int nodeCount = getCount(node);
1963 if (nodeCount == 0 || U_FAILURE(status)) {
1964 // Failure, or terminal node
1965 return NULL;
1966 @@ -1234,29 +1715,41 @@
1967 previous = latest;
1968 }
1969 if (latest != NULL) {
1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ;
1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node));
1972 if (equal->flagscount & kParentEndsWord) {
1973 - latest->flags |= kEndsWord;
1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){
1975 + latest->flags = getValue(equal);
1976 + } else {
1977 + latest->flags |= kEndsWord;
1978 + }
1979 }
1980 - latest->equal = unpackOneNode(header, equal, status);
1981 + latest->equal = unpackOneNode(info, equal, status);
1982 }
1983 return head;
1984 }
1985 else {
1986 // Horizontal node
1987 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNo de *)node;
1988 - return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1 , status);
1989 + return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, st atus);
1990 }
1991 }
1992
1993 +// returns a MutableTrieDictionary generated from the CompactTrieDictionary
1994 MutableTrieDictionary *
1995 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1996 - MutableTrieDictionary *result = new MutableTrieDictionary( status );
1997 + MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->m agic == COMPACT_TRIE_MAGIC_3 );
1998 if (result == NULL) {
1999 status = U_MEMORY_ALLOCATION_ERROR;
2000 return NULL;
2001 }
2002 - TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root) , status);
2003 + // treat root node as special case: don't call unpackOneNode() or unpackHor izontalArray() directly
2004 + // because only kEqualOverflows flag should be checked in root's flagscount
2005 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode * )
2006 + getCompactNode(fInfo, fInfo->root);
2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask;
2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
2009 + nodeCount, status);
2010 +
2011 if (U_FAILURE(status)) {
2012 delete root; // Clean up
2013 delete result;
2014 @@ -1270,8 +1763,8 @@
2015
2016 U_CAPI int32_t U_EXPORT2
2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
2018 - UErrorCode *status) {
2019 -
2020 + UErrorCode *status) {
2021 +
2022 if (status == NULL || U_FAILURE(*status)) {
2023 return 0;
2024 }
2025 @@ -1286,14 +1779,14 @@
2026 //
2027 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
2029 - pInfo->dataFormat[1]==0x72 &&
2030 - pInfo->dataFormat[2]==0x44 &&
2031 - pInfo->dataFormat[3]==0x63 &&
2032 - pInfo->formatVersion[0]==1 )) {
2033 + pInfo->dataFormat[1]==0x72 &&
2034 + pInfo->dataFormat[2]==0x44 &&
2035 + pInfo->dataFormat[3]==0x63 &&
2036 + pInfo->formatVersion[0]==1 )) {
2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
2038 - pInfo->dataFormat[0], pInfo->dataFormat[1],
2039 - pInfo->dataFormat[2], pInfo->dataFormat[3],
2040 - pInfo->formatVersion[0]);
2041 + pInfo->dataFormat[0], pInfo->dataFormat[1],
2042 + pInfo->dataFormat[2], pInfo->dataFormat[3],
2043 + pInfo->formatVersion[0]);
2044 *status=U_UNSUPPORTED_ERROR;
2045 return 0;
2046 }
2047 @@ -1311,8 +1804,10 @@
2048 //
2049 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
2050 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
2052 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2053 + uint32_t magic = ds->readUInt32(header->magic);
2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
2055 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
2056 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2057 {
2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" );
2059 *status=U_UNSUPPORTED_ERROR;
2060 @@ -1333,10 +1828,10 @@
2061 //
2062 if (length < sizeWithUData) {
2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
2064 - totalSize);
2065 + totalSize);
2066 *status=U_INDEX_OUTOFBOUNDS_ERROR;
2067 return 0;
2068 - }
2069 + }
2070
2071 //
2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, be cause
2073 @@ -1355,20 +1850,38 @@
2074 }
2075
2076 // We need to loop through all the nodes in the offset table, and swap each one.
2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);
2078 + uint32_t nodeCount, rootId;
2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) {
2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
2082 + } else {
2083 + nodeCount = ds->readUInt32(header->nodeCount);
2084 + rootId = ds->readUInt32(header->root);
2085 + }
2086 +
2087 // Skip node 0, which should always be 0.
2088 - for (int i = 1; i < nodeCount; ++i) {
2089 + for (uint32_t i = 1; i < nodeCount; ++i) {
2090 uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
2091 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nod eOff);
2092 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
2093 uint16_t flagscount = ds->readUInt16(inNode->flagscount);
2094 - uint16_t itemCount = flagscount & kCountMask;
2095 + uint16_t itemCount = getCount(inNode);
2096 + //uint16_t itemCount = flagscount & kCountMask;
2097 ds->writeUInt16(&outNode->flagscount, flagscount);
2098 if (itemCount > 0) {
2099 - if (flagscount & kVerticalNode) {
2100 + uint16_t overflow = 0; //number of extra uint16_ts needed to be swa pped
2101 + if (flagscount & kVerticalNode && i != rootId) {
2102 + if(flagscount & kEqualOverflows){
2103 + // include overflow bits
2104 + overflow += 1;
2105 + }
2106 + if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEnds ParentWord) {
2107 + //include values
2108 + overflow += 1;
2109 + }
2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars),
2111 - itemCount*sizeof(uint16_t),
2112 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status);
2113 + (itemCount + overflow)*sizeof(uint16_t),
2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status);
2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal);
2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal));
2117 }
2118 @@ -1381,26 +1894,62 @@
2119 word = ds->readUInt16(inHNode->entries[j].equal);
2120 ds->writeUInt16(&outHNode->entries[j].equal, word);
2121 }
2122 +
2123 + // swap overflow/value information
2124 + if(flagscount & kEqualOverflows){
2125 + overflow += (itemCount + 3) / 4;
2126 + }
2127 +
2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) {
2129 + //include values
2130 + overflow += 1;
2131 + }
2132 +
2133 + uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount ];
2134 + uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCou nt];
2135 + for(int j = 0; j<overflow; j++){
2136 + uint16_t extraInfo = ds->readUInt16(*inOverflow);
2137 + ds->writeUInt16(outOverflow, extraInfo);
2138 +
2139 + inOverflow++;
2140 + outOverflow++;
2141 + }
2142 }
2143 }
2144 }
2145 #endif
2146
2147 - // All the data in all the nodes consist of 16 bit items. Swap them all at once.
2148 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);
2149 - uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCoun t*sizeof(uint32_t));
2150 - ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);
2151 -
2152 // Swap the header
2153 ds->writeUInt32(&outputHeader->size, totalSize);
2154 - uint32_t magic = ds->readUInt32(header->magic);
2155 ds->writeUInt32(&outputHeader->magic, magic);
2156 - ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
2157 - uint16_t root = ds->readUInt16(header->root);
2158 - ds->writeUInt16(&outputHeader->root, root);
2159 - ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
2160 - sizeof(uint32_t)*(int32_t)nodeCount,
2161 - outBytes+offsetof(CompactTrieHeader,offsets), status);
2162 +
2163 + uint32_t nodeCount; // node count decoded from the input header
2164 + uint32_t offsetPos; // byte offset of the offsets[] table inside the header
2165 + if (magic == COMPACT_TRIE_MAGIC_1) { // use the ds-decoded magic: header->magic is still in the input's byte order
2166 + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
2167 + CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
2168 + // V1 headers are read and written with 16-bit nodeCount and root fields.
2169 + nodeCount = ds->readUInt16(headerV1->nodeCount);
2170 + ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
2171 + uint16_t root = ds->readUInt16(headerV1->root);
2172 + ds->writeUInt16(&outputHeaderV1->root, root);
2173 + offsetPos = offsetof(CompactTrieHeaderV1,offsets);
2174 + } else {
2175 + nodeCount = ds->readUInt32(header->nodeCount);
2176 + ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
2177 + uint32_t root = ds->readUInt32(header->root);
2178 + ds->writeUInt32(&outputHeader->root, root);
2179 + offsetPos = offsetof(CompactTrieHeader,offsets);
2180 + }
2181 +
2182 + // All the data in all the nodes consist of 16 bit items. Swap them all at once.
2183 + uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);
2185 +
2186 + //swap offsets
2187 + ds->swapArray32(ds, inBytes+offsetPos,
2188 + sizeof(uint32_t)*(uint32_t)nodeCount,
2189 + outBytes+offsetPos, status);
2190
2191 return sizeWithUData;
2192 }
2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700
2194 +++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800
2195 @@ -47,7 +47,6 @@
2196 U_NAMESPACE_BEGIN
2197
2198 class StringEnumeration;
2199 -struct CompactTrieHeader;
2200
2201 /*******************************************************************
2202 * TrieWordDictionary
2203 @@ -72,23 +71,29 @@
2204 */
2205 virtual ~TrieWordDictionary();
2206
2207 + /**
2208 + * <p>Returns true if the dictionary contains values associated with each word.</p>
2209 + */
2210 + virtual UBool getValued() const = 0;
2211 +
2212 /**
2213 * <p>Find dictionary words that match the text.</p>
2214 *
2215 * @param text A UText representing the text. The
2216 * iterator is left after the longest prefix match in the dictionary.
2217 - * @param start The current position in text.
2218 * @param maxLength The maximum number of code units to match.
2219 * @param lengths An array that is filled with the lengths of words that match ed.
2220 * @param count Filled with the number of elements output in lengths.
2221 * @param limit The size of the lengths array; this limits the number of words output.
2222 + * @param values An array that is filled with the values associated with the matched words.
2223 * @return The number of characters in text that were matched.
2224 */
2225 virtual int32_t matches( UText *text,
2226 int32_t maxLength,
2227 int32_t *lengths,
2228 int &count,
2229 - int limit ) const = 0;
2230 + int limit,
2231 + uint16_t *values = NULL) const = 0;
2232
2233 /**
2234 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>
2235 @@ -128,6 +133,12 @@
2236
2237 UText *fIter;
2238
2239 + /**
2240 + * A UBool indicating whether the dictionary stores a value associated with each word
2241 + * @internal
2242 + */
2243 + UBool fValued;
2244 +
2245 friend class CompactTrieDictionary; // For fast conversion
2246
2247 public:
2248 @@ -138,14 +149,29 @@
2249 * @param median A UChar around which to balance the trie. Ideally, it should
2250 * begin at least one word that is near the median of the set in the dictionar y
2251 * @param status A status code recording the success of the call.
2252 + * @param containsValue True if the dictionary stores values associated with each word.
2253 */
2254 - MutableTrieDictionary( UChar median, UErrorCode &status );
2255 + MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );
2256
2257 /**
2258 * <p>Virtual destructor.</p>
2259 */
2260 virtual ~MutableTrieDictionary();
2261
2262 + /**
2263 + * Indicate whether the MutableTrieDictionary stores values associated with each word
2264 + */
2265 + void setValued(UBool valued){
2266 + fValued = valued;
2267 + }
2268 +
2269 + /**
2270 + * <p>Returns true if the dictionary contains values associated with each word.</p>
2271 + */
2272 + virtual UBool getValued() const {
2273 + return fValued;
2274 + }
2275 +
2276 /**
2277 * <p>Find dictionary words that match the text.</p>
2278 *
2279 @@ -155,13 +181,15 @@
2280 * @param lengths An array that is filled with the lengths of words that match ed.
2281 * @param count Filled with the number of elements output in lengths.
2282 * @param limit The size of the lengths array; this limits the number of words output.
2283 + * @param values An array that is filled with the values associated with the matched words.
2284 * @return The number of characters in text that were matched.
2285 */
2286 virtual int32_t matches( UText *text,
2287 int32_t maxLength,
2288 int32_t *lengths,
2289 int &count,
2290 - int limit ) const;
2291 + int limit,
2292 + uint16_t *values = NULL) const;
2293
2294 /**
2295 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>
2296 @@ -173,15 +201,17 @@
2297 virtual StringEnumeration *openWords( UErrorCode &status ) const;
2298
2299 /**
2300 - * <p>Add one word to the dictionary.</p>
2301 + * <p>Add one word to the dictionary with an optional associated value.</p>
2302 *
2303 * @param word A UChar buffer containing the word.
2304 * @param length The length of the word.
2305 - * @param status The resultant status
2306 + * @param status The resultant status.
2307 + * @param value The nonzero value associated with this word.
2308 */
2309 virtual void addWord( const UChar *word,
2310 int32_t length,
2311 - UErrorCode &status);
2312 + UErrorCode &status,
2313 + uint16_t value = 0);
2314
2315 #if 0
2316 /**
2317 @@ -203,8 +233,9 @@
2318 * @param lengths An array that is filled with the lengths of words that match ed.
2319 * @param count Filled with the number of elements output in lengths.
2320 * @param limit The size of the lengths array; this limits the number of words output.
2321 - * @param parent The parent of the current node
2322 - * @param pMatched The returned parent node matched the input
2323 + * @param parent The parent of the current node.
2324 + * @param pMatched The returned parent node matched the input.
2325 + * @param values An array that is filled with the values associated with the matched words.
2326 * @return The number of characters in text that were matched.
2327 */
2328 virtual int32_t search( UText *text,
2329 @@ -213,40 +244,46 @@
2330 int &count,
2331 int limit,
2332 TernaryNode *&parent,
2333 - UBool &pMatched ) const;
2334 + UBool &pMatched,
2335 + uint16_t *values = NULL) const;
2336
2337 private:
2338 /**
2339 * <p>Private constructor. The root node it not allocated.</p>
2340 *
2341 * @param status A status code recording the success of the call.
2342 + * @param containsValues True if the dictionary will store a value associated
2343 + * with each word added.
2344 */
2345 - MutableTrieDictionary( UErrorCode &status );
2346 + MutableTrieDictionary( UErrorCode &status, UBool containsValues = false );
2347 };
2348
2349 /*******************************************************************
2350 * CompactTrieDictionary
2351 */
2352
2353 +//forward declarations
2354 +struct CompactTrieHeader;
2355 +struct CompactTrieInfo;
2356 +
2357 /**
2358 * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
2359 * to save space.</p>
2360 */
2361 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
2362 private:
2363 - /**
2364 - * The root node of the trie
2365 - */
2366 + /**
2367 + * The header of the CompactTrieDictionary which contains all info
2368 + */
2369
2370 - const CompactTrieHeader *fData;
2371 -
2372 - /**
2373 - * A UBool indicating whether or not we own the fData.
2374 - */
2375 + CompactTrieInfo *fInfo;
2376
2377 + /**
2378 + * A UBool indicating whether or not we own the fData.
2379 + */
2380 UBool fOwnData;
2381
2382 - UDataMemory *fUData;
2383 + UDataMemory *fUData;
2384 public:
2385 /**
2386 * <p>Construct a dictionary from a UDataMemory.</p>
2387 @@ -277,6 +314,11 @@
2388 */
2389 virtual ~CompactTrieDictionary();
2390
2391 + /**
2392 + * <p>Returns true if the dictionary contains values associated with each word.</p>
2393 + */
2394 + virtual UBool getValued() const;
2395 +
2396 /**
2397 * <p>Find dictionary words that match the text.</p>
2398 *
2399 @@ -286,13 +328,15 @@
2400 * @param lengths An array that is filled with the lengths of words that match ed.
2401 * @param count Filled with the number of elements output in lengths.
2402 * @param limit The size of the lengths array; this limits the number of words output.
2403 + * @param values An array that is filled with the values associated with the matched words.
2404 * @return The number of characters in text that were matched.
2405 */
2406 virtual int32_t matches( UText *text,
2407 - int32_t rangeEnd,
2408 + int32_t maxLength,
2409 int32_t *lengths,
2410 int &count,
2411 - int limit ) const;
2412 + int limit,
2413 + uint16_t *values = NULL) const;
2414
2415 /**
2416 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>
2417 @@ -311,7 +355,7 @@
2418 virtual uint32_t dataSize() const;
2419
2420 /**
2421 - * <p>Return a void * pointer to the compact data, platform-endian.</p>
2422 + * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p>
2423 *
2424 * @return The data for the compact dictionary, suitable for passing to the
2425 * constructor.
2426 @@ -342,5 +386,5 @@
2427
2428 U_NAMESPACE_END
2429
2430 - /* TRIEDICT_H */
2431 +/* TRIEDICT_H */
2432 #endif
2433 --- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700
2434 +++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800
2435 @@ -509,8 +520,9 @@
2436 #################################################### CTD
2437 # CTD FILES
2438
2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)
2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
2441 +# .ctd file now generated regardless of whether dictionary file exists
2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt
2444
2445 #################################################### CFU
2446 # CFU FILES
2447 --- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700
2448 +++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800
2449 @@ -17,5 +17,8 @@
2450 }
2451 dictionaries{
2452 Thai:process(dependency){"thaidict.ctd"}
2453 + Hani:process(dependency){"cjdict.ctd"}
2454 + Hira:process(dependency){"cjdict.ctd"}
2455 + Kata:process(dependency){"cjdict.ctd"}
2456 }
2457 }
2458 --- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800
2459 +++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800
2460 @@ -25,6 +25,9 @@
2461 </icu:boundaries>
2462 <icu:dictionaries>
2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>
2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>
2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>
2467 </icu:dictionaries>
2468 </icu:breakIteratorData>
2469 </special>
2470 --- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700
2471 +++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800
2472 @@ -2188,21 +2188,21 @@
2473
2474
2475 {
2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);
2478 const UChar *got = NULL, *exp=NULL;
2479 int32_t gotLen = 0, expLen=0;
2480 - ja = ures_getByKey(ja, "boundaries", ja, &status);
2481 - exp = tres_getString(ja, -1, "word", &expLen, &status);
2482 + th = ures_getByKey(th, "boundaries", th, &status);
2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status);
2484
2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status);
2486 - got = tres_getString(tb, -1, "word", &gotLen, &status);
2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status);
2488
2489 if(U_FAILURE(status)) {
2490 log_err("%s trying to read str boundaries\n", u_errorName(statu s));
2491 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
2492 log_err("Referencing alias didn't get the right data\n");
2493 }
2494 - ures_close(ja);
2495 + ures_close(th);
2496 status = U_ZERO_ERROR;
2497 }
2498 /* simple alias */
2499 --- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700
2500 +++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800
2501 @@ -156,9 +156,13 @@
2502 if(*a!=*b){
2503 errln("Failed: boilerplate method operator!= does not return correct re sults");
2504 }
2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
2506 - if(a && c){
2507 - if(*c==*a){
2508 + // Japanese word break iterator is identical to root with
2509 + // a dictionary-based break iterator, but Thai character break iterator
2510 + // is still different from Root.
2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us);
2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us);
2513 + if(c && d){
2514 + if(*c==*d){
2515 errln("Failed: boilerplate method opertator== does not return corre ct results");
2516 }
2517 }else{
2518 @@ -167,6 +171,7 @@
2519 delete a;
2520 delete b;
2521 delete c;
2522 + delete d;
2523 }
2524
2525 void RBBIAPITest::TestgetRules()
2526 @@ -635,21 +640,21 @@
2527 //
2528 void RBBIAPITest::TestRuleStatus() {
2529 UChar str[30];
2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ",
2531 - // 012345678901234567 8 9 0 1 2 3 4 5 6
2532 - // Ideographic Katakana Hiragana
2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
2536 + // 012345678901234567 8 9 0
2537 + // Katakana
2538 str, 30);
2539 UnicodeString testString1(str);
2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE};
2547
2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT,
2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT,
2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
2553
2554 UErrorCode status=U_ZERO_ERROR;
2555
2556 @@ -888,9 +893,11 @@
2557
2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status);
2559 {
2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.
2561 if (ja_word && *ja_word == *root_word) {
2562 errln("japan not different from root");
2563 }
2564 +#endif
2565 }
2566
2567 {
2568 --- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700
2569 +++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800
2570 @@ -35,6 +35,8 @@
2571 #include <string.h>
2572 #include <stdio.h>
2573 #include <stdlib.h>
2574 +#include "unicode/numfmt.h"
2575 +#include "unicode/uscript.h"
2576
2577 #define TEST_ASSERT(x) {if (!(x)) { \
2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
2579 @@ -138,11 +140,13 @@
2580 if (exec) TestThaiBreaks(); break;
2581 case 23: name = "TestTailoredBreaks";
2582 if (exec) TestTailoredBreaks(); break;
2583 + case 24: name = "TestTrieDictWithValue";
2584 + if(exec) TestTrieDictWithValue(); break;
2585 #else
2586 - case 21: case 22: case 23: name = "skip";
2587 + case 21: case 22: case 23: case 24: name = "skip";
2588 break;
2589 #endif
2590 - case 24: name = "TestDictRules";
2591 + case 25: name = "TestDictRules";
2592 if (exec) TestDictRules(); break;
2593 case 25: name = "TestBug5532";
2594 if (exec) TestBug5532(); break;
2595 @@ -607,6 +611,8 @@
2596
2597
2598 void RBBITest::TestJapaneseWordBreak() {
2599 +// TODO: Rewrite this test for a dictionary-based word breaking.
2600 +#if 0
2601 UErrorCode status = U_ZERO_ERROR;
2602 BITestData japaneseWordSelection(status);
2603
2604 @@ -628,6 +634,7 @@
2605
2606 generalIteratorTest(*e, japaneseWordSelection);
2607 delete e;
2608 +#endif
2609 }
2610
2611 void RBBITest::TestTrieDict() {
2612 @@ -849,6 +856,372 @@
2613 delete compact2;
2614 }
2615
2616 +/*TODO: delete later*/
2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
2618 +    UErrorCode status = U_ZERO_ERROR;
2619 +    // Debug helper: dumps every word of enumer to filename as UTF-8, one per line.
2620 +    // The converter is opened before the file so that the early return leaks nothing.
2621 +    UConverter *cvt = ucnv_open("UTF-8", &status);
2622 +    if (U_FAILURE(status))
2623 +        return;
2624 +    FILE *outfile = fopen(filename,"w");
2625 +    if(outfile != NULL){
2626 +        const UnicodeString *word = enumer->snext(status);
2627 +        while (word != NULL && U_SUCCESS(status)) {
2628 +            // Zero-filled + capacity-1: stays NUL-terminated even if a long word is truncated.
2629 +            char u8word[500] = { 0 };
2630 +            ucnv_fromUChars(cvt, u8word, (int32_t)sizeof(u8word) - 1,
2631 +                            word->getBuffer(), word->length(), &status);
2632 +            fprintf(outfile,"%s\n", u8word);
2633 +            status = U_ZERO_ERROR;  // a conversion problem must not end the enumeration
2634 +            word = enumer->snext(status);
2635 +        }
2636 +        fclose(outfile);
2637 +    }
2638 +    ucnv_close(cvt);
2639 +
2640 +// A very simple helper class to streamline the buffer handling in
2641 +// TestTrieDictWithValue
2642 +template<class T, size_t N>
2643 +class AutoBuffer {
2644 + public:
2645 +    // Uses the inline array for up to N elements and a heap array beyond that.
2646 +    explicit AutoBuffer(size_t size) : buffer(stackBuffer) {
2647 +        if (size > N)
2648 +            buffer = new T[size];
2649 +    }
2650 +    ~AutoBuffer() {
2651 +        if (buffer != stackBuffer)
2652 +            delete [] buffer;
2653 +    }
2654 +    // Raw access to the active storage; none of the accessors check bounds.
2655 +    T* elems() { return buffer; }
2656 +    const T& operator[] (size_t i) const { return buffer[i]; }
2657 +    T& operator[] (size_t i) { return buffer[i]; }
2658 + private:
2659 +    T stackBuffer[N];
2660 +    T* buffer;
2661 +    // Declared but never defined: default construction and copying are
2662 +    // forbidden, because a copy would delete [] the same heap block twice.
2663 +    AutoBuffer();
2664 +    AutoBuffer(const AutoBuffer&);
2665 +    AutoBuffer& operator=(const AutoBuffer&);
2666 +};
2667 +
2668 +//----------------------------------------------------------------------------
2669 +//
2670 +// TestTrieDictWithValue Test trie dictionaries with logprob values and
2671 +// more than 2^16 nodes after compaction.
2672 +//
2673 +//----------------------------------------------------------------------------
2674 +void RBBITest::TestTrieDictWithValue() {
2675 +    UErrorCode status = U_ZERO_ERROR;
2676 +
2677 +    //
2678 +    // Open and read the test data file.
2679 +    //
2680 +    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2681 +    const char *filename = "cjdict-truncated.txt";
2682 +    char testFileName[1000];
2683 +    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
2684 +        errln("Can't open test data. Path too long.");
2685 +        return;
2686 +    }
2687 +    strcpy(testFileName, testDataDirectory);
2688 +    strcat(testFileName, filename);
2689 +
2690 +    // Items needing deleting at the end
2691 +    MutableTrieDictionary *mutableDict = NULL;
2692 +    CompactTrieDictionary *compactDict = NULL;
2693 +    UnicodeSet *breaks = NULL;
2694 +    UChar *testFile = NULL;
2695 +    StringEnumeration *enumer1 = NULL;
2696 +    StringEnumeration *enumer2 = NULL;
2697 +    MutableTrieDictionary *mutable2 = NULL;
2698 +    StringEnumeration *cloneEnum = NULL;
2699 +    CompactTrieDictionary *compact2 = NULL;
2700 +    NumberFormat *nf = NULL;
2701 +    UText *originalText = NULL, *cloneText = NULL;
2702 +
2703 +    const UnicodeString *originalWord = NULL;
2704 +    const UnicodeString *cloneWord = NULL;
2705 +    UChar *current;
2706 +    UChar *word;
2707 +    UChar uc;
2708 +    int32_t wordLen;
2709 +    int32_t wordCount;
2710 +    int32_t testCount;
2711 +    int32_t valueLen;
2712 +    int counter = 0;
2713 +
2714 +    int len;
2715 +    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
2716 +    if (U_FAILURE(status)) {
2717 +        goto cleanup; /* something went wrong, error already output */
2718 +    }
2719 +
2720 +    mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
2721 +    if (U_FAILURE(status)) {
2722 +        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
2723 +        goto cleanup;
2724 +    }
2725 +
2726 +    breaks = new UnicodeSet;
2727 +    breaks->add(0x000A); // Line Feed
2728 +    breaks->add(0x000D); // Carriage Return
2729 +    breaks->add(0x2028); // Line Separator
2730 +    breaks->add(0x2029); // Paragraph Separator
2731 +    breaks->add(0x0009); // Tab character
2732 +
2733 +    // Now add each non-comment line of the file as a word.
2734 +    current = testFile;
2735 +    word = current;
2736 +    uc = *current++;
2737 +    wordLen = 0;
2738 +    wordCount = 0;
2739 +    nf = NumberFormat::createInstance(status);
2740 +    if (U_FAILURE(status)) { errln("Could not create NumberFormat: %s\n", u_errorName(status)); goto cleanup; }
2741 +    while (uc) {
2742 +        UnicodeString ucharValue;
2743 +        valueLen = 0;
2744 +
2745 +        if (uc == 0x0023) { // #comment line, skip
2746 +            while (uc && !breaks->contains(uc)) {
2747 +                uc = *current++;
2748 +            }
2749 +        }
2750 +        else{
2751 +            while (uc && !breaks->contains(uc)) {
2752 +                ++wordLen;
2753 +                uc = *current++;
2754 +            }
2755 +            if(uc == 0x0009){ //separator is a tab char, read in num after tab
2756 +                uc = *current++;
2757 +                while (uc && !breaks->contains(uc)) {
2758 +                    ucharValue.append(uc);
2759 +                    uc = *current++;
2760 +                }
2761 +            }
2762 +        }
2763 +        if (wordLen > 0) {
2764 +            Formattable value((int32_t)0);
2765 +            nf->parse(ucharValue.getTerminatedBuffer(), value, status);
2766 +
2767 +            if(U_FAILURE(status)){
2768 +                errln("parsing of value failed when reading in dictionary\n");
2769 +                goto cleanup;
2770 +            }
2771 +            mutableDict->addWord(word, wordLen, status, value.getLong());
2772 +            if (U_FAILURE(status)) {
2773 +                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
2774 +                goto cleanup;
2775 +            }
2776 +            wordCount += 1;
2777 +        }
2778 +
2779 +        // Find beginning of next line
2780 +        while (uc && breaks->contains(uc)) {
2781 +            uc = *current++;
2782 +        }
2783 +        word = current-1;
2784 +        wordLen = 0;
2785 +    }
2786 +
2787 +    if (wordCount < 50) {
2788 +        errln("Word count (%d) unreasonably small\n", wordCount);
2789 +        goto cleanup;
2790 +    }
2791 +
2792 +    enumer1 = mutableDict->openWords(status);
2793 +    if (U_FAILURE(status)) {
2794 +        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
2795 +        goto cleanup;
2796 +    }
2797 +
2798 +    testCount = 0;
2799 +    if (wordCount != (testCount = enumer1->count(status))) {
2800 +        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
2801 +              testCount, wordCount, u_errorName(status));
2802 +        goto cleanup;
2803 +    }
2804 +
2805 +    // Now compact it
2806 +    compactDict = new CompactTrieDictionary(*mutableDict, status);
2807 +    if (U_FAILURE(status)) {
2808 +        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
2809 +        goto cleanup;
2810 +    }
2811 +
2812 +    enumer2 = compactDict->openWords(status);
2813 +    if (U_FAILURE(status)) {
2814 +        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
2815 +        goto cleanup;
2816 +    }
2817 +
2818 +
2819 +    //delete later
2820 +//    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
2821 +//    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
2822 +
2823 +    enumer1->reset(status);
2824 +    enumer2->reset(status);
2825 +
2826 +    originalWord = enumer1->snext(status);
2827 +    cloneWord = enumer2->snext(status);
2828 +    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
2829 +        if (*originalWord != *cloneWord) {
2830 +            errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
2831 +                  counter, originalWord->length(), cloneWord->length());
2832 +            goto cleanup;
2833 +        }
2834 +
2835 +        // check if attached values of the same word in both dictionaries tally
2836 +#if 0
2837 +        int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
2838 +        uint16_t values1[originalWord->length()], values2[cloneWord->length()];
2839 +#endif
2840 +        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
2841 +        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
2842 +        AutoBuffer<uint16_t, 20> values1(originalWord->length());
2843 +        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
2844 +
2845 +        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
2846 +        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
2847 +
2848 +        int count1, count2;
2849 +        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
2850 +        compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
2851 +
2852 +        if(values1[count1-1] != values2[count2-1]){
2853 +            errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
2854 +                  counter, values1[count1-1], values2[count2-1]);
2855 +            goto cleanup;
2856 +        }
2857 +
2858 +        counter++;
2859 +        originalWord = enumer1->snext(status);
2860 +        cloneWord = enumer2->snext(status);
2861 +    }
2862 +    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
2863 +        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
2864 +    }
2865 +
2866 +    delete enumer1;
2867 +    enumer1 = NULL;
2868 +    delete enumer2;
2869 +    enumer2 = NULL;
2870 +
2871 +    // Now un-compact it
2872 +    mutable2 = compactDict->cloneMutable(status);
2873 +    if (U_FAILURE(status)) {
2874 +        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
2875 +        goto cleanup;
2876 +    }
2877 +
2878 +    cloneEnum = mutable2->openWords(status);
2879 +    if (U_FAILURE(status)) {
2880 +        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
2881 +        goto cleanup;
2882 +    }
2883 +
2884 +    if (wordCount != (testCount = cloneEnum->count(status))) {
2885 +        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
2886 +              testCount, wordCount, u_errorName(status));
2887 +        goto cleanup;
2888 +    }
2889 +
2890 +    // Compact original dictionary to clone. Note that we can only compare the same kind of
2891 +    // dictionary as the order of the enumerators is not guaranteed to be the same between
2892 +    // different kinds
2893 +    enumer1 = mutableDict->openWords(status);
2894 +    if (U_FAILURE(status)) {
2895 +        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
2896 +        goto cleanup;
2897 +    }
2898 +
2899 +    counter = 0;
2900 +    originalWord = enumer1->snext(status);
2901 +    cloneWord = cloneEnum->snext(status);
2902 +    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
2903 +        if (*originalWord != *cloneWord) {
2904 +            errln("Original and cloned MutableTrieDictionary word mismatch\n");
2905 +            goto cleanup;
2906 +        }
2907 +
2908 +        // check if attached values of the same word in both dictionaries tally
2909 +        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
2910 +        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
2911 +        AutoBuffer<uint16_t, 20> values1(originalWord->length());
2912 +        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
2913 +        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
2914 +        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
2915 +
2916 +        int count1, count2;
2917 +        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
2918 +        mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
2919 +
2920 +        if(values1[count1-1] != values2[count2-1]){
2921 +            errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
2922 +                  counter, values1[count1-1], values2[count2-1]);
2923 +            goto cleanup;
2924 +        }
2925 +
2926 +        counter++;
2927 +
2928 +        originalWord = enumer1->snext(status);
2929 +        cloneWord = cloneEnum->snext(status);
2930 +    }
2931 +
2932 +    if (U_FAILURE(status)) {
2933 +        errln("Enumeration failed: %s\n", u_errorName(status));
2934 +        goto cleanup;
2935 +    }
2936 +
2937 +    if (originalWord != cloneWord) {
2938 +        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
2939 +        goto cleanup;
2940 +    }
2941 +
2942 +    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
2943 +    compact2 = new CompactTrieDictionary(compactDict->data(), status);
2944 +    if (U_FAILURE(status)) {
2945 +        errln("CompactTrieDictionary(const void *,...) failed\n");
2946 +        goto cleanup;
2947 +    }
2948 +
2949 +    if (compact2->dataSize() == 0) {
2950 +        errln("CompactTrieDictionary->dataSize() == 0\n");
2951 +        goto cleanup;
2952 +    }
2953 +
2954 +    // Now count the words via the second dictionary
2955 +    delete enumer1;
2956 +    enumer1 = compact2->openWords(status);
2957 +    if (U_FAILURE(status)) {
2958 +        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
2959 +        goto cleanup;
2960 +    }
2961 +
2962 +    if (wordCount != (testCount = enumer1->count(status))) {
2963 +        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
2964 +              testCount, wordCount, u_errorName(status));
2965 +        goto cleanup;
2966 +    }
2967 +
2968 +cleanup:
2969 +    delete compactDict;
2970 +    delete mutableDict;
2971 +    delete breaks;
2972 +    delete[] testFile;
2973 +    delete enumer1;
2974 +    delete mutable2;
2975 +    delete cloneEnum;
2976 +    delete compact2;
2977 +    utext_close(originalText);
2978 +    utext_close(cloneText);
2979 +    delete enumer2;  // still owned here when a failure jumped past the in-line delete (NULL otherwise)
2980 +    delete nf;       // the NumberFormat was not released on any path before
2981 +}
2982
2983 //----------------------------------------------------------------------------
2984 //
2985 @@ -1870,8 +2243,15 @@
2986 // Don't break in runs of hiragana or runs of ideograph, where the latter inclu des \u3005 \u3007 \u303B (cldrbug #2009).
2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
2989 +#if 0
2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 };
2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
2992 +#endif
2993 +// There's no separate Japanese word break iterator. Root is the same as Japanese.
2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,
2995 +// U+3007, U+303B and some other cases.
2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
2998
2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
3000 // Add break after Greek question mark (cldrbug #2069).
3001 @@ -2672,6 +3052,8 @@
3002 UnicodeSet *fNewlineSet;
3003 UnicodeSet *fKatakanaSet;
3004 UnicodeSet *fALetterSet;
3005 + // TODO(jungshik): Do we still need this change?
3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
3007 UnicodeSet *fMidNumLetSet;
3008 UnicodeSet *fMidLetterSet;
3009 UnicodeSet *fMidNumSet;
3010 @@ -2680,6 +3062,7 @@
3011 UnicodeSet *fOtherSet;
3012 UnicodeSet *fExtendSet;
3013 UnicodeSet *fExtendNumLetSet;
3014 + UnicodeSet *fDictionaryCjkSet;
3015
3016 RegexMatcher *fMatcher;
3017
3018 @@ -2696,12 +3079,24 @@
3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
3024 + // Exclude Hangul syllables from ALetterSet during testing.
3025 + // Leave CJK dictionary characters out from the monkey tests!
3026 +#if 0
3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
3028 + "[\\p{Line_Break = Complex_Context}"
3029 + "-\\p{Grapheme_Cluster_Break = Extend}"
3030 + "-\\p{Grapheme_Cluster_Break = Control}"
3031 + "]]",
3032 + status);
3033 +#endif
3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3035 + fALetterSet->removeAll(*fDictionaryCjkSet);
3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);
3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
3045 @@ -2725,13 +3120,14 @@
3046 fOtherSet->removeAll(*fFormatSet);
3047 fOtherSet->removeAll(*fExtendSet);
3048 // Inhibit dictionary characters from being tested at all.
3049 + fOtherSet->removeAll(*fDictionaryCjkSet);
3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status));
3051
3052 fSets->addElement(fCRSet, status);
3053 fSets->addElement(fLFSet, status);
3054 fSets->addElement(fNewlineSet, status);
3055 fSets->addElement(fALetterSet, status);
3056 - fSets->addElement(fKatakanaSet, status);
3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
3058 fSets->addElement(fMidLetterSet, status);
3059 fSets->addElement(fMidNumLetSet, status);
3060 fSets->addElement(fMidNumSet, status);
3061 @@ -3978,6 +4374,7 @@
3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3063 count --;
3064 if (forward[count] != i) {
3065 + printStringBreaks(ustr, expected, expectedcount);
3066 test->errln("happy break test previous() failed: expected %d but go t %d",
3067 forward[count], i);
3068 break;
3069 @@ -4011,23 +4408,25 @@
3070 UErrorCode status = U_ZERO_ERROR;
3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);
3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3073 + // Replaced any C+J characters in a row with a random sequence of characters
3074 + // of the same length to make our C+J segmentation not get in the way.
3075 static const char *strlist[] =
3076 {
3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b",
3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b",
3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a",
3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b",
3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ",
3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3090 "\\u2027\\U000e0067\\u0a47\\u00b7",
3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3093 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3097 "\\u0027\\u11af\\U000e0057\\u0602",
3098 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3099 @@ -4039,7 +4438,7 @@
3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3101 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3103 - "\\u58f4\\U000e0049\\u20e7\\u2027",
3104 + "\\u18f4\\U000e0049\\u20e7\\u2027",
3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3106 "\\ua183\\u102d\\u0bec\\u003a",
3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3108 @@ -4049,7 +4448,7 @@
3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3110 "\\u003a\\u0664\\u00b7\\u1fba",
3111 "\\u003b\\u0027\\u00b7\\u47a3",
3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3116 };
3117 @@ -4104,12 +4503,12 @@
3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068",
3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3124 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3126 "\\u58f4\\U000e0049\\u20e7\\u2027",
3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3129 "\\ua183\\u102d\\u0bec\\u003a",
3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3131 "\\u003a\\u0e57\\u0fad\\u002e",
3132 --- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700
3133 +++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800
3134 @@ -70,6 +70,7 @@
3135 void TestBug5775();
3136 void TestThaiBreaks();
3137 void TestTailoredBreaks();
3138 + void TestTrieDictWithValue();
3139 void TestDictRules();
3140 void TestBug5532();
3141
3142 --- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700
3143 +++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800
3144 @@ -161,7 +161,23 @@
3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
3146
3147 # Hiragana & Katakana stay together, but separates from each other and Latin.
3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent
3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data>
3151 +
3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
3153 +<data>•芽キャベツ<400>芽キャベツ<400></data>
3154 +
3155 +# more Japanese tests
3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
3157 +# and the Katakana block are not treated correctly. Enable this later.
3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3160 +
3161 +# Testing of word boundary for dictionary word containing both kanji and kana
3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
3163 +
3164 +# Testing of Chinese segmentation (taken from a Chinese news article)
3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata>
3166
3167 # Words with interior formatting characters
3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a>
3169 @@ -169,6 +185,8 @@
3170 # to test for bug #4097779
3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
3172
3173 +# fullwidth numeric, midletter characters etc should be treated like their half width counterparts
3174 +<data>•ISN'T<200> •19<100>日<400></data>
3175
3176 # to test for bug #4098467
3177 # What follows is a string of Korean characters (I found it in the Yellow Pages
3178 @@ -178,9 +196,15 @@
3179 # precomposed syllables...
3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
3181
3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters )
3184 +# Disable them now because we don't include a Korean dictionary.
3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
3187 +
3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta>
3189 +
3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data>
3191
3192 -<data>•\u06c9\uc799\ufffa<200></data>
3193
3194 #
3195 # Try some words from other scripts.
3196 @@ -491,8 +515,7 @@
3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data>
3198
3199 # conjoining jamo...
3200 -# TODO: rules update needed
3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data>
3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data>
3203
3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
3206 --- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0 800
3207 +++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0 800
3208 @@ -28,7 +28,7 @@
3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
3210
3211 // aliasing using position
3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle
3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle
3214
3215 // aliasing arrays
3216 zoneTests {
3217 --- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700
3218 +++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800
3219 @@ -1,6 +1,6 @@
3220 /*
3221 **********************************************************************
3222 -* Copyright (C) 2002-2009, International Business Machines
3223 +* Copyright (C) 2002-2010, International Business Machines
3224 * Corporation and others. All Rights Reserved.
3225 **********************************************************************
3226 *
3227 @@ -34,12 +34,15 @@
3228 #include "unicode/udata.h"
3229 #include "unicode/putil.h"
3230
3231 +//#include "unicode/ustdio.h"
3232 +
3233 #include "uoptions.h"
3234 #include "unewdata.h"
3235 #include "ucmndata.h"
3236 #include "rbbidata.h"
3237 #include "triedict.h"
3238 #include "cmemory.h"
3239 +#include "uassert.h"
3240
3241 #include <stdio.h>
3242 #include <stdlib.h>
3243 @@ -199,147 +202,191 @@
3244 long wordFileSize;
3245 FILE *file;
3246 char *wordBufferC;
3247 -
3248 + MutableTrieDictionary *mtd = NULL;
3249 +
3250 file = fopen(wordFileName, "rb");
3251 - if( file == 0 ) {
3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
3253 - exit(-1);
3254 - }
3255 - fseek(file, 0, SEEK_END);
3256 - wordFileSize = ftell(file);
3257 - fseek(file, 0, SEEK_SET);
3258 - wordBufferC = new char[wordFileSize+10];
3259 -
3260 - result = (long)fread(wordBufferC, 1, wordFileSize, file);
3261 - if (result != wordFileSize) {
3262 - fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
3263 - exit (-1);
3264 - }
3265 - wordBufferC[wordFileSize]=0;
3266 - fclose(file);
3267 -
3268 - //
3269 - // Look for a Unicode Signature (BOM) on the word file
3270 - //
3271 - int32_t signatureLength;
3272 - const char * wordSourceC = wordBufferC;
3273 - const char* encoding = ucnv_detectUnicodeSignature(
3274 - wordSourceC, wordFileSize, &signatureLength, &status );
3275 - if (U_FAILURE(status)) {
3276 - exit(status);
3277 - }
3278 - if(encoding!=NULL ){
3279 - wordSourceC += signatureLength;
3280 - wordFileSize -= signatureLength;
3281 - }
3282 -
3283 - //
3284 - // Open a converter to take the rule file to UTF-16
3285 - //
3286 - UConverter* conv;
3287 - conv = ucnv_open(encoding, &status);
3288 - if (U_FAILURE(status)) {
3289 - fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
3290 - exit(status);
3291 - }
3292 -
3293 - //
3294 - // Convert the words to UChar.
3295 - // Preflight first to determine required buffer size.
3296 - //
3297 - uint32_t destCap = ucnv_toUChars(conv,
3298 - NULL, // dest,
3299 - 0, // destCapacity,
3300 - wordSourceC,
3301 - wordFileSize,
3302 - &status);
3303 - if (status != U_BUFFER_OVERFLOW_ERROR) {
3304 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status ));
3305 - exit(status);
3306 - };
3307 -
3308 - status = U_ZERO_ERROR;
3309 - UChar *wordSourceU = new UChar[destCap+1];
3310 - ucnv_toUChars(conv,
3311 - wordSourceU, // dest,
3312 - destCap+1,
3313 - wordSourceC,
3314 - wordFileSize,
3315 - &status);
3316 - if (U_FAILURE(status)) {
3317 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status ));
3318 - exit(status);
3319 - };
3320 - ucnv_close(conv);
3321 -
3322 - // Get rid of the original file buffer
3323 - delete[] wordBufferC;
3324 -
3325 - // Create a MutableTrieDictionary, and loop through all the lines, insertin g
3326 - // words.
3327 -
3328 - // First, pick a median character.
3329 - UChar *current = wordSourceU + (destCap/2);
3330 - UChar uc = *current++;
3331 - UnicodeSet breaks;
3332 - breaks.add(0x000A); // Line Feed
3333 - breaks.add(0x000D); // Carriage Return
3334 - breaks.add(0x2028); // Line Separator
3335 - breaks.add(0x2029); // Paragraph Separator
3336 -
3337 - do {
3338 - // Look for line break
3339 - while (uc && !breaks.contains(uc)) {
3340 - uc = *current++;
3341 - }
3342 - // Now skip to first non-line-break
3343 - while (uc && breaks.contains(uc)) {
3344 - uc = *current++;
3345 + if( file == 0 ) { //cannot find file
3346 + //create 1-line dummy file: ie 1 char, 1 value
3347 + UNewDataMemory *pData;
3348 + char msg[1024];
3349 +
3350 + /* write message with just the name */
3351 + sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outF ileName);
3352 + fprintf(stderr, "%s\n", msg);
3353 +
3354 + UChar c = 0x0020;
3355 + mtd = new MutableTrieDictionary(c, status, TRUE);
3356 + mtd->addWord(&c, 1, status, 1);
3357 +
3358 + } else { //read words in from input file
3359 + fseek(file, 0, SEEK_END);
3360 + wordFileSize = ftell(file);
3361 + fseek(file, 0, SEEK_SET);
3362 + wordBufferC = new char[wordFileSize+10];
3363 +
3364 + result = (long)fread(wordBufferC, 1, wordFileSize, file);
3365 + if (result != wordFileSize) {
3366 + fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
3367 + exit (-1);
3368 }
3369 - }
3370 - while (uc && (breaks.contains(uc) || u_isspace(uc)));
3371 -
3372 - MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
3373 + wordBufferC[wordFileSize]=0;
3374 + fclose(file);
3375
3376 - if (U_FAILURE(status)) {
3377 - fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_erro rName(status));
3378 - exit(status);
3379 - }
3380 + //
3381 + // Look for a Unicode Signature (BOM) on the word file
3382 + //
3383 + int32_t signatureLength;
3384 + const char * wordSourceC = wordBufferC;
3385 + const char* encoding = ucnv_detectUnicodeSignature(
3386 + wordSourceC, wordFileSize, &signatureLength, &st atus);
3387 + if (U_FAILURE(status)) {
3388 + exit(status);
3389 + }
3390 + if(encoding!=NULL ){
3391 + wordSourceC += signatureLength;
3392 + wordFileSize -= signatureLength;
3393 + }
3394
3395 - // Now add the words. Words are non-space characters at the beginning of
3396 - // lines, and must be at least one UChar.
3397 - current = wordSourceU;
3398 - UChar *candidate = current;
3399 - uc = *current++;
3400 - int32_t length = 0;
3401 -
3402 - while (uc) {
3403 - while (uc && !u_isspace(uc)) {
3404 - ++length;
3405 - uc = *current++;
3406 + //
3407 + // Open a converter to take the rule file to UTF-16
3408 + //
3409 + UConverter* conv;
3410 + conv = ucnv_open(encoding, &status);
3411 + if (U_FAILURE(status)) {
3412 + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status ));
3413 + exit(status);
3414 }
3415 - if (length > 0) {
3416 - mtd->addWord(candidate, length, status);
3417 - if (U_FAILURE(status)) {
3418 - fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s \"\n",
3419 - u_errorName(status));
3420 - exit(status);
3421 +
3422 + //
3423 + // Convert the words to UChar.
3424 + // Preflight first to determine required buffer size.
3425 + //
3426 + uint32_t destCap = ucnv_toUChars(conv,
3427 + NULL, // dest,
3428 + 0, // destCapacity,
3429 + wordSourceC,
3430 + wordFileSize,
3431 + &status);
3432 + if (status != U_BUFFER_OVERFLOW_ERROR) {
3433 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st atus));
3434 + exit(status);
3435 + };
3436 +
3437 + status = U_ZERO_ERROR;
3438 + UChar *wordSourceU = new UChar[destCap+1];
3439 + ucnv_toUChars(conv,
3440 + wordSourceU, // dest,
3441 + destCap+1,
3442 + wordSourceC,
3443 + wordFileSize,
3444 + &status);
3445 + if (U_FAILURE(status)) {
3446 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st atus));
3447 + exit(status);
3448 + };
3449 + ucnv_close(conv);
3450 +
3451 + // Get rid of the original file buffer
3452 + delete[] wordBufferC;
3453 +
3454 + // Create a MutableTrieDictionary, and loop through all the lines, inserting
3455 + // words.
3456 +
3457 + // First, pick a median character.
3458 + UChar *current = wordSourceU + (destCap/2);
3459 + UChar uc = *current++;
3460 + UnicodeSet breaks;
3461 + breaks.add(0x000A); // Line Feed
3462 + breaks.add(0x000D); // Carriage Return
3463 + breaks.add(0x2028); // Line Separator
3464 + breaks.add(0x2029); // Paragraph Separator
3465 +
3466 + do {
3467 + // Look for line break
3468 + while (uc && !breaks.contains(uc)) {
3469 + uc = *current++;
3470 + }
3471 + // Now skip to first non-line-break
3472 + while (uc && breaks.contains(uc)) {
3473 + uc = *current++;
3474 }
3475 }
3476 - // Find beginning of next line
3477 - while (uc && !breaks.contains(uc)) {
3478 - uc = *current++;
3479 + while (uc && (breaks.contains(uc) || u_isspace(uc)));
3480 +
3481 + mtd = new MutableTrieDictionary(uc, status);
3482 +
3483 + if (U_FAILURE(status)) {
3484 + fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_ errorName(status));
3485 + exit(status);
3486 }
3487 - while (uc && breaks.contains(uc)) {
3488 - uc = *current++;
3489 +
3490 + // Now add the words. Words are non-space characters at the beginning of
3491 + // lines, and must be at least one UChar. If a word has an associated value,
3492 + // the value should follow the word on the same line after a tab character.
3493 + current = wordSourceU;
3494 + UChar *candidate = current;
3495 + uc = *current++;
3496 + int32_t length = 0;
3497 + int count = 0;
3498 +
3499 + while (uc) {
3500 + while (uc && !u_isspace(uc)) {
3501 + ++length;
3502 + uc = *current++;
3503 + }
3504 +
3505 + UnicodeString valueString;
3506 + UChar candidateValue;
3507 + if(uc == 0x0009){ //separator is a tab char, read in number after space
3508 + while (uc && u_isspace(uc)) {
3509 + uc = *current++;
3510 + }
3511 + while (uc && !u_isspace(uc)) {
3512 + valueString.append(uc);
3513 + uc = *current++;
3514 + }
3515 + }
3516 +
3517 + if (length > 0) {
3518 + count++;
3519 + if(valueString.length() > 0){
3520 + mtd->setValued(TRUE);
3521 +
3522 + uint32_t value = 0;
3523 + char* s = new char[valueString.length()];
3524 + valueString.extract(0,valueString.length(), s, valueString. length());
3525 + int n = sscanf(s, "%ud", &value);
3526 + U_ASSERT(n == 1);
3527 + U_ASSERT(value >= 0);
3528 + mtd->addWord(candidate, length, status, (uint16_t)value);
3529 + delete[] s;
3530 + } else {
3531 + mtd->addWord(candidate, length, status);
3532 + }
3533 +
3534 + if (U_FAILURE(status)) {
3535 + fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
3536 + u_errorName(status), count);
3537 + exit(status);
3538 + }
3539 + }
3540 +
3541 + // Find beginning of next line
3542 + while (uc && !breaks.contains(uc)) {
3543 + uc = *current++;
3544 + }
3545 + // Find next non-line-breaking character
3546 + while (uc && breaks.contains(uc)) {
3547 + uc = *current++;
3548 + }
3549 + candidate = current-1;
3550 + length = 0;
3551 }
3552 - candidate = current-1;
3553 - length = 0;
3554 +
3555 + // Get rid of the Unicode text buffer
3556 + delete[] wordSourceU;
3557 }
3558
3559 - // Get rid of the Unicode text buffer
3560 - delete[] wordSourceU;
3561 -
3562 // Now, create a CompactTrieDictionary from the mutable dictionary
3563 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
3564 if (U_FAILURE(status)) {
3565 @@ -393,4 +440,3 @@
3566
3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
3568 }
3569 -
3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800
3571 +++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800
3572 @@ -23,13 +23,13 @@
3573 ## Extra files to remove for 'make clean'
3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES)
3575
3576 -## Target information
3577 +## Target information
3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
3579
3580 ifneq ($(top_builddir),$(top_srcdir))
3581 CPPFLAGS += -I$(top_builddir)/common
3582 endif
3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n
3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
3586
3587 OBJECTS = genctd.o
OLDNEW
« no previous file with comments | « icu52/patches/search_collation.patch ('k') | icu52/patches/si_value.undef.patch » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698