OLD | NEW |
| (Empty) |
1 --- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800 | |
2 +++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800 | |
3 @@ -226,6 +226,30 @@ | |
4 case USCRIPT_THAI: | |
5 engine = new ThaiBreakEngine(dict, status); | |
6 break; | |
7 + | |
8 + case USCRIPT_HANGUL: | |
9 + engine = new CjkBreakEngine(dict, kKorean, status); | |
10 + break; | |
11 + | |
12 + // use same BreakEngine and dictionary for both Chinese and Japanes
e | |
13 + case USCRIPT_HIRAGANA: | |
14 + case USCRIPT_KATAKANA: | |
15 + case USCRIPT_HAN: | |
16 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); | |
17 + break; | |
18 +#if 0 | |
19 + // TODO: Have to get some characters with script=common handled | |
20 + // by CjkBreakEngine (e.g. U+309B). Simply subjecting | |
21 + // them to CjkBreakEngine does not work. The engine has to | |
22 + // special-case them. | |
23 + case USCRIPT_COMMON: | |
24 + { | |
25 + UBlockCode block = ublock_getCode(code); | |
26 + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) | |
27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status); | |
28 + break; | |
29 + } | |
30 +#endif | |
31 default: | |
32 break; | |
33 } | |
34 @@ -281,6 +305,13 @@ | |
35 dict = NULL; | |
36 } | |
37 return dict; | |
38 + } else if (dictfname != NULL){ | |
39 + //create dummy dict if dictionary filename not valid | |
40 + UChar c = 0x0020; | |
41 + status = U_ZERO_ERROR; | |
42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE)
; | |
43 + mtd->addWord(&c, 1, status, 1); | |
44 + return new CompactTrieDictionary(*mtd, status); | |
45 } | |
46 return NULL; | |
47 } | |
48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 | |
49 +++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800 | |
50 @@ -16,6 +16,9 @@ | |
51 #include "unicode/ubrk.h" | |
52 #include "uvector.h" | |
53 #include "triedict.h" | |
54 +#include "uassert.h" | |
55 +#include "unicode/normlzr.h" | |
56 +#include "cmemory.h" | |
57 | |
58 U_NAMESPACE_BEGIN | |
59 | |
60 @@ -422,6 +425,294 @@ | |
61 return wordsFound; | |
62 } | |
63 | |
64 +/* | |
65 + ****************************************************************** | |
66 + * CjkBreakEngine | |
67 + */ | |
68 +static const uint32_t kuint32max = 0xFFFFFFFF; | |
69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu
ageType type, UErrorCode &status) | |
70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ | |
71 + if (!adoptDictionary->getValued()) { | |
72 + status = U_ILLEGAL_ARGUMENT_ERROR; | |
73 + return; | |
74 + } | |
75 + | |
76 + // Korean dictionary only includes Hangul syllables | |
77 + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), sta
tus); | |
78 + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); | |
79 + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\
uff9f]"), status); | |
80 + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status
); | |
81 + | |
82 + if (U_SUCCESS(status)) { | |
83 + // handle Korean and Japanese/Chinese using different dictionaries | |
84 + if (type == kKorean) { | |
85 + setCharacters(fHangulWordSet); | |
86 + } else { //Chinese and Japanese | |
87 + UnicodeSet cjSet; | |
88 + cjSet.addAll(fHanWordSet); | |
89 + cjSet.addAll(fKatakanaWordSet); | |
90 + cjSet.addAll(fHiraganaWordSet); | |
91 + cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); | |
92 + setCharacters(cjSet); | |
93 + } | |
94 + } | |
95 +} | |
96 + | |
97 +CjkBreakEngine::~CjkBreakEngine(){ | |
98 + delete fDictionary; | |
99 +} | |
100 + | |
101 +// The katakanaCost values below are based on the length frequencies of all | |
102 +// katakana phrases in the dictionary | |
103 +static const int kMaxKatakanaLength = 8; | |
104 +static const int kMaxKatakanaGroupLength = 20; | |
105 +static const uint32_t maxSnlp = 255; | |
106 + | |
107 +static inline uint32_t getKatakanaCost(int wordLength){ | |
108 + //TODO: fill array with actual values from dictionary! | |
109 + static const uint32_t katakanaCost[kMaxKatakanaLength + 1] | |
110 + = {8192, 984, 408, 240, 204, 252, 300, 3
72, 480}; | |
111 + return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; | |
112 +} | |
113 + | |
114 +static inline bool isKatakana(int32_t value) { // full UChar32 code point; a 16-bit parameter truncated supplementary characters (e.g. U+230A1 -> 0x30A1) | |
115 + return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || | |
116 + (value >= 0xFF66 && value <= 0xFF9F); | |
117 +} | |
118 + | |
119 +// A very simple helper class to streamline the buffer handling in | |
120 +// divideUpDictionaryRange. | |
121 +template<class T, size_t N> | |
122 +class AutoBuffer { | |
123 + public: | |
124 + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { | |
125 + if (size > N) { | |
126 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
127 + capacity = size; | |
128 + } | |
129 + } | |
130 + ~AutoBuffer() { | |
131 + if (buffer != stackBuffer) | |
132 + uprv_free(buffer); | |
133 + } | |
134 +#if 0 | |
135 + T* operator& () { | |
136 + return buffer; | |
137 + } | |
138 +#endif | |
139 + T* elems() { | |
140 + return buffer; | |
141 + } | |
142 + const T& operator[] (size_t i) const { | |
143 + return buffer[i]; | |
144 + } | |
145 + T& operator[] (size_t i) { | |
146 + return buffer[i]; | |
147 + } | |
148 + | |
149 + // resize without copy | |
150 + void resize(size_t size) { | |
151 + if (size <= capacity) | |
152 + return; | |
153 + if (buffer != stackBuffer) | |
154 + uprv_free(buffer); | |
155 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
156 + capacity = size; | |
157 + } | |
158 + private: | |
159 + T stackBuffer[N]; | |
160 + T* buffer; | |
161 + AutoBuffer(); | |
162 + size_t capacity; | |
163 +}; | |
164 + | |
165 + | |
166 +/* | |
167 + * @param text A UText representing the text | |
168 + * @param rangeStart The start of the range of dictionary characters | |
169 + * @param rangeEnd The end of the range of dictionary characters | |
170 + * @param foundBreaks Output stack onto which the break positions are pushed | |
171 + * @return The number of breaks found | |
172 + */ | |
173 +int32_t | |
174 +CjkBreakEngine::divideUpDictionaryRange( UText *text, | |
175 + int32_t rangeStart, | |
176 + int32_t rangeEnd, | |
177 + UStack &foundBreaks ) const { | |
178 + if (rangeStart >= rangeEnd) { | |
179 + return 0; | |
180 + } | |
181 + | |
182 + const size_t defaultInputLength = 80; | |
183 + size_t inputLength = rangeEnd - rangeStart; | |
184 + AutoBuffer<UChar, defaultInputLength> charString(inputLength); | |
185 + | |
186 + // Normalize the input string and put it in normalizedText. | |
187 + // The map from the indices of the normalized input to the raw | |
188 + // input is kept in charPositions. | |
189 + UErrorCode status = U_ZERO_ERROR; | |
190 + utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength,
&status); | |
191 + if (U_FAILURE(status)) | |
192 + return 0; | |
193 + | |
194 + UnicodeString inputString(charString.elems(), inputLength); | |
195 + UNormalizationMode norm_mode = UNORM_NFKC; | |
196 + UBool isNormalized = | |
197 + Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || | |
198 + Normalizer::isNormalized(inputString, norm_mode, status); | |
199 + | |
200 + AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); | |
201 + int numChars = 0; | |
202 + UText normalizedText = UTEXT_INITIALIZER; | |
203 + // Needs to be declared here because normalizedText holds onto its buffer. | |
204 + UnicodeString normalizedString; | |
205 + if (isNormalized) { | |
206 + int32_t index = 0; | |
207 + charPositions[0] = 0; | |
208 + while(index < inputString.length()) { | |
209 + index = inputString.moveIndex32(index, 1); | |
210 + charPositions[++numChars] = index; | |
211 + } | |
212 + utext_openUnicodeString(&normalizedText, &inputString, &status); | |
213 + } | |
214 + else { | |
215 + Normalizer::normalize(inputString, norm_mode, 0, normalizedString, stat
us); | |
216 + if (U_FAILURE(status)) | |
217 + return 0; | |
218 + charPositions.resize(normalizedString.length() + 1); | |
219 + Normalizer normalizer(charString.elems(), inputLength, norm_mode); | |
220 + int32_t index = 0; | |
221 + charPositions[0] = 0; | |
222 + while(index < normalizer.endIndex()){ | |
223 + UChar32 uc = normalizer.next(); | |
224 + charPositions[++numChars] = index = normalizer.getIndex(); | |
225 + } | |
226 + utext_openUnicodeString(&normalizedText, &normalizedString, &status); | |
227 + } | |
228 + | |
229 + if (U_FAILURE(status)) | |
230 + return 0; | |
231 + | |
232 + // From this point on, all the indices refer to the indices of | |
233 + // the normalized input string. | |
234 + | |
235 + // bestSnlp[i] is the snlp of the best segmentation of the first i | |
236 + // characters in the range to be matched. | |
237 + AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); | |
238 + bestSnlp[0] = 0; | |
239 + for(int i=1; i<=numChars; i++){ | |
240 + bestSnlp[i] = kuint32max; | |
241 + } | |
242 + | |
243 + // prev[i] is the start offset of the last word (i.e. the end boundary of | |
244 + // the previous word) in the best segmentation of the first i characters. | |
245 + AutoBuffer<int, defaultInputLength> prev(numChars + 1); | |
246 + for(int i=0; i<=numChars; i++){ | |
247 + prev[i] = -1; | |
248 + } | |
249 + | |
250 + const size_t maxWordSize = 20; | |
251 + AutoBuffer<uint16_t, maxWordSize> values(numChars); | |
252 + AutoBuffer<int32_t, maxWordSize> lengths(numChars); | |
253 + | |
254 + // Dynamic programming to find the best segmentation. | |
255 + bool is_prev_katakana = false; | |
256 + for (int i = 0; i < numChars; ++i) { | |
257 + //utext_setNativeIndex(text, rangeStart + i); | |
258 + utext_setNativeIndex(&normalizedText, i); | |
259 + if (bestSnlp[i] == kuint32max) | |
260 + continue; | |
261 + | |
262 + int count; | |
263 + // limit maximum word length matched to size of current substring | |
264 + int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSiz
e: numChars - i; | |
265 + | |
266 + fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(),
count, maxSearchLength, values.elems()); | |
267 + | |
268 + // if there are no single character matches found in the dictionary | |
269 + // starting with this character, treat the character as a 1-character word | |
270 + // with the highest value possible, i.e. the least likely to occur. | |
271 + // Exclude Korean characters from this treatment, as they should be lef
t | |
272 + // together by default. | |
273 + if((count == 0 || lengths[0] != 1) && | |
274 + !fHangulWordSet.contains(utext_current32(&normalizedText))){ | |
275 + values[count] = maxSnlp; | |
276 + lengths[count++] = 1; | |
277 + } | |
278 + | |
279 + for (int j = 0; j < count; j++){ | |
280 + //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp); | |
281 + uint32_t newSnlp = bestSnlp[i] + values[j]; | |
282 + if (newSnlp < bestSnlp[lengths[j] + i]) { | |
283 + bestSnlp[lengths[j] + i] = newSnlp; | |
284 + prev[lengths[j] + i] = i; | |
285 + } | |
286 + } | |
287 + | |
288 + // In Japanese, | |
289 + // Katakana word in single character is pretty rare. So we apply | |
290 + // the following heuristic to Katakana: any continuous run of Katakana | |
291 + // characters is considered a candidate word with a default cost | |
292 + // specified in the katakanaCost table according to its length. | |
293 + //utext_setNativeIndex(text, rangeStart + i); | |
294 + utext_setNativeIndex(&normalizedText, i); | |
295 + bool is_katakana = isKatakana(utext_current32(&normalizedText)); | |
296 + if (!is_prev_katakana && is_katakana) { | |
297 + int j = i + 1; | |
298 + utext_next32(&normalizedText); | |
299 + // Find the end of the continuous run of Katakana characters | |
300 + while (j < numChars && (j - i) < kMaxKatakanaGroupLength && | |
301 + isKatakana(utext_current32(&normalizedText))) { | |
302 + utext_next32(&normalizedText); | |
303 + ++j; | |
304 + } | |
305 + if ((j - i) < kMaxKatakanaGroupLength) { | |
306 + uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); | |
307 + if (newSnlp < bestSnlp[j]) { | |
308 + bestSnlp[j] = newSnlp; | |
309 + prev[j] = i; | |
310 + } | |
311 + } | |
312 + } | |
313 + is_prev_katakana = is_katakana; | |
314 + } | |
315 + | |
316 + // Start pushing the optimal offset index into t_boundary (t for tentative)
. | |
317 + // prev[numChars] is meaningful whenever a segmentation was found. | |
318 + // We'll first push in the reverse order, i.e., | |
319 + // t_boundary[0] = numChars, and afterwards do a swap. | |
320 + AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); | |
321 + | |
322 + int numBreaks = 0; | |
323 + // No segmentation found, set boundary to end of range | |
324 + if (bestSnlp[numChars] == kuint32max) { | |
325 + t_boundary[numBreaks++] = numChars; | |
326 + } else { | |
327 + for (int i = numChars; i > 0; i = prev[i]){ | |
328 + t_boundary[numBreaks++] = i; | |
329 + | |
330 + } | |
331 + U_ASSERT(prev[t_boundary[numBreaks-1]] == 0); | |
332 + } | |
333 + | |
334 + // Reverse offset index in t_boundary. | |
335 + // Don't add a break for the start of the dictionary range if there is one | |
336 + // there already. | |
337 + if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { | |
338 + t_boundary[numBreaks++] = 0; | |
339 + } | |
340 + | |
341 + // Now that we're done, convert positions in t_boundary[] (indices in | |
342 + // the normalized input string) back to indices in the raw input string | |
343 + // while reversing t_boundary and pushing values to foundBreaks. | |
344 + for (int i = numBreaks-1; i >= 0; i--) { | |
345 + foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); | |
346 + } | |
347 + | |
348 + utext_close(&normalizedText); | |
349 + return numBreaks; | |
350 +} | |
351 + | |
352 U_NAMESPACE_END | |
353 | |
354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 | |
356 +++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800 | |
357 @@ -1,8 +1,8 @@ | |
358 /** | |
359 - ******************************************************************************
* | |
360 - * Copyright (C) 2006, International Business Machines Corporation and others.
* | |
361 - * All Rights Reserved.
* | |
362 - ******************************************************************************
* | |
363 + ******************************************************************************
**** | |
364 + * Copyright (C) 2006-2010, International Business Machines Corporation and oth
ers. | |
365 + * All Rights Reserved. | |
366 + ******************************************************************************
**** | |
367 */ | |
368 | |
369 #ifndef DICTBE_H | |
370 @@ -65,31 +65,31 @@ | |
371 */ | |
372 virtual ~DictionaryBreakEngine(); | |
373 | |
374 - /** | |
375 - * <p>Indicate whether this engine handles a particular character for | |
376 - * a particular kind of break.</p> | |
377 - * | |
378 - * @param c A character which begins a run that the engine might handle | |
379 - * @param breakType The type of text break which the caller wants to determine | |
380 - * @return TRUE if this engine handles the particular character and break | |
381 - * type. | |
382 - */ | |
383 + /** | |
384 + * <p>Indicate whether this engine handles a particular character for | |
385 + * a particular kind of break.</p> | |
386 + * | |
387 + * @param c A character which begins a run that the engine might handle | |
388 + * @param breakType The type of text break which the caller wants to determin
e | |
389 + * @return TRUE if this engine handles the particular character and break | |
390 + * type. | |
391 + */ | |
392 virtual UBool handles( UChar32 c, int32_t breakType ) const; | |
393 | |
394 - /** | |
395 - * <p>Find any breaks within a run in the supplied text.</p> | |
396 - * | |
397 - * @param text A UText representing the text. The | |
398 - * iterator is left at the end of the run of characters which the engine | |
399 - * is capable of handling. | |
400 - * @param startPos The start of the run within the supplied text. | |
401 - * @param endPos The end of the run within the supplied text. | |
402 - * @param reverse Whether the caller is looking for breaks in a reverse | |
403 - * direction. | |
404 - * @param breakType The type of break desired, or -1. | |
405 - * @param foundBreaks An allocated C array of the breaks found, if any | |
406 - * @return The number of breaks found. | |
407 - */ | |
408 + /** | |
409 + * <p>Find any breaks within a run in the supplied text.</p> | |
410 + * | |
411 + * @param text A UText representing the text. The iterator is left at | |
412 + * the end of the run of characters which the engine is capable of handling | |
413 + * that starts from the first (or last) character in the range. | |
414 + * @param startPos The start of the run within the supplied text. | |
415 + * @param endPos The end of the run within the supplied text. | |
416 + * @param reverse Whether the caller is looking for breaks in a reverse | |
417 + * direction. | |
418 + * @param breakType The type of break desired, or -1. | |
419 + * @param foundBreaks An allocated C array of the breaks found, if any | |
420 + * @return The number of breaks found. | |
421 + */ | |
422 virtual int32_t findBreaks( UText *text, | |
423 int32_t startPos, | |
424 int32_t endPos, | |
425 @@ -114,7 +114,7 @@ | |
426 // virtual void setBreakTypes( uint32_t breakTypes ); | |
427 | |
428 /** | |
429 - * <p>Divide up a range of known dictionary characters.</p> | |
430 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> | |
431 * | |
432 * @param text A UText representing the text | |
433 * @param rangeStart The start of the range of dictionary characters | |
434 @@ -171,7 +171,7 @@ | |
435 | |
436 protected: | |
437 /** | |
438 - * <p>Divide up a range of known dictionary characters.</p> | |
439 + * <p>Divide up a range of known dictionary characters handled by this break e
ngine.</p> | |
440 * | |
441 * @param text A UText representing the text | |
442 * @param rangeStart The start of the range of dictionary characters | |
443 @@ -186,6 +186,66 @@ | |
444 | |
445 }; | |
446 | |
447 +/******************************************************************* | |
448 + * CjkBreakEngine | |
449 + */ | |
450 + | |
451 +//indicates language/script that the CjkBreakEngine will handle | |
452 +enum LanguageType { | |
453 + kKorean, | |
454 + kChineseJapanese | |
455 +}; | |
456 + | |
457 +/** | |
458 + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a | |
459 + * TrieWordDictionary with costs associated with each word and | |
460 + * Viterbi decoding to determine CJK-specific breaks.</p> | |
461 + */ | |
462 +class CjkBreakEngine : public DictionaryBreakEngine { | |
463 + protected: | |
464 + /** | |
465 + * The set of characters handled by this engine | |
466 + * @internal | |
467 + */ | |
468 + UnicodeSet fHangulWordSet; | |
469 + UnicodeSet fHanWordSet; | |
470 + UnicodeSet fKatakanaWordSet; | |
471 + UnicodeSet fHiraganaWordSet; | |
472 + | |
473 + const TrieWordDictionary *fDictionary; | |
474 + | |
475 + public: | |
476 + | |
477 + /** | |
478 + * <p>Constructor.</p> | |
479 + * | |
480 + * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the | |
481 + * engine is deleted. The TrieWordDictionary must contain costs for each wo
rd | |
482 + * in order for the dictionary to work properly. | |
483 + */ | |
484 + CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type,
UErrorCode &status); | |
485 + | |
486 + /** | |
487 + * <p>Virtual destructor.</p> | |
488 + */ | |
489 + virtual ~CjkBreakEngine(); | |
490 + | |
491 + protected: | |
492 + /** | |
493 + * <p>Divide up a range of known dictionary characters handled by this brea
k engine.</p> | |
494 + * | |
495 + * @param text A UText representing the text | |
496 + * @param rangeStart The start of the range of dictionary characters | |
497 + * @param rangeEnd The end of the range of dictionary characters | |
498 + * @param foundBreaks Output stack onto which the break positions are pushed | |
499 + * @return The number of breaks found | |
500 + */ | |
501 + virtual int32_t divideUpDictionaryRange( UText *text, | |
502 + int32_t rangeStart, | |
503 + int32_t rangeEnd, | |
504 + UStack &foundBreaks ) const; | |
505 + | |
506 +}; | |
507 | |
508 U_NAMESPACE_END | |
509 | |
510 --- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700 | |
511 +++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800 | |
512 @@ -1555,10 +1555,12 @@ | |
513 int32_t endPos, | |
514 UBool reverse) { | |
515 // Reset the old break cache first. | |
516 - uint32_t dictionaryCount = fDictionaryCharCount; | |
517 reset(); | |
518 | |
519 - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { | |
520 + // note: code segment below assumes that dictionary chars are in the | |
521 + // startPos-endPos range | |
522 + // value returned should be next character in sequence | |
523 + if ((endPos - startPos) <= 1) { | |
524 return (reverse ? startPos : endPos); | |
525 } | |
526 | |
527 @@ -1711,7 +1713,7 @@ | |
528 // proposed break by one of the breaks we found. Use following() an
d | |
529 // preceding() to do the work. They should never recurse in this ca
se. | |
530 if (reverse) { | |
531 - return preceding(endPos - 1); | |
532 + return preceding(endPos); | |
533 } | |
534 else { | |
535 return following(startPos); | |
536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 | |
537 +++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800 | |
538 @@ -20,6 +20,7 @@ | |
539 #include "uvector.h" | |
540 #include "uvectr32.h" | |
541 #include "uarrsort.h" | |
542 +#include "hash.h" | |
543 | |
544 //#define DEBUG_TRIE_DICT 1 | |
545 | |
546 @@ -27,6 +28,11 @@ | |
547 #include <sys/times.h> | |
548 #include <limits.h> | |
549 #include <stdio.h> | |
550 +#include <time.h> | |
551 +#ifndef CLK_TCK | |
552 +#define CLK_TCK CLOCKS_PER_SEC | |
553 +#endif | |
554 + | |
555 #endif | |
556 | |
557 U_NAMESPACE_BEGIN | |
558 @@ -45,6 +51,11 @@ | |
559 * MutableTrieDictionary | |
560 */ | |
561 | |
562 +//#define MAX_VALUE 65535 | |
563 + | |
564 +// forward declaration | |
565 +inline uint16_t scaleLogProbabilities(double logprob); | |
566 + | |
567 // Node structure for the ternary, uncompressed trie | |
568 struct TernaryNode : public UMemory { | |
569 UChar ch; // UTF-16 code unit | |
570 @@ -77,7 +88,8 @@ | |
571 delete high; | |
572 } | |
573 | |
574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status
) { | |
575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, | |
576 + UBool containsValue /* = FALSE */
) { | |
577 // Start the trie off with something. Having the root node already present | |
578 // cuts a special case out of the search/insertion functions. | |
579 // Making it a median character cuts the worse case for searches from | |
580 @@ -91,14 +103,19 @@ | |
581 if (U_SUCCESS(status) && fIter == NULL) { | |
582 status = U_MEMORY_ALLOCATION_ERROR; | |
583 } | |
584 + | |
585 + fValued = containsValue; | |
586 } | |
587 | |
588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { | |
589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, | |
590 + UBool containsValue /* = false */
) { | |
591 fTrie = NULL; | |
592 fIter = utext_openUChars(NULL, NULL, 0, &status); | |
593 if (U_SUCCESS(status) && fIter == NULL) { | |
594 status = U_MEMORY_ALLOCATION_ERROR; | |
595 } | |
596 + | |
597 + fValued = containsValue; | |
598 } | |
599 | |
600 MutableTrieDictionary::~MutableTrieDictionary() { | |
601 @@ -108,12 +125,13 @@ | |
602 | |
603 int32_t | |
604 MutableTrieDictionary::search( UText *text, | |
605 - int32_t maxLength, | |
606 - int32_t *lengths, | |
607 - int &count, | |
608 - int limit, | |
609 - TernaryNode *&parent, | |
610 - UBool &pMatched ) const { | |
611 + int32_t maxLength, | |
612 + int32_t *lengths, | |
613 + int &count, | |
614 + int limit, | |
615 + TernaryNode *&parent, | |
616 + UBool &pMatched, | |
617 + uint16_t *values /*=NULL*/) const { | |
618 // TODO: current implementation works in UTF-16 space | |
619 const TernaryNode *up = NULL; | |
620 const TernaryNode *p = fTrie; | |
621 @@ -121,6 +139,10 @@ | |
622 pMatched = TRUE; | |
623 int i; | |
624 | |
625 + if (!fValued) { | |
626 + values = NULL; | |
627 + } | |
628 + | |
629 UChar uc = utext_current32(text); | |
630 for (i = 0; i < maxLength && p != NULL; ++i) { | |
631 while (p != NULL) { | |
632 @@ -141,7 +163,11 @@ | |
633 break; | |
634 } | |
635 // Must be equal to get here | |
636 - if (limit > 0 && (p->flags & kEndsWord)) { | |
637 + if (limit > 0 && (p->flags > 0)) { | |
638 + //TODO: is there a more efficient way to add values, i.e. without this if statement? | |
639 + if(values != NULL) { | |
640 + values[mycount] = p->flags; | |
641 + } | |
642 lengths[mycount++] = i+1; | |
643 --limit; | |
644 } | |
645 @@ -161,13 +187,14 @@ | |
646 void | |
647 MutableTrieDictionary::addWord( const UChar *word, | |
648 int32_t length, | |
649 - UErrorCode &status ) { | |
650 -#if 0 | |
651 - if (length <= 0) { | |
652 + UErrorCode &status, | |
653 + uint16_t value /* = 0 */ ) { | |
654 + // dictionary cannot store zero values, would interfere with flags | |
655 + if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { | |
656 status = U_ILLEGAL_ARGUMENT_ERROR; | |
657 return; | |
658 } | |
659 -#endif | |
660 + | |
661 TernaryNode *parent; | |
662 UBool pMatched; | |
663 int count; | |
664 @@ -177,7 +204,7 @@ | |
665 matched = search(fIter, length, NULL, count, 0, parent, pMatched); | |
666 | |
667 while (matched++ < length) { | |
668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? | |
669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? | |
670 U_ASSERT(uc != U_SENTINEL); | |
671 TernaryNode *newNode = new TernaryNode(uc); | |
672 if (newNode == NULL) { | |
673 @@ -199,30 +226,23 @@ | |
674 parent = newNode; | |
675 } | |
676 | |
677 - parent->flags |= kEndsWord; | |
678 -} | |
679 - | |
680 -#if 0 | |
681 -void | |
682 -MutableTrieDictionary::addWords( UEnumeration *words, | |
683 - UErrorCode &status ) { | |
684 - int32_t length; | |
685 - const UChar *word; | |
686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status))
{ | |
687 - addWord(word, length, status); | |
688 + if(fValued && value > 0){ | |
689 + parent->flags = value; | |
690 + } else { | |
691 + parent->flags |= kEndsWord; | |
692 } | |
693 } | |
694 -#endif | |
695 | |
696 int32_t | |
697 MutableTrieDictionary::matches( UText *text, | |
698 int32_t maxLength, | |
699 int32_t *lengths, | |
700 int &count, | |
701 - int limit ) const { | |
702 + int limit, | |
703 + uint16_t *values /*=NULL*/) const { | |
704 TernaryNode *parent; | |
705 UBool pMatched; | |
706 - return search(text, maxLength, lengths, count, limit, parent, pMatched); | |
707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val
ues); | |
708 } | |
709 | |
710 // Implementation of iteration for MutableTrieDictionary | |
711 @@ -277,7 +297,7 @@ | |
712 break; | |
713 } | |
714 case kEqual: | |
715 - emit = (node->flags & kEndsWord) != 0; | |
716 + emit = node->flags > 0; | |
717 equal = (node->equal != NULL); | |
718 // If this node should be part of the next emitted string, appe
nd | |
719 // the UChar to the string, and make sure we pop it when we com
e | |
720 @@ -299,7 +319,7 @@ | |
721 } | |
722 case kGreaterThan: | |
723 // If this node's character is in the string, remove it. | |
724 - if (node->equal != NULL || (node->flags & kEndsWord)) { | |
725 + if (node->equal != NULL || node->flags > 0) { | |
726 unistr.truncate(unistr.length()-1); | |
727 } | |
728 if (node->high != NULL) { | |
729 @@ -354,12 +374,75 @@ | |
730 * CompactTrieDictionary | |
731 */ | |
732 | |
733 +//TODO further optimization: | |
734 +// minimise size of trie with logprobs by storing values | |
735 +// for terminal nodes directly in offsets[] | |
736 +// --> calculating from next offset *might* be simpler, but would have to add | |
737 +// one last offset for logprob of last node | |
738 +// --> if calculate from current offset, need to factor in possible overflow | |
739 +// as well. | |
740 +// idea: store in offset, set first bit to indicate logprob storage-->won't | |
741 +// have to access additional node | |
742 + | |
743 +// {'Dic', 1}, version 1: uses old header, no values | |
744 +#define COMPACT_TRIE_MAGIC_1 0x44696301 | |
745 +// version 2: uses new header (more than 2^16 nodes), no values | |
746 +#define COMPACT_TRIE_MAGIC_2 0x44696302 | |
747 +// version 3: uses new header, includes values | |
748 +#define COMPACT_TRIE_MAGIC_3 0x44696303 | |
749 + | |
750 struct CompactTrieHeader { | |
751 uint32_t size; // Size of the data in bytes | |
752 uint32_t magic; // Magic number (including version) | |
753 + uint32_t nodeCount; // Number of entries in offsets[] | |
754 + uint32_t root; // Node number of the root node | |
755 + uint32_t offsets[1]; // Offsets to nodes from start of data | |
756 +}; | |
757 + | |
758 +// old version of CompactTrieHeader kept for backwards compatibility | |
759 +struct CompactTrieHeaderV1 { | |
760 + uint32_t size; // Size of the data in bytes | |
761 + uint32_t magic; // Magic number (including version) | |
762 uint16_t nodeCount; // Number of entries in offsets[] | |
763 uint16_t root; // Node number of the root node | |
764 - uint32_t offsets[1]; // Offsets to nodes from start of data | |
765 + uint32_t offsets[1]; // Offsets to nodes from start of data | |
766 +}; | |
767 + | |
768 +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 | |
769 +struct CompactTrieInfo { | |
770 + uint32_t size; // Size of the data in bytes | |
771 + uint32_t magic; // Magic number (including version) | |
772 + uint32_t nodeCount; // Number of entries in offsets[] | |
773 + uint32_t root; // Node number of the root node | |
774 + uint32_t *offsets; // Offsets to nodes from start of data | |
775 + uint8_t *address; // pointer to header bytes in memory | |
776 + | |
777 + CompactTrieInfo(const void *data, UErrorCode &status){ | |
778 + CompactTrieHeader *header = (CompactTrieHeader *) data; | |
779 + if (header->magic != COMPACT_TRIE_MAGIC_1 && | |
780 + header->magic != COMPACT_TRIE_MAGIC_2 && | |
781 + header->magic != COMPACT_TRIE_MAGIC_3) { | |
782 + status = U_ILLEGAL_ARGUMENT_ERROR; | |
783 + } else { | |
784 + size = header->size; | |
785 + magic = header->magic; | |
786 + | |
787 + if (header->magic == COMPACT_TRIE_MAGIC_1) { | |
788 + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; | |
789 + nodeCount = headerV1->nodeCount; | |
790 + root = headerV1->root; | |
791 + offsets = &(headerV1->offsets[0]); | |
792 + address = (uint8_t *)headerV1; | |
793 + } else { | |
794 + nodeCount = header->nodeCount; | |
795 + root = header->root; | |
796 + offsets = &(header->offsets[0]); | |
797 + address = (uint8_t *)header; | |
798 + } | |
799 + } | |
800 + } | |
801 + | |
802 + ~CompactTrieInfo(){} | |
803 }; | |
804 | |
805 // Note that to avoid platform-specific alignment issues, all members of the no
de | |
806 @@ -375,10 +458,14 @@ | |
807 enum CompactTrieNodeFlags { | |
808 kVerticalNode = 0x1000, // This is a vertical node | |
809 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word | |
810 - kReservedFlag1 = 0x4000, | |
811 - kReservedFlag2 = 0x8000, | |
812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR
eservedFlag1 | |
813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved
Flag2 | |
814 kCountMask = 0x0FFF, // The count portion of flagscount | |
815 - kFlagMask = 0xF000 // The flags portion of flagscount | |
816 + kFlagMask = 0xF000, // The flags portion of flagscount | |
817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r
oot node | |
818 + | |
819 + //offset flags: | |
820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for pare
nt node | |
821 }; | |
822 | |
823 // The two node types are distinguished by the kVerticalNode flag. | |
824 @@ -402,63 +489,177 @@ | |
825 uint16_t chars[1]; // Code units | |
826 }; | |
827 | |
828 -// {'Dic', 1}, version 1 | |
829 -#define COMPACT_TRIE_MAGIC_1 0x44696301 | |
830 - | |
831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, | |
832 UErrorCode &status ) | |
833 : fUData(dataObj) | |
834 { | |
835 - fData = (const CompactTrieHeader *) udata_getMemory(dataObj); | |
836 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); | |
837 + *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); | |
838 fOwnData = FALSE; | |
839 - if (fData->magic != COMPACT_TRIE_MAGIC_1) { | |
840 - status = U_ILLEGAL_ARGUMENT_ERROR; | |
841 - fData = NULL; | |
842 - } | |
843 } | |
844 + | |
845 CompactTrieDictionary::CompactTrieDictionary( const void *data, | |
846 UErrorCode &status ) | |
847 : fUData(NULL) | |
848 { | |
849 - fData = (const CompactTrieHeader *) data; | |
850 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); | |
851 + *fInfo = CompactTrieInfo(data, status); | |
852 fOwnData = FALSE; | |
853 - if (fData->magic != COMPACT_TRIE_MAGIC_1) { | |
854 - status = U_ILLEGAL_ARGUMENT_ERROR; | |
855 - fData = NULL; | |
856 - } | |
857 } | |
858 | |
859 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict
, | |
860 UErrorCode &status ) | |
861 : fUData(NULL) | |
862 { | |
863 - fData = compactMutableTrieDictionary(dict, status); | |
864 + const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status
); | |
865 + fInfo = NULL; // on failure the destructor must not free an uninitialized pointer | |
866 + if (U_SUCCESS(status)) { | |
867 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); | |
868 + *fInfo = CompactTrieInfo(header, status); | |
869 + } | |
870 fOwnData = !U_FAILURE(status); | |
871 } | |
872 | |
873 CompactTrieDictionary::~CompactTrieDictionary() { | |
874 if (fOwnData) { | |
875 - uprv_free((void *)fData); | |
876 + uprv_free((void *)(fInfo->address)); | |
877 } | |
878 + uprv_free((void *)fInfo); | |
879 + | |
880 if (fUData) { | |
881 udata_close(fUData); | |
882 } | |
883 } | |
884 | |
885 +UBool CompactTrieDictionary::getValued() const{ | |
886 + return fInfo->magic == COMPACT_TRIE_MAGIC_3; | |
887 +} | |
888 + | |
889 uint32_t | |
890 CompactTrieDictionary::dataSize() const { | |
891 - return fData->size; | |
892 + return fInfo->size; | |
893 } | |
894 | |
895 const void * | |
896 CompactTrieDictionary::data() const { | |
897 - return fData; | |
898 + return fInfo->address; | |
899 +} | |
900 + | |
901 +//This function finds the address of a node for us, given its node ID | |
902 +static inline const CompactTrieNode * | |
903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) { | |
904 + if(node < info->root-1) { | |
905 + return (const CompactTrieNode *)(&info->offsets[node]); | |
906 + } else { | |
907 + return (const CompactTrieNode *)(info->address + info->offsets[node]); | |
908 + } | |
909 } | |
910 | |
911 -// This function finds the address of a node for us, given its node ID | |
912 +//this version of getCompactNode is currently only used in compactMutableTrieDi
ctionary() | |
913 static inline const CompactTrieNode * | |
914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) { | |
915 - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[
node]); | |
916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) { | |
917 + if(node < header->root-1) { | |
918 + return (const CompactTrieNode *)(&header->offsets[node]); | |
919 + } else { | |
920 + return (const CompactTrieNode *)((const uint8_t *)header + header->offs
ets[node]); | |
921 + } | |
922 +} | |
923 + | |
924 + | |
925 +/** | |
926 + * Calculates the number of links in a node | |
927 + * @param node The specified node | |
928 + */ | |
929 +static inline const uint16_t | |
930 +getCount(const CompactTrieNode *node){ | |
931 + return (node->flagscount & kCountMask); | |
932 + //use the code below if number of links ever exceed 4096 | |
933 + //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCou
nt) >> 2); | |
934 +} | |
935 + | |
936 +/** | |
937 + * Calculates the equal link node ID of a vertical node. | |
938 + * @param vnode The vertical node containing the equal link | |
939 + * If kEqualOverflows is set, the upper 16 bits of the ID are read from the | |
940 + * uint16_t stored immediately after vnode->chars[]. | |
941 + */ | |
942 +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ | |
943 + if(vnode->flagscount & kEqualOverflows){ | |
944 + // treat overflow bits as an extension of chars[] | |
945 + uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNo
de*)vnode)]; | |
946 + return vnode->equal + (((uint32_t)*overflow) << 16); | |
947 + }else{ | |
948 + return vnode->equal; | |
949 + } | |
950 +} | |
951 + | |
952 +/** | |
953 + * calculates an equal link node ID of a horizontal node | |
954 + * @param hnode The horizontal node containing the equal link | |
955 + * @param index The index into hnode->entries[] | |
956 + * @param nodeCount The length of hnode->entries[] | |
957 + */ | |
958 +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, ui
nt16_t index, uint16_t nodeCount){ | |
959 + if(hnode->flagscount & kEqualOverflows){ | |
960 + //set overflow to point to the uint16_t containing the overflow bits | |
961 + uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; | |
962 + overflow += index/4; | |
963 + uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; | |
964 + return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); | |
965 + } else { | |
966 + return hnode->entries[index].equal; | |
967 + } | |
968 +} | |
969 + | |
970 +/** | |
971 + * Returns the value stored in the specified node which is associated with its | |
972 + * parent node. | |
973 + * TODO: how to tell that value is stored in node or in offset? check whether | |
974 + * node ID < fInfo->root! | |
975 + */ | |
976 +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ | |
977 + uint16_t count = getCount((CompactTrieNode *)hnode); | |
978 + uint16_t overflowSize = 0; //size of node ID overflow storage in bytes | |
979 + | |
980 + if(hnode->flagscount & kEqualOverflows) | |
981 + overflowSize = (count + 3) / 4 * sizeof(uint16_t); | |
982 + return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); | |
983 +} | |
984 + | |
985 +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ | |
986 + // calculate size of total node ID overflow storage in bytes | |
987 + uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint1
6_t) : 0; | |
988 + return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)
vnode)] + overflowSize)); | |
989 +} | |
990 + | |
991 +static inline uint16_t getValue(const CompactTrieNode *node){ | |
992 + if(node->flagscount & kVerticalNode) | |
993 + return getValue((const CompactTrieVerticalNode *)node); | |
994 + else | |
995 + return getValue((const CompactTrieHorizontalNode *)node); | |
996 +} | |
997 + | |
998 +//returns index of match in CompactTrieHorizontalNode.entries[] using binary se
arch | |
999 +inline int16_t | |
1000 +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, | |
1001 + UChar uc, uint16_t nodeCount){ | |
1002 + int low = 0; | |
1003 + int high = nodeCount-1; | |
1004 + int middle; | |
1005 + while (high >= low) { | |
1006 + middle = (high+low)/2; | |
1007 + if (uc == entries[middle].ch) { | |
1008 + return middle; | |
1009 + } | |
1010 + else if (uc < entries[middle].ch) { | |
1011 + high = middle-1; | |
1012 + } | |
1013 + else { | |
1014 + low = middle+1; | |
1015 + } | |
1016 + } | |
1017 + | |
1018 + return -1; | |
1019 } | |
1020 | |
1021 int32_t | |
1022 @@ -466,17 +667,38 @@ | |
1023 int32_t maxLength, | |
1024 int32_t *lengths, | |
1025 int &count, | |
1026 - int limit ) const { | |
1027 + int limit, | |
1028 + uint16_t *values /*= NULL*/) const { | |
1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) | |
1030 + values = NULL; | |
1031 + | |
1032 // TODO: current implementation works in UTF-16 space | |
1033 - const CompactTrieNode *node = getCompactNode(fData, fData->root); | |
1034 + const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); | |
1035 int mycount = 0; | |
1036 | |
1037 UChar uc = utext_current32(text); | |
1038 int i = 0; | |
1039 | |
1040 + // handle root node with only kEqualOverflows flag: assume horizontal node
without parent | |
1041 + if(node != NULL){ | |
1042 + const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNod
e *) node; | |
1043 + int index = searchHorizontalEntries(root->entries, uc, root->flagscount
& kRootCountMask); | |
1044 + if(index > -1){ | |
1045 + node = getCompactNode(fInfo, calcEqualLink(root, index, root->flags
count & kRootCountMask)); | |
1046 + utext_next32(text); | |
1047 + uc = utext_current32(text); | |
1048 + ++i; | |
1049 + }else{ | |
1050 + node = NULL; | |
1051 + } | |
1052 + } | |
1053 + | |
1054 while (node != NULL) { | |
1055 // Check if the node we just exited ends a word | |
1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) { | |
1057 + if(values != NULL){ | |
1058 + values[mycount] = getValue(node); | |
1059 + } | |
1060 lengths[mycount++] = i; | |
1061 --limit; | |
1062 } | |
1063 @@ -487,7 +709,7 @@ | |
1064 break; | |
1065 } | |
1066 | |
1067 - int nodeCount = (node->flagscount & kCountMask); | |
1068 + int nodeCount = getCount(node); | |
1069 if (nodeCount == 0) { | |
1070 // Special terminal node; return now | |
1071 break; | |
1072 @@ -507,35 +729,27 @@ | |
1073 // To get here we must have come through the whole list successfull
y; | |
1074 // go on to the next node. Note that a word cannot end in the middl
e | |
1075 // of a vertical node. | |
1076 - node = getCompactNode(fData, vnode->equal); | |
1077 + node = getCompactNode(fInfo, calcEqualLink(vnode)); | |
1078 } | |
1079 else { | |
1080 // Horizontal node; do binary search | |
1081 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizont
alNode *)node; | |
1082 - int low = 0; | |
1083 - int high = nodeCount-1; | |
1084 - int middle; | |
1085 - node = NULL; // If we don't find a match, we'll fall out of the
loop | |
1086 - while (high >= low) { | |
1087 - middle = (high+low)/2; | |
1088 - if (uc == hnode->entries[middle].ch) { | |
1089 - // We hit a match; get the next node and next character | |
1090 - node = getCompactNode(fData, hnode->entries[middle].equal); | |
1091 - utext_next32(text); | |
1092 - uc = utext_current32(text); | |
1093 - ++i; | |
1094 - break; | |
1095 - } | |
1096 - else if (uc < hnode->entries[middle].ch) { | |
1097 - high = middle-1; | |
1098 - } | |
1099 - else { | |
1100 - low = middle+1; | |
1101 - } | |
1102 + const CompactTrieHorizontalEntry *entries; | |
1103 + entries = hnode->entries; | |
1104 + | |
1105 + int index = searchHorizontalEntries(entries, uc, nodeCount); | |
1106 + if(index > -1){ // | |
1107 + // We hit a match; get the next node and next character | |
1108 + node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCo
unt)); | |
1109 + utext_next32(text); | |
1110 + uc = utext_current32(text); | |
1111 + ++i; | |
1112 + }else{ | |
1113 + node = NULL; // If we don't find a match, we'll fall out of
the loop | |
1114 } | |
1115 } | |
1116 } | |
1117 -exit: | |
1118 + exit: | |
1119 count = mycount; | |
1120 return i; | |
1121 } | |
1122 @@ -545,16 +759,16 @@ | |
1123 private: | |
1124 UVector32 fNodeStack; // Stack of nodes to process | |
1125 UVector32 fIndexStack; // Stack of where in node we are | |
1126 - const CompactTrieHeader *fHeader; // Trie data | |
1127 + const CompactTrieInfo *fInfo; // Trie data | |
1128 | |
1129 public: | |
1130 static UClassID U_EXPORT2 getStaticClassID(void); | |
1131 virtual UClassID getDynamicClassID(void) const; | |
1132 public: | |
1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
| |
1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) | |
1135 : fNodeStack(status), fIndexStack(status) { | |
1136 - fHeader = header; | |
1137 - fNodeStack.push(header->root, status); | |
1138 + fInfo = info; | |
1139 + fNodeStack.push(info->root, status); | |
1140 fIndexStack.push(0, status); | |
1141 unistr.remove(); | |
1142 } | |
1143 @@ -564,14 +778,14 @@ | |
1144 | |
1145 virtual StringEnumeration *clone() const { | |
1146 UErrorCode status = U_ZERO_ERROR; | |
1147 - return new CompactTrieEnumeration(fHeader, status); | |
1148 + return new CompactTrieEnumeration(fInfo, status); | |
1149 } | |
1150 | |
1151 virtual const UnicodeString * snext(UErrorCode &status); | |
1152 | |
1153 // Very expensive, but this should never be used. | |
1154 virtual int32_t count(UErrorCode &status) const { | |
1155 - CompactTrieEnumeration counter(fHeader, status); | |
1156 + CompactTrieEnumeration counter(fInfo, status); | |
1157 int32_t result = 0; | |
1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) { | |
1159 ++result; | |
1160 @@ -582,7 +796,7 @@ | |
1161 virtual void reset(UErrorCode &status) { | |
1162 fNodeStack.removeAllElements(); | |
1163 fIndexStack.removeAllElements(); | |
1164 - fNodeStack.push(fHeader->root, status); | |
1165 + fNodeStack.push(fInfo->root, status); | |
1166 fIndexStack.push(0, status); | |
1167 unistr.remove(); | |
1168 } | |
1169 @@ -595,26 +809,34 @@ | |
1170 if (fNodeStack.empty() || U_FAILURE(status)) { | |
1171 return NULL; | |
1172 } | |
1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); | |
1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); | |
1175 int where = fIndexStack.peeki(); | |
1176 while (!fNodeStack.empty() && U_SUCCESS(status)) { | |
1177 - int nodeCount = (node->flagscount & kCountMask); | |
1178 + int nodeCount; | |
1179 + | |
1180 + bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); | |
1181 + if(isRoot){ | |
1182 + nodeCount = node->flagscount & kRootCountMask; | |
1183 + } else { | |
1184 + nodeCount = getCount(node); | |
1185 + } | |
1186 + | |
1187 UBool goingDown = FALSE; | |
1188 if (nodeCount == 0) { | |
1189 // Terminal node; go up immediately | |
1190 fNodeStack.popi(); | |
1191 fIndexStack.popi(); | |
1192 - node = getCompactNode(fHeader, fNodeStack.peeki()); | |
1193 + node = getCompactNode(fInfo, fNodeStack.peeki()); | |
1194 where = fIndexStack.peeki(); | |
1195 } | |
1196 - else if (node->flagscount & kVerticalNode) { | |
1197 + else if ((node->flagscount & kVerticalNode) && !isRoot) { | |
1198 // Vertical node | |
1199 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNo
de *)node; | |
1200 if (where == 0) { | |
1201 // Going down | |
1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount)
; | |
1203 + unistr.append((const UChar *)vnode->chars, nodeCount); | |
1204 fIndexStack.setElementAt(1, fIndexStack.size()-1); | |
1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st
atus)); | |
1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod
e), status)); | |
1207 where = fIndexStack.push(0, status); | |
1208 goingDown = TRUE; | |
1209 } | |
1210 @@ -623,7 +845,7 @@ | |
1211 unistr.truncate(unistr.length()-nodeCount); | |
1212 fNodeStack.popi(); | |
1213 fIndexStack.popi(); | |
1214 - node = getCompactNode(fHeader, fNodeStack.peeki()); | |
1215 + node = getCompactNode(fInfo, fNodeStack.peeki()); | |
1216 where = fIndexStack.peeki(); | |
1217 } | |
1218 } | |
1219 @@ -638,7 +860,7 @@ | |
1220 // Push on next node | |
1221 unistr.append((UChar)hnode->entries[where].ch); | |
1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); | |
1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w
here].equal, status)); | |
1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod
e, where, nodeCount), status)); | |
1225 where = fIndexStack.push(0, status); | |
1226 goingDown = TRUE; | |
1227 } | |
1228 @@ -646,12 +868,14 @@ | |
1229 // Going up | |
1230 fNodeStack.popi(); | |
1231 fIndexStack.popi(); | |
1232 - node = getCompactNode(fHeader, fNodeStack.peeki()); | |
1233 + node = getCompactNode(fInfo, fNodeStack.peeki()); | |
1234 where = fIndexStack.peeki(); | |
1235 } | |
1236 } | |
1237 + | |
1238 // Check if the parent of the node we've just gone down to ends a | |
1239 // word. If so, return it. | |
1240 + // The root node should never end up here. | |
1241 if (goingDown && (node->flagscount & kParentEndsWord)) { | |
1242 return &unistr; | |
1243 } | |
1244 @@ -664,7 +888,7 @@ | |
1245 if (U_FAILURE(status)) { | |
1246 return NULL; | |
1247 } | |
1248 - return new CompactTrieEnumeration(fData, status); | |
1249 + return new CompactTrieEnumeration(fInfo, status); | |
1250 } | |
1251 | |
1252 // | |
1253 @@ -672,21 +896,36 @@ | |
1254 // and back again | |
1255 // | |
1256 | |
1257 -// Helper classes to construct the compact trie | |
1258 +enum CompactTrieNodeType { | |
1259 + kHorizontalType = 0, | |
1260 + kVerticalType = 1, | |
1261 + kValueType = 2 | |
1262 +}; | |
1263 + | |
1264 +/** | |
1265 + * The following classes (i.e. BuildCompactTrie*Node) are helper classes to | |
1266 + * construct the compact trie by storing information for each node and later | |
1267 + * writing the node to memory in a sequential format. | |
1268 + */ | |
1269 class BuildCompactTrieNode: public UMemory { | |
1270 - public: | |
1271 +public: | |
1272 UBool fParentEndsWord; | |
1273 - UBool fVertical; | |
1274 + CompactTrieNodeType fNodeType; | |
1275 UBool fHasDuplicate; | |
1276 + UBool fEqualOverflows; | |
1277 int32_t fNodeID; | |
1278 UnicodeString fChars; | |
1279 + uint16_t fValue; | |
1280 | |
1281 - public: | |
1282 - BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, U
ErrorCode &status) { | |
1283 +public: | |
1284 + BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, | |
1285 + UStack &nodes, UErrorCode &status, uint16_t value = 0) { | |
1286 fParentEndsWord = parentEndsWord; | |
1287 fHasDuplicate = FALSE; | |
1288 - fVertical = vertical; | |
1289 + fNodeType = nodeType; | |
1290 + fEqualOverflows = FALSE; | |
1291 fNodeID = nodes.size(); | |
1292 + fValue = parentEndsWord? value : 0; | |
1293 nodes.push(this, status); | |
1294 } | |
1295 | |
1296 @@ -694,87 +933,225 @@ | |
1297 } | |
1298 | |
1299 virtual uint32_t size() { | |
1300 - return sizeof(uint16_t); | |
1301 + if(fValue > 0) | |
1302 + return sizeof(uint16_t) * 2; | |
1303 + else | |
1304 + return sizeof(uint16_t); | |
1305 } | |
1306 | |
1307 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tra
nslate*/) { | |
1308 // Write flag/count | |
1309 - *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) | |
1310 - | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsW
ord : 0 ); | |
1311 + | |
1312 + // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be | |
1313 + // used as a 5th MSB. | |
1314 + U_ASSERT(fChars.length() < 4096 || fNodeID == 2); | |
1315 + | |
1316 + *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0)
| | |
1317 + ((fNodeID == 2)? (fChars.length() & kRootCountMask): | |
1318 + ( | |
1319 + (fChars.length() & kCountMask) | | |
1320 + //((fChars.length() << 2) & kExceedsCount) | | |
1321 + (fNodeType == kVerticalType ? kVerticalNode : 0) | | |
1322 + (fParentEndsWord ? kParentEndsWord : 0 ) | |
1323 + ) | |
1324 + ); | |
1325 offset += sizeof(uint16_t); | |
1326 } | |
1327 + | |
1328 + virtual void writeValue(uint8_t *bytes, uint32_t &offset) { | |
1329 + if(fValue > 0){ | |
1330 + *((uint16_t *)(bytes+offset)) = fValue; | |
1331 + offset += sizeof(uint16_t); | |
1332 + } | |
1333 + } | |
1334 + | |
1335 +}; | |
1336 + | |
1337 +/** | |
1338 + * Stores value of parent terminating nodes that have no more subtries. | |
1339 + */ | |
1340 +class BuildCompactTrieValueNode: public BuildCompactTrieNode { | |
1341 +public: | |
1342 + BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value
) | |
1343 + : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ | |
1344 + } | |
1345 + | |
1346 + virtual ~BuildCompactTrieValueNode(){ | |
1347 + } | |
1348 + | |
1349 + virtual uint32_t size() { | |
1350 + return sizeof(uint16_t) * 2; | |
1351 + } | |
1352 + | |
1353 + virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans
late) { | |
1354 + // don't write value directly to memory but store it in offset to be wr
itten later | |
1355 + //offset = fValue & kOffsetContainsValue; | |
1356 + BuildCompactTrieNode::write(bytes, offset, translate); | |
1357 + BuildCompactTrieNode::writeValue(bytes, offset); | |
1358 + } | |
1359 }; | |
1360 | |
1361 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { | |
1362 public: | |
1363 UStack fLinks; | |
1364 + UBool fMayOverflow; //intermediate value for fEqualOverflows | |
1365 | |
1366 public: | |
1367 - BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC
ode &status) | |
1368 - : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(st
atus) { | |
1369 + BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC
ode &status, uint16_t value = 0) | |
1370 + : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, valu
e), fLinks(status) { | |
1371 + fMayOverflow = FALSE; | |
1372 } | |
1373 | |
1374 virtual ~BuildCompactTrieHorizontalNode() { | |
1375 } | |
1376 | |
1377 + // It is impossible to know beforehand exactly how much space the node will | |
1378 + // need in memory before being written, because the node IDs in the equal | |
1379 + // links may or may not overflow after node coalescing. Therefore, this met
hod | |
1380 + // returns the maximum size possible for the node. | |
1381 virtual uint32_t size() { | |
1382 - return offsetof(CompactTrieHorizontalNode,entries) + | |
1383 - (fChars.length()*sizeof(CompactTrieHorizontalEntry)); | |
1384 + uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + | |
1385 + (fChars.length()*sizeof(CompactTrieHorizontalEntry)); | |
1386 + | |
1387 + if(fValue > 0) | |
1388 + estimatedSize += sizeof(uint16_t); | |
1389 + | |
1390 + //estimate extra space needed to store overflow for node ID links | |
1391 + //may be more than what is actually needed, but must never be less | |
1392 + for(int i=0; i < fChars.length(); i++){ | |
1393 + if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ | |
1394 + fMayOverflow = TRUE; | |
1395 + break; | |
1396 + } | |
1397 + } | |
1398 + if(fMayOverflow) // write() packs 4 links per uint16_t: ceil(fChars.length()/4) * sizeof(uint16_t) | |
1399 + estimatedSize += ((fChars.length() + 3) / 4) * sizeof(uint16_t); | |
1400 + | |
1401 + return estimatedSize; | |
1402 } | |
1403 | |
1404 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans
late) { | |
1405 - BuildCompactTrieNode::write(bytes, offset, translate); | |
1406 int32_t count = fChars.length(); | |
1407 + | |
1408 + //if largest nodeID > 2^16, set flag | |
1409 + //large node IDs are more likely to be at the back of the array | |
1410 + for (int32_t i = count-1; i >= 0; --i) { | |
1411 + if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeI
D) > 0xFFFF){ | |
1412 + fEqualOverflows = TRUE; | |
1413 + break; | |
1414 + } | |
1415 + } | |
1416 + | |
1417 + BuildCompactTrieNode::write(bytes, offset, translate); | |
1418 + | |
1419 + // write entries[] to memory | |
1420 for (int32_t i = 0; i < count; ++i) { | |
1421 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(
bytes+offset); | |
1422 entry->ch = fChars[i]; | |
1423 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks
[i])->fNodeID); | |
1424 #ifdef DEBUG_TRIE_DICT | |
1425 - if (entry->equal == 0) { | |
1426 + | |
1427 + if ((entry->equal == 0) && !fEqualOverflows) { | |
1428 fprintf(stderr, "ERROR: horizontal link %d, logical node %d map
s to physical node zero\n", | |
1429 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); | |
1430 } | |
1431 #endif | |
1432 offset += sizeof(CompactTrieHorizontalEntry); | |
1433 } | |
1434 + | |
1435 + // append extra bits of equal nodes to end if fEqualOverflows | |
1436 + if (fEqualOverflows) { | |
1437 + uint16_t leftmostBits = 0; | |
1438 + for (int16_t i = 0; i < count; i++) { | |
1439 + leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate,
i); | |
1440 + | |
1441 + // write filled uint16_t to memory | |
1442 + if(i % 4 == 3){ | |
1443 + *((uint16_t *)(bytes+offset)) = leftmostBits; | |
1444 + leftmostBits = 0; | |
1445 + offset += sizeof(uint16_t); | |
1446 + } | |
1447 + } | |
1448 + | |
1449 + // pad last uint16_t with zeroes if necessary | |
1450 + int remainder = count % 4; | |
1451 + if (remainder > 0) { | |
1452 + *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * rema
inder)); | |
1453 + offset += sizeof(uint16_t); | |
1454 + } | |
1455 + } | |
1456 + | |
1457 + BuildCompactTrieNode::writeValue(bytes, offset); | |
1458 + } | |
1459 + | |
1460 + // returns leftmost bits of physical node link | |
1461 + uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ | |
1462 + uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompact
TrieNode *)fLinks[i])->fNodeID) >> 16); | |
1463 +#ifdef DEBUG_TRIE_DICT | |
1464 + if (leftmostBits > 0xF) { | |
1465 + fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds
maximum possible node ID value\n", | |
1466 + i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); | |
1467 + } | |
1468 +#endif | |
1469 + return leftmostBits; | |
1470 } | |
1471 | |
1472 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { | |
1473 fChars.append(ch); | |
1474 fLinks.push(link, status); | |
1475 } | |
1476 + | |
1477 }; | |
1478 | |
1479 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { | |
1480 - public: | |
1481 +public: | |
1482 BuildCompactTrieNode *fEqual; | |
1483 | |
1484 - public: | |
1485 - BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod
e &status) | |
1486 - : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { | |
1487 +public: | |
1488 + BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod
e &status, uint16_t value = 0) | |
1489 + : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value)
{ | |
1490 fEqual = NULL; | |
1491 } | |
1492 | |
1493 virtual ~BuildCompactTrieVerticalNode() { | |
1494 } | |
1495 | |
1496 + // Returns the maximum possible size of this node. See comment in | |
1497 + // BuildCompactTrieHorizontalNode::size() for more information. | |
1498 virtual uint32_t size() { | |
1499 - return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeo
f(uint16_t)); | |
1500 + uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCh
ars.length()*sizeof(uint16_t)); | |
1501 + if(fValue > 0){ | |
1502 + estimatedSize += sizeof(uint16_t); | |
1503 + } | |
1504 + | |
1505 + if(fEqual->fNodeID > 0xFFFF){ | |
1506 + estimatedSize += sizeof(uint16_t); | |
1507 + } | |
1508 + return estimatedSize; | |
1509 } | |
1510 | |
1511 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans
late) { | |
1512 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offse
t); | |
1513 + fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); | |
1514 BuildCompactTrieNode::write(bytes, offset, translate); | |
1515 node->equal = translate.elementAti(fEqual->fNodeID); | |
1516 offset += sizeof(node->equal); | |
1517 #ifdef DEBUG_TRIE_DICT | |
1518 - if (node->equal == 0) { | |
1519 + if ((node->equal == 0) && !fEqualOverflows) { | |
1520 fprintf(stderr, "ERROR: vertical link, logical node %d maps to phys
ical node zero\n", | |
1521 fEqual->fNodeID); | |
1522 } | |
1523 #endif | |
1524 fChars.extract(0, fChars.length(), (UChar *)node->chars); | |
1525 - offset += sizeof(uint16_t)*fChars.length(); | |
1526 + offset += sizeof(UChar)*fChars.length(); | |
1527 + | |
1528 + // append the upper 16 bits of the equal node ID to the end if fEqualOverflows | |
1529 + if (fEqualOverflows) { | |
1530 + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNode
ID) >> 16); | |
1531 + offset += sizeof(uint16_t); | |
1532 + } | |
1533 + | |
1534 + BuildCompactTrieNode::writeValue(bytes, offset); | |
1535 } | |
1536 | |
1537 void addChar(UChar ch) { | |
1538 @@ -784,60 +1161,85 @@ | |
1539 void setLink(BuildCompactTrieNode *node) { | |
1540 fEqual = node; | |
1541 } | |
1542 + | |
1543 }; | |
1544 | |
1545 // Forward declaration | |
1546 static void walkHorizontal(const TernaryNode *node, | |
1547 BuildCompactTrieHorizontalNode *building, | |
1548 UStack &nodes, | |
1549 - UErrorCode &status); | |
1550 + UErrorCode &status, | |
1551 + Hashtable *values); | |
1552 | |
1553 -// Convert one node. Uses recursion. | |
1554 +// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. | |
1555 | |
1556 static BuildCompactTrieNode * | |
1557 -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UE
rrorCode &status) { | |
1558 +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, | |
1559 + UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0)
{ | |
1560 if (U_FAILURE(status)) { | |
1561 return NULL; | |
1562 } | |
1563 BuildCompactTrieNode *result = NULL; | |
1564 UBool horizontal = (node->low != NULL || node->high != NULL); | |
1565 if (horizontal) { | |
1566 - BuildCompactTrieHorizontalNode *hResult = | |
1567 - new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, statu
s); | |
1568 + BuildCompactTrieHorizontalNode *hResult; | |
1569 + if(values != NULL){ | |
1570 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status, parentValue); | |
1571 + } else { | |
1572 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status); | |
1573 + } | |
1574 + | |
1575 if (hResult == NULL) { | |
1576 status = U_MEMORY_ALLOCATION_ERROR; | |
1577 return NULL; | |
1578 } | |
1579 if (U_SUCCESS(status)) { | |
1580 - walkHorizontal(node, hResult, nodes, status); | |
1581 + walkHorizontal(node, hResult, nodes, status, values); | |
1582 result = hResult; | |
1583 } | |
1584 } | |
1585 else { | |
1586 - BuildCompactTrieVerticalNode *vResult = | |
1587 - new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status)
; | |
1588 + BuildCompactTrieVerticalNode *vResult; | |
1589 + if(values != NULL){ | |
1590 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s
tatus, parentValue); | |
1591 + } else { | |
1592 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s
tatus); | |
1593 + } | |
1594 + | |
1595 if (vResult == NULL) { | |
1596 status = U_MEMORY_ALLOCATION_ERROR; | |
1597 + return NULL; | |
1598 } | |
1599 else if (U_SUCCESS(status)) { | |
1600 - UBool endsWord = FALSE; | |
1601 + uint16_t value = 0; | |
1602 + UBool endsWord = FALSE; | |
1603 // Take up nodes until we end a word, or hit a node with < or > lin
ks | |
1604 do { | |
1605 vResult->addChar(node->ch); | |
1606 - endsWord = (node->flags & kEndsWord) != 0; | |
1607 + value = node->flags; | |
1608 + endsWord = value > 0; | |
1609 node = node->equal; | |
1610 } | |
1611 while(node != NULL && !endsWord && node->low == NULL && node->high
== NULL); | |
1612 + | |
1613 if (node == NULL) { | |
1614 if (!endsWord) { | |
1615 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie | |
1616 } | |
1617 - else { | |
1618 + else if(values != NULL){ | |
1619 + UnicodeString key(value); //store value as a single-char Un
icodeString | |
1620 + BuildCompactTrieValueNode *link = (BuildCompactTrieValueNod
e *) values->get(key); | |
1621 + if(link == NULL){ | |
1622 + link = new BuildCompactTrieValueNode(nodes, status, val
ue); //take out nodes? | |
1623 + values->put(key, link, status); | |
1624 + } | |
1625 + vResult->setLink(link); | |
1626 + } else { | |
1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]); | |
1628 } | |
1629 } | |
1630 else { | |
1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status))
; | |
1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status,
values, value)); | |
1633 } | |
1634 result = vResult; | |
1635 } | |
1636 @@ -849,19 +1251,28 @@ | |
1637 // Uses recursion. | |
1638 | |
1639 static void walkHorizontal(const TernaryNode *node, | |
1640 - BuildCompactTrieHorizontalNode *building, | |
1641 - UStack &nodes, | |
1642 - UErrorCode &status) { | |
1643 + BuildCompactTrieHorizontalNode *building, | |
1644 + UStack &nodes, | |
1645 + UErrorCode &status, Hashtable *values = NULL) { | |
1646 while (U_SUCCESS(status) && node != NULL) { | |
1647 if (node->low != NULL) { | |
1648 - walkHorizontal(node->low, building, nodes, status); | |
1649 + walkHorizontal(node->low, building, nodes, status, values); | |
1650 } | |
1651 BuildCompactTrieNode *link = NULL; | |
1652 if (node->equal != NULL) { | |
1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0,
nodes, status); | |
1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status,
values, node->flags); | |
1655 } | |
1656 - else if (node->flags & kEndsWord) { | |
1657 - link = (BuildCompactTrieNode *)nodes[1]; | |
1658 + else if (node->flags > 0) { | |
1659 + if(values != NULL) { | |
1660 + UnicodeString key(node->flags); //store value as a single-char
UnicodeString | |
1661 + link = (BuildCompactTrieValueNode *) values->get(key); | |
1662 + if(link == NULL) { | |
1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f
lags); //take out nodes? | |
1664 + values->put(key, link, status); | |
1665 + } | |
1666 + } else { | |
1667 + link = (BuildCompactTrieNode *)nodes[1]; | |
1668 + } | |
1669 } | |
1670 if (U_SUCCESS(status) && link != NULL) { | |
1671 building->addNode(node->ch, link, status); | |
1672 @@ -881,13 +1292,15 @@ | |
1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ | |
1674 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; | |
1675 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; | |
1676 + | |
1677 // Check for comparing a node to itself, to avoid spurious duplicates | |
1678 if (left == right) { | |
1679 return 0; | |
1680 } | |
1681 + | |
1682 // Most significant is type of node. Can never coalesce. | |
1683 - if (left->fVertical != right->fVertical) { | |
1684 - return left->fVertical - right->fVertical; | |
1685 + if (left->fNodeType != right->fNodeType) { | |
1686 + return left->fNodeType - right->fNodeType; | |
1687 } | |
1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. | |
1689 if (left->fParentEndsWord != right->fParentEndsWord) { | |
1690 @@ -898,12 +1311,19 @@ | |
1691 if (result != 0) { | |
1692 return result; | |
1693 } | |
1694 + | |
1695 + // If the node value differs, we should not coalesce. | |
1696 + // If values aren't stored, all fValues should be 0. | |
1697 + if (left->fValue != right->fValue) { | |
1698 + return left->fValue - right->fValue; | |
1699 + } | |
1700 + | |
1701 // We know they're both the same node type, so branch for the two cases. | |
1702 - if (left->fVertical) { | |
1703 + if (left->fNodeType == kVerticalType) { | |
1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID | |
1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual->
fNodeID; | |
1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; | |
1707 } | |
1708 - else { | |
1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ | |
1710 // We need to compare the links vectors. They should be the | |
1711 // same size because the strings were equal. | |
1712 // We compare the node IDs instead of the pointers, to handle | |
1713 @@ -914,9 +1334,10 @@ | |
1714 int32_t count = hleft->fLinks.size(); | |
1715 for (int32_t i = 0; i < count && result == 0; ++i) { | |
1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - | |
1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | |
1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | |
1719 } | |
1720 } | |
1721 + | |
1722 // If they are equal to each other, mark them (speeds coalescing) | |
1723 if (result == 0) { | |
1724 left->fHasDuplicate = TRUE; | |
1725 @@ -1031,20 +1452,25 @@ | |
1726 // Add node 0, used as the NULL pointer/sentinel. | |
1727 nodes.addElement((int32_t)0, status); | |
1728 | |
1729 + Hashtable *values = NULL; // Index of (unique) va
lues | |
1730 + if (dict.fValued) { | |
1731 + values = new Hashtable(status); | |
1732 + } | |
1733 + | |
1734 // Start by creating the special empty node we use to indicate that the par
ent | |
1735 // terminates a word. This must be node 1, because the builder assumes | |
1736 - // that. | |
1737 + // that. This node will never be used for tries storing numerical values. | |
1738 if (U_FAILURE(status)) { | |
1739 return NULL; | |
1740 } | |
1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node
s, status); | |
1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal
Type, nodes, status); | |
1743 if (terminal == NULL) { | |
1744 status = U_MEMORY_ALLOCATION_ERROR; | |
1745 } | |
1746 | |
1747 // This call does all the work of building the new trie structure. The root | |
1748 - // will be node 2. | |
1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s); | |
1750 + // will have node ID 2 before writing to memory. | |
1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu
s, values); | |
1752 #ifdef DEBUG_TRIE_DICT | |
1753 (void) ::times(&timing); | |
1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", | |
1755 @@ -1077,21 +1503,37 @@ | |
1756 return NULL; | |
1757 } | |
1758 | |
1759 + //map terminal value nodes | |
1760 + int valueCount = 0; | |
1761 + UVector valueNodes(status); | |
1762 + if(values != NULL) { | |
1763 + valueCount = values->count(); //number of unique terminal value nodes | |
1764 + } | |
1765 + | |
1766 + // map non-terminal nodes | |
1767 + int valuePos = 1;//, nodePos = valueCount + valuePos; | |
1768 + nodeCount = valueCount + valuePos; | |
1769 for (i = 1; i < count; ++i) { | |
1770 node = (BuildCompactTrieNode *)nodes[i]; | |
1771 if (node->fNodeID == i) { | |
1772 // Only one node out of each duplicate set is used | |
1773 - if (i >= translate.size()) { | |
1774 + if (node->fNodeID >= translate.size()) { | |
1775 // Logically extend the mapping table | |
1776 - translate.setSize(i+1); | |
1777 + translate.setSize(i + 1); | |
1778 + } | |
1779 + //translate.setElementAt(object, index)! | |
1780 + if(node->fNodeType == kValueType) { | |
1781 + valueNodes.addElement(node, status); | |
1782 + translate.setElementAt(valuePos++, i); | |
1783 + } else { | |
1784 + translate.setElementAt(nodeCount++, i); | |
1785 } | |
1786 - translate.setElementAt(nodeCount++, i); | |
1787 totalSize += node->size(); | |
1788 } | |
1789 } | |
1790 - | |
1791 - // Check for overflowing 16 bits worth of nodes. | |
1792 - if (nodeCount > 0x10000) { | |
1793 + | |
1794 + // Check for overflowing 20 bits worth of nodes. | |
1795 + if (nodeCount > 0x100000) { | |
1796 status = U_ILLEGAL_ARGUMENT_ERROR; | |
1797 return NULL; | |
1798 } | |
1799 @@ -1111,9 +1553,14 @@ | |
1800 status = U_MEMORY_ALLOCATION_ERROR; | |
1801 return NULL; | |
1802 } | |
1803 - | |
1804 + | |
1805 CompactTrieHeader *header = (CompactTrieHeader *)bytes; | |
1806 - header->size = totalSize; | |
1807 + //header->size = totalSize; | |
1808 + if(dict.fValued){ | |
1809 + header->magic = COMPACT_TRIE_MAGIC_3; | |
1810 + } else { | |
1811 + header->magic = COMPACT_TRIE_MAGIC_2; | |
1812 + } | |
1813 header->nodeCount = nodeCount; | |
1814 header->offsets[0] = 0; // Sentinel | |
1815 header->root = translate.elementAti(root->fNodeID); | |
1816 @@ -1123,23 +1570,40 @@ | |
1817 } | |
1818 #endif | |
1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin
t32_t)); | |
1820 - nodeCount = 1; | |
1821 + nodeCount = valueCount + 1; | |
1822 + | |
1823 + // Write terminal value nodes to memory | |
1824 + for (i=0; i < valueNodes.size(); i++) { | |
1825 + //header->offsets[i + 1] = offset; | |
1826 + uint32_t tmpOffset = 0; | |
1827 + node = (BuildCompactTrieNode *) valueNodes.elementAt(i); | |
1828 + //header->offsets[i + 1] = (uint32_t)node->fValue; | |
1829 + node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); | |
1830 + } | |
1831 + | |
1832 // Now write the data | |
1833 for (i = 1; i < count; ++i) { | |
1834 node = (BuildCompactTrieNode *)nodes[i]; | |
1835 - if (node->fNodeID == i) { | |
1836 + if (node->fNodeID == i && node->fNodeType != kValueType) { | |
1837 header->offsets[nodeCount++] = offset; | |
1838 node->write(bytes, offset, translate); | |
1839 } | |
1840 } | |
1841 + | |
1842 + //free all extra space | |
1843 + uprv_realloc(bytes, offset); | |
1844 + header->size = offset; | |
1845 + | |
1846 #ifdef DEBUG_TRIE_DICT | |
1847 + fprintf(stdout, "Space freed: %d\n", totalSize-offset); | |
1848 + | |
1849 (void) ::times(&timing); | |
1850 fprintf(stderr, "Trie built, time user %f system %f\n", | |
1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | |
1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | |
1853 previous = timing; | |
1854 fprintf(stderr, "Final offset is %d\n", offset); | |
1855 - | |
1856 + | |
1857 // Collect statistics on node types and sizes | |
1858 int hCount = 0; | |
1859 int vCount = 0; | |
1860 @@ -1148,68 +1612,85 @@ | |
1861 size_t hItemCount = 0; | |
1862 size_t vItemCount = 0; | |
1863 uint32_t previousOff = offset; | |
1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | |
1865 + uint32_t numOverflow = 0; | |
1866 + uint32_t valueSpace = 0; | |
1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | |
1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx); | |
1869 - if (node->flagscount & kVerticalNode) { | |
1870 + int itemCount; | |
1871 + if(nodeIdx == header->root) | |
1872 + itemCount = node->flagscount & kRootCountMask; | |
1873 + else | |
1874 + itemCount = getCount(node); | |
1875 + if(node->flagscount & kEqualOverflows){ | |
1876 + numOverflow++; | |
1877 + } | |
1878 + if (node->flagscount & kVerticalNode && nodeIdx != header->root) { | |
1879 vCount += 1; | |
1880 - vItemCount += (node->flagscount & kCountMask); | |
1881 + vItemCount += itemCount; | |
1882 vSize += previousOff-header->offsets[nodeIdx]; | |
1883 } | |
1884 else { | |
1885 hCount += 1; | |
1886 - hItemCount += (node->flagscount & kCountMask); | |
1887 - hSize += previousOff-header->offsets[nodeIdx]; | |
1888 + hItemCount += itemCount; | |
1889 + if(nodeIdx >= header->root) { | |
1890 + hSize += previousOff-header->offsets[nodeIdx]; | |
1891 + } | |
1892 } | |
1893 + | |
1894 + if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentE
ndsWord) | |
1895 + valueSpace += sizeof(uint16_t); | |
1896 previousOff = header->offsets[nodeIdx]; | |
1897 } | |
1898 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items
\n", hCount, | |
1899 (double)hSize/hCount, (double)hItemCount/hCount); | |
1900 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n
", vCount, | |
1901 (double)vSize/vCount, (double)vItemCount/vCount); | |
1902 + fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverf
low); | |
1903 + fprintf(stderr, "Space taken up by values: %d \n", valueSpace); | |
1904 #endif | |
1905 | |
1906 if (U_FAILURE(status)) { | |
1907 uprv_free(bytes); | |
1908 header = NULL; | |
1909 } | |
1910 - else { | |
1911 - header->magic = COMPACT_TRIE_MAGIC_1; | |
1912 - } | |
1913 return header; | |
1914 } | |
1915 | |
1916 // Forward declaration | |
1917 static TernaryNode * | |
1918 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE
rrorCode &status ); | |
1919 - | |
1920 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError
Code &status ); | |
1921 | |
1922 // Convert a horizontal node (or subarray thereof) into a ternary subtrie | |
1923 static TernaryNode * | |
1924 -unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizo
ntalEntry *array, | |
1925 - int low, int high, UErrorCode &status ) { | |
1926 +unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontal
Node *hnode, | |
1927 + int low, int high, int nodeCount, UErrorCode &status) { | |
1928 if (U_FAILURE(status) || low > high) { | |
1929 return NULL; | |
1930 } | |
1931 int middle = (low+high)/2; | |
1932 - TernaryNode *result = new TernaryNode(array[middle].ch); | |
1933 + TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); | |
1934 if (result == NULL) { | |
1935 status = U_MEMORY_ALLOCATION_ERROR; | |
1936 return NULL; | |
1937 } | |
1938 - const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); | |
1939 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mi
ddle, nodeCount)); | |
1940 if (equal->flagscount & kParentEndsWord) { | |
1941 - result->flags |= kEndsWord; | |
1942 + if(info->magic == COMPACT_TRIE_MAGIC_3){ | |
1943 + result->flags = getValue(equal); | |
1944 + }else{ | |
1945 + result->flags |= kEndsWord; | |
1946 + } | |
1947 } | |
1948 - result->low = unpackHorizontalArray(header, array, low, middle-1, status); | |
1949 - result->high = unpackHorizontalArray(header, array, middle+1, high, status)
; | |
1950 - result->equal = unpackOneNode(header, equal, status); | |
1951 + result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount,
status); | |
1952 + result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount
, status); | |
1953 + result->equal = unpackOneNode(info, equal, status); | |
1954 return result; | |
1955 } | |
1956 | |
1957 // Convert one compact trie node into a ternary subtrie | |
1958 static TernaryNode * | |
1959 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UE
rrorCode &status ) { | |
1960 - int nodeCount = (node->flagscount & kCountMask); | |
1961 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UError
Code &status ) { | |
1962 + int nodeCount = getCount(node); | |
1963 if (nodeCount == 0 || U_FAILURE(status)) { | |
1964 // Failure, or terminal node | |
1965 return NULL; | |
1966 @@ -1234,29 +1715,41 @@ | |
1967 previous = latest; | |
1968 } | |
1969 if (latest != NULL) { | |
1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal)
; | |
1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v
node)); | |
1972 if (equal->flagscount & kParentEndsWord) { | |
1973 - latest->flags |= kEndsWord; | |
1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){ | |
1975 + latest->flags = getValue(equal); | |
1976 + } else { | |
1977 + latest->flags |= kEndsWord; | |
1978 + } | |
1979 } | |
1980 - latest->equal = unpackOneNode(header, equal, status); | |
1981 + latest->equal = unpackOneNode(info, equal, status); | |
1982 } | |
1983 return head; | |
1984 } | |
1985 else { | |
1986 // Horizontal node | |
1987 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNo
de *)node; | |
1988 - return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1
, status); | |
1989 + return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, st
atus); | |
1990 } | |
1991 } | |
1992 | |
1993 +// returns a MutableTrieDictionary generated from the CompactTrieDictionary | |
1994 MutableTrieDictionary * | |
1995 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { | |
1996 - MutableTrieDictionary *result = new MutableTrieDictionary( status ); | |
1997 + MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->m
agic == COMPACT_TRIE_MAGIC_3 ); | |
1998 if (result == NULL) { | |
1999 status = U_MEMORY_ALLOCATION_ERROR; | |
2000 return NULL; | |
2001 } | |
2002 - TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root)
, status); | |
2003 + // treat root node as special case: don't call unpackOneNode() or unpackHor
izontalArray() directly | |
2004 + // because only kEqualOverflows flag should be checked in root's flagscount | |
2005 + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *
) | |
2006 + getCompactNode(fInfo, fInfo->root); | |
2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask; | |
2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, | |
2009 + nodeCount, status); | |
2010 + | |
2011 if (U_FAILURE(status)) { | |
2012 delete root; // Clean up | |
2013 delete result; | |
2014 @@ -1270,8 +1763,8 @@ | |
2015 | |
2016 U_CAPI int32_t U_EXPORT2 | |
2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void
*outData, | |
2018 - UErrorCode *status) { | |
2019 - | |
2020 + UErrorCode *status) { | |
2021 + | |
2022 if (status == NULL || U_FAILURE(*status)) { | |
2023 return 0; | |
2024 } | |
2025 @@ -1286,14 +1779,14 @@ | |
2026 // | |
2027 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); | |
2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ | |
2029 - pInfo->dataFormat[1]==0x72 && | |
2030 - pInfo->dataFormat[2]==0x44 && | |
2031 - pInfo->dataFormat[3]==0x63 && | |
2032 - pInfo->formatVersion[0]==1 )) { | |
2033 + pInfo->dataFormat[1]==0x72 && | |
2034 + pInfo->dataFormat[2]==0x44 && | |
2035 + pInfo->dataFormat[3]==0x63 && | |
2036 + pInfo->formatVersion[0]==1 )) { | |
2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x
(format version %02x) is not recognized\n", | |
2038 - pInfo->dataFormat[0], pInfo->dataFormat[1], | |
2039 - pInfo->dataFormat[2], pInfo->dataFormat[3], | |
2040 - pInfo->formatVersion[0]); | |
2041 + pInfo->dataFormat[0], pInfo->dataFormat[1], | |
2042 + pInfo->dataFormat[2], pInfo->dataFormat[3], | |
2043 + pInfo->formatVersion[0]); | |
2044 *status=U_UNSUPPORTED_ERROR; | |
2045 return 0; | |
2046 } | |
2047 @@ -1311,8 +1804,10 @@ | |
2048 // | |
2049 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | |
2050 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; | |
2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 | |
2052 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) | |
2053 + uint32_t magic = ds->readUInt32(header->magic); | |
2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic
!= COMPACT_TRIE_MAGIC_3 | |
2055 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeaderV1) | |
2056 + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) <
sizeof(CompactTrieHeader)) | |
2057 { | |
2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"
); | |
2059 *status=U_UNSUPPORTED_ERROR; | |
2060 @@ -1333,10 +1828,10 @@ | |
2061 // | |
2062 if (length < sizeWithUData) { | |
2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", | |
2064 - totalSize); | |
2065 + totalSize); | |
2066 *status=U_INDEX_OUTOFBOUNDS_ERROR; | |
2067 return 0; | |
2068 - } | |
2069 + } | |
2070 | |
2071 // | |
2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, be
cause | |
2073 @@ -1355,20 +1850,38 @@ | |
2074 } | |
2075 | |
2076 // We need to loop through all the nodes in the offset table, and swap each
one. | |
2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount); | |
2078 + uint32_t nodeCount, rootId; | |
2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) { | |
2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); | |
2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); | |
2082 + } else { | |
2083 + nodeCount = ds->readUInt32(header->nodeCount); | |
2084 + rootId = ds->readUInt32(header->root); | |
2085 + } | |
2086 + | |
2087 // Skip node 0, which should always be 0. | |
2088 - for (int i = 1; i < nodeCount; ++i) { | |
2089 + for (uint32_t i = 1; i < nodeCount; ++i) { | |
2090 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); | |
2091 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nod
eOff); | |
2092 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); | |
2093 uint16_t flagscount = ds->readUInt16(inNode->flagscount); | |
2094 - uint16_t itemCount = flagscount & kCountMask; | |
2095 + uint16_t itemCount = getCount(inNode); | |
2096 + //uint16_t itemCount = flagscount & kCountMask; | |
2097 ds->writeUInt16(&outNode->flagscount, flagscount); | |
2098 if (itemCount > 0) { | |
2099 - if (flagscount & kVerticalNode) { | |
2100 + uint16_t overflow = 0; //number of extra uint16_ts needed to be swa
pped | |
2101 + if (flagscount & kVerticalNode && i != rootId) { | |
2102 + if(flagscount & kEqualOverflows){ | |
2103 + // include overflow bits | |
2104 + overflow += 1; | |
2105 + } | |
2106 + if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEnds
ParentWord) { | |
2107 + //include values | |
2108 + overflow += 1; | |
2109 + } | |
2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), | |
2111 - itemCount*sizeof(uint16_t), | |
2112 - outBytes+nodeOff+offsetof(CompactTrieVertic
alNode,chars), status); | |
2113 + (itemCount + overflow)*sizeof(uint16_t), | |
2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars
), status); | |
2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac
tTrieVerticalNode,equal); | |
2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo
de,equal)); | |
2117 } | |
2118 @@ -1381,26 +1894,62 @@ | |
2119 word = ds->readUInt16(inHNode->entries[j].equal); | |
2120 ds->writeUInt16(&outHNode->entries[j].equal, word); | |
2121 } | |
2122 + | |
2123 + // swap overflow/value information | |
2124 + if(flagscount & kEqualOverflows){ | |
2125 + overflow += (itemCount + 3) / 4; | |
2126 + } | |
2127 + | |
2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla
gscount & kEndsParentWord) { | |
2129 + //include values | |
2130 + overflow += 1; | |
2131 + } | |
2132 + | |
2133 + uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount
]; | |
2134 + uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCou
nt]; | |
2135 + for(int j = 0; j<overflow; j++){ | |
2136 + uint16_t extraInfo = ds->readUInt16(*inOverflow); | |
2137 + ds->writeUInt16(outOverflow, extraInfo); | |
2138 + | |
2139 + inOverflow++; | |
2140 + outOverflow++; | |
2141 + } | |
2142 } | |
2143 } | |
2144 } | |
2145 #endif | |
2146 | |
2147 - // All the data in all the nodes consist of 16 bit items. Swap them all at
once. | |
2148 - uint16_t nodeCount = ds->readUInt16(header->nodeCount); | |
2149 - uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCoun
t*sizeof(uint32_t)); | |
2150 - ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff
, status); | |
2151 - | |
2152 // Swap the header | |
2153 ds->writeUInt32(&outputHeader->size, totalSize); | |
2154 - uint32_t magic = ds->readUInt32(header->magic); | |
2155 ds->writeUInt32(&outputHeader->magic, magic); | |
2156 - ds->writeUInt16(&outputHeader->nodeCount, nodeCount); | |
2157 - uint16_t root = ds->readUInt16(header->root); | |
2158 - ds->writeUInt16(&outputHeader->root, root); | |
2159 - ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), | |
2160 - sizeof(uint32_t)*(int32_t)nodeCount, | |
2161 - outBytes+offsetof(CompactTrieHeader,offsets), status); | |
2162 + | |
2163 + uint32_t nodeCount; | |
2164 + uint32_t offsetPos; | |
2165 + if (header->magic == COMPACT_TRIE_MAGIC_1) { | |
2166 + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; | |
2167 + CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHead
er; | |
2168 + | |
2169 + nodeCount = ds->readUInt16(headerV1->nodeCount); | |
2170 + ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); | |
2171 + uint16_t root = ds->readUInt16(headerV1->root); | |
2172 + ds->writeUInt16(&outputHeaderV1->root, root); | |
2173 + offsetPos = offsetof(CompactTrieHeaderV1,offsets); | |
2174 + } else { | |
2175 + nodeCount = ds->readUInt32(header->nodeCount); | |
2176 + ds->writeUInt32(&outputHeader->nodeCount, nodeCount); | |
2177 + uint32_t root = ds->readUInt32(header->root); | |
2178 + ds->writeUInt32(&outputHeader->root, root); | |
2179 + offsetPos = offsetof(CompactTrieHeader,offsets); | |
2180 + } | |
2181 + | |
2182 + // All the data in all the nodes consist of 16 bit items. Swap them all at
once. | |
2183 + uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); | |
2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff
, status); | |
2185 + | |
2186 + //swap offsets | |
2187 + ds->swapArray32(ds, inBytes+offsetPos, | |
2188 + sizeof(uint32_t)*(uint32_t)nodeCount, | |
2189 + outBytes+offsetPos, status); | |
2190 | |
2191 return sizeWithUData; | |
2192 } | |
2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 | |
2194 +++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800 | |
2195 @@ -47,7 +47,6 @@ | |
2196 U_NAMESPACE_BEGIN | |
2197 | |
2198 class StringEnumeration; | |
2199 -struct CompactTrieHeader; | |
2200 | |
2201 /******************************************************************* | |
2202 * TrieWordDictionary | |
2203 @@ -72,23 +71,29 @@ | |
2204 */ | |
2205 virtual ~TrieWordDictionary(); | |
2206 | |
2207 + /** | |
2208 + * <p>Returns true if the dictionary contains values associated with each wor
d.</p> | |
2209 + */ | |
2210 + virtual UBool getValued() const = 0; | |
2211 + | |
2212 /** | |
2213 * <p>Find dictionary words that match the text.</p> | |
2214 * | |
2215 * @param text A UText representing the text. The | |
2216 * iterator is left after the longest prefix match in the dictionary. | |
2217 - * @param start The current position in text. | |
2218 * @param maxLength The maximum number of code units to match. | |
2219 * @param lengths An array that is filled with the lengths of words that match
ed. | |
2220 * @param count Filled with the number of elements output in lengths. | |
2221 * @param limit The size of the lengths array; this limits the number of words
output. | |
2222 + * @param values An array that is filled with the values associated with the m
atched words. | |
2223 * @return The number of characters in text that were matched. | |
2224 */ | |
2225 virtual int32_t matches( UText *text, | |
2226 int32_t maxLength, | |
2227 int32_t *lengths, | |
2228 int &count, | |
2229 - int limit ) const = 0; | |
2230 + int limit, | |
2231 + uint16_t *values = NULL) const = 0; | |
2232 | |
2233 /** | |
2234 * <p>Return a StringEnumeration for iterating all the words in the dictionar
y.</p> | |
2235 @@ -128,6 +133,12 @@ | |
2236 | |
2237 UText *fIter; | |
2238 | |
2239 + /** | |
2240 + * A UBool indicating whether the dictionary stores a value with each word | |
2241 + * @internal | |
2242 + */ | |
2243 + UBool fValued; | |
2244 + | |
2245 friend class CompactTrieDictionary; // For fast conversion | |
2246 | |
2247 public: | |
2248 @@ -138,14 +149,29 @@ | |
2249 * @param median A UChar around which to balance the trie. Ideally, it should | |
2250 * begin at least one word that is near the median of the set in the dictionar
y | |
2251 * @param status A status code recording the success of the call. | |
2252 + * @param containsValue True if the dictionary stores values associated with e
ach word. | |
2253 */ | |
2254 - MutableTrieDictionary( UChar median, UErrorCode &status ); | |
2255 + MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue
= FALSE ); | |
2256 | |
2257 /** | |
2258 * <p>Virtual destructor.</p> | |
2259 */ | |
2260 virtual ~MutableTrieDictionary(); | |
2261 | |
2262 + /** | |
2263 + * Indicate whether the MutableTrieDictionary stores values associated with e
ach word | |
2264 + */ | |
2265 + void setValued(UBool valued){ | |
2266 + fValued = valued; | |
2267 + } | |
2268 + | |
2269 + /** | |
2270 + * <p>Returns true if the dictionary contains values associated with each wor
d.</p> | |
2271 + */ | |
2272 + virtual UBool getValued() const { | |
2273 + return fValued; | |
2274 + } | |
2275 + | |
2276 /** | |
2277 * <p>Find dictionary words that match the text.</p> | |
2278 * | |
2279 @@ -155,13 +181,15 @@ | |
2280 * @param lengths An array that is filled with the lengths of words that match
ed. | |
2281 * @param count Filled with the number of elements output in lengths. | |
2282 * @param limit The size of the lengths array; this limits the number of words
output. | |
2283 + * @param values An array that is filled with the values associated with the m
atched words. | |
2284 * @return The number of characters in text that were matched. | |
2285 */ | |
2286 virtual int32_t matches( UText *text, | |
2287 int32_t maxLength, | |
2288 int32_t *lengths, | |
2289 int &count, | |
2290 - int limit ) const; | |
2291 + int limit, | |
2292 + uint16_t *values = NULL) const; | |
2293 | |
2294 /** | |
2295 * <p>Return a StringEnumeration for iterating all the words in the dictionar
y.</p> | |
2296 @@ -173,15 +201,17 @@ | |
2297 virtual StringEnumeration *openWords( UErrorCode &status ) const; | |
2298 | |
2299 /** | |
2300 - * <p>Add one word to the dictionary.</p> | |
2301 + * <p>Add one word to the dictionary with an optional associated value.</p> | |
2302 * | |
2303 * @param word A UChar buffer containing the word. | |
2304 * @param length The length of the word. | |
2305 - * @param status The resultant status | |
2306 + * @param status The resultant status. | |
2307 + * @param value The nonzero value associated with this word. | |
2308 */ | |
2309 virtual void addWord( const UChar *word, | |
2310 int32_t length, | |
2311 - UErrorCode &status); | |
2312 + UErrorCode &status, | |
2313 + uint16_t value = 0); | |
2314 | |
2315 #if 0 | |
2316 /** | |
2317 @@ -203,8 +233,9 @@ | |
2318 * @param lengths An array that is filled with the lengths of words that match
ed. | |
2319 * @param count Filled with the number of elements output in lengths. | |
2320 * @param limit The size of the lengths array; this limits the number of words
output. | |
2321 - * @param parent The parent of the current node | |
2322 - * @param pMatched The returned parent node matched the input | |
2323 + * @param parent The parent of the current node. | |
2324 + * @param pMatched The returned parent node matched the input. | |
2325 + * @param values An array that is filled with the values associated with the m
atched words. | |
2326 * @return The number of characters in text that were matched. | |
2327 */ | |
2328 virtual int32_t search( UText *text, | |
2329 @@ -213,40 +244,46 @@ | |
2330 int &count, | |
2331 int limit, | |
2332 TernaryNode *&parent, | |
2333 - UBool &pMatched ) const; | |
2334 + UBool &pMatched, | |
2335 + uint16_t *values = NULL) const; | |
2336 | |
2337 private: | |
2338 /** | |
2339 * <p>Private constructor. The root node it not allocated.</p> | |
2340 * | |
2341 * @param status A status code recording the success of the call. | |
2342 + * @param containsValues True if the dictionary will store a value associated | |
2343 + * with each word added. | |
2344 */ | |
2345 - MutableTrieDictionary( UErrorCode &status ); | |
2346 + MutableTrieDictionary( UErrorCode &status, UBool containsValues = false ); | |
2347 }; | |
2348 | |
2349 /******************************************************************* | |
2350 * CompactTrieDictionary | |
2351 */ | |
2352 | |
2353 +//forward declarations | |
2354 +struct CompactTrieHeader; | |
2355 +struct CompactTrieInfo; | |
2356 + | |
2357 /** | |
2358 * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted | |
2359 * to save space.</p> | |
2360 */ | |
2361 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary { | |
2362 private: | |
2363 - /** | |
2364 - * The root node of the trie | |
2365 - */ | |
2366 + /** | |
2367 + * The header of the CompactTrieDictionary which contains all info | |
2368 + */ | |
2369 | |
2370 - const CompactTrieHeader *fData; | |
2371 - | |
2372 - /** | |
2373 - * A UBool indicating whether or not we own the fData. | |
2374 - */ | |
2375 + CompactTrieInfo *fInfo; | |
2376 | |
2377 + /** | |
2378 + * A UBool indicating whether or not we own the fData. | |
2379 + */ | |
2380 UBool fOwnData; | |
2381 | |
2382 - UDataMemory *fUData; | |
2383 + UDataMemory *fUData; | |
2384 public: | |
2385 /** | |
2386 * <p>Construct a dictionary from a UDataMemory.</p> | |
2387 @@ -277,6 +314,11 @@ | |
2388 */ | |
2389 virtual ~CompactTrieDictionary(); | |
2390 | |
2391 + /** | |
2392 + * <p>Returns true if the dictionary contains values associated with each wor
d.</p> | |
2393 + */ | |
2394 + virtual UBool getValued() const; | |
2395 + | |
2396 /** | |
2397 * <p>Find dictionary words that match the text.</p> | |
2398 * | |
2399 @@ -286,13 +328,15 @@ | |
2400 * @param lengths An array that is filled with the lengths of words that match
ed. | |
2401 * @param count Filled with the number of elements output in lengths. | |
2402 * @param limit The size of the lengths array; this limits the number of words
output. | |
2403 + * @param values An array that is filled with the values associated with the m
atched words. | |
2404 * @return The number of characters in text that were matched. | |
2405 */ | |
2406 virtual int32_t matches( UText *text, | |
2407 - int32_t rangeEnd, | |
2408 + int32_t maxLength, | |
2409 int32_t *lengths, | |
2410 int &count, | |
2411 - int limit ) const; | |
2412 + int limit, | |
2413 + uint16_t *values = NULL) const; | |
2414 | |
2415 /** | |
2416 * <p>Return a StringEnumeration for iterating all the words in the dictionar
y.</p> | |
2417 @@ -311,7 +355,7 @@ | |
2418 virtual uint32_t dataSize() const; | |
2419 | |
2420 /** | |
2421 - * <p>Return a void * pointer to the compact data, platform-endian.</p> | |
2422 + * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian
.</p> | |
2423 * | |
2424 * @return The data for the compact dictionary, suitable for passing to the | |
2425 * constructor. | |
2426 @@ -342,5 +386,5 @@ | |
2427 | |
2428 U_NAMESPACE_END | |
2429 | |
2430 - /* TRIEDICT_H */ | |
2431 +/* TRIEDICT_H */ | |
2432 #endif | |
2433 --- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700 | |
2434 +++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800 | |
2435 @@ -509,8 +520,9 @@ | |
2436 #################################################### CTD | |
2437 # CTD FILES | |
2438 | |
2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_
FILES) | |
2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< | |
2441 +# .ctd file now generated regardless of whether dictionary file exists | |
2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) | |
2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F
).txt | |
2444 | |
2445 #################################################### CFU | |
2446 # CFU FILES | |
2447 --- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700 | |
2448 +++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800 | |
2449 @@ -17,5 +17,8 @@ | |
2450 } | |
2451 dictionaries{ | |
2452 Thai:process(dependency){"thaidict.ctd"} | |
2453 + Hani:process(dependency){"cjdict.ctd"} | |
2454 + Hira:process(dependency){"cjdict.ctd"} | |
2455 + Kata:process(dependency){"cjdict.ctd"} | |
2456 } | |
2457 } | |
2458 --- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800 | |
2459 +++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800 | |
2460 @@ -25,6 +25,9 @@ | |
2461 </icu:boundaries> | |
2462 <icu:dictionaries> | |
2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> | |
2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> | |
2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> | |
2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> | |
2467 </icu:dictionaries> | |
2468 </icu:breakIteratorData> | |
2469 </special> | |
2470 --- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700 | |
2471 +++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800 | |
2472 @@ -2188,21 +2188,21 @@ | |
2473 | |
2474 | |
2475 { | |
2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); | |
2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); | |
2478 const UChar *got = NULL, *exp=NULL; | |
2479 int32_t gotLen = 0, expLen=0; | |
2480 - ja = ures_getByKey(ja, "boundaries", ja, &status); | |
2481 - exp = tres_getString(ja, -1, "word", &expLen, &status); | |
2482 + th = ures_getByKey(th, "boundaries", th, &status); | |
2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status); | |
2484 | |
2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status); | |
2486 - got = tres_getString(tb, -1, "word", &gotLen, &status); | |
2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); | |
2488 | |
2489 if(U_FAILURE(status)) { | |
2490 log_err("%s trying to read str boundaries\n", u_errorName(statu
s)); | |
2491 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { | |
2492 log_err("Referencing alias didn't get the right data\n"); | |
2493 } | |
2494 - ures_close(ja); | |
2495 + ures_close(th); | |
2496 status = U_ZERO_ERROR; | |
2497 } | |
2498 /* simple alias */ | |
2499 --- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700 | |
2500 +++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800 | |
2501 @@ -156,9 +156,13 @@ | |
2502 if(*a!=*b){ | |
2503 errln("Failed: boilerplate method operator!= does not return correct re
sults"); | |
2504 } | |
2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); | |
2506 - if(a && c){ | |
2507 - if(*c==*a){ | |
2508 + // Japanese word break iterator is identical to root with | |
2509 + // a dictionary-based break iterator, but Thai character break iterator | |
2510 + // is still different from Root. | |
2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat
us); | |
2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat
us); | |
2513 + if(c && d){ | |
2514 + if(*c==*d){ | |
2515 errln("Failed: boilerplate method opertator== does not return corre
ct results"); | |
2516 } | |
2517 }else{ | |
2518 @@ -167,6 +171,7 @@ | |
2519 delete a; | |
2520 delete b; | |
2521 delete c; | |
2522 + delete d; | |
2523 } | |
2524 | |
2525 void RBBIAPITest::TestgetRules() | |
2526 @@ -635,21 +640,21 @@ | |
2527 // | |
2528 void RBBIAPITest::TestRuleStatus() { | |
2529 UChar str[30]; | |
2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094
", | |
2531 - // 012345678901234567 8 9 0 1 2 3 4 5
6 | |
2532 - // Ideographic Katakana Hiragana | |
2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return
nothing | |
2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO | |
2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", | |
2536 + // 012345678901234567 8 9 0 | |
2537 + // Katakana | |
2538 str, 30); | |
2539 UnicodeString testString1(str); | |
2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; | |
2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; | |
2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE,
UBRK_WORD_LETTER, | |
2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, | |
2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, | |
2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA,
UBRK_WORD_KANA}; | |
2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE}; | |
2547 | |
2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO
RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, | |
2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO
RD_NONE_LIMIT, | |
2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO
RD_NONE_LIMIT, | |
2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO
RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; | |
2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; | |
2553 | |
2554 UErrorCode status=U_ZERO_ERROR; | |
2555 | |
2556 @@ -888,9 +893,11 @@ | |
2557 | |
2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD
, status); | |
2559 { | |
2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root. | |
2561 if (ja_word && *ja_word == *root_word) { | |
2562 errln("japan not different from root"); | |
2563 } | |
2564 +#endif | |
2565 } | |
2566 | |
2567 { | |
2568 --- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700 | |
2569 +++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800 | |
2570 @@ -35,6 +35,8 @@ | |
2571 #include <string.h> | |
2572 #include <stdio.h> | |
2573 #include <stdlib.h> | |
2574 +#include "unicode/numfmt.h" | |
2575 +#include "unicode/uscript.h" | |
2576 | |
2577 #define TEST_ASSERT(x) {if (!(x)) { \ | |
2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} | |
2579 @@ -138,11 +140,13 @@ | |
2580 if (exec) TestThaiBreaks(); break; | |
2581 case 23: name = "TestTailoredBreaks"; | |
2582 if (exec) TestTailoredBreaks(); break; | |
2583 + case 24: name = "TestTrieDictWithValue"; | |
2584 + if(exec) TestTrieDictWithValue(); break; | |
2585 #else | |
2586 - case 21: case 22: case 23: name = "skip"; | |
2587 + case 21: case 22: case 23: case 24: name = "skip"; | |
2588 break; | |
2589 #endif | |
2590 - case 24: name = "TestDictRules"; | |
2591 + case 25: name = "TestDictRules"; | |
2592 if (exec) TestDictRules(); break; | |
2593 case 25: name = "TestBug5532"; | |
2594 if (exec) TestBug5532(); break; | |
2595 @@ -607,6 +611,8 @@ | |
2596 | |
2597 | |
2598 void RBBITest::TestJapaneseWordBreak() { | |
2599 +// TODO: Rewrite this test for a dictionary-based word breaking. | |
2600 +#if 0 | |
2601 UErrorCode status = U_ZERO_ERROR; | |
2602 BITestData japaneseWordSelection(status); | |
2603 | |
2604 @@ -628,6 +634,7 @@ | |
2605 | |
2606 generalIteratorTest(*e, japaneseWordSelection); | |
2607 delete e; | |
2608 +#endif | |
2609 } | |
2610 | |
2611 void RBBITest::TestTrieDict() { | |
2612 @@ -849,6 +856,372 @@ | |
2613 delete compact2; | |
2614 } | |
2615 | |
2616 +/* Debug aid: dumps every word of enumer to filename as UTF-8. TODO: delete later*/ | |
2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ | |
2618 + UErrorCode status = U_ZERO_ERROR; | |
2619 + UConverter *cvt = ucnv_open("UTF-8", &status); | |
2620 + if (U_FAILURE(status)) | |
2621 + return; | |
2622 + FILE *outfile = fopen(filename,"w"); // opened after the converter so the early return cannot leak it | |
2623 + if(outfile != NULL){ | |
2624 + status = U_ZERO_ERROR; | |
2625 + const UnicodeString *word = enumer->snext(status); | |
2626 + while (word != NULL && U_SUCCESS(status)) { | |
2627 + char u8word[500] = {0}; // zero-filled; the last byte is never handed to the converter | |
2628 + status = U_ZERO_ERROR; | |
2629 + ucnv_fromUChars(cvt, u8word, (int32_t)sizeof(u8word) - 1, word->getBuffer(), word->length(), | |
2630 + &status); | |
2631 + fprintf(outfile,"%s\n", u8word); // u8word stays NUL-terminated even if the word was truncated | |
2632 + status = U_ZERO_ERROR; | |
2633 + word = enumer->snext(status); | |
2634 + } | |
2635 + fclose(outfile); | |
2636 + } | |
2637 + ucnv_close(cvt); | |
2638 +} | |
2639 + | |
2640 +// Scratch array for TestTrieDictWithValue: uses the N-element member array | |
2641 +// when size <= N, otherwise a heap array that the destructor releases. | |
2642 +template<class T, size_t N> | |
2643 +class AutoBuffer { | |
2644 + public: | |
2645 + explicit AutoBuffer(size_t size) : buffer(stackBuffer) { | |
2646 + if (size > N) buffer = new T[size]; | |
2647 + } | |
2648 + ~AutoBuffer() { | |
2649 + if (buffer != stackBuffer) delete [] buffer; | |
2650 + } | |
2651 + T* elems() { // pointer to the first element of whichever array is in use | |
2652 + return buffer; | |
2653 + } | |
2654 + const T& operator[] (size_t i) const { // no bounds checking | |
2655 + return buffer[i]; | |
2656 + } | |
2657 + T& operator[] (size_t i) { // no bounds checking | |
2658 + return buffer[i]; | |
2659 + } | |
2660 + private: | |
2661 + T stackBuffer[N]; | |
2662 + T* buffer; | |
2663 + AutoBuffer(); | |
2664 + AutoBuffer(const AutoBuffer&); // not defined: a copy would alias or double-delete buffer | |
2665 + AutoBuffer& operator=(const AutoBuffer&); // not defined, for the same reason | |
2666 +}; | |
2667 + | |
2668 +//---------------------------------------------------------------------------- | |
2669 +// | |
2670 +// TestTrieDictWithValue Test trie dictionaries with logprob values and | |
2671 +// more than 2^16 nodes after compaction. | |
2672 +// Builds a mutable trie from cjdict-truncated.txt, compacts it, clones it back, and compares words and values. | |
2673 +//---------------------------------------------------------------------------- | |
2674 +void RBBITest::TestTrieDictWithValue() { | |
2675 + UErrorCode status = U_ZERO_ERROR; | |
2676 + | |
2677 + // | |
2678 + // Open and read the test data file. | |
2679 + // | |
2680 + const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
2681 + const char *filename = "cjdict-truncated.txt"; | |
2682 + char testFileName[1000]; | |
2683 + if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filenam
e) + 10 >= sizeof(testFileName)) { | |
2684 + errln("Can't open test data. Path too long."); | |
2685 + return; | |
2686 + } | |
2687 + strcpy(testFileName, testDataDirectory); | |
2688 + strcat(testFileName, filename); | |
2689 + | |
2690 + // Items needing deleting at the end | |
2691 + MutableTrieDictionary *mutableDict = NULL; | |
2692 + CompactTrieDictionary *compactDict = NULL; | |
2693 + UnicodeSet *breaks = NULL; | |
2694 + UChar *testFile = NULL; | |
2695 + StringEnumeration *enumer1 = NULL; | |
2696 + StringEnumeration *enumer2 = NULL; | |
2697 + MutableTrieDictionary *mutable2 = NULL; | |
2698 + StringEnumeration *cloneEnum = NULL; | |
2699 + CompactTrieDictionary *compact2 = NULL; | |
2700 + NumberFormat *nf = NULL; | |
2701 + UText *originalText = NULL, *cloneText = NULL; | |
2702 + | |
2703 + const UnicodeString *originalWord = NULL; | |
2704 + const UnicodeString *cloneWord = NULL; | |
2705 + UChar *current; | |
2706 + UChar *word; | |
2707 + UChar uc; | |
2708 + int32_t wordLen; | |
2709 + int32_t wordCount; | |
2710 + int32_t testCount; | |
2711 + int32_t valueLen; | |
2712 + int counter = 0; | |
2713 + | |
2714 + int len; | |
2715 + testFile = ReadAndConvertFile(testFileName, len, NULL, status); | |
2716 + if (U_FAILURE(status)) { | |
2717 + goto cleanup; /* something went wrong, error already output */ | |
2718 + } | |
2719 + | |
2720 + mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); | |
2721 + if (U_FAILURE(status)) { | |
2722 + errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)
); | |
2723 + goto cleanup; | |
2724 + } | |
2725 + | |
2726 + breaks = new UnicodeSet; | |
2727 + breaks->add(0x000A); // Line Feed | |
2728 + breaks->add(0x000D); // Carriage Return | |
2729 + breaks->add(0x2028); // Line Separator | |
2730 + breaks->add(0x2029); // Paragraph Separator | |
2731 + breaks->add(0x0009); // Tab character | |
2732 + | |
2733 + // Now add each non-comment line of the file as a word. | |
2734 + current = testFile; | |
2735 + word = current; | |
2736 + uc = *current++; | |
2737 + wordLen = 0; | |
2738 + wordCount = 0; | |
2739 + nf = NumberFormat::createInstance(status); | |
2740 + if (U_FAILURE(status)) { errln("Could not create NumberFormat: %s\n", u_errorName(status)); goto cleanup; } | |
2741 + while (uc) { | |
2742 + UnicodeString ucharValue; | |
2743 + valueLen = 0; | |
2744 + | |
2745 + if (uc == 0x0023) { // #comment line, skip | |
2746 + while (uc && !breaks->contains(uc)) { | |
2747 + uc = *current++; | |
2748 + } | |
2749 + } | |
2750 + else{ | |
2751 + while (uc && !breaks->contains(uc)) { | |
2752 + ++wordLen; | |
2753 + uc = *current++; | |
2754 + } | |
2755 + if(uc == 0x0009){ //separator is a tab char, read in num after tab | |
2756 + uc = *current++; | |
2757 + while (uc && !breaks->contains(uc)) { | |
2758 + ucharValue.append(uc); | |
2759 + uc = *current++; | |
2760 + } | |
2761 + } | |
2762 + } | |
2763 + if (wordLen > 0) { | |
2764 + Formattable value((int32_t)0); | |
2765 + nf->parse(ucharValue.getTerminatedBuffer(), value, status); | |
2766 + | |
2767 + if(U_FAILURE(status)){ | |
2768 + errln("parsing of value failed when reading in dictionary\n"); | |
2769 + goto cleanup; | |
2770 + } | |
2771 + mutableDict->addWord(word, wordLen, status, value.getLong()); | |
2772 + if (U_FAILURE(status)) { | |
2773 + errln("Could not add word to mutable dictionary; status %s\n",
u_errorName(status)); | |
2774 + goto cleanup; | |
2775 + } | |
2776 + wordCount += 1; | |
2777 + } | |
2778 + | |
2779 + // Find beginning of next line | |
2780 + while (uc && breaks->contains(uc)) { | |
2781 + uc = *current++; | |
2782 + } | |
2783 + word = current-1; // current is already one past the character held in uc | |
2784 + wordLen = 0; | |
2785 + } | |
2786 + | |
2787 + if (wordCount < 50) { | |
2788 + errln("Word count (%d) unreasonably small\n", wordCount); | |
2789 + goto cleanup; | |
2790 + } | |
2791 + | |
2792 + enumer1 = mutableDict->openWords(status); | |
2793 + if (U_FAILURE(status)) { | |
2794 + errln("Could not open mutable dictionary enumerator: %s\n", u_errorName
(status)); | |
2795 + goto cleanup; | |
2796 + } | |
2797 + | |
2798 + testCount = 0; | |
2799 + if (wordCount != (testCount = enumer1->count(status))) { | |
2800 + errln("MutableTrieDictionary word count (%d) differs from file word cou
nt (%d), with status %s\n", | |
2801 + testCount, wordCount, u_errorName(status)); | |
2802 + goto cleanup; | |
2803 + } | |
2804 + | |
2805 + // Now compact it | |
2806 + compactDict = new CompactTrieDictionary(*mutableDict, status); | |
2807 + if (U_FAILURE(status)) { | |
2808 + errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(statu
s)); | |
2809 + goto cleanup; | |
2810 + } | |
2811 + | |
2812 + enumer2 = compactDict->openWords(status); | |
2813 + if (U_FAILURE(status)) { | |
2814 + errln("Could not open compact trie dictionary enumerator: %s\n", u_erro
rName(status)); | |
2815 + goto cleanup; | |
2816 + } | |
2817 + | |
2818 + | |
2819 + //delete later | |
2820 +// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); | |
2821 +// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); | |
2822 + | |
2823 + enumer1->reset(status); | |
2824 + enumer2->reset(status); | |
2825 + | |
2826 + originalWord = enumer1->snext(status); | |
2827 + cloneWord = enumer2->snext(status); | |
2828 + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { | |
2829 + if (*originalWord != *cloneWord) { | |
2830 + errln("MutableTrieDictionary and CompactTrieDictionary word mismatc
h at %d, lengths are %d and %d\n", | |
2831 + counter, originalWord->length(), cloneWord->length()); | |
2832 + goto cleanup; | |
2833 + } | |
2834 + | |
2835 + // check if attached values of the same word in both dictionaries tally | |
2836 +#if 0 | |
2837 + int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]
; | |
2838 + uint16_t values1[originalWord->length()], values2[cloneWord->length()]; | |
2839 +#endif | |
2840 + AutoBuffer<int32_t, 20> lengths1(originalWord->length()); | |
2841 + AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); | |
2842 + AutoBuffer<uint16_t, 20> values1(originalWord->length()); | |
2843 + AutoBuffer<uint16_t, 20> values2(cloneWord->length()); | |
2844 + | |
2845 + originalText = utext_openConstUnicodeString(originalText, originalWord,
&status); | |
2846 + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status)
; | |
2847 + | |
2848 + int count1, count2; | |
2849 + mutableDict->matches(originalText, originalWord->length(), lengths1.ele
ms(), count1, originalWord->length(), values1.elems()); | |
2850 + compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(),
count2, cloneWord->length(), values2.elems()); | |
2851 + | |
2852 + if(values1[count1-1] != values2[count2-1]){ | |
2853 + errln("Values of word %d in MutableTrieDictionary and CompactTrieDi
ctionary do not match, with values %d and %d\n", | |
2854 + counter, values1[count1-1], values2[count2-1]); | |
2855 + goto cleanup; | |
2856 + } | |
2857 + | |
2858 + counter++; | |
2859 + originalWord = enumer1->snext(status); | |
2860 + cloneWord = enumer2->snext(status); | |
2861 + } | |
2862 + if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { | |
2863 + errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are t
he same"); | |
2864 + } | |
2865 + | |
2866 + delete enumer1; | |
2867 + enumer1 = NULL; | |
2868 + delete enumer2; | |
2869 + enumer2 = NULL; | |
2870 + | |
2871 + // Now un-compact it | |
2872 + mutable2 = compactDict->cloneMutable(status); | |
2873 + if (U_FAILURE(status)) { | |
2874 + errln("Could not clone CompactTrieDictionary to MutableTrieDictionary:
%s\n", u_errorName(status)); | |
2875 + goto cleanup; | |
2876 + } | |
2877 + | |
2878 + cloneEnum = mutable2->openWords(status); | |
2879 + if (U_FAILURE(status)) { | |
2880 + errln("Could not create cloned mutable enumerator: %s\n", u_errorName(s
tatus)); | |
2881 + goto cleanup; | |
2882 + } | |
2883 + | |
2884 + if (wordCount != (testCount = cloneEnum->count(status))) { | |
2885 + errln("Cloned MutableTrieDictionary word count (%d) differs from file w
ord count (%d), with status %s\n", | |
2886 + testCount, wordCount, u_errorName(status)); | |
2887 + goto cleanup; | |
2888 + } | |
2889 + | |
2890 + // Compact original dictionary to clone. Note that we can only compare the
same kind of | |
2891 + // dictionary as the order of the enumerators is not guaranteed to be the s
ame between | |
2892 + // different kinds | |
2893 + enumer1 = mutableDict->openWords(status); | |
2894 + if (U_FAILURE(status)) { | |
2895 + errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorN
ame(status)); | |
2896 + goto cleanup; | |
2897 + } | |
2898 + | |
2899 + counter = 0; | |
2900 + originalWord = enumer1->snext(status); | |
2901 + cloneWord = cloneEnum->snext(status); | |
2902 + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { | |
2903 + if (*originalWord != *cloneWord) { | |
2904 + errln("Original and cloned MutableTrieDictionary word mismatch\n"); | |
2905 + goto cleanup; | |
2906 + } | |
2907 + | |
2908 + // check if attached values of the same word in both dictionaries tally | |
2909 + AutoBuffer<int32_t, 20> lengths1(originalWord->length()); | |
2910 + AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); | |
2911 + AutoBuffer<uint16_t, 20> values1(originalWord->length()); | |
2912 + AutoBuffer<uint16_t, 20> values2(cloneWord->length()); | |
2913 + originalText = utext_openConstUnicodeString(originalText, originalWord,
&status); | |
2914 + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status)
; | |
2915 + | |
2916 + int count1, count2; | |
2917 + mutableDict->matches(originalText, originalWord->length(), lengths1.ele
ms(), count1, originalWord->length(), values1.elems()); | |
2918 + mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), cou
nt2, cloneWord->length(), values2.elems()); | |
2919 + | |
2920 + if(values1[count1-1] != values2[count2-1]){ | |
2921 + errln("Values of word %d in original and cloned MutableTrieDictiona
ry do not match, with values %d and %d\n", | |
2922 + counter, values1[count1-1], values2[count2-1]); | |
2923 + goto cleanup; | |
2924 + } | |
2925 + | |
2926 + counter++; | |
2927 + | |
2928 + originalWord = enumer1->snext(status); | |
2929 + cloneWord = cloneEnum->snext(status); | |
2930 + } | |
2931 + | |
2932 + if (U_FAILURE(status)) { | |
2933 + errln("Enumeration failed: %s\n", u_errorName(status)); | |
2934 + goto cleanup; | |
2935 + } | |
2936 + | |
2937 + if (originalWord != cloneWord) { | |
2938 + errln("Original and cloned MutableTrieDictionary ended enumeration at d
ifferent points\n"); | |
2939 + goto cleanup; | |
2940 + } | |
2941 + | |
2942 + // Test the data copying constructor for CompactTrieDict, and the data acce
ss APIs. | |
2943 + compact2 = new CompactTrieDictionary(compactDict->data(), status); | |
2944 + if (U_FAILURE(status)) { | |
2945 + errln("CompactTrieDictionary(const void *,...) failed\n"); | |
2946 + goto cleanup; | |
2947 + } | |
2948 + | |
2949 + if (compact2->dataSize() == 0) { | |
2950 + errln("CompactTrieDictionary->dataSize() == 0\n"); | |
2951 + goto cleanup; | |
2952 + } | |
2953 + | |
2954 + // Now count the words via the second dictionary | |
2955 + delete enumer1; | |
2956 + enumer1 = compact2->openWords(status); | |
2957 + if (U_FAILURE(status)) { | |
2958 + errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_er
rorName(status)); | |
2959 + goto cleanup; | |
2960 + } | |
2961 + | |
2962 + if (wordCount != (testCount = enumer1->count(status))) { | |
2963 + errln("CompactTrieDictionary 2 word count (%d) differs from file word c
ount (%d), with status %s\n", | |
2964 + testCount, wordCount, u_errorName(status)); | |
2965 + goto cleanup; | |
2966 + } | |
2967 + | |
2968 + cleanup: | |
2969 + delete compactDict; | |
2970 + delete mutableDict; | |
2971 + delete breaks; | |
2972 + delete[] testFile; | |
2973 + delete enumer1; | |
2974 + delete mutable2; | |
2975 + delete cloneEnum; | |
2976 + delete compact2; | |
2977 + utext_close(originalText); | |
2978 + utext_close(cloneText); | |
2979 + delete enumer2; // non-NULL only when a goto left the first comparison pass early | |
2980 + delete nf; // the value parser is not released anywhere else | |
2981 +} | |
2982 | |
2983 //---------------------------------------------------------------------------- | |
2984 // | |
2985 @@ -1870,8 +2243,15 @@ | |
2986 // Don't break in runs of hiragana or runs of ideograph, where the latter inclu
des \u3005 \u3007 \u303B (cldrbug #2009). | |
2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u
3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" | |
2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u
3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; | |
2989 +#if 0 | |
2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1
7, 18, 20, 21, 24, 27, 28 }; | |
2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1
7, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; | |
2992 +#endif | |
2993 +// There's no separate Japanese word break iterator. Root is the same as Japane
se. | |
2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005, | |
2995 +// U+3007, U+303B and some other cases. | |
2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; | |
2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1
5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; | |
2998 | |
2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el" | |
3000 // Add break after Greek question mark (cldrbug #2069). | |
3001 @@ -2672,6 +3052,8 @@ | |
3002 UnicodeSet *fNewlineSet; | |
3003 UnicodeSet *fKatakanaSet; | |
3004 UnicodeSet *fALetterSet; | |
3005 + // TODO(jungshik): Do we still need this change? | |
3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt | |
3007 UnicodeSet *fMidNumLetSet; | |
3008 UnicodeSet *fMidLetterSet; | |
3009 UnicodeSet *fMidNumSet; | |
3010 @@ -2680,6 +3062,7 @@ | |
3011 UnicodeSet *fOtherSet; | |
3012 UnicodeSet *fExtendSet; | |
3013 UnicodeSet *fExtendNumLetSet; | |
3014 + UnicodeSet *fDictionaryCjkSet; | |
3015 | |
3016 RegexMatcher *fMatcher; | |
3017 | |
3018 @@ -2696,12 +3079,24 @@ | |
3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
CR}]"), status); | |
3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
LF}]"), status); | |
3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Newline}]"), status); | |
3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); | |
3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]",
status); | |
3024 + // Exclude Hangul syllables from ALetterSet during testing. | |
3025 + // Leave CJK dictionary characters out from the monkey tests! | |
3026 +#if 0 | |
3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" | |
3028 + "[\\p{Line_Break = Complex_Context}" | |
3029 + "-\\p{Grapheme_Cluster_Break = Extend}" | |
3030 + "-\\p{Grapheme_Cluster_Break = Control}" | |
3031 + "]]", | |
3032 + status); | |
3033 +#endif | |
3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ALetter}]"), status); | |
3035 + fALetterSet->removeAll(*fDictionaryCjkSet); | |
3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Katakana}]"), status); | |
3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNumLet}]"), status); | |
3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidLetter}]"), status); | |
3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
MidNum}]"), status); | |
3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}]"), status); | |
3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Numeric}[\\uff10-\\uff19]]"), status); | |
3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Format}]"), status); | |
3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
ExtendNumLet}]"), status); | |
3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break =
Extend}]"), status); | |
3045 @@ -2725,13 +3120,14 @@ | |
3046 fOtherSet->removeAll(*fFormatSet); | |
3047 fOtherSet->removeAll(*fExtendSet); | |
3048 // Inhibit dictionary characters from being tested at all. | |
3049 + fOtherSet->removeAll(*fDictionaryCjkSet); | |
3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com
plex_Context}]"), status)); | |
3051 | |
3052 fSets->addElement(fCRSet, status); | |
3053 fSets->addElement(fLFSet, status); | |
3054 fSets->addElement(fNewlineSet, status); | |
3055 fSets->addElement(fALetterSet, status); | |
3056 - fSets->addElement(fKatakanaSet, status); | |
3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test ka
takana | |
3058 fSets->addElement(fMidLetterSet, status); | |
3059 fSets->addElement(fMidNumLetSet, status); | |
3060 fSets->addElement(fMidNumSet, status); | |
3061 @@ -3978,6 +4374,7 @@ | |
3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { | |
3063 count --; | |
3064 if (forward[count] != i) { | |
3065 + printStringBreaks(ustr, expected, expectedcount); | |
3066 test->errln("happy break test previous() failed: expected %d but go
t %d", | |
3067 forward[count], i); | |
3068 break; | |
3069 @@ -4011,23 +4408,25 @@ | |
3070 UErrorCode status = U_ZERO_ERROR; | |
3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat
us); | |
3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); | |
3073 + // Replaced any C+J characters in a row with a random sequence of character
s | |
3074 + // of the same length to make our C+J segmentation not get in the way. | |
3075 static const char *strlist[] = | |
3076 { | |
3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", | |
3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004
0\\u003b", | |
3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004
0\\u003b", | |
3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000
e0061\\u003a", | |
3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", | |
3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b", | |
3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b", | |
3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", | |
3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e
", | |
3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", | |
3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", | |
3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", | |
3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", | |
3090 "\\u2027\\U000e0067\\u0a47\\u00b7", | |
3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", | |
3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", | |
3093 "\\u0589\\U000e006e\\u0a42\\U000104a5", | |
3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", | |
3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", | |
3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", | |
3097 "\\u0027\\u11af\\U000e0057\\u0602", | |
3098 "\\U0001d7f2\\U000e007\\u0004\\u0589", | |
3099 @@ -4039,7 +4438,7 @@ | |
3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | |
3101 "\\u0233\\U000e0020\\u0a69\\u0d6a", | |
3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | |
3103 - "\\u58f4\\U000e0049\\u20e7\\u2027", | |
3104 + "\\u18f4\\U000e0049\\u20e7\\u2027", | |
3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | |
3106 "\\ua183\\u102d\\u0bec\\u003a", | |
3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | |
3108 @@ -4049,7 +4448,7 @@ | |
3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", | |
3110 "\\u003a\\u0664\\u00b7\\u1fba", | |
3111 "\\u003b\\u0027\\u00b7\\u47a3", | |
3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", | |
3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", | |
3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\
u0e51\\u1058\\U000e0058\\u00b7\\u0673", | |
3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", | |
3116 }; | |
3117 @@ -4104,12 +4503,12 @@ | |
3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589", | |
3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", | |
3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", | |
3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", | |
3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068", | |
3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | |
3124 "\\u0233\\U000e0020\\u0a69\\u0d6a", | |
3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | |
3126 "\\u58f4\\U000e0049\\u20e7\\u2027", | |
3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | |
3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | |
3129 "\\ua183\\u102d\\u0bec\\u003a", | |
3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | |
3131 "\\u003a\\u0e57\\u0fad\\u002e", | |
3132 --- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700 | |
3133 +++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800 | |
3134 @@ -70,6 +70,7 @@ | |
3135 void TestBug5775(); | |
3136 void TestThaiBreaks(); | |
3137 void TestTailoredBreaks(); | |
3138 + void TestTrieDictWithValue(); | |
3139 void TestDictRules(); | |
3140 void TestBug5532(); | |
3141 | |
3142 --- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700 | |
3143 +++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800 | |
3144 @@ -161,7 +161,23 @@ | |
3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> | |
3146 | |
3147 # Hiragana & Katakana stay together, but separates from each other and Latin. | |
3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI
NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}
\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<300>def<200>#•</data> | |
3149 +# *** what to do about theoretical combos of chars? e.g. hiragana + accent | |
3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN
ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A
}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN
A LETTER N}<300>def<200>#•</data> | |
3151 + | |
3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary
phrase in fullwidth and halfwidth | |
3153 +<data>•芽キャベツ<400>芽キャベツ<400></data> | |
3154 + | |
3155 +# more Japanese tests | |
3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana | |
3157 +# and the Katakana block are not treated correctly. Enable this later. | |
3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400
>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>で
も<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
3160 + | |
3161 +# Testing of word boundary for dictionary word containing both kanji and kana | |
3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> | |
3163 + | |
3164 +# Testing of Chinese segmentation (taken from a Chinese news article) | |
3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400
>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>
的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>
属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d
ata> | |
3166 | |
3167 # Words with interior formatting characters | |
3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat
a> | |
3169 @@ -169,6 +185,8 @@ | |
3170 # to test for bug #4097779 | |
3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> | |
3172 | |
3173 +# fullwidth numeric, midletter characters etc should be treated like their half
width counterparts | |
3174 +<data>•ISN'T<200> •19<100>日<400></data> | |
3175 | |
3176 # to test for bug #4098467 | |
3177 # What follows is a string of Korean characters (I found it in the Yellow
Pages | |
3178 @@ -178,9 +196,15 @@ | |
3179 # precomposed syllables... | |
3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua
d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1
10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1
1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> | |
3181 | |
3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200>
•</data> | |
3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters
) | |
3184 +# Disabled for now because we don't include a Korean dictionary. | |
3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2
00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> | |
3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d
d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200
> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> | |
3187 + | |
3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da
ta> | |
3189 + | |
3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data> | |
3191 | |
3192 -<data>•\u06c9\uc799\ufffa<200></data> | |
3193 | |
3194 # | |
3195 # Try some words from other scripts. | |
3196 @@ -491,8 +515,7 @@ | |
3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c
•</data> | |
3198 | |
3199 # conjoining jamo... | |
3200 -# TODO: rules update needed | |
3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\
u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\
u1100\u116d•\u1112\u116c•</data> | |
3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u
11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1
100\u116d•\u1112\u116c•</data> | |
3203 | |
3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd | |
3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> | |
3206 --- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0
800 | |
3207 +++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0
800 | |
3208 @@ -28,7 +28,7 @@ | |
3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } | |
3210 | |
3211 // aliasing using position | |
3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso
urce in another bundle | |
3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso
urce in another bundle | |
3214 | |
3215 // aliasing arrays | |
3216 zoneTests { | |
3217 --- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700 | |
3218 +++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800 | |
3219 @@ -1,6 +1,6 @@ | |
3220 /* | |
3221 ********************************************************************** | |
3222 -* Copyright (C) 2002-2009, International Business Machines | |
3223 +* Copyright (C) 2002-2010, International Business Machines | |
3224 * Corporation and others. All Rights Reserved. | |
3225 ********************************************************************** | |
3226 * | |
3227 @@ -34,12 +34,15 @@ | |
3228 #include "unicode/udata.h" | |
3229 #include "unicode/putil.h" | |
3230 | |
3231 +//#include "unicode/ustdio.h" | |
3232 + | |
3233 #include "uoptions.h" | |
3234 #include "unewdata.h" | |
3235 #include "ucmndata.h" | |
3236 #include "rbbidata.h" | |
3237 #include "triedict.h" | |
3238 #include "cmemory.h" | |
3239 +#include "uassert.h" | |
3240 | |
3241 #include <stdio.h> | |
3242 #include <stdlib.h> | |
3243 @@ -199,147 +202,191 @@ | |
3244 long wordFileSize; | |
3245 FILE *file; | |
3246 char *wordBufferC; | |
3247 - | |
3248 + MutableTrieDictionary *mtd = NULL; | |
3249 + | |
3250 file = fopen(wordFileName, "rb"); | |
3251 - if( file == 0 ) { | |
3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); | |
3253 - exit(-1); | |
3254 - } | |
3255 - fseek(file, 0, SEEK_END); | |
3256 - wordFileSize = ftell(file); | |
3257 - fseek(file, 0, SEEK_SET); | |
3258 - wordBufferC = new char[wordFileSize+10]; | |
3259 - | |
3260 - result = (long)fread(wordBufferC, 1, wordFileSize, file); | |
3261 - if (result != wordFileSize) { | |
3262 - fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); | |
3263 - exit (-1); | |
3264 - } | |
3265 - wordBufferC[wordFileSize]=0; | |
3266 - fclose(file); | |
3267 - | |
3268 - // | |
3269 - // Look for a Unicode Signature (BOM) on the word file | |
3270 - // | |
3271 - int32_t signatureLength; | |
3272 - const char * wordSourceC = wordBufferC; | |
3273 - const char* encoding = ucnv_detectUnicodeSignature( | |
3274 - wordSourceC, wordFileSize, &signatureLength, &status
); | |
3275 - if (U_FAILURE(status)) { | |
3276 - exit(status); | |
3277 - } | |
3278 - if(encoding!=NULL ){ | |
3279 - wordSourceC += signatureLength; | |
3280 - wordFileSize -= signatureLength; | |
3281 - } | |
3282 - | |
3283 - // | |
3284 - // Open a converter to take the rule file to UTF-16 | |
3285 - // | |
3286 - UConverter* conv; | |
3287 - conv = ucnv_open(encoding, &status); | |
3288 - if (U_FAILURE(status)) { | |
3289 - fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); | |
3290 - exit(status); | |
3291 - } | |
3292 - | |
3293 - // | |
3294 - // Convert the words to UChar. | |
3295 - // Preflight first to determine required buffer size. | |
3296 - // | |
3297 - uint32_t destCap = ucnv_toUChars(conv, | |
3298 - NULL, // dest, | |
3299 - 0, // destCapacity, | |
3300 - wordSourceC, | |
3301 - wordFileSize, | |
3302 - &status); | |
3303 - if (status != U_BUFFER_OVERFLOW_ERROR) { | |
3304 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
)); | |
3305 - exit(status); | |
3306 - }; | |
3307 - | |
3308 - status = U_ZERO_ERROR; | |
3309 - UChar *wordSourceU = new UChar[destCap+1]; | |
3310 - ucnv_toUChars(conv, | |
3311 - wordSourceU, // dest, | |
3312 - destCap+1, | |
3313 - wordSourceC, | |
3314 - wordFileSize, | |
3315 - &status); | |
3316 - if (U_FAILURE(status)) { | |
3317 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
)); | |
3318 - exit(status); | |
3319 - }; | |
3320 - ucnv_close(conv); | |
3321 - | |
3322 - // Get rid of the original file buffer | |
3323 - delete[] wordBufferC; | |
3324 - | |
3325 - // Create a MutableTrieDictionary, and loop through all the lines, insertin
g | |
3326 - // words. | |
3327 - | |
3328 - // First, pick a median character. | |
3329 - UChar *current = wordSourceU + (destCap/2); | |
3330 - UChar uc = *current++; | |
3331 - UnicodeSet breaks; | |
3332 - breaks.add(0x000A); // Line Feed | |
3333 - breaks.add(0x000D); // Carriage Return | |
3334 - breaks.add(0x2028); // Line Separator | |
3335 - breaks.add(0x2029); // Paragraph Separator | |
3336 - | |
3337 - do { | |
3338 - // Look for line break | |
3339 - while (uc && !breaks.contains(uc)) { | |
3340 - uc = *current++; | |
3341 - } | |
3342 - // Now skip to first non-line-break | |
3343 - while (uc && breaks.contains(uc)) { | |
3344 - uc = *current++; | |
3345 + if( file == 0 ) { //cannot find file | |
3346 + //create 1-line dummy file: ie 1 char, 1 value | |
3347 + UNewDataMemory *pData; | |
3348 + char msg[1024]; | |
3349 + | |
3350 + /* write message with just the name */ | |
3351 + sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outF
ileName); | |
3352 + fprintf(stderr, "%s\n", msg); | |
3353 + | |
3354 + UChar c = 0x0020; | |
3355 + mtd = new MutableTrieDictionary(c, status, TRUE); | |
3356 + mtd->addWord(&c, 1, status, 1); | |
3357 + | |
3358 + } else { //read words in from input file | |
3359 + fseek(file, 0, SEEK_END); | |
3360 + wordFileSize = ftell(file); | |
3361 + fseek(file, 0, SEEK_SET); | |
3362 + wordBufferC = new char[wordFileSize+10]; | |
3363 + | |
3364 + result = (long)fread(wordBufferC, 1, wordFileSize, file); | |
3365 + if (result != wordFileSize) { | |
3366 + fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); | |
3367 + exit (-1); | |
3368 } | |
3369 - } | |
3370 - while (uc && (breaks.contains(uc) || u_isspace(uc))); | |
3371 - | |
3372 - MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); | |
3373 + wordBufferC[wordFileSize]=0; | |
3374 + fclose(file); | |
3375 | |
3376 - if (U_FAILURE(status)) { | |
3377 - fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_erro
rName(status)); | |
3378 - exit(status); | |
3379 - } | |
3380 + // | |
3381 + // Look for a Unicode Signature (BOM) on the word file | |
3382 + // | |
3383 + int32_t signatureLength; | |
3384 + const char * wordSourceC = wordBufferC; | |
3385 + const char* encoding = ucnv_detectUnicodeSignature( | |
3386 + wordSourceC, wordFileSize, &signatureLength, &st
atus); | |
3387 + if (U_FAILURE(status)) { | |
3388 + exit(status); | |
3389 + } | |
3390 + if(encoding!=NULL ){ | |
3391 + wordSourceC += signatureLength; | |
3392 + wordFileSize -= signatureLength; | |
3393 + } | |
3394 | |
3395 - // Now add the words. Words are non-space characters at the beginning of | |
3396 - // lines, and must be at least one UChar. | |
3397 - current = wordSourceU; | |
3398 - UChar *candidate = current; | |
3399 - uc = *current++; | |
3400 - int32_t length = 0; | |
3401 - | |
3402 - while (uc) { | |
3403 - while (uc && !u_isspace(uc)) { | |
3404 - ++length; | |
3405 - uc = *current++; | |
3406 + // | |
3407 + // Open a converter to take the rule file to UTF-16 | |
3408 + // | |
3409 + UConverter* conv; | |
3410 + conv = ucnv_open(encoding, &status); | |
3411 + if (U_FAILURE(status)) { | |
3412 + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status
)); | |
3413 + exit(status); | |
3414 } | |
3415 - if (length > 0) { | |
3416 - mtd->addWord(candidate, length, status); | |
3417 - if (U_FAILURE(status)) { | |
3418 - fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s
\"\n", | |
3419 - u_errorName(status)); | |
3420 - exit(status); | |
3421 + | |
3422 + // | |
3423 + // Convert the words to UChar. | |
3424 + // Preflight first to determine required buffer size. | |
3425 + // | |
3426 + uint32_t destCap = ucnv_toUChars(conv, | |
3427 + NULL, // dest, | |
3428 + 0, // destCapacity, | |
3429 + wordSourceC, | |
3430 + wordFileSize, | |
3431 + &status); | |
3432 + if (status != U_BUFFER_OVERFLOW_ERROR) { | |
3433 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st
atus)); | |
3434 + exit(status); | |
3435 + }; | |
3436 + | |
3437 + status = U_ZERO_ERROR; | |
3438 + UChar *wordSourceU = new UChar[destCap+1]; | |
3439 + ucnv_toUChars(conv, | |
3440 + wordSourceU, // dest, | |
3441 + destCap+1, | |
3442 + wordSourceC, | |
3443 + wordFileSize, | |
3444 + &status); | |
3445 + if (U_FAILURE(status)) { | |
3446 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st
atus)); | |
3447 + exit(status); | |
3448 + }; | |
3449 + ucnv_close(conv); | |
3450 + | |
3451 + // Get rid of the original file buffer | |
3452 + delete[] wordBufferC; | |
3453 + | |
3454 + // Create a MutableTrieDictionary, and loop through all the lines, inse
rting | |
3455 + // words. | |
3456 + | |
3457 + // First, pick a median character. | |
3458 + UChar *current = wordSourceU + (destCap/2); | |
3459 + UChar uc = *current++; | |
3460 + UnicodeSet breaks; | |
3461 + breaks.add(0x000A); // Line Feed | |
3462 + breaks.add(0x000D); // Carriage Return | |
3463 + breaks.add(0x2028); // Line Separator | |
3464 + breaks.add(0x2029); // Paragraph Separator | |
3465 + | |
3466 + do { | |
3467 + // Look for line break | |
3468 + while (uc && !breaks.contains(uc)) { | |
3469 + uc = *current++; | |
3470 + } | |
3471 + // Now skip to first non-line-break | |
3472 + while (uc && breaks.contains(uc)) { | |
3473 + uc = *current++; | |
3474 } | |
3475 } | |
3476 - // Find beginning of next line | |
3477 - while (uc && !breaks.contains(uc)) { | |
3478 - uc = *current++; | |
3479 + while (uc && (breaks.contains(uc) || u_isspace(uc))); | |
3480 + | |
3481 + mtd = new MutableTrieDictionary(uc, status); | |
3482 + | |
3483 + if (U_FAILURE(status)) { | |
3484 + fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_
errorName(status)); | |
3485 + exit(status); | |
3486 } | |
3487 - while (uc && breaks.contains(uc)) { | |
3488 - uc = *current++; | |
3489 + | |
3490 + // Now add the words. Words are non-space characters at the beginning o
f | |
3491 + // lines, and must be at least one UChar. If a word has an associated v
alue, | |
3492 + // the value should follow the word on the same line after a tab charac
ter. | |
3493 + current = wordSourceU; | |
3494 + UChar *candidate = current; | |
3495 + uc = *current++; | |
3496 + int32_t length = 0; | |
3497 + int count = 0; | |
3498 + | |
3499 + while (uc) { | |
3500 + while (uc && !u_isspace(uc)) { | |
3501 + ++length; | |
3502 + uc = *current++; | |
3503 + } | |
3504 + | |
3505 + UnicodeString valueString; | |
3506 + UChar candidateValue; | |
3507 + if(uc == 0x0009){ //separator is a tab char, read in number after s
pace | |
3508 + while (uc && u_isspace(uc)) { | |
3509 + uc = *current++; | |
3510 + } | |
3511 + while (uc && !u_isspace(uc)) { | |
3512 + valueString.append(uc); | |
3513 + uc = *current++; | |
3514 + } | |
3515 + } | |
3516 + | |
3517 + if (length > 0) { | |
3518 + count++; | |
3519 + if(valueString.length() > 0){ | |
3520 + mtd->setValued(TRUE); | |
3521 + | |
3522 + uint32_t value = 0; | |
3523 + char* s = new char[valueString.length()]; | |
3524 + valueString.extract(0,valueString.length(), s, valueString.
length()); | |
3525 + int n = sscanf(s, "%ud", &value); | |
3526 + U_ASSERT(n == 1); | |
3527 + U_ASSERT(value >= 0); | |
3528 + mtd->addWord(candidate, length, status, (uint16_t)value); | |
3529 + delete[] s; | |
3530 + } else { | |
3531 + mtd->addWord(candidate, length, status); | |
3532 + } | |
3533 + | |
3534 + if (U_FAILURE(status)) { | |
3535 + fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error
\"%s\" at line %d in input file\n", | |
3536 + u_errorName(status), count); | |
3537 + exit(status); | |
3538 + } | |
3539 + } | |
3540 + | |
3541 + // Find beginning of next line | |
3542 + while (uc && !breaks.contains(uc)) { | |
3543 + uc = *current++; | |
3544 + } | |
3545 + // Find next non-line-breaking character | |
3546 + while (uc && breaks.contains(uc)) { | |
3547 + uc = *current++; | |
3548 + } | |
3549 + candidate = current-1; | |
3550 + length = 0; | |
3551 } | |
3552 - candidate = current-1; | |
3553 - length = 0; | |
3554 + | |
3555 + // Get rid of the Unicode text buffer | |
3556 + delete[] wordSourceU; | |
3557 } | |
3558 | |
3559 - // Get rid of the Unicode text buffer | |
3560 - delete[] wordSourceU; | |
3561 - | |
3562 // Now, create a CompactTrieDictionary from the mutable dictionary | |
3563 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); | |
3564 if (U_FAILURE(status)) { | |
3565 @@ -393,4 +440,3 @@ | |
3566 | |
3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
3568 } | |
3569 - | |
3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 | |
3571 +++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800 | |
3572 @@ -23,13 +23,13 @@ | |
3573 ## Extra files to remove for 'make clean' | |
3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES) | |
3575 | |
3576 -## Target information | |
3577 +## Target information | |
3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) | |
3579 | |
3580 ifneq ($(top_builddir),$(top_srcdir)) | |
3581 CPPFLAGS += -I$(top_builddir)/common | |
3582 endif | |
3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil | |
3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n | |
3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) | |
3586 | |
3587 OBJECTS = genctd.o | |
OLD | NEW |