icu52/patches/segmentation.patch - Issue 224943002: icu local change part1

Side by Side Diff: icu52/patches/segmentation.patch

Issue 224943002: icu local change part1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/

Patch Set: fix typos in uconfig.h and putil.patch Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 --- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800

2 +++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800

3 @@ -226,6 +226,30 @@

4 case USCRIPT_THAI:

5 engine = new ThaiBreakEngine(dict, status);

6 break;

7 +

8 + case USCRIPT_HANGUL:

9 + engine = new CjkBreakEngine(dict, kKorean, status);

10 + break;

11 +

12 + // use same BreakEngine and dictionary for both Chinese and Japanese

13 + case USCRIPT_HIRAGANA:

14 + case USCRIPT_KATAKANA:

15 + case USCRIPT_HAN:

16 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);

17 + break;

18 +#if 0

19 + // TODO: Have to get some characters with script=common handled

20 + // by CjkBreakEngine (e.g. U+309B). Simply subjecting

21 + // them to CjkBreakEngine does not work. The engine has to

22 + // special-case them.

23 + case USCRIPT_COMMON:

24 + {

25 + UBlockCode block = ublock_getCode(code);

26 + if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)

27 + engine = new CjkBreakEngine(dict, kChineseJapanese, status);

28 + break;

29 + }

30 +#endif

31 default:

32 break;

33 }

34 @@ -281,6 +305,13 @@

35 dict = NULL;

36 }

37 return dict;

38 + } else if (dictfname != NULL){

39 + //create dummy dict if dictionary filename not valid

40 + UChar c = 0x0020;

41 + status = U_ZERO_ERROR;

42 + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE) ;

43 + mtd->addWord(&c, 1, status, 1);

44 + return new CompactTrieDictionary(*mtd, status);

45 }

46 return NULL;

47 }

48 --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700

49 +++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800

50 @@ -16,6 +16,9 @@

51 #include "unicode/ubrk.h"

52 #include "uvector.h"

53 #include "triedict.h"

54 +#include "uassert.h"

55 +#include "unicode/normlzr.h"

56 +#include "cmemory.h"

57

58 U_NAMESPACE_BEGIN

59

60 @@ -422,6 +425,294 @@

61 return wordsFound;

62 }

63

64 +/*

65 + ******************************************************************

66 + * CjkBreakEngine

67 + */

68 +static const uint32_t kuint32max = 0xFFFFFFFF;

69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langu ageType type, UErrorCode &status)

70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){

71 + if (!adoptDictionary->getValued()) {

72 + status = U_ILLEGAL_ARGUMENT_ERROR;

73 + return;

74 + }

75 +

76 + // Korean dictionary only includes Hangul syllables

77 + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), sta tus);

78 + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);

79 + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\ uff9f]"), status);

80 + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status );

81 +

82 + if (U_SUCCESS(status)) {

83 + // handle Korean and Japanese/Chinese using different dictionaries

84 + if (type == kKorean) {

85 + setCharacters(fHangulWordSet);

86 + } else { //Chinese and Japanese

87 + UnicodeSet cjSet;

88 + cjSet.addAll(fHanWordSet);

89 + cjSet.addAll(fKatakanaWordSet);

90 + cjSet.addAll(fHiraganaWordSet);

91 + cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));

92 + setCharacters(cjSet);

93 + }

94 + }

95 +}

96 +

97 +CjkBreakEngine::~CjkBreakEngine(){

98 + delete fDictionary;

99 +}

100 +

101 +// The katakanaCost values below are based on the length frequencies of all

102 +// katakana phrases in the dictionary

103 +static const int kMaxKatakanaLength = 8;

104 +static const int kMaxKatakanaGroupLength = 20;

105 +static const uint32_t maxSnlp = 255;

106 +

107 +static inline uint32_t getKatakanaCost(int wordLength){

108 + //TODO: fill array with actual values from dictionary!

109 + static const uint32_t katakanaCost[kMaxKatakanaLength + 1]

110 + = {8192, 984, 408, 240, 204, 252, 300, 3 72, 480};

111 + return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];

112 +}

113 +

114 +static inline bool isKatakana(uint16_t value) {

115 + return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) \|\|

116 + (value >= 0xFF66u && value <= 0xFF9fu);

117 +}

118 +

119 +// A very simple helper class to streamline the buffer handling in

120 +// divideUpDictionaryRange.

121 +template<class T, size_t N>

122 +class AutoBuffer {

123 + public:

124 + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {

125 + if (size > N) {

126 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));

127 + capacity = size;

128 + }

129 + }

130 + ~AutoBuffer() {

131 + if (buffer != stackBuffer)

132 + uprv_free(buffer);

133 + }

134 +#if 0

135 + T* operator& () {

136 + return buffer;

137 + }

138 +#endif

139 + T* elems() {

140 + return buffer;

141 + }

142 + const T& operator[] (size_t i) const {

143 + return buffer[i];

144 + }

145 + T& operator[] (size_t i) {

146 + return buffer[i];

147 + }

148 +

149 + // resize without copy

150 + void resize(size_t size) {

151 + if (size <= capacity)

152 + return;

153 + if (buffer != stackBuffer)

154 + uprv_free(buffer);

155 + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));

156 + capacity = size;

157 + }

158 + private:

159 + T stackBuffer[N];

160 + T* buffer;

161 + AutoBuffer();

162 + size_t capacity;

163 +};

164 +

165 +

166 +/*

167 + * @param text A UText representing the text

168 + * @param rangeStart The start of the range of dictionary characters

169 + * @param rangeEnd The end of the range of dictionary characters

170 + * @param foundBreaks Output of C array of int32_t break positions, or 0

171 + * @return The number of breaks found

172 + */

173 +int32_t

174 +CjkBreakEngine::divideUpDictionaryRange( UText *text,

175 + int32_t rangeStart,

176 + int32_t rangeEnd,

177 + UStack &foundBreaks ) const {

178 + if (rangeStart >= rangeEnd) {

179 + return 0;

180 + }

181 +

182 + const size_t defaultInputLength = 80;

183 + size_t inputLength = rangeEnd - rangeStart;

184 + AutoBuffer<UChar, defaultInputLength> charString(inputLength);

185 +

186 + // Normalize the input string and put it in normalizedText.

187 + // The map from the indices of the normalized input to the raw

188 + // input is kept in charPositions.

189 + UErrorCode status = U_ZERO_ERROR;

190 + utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);

191 + if (U_FAILURE(status))

192 + return 0;

193 +

194 + UnicodeString inputString(charString.elems(), inputLength);

195 + UNormalizationMode norm_mode = UNORM_NFKC;

196 + UBool isNormalized =

197 + Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES \|\|

198 + Normalizer::isNormalized(inputString, norm_mode, status);

199 +

200 + AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);

201 + int numChars = 0;

202 + UText normalizedText = UTEXT_INITIALIZER;

203 + // Needs to be declared here because normalizedText holds onto its buffer.

204 + UnicodeString normalizedString;

205 + if (isNormalized) {

206 + int32_t index = 0;

207 + charPositions[0] = 0;

208 + while(index < inputString.length()) {

209 + index = inputString.moveIndex32(index, 1);

210 + charPositions[++numChars] = index;

211 + }

212 + utext_openUnicodeString(&normalizedText, &inputString, &status);

213 + }

214 + else {

215 + Normalizer::normalize(inputString, norm_mode, 0, normalizedString, stat us);

216 + if (U_FAILURE(status))

217 + return 0;

218 + charPositions.resize(normalizedString.length() + 1);

219 + Normalizer normalizer(charString.elems(), inputLength, norm_mode);

220 + int32_t index = 0;

221 + charPositions[0] = 0;

222 + while(index < normalizer.endIndex()){

223 + UChar32 uc = normalizer.next();

224 + charPositions[++numChars] = index = normalizer.getIndex();

225 + }

226 + utext_openUnicodeString(&normalizedText, &normalizedString, &status);

227 + }

228 +

229 + if (U_FAILURE(status))

230 + return 0;

231 +

232 + // From this point on, all the indices refer to the indices of

233 + // the normalized input string.

234 +

235 + // bestSnlp[i] is the snlp of the best segmentation of the first i

236 + // characters in the range to be matched.

237 + AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);

238 + bestSnlp[0] = 0;

239 + for(int i=1; i<=numChars; i++){

240 + bestSnlp[i] = kuint32max;

241 + }

242 +

243 + // prev[i] is the index of the last CJK character in the previous word in

244 + // the best segmentation of the first i characters.

245 + AutoBuffer<int, defaultInputLength> prev(numChars + 1);

246 + for(int i=0; i<=numChars; i++){

247 + prev[i] = -1;

248 + }

249 +

250 + const size_t maxWordSize = 20;

251 + AutoBuffer<uint16_t, maxWordSize> values(numChars);

252 + AutoBuffer<int32_t, maxWordSize> lengths(numChars);

253 +

254 + // Dynamic programming to find the best segmentation.

255 + bool is_prev_katakana = false;

256 + for (int i = 0; i < numChars; ++i) {

257 + //utext_setNativeIndex(text, rangeStart + i);

258 + utext_setNativeIndex(&normalizedText, i);

259 + if (bestSnlp[i] == kuint32max)

260 + continue;

261 +

262 + int count;

263 + // limit maximum word length matched to size of current substring

264 + int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSiz e: numChars - i;

265 +

266 + fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

267 +

268 + // if there are no single character matches found in the dictionary

269 + // starting with this character, treat character as a 1-character word

270 + // with the highest value possible, i.e. the least likely to occur.

271 + // Exclude Korean characters from this treatment, as they should be left

272 + // together by default.

273 + if((count == 0 \|\| lengths[0] != 1) &&

274 + !fHangulWordSet.contains(utext_current32(&normalizedText))){

275 + values[count] = maxSnlp;

276 + lengths[count++] = 1;

277 + }

278 +

279 + for (int j = 0; j < count; j++){

280 + //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);

281 + uint32_t newSnlp = bestSnlp[i] + values[j];

282 + if (newSnlp < bestSnlp[lengths[j] + i]) {

283 + bestSnlp[lengths[j] + i] = newSnlp;

284 + prev[lengths[j] + i] = i;

285 + }

286 + }

287 +

288 + // In Japanese,

289 + // Katakana word in single character is pretty rare. So we apply

290 + // the following heuristic to Katakana: any continuous run of Katakana

291 + // characters is considered a candidate word with a default cost

292 + // specified in the katakanaCost table according to its length.

293 + //utext_setNativeIndex(text, rangeStart + i);

294 + utext_setNativeIndex(&normalizedText, i);

295 + bool is_katakana = isKatakana(utext_current32(&normalizedText));

296 + if (!is_prev_katakana && is_katakana) {

297 + int j = i + 1;

298 + utext_next32(&normalizedText);

299 + // Find the end of the continuous run of Katakana characters

300 + while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&

301 + isKatakana(utext_current32(&normalizedText))) {

302 + utext_next32(&normalizedText);

303 + ++j;

304 + }

305 + if ((j - i) < kMaxKatakanaGroupLength) {

306 + uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);

307 + if (newSnlp < bestSnlp[j]) {

308 + bestSnlp[j] = newSnlp;

309 + prev[j] = i;

310 + }

311 + }

312 + }

313 + is_prev_katakana = is_katakana;

314 + }

315 +

316 + // Start pushing the optimal offset index into t_boundary (t for tentative).

317 + // prev[numChars] is guaranteed to be meaningful.

318 + // We'll first push in the reverse order, i.e.,

319 + // t_boundary[0] = numChars, and afterwards do a swap.

320 + AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

321 +

322 + int numBreaks = 0;

323 + // No segmentation found, set boundary to end of range

324 + if (bestSnlp[numChars] == kuint32max) {

325 + t_boundary[numBreaks++] = numChars;

326 + } else {

327 + for (int i = numChars; i > 0; i = prev[i]){

328 + t_boundary[numBreaks++] = i;

329 +

330 + }

331 + U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);

332 + }

333 +

334 + // Reverse offset index in t_boundary.

335 + // Don't add a break for the start of the dictionary range if there is one

336 + // there already.

337 + if (foundBreaks.size() == 0 \|\| foundBreaks.peeki() < rangeStart) {

338 + t_boundary[numBreaks++] = 0;

339 + }

340 +

341 + // Now that we're done, convert positions in t_boundary[] (indices in

342 + // the normalized input string) back to indices in the raw input string

343 + // while reversing t_boundary and pushing values to foundBreaks.

344 + for (int i = numBreaks-1; i >= 0; i--) {

345 + foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);

346 + }

347 +

348 + utext_close(&normalizedText);

349 + return numBreaks;

350 +}

351 +

352 U_NAMESPACE_END

353

354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

355 --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700

356 +++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800

357 @@ -1,8 +1,8 @@

358 /**

359 - ****************************************************************************** *

360 - * Copyright (C) 2006, International Business Machines Corporation and others. *

361 - * All Rights Reserved. *

362 - ****************************************************************************** *

363 + **************************************************************************** **

364 + * Copyright (C) 2006-2010, International Business Machines Corporation and others.

365 + * All Rights Reserved.

366 + **************************************************************************** **

367 */

368

369 #ifndef DICTBE_H

370 @@ -65,31 +65,31 @@

371 */

372 virtual ~DictionaryBreakEngine();

373

374 - /**

375 - * <p>Indicate whether this engine handles a particular character for

376 - * a particular kind of break.</p>

377 - *

378 - * @param c A character which begins a run that the engine might handle

379 - * @param breakType The type of text break which the caller wants to determine

380 - * @return TRUE if this engine handles the particular character and break

381 - * type.

382 - */

383 + /**

384 + * <p>Indicate whether this engine handles a particular character for

385 + * a particular kind of break.</p>

386 + *

387 + * @param c A character which begins a run that the engine might handle

388 + * @param breakType The type of text break which the caller wants to determine

389 + * @return TRUE if this engine handles the particular character and break

390 + * type.

391 + */

392 virtual UBool handles( UChar32 c, int32_t breakType ) const;

393

394 - /**

395 - * <p>Find any breaks within a run in the supplied text.</p>

396 - *

397 - * @param text A UText representing the text. The

398 - * iterator is left at the end of the run of characters which the engine

399 - * is capable of handling.

400 - * @param startPos The start of the run within the supplied text.

401 - * @param endPos The end of the run within the supplied text.

402 - * @param reverse Whether the caller is looking for breaks in a reverse

403 - * direction.

404 - * @param breakType The type of break desired, or -1.

405 - * @param foundBreaks An allocated C array of the breaks found, if any

406 - * @return The number of breaks found.

407 - */

408 + /**

409 + * <p>Find any breaks within a run in the supplied text.</p>

410 + *

411 + * @param text A UText representing the text. The iterator is left at

412 + * the end of the run of characters which the engine is capable of handling

413 + * that starts from the first (or last) character in the range.

414 + * @param startPos The start of the run within the supplied text.

415 + * @param endPos The end of the run within the supplied text.

416 + * @param reverse Whether the caller is looking for breaks in a reverse

417 + * direction.

418 + * @param breakType The type of break desired, or -1.

419 + * @param foundBreaks An allocated C array of the breaks found, if any

420 + * @return The number of breaks found.

421 + */

422 virtual int32_t findBreaks( UText *text,

423 int32_t startPos,

424 int32_t endPos,

425 @@ -114,7 +114,7 @@

426 // virtual void setBreakTypes( uint32_t breakTypes );

427

428 /**

429 - * <p>Divide up a range of known dictionary characters.</p>

430 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>

431 *

432 * @param text A UText representing the text

433 * @param rangeStart The start of the range of dictionary characters

434 @@ -171,7 +171,7 @@

435

436 protected:

437 /**

438 - * <p>Divide up a range of known dictionary characters.</p>

439 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>

440 *

441 * @param text A UText representing the text

442 * @param rangeStart The start of the range of dictionary characters

443 @@ -186,6 +186,66 @@

444

445 };

446

447 +/*******************************************************************

448 + * CjkBreakEngine

449 + */

450 +

451 +//indicates language/script that the CjkBreakEngine will handle

452 +enum LanguageType {

453 + kKorean,

454 + kChineseJapanese

455 +};

456 +

457 +/**

458 + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a

459 + * TrieWordDictionary with costs associated with each word and

460 + * Viterbi decoding to determine CJK-specific breaks.</p>

461 + */

462 +class CjkBreakEngine : public DictionaryBreakEngine {

463 + protected:

464 + /**

465 + * The set of characters handled by this engine

466 + * @internal

467 + */

468 + UnicodeSet fHangulWordSet;

469 + UnicodeSet fHanWordSet;

470 + UnicodeSet fKatakanaWordSet;

471 + UnicodeSet fHiraganaWordSet;

472 +

473 + const TrieWordDictionary *fDictionary;

474 +

475 + public:

476 +

477 + /**

478 + * <p>Default constructor.</p>

479 + *

480 + * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the

481 + * engine is deleted. The TrieWordDictionary must contain costs for each word

482 + * in order for the dictionary to work properly.

483 + */

484 + CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);

485 +

486 + /**

487 + * <p>Virtual destructor.</p>

488 + */

489 + virtual ~CjkBreakEngine();

490 +

491 + protected:

492 + /**

493 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p>

494 + *

495 + * @param text A UText representing the text

496 + * @param rangeStart The start of the range of dictionary characters

497 + * @param rangeEnd The end of the range of dictionary characters

498 + * @param foundBreaks Output of C array of int32_t break positions, or 0

499 + * @return The number of breaks found

500 + */

501 + virtual int32_t divideUpDictionaryRange( UText *text,

502 + int32_t rangeStart,

503 + int32_t rangeEnd,

504 + UStack &foundBreaks ) const;

505 +

506 +};

507

508 U_NAMESPACE_END

509

510 --- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700

511 +++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800

512 @@ -1555,10 +1555,12 @@

513 int32_t endPos,

514 UBool reverse) {

515 // Reset the old break cache first.

516 - uint32_t dictionaryCount = fDictionaryCharCount;

517 reset();

518

519 - if (dictionaryCount <= 1 \|\| (endPos - startPos) <= 1) {

520 + // note: code segment below assumes that dictionary chars are in the

521 + // startPos-endPos range

522 + // value returned should be next character in sequence

523 + if ((endPos - startPos) <= 1) {

524 return (reverse ? startPos : endPos);

525 }

526

527 @@ -1711,7 +1713,7 @@

528 // proposed break by one of the breaks we found. Use following() and

529 // preceding() to do the work. They should never recurse in this case.

530 if (reverse) {

531 - return preceding(endPos - 1);

532 + return preceding(endPos);

533 }

534 else {

535 return following(startPos);

536 --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800

537 +++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800

538 @@ -20,6 +20,7 @@

539 #include "uvector.h"

540 #include "uvectr32.h"

541 #include "uarrsort.h"

542 +#include "hash.h"

543

544 //#define DEBUG_TRIE_DICT 1

545

546 @@ -27,6 +28,11 @@

547 #include <sys/times.h>

548 #include <limits.h>

549 #include <stdio.h>

550 +#include <time.h>

551 +#ifndef CLK_TCK

552 +#define CLK_TCK CLOCKS_PER_SEC

553 +#endif

554 +

555 #endif

556

557 U_NAMESPACE_BEGIN

558 @@ -45,6 +51,11 @@

559 * MutableTrieDictionary

560 */

561

562 +//#define MAX_VALUE 65535

563 +

564 +// forward declaration

565 +inline uint16_t scaleLogProbabilities(double logprob);

566 +

567 // Node structure for the ternary, uncompressed trie

568 struct TernaryNode : public UMemory {

569 UChar ch; // UTF-16 code unit

570 @@ -77,7 +88,8 @@

571 delete high;

572 }

573

574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {

575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,

576 + UBool containsValue /* = FALSE */ ) {

577 // Start the trie off with something. Having the root node already present

578 // cuts a special case out of the search/insertion functions.

579 // Making it a median character cuts the worse case for searches from

580 @@ -91,14 +103,19 @@

581 if (U_SUCCESS(status) && fIter == NULL) {

582 status = U_MEMORY_ALLOCATION_ERROR;

583 }

584 +

585 + fValued = containsValue;

586 }

587

588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {

589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,

590 + UBool containsValue /* = false */ ) {

591 fTrie = NULL;

592 fIter = utext_openUChars(NULL, NULL, 0, &status);

593 if (U_SUCCESS(status) && fIter == NULL) {

594 status = U_MEMORY_ALLOCATION_ERROR;

595 }

596 +

597 + fValued = containsValue;

598 }

599

600 MutableTrieDictionary::~MutableTrieDictionary() {

601 @@ -108,12 +125,13 @@

602

603 int32_t

604 MutableTrieDictionary::search( UText *text,

605 - int32_t maxLength,

606 - int32_t *lengths,

607 - int &count,

608 - int limit,

609 - TernaryNode *&parent,

610 - UBool &pMatched ) const {

611 + int32_t maxLength,

612 + int32_t *lengths,

613 + int &count,

614 + int limit,

615 + TernaryNode *&parent,

616 + UBool &pMatched,

617 + uint16_t *values /*=NULL*/) const {

618 // TODO: current implementation works in UTF-16 space

619 const TernaryNode *up = NULL;

620 const TernaryNode *p = fTrie;

621 @@ -121,6 +139,10 @@

622 pMatched = TRUE;

623 int i;

624

625 + if (!fValued) {

626 + values = NULL;

627 + }

628 +

629 UChar uc = utext_current32(text);

630 for (i = 0; i < maxLength && p != NULL; ++i) {

631 while (p != NULL) {

632 @@ -141,7 +163,11 @@

633 break;

634 }

635 // Must be equal to get here

636 - if (limit > 0 && (p->flags & kEndsWord)) {

637 + if (limit > 0 && (p->flags > 0)) {

638 + //is there a more efficient way to add values? ie. remove if stmt

639 + if(values != NULL) {

640 + values[mycount] = p->flags;

641 + }

642 lengths[mycount++] = i+1;

643 --limit;

644 }

645 @@ -161,13 +187,14 @@

646 void

647 MutableTrieDictionary::addWord( const UChar *word,

648 int32_t length,

649 - UErrorCode &status ) {

650 -#if 0

651 - if (length <= 0) {

652 + UErrorCode &status,

653 + uint16_t value /* = 0 */ ) {

654 + // dictionary cannot store zero values, would interfere with flags

655 + if (length <= 0 \|\| (!fValued && value > 0) \|\| (fValued && value == 0)) {

656 status = U_ILLEGAL_ARGUMENT_ERROR;

657 return;

658 }

659 -#endif

660 +

661 TernaryNode *parent;

662 UBool pMatched;

663 int count;

664 @@ -177,7 +204,7 @@

665 matched = search(fIter, length, NULL, count, 0, parent, pMatched);

666

667 while (matched++ < length) {

668 - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?

669 + UChar32 uc = utext_next32(fIter); // TODO: supplementary support?

670 U_ASSERT(uc != U_SENTINEL);

671 TernaryNode *newNode = new TernaryNode(uc);

672 if (newNode == NULL) {

673 @@ -199,30 +226,23 @@

674 parent = newNode;

675 }

676

677 - parent->flags \|= kEndsWord;

678 -}

679 -

680 -#if 0

681 -void

682 -MutableTrieDictionary::addWords( UEnumeration *words,

683 - UErrorCode &status ) {

684 - int32_t length;

685 - const UChar *word;

686 - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {

687 - addWord(word, length, status);

688 + if(fValued && value > 0){

689 + parent->flags = value;

690 + } else {

691 + parent->flags \|= kEndsWord;

692 }

693 }

694 -#endif

695

696 int32_t

697 MutableTrieDictionary::matches( UText *text,

698 int32_t maxLength,

699 int32_t *lengths,

700 int &count,

701 - int limit ) const {

702 + int limit,

703 + uint16_t *values /*=NULL*/) const {

704 TernaryNode *parent;

705 UBool pMatched;

706 - return search(text, maxLength, lengths, count, limit, parent, pMatched);

707 + return search(text, maxLength, lengths, count, limit, parent, pMatched, val ues);

708 }

709

710 // Implementation of iteration for MutableTrieDictionary

711 @@ -277,7 +297,7 @@

712 break;

713 }

714 case kEqual:

715 - emit = (node->flags & kEndsWord) != 0;

716 + emit = node->flags > 0;

717 equal = (node->equal != NULL);

718 // If this node should be part of the next emitted string, append

719 // the UChar to the string, and make sure we pop it when we come

720 @@ -299,7 +319,7 @@

721 }

722 case kGreaterThan:

723 // If this node's character is in the string, remove it.

724 - if (node->equal != NULL \|\| (node->flags & kEndsWord)) {

725 + if (node->equal != NULL \|\| node->flags > 0) {

726 unistr.truncate(unistr.length()-1);

727 }

728 if (node->high != NULL) {

729 @@ -354,12 +374,75 @@

730 * CompactTrieDictionary

731 */

732

733 +//TODO further optimization:

734 +// minimise size of trie with logprobs by storing values

735 +// for terminal nodes directly in offsets[]

736 +// --> calculating from next offset might be simpler, but would have to add

737 +// one last offset for logprob of last node

738 +// --> if calculate from current offset, need to factor in possible overflow

739 +// as well.

740 +// idea: store in offset, set first bit to indicate logprob storage-->won't

741 +// have to access additional node

742 +

743 +// {'Dic', 1}, version 1: uses old header, no values

744 +#define COMPACT_TRIE_MAGIC_1 0x44696301

745 +// version 2: uses new header (more than 2^16 nodes), no values

746 +#define COMPACT_TRIE_MAGIC_2 0x44696302

747 +// version 3: uses new header, includes values

748 +#define COMPACT_TRIE_MAGIC_3 0x44696303

749 +

750 struct CompactTrieHeader {

751 uint32_t size; // Size of the data in bytes

752 uint32_t magic; // Magic number (including version)

753 + uint32_t nodeCount; // Number of entries in offsets[]

754 + uint32_t root; // Node number of the root node

755 + uint32_t offsets[1]; // Offsets to nodes from start of data

756 +};

757 +

758 +// old version of CompactTrieHeader kept for backwards compatibility

759 +struct CompactTrieHeaderV1 {

760 + uint32_t size; // Size of the data in bytes

761 + uint32_t magic; // Magic number (including version)

762 uint16_t nodeCount; // Number of entries in offsets[]

763 uint16_t root; // Node number of the root node

764 - uint32_t offsets[1]; // Offsets to nodes from start of data

765 + uint32_t offsets[1]; // Offsets to nodes from start of data

766 +};

767 +

768 +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1

769 +struct CompactTrieInfo {

770 + uint32_t size; // Size of the data in bytes

771 + uint32_t magic; // Magic number (including version)

772 + uint32_t nodeCount; // Number of entries in offsets[]

773 + uint32_t root; // Node number of the root node

774 + uint32_t *offsets; // Offsets to nodes from start of data

775 + uint8_t *address; // pointer to header bytes in memory

776 +

777 + CompactTrieInfo(const void *data, UErrorCode &status){

778 + CompactTrieHeader *header = (CompactTrieHeader *) data;

779 + if (header->magic != COMPACT_TRIE_MAGIC_1 &&

780 + header->magic != COMPACT_TRIE_MAGIC_2 &&

781 + header->magic != COMPACT_TRIE_MAGIC_3) {

782 + status = U_ILLEGAL_ARGUMENT_ERROR;

783 + } else {

784 + size = header->size;

785 + magic = header->magic;

786 +

787 + if (header->magic == COMPACT_TRIE_MAGIC_1) {

788 + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;

789 + nodeCount = headerV1->nodeCount;

790 + root = headerV1->root;

791 + offsets = &(headerV1->offsets[0]);

792 + address = (uint8_t *)headerV1;

793 + } else {

794 + nodeCount = header->nodeCount;

795 + root = header->root;

796 + offsets = &(header->offsets[0]);

797 + address = (uint8_t *)header;

798 + }

799 + }

800 + }

801 +

802 + ~CompactTrieInfo(){}

803 };

804

805 // Note that to avoid platform-specific alignment issues, all members of the node

806 @@ -375,10 +458,14 @@

807 enum CompactTrieNodeFlags {

808 kVerticalNode = 0x1000, // This is a vertical node

809 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word

810 - kReservedFlag1 = 0x4000,

811 - kReservedFlag2 = 0x8000,

812 + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kR eservedFlag1

813 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReserved Flag2

814 kCountMask = 0x0FFF, // The count portion of flagscount

815 - kFlagMask = 0xF000 // The flags portion of flagscount

816 + kFlagMask = 0xF000, // The flags portion of flagscount

817 + kRootCountMask = 0x7FFF // The count portion of flagscount in the r oot node

818 +

819 + //offset flags:

820 + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node

821 };

822

823 // The two node types are distinguished by the kVerticalNode flag.

824 @@ -402,63 +489,177 @@

825 uint16_t chars[1]; // Code units

826 };

827

828 -// {'Dic', 1}, version 1

829 -#define COMPACT_TRIE_MAGIC_1 0x44696301

830 -

831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,

832 UErrorCode &status )

833 : fUData(dataObj)

834 {

835 - fData = (const CompactTrieHeader *) udata_getMemory(dataObj);

836 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

837 + *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);

838 fOwnData = FALSE;

839 - if (fData->magic != COMPACT_TRIE_MAGIC_1) {

840 - status = U_ILLEGAL_ARGUMENT_ERROR;

841 - fData = NULL;

842 - }

843 }

844 +

845 CompactTrieDictionary::CompactTrieDictionary( const void *data,

846 UErrorCode &status )

847 : fUData(NULL)

848 {

849 - fData = (const CompactTrieHeader *) data;

850 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

851 + *fInfo = CompactTrieInfo(data, status);

852 fOwnData = FALSE;

853 - if (fData->magic != COMPACT_TRIE_MAGIC_1) {

854 - status = U_ILLEGAL_ARGUMENT_ERROR;

855 - fData = NULL;

856 - }

857 }

858

859 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict ,

860 UErrorCode &status )

861 : fUData(NULL)

862 {

863 - fData = compactMutableTrieDictionary(dict, status);

864 + const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status );

865 + if (U_SUCCESS(status)) {

866 + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));

867 + *fInfo = CompactTrieInfo(header, status);

868 + }

869 +

870 fOwnData = !U_FAILURE(status);

871 }

872

873 CompactTrieDictionary::~CompactTrieDictionary() {

874 if (fOwnData) {

875 - uprv_free((void *)fData);

876 + uprv_free((void *)(fInfo->address));

877 }

878 + uprv_free((void *)fInfo);

879 +

880 if (fUData) {

881 udata_close(fUData);

882 }

883 }

884

885 +UBool CompactTrieDictionary::getValued() const{

886 + return fInfo->magic == COMPACT_TRIE_MAGIC_3;

887 +}

888 +

889 uint32_t

890 CompactTrieDictionary::dataSize() const {

891 - return fData->size;

892 + return fInfo->size;

893 }

894

895 const void *

896 CompactTrieDictionary::data() const {

897 - return fData;

898 + return fInfo->address;

899 +}

900 +

901 +//This function finds the address of a node for us, given its node ID

902 +static inline const CompactTrieNode *

903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {

904 + if(node < info->root-1) {

905 + return (const CompactTrieNode *)(&info->offsets[node]);

906 + } else {

907 + return (const CompactTrieNode *)(info->address + info->offsets[node]);

908 + }

909 }

910

911 -// This function finds the address of a node for us, given its node ID

912 +//this version of getCompactNode is currently only used in compactMutableTrieDictionary()

913 static inline const CompactTrieNode *

914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {

915 - return (const CompactTrieNode )((const uint8_t )header + header->offsets[ node]);

916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {

917 + if(node < header->root-1) {

918 + return (const CompactTrieNode *)(&header->offsets[node]);

919 + } else {

920 + return (const CompactTrieNode )((const uint8_t )header + header->offs ets[node]);

921 + }

922 +}

923 +

924 +

925 +/**

926 + * Calculates the number of links in a node

927 + * @param node The specified node

928 + */

929 +static inline const uint16_t

930 +getCount(const CompactTrieNode *node){

931 + return (node->flagscount & kCountMask);

932 + //use the code below if number of links ever exceed 4096

933 + //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);

934 +}

935 +

936 +/**

937 + * Calculates the equal link node ID of a vertical node.

938 + * @param vnode The vertical node containing the equal link

939 + * When kEqualOverflows is set, the upper 16 bits of the node ID are read

940 + * from the uint16_t stored immediately after vnode->chars[].

941 + */

942 +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){

943 + if(vnode->flagscount & kEqualOverflows){

944 + // treat overflow bits as an extension of chars[]

945 + uint16_t overflow = (uint16_t ) &vnode->chars[getCount((CompactTrieNo de*)vnode)];

946 + return vnode->equal + (((uint32_t)*overflow) << 16);

947 + }else{

948 + return vnode->equal;

949 + }

950 +}

951 +

952 +/**

953 + * calculates an equal link node ID of a horizontal node

954 + * @param hnode The horizontal node containing the equal link

955 + * @param index The index into hnode->entries[]

956 + * @param nodeCount The length of hnode->entries[]

957 + */

958 +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, ui nt16_t index, uint16_t nodeCount){

959 + if(hnode->flagscount & kEqualOverflows){

960 + //set overflow to point to the uint16_t containing the overflow bits

961 + uint16_t overflow = (uint16_t ) &hnode->entries[nodeCount];

962 + overflow += index/4;

963 + uint16_t extraBits = (overflow >> (3 - (index % 4)) 4) % 0x10;

964 + return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);

965 + } else {

966 + return hnode->entries[index].equal;

967 + }

968 +}

969 +

970 +/**

971 + * Returns the value stored in the specified node which is associated with its

972 + * parent node.

973 + * TODO: how to tell that value is stored in node or in offset? check whether

974 + * node ID < fInfo->root!

975 + */

976 +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){

977 + uint16_t count = getCount((CompactTrieNode *)hnode);

978 + uint16_t overflowSize = 0; //size of node ID overflow storage in bytes

979 +

980 + if(hnode->flagscount & kEqualOverflows)

981 + overflowSize = (count + 3) / 4 * sizeof(uint16_t);

982 + return ((uint16_t )((uint8_t *)&hnode->entries[count] + overflowSize));

983 +}

984 +

985 +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){

986 + // calculate size of total node ID overflow storage in bytes

987 + uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint1 6_t) : 0;

988 + return ((uint16_t )((uint8_t )&vnode->chars[getCount((CompactTrieNode ) vnode)] + overflowSize));

989 +}

990 +

991 +static inline uint16_t getValue(const CompactTrieNode *node){

992 + if(node->flagscount & kVerticalNode)

993 + return getValue((const CompactTrieVerticalNode *)node);

994 + else

995 + return getValue((const CompactTrieHorizontalNode *)node);

996 +}

997 +

998 +//returns index of match in CompactTrieHorizontalNode.entries[] using binary search

999 +inline int16_t

1000 +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,

1001 + UChar uc, uint16_t nodeCount){

1002 + int low = 0;

1003 + int high = nodeCount-1;

1004 + int middle;

1005 + while (high >= low) {

1006 + middle = (high+low)/2;

1007 + if (uc == entries[middle].ch) {

1008 + return middle;

1009 + }

1010 + else if (uc < entries[middle].ch) {

1011 + high = middle-1;

1012 + }

1013 + else {

1014 + low = middle+1;

1015 + }

1016 + }

1017 +

1018 + return -1;

1019 }

1020

1021 int32_t

1022 @@ -466,17 +667,38 @@

1023 int32_t maxLength,

1024 int32_t *lengths,

1025 int &count,

1026 - int limit ) const {

1027 + int limit,

1028 + uint16_t values /= NULL*/) const {

1029 + if (fInfo->magic == COMPACT_TRIE_MAGIC_2)

1030 + values = NULL;

1031 +

1032 // TODO: current implementation works in UTF-16 space

1033 - const CompactTrieNode *node = getCompactNode(fData, fData->root);

1034 + const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);

1035 int mycount = 0;

1036

1037 UChar uc = utext_current32(text);

1038 int i = 0;

1039

1040 + // handle root node with only kEqualOverflows flag: assume horizontal node without parent

1041 + if(node != NULL){

1042 + const CompactTrieHorizontalNode root = (const CompactTrieHorizontalNod e ) node;

1043 + int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);

1044 + if(index > -1){

1045 + node = getCompactNode(fInfo, calcEqualLink(root, index, root->flags count & kRootCountMask));

1046 + utext_next32(text);

1047 + uc = utext_current32(text);

1048 + ++i;

1049 + }else{

1050 + node = NULL;

1051 + }

1052 + }

1053 +

1054 while (node != NULL) {

1055 // Check if the node we just exited ends a word

1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) {

1057 + if(values != NULL){

1058 + values[mycount] = getValue(node);

1059 + }

1060 lengths[mycount++] = i;

1061 --limit;

1062 }

1063 @@ -487,7 +709,7 @@

1064 break;

1065 }

1066

1067 - int nodeCount = (node->flagscount & kCountMask);

1068 + int nodeCount = getCount(node);

1069 if (nodeCount == 0) {

1070 // Special terminal node; return now

1071 break;

1072 @@ -507,35 +729,27 @@

1073 // To get here we must have come through the whole list successfully;

1074 // go on to the next node. Note that a word cannot end in the middle

1075 // of a vertical node.

1076 - node = getCompactNode(fData, vnode->equal);

1077 + node = getCompactNode(fInfo, calcEqualLink(vnode));

1078 }

1079 else {

1080 // Horizontal node; do binary search

1081 const CompactTrieHorizontalNode hnode = (const CompactTrieHorizont alNode )node;

1082 - int low = 0;

1083 - int high = nodeCount-1;

1084 - int middle;

1085 - node = NULL; // If we don't find a match, we'll fall out of the loop

1086 - while (high >= low) {

1087 - middle = (high+low)/2;

1088 - if (uc == hnode->entries[middle].ch) {

1089 - // We hit a match; get the next node and next character

1090 - node = getCompactNode(fData, hnode->entries[middle].equal);

1091 - utext_next32(text);

1092 - uc = utext_current32(text);

1093 - ++i;

1094 - break;

1095 - }

1096 - else if (uc < hnode->entries[middle].ch) {

1097 - high = middle-1;

1098 - }

1099 - else {

1100 - low = middle+1;

1101 - }

1102 + const CompactTrieHorizontalEntry *entries;

1103 + entries = hnode->entries;

1104 +

1105 + int index = searchHorizontalEntries(entries, uc, nodeCount);

1106 + if(index > -1){ //

1107 + // We hit a match; get the next node and next character

1108 + node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCo unt));

1109 + utext_next32(text);

1110 + uc = utext_current32(text);

1111 + ++i;

1112 + }else{

1113 + node = NULL; // If we don't find a match, we'll fall out of the loop

1114 }

1115 }

1116 }

1117 -exit:

1118 + exit:

1119 count = mycount;

1120 return i;

1121 }

1122 @@ -545,16 +759,16 @@

1123 private:

1124 UVector32 fNodeStack; // Stack of nodes to process

1125 UVector32 fIndexStack; // Stack of where in node we are

1126 - const CompactTrieHeader *fHeader; // Trie data

1127 + const CompactTrieInfo *fInfo; // Trie data

1128

1129 public:

1130 static UClassID U_EXPORT2 getStaticClassID(void);

1131 virtual UClassID getDynamicClassID(void) const;

1132 public:

1133 - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)

1134 + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)

1135 : fNodeStack(status), fIndexStack(status) {

1136 - fHeader = header;

1137 - fNodeStack.push(header->root, status);

1138 + fInfo = info;

1139 + fNodeStack.push(info->root, status);

1140 fIndexStack.push(0, status);

1141 unistr.remove();

1142 }

1143 @@ -564,14 +778,14 @@

1144

1145 virtual StringEnumeration *clone() const {

1146 UErrorCode status = U_ZERO_ERROR;

1147 - return new CompactTrieEnumeration(fHeader, status);

1148 + return new CompactTrieEnumeration(fInfo, status);

1149 }

1150

1151 virtual const UnicodeString * snext(UErrorCode &status);

1152

1153 // Very expensive, but this should never be used.

1154 virtual int32_t count(UErrorCode &status) const {

1155 - CompactTrieEnumeration counter(fHeader, status);

1156 + CompactTrieEnumeration counter(fInfo, status);

1157 int32_t result = 0;

1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) {

1159 ++result;

1160 @@ -582,7 +796,7 @@

1161 virtual void reset(UErrorCode &status) {

1162 fNodeStack.removeAllElements();

1163 fIndexStack.removeAllElements();

1164 - fNodeStack.push(fHeader->root, status);

1165 + fNodeStack.push(fInfo->root, status);

1166 fIndexStack.push(0, status);

1167 unistr.remove();

1168 }

1169 @@ -595,26 +809,34 @@

1170 if (fNodeStack.empty() \|\| U_FAILURE(status)) {

1171 return NULL;

1172 }

1173 - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());

1174 + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());

1175 int where = fIndexStack.peeki();

1176 while (!fNodeStack.empty() && U_SUCCESS(status)) {

1177 - int nodeCount = (node->flagscount & kCountMask);

1178 + int nodeCount;

1179 +

1180 + bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);

1181 + if(isRoot){

1182 + nodeCount = node->flagscount & kRootCountMask;

1183 + } else {

1184 + nodeCount = getCount(node);

1185 + }

1186 +

1187 UBool goingDown = FALSE;

1188 if (nodeCount == 0) {

1189 // Terminal node; go up immediately

1190 fNodeStack.popi();

1191 fIndexStack.popi();

1192 - node = getCompactNode(fHeader, fNodeStack.peeki());

1193 + node = getCompactNode(fInfo, fNodeStack.peeki());

1194 where = fIndexStack.peeki();

1195 }

1196 - else if (node->flagscount & kVerticalNode) {

1197 + else if ((node->flagscount & kVerticalNode) && !isRoot) {

1198 // Vertical node

1199 const CompactTrieVerticalNode vnode = (const CompactTrieVerticalNo de )node;

1200 if (where == 0) {

1201 // Going down

1202 - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount) ;

1203 + unistr.append((const UChar *)vnode->chars, nodeCount);

1204 fIndexStack.setElementAt(1, fIndexStack.size()-1);

1205 - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, st atus));

1206 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnod e), status));

1207 where = fIndexStack.push(0, status);

1208 goingDown = TRUE;

1209 }

1210 @@ -623,7 +845,7 @@

1211 unistr.truncate(unistr.length()-nodeCount);

1212 fNodeStack.popi();

1213 fIndexStack.popi();

1214 - node = getCompactNode(fHeader, fNodeStack.peeki());

1215 + node = getCompactNode(fInfo, fNodeStack.peeki());

1216 where = fIndexStack.peeki();

1217 }

1218 }

1219 @@ -638,7 +860,7 @@

1220 // Push on next node

1221 unistr.append((UChar)hnode->entries[where].ch);

1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);

1223 - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[w here].equal, status));

1224 + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnod e, where, nodeCount), status));

1225 where = fIndexStack.push(0, status);

1226 goingDown = TRUE;

1227 }

1228 @@ -646,12 +868,14 @@

1229 // Going up

1230 fNodeStack.popi();

1231 fIndexStack.popi();

1232 - node = getCompactNode(fHeader, fNodeStack.peeki());

1233 + node = getCompactNode(fInfo, fNodeStack.peeki());

1234 where = fIndexStack.peeki();

1235 }

1236 }

1237 +

1238 // Check if the parent of the node we've just gone down to ends a

1239 // word. If so, return it.

1240 + // The root node should never end up here.

1241 if (goingDown && (node->flagscount & kParentEndsWord)) {

1242 return &unistr;

1243 }

1244 @@ -664,7 +888,7 @@

1245 if (U_FAILURE(status)) {

1246 return NULL;

1247 }

1248 - return new CompactTrieEnumeration(fData, status);

1249 + return new CompactTrieEnumeration(fInfo, status);

1250 }

1251

1252 //

1253 @@ -672,21 +896,36 @@

1254 // and back again

1255 //

1256

1257 -// Helper classes to construct the compact trie

1258 +enum CompactTrieNodeType {

1259 + kHorizontalType = 0,

1260 + kVerticalType = 1,

1261 + kValueType = 2

1262 +};

1263 +

1264 +/**

1265 + * The following classes (i.e. BuildCompactTrie*Node) are helper classes to

1266 + * construct the compact trie by storing information for each node and later

1267 + * writing the node to memory in a sequential format.

1268 + */

1269 class BuildCompactTrieNode: public UMemory {

1270 - public:

1271 +public:

1272 UBool fParentEndsWord;

1273 - UBool fVertical;

1274 + CompactTrieNodeType fNodeType;

1275 UBool fHasDuplicate;

1276 + UBool fEqualOverflows;

1277 int32_t fNodeID;

1278 UnicodeString fChars;

1279 + uint16_t fValue;

1280

1281 - public:

1282 - BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, U ErrorCode &status) {

1283 +public:

1284 + BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,

1285 + UStack &nodes, UErrorCode &status, uint16_t value = 0) {

1286 fParentEndsWord = parentEndsWord;

1287 fHasDuplicate = FALSE;

1288 - fVertical = vertical;

1289 + fNodeType = nodeType;

1290 + fEqualOverflows = FALSE;

1291 fNodeID = nodes.size();

1292 + fValue = parentEndsWord? value : 0;

1293 nodes.push(this, status);

1294 }

1295

1296 @@ -694,87 +933,225 @@

1297 }

1298

1299 virtual uint32_t size() {

1300 - return sizeof(uint16_t);

1301 + if(fValue > 0)

1302 + return sizeof(uint16_t) * 2;

1303 + else

1304 + return sizeof(uint16_t);

1305 }

1306

1307 virtual void write(uint8_t bytes, uint32_t &offset, const UVector32 &/tra nslate*/) {

1308 // Write flag/count

1309 - ((uint16_t )(bytes+offset)) = (fChars.length() & kCountMask)

1310 - \| (fVertical ? kVerticalNode : 0) \| (fParentEndsWord ? kParentEndsW ord : 0 );

1311 +

1312 + // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be

1313 + // used as a 5th MSB.

1314 + U_ASSERT(fChars.length() < 4096 \|\| fNodeID == 2);

1315 +

1316 + ((uint16_t )(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) \|

1317 + ((fNodeID == 2)? (fChars.length() & kRootCountMask):

1318 + (

1319 + (fChars.length() & kCountMask) \|

1320 + //((fChars.length() << 2) & kExceedsCount) \|

1321 + (fNodeType == kVerticalType ? kVerticalNode : 0) \|

1322 + (fParentEndsWord ? kParentEndsWord : 0 )

1323 + )

1324 + );

1325 offset += sizeof(uint16_t);

1326 }

1327 +

1328 + virtual void writeValue(uint8_t *bytes, uint32_t &offset) {

1329 + if(fValue > 0){

1330 + ((uint16_t )(bytes+offset)) = fValue;

1331 + offset += sizeof(uint16_t);

1332 + }

1333 + }

1334 +

1335 +};

1336 +

1337 +/**

1338 + * Stores value of parent terminating nodes that have no more subtries.

1339 + */

1340 +class BuildCompactTrieValueNode: public BuildCompactTrieNode {

1341 +public:

1342 + BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value )

1343 + : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){

1344 + }

1345 +

1346 + virtual ~BuildCompactTrieValueNode(){

1347 + }

1348 +

1349 + virtual uint32_t size() {

1350 + return sizeof(uint16_t) * 2;

1351 + }

1352 +

1353 + virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {

1354 + // don't write value directly to memory but store it in offset to be written later

1355 + //offset = fValue & kOffsetContainsValue;

1356 + BuildCompactTrieNode::write(bytes, offset, translate);

1357 + BuildCompactTrieNode::writeValue(bytes, offset);

1358 + }

1359 };

1360

1361 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {

1362 public:

1363 UStack fLinks;

1364 + UBool fMayOverflow; //intermediate value for fEqualOverflows

1365

1366 public:

1367 - BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC ode &status)

1368 - : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(st atus) {

1369 + BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorC ode &status, uint16_t value = 0)

1370 + : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, valu e), fLinks(status) {

1371 + fMayOverflow = FALSE;

1372 }

1373

1374 virtual ~BuildCompactTrieHorizontalNode() {

1375 }

1376

1377 + // It is impossible to know beforehand exactly how much space the node will

1378 + // need in memory before being written, because the node IDs in the equal

1379 + // links may or may not overflow after node coalescing. Therefore, this method

1380 + // returns the maximum size possible for the node.

1381 virtual uint32_t size() {

1382 - return offsetof(CompactTrieHorizontalNode,entries) +

1383 - (fChars.length()*sizeof(CompactTrieHorizontalEntry));

1384 + uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +

1385 + (fChars.length()*sizeof(CompactTrieHorizontalEntry));

1386 +

1387 + if(fValue > 0)

1388 + estimatedSize += sizeof(uint16_t);

1389 +

1390 + //estimate extra space needed to store overflow for node ID links

1391 + //may be more than what is actually needed

1392 + for(int i=0; i < fChars.length(); i++){

1393 + if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){

1394 + fMayOverflow = TRUE;

1395 + break;

1396 + }

1397 + }

1398 + if(fMayOverflow) // added space for overflow should be ceil(fChars.length()/4) * sizeof(uint16_t); NOTE(review): the expression below yields less when length % 4 is 1 or 2 - confirm

1399 + estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;

1400 +

1401 + return estimatedSize;

1402 }

1403

1404 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {

1405 - BuildCompactTrieNode::write(bytes, offset, translate);

1406 int32_t count = fChars.length();

1407 +

1408 + //if largest nodeID > 2^16, set flag

1409 + //large node IDs are more likely to be at the back of the array

1410 + for (int32_t i = count-1; i >= 0; --i) {

1411 + if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeI D) > 0xFFFF){

1412 + fEqualOverflows = TRUE;

1413 + break;

1414 + }

1415 + }

1416 +

1417 + BuildCompactTrieNode::write(bytes, offset, translate);

1418 +

1419 + // write entries[] to memory

1420 for (int32_t i = 0; i < count; ++i) {

1421 CompactTrieHorizontalEntry entry = (CompactTrieHorizontalEntry )( bytes+offset);

1422 entry->ch = fChars[i];

1423 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks [i])->fNodeID);

1424 #ifdef DEBUG_TRIE_DICT

1425 - if (entry->equal == 0) {

1426 +

1427 + if ((entry->equal == 0) && !fEqualOverflows) {

1428 fprintf(stderr, "ERROR: horizontal link %d, logical node %d map s to physical node zero\n",

1429 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);

1430 }

1431 #endif

1432 offset += sizeof(CompactTrieHorizontalEntry);

1433 }

1434 +

1435 + // append extra bits of equal nodes to end if fEqualOverflows

1436 + if (fEqualOverflows) {

1437 + uint16_t leftmostBits = 0;

1438 + for (int16_t i = 0; i < count; i++) {

1439 + leftmostBits = (leftmostBits << 4) \| getLeftmostBits(translate, i);

1440 +

1441 + // write filled uint16_t to memory

1442 + if(i % 4 == 3){

1443 + ((uint16_t )(bytes+offset)) = leftmostBits;

1444 + leftmostBits = 0;

1445 + offset += sizeof(uint16_t);

1446 + }

1447 + }

1448 +

1449 + // pad last uint16_t with zeroes if necessary

1450 + int remainder = count % 4;

1451 + if (remainder > 0) {

1452 + ((uint16_t )(bytes+offset)) = (leftmostBits << (16 - 4 * rema inder));

1453 + offset += sizeof(uint16_t);

1454 + }

1455 + }

1456 +

1457 + BuildCompactTrieNode::writeValue(bytes, offset);

1458 + }

1459 +

1460 + // returns leftmost bits of physical node link

1461 + uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){

1462 + uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompact TrieNode *)fLinks[i])->fNodeID) >> 16);

1463 +#ifdef DEBUG_TRIE_DICT

1464 + if (leftmostBits > 0xF) {

1465 + fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",

1466 + i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);

1467 + }

1468 +#endif

1469 + return leftmostBits;

1470 }

1471

1472 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {

1473 fChars.append(ch);

1474 fLinks.push(link, status);

1475 }

1476 +

1477 };

1478

1479 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {

1480 - public:

1481 +public:

1482 BuildCompactTrieNode *fEqual;

1483

1484 - public:

1485 - BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod e &status)

1486 - : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {

1487 +public:

1488 + BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCod e &status, uint16_t value = 0)

1489 + : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {

1490 fEqual = NULL;

1491 }

1492

1493 virtual ~BuildCompactTrieVerticalNode() {

1494 }

1495

1496 + // Returns the maximum possible size of this node. See comment in

1497 + // BuildCompactTrieHorizontalNode for more information.

1498 virtual uint32_t size() {

1499 - return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeo f(uint16_t));

1500 + uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCh ars.length()*sizeof(uint16_t));

1501 + if(fValue > 0){

1502 + estimatedSize += sizeof(uint16_t);

1503 + }

1504 +

1505 + if(fEqual->fNodeID > 0xFFFF){

1506 + estimatedSize += sizeof(uint16_t);

1507 + }

1508 + return estimatedSize;

1509 }

1510

1511 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &trans late) {

1512 CompactTrieVerticalNode node = (CompactTrieVerticalNode )(bytes+offse t);

1513 + fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);

1514 BuildCompactTrieNode::write(bytes, offset, translate);

1515 node->equal = translate.elementAti(fEqual->fNodeID);

1516 offset += sizeof(node->equal);

1517 #ifdef DEBUG_TRIE_DICT

1518 - if (node->equal == 0) {

1519 + if ((node->equal == 0) && !fEqualOverflows) {

1520 fprintf(stderr, "ERROR: vertical link, logical node %d maps to phys ical node zero\n",

1521 fEqual->fNodeID);

1522 }

1523 #endif

1524 fChars.extract(0, fChars.length(), (UChar *)node->chars);

1525 - offset += sizeof(uint16_t)*fChars.length();

1526 + offset += sizeof(UChar)*fChars.length();

1527 +

1528 + // append the upper 16 bits of the equal node ID to the end if fEqualOverflows

1529 + if (fEqualOverflows) {

1530 + ((uint16_t )(bytes+offset)) = (translate.elementAti(fEqual->fNode ID) >> 16);

1531 + offset += sizeof(uint16_t);

1532 + }

1533 +

1534 + BuildCompactTrieNode::writeValue(bytes, offset);

1535 }

1536

1537 void addChar(UChar ch) {

1538 @@ -784,60 +1161,85 @@

1539 void setLink(BuildCompactTrieNode *node) {

1540 fEqual = node;

1541 }

1542 +

1543 };

1544

1545 // Forward declaration

1546 static void walkHorizontal(const TernaryNode *node,

1547 BuildCompactTrieHorizontalNode *building,

1548 UStack &nodes,

1549 - UErrorCode &status);

1550 + UErrorCode &status,

1551 + Hashtable *values);

1552

1553 -// Convert one node. Uses recursion.

1554 +// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.

1555

1556 static BuildCompactTrieNode *

1557 -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UE rrorCode &status) {

1558 +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,

1559 + UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {

1560 if (U_FAILURE(status)) {

1561 return NULL;

1562 }

1563 BuildCompactTrieNode *result = NULL;

1564 UBool horizontal = (node->low != NULL \|\| node->high != NULL);

1565 if (horizontal) {

1566 - BuildCompactTrieHorizontalNode *hResult =

1567 - new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, statu s);

1568 + BuildCompactTrieHorizontalNode *hResult;

1569 + if(values != NULL){

1570 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);

1571 + } else {

1572 + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);

1573 + }

1574 +

1575 if (hResult == NULL) {

1576 status = U_MEMORY_ALLOCATION_ERROR;

1577 return NULL;

1578 }

1579 if (U_SUCCESS(status)) {

1580 - walkHorizontal(node, hResult, nodes, status);

1581 + walkHorizontal(node, hResult, nodes, status, values);

1582 result = hResult;

1583 }

1584 }

1585 else {

1586 - BuildCompactTrieVerticalNode *vResult =

1587 - new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status) ;

1588 + BuildCompactTrieVerticalNode *vResult;

1589 + if(values != NULL){

1590 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s tatus, parentValue);

1591 + } else {

1592 + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, s tatus);

1593 + }

1594 +

1595 if (vResult == NULL) {

1596 status = U_MEMORY_ALLOCATION_ERROR;

1597 + return NULL;

1598 }

1599 else if (U_SUCCESS(status)) {

1600 - UBool endsWord = FALSE;

1601 + uint16_t value = 0;

1602 + UBool endsWord = FALSE;

1603 // Take up nodes until we end a word, or hit a node with < or > lin ks

1604 do {

1605 vResult->addChar(node->ch);

1606 - endsWord = (node->flags & kEndsWord) != 0;

1607 + value = node->flags;

1608 + endsWord = value > 0;

1609 node = node->equal;

1610 }

1611 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);

1612 +

1613 if (node == NULL) {

1614 if (!endsWord) {

1615 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie

1616 }

1617 - else {

1618 + else if(values != NULL){

1619 + UnicodeString key(value); //store value as a single-char Un icodeString

1620 + BuildCompactTrieValueNode link = (BuildCompactTrieValueNod e ) values->get(key);

1621 + if(link == NULL){

1622 + link = new BuildCompactTrieValueNode(nodes, status, val ue); //take out nodes?

1623 + values->put(key, link, status);

1624 + }

1625 + vResult->setLink(link);

1626 + } else {

1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]);

1628 }

1629 }

1630 else {

1631 - vResult->setLink(compactOneNode(node, endsWord, nodes, status)) ;

1632 + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));

1633 }

1634 result = vResult;

1635 }

1636 @@ -849,19 +1251,28 @@

1637 // Uses recursion.

1638

1639 static void walkHorizontal(const TernaryNode *node,

1640 - BuildCompactTrieHorizontalNode *building,

1641 - UStack &nodes,

1642 - UErrorCode &status) {

1643 + BuildCompactTrieHorizontalNode *building,

1644 + UStack &nodes,

1645 + UErrorCode &status, Hashtable *values = NULL) {

1646 while (U_SUCCESS(status) && node != NULL) {

1647 if (node->low != NULL) {

1648 - walkHorizontal(node->low, building, nodes, status);

1649 + walkHorizontal(node->low, building, nodes, status, values);

1650 }

1651 BuildCompactTrieNode *link = NULL;

1652 if (node->equal != NULL) {

1653 - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);

1654 + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);

1655 }

1656 - else if (node->flags & kEndsWord) {

1657 - link = (BuildCompactTrieNode *)nodes[1];

1658 + else if (node->flags > 0) {

1659 + if(values != NULL) {

1660 + UnicodeString key(node->flags); //store value as a single-char UnicodeString

1661 + link = (BuildCompactTrieValueNode *) values->get(key);

1662 + if(link == NULL) {

1663 + link = new BuildCompactTrieValueNode(nodes, status, node->f lags); //take out nodes?

1664 + values->put(key, link, status);

1665 + }

1666 + } else {

1667 + link = (BuildCompactTrieNode *)nodes[1];

1668 + }

1669 }

1670 if (U_SUCCESS(status) && link != NULL) {

1671 building->addNode(node->ch, link, status);

1672 @@ -881,13 +1292,15 @@

1673 _sortBuildNodes(const void * /context/, const void voidl, const void voidr) {

1674 BuildCompactTrieNode left = (BuildCompactTrieNode **)voidl;

1675 BuildCompactTrieNode right = (BuildCompactTrieNode **)voidr;

1676 +

1677 // Check for comparing a node to itself, to avoid spurious duplicates

1678 if (left == right) {

1679 return 0;

1680 }

1681 +

1682 // Most significant is type of node. Can never coalesce.

1683 - if (left->fVertical != right->fVertical) {

1684 - return left->fVertical - right->fVertical;

1685 + if (left->fNodeType != right->fNodeType) {

1686 + return left->fNodeType - right->fNodeType;

1687 }

1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.

1689 if (left->fParentEndsWord != right->fParentEndsWord) {

1690 @@ -898,12 +1311,19 @@

1691 if (result != 0) {

1692 return result;

1693 }

1694 +

1695 + // If the node value differs, we should not coalesce.

1696 + // If values aren't stored, all fValues should be 0.

1697 + if (left->fValue != right->fValue) {

1698 + return left->fValue - right->fValue;

1699 + }

1700 +

1701 // We know they're both the same node type, so branch for the two cases.

1702 - if (left->fVertical) {

1703 + if (left->fNodeType == kVerticalType) {

1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID

1705 - - ((BuildCompactTrieVerticalNode *)right)->fEqual-> fNodeID;

1706 + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;

1707 }

1708 - else {

1709 + else if(left->fChars.length() > 0 && right->fChars.length() > 0){

1710 // We need to compare the links vectors. They should be the

1711 // same size because the strings were equal.

1712 // We compare the node IDs instead of the pointers, to handle

1713 @@ -914,9 +1334,10 @@

1714 int32_t count = hleft->fLinks.size();

1715 for (int32_t i = 0; i < count && result == 0; ++i) {

1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -

1717 - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

1718 + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;

1719 }

1720 }

1721 +

1722 // If they are equal to each other, mark them (speeds coalescing)

1723 if (result == 0) {

1724 left->fHasDuplicate = TRUE;

1725 @@ -1031,20 +1452,25 @@

1726 // Add node 0, used as the NULL pointer/sentinel.

1727 nodes.addElement((int32_t)0, status);

1728

1729 + Hashtable *values = NULL; // Index of (unique) values

1730 + if (dict.fValued) {

1731 + values = new Hashtable(status);

1732 + }

1733 +

1734 // Start by creating the special empty node we use to indicate that the parent

1735 // terminates a word. This must be node 1, because the builder assumes

1736 - // that.

1737 + // that. This node will never be used for tries storing numerical values.

1738 if (U_FAILURE(status)) {

1739 return NULL;

1740 }

1741 - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, node s, status);

1742 + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontal Type, nodes, status);

1743 if (terminal == NULL) {

1744 status = U_MEMORY_ALLOCATION_ERROR;

1745 }

1746

1747 // This call does all the work of building the new trie structure. The root

1748 - // will be node 2.

1749 - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s);

1750 + // will have node ID 2 before writing to memory.

1751 + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, statu s, values);

1752 #ifdef DEBUG_TRIE_DICT

1753 (void) ::times(&timing);

1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",

1755 @@ -1077,21 +1503,37 @@

1756 return NULL;

1757 }

1758

1759 + //map terminal value nodes

1760 + int valueCount = 0;

1761 + UVector valueNodes(status);

1762 + if(values != NULL) {

1763 + valueCount = values->count(); //number of unique terminal value nodes

1764 + }

1765 +

1766 + // map non-terminal nodes

1767 + int valuePos = 1;//, nodePos = valueCount + valuePos;

1768 + nodeCount = valueCount + valuePos;

1769 for (i = 1; i < count; ++i) {

1770 node = (BuildCompactTrieNode *)nodes[i];

1771 if (node->fNodeID == i) {

1772 // Only one node out of each duplicate set is used

1773 - if (i >= translate.size()) {

1774 + if (node->fNodeID >= translate.size()) {

1775 // Logically extend the mapping table

1776 - translate.setSize(i+1);

1777 + translate.setSize(i + 1);

1778 + }

1779 + //translate.setElementAt(object, index)!

1780 + if(node->fNodeType == kValueType) {

1781 + valueNodes.addElement(node, status);

1782 + translate.setElementAt(valuePos++, i);

1783 + } else {

1784 + translate.setElementAt(nodeCount++, i);

1785 }

1786 - translate.setElementAt(nodeCount++, i);

1787 totalSize += node->size();

1788 }

1789 }

1790 -

1791 - // Check for overflowing 16 bits worth of nodes.

1792 - if (nodeCount > 0x10000) {

1793 +

1794 + // Check for overflowing 20 bits worth of nodes.

1795 + if (nodeCount > 0x100000) {

1796 status = U_ILLEGAL_ARGUMENT_ERROR;

1797 return NULL;

1798 }

1799 @@ -1111,9 +1553,14 @@

1800 status = U_MEMORY_ALLOCATION_ERROR;

1801 return NULL;

1802 }

1803 -

1804 +

1805 CompactTrieHeader header = (CompactTrieHeader )bytes;

1806 - header->size = totalSize;

1807 + //header->size = totalSize;

1808 + if(dict.fValued){

1809 + header->magic = COMPACT_TRIE_MAGIC_3;

1810 + } else {

1811 + header->magic = COMPACT_TRIE_MAGIC_2;

1812 + }

1813 header->nodeCount = nodeCount;

1814 header->offsets[0] = 0; // Sentinel

1815 header->root = translate.elementAti(root->fNodeID);

1816 @@ -1123,23 +1570,40 @@

1817 }

1818 #endif

1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uin t32_t));

1820 - nodeCount = 1;

1821 + nodeCount = valueCount + 1;

1822 +

1823 + // Write terminal value nodes to memory

1824 + for (i=0; i < valueNodes.size(); i++) {

1825 + //header->offsets[i + 1] = offset;

1826 + uint32_t tmpOffset = 0;

1827 + node = (BuildCompactTrieNode *) valueNodes.elementAt(i);

1828 + //header->offsets[i + 1] = (uint32_t)node->fValue;

1829 + node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);

1830 + }

1831 +

1832 // Now write the data

1833 for (i = 1; i < count; ++i) {

1834 node = (BuildCompactTrieNode *)nodes[i];

1835 - if (node->fNodeID == i) {

1836 + if (node->fNodeID == i && node->fNodeType != kValueType) {

1837 header->offsets[nodeCount++] = offset;

1838 node->write(bytes, offset, translate);

1839 }

1840 }

1841 +

1842 + //free all extra space

1843 + uprv_realloc(bytes, offset);

1844 + header->size = offset;

1845 +

1846 #ifdef DEBUG_TRIE_DICT

1847 + fprintf(stdout, "Space freed: %d\n", totalSize-offset);

1848 +

1849 (void) ::times(&timing);

1850 fprintf(stderr, "Trie built, time user %f system %f\n",

1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,

1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);

1853 previous = timing;

1854 fprintf(stderr, "Final offset is %d\n", offset);

1855 -

1856 +

1857 // Collect statistics on node types and sizes

1858 int hCount = 0;

1859 int vCount = 0;

1860 @@ -1148,68 +1612,85 @@

1861 size_t hItemCount = 0;

1862 size_t vItemCount = 0;

1863 uint32_t previousOff = offset;

1864 - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

1865 + uint32_t numOverflow = 0;

1866 + uint32_t valueSpace = 0;

1867 + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {

1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx);

1869 - if (node->flagscount & kVerticalNode) {

1870 + int itemCount;

1871 + if(nodeIdx == header->root)

1872 + itemCount = node->flagscount & kRootCountMask;

1873 + else

1874 + itemCount = getCount(node);

1875 + if(node->flagscount & kEqualOverflows){

1876 + numOverflow++;

1877 + }

1878 + if (node->flagscount & kVerticalNode && nodeIdx != header->root) {

1879 vCount += 1;

1880 - vItemCount += (node->flagscount & kCountMask);

1881 + vItemCount += itemCount;

1882 vSize += previousOff-header->offsets[nodeIdx];

1883 }

1884 else {

1885 hCount += 1;

1886 - hItemCount += (node->flagscount & kCountMask);

1887 - hSize += previousOff-header->offsets[nodeIdx];

1888 + hItemCount += itemCount;

1889 + if(nodeIdx >= header->root) {

1890 + hSize += previousOff-header->offsets[nodeIdx];

1891 + }

1892 }

1893 +

1894 + if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentE ndsWord)

1895 + valueSpace += sizeof(uint16_t);

1896 previousOff = header->offsets[nodeIdx];

1897 }

1898 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items \n", hCount,

1899 (double)hSize/hCount, (double)hItemCount/hCount);

1900 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n ", vCount,

1901 (double)vSize/vCount, (double)vItemCount/vCount);

1902 + fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverf low);

1903 + fprintf(stderr, "Space taken up by values: %d \n", valueSpace);

1904 #endif

1905

1906 if (U_FAILURE(status)) {

1907 uprv_free(bytes);

1908 header = NULL;

1909 }

1910 - else {

1911 - header->magic = COMPACT_TRIE_MAGIC_1;

1912 - }

1913 return header;

1914 }

1915

1916 // Forward declaration

1917 static TernaryNode *

1918 -unpackOneNode( const CompactTrieHeader header, const CompactTrieNode node, UE rrorCode &status );

1919 -

1920 +unpackOneNode( const CompactTrieInfo info, const CompactTrieNode node, UError Code &status );

1921

1922 // Convert a horizontal node (or subarray thereof) into a ternary subtrie

1923 static TernaryNode *

1924 -unpackHorizontalArray( const CompactTrieHeader header, const CompactTrieHorizo ntalEntry array,

1925 - int low, int high, UErrorCode &status ) {

1926 +unpackHorizontalArray( const CompactTrieInfo info, const CompactTrieHorizontal Node hnode,

1927 + int low, int high, int nodeCount, UErrorCode &status) {

1928 if (U_FAILURE(status) \|\| low > high) {

1929 return NULL;

1930 }

1931 int middle = (low+high)/2;

1932 - TernaryNode *result = new TernaryNode(array[middle].ch);

1933 + TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);

1934 if (result == NULL) {

1935 status = U_MEMORY_ALLOCATION_ERROR;

1936 return NULL;

1937 }

1938 - const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);

1939 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mi ddle, nodeCount));

1940 if (equal->flagscount & kParentEndsWord) {

1941 - result->flags \|= kEndsWord;

1942 + if(info->magic == COMPACT_TRIE_MAGIC_3){

1943 + result->flags = getValue(equal);

1944 + }else{

1945 + result->flags \|= kEndsWord;

1946 + }

1947 }

1948 - result->low = unpackHorizontalArray(header, array, low, middle-1, status);

1949 - result->high = unpackHorizontalArray(header, array, middle+1, high, status) ;

1950 - result->equal = unpackOneNode(header, equal, status);

1951 + result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);

1952 + result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount , status);

1953 + result->equal = unpackOneNode(info, equal, status);

1954 return result;

1955 }

1956

1957 // Convert one compact trie node into a ternary subtrie

1958 static TernaryNode *

1959 -unpackOneNode( const CompactTrieHeader header, const CompactTrieNode node, UE rrorCode &status ) {

1960 - int nodeCount = (node->flagscount & kCountMask);

1961 +unpackOneNode( const CompactTrieInfo info, const CompactTrieNode node, UError Code &status ) {

1962 + int nodeCount = getCount(node);

1963 if (nodeCount == 0 \|\| U_FAILURE(status)) {

1964 // Failure, or terminal node

1965 return NULL;

1966 @@ -1234,29 +1715,41 @@

1967 previous = latest;

1968 }

1969 if (latest != NULL) {

1970 - const CompactTrieNode *equal = getCompactNode(header, vnode->equal) ;

1971 + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(v node));

1972 if (equal->flagscount & kParentEndsWord) {

1973 - latest->flags \|= kEndsWord;

1974 + if(info->magic == COMPACT_TRIE_MAGIC_3){

1975 + latest->flags = getValue(equal);

1976 + } else {

1977 + latest->flags \|= kEndsWord;

1978 + }

1979 }

1980 - latest->equal = unpackOneNode(header, equal, status);

1981 + latest->equal = unpackOneNode(info, equal, status);

1982 }

1983 return head;

1984 }

1985 else {

1986 // Horizontal node

1987 const CompactTrieHorizontalNode hnode = (const CompactTrieHorizontalNo de )node;

1988 - return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1 , status);

1989 + return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, st atus);

1990 }

1991 }

1992

1993 +// returns a MutableTrieDictionary generated from the CompactTrieDictionary

1994 MutableTrieDictionary *

1995 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {

1996 - MutableTrieDictionary *result = new MutableTrieDictionary( status );

1997 + MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->m agic == COMPACT_TRIE_MAGIC_3 );

1998 if (result == NULL) {

1999 status = U_MEMORY_ALLOCATION_ERROR;

2000 return NULL;

2001 }

2002 - TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root) , status);

2003 + // treat root node as special case: don't call unpackOneNode() or unpackHor izontalArray() directly

2004 + // because only kEqualOverflows flag should be checked in root's flagscount

2005 + const CompactTrieHorizontalNode hnode = (const CompactTrieHorizontalNode )

2006 + getCompactNode(fInfo, fInfo->root);

2007 + uint16_t nodeCount = hnode->flagscount & kRootCountMask;

2008 + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,

2009 + nodeCount, status);

2010 +

2011 if (U_FAILURE(status)) {

2012 delete root; // Clean up

2013 delete result;

2014 @@ -1270,8 +1763,8 @@

2015

2016 U_CAPI int32_t U_EXPORT2

2017 triedict_swap(const UDataSwapper ds, const void inData, int32_t length, void *outData,

2018 - UErrorCode *status) {

2019 -

2020 + UErrorCode *status) {

2021 +

2022 if (status == NULL \|\| U_FAILURE(*status)) {

2023 return 0;

2024 }

2025 @@ -1286,14 +1779,14 @@

2026 //

2027 const UDataInfo pInfo = (const UDataInfo )((const uint8_t *)inData+4);

2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */

2029 - pInfo->dataFormat[1]==0x72 &&

2030 - pInfo->dataFormat[2]==0x44 &&

2031 - pInfo->dataFormat[3]==0x63 &&

2032 - pInfo->formatVersion[0]==1 )) {

2033 + pInfo->dataFormat[1]==0x72 &&

2034 + pInfo->dataFormat[2]==0x44 &&

2035 + pInfo->dataFormat[3]==0x63 &&

2036 + pInfo->formatVersion[0]==1 )) {

2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",

2038 - pInfo->dataFormat[0], pInfo->dataFormat[1],

2039 - pInfo->dataFormat[2], pInfo->dataFormat[3],

2040 - pInfo->formatVersion[0]);

2041 + pInfo->dataFormat[0], pInfo->dataFormat[1],

2042 + pInfo->dataFormat[2], pInfo->dataFormat[3],

2043 + pInfo->formatVersion[0]);

2044 *status=U_UNSUPPORTED_ERROR;

2045 return 0;

2046 }

2047 @@ -1311,8 +1804,10 @@

2048 //

2049 const uint8_t inBytes =(const uint8_t )inData+headerSize;

2050 const CompactTrieHeader header = (const CompactTrieHeader )inBytes;

2051 - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1

2052 - \|\| ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

2053 + uint32_t magic = ds->readUInt32(header->magic);

2054 + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3

2055 + \|\| magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)

2056 + \|\| magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))

2057 {

2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n" );

2059 *status=U_UNSUPPORTED_ERROR;

2060 @@ -1333,10 +1828,10 @@

2061 //

2062 if (length < sizeWithUData) {

2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",

2064 - totalSize);

2065 + totalSize);

2066 *status=U_INDEX_OUTOFBOUNDS_ERROR;

2067 return 0;

2068 - }

2069 + }

2070

2071 //

2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, be cause

2073 @@ -1355,20 +1850,38 @@

2074 }

2075

2076 // We need to loop through all the nodes in the offset table, and swap each one.

2077 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);

2078 + uint32_t nodeCount, rootId;

2079 + if(header->magic == COMPACT_TRIE_MAGIC_1) {

2080 + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);

2081 + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);

2082 + } else {

2083 + nodeCount = ds->readUInt32(header->nodeCount);

2084 + rootId = ds->readUInt32(header->root);

2085 + }

2086 +

2087 // Skip node 0, which should always be 0.

2088 - for (int i = 1; i < nodeCount; ++i) {

2089 + for (uint32_t i = 1; i < nodeCount; ++i) {

2090 uint32_t nodeOff = ds->readUInt32(header->offsets[i]);

2091 const CompactTrieNode inNode = (const CompactTrieNode )(inBytes + nod eOff);

2092 CompactTrieNode outNode = (CompactTrieNode )(outBytes + nodeOff);

2093 uint16_t flagscount = ds->readUInt16(inNode->flagscount);

2094 - uint16_t itemCount = flagscount & kCountMask;

2095 + uint16_t itemCount = getCount(inNode);

2096 + //uint16_t itemCount = flagscount & kCountMask;

2097 ds->writeUInt16(&outNode->flagscount, flagscount);

2098 if (itemCount > 0) {

2099 - if (flagscount & kVerticalNode) {

2100 + uint16_t overflow = 0; //number of extra uint16_ts needed to be swa pped

2101 + if (flagscount & kVerticalNode && i != rootId) {

2102 + if(flagscount & kEqualOverflows){

2103 + // include overflow bits

2104 + overflow += 1;

2105 + }

2106 + if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEnds ParentWord) {

2107 + //include values

2108 + overflow += 1;

2109 + }

2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars),

2111 - itemCount*sizeof(uint16_t),

2112 - outBytes+nodeOff+offsetof(CompactTrieVertic alNode,chars), status);

2113 + (itemCount + overflow)*sizeof(uint16_t),

2114 + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars ), status);

2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compac tTrieVerticalNode,equal);

2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNo de,equal));

2117 }

2118 @@ -1381,26 +1894,62 @@

2119 word = ds->readUInt16(inHNode->entries[j].equal);

2120 ds->writeUInt16(&outHNode->entries[j].equal, word);

2121 }

2122 +

2123 + // swap overflow/value information

2124 + if(flagscount & kEqualOverflows){

2125 + overflow += (itemCount + 3) / 4;

2126 + }

2127 +

2128 + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && fla gscount & kEndsParentWord) {

2129 + //include values

2130 + overflow += 1;

2131 + }

2132 +

2133 + uint16_t inOverflow = (uint16_t ) &inHNode->entries[itemCount ];

2134 + uint16_t outOverflow = (uint16_t ) &outHNode->entries[itemCou nt];

2135 + for(int j = 0; j<overflow; j++){

2136 + uint16_t extraInfo = ds->readUInt16(*inOverflow);

2137 + ds->writeUInt16(outOverflow, extraInfo);

2138 +

2139 + inOverflow++;

2140 + outOverflow++;

2141 + }

2142 }

2143 }

2144 }

2145 #endif

2146

2147 - // All the data in all the nodes consist of 16 bit items. Swap them all at once.

2148 - uint16_t nodeCount = ds->readUInt16(header->nodeCount);

2149 - uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCoun t*sizeof(uint32_t));

2150 - ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);

2151 -

2152 // Swap the header

2153 ds->writeUInt32(&outputHeader->size, totalSize);

2154 - uint32_t magic = ds->readUInt32(header->magic);

2155 ds->writeUInt32(&outputHeader->magic, magic);

2156 - ds->writeUInt16(&outputHeader->nodeCount, nodeCount);

2157 - uint16_t root = ds->readUInt16(header->root);

2158 - ds->writeUInt16(&outputHeader->root, root);

2159 - ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),

2160 - sizeof(uint32_t)*(int32_t)nodeCount,

2161 - outBytes+offsetof(CompactTrieHeader,offsets), status);

2162 +

2163 + uint32_t nodeCount;

2164 + uint32_t offsetPos;

2165 + if (header->magic == COMPACT_TRIE_MAGIC_1) {

2166 + CompactTrieHeaderV1 headerV1 = (CompactTrieHeaderV1 )header;

2167 + CompactTrieHeaderV1 outputHeaderV1 = (CompactTrieHeaderV1 )outputHead er;

2168 +

2169 + nodeCount = ds->readUInt16(headerV1->nodeCount);

2170 + ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);

2171 + uint16_t root = ds->readUInt16(headerV1->root);

2172 + ds->writeUInt16(&outputHeaderV1->root, root);

2173 + offsetPos = offsetof(CompactTrieHeaderV1,offsets);

2174 + } else {

2175 + nodeCount = ds->readUInt32(header->nodeCount);

2176 + ds->writeUInt32(&outputHeader->nodeCount, nodeCount);

2177 + uint32_t root = ds->readUInt32(header->root);

2178 + ds->writeUInt32(&outputHeader->root, root);

2179 + offsetPos = offsetof(CompactTrieHeader,offsets);

2180 + }

2181 +

2182 + // All the data in all the nodes consist of 16 bit items. Swap them all at once.

2183 + uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));

2184 + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff , status);

2185 +

2186 + //swap offsets

2187 + ds->swapArray32(ds, inBytes+offsetPos,

2188 + sizeof(uint32_t)*(uint32_t)nodeCount,

2189 + outBytes+offsetPos, status);

2190

2191 return sizeWithUData;

2192 }

2193 --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700

2194 +++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800

2195 @@ -47,7 +47,6 @@

2196 U_NAMESPACE_BEGIN

2197

2198 class StringEnumeration;

2199 -struct CompactTrieHeader;

2200

2201 /*******************************************************************

2202 * TrieWordDictionary

2203 @@ -72,23 +71,29 @@

2204 */

2205 virtual ~TrieWordDictionary();

2206

2207 + /**

2208 + * <p>Returns true if the dictionary contains values associated with each wor d.</p>

2209 + */

2210 + virtual UBool getValued() const = 0;

2211 +

2212 /**

2213 * <p>Find dictionary words that match the text.</p>

2214 *

2215 * @param text A UText representing the text. The

2216 * iterator is left after the longest prefix match in the dictionary.

2217 - * @param start The current position in text.

2218 * @param maxLength The maximum number of code units to match.

2219 * @param lengths An array that is filled with the lengths of words that match ed.

2220 * @param count Filled with the number of elements output in lengths.

2221 * @param limit The size of the lengths array; this limits the number of words output.

2222 + * @param values An array that is filled with the values associated with the m atched words.

2223 * @return The number of characters in text that were matched.

2224 */

2225 virtual int32_t matches( UText *text,

2226 int32_t maxLength,

2227 int32_t *lengths,

2228 int &count,

2229 - int limit ) const = 0;

2230 + int limit,

2231 + uint16_t *values = NULL) const = 0;

2232

2233 /**

2234 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>

2235 @@ -128,6 +133,12 @@

2236

2237 UText *fIter;

2238

2239 + /**

2240 + * A UBool indicating whether the dictionary stores a value associated with each word

2241 + * @internal

2242 + */

2243 + UBool fValued;

2244 +

2245 friend class CompactTrieDictionary; // For fast conversion

2246

2247 public:

2248 @@ -138,14 +149,29 @@

2249 * @param median A UChar around which to balance the trie. Ideally, it should

2250 * begin at least one word that is near the median of the set in the dictionar y

2251 * @param status A status code recording the success of the call.

2252 + * @param containsValue True if the dictionary stores values associated with e ach word.

2253 */

2254 - MutableTrieDictionary( UChar median, UErrorCode &status );

2255 + MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );

2256

2257 /**

2258 * <p>Virtual destructor.</p>

2259 */

2260 virtual ~MutableTrieDictionary();

2261

2262 + /**

2263 + * Indicate whether the MutableTrieDictionary stores values associated with e ach word

2264 + */

2265 + void setValued(UBool valued){

2266 + fValued = valued;

2267 + }

2268 +

2269 + /**

2270 + * <p>Returns true if the dictionary contains values associated with each wor d.</p>

2271 + */

2272 + virtual UBool getValued() const {

2273 + return fValued;

2274 + }

2275 +

2276 /**

2277 * <p>Find dictionary words that match the text.</p>

2278 *

2279 @@ -155,13 +181,15 @@

2280 * @param lengths An array that is filled with the lengths of words that match ed.

2281 * @param count Filled with the number of elements output in lengths.

2282 * @param limit The size of the lengths array; this limits the number of words output.

2283 + * @param values An array that is filled with the values associated with the m atched words.

2284 * @return The number of characters in text that were matched.

2285 */

2286 virtual int32_t matches( UText *text,

2287 int32_t maxLength,

2288 int32_t *lengths,

2289 int &count,

2290 - int limit ) const;

2291 + int limit,

2292 + uint16_t *values = NULL) const;

2293

2294 /**

2295 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>

2296 @@ -173,15 +201,17 @@

2297 virtual StringEnumeration *openWords( UErrorCode &status ) const;

2298

2299 /**

2300 - * <p>Add one word to the dictionary.</p>

2301 + * <p>Add one word to the dictionary with an optional associated value.</p>

2302 *

2303 * @param word A UChar buffer containing the word.

2304 * @param length The length of the word.

2305 - * @param status The resultant status

2306 + * @param status The resultant status.

2307 + * @param value The nonzero value associated with this word.

2308 */

2309 virtual void addWord( const UChar *word,

2310 int32_t length,

2311 - UErrorCode &status);

2312 + UErrorCode &status,

2313 + uint16_t value = 0);

2314

2315 #if 0

2316 /**

2317 @@ -203,8 +233,9 @@

2318 * @param lengths An array that is filled with the lengths of words that match ed.

2319 * @param count Filled with the number of elements output in lengths.

2320 * @param limit The size of the lengths array; this limits the number of words output.

2321 - * @param parent The parent of the current node

2322 - * @param pMatched The returned parent node matched the input

2323 + * @param parent The parent of the current node.

2324 + * @param pMatched The returned parent node matched the input.

2325 + * @param values An array that is filled with the values associated with the m atched words.

2326 * @return The number of characters in text that were matched.

2327 */

2328 virtual int32_t search( UText *text,

2329 @@ -213,40 +244,46 @@

2330 int &count,

2331 int limit,

2332 TernaryNode *&parent,

2333 - UBool &pMatched ) const;

2334 + UBool &pMatched,

2335 + uint16_t *values = NULL) const;

2336

2337 private:

2338 /**

2339 * <p>Private constructor. The root node it not allocated.</p>

2340 *

2341 * @param status A status code recording the success of the call.

2342 + * @param containsValues True if the dictionary will store a value associated

2343 + * with each word added.

2344 */

2345 - MutableTrieDictionary( UErrorCode &status );

2346 + MutableTrieDictionary( UErrorCode &status, UBool containsValues = false );

2347 };

2348

2349 /*******************************************************************

2350 * CompactTrieDictionary

2351 */

2352

2353 +//forward declarations

2354 +struct CompactTrieHeader;

2355 +struct CompactTrieInfo;

2356 +

2357 /**

2358 * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted

2359 * to save space.</p>

2360 */

2361 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {

2362 private:

2363 - /**

2364 - * The root node of the trie

2365 - */

2366 + /**

2367 + * The header of the CompactTrieDictionary which contains all info

2368 + */

2369

2370 - const CompactTrieHeader *fData;

2371 -

2372 - /**

2373 - * A UBool indicating whether or not we own the fData.

2374 - */

2375 + CompactTrieInfo *fInfo;

2376

2377 + /**

2378 + * A UBool indicating whether or not we own the compact trie data.

2379 + */

2380 UBool fOwnData;

2381

2382 - UDataMemory *fUData;

2383 + UDataMemory *fUData;

2384 public:

2385 /**

2386 * <p>Construct a dictionary from a UDataMemory.</p>

2387 @@ -277,6 +314,11 @@

2388 */

2389 virtual ~CompactTrieDictionary();

2390

2391 + /**

2392 + * <p>Returns true if the dictionary contains values associated with each wor d.</p>

2393 + */

2394 + virtual UBool getValued() const;

2395 +

2396 /**

2397 * <p>Find dictionary words that match the text.</p>

2398 *

2399 @@ -286,13 +328,15 @@

2400 * @param lengths An array that is filled with the lengths of words that match ed.

2401 * @param count Filled with the number of elements output in lengths.

2402 * @param limit The size of the lengths array; this limits the number of words output.

2403 + * @param values An array that is filled with the values associated with the m atched words.

2404 * @return The number of characters in text that were matched.

2405 */

2406 virtual int32_t matches( UText *text,

2407 - int32_t rangeEnd,

2408 + int32_t maxLength,

2409 int32_t *lengths,

2410 int &count,

2411 - int limit ) const;

2412 + int limit,

2413 + uint16_t *values = NULL) const;

2414

2415 /**

2416 * <p>Return a StringEnumeration for iterating all the words in the dictionar y.</p>

2417 @@ -311,7 +355,7 @@

2418 virtual uint32_t dataSize() const;

2419

2420 /**

2421 - * <p>Return a void * pointer to the compact data, platform-endian.</p>

2422 + * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian .</p>

2423 *

2424 * @return The data for the compact dictionary, suitable for passing to the

2425 * constructor.

2426 @@ -342,5 +386,5 @@

2427

2428 U_NAMESPACE_END

2429

2430 - /* TRIEDICT_H */

2431 +/* TRIEDICT_H */

2432 #endif

2433 --- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700

2434 +++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800

2435 @@ -509,8 +520,9 @@

2436 #################################################### CTD

2437 # CTD FILES

2438

2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_ FILES)

2440 - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

2441 +# .ctd file now generated regardless of whether dictionary file exists

2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

2443 + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F ).txt

2444

2445 #################################################### CFU

2446 # CFU FILES

2447 --- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700

2448 +++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800

2449 @@ -17,5 +17,8 @@

2450 }

2451 dictionaries{

2452 Thai:process(dependency){"thaidict.ctd"}

2453 + Hani:process(dependency){"cjdict.ctd"}

2454 + Hira:process(dependency){"cjdict.ctd"}

2455 + Kata:process(dependency){"cjdict.ctd"}

2456 }

2457 }

2458 --- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800

2459 +++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800

2460 @@ -25,6 +25,9 @@

2461 </icu:boundaries>

2462 <icu:dictionaries>

2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>

2464 + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>

2465 + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>

2466 + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>

2467 </icu:dictionaries>

2468 </icu:breakIteratorData>

2469 </special>

2470 --- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700

2471 +++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800

2472 @@ -2188,21 +2188,21 @@

2473

2474

2475 {

2476 - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);

2477 + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);

2478 const UChar got = NULL, exp=NULL;

2479 int32_t gotLen = 0, expLen=0;

2480 - ja = ures_getByKey(ja, "boundaries", ja, &status);

2481 - exp = tres_getString(ja, -1, "word", &expLen, &status);

2482 + th = ures_getByKey(th, "boundaries", th, &status);

2483 + exp = tres_getString(th, -1, "grapheme", &expLen, &status);

2484

2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status);

2486 - got = tres_getString(tb, -1, "word", &gotLen, &status);

2487 + got = tres_getString(tb, -1, "grapheme", &gotLen, &status);

2488

2489 if(U_FAILURE(status)) {

2490 log_err("%s trying to read str boundaries\n", u_errorName(statu s));

2491 } else if(gotLen != expLen \|\| u_strncmp(exp, got, gotLen) != 0) {

2492 log_err("Referencing alias didn't get the right data\n");

2493 }

2494 - ures_close(ja);

2495 + ures_close(th);

2496 status = U_ZERO_ERROR;

2497 }

2498 /* simple alias */

2499 --- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700

2500 +++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800

2501 @@ -156,9 +156,13 @@

2502 if(a!=b){

2503 errln("Failed: boilerplate method operator!= does not return correct re sults");

2504 }

2505 - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);

2506 - if(a && c){

2507 - if(c==a){

2508 + // Japanese word break iterator is identical to root with

2509 + // a dictionary-based break iterator, but Thai character break iterator

2510 + // is still different from Root.

2511 + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),stat us);

2512 + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),stat us);

2513 + if(c && d){

2514 + if(c==d){

2515 errln("Failed: boilerplate method opertator== does not return corre ct results");

2516 }

2517 }else{

2518 @@ -167,6 +171,7 @@

2519 delete a;

2520 delete b;

2521 delete c;

2522 + delete d;

2523 }

2524

2525 void RBBIAPITest::TestgetRules()

2526 @@ -635,21 +640,21 @@

2527 //

2528 void RBBIAPITest::TestRuleStatus() {

2529 UChar str[30];

2530 - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094 ",

2531 - // 012345678901234567 8 9 0 1 2 3 4 5 6

2532 - // Ideographic Katakana Hiragana

2533 + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing

2534 + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO

2535 + u_unescape("plain word 123.45 \\u30a1\\u30a2 ",

2536 + // 012345678901234567 8 9 0

2537 + // Katakana

2538 str, 30);

2539 UnicodeString testString1(str);

2540 - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};

2541 + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};

2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,

2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,

2544 - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,

2545 - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};

2546 + UBRK_WORD_IDEO, UBRK_WORD_NONE};

2547

2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WO RD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,

2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WO RD_NONE_LIMIT,

2550 - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WO RD_NONE_LIMIT,

2551 - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WO RD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};

2552 + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};

2553

2554 UErrorCode status=U_ZERO_ERROR;

2555

2556 @@ -888,9 +893,11 @@

2557

2558 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD , status);

2559 {

2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.

2561 if (ja_word && ja_word == root_word) {

2562 errln("japan not different from root");

2563 }

2564 +#endif

2565 }

2566

2567 {

2568 --- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700

2569 +++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800

2570 @@ -35,6 +35,8 @@

2571 #include <string.h>

2572 #include <stdio.h>

2573 #include <stdlib.h>

2574 +#include "unicode/numfmt.h"

2575 +#include "unicode/uscript.h"

2576

2577 #define TEST_ASSERT(x) {if (!(x)) { \

2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

2579 @@ -138,11 +140,13 @@

2580 if (exec) TestThaiBreaks(); break;

2581 case 23: name = "TestTailoredBreaks";

2582 if (exec) TestTailoredBreaks(); break;

2583 + case 24: name = "TestTrieDictWithValue";

2584 + if(exec) TestTrieDictWithValue(); break;

2585 #else

2586 - case 21: case 22: case 23: name = "skip";

2587 + case 21: case 22: case 23: case 24: name = "skip";

2588 break;

2589 #endif

2590 - case 24: name = "TestDictRules";

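2591 + case 25: name = "TestDictRules"; // FIXME: the unchanged "case 25" (TestBug5532) right below now duplicates this label; renumber it and any later cases.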

2592 if (exec) TestDictRules(); break;

2593 case 25: name = "TestBug5532";

2594 if (exec) TestBug5532(); break;

2595 @@ -607,6 +611,8 @@

2596

2597

2598 void RBBITest::TestJapaneseWordBreak() {

2599 +// TODO: Rewrite this test for a dictionary-based word breaking.

2600 +#if 0

2601 UErrorCode status = U_ZERO_ERROR;

2602 BITestData japaneseWordSelection(status);

2603

2604 @@ -628,6 +634,7 @@

2605

2606 generalIteratorTest(*e, japaneseWordSelection);

2607 delete e;

2608 +#endif

2609 }

2610

2611 void RBBITest::TestTrieDict() {

2612 @@ -849,6 +856,372 @@

2613 delete compact2;

2614 }

2615

2616 +/*TODO: delete later*/

2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){

2618 + UErrorCode status = U_ZERO_ERROR;

2619 + FILE *outfile = fopen(filename,"w");

2620 + UConverter *cvt = ucnv_open("UTF-8", &status);

2621 + if (U_FAILURE(status))

2622 + return;

2623 + if(outfile != NULL){

2624 + status = U_ZERO_ERROR;

2625 + const UnicodeString *word = enumer->snext(status);

2626 + while (word != NULL && U_SUCCESS(status)) {

2627 + char u8word[500];

2628 + status = U_ZERO_ERROR;

2629 + ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length() ,

2630 + &status);

2631 + fprintf(outfile,"%s\n", u8word);

2632 + status = U_ZERO_ERROR;

2633 + word = enumer->snext(status);

2634 + }

2635 + fclose(outfile);

2636 + }

2637 + ucnv_close(cvt);

2638 +}

2639 +

2640 +// A very simple helper class to streamline the buffer handling in

2641 +// TestTrieDictWithValue

2642 +template<class T, size_t N>

2643 +class AutoBuffer {

2644 + public:

2645 + AutoBuffer(size_t size) : buffer(stackBuffer) {

2646 + if (size > N)

2647 + buffer = new T[size];

2648 + }

2649 + ~AutoBuffer() {

2650 + if (buffer != stackBuffer)

2651 + delete [] buffer;

2652 + }

2653 + T* elems() {

2654 + return buffer;

2655 + }

2656 + const T& operator[] (size_t i) const {

2657 + return buffer[i];

2658 + }

2659 + T& operator[] (size_t i) {

2660 + return buffer[i];

2661 + }

2662 + private:

2663 + T stackBuffer[N];

2664 + T* buffer;

2665 + AutoBuffer();

2666 +};

2667 +

2668 +//----------------------------------------------------------------------------

2669 +//

2670 +// TestTrieDictWithValue Test trie dictionaries with logprob values and

2671 +// more than 2^16 nodes after compaction.

2672 +//

2673 +//----------------------------------------------------------------------------

2674 +void RBBITest::TestTrieDictWithValue() {

2675 + UErrorCode status = U_ZERO_ERROR;

2676 +

2677 + //

2678 + // Open and read the test data file.

2679 + //

2680 + const char *testDataDirectory = IntlTest::getSourceTestData(status);

2681 + const char *filename = "cjdict-truncated.txt";

2682 + char testFileName[1000];

2683 + if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {

2684 + errln("Can't open test data. Path too long.");

2685 + return;

2686 + }

2687 + strcpy(testFileName, testDataDirectory);

2688 + strcat(testFileName, filename);

2689 +

2690 + // Items needing deleting at the end

2691 + MutableTrieDictionary *mutableDict = NULL;

2692 + CompactTrieDictionary *compactDict = NULL;

2693 + UnicodeSet *breaks = NULL;

2694 + UChar *testFile = NULL;

2695 + StringEnumeration *enumer1 = NULL;

2696 + StringEnumeration *enumer2 = NULL;

2697 + MutableTrieDictionary *mutable2 = NULL;

2698 + StringEnumeration *cloneEnum = NULL;

2699 + CompactTrieDictionary *compact2 = NULL;

2700 + NumberFormat *nf = NULL;

2701 + UText *originalText = NULL, *cloneText = NULL;

2702 +

2703 + const UnicodeString *originalWord = NULL;

2704 + const UnicodeString *cloneWord = NULL;

2705 + UChar *current;

2706 + UChar *word;

2707 + UChar uc;

2708 + int32_t wordLen;

2709 + int32_t wordCount;

2710 + int32_t testCount;

2711 + int32_t valueLen;

2712 + int counter = 0;

2713 +

2714 + int len;

2715 + testFile = ReadAndConvertFile(testFileName, len, NULL, status);

2716 + if (U_FAILURE(status)) {

2717 + goto cleanup; /* something went wrong, error already output */

2718 + }

2719 +

2720 + mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);

2721 + if (U_FAILURE(status)) {

2722 + errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status) );

2723 + goto cleanup;

2724 + }

2725 +

2726 + breaks = new UnicodeSet;

2727 + breaks->add(0x000A); // Line Feed

2728 + breaks->add(0x000D); // Carriage Return

2729 + breaks->add(0x2028); // Line Separator

2730 + breaks->add(0x2029); // Paragraph Separator

2731 + breaks->add(0x0009); // Tab character

2732 +

2733 + // Now add each non-comment line of the file as a word.

2734 + current = testFile;

2735 + word = current;

2736 + uc = *current++;

2737 + wordLen = 0;

2738 + wordCount = 0;

2739 + nf = NumberFormat::createInstance(status);

2740 +

2741 + while (uc) {

2742 + UnicodeString ucharValue;

2743 + valueLen = 0;

2744 +

2745 + if (uc == 0x0023) { // #comment line, skip

2746 + while (uc && !breaks->contains(uc)) {

2747 + uc = *current++;

2748 + }

2749 + }

2750 + else{

2751 + while (uc && !breaks->contains(uc)) {

2752 + ++wordLen;

2753 + uc = *current++;

2754 + }

2755 + if(uc == 0x0009){ //separator is a tab char, read in num after tab

2756 + uc = *current++;

2757 + while (uc && !breaks->contains(uc)) {

2758 + ucharValue.append(uc);

2759 + uc = *current++;

2760 + }

2761 + }

2762 + }

2763 + if (wordLen > 0) {

2764 + Formattable value((int32_t)0);

2765 + nf->parse(ucharValue.getTerminatedBuffer(), value, status);

2766 +

2767 + if(U_FAILURE(status)){

2768 + errln("parsing of value failed when reading in dictionary\n");

2769 + goto cleanup;

2770 + }

2771 + mutableDict->addWord(word, wordLen, status, value.getLong());

2772 + if (U_FAILURE(status)) {

2773 + errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));

2774 + goto cleanup;

2775 + }

2776 + wordCount += 1;

2777 + }

2778 +

2779 + // Find beginning of next line

2780 + while (uc && breaks->contains(uc)) {

2781 + uc = *current++;

2782 + }

2783 + word = current-1;

2784 + wordLen = 0;

2785 + }

2786 +

2787 + if (wordCount < 50) {

2788 + errln("Word count (%d) unreasonably small\n", wordCount);

2789 + goto cleanup;

2790 + }

2791 +

2792 + enumer1 = mutableDict->openWords(status);

2793 + if (U_FAILURE(status)) {

2794 + errln("Could not open mutable dictionary enumerator: %s\n", u_errorName (status));

2795 + goto cleanup;

2796 + }

2797 +

2798 + testCount = 0;

2799 + if (wordCount != (testCount = enumer1->count(status))) {

2800 + errln("MutableTrieDictionary word count (%d) differs from file word cou nt (%d), with status %s\n",

2801 + testCount, wordCount, u_errorName(status));

2802 + goto cleanup;

2803 + }

2804 +

2805 + // Now compact it

2806 + compactDict = new CompactTrieDictionary(*mutableDict, status);

2807 + if (U_FAILURE(status)) {

2808 + errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(statu s));

2809 + goto cleanup;

2810 + }

2811 +

2812 + enumer2 = compactDict->openWords(status);

2813 + if (U_FAILURE(status)) {

2814 + errln("Could not open compact trie dictionary enumerator: %s\n", u_erro rName(status));

2815 + goto cleanup;

2816 + }

2817 +

2818 +

2819 + //delete later

2820 +// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");

2821 +// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");

2822 +

2823 + enumer1->reset(status);

2824 + enumer2->reset(status);

2825 +

2826 + originalWord = enumer1->snext(status);

2827 + cloneWord = enumer2->snext(status);

2828 + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {

2829 + if (*originalWord != *cloneWord) {

2830 + errln("MutableTrieDictionary and CompactTrieDictionary word mismatc h at %d, lengths are %d and %d\n",

2831 + counter, originalWord->length(), cloneWord->length());

2832 + goto cleanup;

2833 + }

2834 +

2835 + // check if attached values of the same word in both dictionaries tally

2836 +#if 0

2837 + int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()] ;

2838 + uint16_t values1[originalWord->length()], values2[cloneWord->length()];

2839 +#endif

2840 + AutoBuffer<int32_t, 20> lengths1(originalWord->length());

2841 + AutoBuffer<int32_t, 20> lengths2(cloneWord->length());

2842 + AutoBuffer<uint16_t, 20> values1(originalWord->length());

2843 + AutoBuffer<uint16_t, 20> values2(cloneWord->length());

2844 +

2845 + originalText = utext_openConstUnicodeString(originalText, originalWord, &status);

2846 + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status) ;

2847 +

2848 + int count1, count2;

2849 + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());

2850 + compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());

2851 +

2852 + if(values1[count1-1] != values2[count2-1]){

2853 + errln("Values of word %d in MutableTrieDictionary and CompactTrieDi ctionary do not match, with values %d and %d\n",

2854 + counter, values1[count1-1], values2[count2-1]);

2855 + goto cleanup;

2856 + }

2857 +

2858 + counter++;

2859 + originalWord = enumer1->snext(status);

2860 + cloneWord = enumer2->snext(status);

2861 + }

2862 + if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {

2863 + errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are t he same");

2864 + }

2865 +

2866 + delete enumer1;

2867 + enumer1 = NULL;

2868 + delete enumer2;

2869 + enumer2 = NULL;

2870 +

2871 + // Now un-compact it

2872 + mutable2 = compactDict->cloneMutable(status);

2873 + if (U_FAILURE(status)) {

2874 + errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));

2875 + goto cleanup;

2876 + }

2877 +

2878 + cloneEnum = mutable2->openWords(status);

2879 + if (U_FAILURE(status)) {

2880 + errln("Could not create cloned mutable enumerator: %s\n", u_errorName(s tatus));

2881 + goto cleanup;

2882 + }

2883 +

2884 + if (wordCount != (testCount = cloneEnum->count(status))) {

2885 + errln("Cloned MutableTrieDictionary word count (%d) differs from file w ord count (%d), with status %s\n",

2886 + testCount, wordCount, u_errorName(status));

2887 + goto cleanup;

2888 + }

2889 +

2890 + // Compact original dictionary to clone. Note that we can only compare the same kind of

2891 + // dictionary as the order of the enumerators is not guaranteed to be the same between

2892 + // different kinds

2893 + enumer1 = mutableDict->openWords(status);

2894 + if (U_FAILURE(status)) {

2895 + errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorN ame(status));

2896 + goto cleanup;

2897 + }

2898 +

2899 + counter = 0;

2900 + originalWord = enumer1->snext(status);

2901 + cloneWord = cloneEnum->snext(status);

2902 + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {

2903 + if (*originalWord != *cloneWord) {

2904 + errln("Original and cloned MutableTrieDictionary word mismatch\n");

2905 + goto cleanup;

2906 + }

2907 +

2908 + // check if attached values of the same word in both dictionaries tally

2909 + AutoBuffer<int32_t, 20> lengths1(originalWord->length());

2910 + AutoBuffer<int32_t, 20> lengths2(cloneWord->length());

2911 + AutoBuffer<uint16_t, 20> values1(originalWord->length());

2912 + AutoBuffer<uint16_t, 20> values2(cloneWord->length());

2913 + originalText = utext_openConstUnicodeString(originalText, originalWord, &status);

2914 + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status) ;

2915 +

2916 + int count1, count2;

2917 + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());

2918 + mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());

2919 +

2920 + if(values1[count1-1] != values2[count2-1]){

2921 + errln("Values of word %d in original and cloned MutableTrieDictiona ry do not match, with values %d and %d\n",

2922 + counter, values1[count1-1], values2[count2-1]);

2923 + goto cleanup;

2924 + }

2925 +

2926 + counter++;

2927 +

2928 + originalWord = enumer1->snext(status);

2929 + cloneWord = cloneEnum->snext(status);

2930 + }

2931 +

2932 + if (U_FAILURE(status)) {

2933 + errln("Enumeration failed: %s\n", u_errorName(status));

2934 + goto cleanup;

2935 + }

2936 +

2937 + if (originalWord != cloneWord) {

2938 + errln("Original and cloned MutableTrieDictionary ended enumeration at d ifferent points\n");

2939 + goto cleanup;

2940 + }

2941 +

2942 + // Test the data copying constructor for CompactTrieDict, and the data acce ss APIs.

2943 + compact2 = new CompactTrieDictionary(compactDict->data(), status);

2944 + if (U_FAILURE(status)) {

2945 + errln("CompactTrieDictionary(const void *,...) failed\n");

2946 + goto cleanup;

2947 + }

2948 +

2949 + if (compact2->dataSize() == 0) {

2950 + errln("CompactTrieDictionary->dataSize() == 0\n");

2951 + goto cleanup;

2952 + }

2953 +

2954 + // Now count the words via the second dictionary

2955 + delete enumer1;

2956 + enumer1 = compact2->openWords(status);

2957 + if (U_FAILURE(status)) {

2958 + errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_er rorName(status));

2959 + goto cleanup;

2960 + }

2961 +

2962 + if (wordCount != (testCount = enumer1->count(status))) {

2963 + errln("CompactTrieDictionary 2 word count (%d) differs from file word c ount (%d), with status %s\n",

2964 + testCount, wordCount, u_errorName(status));

2965 + goto cleanup;

2966 + }

2967 +

2968 + cleanup:

2969 + delete compactDict;

2970 + delete mutableDict;

2971 + delete breaks;

2972 + delete[] testFile;

2973 + delete enumer1;

2974 + delete mutable2;

2975 + delete cloneEnum;

2976 + delete compact2;

2977 + utext_close(originalText);

2978 + utext_close(cloneText);

2979 +

2980 +

2981 +}

2982

2983 //----------------------------------------------------------------------------

2984 //

2985 @@ -1870,8 +2243,15 @@

2986 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).

2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u 3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"

2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u 3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";

2989 +#if 0

2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 1 7, 18, 20, 21, 24, 27, 28 };

2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 1 7, 18, 19, 20, 21, 24, 25, 26, 27, 28 };

2992 +#endif

2993 +// There's no separate Japanese word break iterator. Root is the same as Japanese.

2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,

2995 +// U+3007, U+303B and some other cases.

2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 1 5, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };

2998

2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el"

3000 // Add break after Greek question mark (cldrbug #2069).

3001 @@ -2672,6 +3052,8 @@

3002 UnicodeSet *fNewlineSet;

3003 UnicodeSet *fKatakanaSet;

3004 UnicodeSet *fALetterSet;

3005 + // TODO(jungshik): Do we still need this change?

3006 + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt

3007 UnicodeSet *fMidNumLetSet;

3008 UnicodeSet *fMidLetterSet;

3009 UnicodeSet *fMidNumSet;

3010 @@ -2680,6 +3062,7 @@

3011 UnicodeSet *fOtherSet;

3012 UnicodeSet *fExtendSet;

3013 UnicodeSet *fExtendNumLetSet;

3014 + UnicodeSet *fDictionaryCjkSet;

3015

3016 RegexMatcher *fMatcher;

3017

3018 @@ -2696,12 +3079,24 @@

3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);

3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);

3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);

3022 - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

3023 + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);

3024 + // Exclude Hangul syllables from ALetterSet during testing.

3025 + // Leave CJK dictionary characters out from the monkey tests!

3026 +#if 0

3027 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"

3028 + "[\\p{Line_Break = Complex_Context}"

3029 + "-\\p{Grapheme_Cluster_Break = Extend}"

3030 + "-\\p{Grapheme_Cluster_Break = Control}"

3031 + "]]",

3032 + status);

3033 +#endif

3034 + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

3035 + fALetterSet->removeAll(*fDictionaryCjkSet);

3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);

3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);

3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);

3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);

3040 - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);

3041 + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);

3042 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);

3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);

3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);

3045 @@ -2725,13 +3120,14 @@

3046 fOtherSet->removeAll(*fFormatSet);

3047 fOtherSet->removeAll(*fExtendSet);

3048 // Inhibit dictionary characters from being tested at all.

3049 + fOtherSet->removeAll(*fDictionaryCjkSet);

3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Com plex_Context}]"), status));

3051

3052 fSets->addElement(fCRSet, status);

3053 fSets->addElement(fLFSet, status);

3054 fSets->addElement(fNewlineSet, status);

3055 fSets->addElement(fALetterSet, status);

3056 - fSets->addElement(fKatakanaSet, status);

3057 + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana

3058 fSets->addElement(fMidLetterSet, status);

3059 fSets->addElement(fMidNumLetSet, status);

3060 fSets->addElement(fMidNumSet, status);

3061 @@ -3978,6 +4374,7 @@

3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

3063 count --;

3064 if (forward[count] != i) {

3065 + printStringBreaks(ustr, expected, expectedcount);

3066 test->errln("happy break test previous() failed: expected %d but go t %d",

3067 forward[count], i);

3068 break;

3069 @@ -4011,23 +4408,25 @@

3070 UErrorCode status = U_ZERO_ERROR;

3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);

3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

3073 + // Replaced any C+J characters in a row with a random sequence of characters

3074 + // of the same length to make our C+J segmentation not get in the way.

3075 static const char *strlist[] =

3076 {

3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",

3078 - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e004 0\\u003b",

3079 + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e004 0\\u003b",

3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000 e0061\\u003a",

3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",

3082 - "\\u90ca\\u3588\\u009c\\u0953\\u194b",

3083 + "\\uac00\\u3588\\u009c\\u0953\\u194b",

3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e ",

3086 - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",

3087 + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",

3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

3090 "\\u2027\\U000e0067\\u0a47\\u00b7",

3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

3093 "\\u0589\\U000e006e\\u0a42\\U000104a5",

3094 - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

3095 + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",

3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

3097 "\\u0027\\u11af\\U000e0057\\u0602",

3098 "\\U0001d7f2\\U000e007\\u0004\\u0589",

3099 @@ -4039,7 +4438,7 @@

3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3101 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3103 - "\\u58f4\\U000e0049\\u20e7\\u2027",

3104 + "\\u18f4\\U000e0049\\u20e7\\u2027",

3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3106 "\\ua183\\u102d\\u0bec\\u003a",

3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3108 @@ -4049,7 +4448,7 @@

3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",

3110 "\\u003a\\u0664\\u00b7\\u1fba",

3111 "\\u003b\\u0027\\u00b7\\u47a3",

3112 - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",

3113 + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",

3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\ u0e51\\u1058\\U000e0058\\u00b7\\u0673",

3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

3116 };

3117 @@ -4104,12 +4503,12 @@

3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

3121 - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

3122 + "\\U000e0065\\u302c\\u09ee\\U000e0068",

3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3124 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3126 "\\u58f4\\U000e0049\\u20e7\\u2027",

3127 - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3128 + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3129 "\\ua183\\u102d\\u0bec\\u003a",

3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3131 "\\u003a\\u0e57\\u0fad\\u002e",

3132 --- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700

3133 +++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800

3134 @@ -70,6 +70,7 @@

3135 void TestBug5775();

3136 void TestThaiBreaks();

3137 void TestTailoredBreaks();

3138 + void TestTrieDictWithValue();

3139 void TestDictRules();

3140 void TestBug5532();

3141

3142 --- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700

3143 +++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800

3144 @@ -161,7 +161,23 @@

3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>

3146

3147 # Hiragana & Katakana stay together, but separates from each other and Latin.

3148 -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINI NG ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A} \N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>

3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent

3150 +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBIN ING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A }\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKAN A LETTER N}<300>def<200>#•</data>

3151 +

3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth

3153 +<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>

3154 +

3155 +# more Japanese tests

3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana

3157 +# and the Katakana block are not treated correctly. Enable this later.

3158 +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400 >は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

3159 +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>

3160 +

3161 +# Testing of word boundary for dictionary word containing both kanji and kana

3162 +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>

3163 +

3164 +# Testing of Chinese segmentation (taken from a Chinese news article)

3165 +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400 >到了<400>“•推荐<400>票<400>”•，•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400> 的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>，•选出<400>他们<400> 属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</d ata>

3166

3167 # Words with interior formatting characters

3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</dat a>

3169 @@ -169,6 +185,8 @@

3170 # to test for bug #4097779

3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>

3172

3173 +# fullwidth numeric, midletter characters etc should be treated like their half width counterparts

3174 +<data>•ＩＳＮ'Ｔ<200> •１９<100>日<400></data>

3175

3176 # to test for bug #4098467

3177 # What follows is a string of Korean characters (I found it in the Yellow Pages

3178 @@ -178,9 +196,15 @@

3179 # precomposed syllables...

3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\ua d50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u1 10b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u1 1bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>

3181

3182 -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>

3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters)

3184 +# Disable them now because we don't include a Korean dictionary.

3185 +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<2 00>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>

3186 +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2d d<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200 > •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>

3187 +

3188 +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</da ta>

3189 +

3190 +<data>•\u06c9<200>\uc799<200>\ufffa•</data>

3191

3192 -<data>•\u06c9\uc799\ufffa<200></data>

3193

3194 #

3195 # Try some words from other scripts.

3196 @@ -491,8 +515,7 @@

3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c •</data>

3198

3199 # conjoining jamo...

3200 -# TODO: rules update needed

3201 -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\ u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\ u1100\u116d•\u1112\u116c•</data>

3202 +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u 11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1 100\u116d•\u1112\u116c•</data>

3203

3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd

3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>

3206 --- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0 800

3207 +++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0 800

3208 @@ -28,7 +28,7 @@

3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }

3210

3211 // aliasing using position

3212 - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding reso urce in another bundle

3213 + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding reso urce in another bundle

3214

3215 // aliasing arrays

3216 zoneTests {

3217 --- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700

3218 +++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800

3219 @@ -1,6 +1,6 @@

3220 /*

3221 **********************************************************************

3222 -* Copyright (C) 2002-2009, International Business Machines

3223 +* Copyright (C) 2002-2010, International Business Machines

3224 * Corporation and others. All Rights Reserved.

3225 **********************************************************************

3226 *

3227 @@ -34,12 +34,15 @@

3228 #include "unicode/udata.h"

3229 #include "unicode/putil.h"

3230

3231 +//#include "unicode/ustdio.h"

3232 +

3233 #include "uoptions.h"

3234 #include "unewdata.h"

3235 #include "ucmndata.h"

3236 #include "rbbidata.h"

3237 #include "triedict.h"

3238 #include "cmemory.h"

3239 +#include "uassert.h"

3240

3241 #include <stdio.h>

3242 #include <stdlib.h>

3243 @@ -199,147 +202,191 @@

3244 long wordFileSize;

3245 FILE *file;

3246 char *wordBufferC;

3247 -

3248 + MutableTrieDictionary *mtd = NULL;

3249 +

3250 file = fopen(wordFileName, "rb");

3251 - if( file == 0 ) {

3252 - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);

3253 - exit(-1);

3254 - }

3255 - fseek(file, 0, SEEK_END);

3256 - wordFileSize = ftell(file);

3257 - fseek(file, 0, SEEK_SET);

3258 - wordBufferC = new char[wordFileSize+10];

3259 -

3260 - result = (long)fread(wordBufferC, 1, wordFileSize, file);

3261 - if (result != wordFileSize) {

3262 - fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);

3263 - exit (-1);

3264 - }

3265 - wordBufferC[wordFileSize]=0;

3266 - fclose(file);

3267 -

3268 - //

3269 - // Look for a Unicode Signature (BOM) on the word file

3270 - //

3271 - int32_t signatureLength;

3272 - const char * wordSourceC = wordBufferC;

3273 - const char* encoding = ucnv_detectUnicodeSignature(

3274 - wordSourceC, wordFileSize, &signatureLength, &status );

3275 - if (U_FAILURE(status)) {

3276 - exit(status);

3277 - }

3278 - if(encoding!=NULL ){

3279 - wordSourceC += signatureLength;

3280 - wordFileSize -= signatureLength;

3281 - }

3282 -

3283 - //

3284 - // Open a converter to take the rule file to UTF-16

3285 - //

3286 - UConverter* conv;

3287 - conv = ucnv_open(encoding, &status);

3288 - if (U_FAILURE(status)) {

3289 - fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));

3290 - exit(status);

3291 - }

3292 -

3293 - //

3294 - // Convert the words to UChar.

3295 - // Preflight first to determine required buffer size.

3296 - //

3297 - uint32_t destCap = ucnv_toUChars(conv,

3298 - NULL, // dest,

3299 - 0, // destCapacity,

3300 - wordSourceC,

3301 - wordFileSize,

3302 - &status);

3303 - if (status != U_BUFFER_OVERFLOW_ERROR) {

3304 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status ));

3305 - exit(status);

3306 - };

3307 -

3308 - status = U_ZERO_ERROR;

3309 - UChar *wordSourceU = new UChar[destCap+1];

3310 - ucnv_toUChars(conv,

3311 - wordSourceU, // dest,

3312 - destCap+1,

3313 - wordSourceC,

3314 - wordFileSize,

3315 - &status);

3316 - if (U_FAILURE(status)) {

3317 - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status ));

3318 - exit(status);

3319 - };

3320 - ucnv_close(conv);

3321 -

3322 - // Get rid of the original file buffer

3323 - delete[] wordBufferC;

3324 -

3325 - // Create a MutableTrieDictionary, and loop through all the lines, inserting

3326 - // words.

3327 -

3328 - // First, pick a median character.

3329 - UChar *current = wordSourceU + (destCap/2);

3330 - UChar uc = *current++;

3331 - UnicodeSet breaks;

3332 - breaks.add(0x000A); // Line Feed

3333 - breaks.add(0x000D); // Carriage Return

3334 - breaks.add(0x2028); // Line Separator

3335 - breaks.add(0x2029); // Paragraph Separator

3336 -

3337 - do {

3338 - // Look for line break

3339 - while (uc && !breaks.contains(uc)) {

3340 - uc = *current++;

3341 - }

3342 - // Now skip to first non-line-break

3343 - while (uc && breaks.contains(uc)) {

3344 - uc = *current++;

3345 + if( file == 0 ) { //cannot find file

3346 + //create 1-line dummy file: ie 1 char, 1 value

3347 + UNewDataMemory *pData;

3348 + char msg[1024];

3349 +

3350 + /* write message with just the name */

3351 + sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outF ileName);

3352 + fprintf(stderr, "%s\n", msg);

3353 +

3354 + UChar c = 0x0020;

3355 + mtd = new MutableTrieDictionary(c, status, TRUE);

3356 + mtd->addWord(&c, 1, status, 1);

3357 +

3358 + } else { //read words in from input file

3359 + fseek(file, 0, SEEK_END);

3360 + wordFileSize = ftell(file);

3361 + fseek(file, 0, SEEK_SET);

3362 + wordBufferC = new char[wordFileSize+10];

3363 +

3364 + result = (long)fread(wordBufferC, 1, wordFileSize, file);

3365 + if (result != wordFileSize) {

3366 + fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);

3367 + exit (-1);

3368 }

3369 - }

3370 - while (uc && (breaks.contains(uc) \|\| u_isspace(uc)));

3371 -

3372 - MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);

3373 + wordBufferC[wordFileSize]=0;

3374 + fclose(file);

3375

3376 - if (U_FAILURE(status)) {

3377 - fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_erro rName(status));

3378 - exit(status);

3379 - }

3380 + //

3381 + // Look for a Unicode Signature (BOM) on the word file

3382 + //

3383 + int32_t signatureLength;

3384 + const char * wordSourceC = wordBufferC;

3385 + const char* encoding = ucnv_detectUnicodeSignature(

3386 + wordSourceC, wordFileSize, &signatureLength, &st atus);

3387 + if (U_FAILURE(status)) {

3388 + exit(status);

3389 + }

3390 + if(encoding!=NULL ){

3391 + wordSourceC += signatureLength;

3392 + wordFileSize -= signatureLength;

3393 + }

3394

3395 - // Now add the words. Words are non-space characters at the beginning of

3396 - // lines, and must be at least one UChar.

3397 - current = wordSourceU;

3398 - UChar *candidate = current;

3399 - uc = *current++;

3400 - int32_t length = 0;

3401 -

3402 - while (uc) {

3403 - while (uc && !u_isspace(uc)) {

3404 - ++length;

3405 - uc = *current++;

3406 + //

3407 + // Open a converter to take the rule file to UTF-16

3408 + //

3409 + UConverter* conv;

3410 + conv = ucnv_open(encoding, &status);

3411 + if (U_FAILURE(status)) {

3412 + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status ));

3413 + exit(status);

3414 }

3415 - if (length > 0) {

3416 - mtd->addWord(candidate, length, status);

3417 - if (U_FAILURE(status)) {

3418 - fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s \"\n",

3419 - u_errorName(status));

3420 - exit(status);

3421 +

3422 + //

3423 + // Convert the words to UChar.

3424 + // Preflight first to determine required buffer size.

3425 + //

3426 + uint32_t destCap = ucnv_toUChars(conv,

3427 + NULL, // dest,

3428 + 0, // destCapacity,

3429 + wordSourceC,

3430 + wordFileSize,

3431 + &status);

3432 + if (status != U_BUFFER_OVERFLOW_ERROR) {

3433 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st atus));

3434 + exit(status);

3435 + };

3436 +

3437 + status = U_ZERO_ERROR;

3438 + UChar *wordSourceU = new UChar[destCap+1];

3439 + ucnv_toUChars(conv,

3440 + wordSourceU, // dest,

3441 + destCap+1,

3442 + wordSourceC,

3443 + wordFileSize,

3444 + &status);

3445 + if (U_FAILURE(status)) {

3446 + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(st atus));

3447 + exit(status);

3448 + };

3449 + ucnv_close(conv);

3450 +

3451 + // Get rid of the original file buffer

3452 + delete[] wordBufferC;

3453 +

3454 + // Create a MutableTrieDictionary, and loop through all the lines, inserting

3455 + // words.

3456 +

3457 + // First, pick a median character.

3458 + UChar *current = wordSourceU + (destCap/2);

3459 + UChar uc = *current++;

3460 + UnicodeSet breaks;

3461 + breaks.add(0x000A); // Line Feed

3462 + breaks.add(0x000D); // Carriage Return

3463 + breaks.add(0x2028); // Line Separator

3464 + breaks.add(0x2029); // Paragraph Separator

3465 +

3466 + do {

3467 + // Look for line break

3468 + while (uc && !breaks.contains(uc)) {

3469 + uc = *current++;

3470 + }

3471 + // Now skip to first non-line-break

3472 + while (uc && breaks.contains(uc)) {

3473 + uc = *current++;

3474 }

3475 }

3476 - // Find beginning of next line

3477 - while (uc && !breaks.contains(uc)) {

3478 - uc = *current++;

3479 + while (uc && (breaks.contains(uc) \|\| u_isspace(uc)));

3480 +

3481 + mtd = new MutableTrieDictionary(uc, status);

3482 +

3483 + if (U_FAILURE(status)) {

3484 + fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_ errorName(status));

3485 + exit(status);

3486 }

3487 - while (uc && breaks.contains(uc)) {

3488 - uc = *current++;

3489 +

3490 + // Now add the words. Words are non-space characters at the beginning of

3491 + // lines, and must be at least one UChar. If a word has an associated value,

3492 + // the value should follow the word on the same line after a tab character.

3493 + current = wordSourceU;

3494 + UChar *candidate = current;

3495 + uc = *current++;

3496 + int32_t length = 0;

3497 + int count = 0;

3498 +

3499 + while (uc) {

3500 + while (uc && !u_isspace(uc)) {

3501 + ++length;

3502 + uc = *current++;

3503 + }

3504 +

3505 + UnicodeString valueString;

3506 + UChar candidateValue;

3507 + if(uc == 0x0009){ //separator is a tab char, read in number after space

3508 + while (uc && u_isspace(uc)) {

3509 + uc = *current++;

3510 + }

3511 + while (uc && !u_isspace(uc)) {

3512 + valueString.append(uc);

3513 + uc = *current++;

3514 + }

3515 + }

3516 +

3517 + if (length > 0) {

3518 + count++;

3519 + if(valueString.length() > 0){

3520 + mtd->setValued(TRUE);

3521 +

3522 + uint32_t value = 0;

3523 + char* s = new char[valueString.length()];

3524 + valueString.extract(0,valueString.length(), s, valueString. length());

3525 + int n = sscanf(s, "%ud", &value);

3526 + U_ASSERT(n == 1);

3527 + U_ASSERT(value >= 0);

3528 + mtd->addWord(candidate, length, status, (uint16_t)value);

3529 + delete[] s;

3530 + } else {

3531 + mtd->addWord(candidate, length, status);

3532 + }

3533 +

3534 + if (U_FAILURE(status)) {

3535 + fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",

3536 + u_errorName(status), count);

3537 + exit(status);

3538 + }

3539 + }

3540 +

3541 + // Find beginning of next line

3542 + while (uc && !breaks.contains(uc)) {

3543 + uc = *current++;

3544 + }

3545 + // Find next non-line-breaking character

3546 + while (uc && breaks.contains(uc)) {

3547 + uc = *current++;

3548 + }

3549 + candidate = current-1;

3550 + length = 0;

3551 }

3552 - candidate = current-1;

3553 - length = 0;

3554 +

3555 + // Get rid of the Unicode text buffer

3556 + delete[] wordSourceU;

3557 }

3558

3559 - // Get rid of the Unicode text buffer

3560 - delete[] wordSourceU;

3561 -

3562 // Now, create a CompactTrieDictionary from the mutable dictionary

3563 CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);

3564 if (U_FAILURE(status)) {

3565 @@ -393,4 +440,3 @@

3566

3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

3568 }

3569 -

3570 --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800

3571 +++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800

3572 @@ -23,13 +23,13 @@

3573 ## Extra files to remove for 'make clean'

3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES)

3575

3576 -## Target information

3577 +## Target informationcd

3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)

3579

3580 ifneq ($(top_builddir),$(top_srcdir))

3581 CPPFLAGS += -I$(top_builddir)/common

3582 endif

3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil

3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n

3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

3586

3587 OBJECTS = genctd.o

OLD	NEW

« icu52/README.chromium ('K') | « icu52/patches/search_collation.patch ('k') | icu52/patches/si_value.undef.patch » ('j') | icu52/scripts/eucjp_gen.sh » ('J')