OLD | NEW |
(Empty) | |
| 1 /** |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2006-2008, International Business Machines Corporation * |
| 4 * and others. All Rights Reserved. * |
| 5 ******************************************************************************* |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #if !UCONFIG_NO_BREAK_ITERATION |
| 11 |
| 12 #include "triedict.h" |
| 13 #include "unicode/chariter.h" |
| 14 #include "unicode/uchriter.h" |
| 15 #include "unicode/strenum.h" |
| 16 #include "unicode/uenum.h" |
| 17 #include "unicode/udata.h" |
| 18 #include "cmemory.h" |
| 19 #include "udataswp.h" |
| 20 #include "uvector.h" |
| 21 #include "uvectr32.h" |
| 22 #include "uarrsort.h" |
| 23 |
| 24 //#define DEBUG_TRIE_DICT 1 |
| 25 |
| 26 #ifdef DEBUG_TRIE_DICT |
| 27 #include <sys/times.h> |
| 28 #include <limits.h> |
| 29 #include <stdio.h> |
| 30 #endif |
| 31 |
| 32 U_NAMESPACE_BEGIN |
| 33 |
| 34 /******************************************************************* |
| 35 * TrieWordDictionary |
| 36 */ |
| 37 |
| 38 TrieWordDictionary::TrieWordDictionary() { |
| 39 } |
| 40 |
| 41 TrieWordDictionary::~TrieWordDictionary() { |
| 42 } |
| 43 |
| 44 /******************************************************************* |
| 45 * MutableTrieDictionary |
| 46 */ |
| 47 |
| 48 // Node structure for the ternary, uncompressed trie |
| 49 struct TernaryNode : public UMemory { |
| 50 UChar ch; // UTF-16 code unit |
| 51 uint16_t flags; // Flag word |
| 52 TernaryNode *low; // Less-than link |
| 53 TernaryNode *equal; // Equal link |
| 54 TernaryNode *high; // Greater-than link |
| 55 |
| 56 TernaryNode(UChar uc); |
| 57 ~TernaryNode(); |
| 58 }; |
| 59 |
| 60 enum MutableTrieNodeFlags { |
| 61 kEndsWord = 0x0001 // This node marks the end of a valid word |
| 62 }; |
| 63 |
| 64 inline |
| 65 TernaryNode::TernaryNode(UChar uc) { |
| 66 ch = uc; |
| 67 flags = 0; |
| 68 low = NULL; |
| 69 equal = NULL; |
| 70 high = NULL; |
| 71 } |
| 72 |
| 73 // Not inline since it's recursive |
| 74 TernaryNode::~TernaryNode() { |
| 75 delete low; |
| 76 delete equal; |
| 77 delete high; |
| 78 } |
| 79 |
| 80 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status )
{ |
| 81 // Start the trie off with something. Having the root node already present |
| 82 // cuts a special case out of the search/insertion functions. |
| 83 // Making it a median character cuts the worse case for searches from |
| 84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something |
| 85 // that starts a word that is midway in the list. |
| 86 fTrie = new TernaryNode(median); |
| 87 if (fTrie == NULL) { |
| 88 status = U_MEMORY_ALLOCATION_ERROR; |
| 89 } |
| 90 fIter = utext_openUChars(NULL, NULL, 0, &status); |
| 91 if (U_SUCCESS(status) && fIter == NULL) { |
| 92 status = U_MEMORY_ALLOCATION_ERROR; |
| 93 } |
| 94 } |
| 95 |
| 96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { |
| 97 fTrie = NULL; |
| 98 fIter = utext_openUChars(NULL, NULL, 0, &status); |
| 99 if (U_SUCCESS(status) && fIter == NULL) { |
| 100 status = U_MEMORY_ALLOCATION_ERROR; |
| 101 } |
| 102 } |
| 103 |
| 104 MutableTrieDictionary::~MutableTrieDictionary() { |
| 105 delete fTrie; |
| 106 utext_close(fIter); |
| 107 } |
| 108 |
| 109 int32_t |
| 110 MutableTrieDictionary::search( UText *text, |
| 111 int32_t maxLength, |
| 112 int32_t *lengths, |
| 113 int &count, |
| 114 int limit, |
| 115 TernaryNode *&parent, |
| 116 UBool &pMatched ) const { |
| 117 // TODO: current implementation works in UTF-16 space |
| 118 const TernaryNode *up = NULL; |
| 119 const TernaryNode *p = fTrie; |
| 120 int mycount = 0; |
| 121 pMatched = TRUE; |
| 122 int i; |
| 123 |
| 124 UChar uc = utext_current32(text); |
| 125 for (i = 0; i < maxLength && p != NULL; ++i) { |
| 126 while (p != NULL) { |
| 127 if (uc < p->ch) { |
| 128 up = p; |
| 129 p = p->low; |
| 130 } |
| 131 else if (uc == p->ch) { |
| 132 break; |
| 133 } |
| 134 else { |
| 135 up = p; |
| 136 p = p->high; |
| 137 } |
| 138 } |
| 139 if (p == NULL) { |
| 140 pMatched = FALSE; |
| 141 break; |
| 142 } |
| 143 // Must be equal to get here |
| 144 if (limit > 0 && (p->flags & kEndsWord)) { |
| 145 lengths[mycount++] = i+1; |
| 146 --limit; |
| 147 } |
| 148 up = p; |
| 149 p = p->equal; |
| 150 uc = utext_next32(text); |
| 151 uc = utext_current32(text); |
| 152 } |
| 153 |
| 154 // Note that there is no way to reach here with up == 0 unless |
| 155 // maxLength is 0 coming in. |
| 156 parent = (TernaryNode *)up; |
| 157 count = mycount; |
| 158 return i; |
| 159 } |
| 160 |
| 161 void |
| 162 MutableTrieDictionary::addWord( const UChar *word, |
| 163 int32_t length, |
| 164 UErrorCode &status ) { |
| 165 #if 0 |
| 166 if (length <= 0) { |
| 167 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 168 return; |
| 169 } |
| 170 #endif |
| 171 TernaryNode *parent; |
| 172 UBool pMatched; |
| 173 int count; |
| 174 fIter = utext_openUChars(fIter, word, length, &status); |
| 175 |
| 176 int matched; |
| 177 matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
| 178 |
| 179 while (matched++ < length) { |
| 180 UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? |
| 181 U_ASSERT(uc != U_SENTINEL); |
| 182 TernaryNode *newNode = new TernaryNode(uc); |
| 183 if (newNode == NULL) { |
| 184 status = U_MEMORY_ALLOCATION_ERROR; |
| 185 return; |
| 186 } |
| 187 if (pMatched) { |
| 188 parent->equal = newNode; |
| 189 } |
| 190 else { |
| 191 pMatched = TRUE; |
| 192 if (uc < parent->ch) { |
| 193 parent->low = newNode; |
| 194 } |
| 195 else { |
| 196 parent->high = newNode; |
| 197 } |
| 198 } |
| 199 parent = newNode; |
| 200 } |
| 201 |
| 202 parent->flags |= kEndsWord; |
| 203 } |
| 204 |
| 205 #if 0 |
| 206 void |
| 207 MutableTrieDictionary::addWords( UEnumeration *words, |
| 208 UErrorCode &status ) { |
| 209 int32_t length; |
| 210 const UChar *word; |
| 211 while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { |
| 212 addWord(word, length, status); |
| 213 } |
| 214 } |
| 215 #endif |
| 216 |
| 217 int32_t |
| 218 MutableTrieDictionary::matches( UText *text, |
| 219 int32_t maxLength, |
| 220 int32_t *lengths, |
| 221 int &count, |
| 222 int limit ) const { |
| 223 TernaryNode *parent; |
| 224 UBool pMatched; |
| 225 return search(text, maxLength, lengths, count, limit, parent, pMatched); |
| 226 } |
| 227 |
| 228 // Implementation of iteration for MutableTrieDictionary |
| 229 class MutableTrieEnumeration : public StringEnumeration { |
| 230 private: |
| 231 UStack fNodeStack; // Stack of nodes to process |
| 232 UVector32 fBranchStack; // Stack of which branch we are working on |
| 233 TernaryNode *fRoot; // Root node |
| 234 enum StackBranch { |
| 235 kLessThan, |
| 236 kEqual, |
| 237 kGreaterThan, |
| 238 kDone |
| 239 }; |
| 240 |
| 241 public: |
| 242 static UClassID U_EXPORT2 getStaticClassID(void); |
| 243 virtual UClassID getDynamicClassID(void) const; |
| 244 public: |
| 245 MutableTrieEnumeration(TernaryNode *root, UErrorCode &status) |
| 246 : fNodeStack(status), fBranchStack(status) { |
| 247 fRoot = root; |
| 248 fNodeStack.push(root, status); |
| 249 fBranchStack.push(kLessThan, status); |
| 250 unistr.remove(); |
| 251 } |
| 252 |
| 253 virtual ~MutableTrieEnumeration() { |
| 254 } |
| 255 |
| 256 virtual StringEnumeration *clone() const { |
| 257 UErrorCode status = U_ZERO_ERROR; |
| 258 return new MutableTrieEnumeration(fRoot, status); |
| 259 } |
| 260 |
| 261 virtual const UnicodeString *snext(UErrorCode &status) { |
| 262 if (fNodeStack.empty() || U_FAILURE(status)) { |
| 263 return NULL; |
| 264 } |
| 265 TernaryNode *node = (TernaryNode *) fNodeStack.peek(); |
| 266 StackBranch where = (StackBranch) fBranchStack.peeki(); |
| 267 while (!fNodeStack.empty() && U_SUCCESS(status)) { |
| 268 UBool emit; |
| 269 UBool equal; |
| 270 |
| 271 switch (where) { |
| 272 case kLessThan: |
| 273 if (node->low != NULL) { |
| 274 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); |
| 275 node = (TernaryNode *) fNodeStack.push(node->low, status); |
| 276 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 277 break; |
| 278 } |
| 279 case kEqual: |
| 280 emit = (node->flags & kEndsWord) != 0; |
| 281 equal = (node->equal != NULL); |
| 282 // If this node should be part of the next emitted string, appen
d |
| 283 // the UChar to the string, and make sure we pop it when we come |
| 284 // back to this node. The character should only be in the string |
| 285 // for as long as we're traversing the equal subtree of this nod
e |
| 286 if (equal || emit) { |
| 287 unistr.append(node->ch); |
| 288 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-
1); |
| 289 } |
| 290 if (equal) { |
| 291 node = (TernaryNode *) fNodeStack.push(node->equal, status); |
| 292 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 293 } |
| 294 if (emit) { |
| 295 return &unistr; |
| 296 } |
| 297 if (equal) { |
| 298 break; |
| 299 } |
| 300 case kGreaterThan: |
| 301 // If this node's character is in the string, remove it. |
| 302 if (node->equal != NULL || (node->flags & kEndsWord)) { |
| 303 unistr.truncate(unistr.length()-1); |
| 304 } |
| 305 if (node->high != NULL) { |
| 306 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); |
| 307 node = (TernaryNode *) fNodeStack.push(node->high, status); |
| 308 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 309 break; |
| 310 } |
| 311 case kDone: |
| 312 fNodeStack.pop(); |
| 313 fBranchStack.popi(); |
| 314 node = (TernaryNode *) fNodeStack.peek(); |
| 315 where = (StackBranch) fBranchStack.peeki(); |
| 316 break; |
| 317 default: |
| 318 return NULL; |
| 319 } |
| 320 } |
| 321 return NULL; |
| 322 } |
| 323 |
| 324 // Very expensive, but this should never be used. |
| 325 virtual int32_t count(UErrorCode &status) const { |
| 326 MutableTrieEnumeration counter(fRoot, status); |
| 327 int32_t result = 0; |
| 328 while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
| 329 ++result; |
| 330 } |
| 331 return result; |
| 332 } |
| 333 |
| 334 virtual void reset(UErrorCode &status) { |
| 335 fNodeStack.removeAllElements(); |
| 336 fBranchStack.removeAllElements(); |
| 337 fNodeStack.push(fRoot, status); |
| 338 fBranchStack.push(kLessThan, status); |
| 339 unistr.remove(); |
| 340 } |
| 341 }; |
| 342 |
| 343 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration) |
| 344 |
| 345 StringEnumeration * |
| 346 MutableTrieDictionary::openWords( UErrorCode &status ) const { |
| 347 if (U_FAILURE(status)) { |
| 348 return NULL; |
| 349 } |
| 350 return new MutableTrieEnumeration(fTrie, status); |
| 351 } |
| 352 |
| 353 /******************************************************************* |
| 354 * CompactTrieDictionary |
| 355 */ |
| 356 |
| 357 struct CompactTrieHeader { |
| 358 uint32_t size; // Size of the data in bytes |
| 359 uint32_t magic; // Magic number (including version) |
| 360 uint16_t nodeCount; // Number of entries in offsets[] |
| 361 uint16_t root; // Node number of the root node |
| 362 uint32_t offsets[1]; // Offsets to nodes from start of data |
| 363 }; |
| 364 |
| 365 // Note that to avoid platform-specific alignment issues, all members of the nod
e |
| 366 // structures should be the same size, or should contain explicit padding to |
| 367 // natural alignment boundaries. |
| 368 |
| 369 // We can't use a bitfield for the flags+count field, because the layout of thos
e |
| 370 // is not portable. 12 bits of count allows for up to 4096 entries in a node. |
| 371 struct CompactTrieNode { |
| 372 uint16_t flagscount; // Count of sub-entries, plus flags |
| 373 }; |
| 374 |
| 375 enum CompactTrieNodeFlags { |
| 376 kVerticalNode = 0x1000, // This is a vertical node |
| 377 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word |
| 378 kReservedFlag1 = 0x4000, |
| 379 kReservedFlag2 = 0x8000, |
| 380 kCountMask = 0x0FFF, // The count portion of flagscount |
| 381 kFlagMask = 0xF000 // The flags portion of flagscount |
| 382 }; |
| 383 |
| 384 // The two node types are distinguished by the kVerticalNode flag. |
| 385 |
| 386 struct CompactTrieHorizontalEntry { |
| 387 uint16_t ch; // UChar |
| 388 uint16_t equal; // Equal link node index |
| 389 }; |
| 390 |
| 391 // We don't use inheritance here because C++ does not guarantee that the |
| 392 // base class comes first in memory!! |
| 393 |
| 394 struct CompactTrieHorizontalNode { |
| 395 uint16_t flagscount; // Count of sub-entries, plus flags |
| 396 CompactTrieHorizontalEntry entries[1]; |
| 397 }; |
| 398 |
| 399 struct CompactTrieVerticalNode { |
| 400 uint16_t flagscount; // Count of sub-entries, plus flags |
| 401 uint16_t equal; // Equal link node index |
| 402 uint16_t chars[1]; // Code units |
| 403 }; |
| 404 |
| 405 // {'Dic', 1}, version 1 |
| 406 #define COMPACT_TRIE_MAGIC_1 0x44696301 |
| 407 |
| 408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
| 409 UErrorCode &status ) |
| 410 : fUData(dataObj) |
| 411 { |
| 412 fData = (const CompactTrieHeader *) udata_getMemory(dataObj); |
| 413 fOwnData = FALSE; |
| 414 if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
| 415 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 416 fData = NULL; |
| 417 } |
| 418 } |
| 419 CompactTrieDictionary::CompactTrieDictionary( const void *data, |
| 420 UErrorCode &status ) |
| 421 : fUData(NULL) |
| 422 { |
| 423 fData = (const CompactTrieHeader *) data; |
| 424 fOwnData = FALSE; |
| 425 if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
| 426 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 427 fData = NULL; |
| 428 } |
| 429 } |
| 430 |
| 431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, |
| 432 UErrorCode &status ) |
| 433 : fUData(NULL) |
| 434 { |
| 435 fData = compactMutableTrieDictionary(dict, status); |
| 436 fOwnData = !U_FAILURE(status); |
| 437 } |
| 438 |
| 439 CompactTrieDictionary::~CompactTrieDictionary() { |
| 440 if (fOwnData) { |
| 441 uprv_free((void *)fData); |
| 442 } |
| 443 if (fUData) { |
| 444 udata_close(fUData); |
| 445 } |
| 446 } |
| 447 |
| 448 uint32_t |
| 449 CompactTrieDictionary::dataSize() const { |
| 450 return fData->size; |
| 451 } |
| 452 |
| 453 const void * |
| 454 CompactTrieDictionary::data() const { |
| 455 return fData; |
| 456 } |
| 457 |
| 458 // This function finds the address of a node for us, given its node ID |
| 459 static inline const CompactTrieNode * |
| 460 getCompactNode(const CompactTrieHeader *header, uint16_t node) { |
| 461 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[n
ode]); |
| 462 } |
| 463 |
| 464 int32_t |
| 465 CompactTrieDictionary::matches( UText *text, |
| 466 int32_t maxLength, |
| 467 int32_t *lengths, |
| 468 int &count, |
| 469 int limit ) const { |
| 470 // TODO: current implementation works in UTF-16 space |
| 471 const CompactTrieNode *node = getCompactNode(fData, fData->root); |
| 472 int mycount = 0; |
| 473 |
| 474 UChar uc = utext_current32(text); |
| 475 int i = 0; |
| 476 |
| 477 while (node != NULL) { |
| 478 // Check if the node we just exited ends a word |
| 479 if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
| 480 lengths[mycount++] = i; |
| 481 --limit; |
| 482 } |
| 483 // Check that we haven't exceeded the maximum number of input characters
. |
| 484 // We have to do that here rather than in the while condition so that |
| 485 // we can check for ending a word, above. |
| 486 if (i >= maxLength) { |
| 487 break; |
| 488 } |
| 489 |
| 490 int nodeCount = (node->flagscount & kCountMask); |
| 491 if (nodeCount == 0) { |
| 492 // Special terminal node; return now |
| 493 break; |
| 494 } |
| 495 if (node->flagscount & kVerticalNode) { |
| 496 // Vertical node; check all the characters in it |
| 497 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
| 498 for (int j = 0; j < nodeCount && i < maxLength; ++j) { |
| 499 if (uc != vnode->chars[j]) { |
| 500 // We hit a non-equal character; return |
| 501 goto exit; |
| 502 } |
| 503 utext_next32(text); |
| 504 uc = utext_current32(text); |
| 505 ++i; |
| 506 } |
| 507 // To get here we must have come through the whole list successfully
; |
| 508 // go on to the next node. Note that a word cannot end in the middle |
| 509 // of a vertical node. |
| 510 node = getCompactNode(fData, vnode->equal); |
| 511 } |
| 512 else { |
| 513 // Horizontal node; do binary search |
| 514 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
| 515 int low = 0; |
| 516 int high = nodeCount-1; |
| 517 int middle; |
| 518 node = NULL; // If we don't find a match, we'll fall out of the l
oop |
| 519 while (high >= low) { |
| 520 middle = (high+low)/2; |
| 521 if (uc == hnode->entries[middle].ch) { |
| 522 // We hit a match; get the next node and next character |
| 523 node = getCompactNode(fData, hnode->entries[middle].equal); |
| 524 utext_next32(text); |
| 525 uc = utext_current32(text); |
| 526 ++i; |
| 527 break; |
| 528 } |
| 529 else if (uc < hnode->entries[middle].ch) { |
| 530 high = middle-1; |
| 531 } |
| 532 else { |
| 533 low = middle+1; |
| 534 } |
| 535 } |
| 536 } |
| 537 } |
| 538 exit: |
| 539 count = mycount; |
| 540 return i; |
| 541 } |
| 542 |
| 543 // Implementation of iteration for CompactTrieDictionary |
| 544 class CompactTrieEnumeration : public StringEnumeration { |
| 545 private: |
| 546 UVector32 fNodeStack; // Stack of nodes to process |
| 547 UVector32 fIndexStack; // Stack of where in node we are |
| 548 const CompactTrieHeader *fHeader; // Trie data |
| 549 |
| 550 public: |
| 551 static UClassID U_EXPORT2 getStaticClassID(void); |
| 552 virtual UClassID getDynamicClassID(void) const; |
| 553 public: |
| 554 CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) |
| 555 : fNodeStack(status), fIndexStack(status) { |
| 556 fHeader = header; |
| 557 fNodeStack.push(header->root, status); |
| 558 fIndexStack.push(0, status); |
| 559 unistr.remove(); |
| 560 } |
| 561 |
| 562 virtual ~CompactTrieEnumeration() { |
| 563 } |
| 564 |
| 565 virtual StringEnumeration *clone() const { |
| 566 UErrorCode status = U_ZERO_ERROR; |
| 567 return new CompactTrieEnumeration(fHeader, status); |
| 568 } |
| 569 |
| 570 virtual const UnicodeString * snext(UErrorCode &status); |
| 571 |
| 572 // Very expensive, but this should never be used. |
| 573 virtual int32_t count(UErrorCode &status) const { |
| 574 CompactTrieEnumeration counter(fHeader, status); |
| 575 int32_t result = 0; |
| 576 while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
| 577 ++result; |
| 578 } |
| 579 return result; |
| 580 } |
| 581 |
| 582 virtual void reset(UErrorCode &status) { |
| 583 fNodeStack.removeAllElements(); |
| 584 fIndexStack.removeAllElements(); |
| 585 fNodeStack.push(fHeader->root, status); |
| 586 fIndexStack.push(0, status); |
| 587 unistr.remove(); |
| 588 } |
| 589 }; |
| 590 |
| 591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) |
| 592 |
| 593 const UnicodeString * |
| 594 CompactTrieEnumeration::snext(UErrorCode &status) { |
| 595 if (fNodeStack.empty() || U_FAILURE(status)) { |
| 596 return NULL; |
| 597 } |
| 598 const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 599 int where = fIndexStack.peeki(); |
| 600 while (!fNodeStack.empty() && U_SUCCESS(status)) { |
| 601 int nodeCount = (node->flagscount & kCountMask); |
| 602 UBool goingDown = FALSE; |
| 603 if (nodeCount == 0) { |
| 604 // Terminal node; go up immediately |
| 605 fNodeStack.popi(); |
| 606 fIndexStack.popi(); |
| 607 node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 608 where = fIndexStack.peeki(); |
| 609 } |
| 610 else if (node->flagscount & kVerticalNode) { |
| 611 // Vertical node |
| 612 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
| 613 if (where == 0) { |
| 614 // Going down |
| 615 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); |
| 616 fIndexStack.setElementAt(1, fIndexStack.size()-1); |
| 617 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, sta
tus)); |
| 618 where = fIndexStack.push(0, status); |
| 619 goingDown = TRUE; |
| 620 } |
| 621 else { |
| 622 // Going up |
| 623 unistr.truncate(unistr.length()-nodeCount); |
| 624 fNodeStack.popi(); |
| 625 fIndexStack.popi(); |
| 626 node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 627 where = fIndexStack.peeki(); |
| 628 } |
| 629 } |
| 630 else { |
| 631 // Horizontal node |
| 632 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
| 633 if (where > 0) { |
| 634 // Pop previous char |
| 635 unistr.truncate(unistr.length()-1); |
| 636 } |
| 637 if (where < nodeCount) { |
| 638 // Push on next node |
| 639 unistr.append((UChar)hnode->entries[where].ch); |
| 640 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
| 641 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[wh
ere].equal, status)); |
| 642 where = fIndexStack.push(0, status); |
| 643 goingDown = TRUE; |
| 644 } |
| 645 else { |
| 646 // Going up |
| 647 fNodeStack.popi(); |
| 648 fIndexStack.popi(); |
| 649 node = getCompactNode(fHeader, fNodeStack.peeki()); |
| 650 where = fIndexStack.peeki(); |
| 651 } |
| 652 } |
| 653 // Check if the parent of the node we've just gone down to ends a |
| 654 // word. If so, return it. |
| 655 if (goingDown && (node->flagscount & kParentEndsWord)) { |
| 656 return &unistr; |
| 657 } |
| 658 } |
| 659 return NULL; |
| 660 } |
| 661 |
| 662 StringEnumeration * |
| 663 CompactTrieDictionary::openWords( UErrorCode &status ) const { |
| 664 if (U_FAILURE(status)) { |
| 665 return NULL; |
| 666 } |
| 667 return new CompactTrieEnumeration(fData, status); |
| 668 } |
| 669 |
| 670 // |
| 671 // Below here is all code related to converting a ternary trie to a compact trie |
| 672 // and back again |
| 673 // |
| 674 |
| 675 // Helper classes to construct the compact trie |
| 676 class BuildCompactTrieNode: public UMemory { |
| 677 public: |
| 678 UBool fParentEndsWord; |
| 679 UBool fVertical; |
| 680 UBool fHasDuplicate; |
| 681 int32_t fNodeID; |
| 682 UnicodeString fChars; |
| 683 |
| 684 public: |
| 685 BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UE
rrorCode &status) { |
| 686 fParentEndsWord = parentEndsWord; |
| 687 fHasDuplicate = FALSE; |
| 688 fVertical = vertical; |
| 689 fNodeID = nodes.size(); |
| 690 nodes.push(this, status); |
| 691 } |
| 692 |
| 693 virtual ~BuildCompactTrieNode() { |
| 694 } |
| 695 |
| 696 virtual uint32_t size() { |
| 697 return sizeof(uint16_t); |
| 698 } |
| 699 |
| 700 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran
slate*/) { |
| 701 // Write flag/count |
| 702 *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) |
| 703 | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWo
rd : 0 ); |
| 704 offset += sizeof(uint16_t); |
| 705 } |
| 706 }; |
| 707 |
| 708 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { |
| 709 public: |
| 710 UStack fLinks; |
| 711 |
| 712 public: |
| 713 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo
de &status) |
| 714 : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(sta
tus) { |
| 715 } |
| 716 |
| 717 virtual ~BuildCompactTrieHorizontalNode() { |
| 718 } |
| 719 |
| 720 virtual uint32_t size() { |
| 721 return offsetof(CompactTrieHorizontalNode,entries) + |
| 722 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
| 723 } |
| 724 |
| 725 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 726 BuildCompactTrieNode::write(bytes, offset, translate); |
| 727 int32_t count = fChars.length(); |
| 728 for (int32_t i = 0; i < count; ++i) { |
| 729 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b
ytes+offset); |
| 730 entry->ch = fChars[i]; |
| 731 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[
i])->fNodeID); |
| 732 #ifdef DEBUG_TRIE_DICT |
| 733 if (entry->equal == 0) { |
| 734 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps
to physical node zero\n", |
| 735 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| 736 } |
| 737 #endif |
| 738 offset += sizeof(CompactTrieHorizontalEntry); |
| 739 } |
| 740 } |
| 741 |
| 742 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { |
| 743 fChars.append(ch); |
| 744 fLinks.push(link, status); |
| 745 } |
| 746 }; |
| 747 |
| 748 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { |
| 749 public: |
| 750 BuildCompactTrieNode *fEqual; |
| 751 |
| 752 public: |
| 753 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode
&status) |
| 754 : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { |
| 755 fEqual = NULL; |
| 756 } |
| 757 |
| 758 virtual ~BuildCompactTrieVerticalNode() { |
| 759 } |
| 760 |
| 761 virtual uint32_t size() { |
| 762 return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof
(uint16_t)); |
| 763 } |
| 764 |
| 765 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 766 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset
); |
| 767 BuildCompactTrieNode::write(bytes, offset, translate); |
| 768 node->equal = translate.elementAti(fEqual->fNodeID); |
| 769 offset += sizeof(node->equal); |
| 770 #ifdef DEBUG_TRIE_DICT |
| 771 if (node->equal == 0) { |
| 772 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi
cal node zero\n", |
| 773 fEqual->fNodeID); |
| 774 } |
| 775 #endif |
| 776 fChars.extract(0, fChars.length(), (UChar *)node->chars); |
| 777 offset += sizeof(uint16_t)*fChars.length(); |
| 778 } |
| 779 |
| 780 void addChar(UChar ch) { |
| 781 fChars.append(ch); |
| 782 } |
| 783 |
| 784 void setLink(BuildCompactTrieNode *node) { |
| 785 fEqual = node; |
| 786 } |
| 787 }; |
| 788 |
| 789 // Forward declaration |
| 790 static void walkHorizontal(const TernaryNode *node, |
| 791 BuildCompactTrieHorizontalNode *building, |
| 792 UStack &nodes, |
| 793 UErrorCode &status); |
| 794 |
| 795 // Convert one node. Uses recursion. |
| 796 |
| 797 static BuildCompactTrieNode * |
| 798 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UEr
rorCode &status) { |
| 799 if (U_FAILURE(status)) { |
| 800 return NULL; |
| 801 } |
| 802 BuildCompactTrieNode *result = NULL; |
| 803 UBool horizontal = (node->low != NULL || node->high != NULL); |
| 804 if (horizontal) { |
| 805 BuildCompactTrieHorizontalNode *hResult = |
| 806 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status
); |
| 807 if (hResult == NULL) { |
| 808 status = U_MEMORY_ALLOCATION_ERROR; |
| 809 return NULL; |
| 810 } |
| 811 if (U_SUCCESS(status)) { |
| 812 walkHorizontal(node, hResult, nodes, status); |
| 813 result = hResult; |
| 814 } |
| 815 } |
| 816 else { |
| 817 BuildCompactTrieVerticalNode *vResult = |
| 818 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); |
| 819 if (vResult == NULL) { |
| 820 status = U_MEMORY_ALLOCATION_ERROR; |
| 821 } |
| 822 else if (U_SUCCESS(status)) { |
| 823 UBool endsWord = FALSE; |
| 824 // Take up nodes until we end a word, or hit a node with < or > link
s |
| 825 do { |
| 826 vResult->addChar(node->ch); |
| 827 endsWord = (node->flags & kEndsWord) != 0; |
| 828 node = node->equal; |
| 829 } |
| 830 while(node != NULL && !endsWord && node->low == NULL && node->high =
= NULL); |
| 831 if (node == NULL) { |
| 832 if (!endsWord) { |
| 833 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie |
| 834 } |
| 835 else { |
| 836 vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
| 837 } |
| 838 } |
| 839 else { |
| 840 vResult->setLink(compactOneNode(node, endsWord, nodes, status)); |
| 841 } |
| 842 result = vResult; |
| 843 } |
| 844 } |
| 845 return result; |
| 846 } |
| 847 |
| 848 // Walk the set of peers at the same level, to build a horizontal node. |
| 849 // Uses recursion. |
| 850 |
| 851 static void walkHorizontal(const TernaryNode *node, |
| 852 BuildCompactTrieHorizontalNode *building, |
| 853 UStack &nodes, |
| 854 UErrorCode &status) { |
| 855 while (U_SUCCESS(status) && node != NULL) { |
| 856 if (node->low != NULL) { |
| 857 walkHorizontal(node->low, building, nodes, status); |
| 858 } |
| 859 BuildCompactTrieNode *link = NULL; |
| 860 if (node->equal != NULL) { |
| 861 link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, n
odes, status); |
| 862 } |
| 863 else if (node->flags & kEndsWord) { |
| 864 link = (BuildCompactTrieNode *)nodes[1]; |
| 865 } |
| 866 if (U_SUCCESS(status) && link != NULL) { |
| 867 building->addNode(node->ch, link, status); |
| 868 } |
| 869 // Tail recurse manually instead of leaving it to the compiler. |
| 870 //if (node->high != NULL) { |
| 871 // walkHorizontal(node->high, building, nodes, status); |
| 872 //} |
| 873 node = node->high; |
| 874 } |
| 875 } |
| 876 |
| 877 U_NAMESPACE_END |
| 878 U_NAMESPACE_USE |
| 879 U_CDECL_BEGIN |
| 880 static int32_t U_CALLCONV |
| 881 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ |
| 882 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
| 883 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
| 884 // Check for comparing a node to itself, to avoid spurious duplicates |
| 885 if (left == right) { |
| 886 return 0; |
| 887 } |
| 888 // Most significant is type of node. Can never coalesce. |
| 889 if (left->fVertical != right->fVertical) { |
| 890 return left->fVertical - right->fVertical; |
| 891 } |
| 892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
| 893 if (left->fParentEndsWord != right->fParentEndsWord) { |
| 894 return left->fParentEndsWord - right->fParentEndsWord; |
| 895 } |
| 896 // Next, the string. If that differs, we can never coalesce. |
| 897 int32_t result = left->fChars.compare(right->fChars); |
| 898 if (result != 0) { |
| 899 return result; |
| 900 } |
| 901 // We know they're both the same node type, so branch for the two cases. |
| 902 if (left->fVertical) { |
| 903 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
| 904 - ((BuildCompactTrieVerticalNode *)right)->fEqual->f
NodeID; |
| 905 } |
| 906 else { |
| 907 // We need to compare the links vectors. They should be the |
| 908 // same size because the strings were equal. |
| 909 // We compare the node IDs instead of the pointers, to handle |
| 910 // coalesced nodes. |
| 911 BuildCompactTrieHorizontalNode *hleft, *hright; |
| 912 hleft = (BuildCompactTrieHorizontalNode *)left; |
| 913 hright = (BuildCompactTrieHorizontalNode *)right; |
| 914 int32_t count = hleft->fLinks.size(); |
| 915 for (int32_t i = 0; i < count && result == 0; ++i) { |
| 916 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
| 917 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| 918 } |
| 919 } |
| 920 // If they are equal to each other, mark them (speeds coalescing) |
| 921 if (result == 0) { |
| 922 left->fHasDuplicate = TRUE; |
| 923 right->fHasDuplicate = TRUE; |
| 924 } |
| 925 return result; |
| 926 } |
| 927 U_CDECL_END |
| 928 U_NAMESPACE_BEGIN |
| 929 |
| 930 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) { |
| 931 // We sort the array of nodes to place duplicates next to each other |
| 932 if (U_FAILURE(status)) { |
| 933 return; |
| 934 } |
| 935 int32_t size = nodes.size(); |
| 936 void **array = (void **)uprv_malloc(sizeof(void *)*size); |
| 937 if (array == NULL) { |
| 938 status = U_MEMORY_ALLOCATION_ERROR; |
| 939 return; |
| 940 } |
| 941 (void) nodes.toArray(array); |
| 942 |
| 943 // Now repeatedly identify duplicates until there are no more |
| 944 int32_t dupes = 0; |
| 945 long passCount = 0; |
| 946 #ifdef DEBUG_TRIE_DICT |
| 947 long totalDupes = 0; |
| 948 #endif |
| 949 do { |
| 950 BuildCompactTrieNode *node; |
| 951 BuildCompactTrieNode *first = NULL; |
| 952 BuildCompactTrieNode **p; |
| 953 BuildCompactTrieNode **pFirst = NULL; |
| 954 int32_t counter = size - 2; |
| 955 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first |
| 956 // pass for speed. For the second and subsequent passes, we use stable |
| 957 // (insertion) sort for two reasons: |
| 958 // 1. The array is already mostly ordered, so we get better performance. |
| 959 // 2. The way we find one and only one instance of a set of duplicates i
s to |
| 960 // check that the node ID equals the array index. If we used an unsta
ble |
| 961 // sort for the second or later passes, it's possible that none of th
e |
| 962 // duplicates would wind up with a node ID equal to its array index. |
| 963 // The sort stability guarantees that, because as we coalesce more an
d |
| 964 // more groups, the first element of the resultant group will be one
of |
| 965 // the first elements of the groups being coalesced. |
| 966 // To use quicksort for the second and subsequent passes, we would have
to |
| 967 // find the minimum of the node numbers in a group, and set all the node
s |
| 968 // in the group to that node number. |
| 969 uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL,
(passCount > 0), &status); |
| 970 dupes = 0; |
| 971 for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p
) { |
| 972 node = *p; |
| 973 if (node->fHasDuplicate) { |
| 974 if (first == NULL) { |
| 975 first = node; |
| 976 pFirst = p; |
| 977 } |
| 978 else if (_sortBuildNodes(NULL, pFirst, p) != 0) { |
| 979 // Starting a new run of dupes |
| 980 first = node; |
| 981 pFirst = p; |
| 982 } |
| 983 else if (node->fNodeID != first->fNodeID) { |
| 984 // Slave one to the other, note duplicate |
| 985 node->fNodeID = first->fNodeID; |
| 986 dupes += 1; |
| 987 } |
| 988 } |
| 989 else { |
| 990 // This node has no dupes |
| 991 first = NULL; |
| 992 pFirst = NULL; |
| 993 } |
| 994 } |
| 995 passCount += 1; |
| 996 #ifdef DEBUG_TRIE_DICT |
| 997 totalDupes += dupes; |
| 998 fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", pa
ssCount, dupes); |
| 999 #endif |
| 1000 } |
| 1001 while (dupes > 0); |
| 1002 #ifdef DEBUG_TRIE_DICT |
| 1003 fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n",
totalDupes, passCount); |
| 1004 #endif |
| 1005 |
| 1006 // We no longer need the temporary array, as the nodes have all been marked
appropriately. |
| 1007 uprv_free(array); |
| 1008 } |
| 1009 |
| 1010 U_NAMESPACE_END |
| 1011 U_CDECL_BEGIN |
| 1012 static void U_CALLCONV _deleteBuildNode(void *obj) { |
| 1013 delete (BuildCompactTrieNode *) obj; |
| 1014 } |
| 1015 U_CDECL_END |
| 1016 U_NAMESPACE_BEGIN |
| 1017 |
| 1018 CompactTrieHeader * |
| 1019 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary
&dict, |
| 1020 UErrorCode &status ) { |
| 1021 if (U_FAILURE(status)) { |
| 1022 return NULL; |
| 1023 } |
| 1024 #ifdef DEBUG_TRIE_DICT |
| 1025 struct tms timing; |
| 1026 struct tms previous; |
| 1027 (void) ::times(&previous); |
| 1028 #endif |
| 1029 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes |
| 1030 |
| 1031 // Add node 0, used as the NULL pointer/sentinel. |
| 1032 nodes.addElement((int32_t)0, status); |
| 1033 |
| 1034 // Start by creating the special empty node we use to indicate that the pare
nt |
| 1035 // terminates a word. This must be node 1, because the builder assumes |
| 1036 // that. |
| 1037 if (U_FAILURE(status)) { |
| 1038 return NULL; |
| 1039 } |
| 1040 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes
, status); |
| 1041 if (terminal == NULL) { |
| 1042 status = U_MEMORY_ALLOCATION_ERROR; |
| 1043 } |
| 1044 |
| 1045 // This call does all the work of building the new trie structure. The root |
| 1046 // will be node 2. |
| 1047 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status
); |
| 1048 #ifdef DEBUG_TRIE_DICT |
| 1049 (void) ::times(&timing); |
| 1050 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
| 1051 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1052 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1053 previous = timing; |
| 1054 #endif |
| 1055 |
| 1056 // Now coalesce all duplicate nodes. |
| 1057 coalesceDuplicates(nodes, status); |
| 1058 #ifdef DEBUG_TRIE_DICT |
| 1059 (void) ::times(&timing); |
| 1060 fprintf(stderr, "Duplicates coalesced, time user %f system %f\n", |
| 1061 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1062 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1063 previous = timing; |
| 1064 #endif |
| 1065 |
| 1066 // Next, build the output trie. |
| 1067 // First we compute all the sizes and build the node ID translation table. |
| 1068 uint32_t totalSize = offsetof(CompactTrieHeader,offsets); |
| 1069 int32_t count = nodes.size(); |
| 1070 int32_t nodeCount = 1; // The sentinel node we already have |
| 1071 BuildCompactTrieNode *node; |
| 1072 int32_t i; |
| 1073 UVector32 translate(count, status); // Should be no growth needed after this |
| 1074 translate.push(0, status); // The sentinel node |
| 1075 |
| 1076 if (U_FAILURE(status)) { |
| 1077 return NULL; |
| 1078 } |
| 1079 |
| 1080 for (i = 1; i < count; ++i) { |
| 1081 node = (BuildCompactTrieNode *)nodes[i]; |
| 1082 if (node->fNodeID == i) { |
| 1083 // Only one node out of each duplicate set is used |
| 1084 if (i >= translate.size()) { |
| 1085 // Logically extend the mapping table |
| 1086 translate.setSize(i+1); |
| 1087 } |
| 1088 translate.setElementAt(nodeCount++, i); |
| 1089 totalSize += node->size(); |
| 1090 } |
| 1091 } |
| 1092 |
| 1093 // Check for overflowing 16 bits worth of nodes. |
| 1094 if (nodeCount > 0x10000) { |
| 1095 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1096 return NULL; |
| 1097 } |
| 1098 |
| 1099 // Add enough room for the offsets. |
| 1100 totalSize += nodeCount*sizeof(uint32_t); |
| 1101 #ifdef DEBUG_TRIE_DICT |
| 1102 (void) ::times(&timing); |
| 1103 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", |
| 1104 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1105 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1106 previous = timing; |
| 1107 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount,
totalSize); |
| 1108 #endif |
| 1109 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); |
| 1110 if (bytes == NULL) { |
| 1111 status = U_MEMORY_ALLOCATION_ERROR; |
| 1112 return NULL; |
| 1113 } |
| 1114 |
| 1115 CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
| 1116 header->size = totalSize; |
| 1117 header->nodeCount = nodeCount; |
| 1118 header->offsets[0] = 0; // Sentinel |
| 1119 header->root = translate.elementAti(root->fNodeID); |
| 1120 #ifdef DEBUG_TRIE_DICT |
| 1121 if (header->root == 0) { |
| 1122 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root
->fNodeID); |
| 1123 } |
| 1124 #endif |
| 1125 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint
32_t)); |
| 1126 nodeCount = 1; |
| 1127 // Now write the data |
| 1128 for (i = 1; i < count; ++i) { |
| 1129 node = (BuildCompactTrieNode *)nodes[i]; |
| 1130 if (node->fNodeID == i) { |
| 1131 header->offsets[nodeCount++] = offset; |
| 1132 node->write(bytes, offset, translate); |
| 1133 } |
| 1134 } |
| 1135 #ifdef DEBUG_TRIE_DICT |
| 1136 (void) ::times(&timing); |
| 1137 fprintf(stderr, "Trie built, time user %f system %f\n", |
| 1138 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1139 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1140 previous = timing; |
| 1141 fprintf(stderr, "Final offset is %d\n", offset); |
| 1142 |
| 1143 // Collect statistics on node types and sizes |
| 1144 int hCount = 0; |
| 1145 int vCount = 0; |
| 1146 size_t hSize = 0; |
| 1147 size_t vSize = 0; |
| 1148 size_t hItemCount = 0; |
| 1149 size_t vItemCount = 0; |
| 1150 uint32_t previousOff = offset; |
| 1151 for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| 1152 const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
| 1153 if (node->flagscount & kVerticalNode) { |
| 1154 vCount += 1; |
| 1155 vItemCount += (node->flagscount & kCountMask); |
| 1156 vSize += previousOff-header->offsets[nodeIdx]; |
| 1157 } |
| 1158 else { |
| 1159 hCount += 1; |
| 1160 hItemCount += (node->flagscount & kCountMask); |
| 1161 hSize += previousOff-header->offsets[nodeIdx]; |
| 1162 } |
| 1163 previousOff = header->offsets[nodeIdx]; |
| 1164 } |
| 1165 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\
n", hCount, |
| 1166 (double)hSize/hCount, (double)hItemCount/hCount); |
| 1167 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n"
, vCount, |
| 1168 (double)vSize/vCount, (double)vItemCount/vCount); |
| 1169 #endif |
| 1170 |
| 1171 if (U_FAILURE(status)) { |
| 1172 uprv_free(bytes); |
| 1173 header = NULL; |
| 1174 } |
| 1175 else { |
| 1176 header->magic = COMPACT_TRIE_MAGIC_1; |
| 1177 } |
| 1178 return header; |
| 1179 } |
| 1180 |
| 1181 // Forward declaration |
| 1182 static TernaryNode * |
| 1183 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ); |
| 1184 |
| 1185 |
| 1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie |
| 1187 static TernaryNode * |
| 1188 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizon
talEntry *array, |
| 1189 int low, int high, UErrorCode &status ) { |
| 1190 if (U_FAILURE(status) || low > high) { |
| 1191 return NULL; |
| 1192 } |
| 1193 int middle = (low+high)/2; |
| 1194 TernaryNode *result = new TernaryNode(array[middle].ch); |
| 1195 if (result == NULL) { |
| 1196 status = U_MEMORY_ALLOCATION_ERROR; |
| 1197 return NULL; |
| 1198 } |
| 1199 const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); |
| 1200 if (equal->flagscount & kParentEndsWord) { |
| 1201 result->flags |= kEndsWord; |
| 1202 } |
| 1203 result->low = unpackHorizontalArray(header, array, low, middle-1, status); |
| 1204 result->high = unpackHorizontalArray(header, array, middle+1, high, status); |
| 1205 result->equal = unpackOneNode(header, equal, status); |
| 1206 return result; |
| 1207 } |
| 1208 |
| 1209 // Convert one compact trie node into a ternary subtrie |
| 1210 static TernaryNode * |
| 1211 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ) { |
| 1212 int nodeCount = (node->flagscount & kCountMask); |
| 1213 if (nodeCount == 0 || U_FAILURE(status)) { |
| 1214 // Failure, or terminal node |
| 1215 return NULL; |
| 1216 } |
| 1217 if (node->flagscount & kVerticalNode) { |
| 1218 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)
node; |
| 1219 TernaryNode *head = NULL; |
| 1220 TernaryNode *previous = NULL; |
| 1221 TernaryNode *latest = NULL; |
| 1222 for (int i = 0; i < nodeCount; ++i) { |
| 1223 latest = new TernaryNode(vnode->chars[i]); |
| 1224 if (latest == NULL) { |
| 1225 status = U_MEMORY_ALLOCATION_ERROR; |
| 1226 break; |
| 1227 } |
| 1228 if (head == NULL) { |
| 1229 head = latest; |
| 1230 } |
| 1231 if (previous != NULL) { |
| 1232 previous->equal = latest; |
| 1233 } |
| 1234 previous = latest; |
| 1235 } |
| 1236 if (latest != NULL) { |
| 1237 const CompactTrieNode *equal = getCompactNode(header, vnode->equal); |
| 1238 if (equal->flagscount & kParentEndsWord) { |
| 1239 latest->flags |= kEndsWord; |
| 1240 } |
| 1241 latest->equal = unpackOneNode(header, equal, status); |
| 1242 } |
| 1243 return head; |
| 1244 } |
| 1245 else { |
| 1246 // Horizontal node |
| 1247 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod
e *)node; |
| 1248 return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1,
status); |
| 1249 } |
| 1250 } |
| 1251 |
| 1252 MutableTrieDictionary * |
| 1253 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { |
| 1254 MutableTrieDictionary *result = new MutableTrieDictionary( status ); |
| 1255 if (result == NULL) { |
| 1256 status = U_MEMORY_ALLOCATION_ERROR; |
| 1257 return NULL; |
| 1258 } |
| 1259 TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root),
status); |
| 1260 if (U_FAILURE(status)) { |
| 1261 delete root; // Clean up |
| 1262 delete result; |
| 1263 return NULL; |
| 1264 } |
| 1265 result->fTrie = root; |
| 1266 return result; |
| 1267 } |
| 1268 |
| 1269 U_NAMESPACE_END |
| 1270 |
| 1271 U_CAPI int32_t U_EXPORT2 |
| 1272 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *
outData, |
| 1273 UErrorCode *status) { |
| 1274 |
| 1275 if (status == NULL || U_FAILURE(*status)) { |
| 1276 return 0; |
| 1277 } |
| 1278 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
| 1279 *status=U_ILLEGAL_ARGUMENT_ERROR; |
| 1280 return 0; |
| 1281 } |
| 1282 |
| 1283 // |
| 1284 // Check that the data header is for for dictionary data. |
| 1285 // (Header contents are defined in genxxx.cpp) |
| 1286 // |
| 1287 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
| 1288 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
| 1289 pInfo->dataFormat[1]==0x72 && |
| 1290 pInfo->dataFormat[2]==0x44 && |
| 1291 pInfo->dataFormat[3]==0x63 && |
| 1292 pInfo->formatVersion[0]==1 )) { |
| 1293 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (
format version %02x) is not recognized\n", |
| 1294 pInfo->dataFormat[0], pInfo->dataFormat[1], |
| 1295 pInfo->dataFormat[2], pInfo->dataFormat[3], |
| 1296 pInfo->formatVersion[0]); |
| 1297 *status=U_UNSUPPORTED_ERROR; |
| 1298 return 0; |
| 1299 } |
| 1300 |
| 1301 // |
| 1302 // Swap the data header. (This is the generic ICU Data Header, not the |
| 1303 // CompactTrieHeader). This swap also conveniently
gets us |
| 1304 // the size of the ICU d.h., which lets us locate th
e start |
| 1305 // of the RBBI specific data. |
| 1306 // |
| 1307 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status)
; |
| 1308 |
| 1309 // |
| 1310 // Get the CompactTrieHeader, and check that it appears to be OK. |
| 1311 // |
| 1312 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
| 1313 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
| 1314 if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 |
| 1315 || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) |
| 1316 { |
| 1317 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n")
; |
| 1318 *status=U_UNSUPPORTED_ERROR; |
| 1319 return 0; |
| 1320 } |
| 1321 |
| 1322 // |
| 1323 // Prefight operation? Just return the size |
| 1324 // |
| 1325 uint32_t totalSize = ds->readUInt32(header->size); |
| 1326 int32_t sizeWithUData = (int32_t)totalSize + headerSize; |
| 1327 if (length < 0) { |
| 1328 return sizeWithUData; |
| 1329 } |
| 1330 |
| 1331 // |
| 1332 // Check that length passed in is consistent with length from RBBI data head
er. |
| 1333 // |
| 1334 if (length < sizeWithUData) { |
| 1335 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", |
| 1336 totalSize); |
| 1337 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1338 return 0; |
| 1339 } |
| 1340 |
| 1341 // |
| 1342 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec
ause |
| 1343 // we need to reference the header to locate the data, and a
n |
| 1344 // inplace swap of the header leaves it unusable. |
| 1345 // |
| 1346 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
| 1347 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; |
| 1348 |
| 1349 #if 0 |
| 1350 // |
| 1351 // If not swapping in place, zero out the output buffer before starting. |
| 1352 // |
| 1353 if (inBytes != outBytes) { |
| 1354 uprv_memset(outBytes, 0, totalSize); |
| 1355 } |
| 1356 |
| 1357 // We need to loop through all the nodes in the offset table, and swap each
one. |
| 1358 uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
| 1359 // Skip node 0, which should always be 0. |
| 1360 for (int i = 1; i < nodeCount; ++i) { |
| 1361 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); |
| 1362 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node
Off); |
| 1363 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); |
| 1364 uint16_t flagscount = ds->readUInt16(inNode->flagscount); |
| 1365 uint16_t itemCount = flagscount & kCountMask; |
| 1366 ds->writeUInt16(&outNode->flagscount, flagscount); |
| 1367 if (itemCount > 0) { |
| 1368 if (flagscount & kVerticalNode) { |
| 1369 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical
Node,chars), |
| 1370 itemCount*sizeof(uint16_t), |
| 1371 outBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), status); |
| 1372 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact
TrieVerticalNode,equal); |
| 1373 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod
e,equal)); |
| 1374 } |
| 1375 else { |
| 1376 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor
izontalNode *)inNode; |
| 1377 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode
*)outNode; |
| 1378 for (int j = 0; j < itemCount; ++j) { |
| 1379 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); |
| 1380 ds->writeUInt16(&outHNode->entries[j].ch, word); |
| 1381 word = ds->readUInt16(inHNode->entries[j].equal); |
| 1382 ds->writeUInt16(&outHNode->entries[j].equal, word); |
| 1383 } |
| 1384 } |
| 1385 } |
| 1386 } |
| 1387 #endif |
| 1388 |
| 1389 // All the data in all the nodes consist of 16 bit items. Swap them all at o
nce. |
| 1390 uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
| 1391 uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount
*sizeof(uint32_t)); |
| 1392 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff,
status); |
| 1393 |
| 1394 // Swap the header |
| 1395 ds->writeUInt32(&outputHeader->size, totalSize); |
| 1396 uint32_t magic = ds->readUInt32(header->magic); |
| 1397 ds->writeUInt32(&outputHeader->magic, magic); |
| 1398 ds->writeUInt16(&outputHeader->nodeCount, nodeCount); |
| 1399 uint16_t root = ds->readUInt16(header->root); |
| 1400 ds->writeUInt16(&outputHeader->root, root); |
| 1401 ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), |
| 1402 sizeof(uint32_t)*(int32_t)nodeCount, |
| 1403 outBytes+offsetof(CompactTrieHeader,offsets), status); |
| 1404 |
| 1405 return sizeWithUData; |
| 1406 } |
| 1407 |
| 1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |