| OLD | NEW |
| 1 /** | 1 /** |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 2006-2008, International Business Machines Corporation * | 3 * Copyright (C) 2006-2008, International Business Machines Corporation * |
| 4 * and others. All Rights Reserved. * | 4 * and others. All Rights Reserved. * |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
| 9 | 9 |
| 10 #if !UCONFIG_NO_BREAK_ITERATION | 10 #if !UCONFIG_NO_BREAK_ITERATION |
| 11 | 11 |
| 12 #include "triedict.h" | 12 #include "triedict.h" |
| 13 #include "unicode/chariter.h" | 13 #include "unicode/chariter.h" |
| 14 #include "unicode/uchriter.h" | 14 #include "unicode/uchriter.h" |
| 15 #include "unicode/strenum.h" | 15 #include "unicode/strenum.h" |
| 16 #include "unicode/uenum.h" | 16 #include "unicode/uenum.h" |
| 17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
| 18 #include "cmemory.h" | 18 #include "cmemory.h" |
| 19 #include "udataswp.h" | 19 #include "udataswp.h" |
| 20 #include "uvector.h" | 20 #include "uvector.h" |
| 21 #include "uvectr32.h" | 21 #include "uvectr32.h" |
| 22 #include "uarrsort.h" | 22 #include "uarrsort.h" |
| 23 #include "hash.h" |
| 23 | 24 |
| 24 //#define DEBUG_TRIE_DICT 1 | 25 //#define DEBUG_TRIE_DICT 1 |
| 25 | 26 |
| 26 #ifdef DEBUG_TRIE_DICT | 27 #ifdef DEBUG_TRIE_DICT |
| 27 #include <sys/times.h> | 28 #include <sys/times.h> |
| 28 #include <limits.h> | 29 #include <limits.h> |
| 29 #include <stdio.h> | 30 #include <stdio.h> |
| 31 #include <time.h> |
| 32 #ifndef CLK_TCK |
| 33 #define CLK_TCK CLOCKS_PER_SEC |
| 34 #endif |
| 35 |
| 30 #endif | 36 #endif |
| 31 | 37 |
| 32 U_NAMESPACE_BEGIN | 38 U_NAMESPACE_BEGIN |
| 33 | 39 |
| 34 /******************************************************************* | 40 /******************************************************************* |
| 35 * TrieWordDictionary | 41 * TrieWordDictionary |
| 36 */ | 42 */ |
| 37 | 43 |
| 38 TrieWordDictionary::TrieWordDictionary() { | 44 TrieWordDictionary::TrieWordDictionary() { |
| 39 } | 45 } |
| 40 | 46 |
| 41 TrieWordDictionary::~TrieWordDictionary() { | 47 TrieWordDictionary::~TrieWordDictionary() { |
| 42 } | 48 } |
| 43 | 49 |
| 44 /******************************************************************* | 50 /******************************************************************* |
| 45 * MutableTrieDictionary | 51 * MutableTrieDictionary |
| 46 */ | 52 */ |
| 47 | 53 |
| 54 //#define MAX_VALUE 65535 |
| 55 |
| 56 // forward declaration |
| 57 inline uint16_t scaleLogProbabilities(double logprob); |
| 58 |
| 48 // Node structure for the ternary, uncompressed trie | 59 // Node structure for the ternary, uncompressed trie |
| 49 struct TernaryNode : public UMemory { | 60 struct TernaryNode : public UMemory { |
| 50 UChar ch; // UTF-16 code unit | 61 UChar ch; // UTF-16 code unit |
| 51 uint16_t flags; // Flag word | 62 uint16_t flags; // Flag word |
| 52 TernaryNode *low; // Less-than link | 63 TernaryNode *low; // Less-than link |
| 53 TernaryNode *equal; // Equal link | 64 TernaryNode *equal; // Equal link |
| 54 TernaryNode *high; // Greater-than link | 65 TernaryNode *high; // Greater-than link |
| 55 | 66 |
| 56 TernaryNode(UChar uc); | 67 TernaryNode(UChar uc); |
| 57 ~TernaryNode(); | 68 ~TernaryNode(); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 70 high = NULL; | 81 high = NULL; |
| 71 } | 82 } |
| 72 | 83 |
| 73 // Not inline since it's recursive | 84 // Not inline since it's recursive |
| 74 TernaryNode::~TernaryNode() { | 85 TernaryNode::~TernaryNode() { |
| 75 delete low; | 86 delete low; |
| 76 delete equal; | 87 delete equal; |
| 77 delete high; | 88 delete high; |
| 78 } | 89 } |
| 79 | 90 |
| 80 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status )
{ | 91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, |
| 92 UBool containsValue /* = FALSE */
) { |
| 81 // Start the trie off with something. Having the root node already present | 93 // Start the trie off with something. Having the root node already present |
| 82 // cuts a special case out of the search/insertion functions. | 94 // cuts a special case out of the search/insertion functions. |
| 83 // Making it a median character cuts the worse case for searches from | 95 // Making it a median character cuts the worse case for searches from |
| 84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something | 96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something |
| 85 // that starts a word that is midway in the list. | 97 // that starts a word that is midway in the list. |
| 86 fTrie = new TernaryNode(median); | 98 fTrie = new TernaryNode(median); |
| 87 if (fTrie == NULL) { | 99 if (fTrie == NULL) { |
| 88 status = U_MEMORY_ALLOCATION_ERROR; | 100 status = U_MEMORY_ALLOCATION_ERROR; |
| 89 } | 101 } |
| 90 fIter = utext_openUChars(NULL, NULL, 0, &status); | 102 fIter = utext_openUChars(NULL, NULL, 0, &status); |
| 91 if (U_SUCCESS(status) && fIter == NULL) { | 103 if (U_SUCCESS(status) && fIter == NULL) { |
| 92 status = U_MEMORY_ALLOCATION_ERROR; | 104 status = U_MEMORY_ALLOCATION_ERROR; |
| 93 } | 105 } |
| 106 |
| 107 fValued = containsValue; |
| 94 } | 108 } |
| 95 | 109 |
| 96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { | 110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, |
| 111 UBool containsValue /* = false */
) { |
| 97 fTrie = NULL; | 112 fTrie = NULL; |
| 98 fIter = utext_openUChars(NULL, NULL, 0, &status); | 113 fIter = utext_openUChars(NULL, NULL, 0, &status); |
| 99 if (U_SUCCESS(status) && fIter == NULL) { | 114 if (U_SUCCESS(status) && fIter == NULL) { |
| 100 status = U_MEMORY_ALLOCATION_ERROR; | 115 status = U_MEMORY_ALLOCATION_ERROR; |
| 101 } | 116 } |
| 117 |
| 118 fValued = containsValue; |
| 102 } | 119 } |
| 103 | 120 |
| 104 MutableTrieDictionary::~MutableTrieDictionary() { | 121 MutableTrieDictionary::~MutableTrieDictionary() { |
| 105 delete fTrie; | 122 delete fTrie; |
| 106 utext_close(fIter); | 123 utext_close(fIter); |
| 107 } | 124 } |
| 108 | 125 |
| 109 int32_t | 126 int32_t |
| 110 MutableTrieDictionary::search( UText *text, | 127 MutableTrieDictionary::search( UText *text, |
| 111 int32_t maxLength, | 128 int32_t maxLength, |
| 112 int32_t *lengths, | 129 int32_t *lengths, |
| 113 int &count, | 130 int &count, |
| 114 int limit, | 131 int limit, |
| 115 TernaryNode *&parent, | 132 TernaryNode *&parent, |
| 116 UBool &pMatched ) const { | 133 UBool &pMatched, |
| 134 uint16_t *values /*=NULL*/) const { |
| 117 // TODO: current implementation works in UTF-16 space | 135 // TODO: current implementation works in UTF-16 space |
| 118 const TernaryNode *up = NULL; | 136 const TernaryNode *up = NULL; |
| 119 const TernaryNode *p = fTrie; | 137 const TernaryNode *p = fTrie; |
| 120 int mycount = 0; | 138 int mycount = 0; |
| 121 pMatched = TRUE; | 139 pMatched = TRUE; |
| 122 int i; | 140 int i; |
| 123 | 141 |
| 142 if (!fValued) { |
| 143 values = NULL; |
| 144 } |
| 145 |
| 124 UChar uc = utext_current32(text); | 146 UChar uc = utext_current32(text); |
| 125 for (i = 0; i < maxLength && p != NULL; ++i) { | 147 for (i = 0; i < maxLength && p != NULL; ++i) { |
| 126 while (p != NULL) { | 148 while (p != NULL) { |
| 127 if (uc < p->ch) { | 149 if (uc < p->ch) { |
| 128 up = p; | 150 up = p; |
| 129 p = p->low; | 151 p = p->low; |
| 130 } | 152 } |
| 131 else if (uc == p->ch) { | 153 else if (uc == p->ch) { |
| 132 break; | 154 break; |
| 133 } | 155 } |
| 134 else { | 156 else { |
| 135 up = p; | 157 up = p; |
| 136 p = p->high; | 158 p = p->high; |
| 137 } | 159 } |
| 138 } | 160 } |
| 139 if (p == NULL) { | 161 if (p == NULL) { |
| 140 pMatched = FALSE; | 162 pMatched = FALSE; |
| 141 break; | 163 break; |
| 142 } | 164 } |
| 143 // Must be equal to get here | 165 // Must be equal to get here |
| 144 if (limit > 0 && (p->flags & kEndsWord)) { | 166 if (limit > 0 && (p->flags > 0)) { |
| 167 //is there a more efficient way to add values? ie. remove if stmt |
| 168 if(values != NULL) { |
| 169 values[mycount] = p->flags; |
| 170 } |
| 145 lengths[mycount++] = i+1; | 171 lengths[mycount++] = i+1; |
| 146 --limit; | 172 --limit; |
| 147 } | 173 } |
| 148 up = p; | 174 up = p; |
| 149 p = p->equal; | 175 p = p->equal; |
| 150 uc = utext_next32(text); | 176 uc = utext_next32(text); |
| 151 uc = utext_current32(text); | 177 uc = utext_current32(text); |
| 152 } | 178 } |
| 153 | 179 |
| 154 // Note that there is no way to reach here with up == 0 unless | 180 // Note that there is no way to reach here with up == 0 unless |
| 155 // maxLength is 0 coming in. | 181 // maxLength is 0 coming in. |
| 156 parent = (TernaryNode *)up; | 182 parent = (TernaryNode *)up; |
| 157 count = mycount; | 183 count = mycount; |
| 158 return i; | 184 return i; |
| 159 } | 185 } |
| 160 | 186 |
| 161 void | 187 void |
| 162 MutableTrieDictionary::addWord( const UChar *word, | 188 MutableTrieDictionary::addWord( const UChar *word, |
| 163 int32_t length, | 189 int32_t length, |
| 164 UErrorCode &status ) { | 190 UErrorCode &status, |
| 165 #if 0 | 191 uint16_t value /* = 0 */ ) { |
| 166 if (length <= 0) { | 192 // dictionary cannot store zero values, would interfere with flags |
| 193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { |
| 167 status = U_ILLEGAL_ARGUMENT_ERROR; | 194 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 168 return; | 195 return; |
| 169 } | 196 } |
| 170 #endif | 197 |
| 171 TernaryNode *parent; | 198 TernaryNode *parent; |
| 172 UBool pMatched; | 199 UBool pMatched; |
| 173 int count; | 200 int count; |
| 174 fIter = utext_openUChars(fIter, word, length, &status); | 201 fIter = utext_openUChars(fIter, word, length, &status); |
| 175 | 202 |
| 176 int matched; | 203 int matched; |
| 177 matched = search(fIter, length, NULL, count, 0, parent, pMatched); | 204 matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
| 178 | 205 |
| 179 while (matched++ < length) { | 206 while (matched++ < length) { |
| 180 UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? | 207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support? |
| 181 U_ASSERT(uc != U_SENTINEL); | 208 U_ASSERT(uc != U_SENTINEL); |
| 182 TernaryNode *newNode = new TernaryNode(uc); | 209 TernaryNode *newNode = new TernaryNode(uc); |
| 183 if (newNode == NULL) { | 210 if (newNode == NULL) { |
| 184 status = U_MEMORY_ALLOCATION_ERROR; | 211 status = U_MEMORY_ALLOCATION_ERROR; |
| 185 return; | 212 return; |
| 186 } | 213 } |
| 187 if (pMatched) { | 214 if (pMatched) { |
| 188 parent->equal = newNode; | 215 parent->equal = newNode; |
| 189 } | 216 } |
| 190 else { | 217 else { |
| 191 pMatched = TRUE; | 218 pMatched = TRUE; |
| 192 if (uc < parent->ch) { | 219 if (uc < parent->ch) { |
| 193 parent->low = newNode; | 220 parent->low = newNode; |
| 194 } | 221 } |
| 195 else { | 222 else { |
| 196 parent->high = newNode; | 223 parent->high = newNode; |
| 197 } | 224 } |
| 198 } | 225 } |
| 199 parent = newNode; | 226 parent = newNode; |
| 200 } | 227 } |
| 201 | 228 |
| 202 parent->flags |= kEndsWord; | 229 if(fValued && value > 0){ |
| 203 } | 230 parent->flags = value; |
| 204 | 231 } else { |
| 205 #if 0 | 232 parent->flags |= kEndsWord; |
| 206 void | |
| 207 MutableTrieDictionary::addWords( UEnumeration *words, | |
| 208 UErrorCode &status ) { | |
| 209 int32_t length; | |
| 210 const UChar *word; | |
| 211 while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { | |
| 212 addWord(word, length, status); | |
| 213 } | 233 } |
| 214 } | 234 } |
| 215 #endif | |
| 216 | 235 |
| 217 int32_t | 236 int32_t |
| 218 MutableTrieDictionary::matches( UText *text, | 237 MutableTrieDictionary::matches( UText *text, |
| 219 int32_t maxLength, | 238 int32_t maxLength, |
| 220 int32_t *lengths, | 239 int32_t *lengths, |
| 221 int &count, | 240 int &count, |
| 222 int limit ) const { | 241 int limit, |
| 242 uint16_t *values /*=NULL*/) const { |
| 223 TernaryNode *parent; | 243 TernaryNode *parent; |
| 224 UBool pMatched; | 244 UBool pMatched; |
| 225 return search(text, maxLength, lengths, count, limit, parent, pMatched); | 245 return search(text, maxLength, lengths, count, limit, parent, pMatched, valu
es); |
| 226 } | 246 } |
| 227 | 247 |
| 228 // Implementation of iteration for MutableTrieDictionary | 248 // Implementation of iteration for MutableTrieDictionary |
| 229 class MutableTrieEnumeration : public StringEnumeration { | 249 class MutableTrieEnumeration : public StringEnumeration { |
| 230 private: | 250 private: |
| 231 UStack fNodeStack; // Stack of nodes to process | 251 UStack fNodeStack; // Stack of nodes to process |
| 232 UVector32 fBranchStack; // Stack of which branch we are working on | 252 UVector32 fBranchStack; // Stack of which branch we are working on |
| 233 TernaryNode *fRoot; // Root node | 253 TernaryNode *fRoot; // Root node |
| 234 enum StackBranch { | 254 enum StackBranch { |
| 235 kLessThan, | 255 kLessThan, |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 270 | 290 |
| 271 switch (where) { | 291 switch (where) { |
| 272 case kLessThan: | 292 case kLessThan: |
| 273 if (node->low != NULL) { | 293 if (node->low != NULL) { |
| 274 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); | 294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); |
| 275 node = (TernaryNode *) fNodeStack.push(node->low, status); | 295 node = (TernaryNode *) fNodeStack.push(node->low, status); |
| 276 where = (StackBranch) fBranchStack.push(kLessThan, status); | 296 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 277 break; | 297 break; |
| 278 } | 298 } |
| 279 case kEqual: | 299 case kEqual: |
| 280 emit = (node->flags & kEndsWord) != 0; | 300 emit = node->flags > 0; |
| 281 equal = (node->equal != NULL); | 301 equal = (node->equal != NULL); |
| 282 // If this node should be part of the next emitted string, appen
d | 302 // If this node should be part of the next emitted string, appen
d |
| 283 // the UChar to the string, and make sure we pop it when we come | 303 // the UChar to the string, and make sure we pop it when we come |
| 284 // back to this node. The character should only be in the string | 304 // back to this node. The character should only be in the string |
| 285 // for as long as we're traversing the equal subtree of this nod
e | 305 // for as long as we're traversing the equal subtree of this nod
e |
| 286 if (equal || emit) { | 306 if (equal || emit) { |
| 287 unistr.append(node->ch); | 307 unistr.append(node->ch); |
| 288 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-
1); | 308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-
1); |
| 289 } | 309 } |
| 290 if (equal) { | 310 if (equal) { |
| 291 node = (TernaryNode *) fNodeStack.push(node->equal, status); | 311 node = (TernaryNode *) fNodeStack.push(node->equal, status); |
| 292 where = (StackBranch) fBranchStack.push(kLessThan, status); | 312 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 293 } | 313 } |
| 294 if (emit) { | 314 if (emit) { |
| 295 return &unistr; | 315 return &unistr; |
| 296 } | 316 } |
| 297 if (equal) { | 317 if (equal) { |
| 298 break; | 318 break; |
| 299 } | 319 } |
| 300 case kGreaterThan: | 320 case kGreaterThan: |
| 301 // If this node's character is in the string, remove it. | 321 // If this node's character is in the string, remove it. |
| 302 if (node->equal != NULL || (node->flags & kEndsWord)) { | 322 if (node->equal != NULL || node->flags > 0) { |
| 303 unistr.truncate(unistr.length()-1); | 323 unistr.truncate(unistr.length()-1); |
| 304 } | 324 } |
| 305 if (node->high != NULL) { | 325 if (node->high != NULL) { |
| 306 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); | 326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); |
| 307 node = (TernaryNode *) fNodeStack.push(node->high, status); | 327 node = (TernaryNode *) fNodeStack.push(node->high, status); |
| 308 where = (StackBranch) fBranchStack.push(kLessThan, status); | 328 where = (StackBranch) fBranchStack.push(kLessThan, status); |
| 309 break; | 329 break; |
| 310 } | 330 } |
| 311 case kDone: | 331 case kDone: |
| 312 fNodeStack.pop(); | 332 fNodeStack.pop(); |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 347 if (U_FAILURE(status)) { | 367 if (U_FAILURE(status)) { |
| 348 return NULL; | 368 return NULL; |
| 349 } | 369 } |
| 350 return new MutableTrieEnumeration(fTrie, status); | 370 return new MutableTrieEnumeration(fTrie, status); |
| 351 } | 371 } |
| 352 | 372 |
| 353 /******************************************************************* | 373 /******************************************************************* |
| 354 * CompactTrieDictionary | 374 * CompactTrieDictionary |
| 355 */ | 375 */ |
| 356 | 376 |
| 377 //TODO further optimization: |
| 378 // minimise size of trie with logprobs by storing values |
| 379 // for terminal nodes directly in offsets[] |
| 380 // --> calculating from next offset *might* be simpler, but would have to add |
| 381 // one last offset for logprob of last node |
| 382 // --> if calculate from current offset, need to factor in possible overflow |
| 383 // as well. |
| 384 // idea: store in offset, set first bit to indicate logprob storage-->won't |
| 385 // have to access additional node |
| 386 |
| 387 // {'Dic', 1}, version 1: uses old header, no values |
| 388 #define COMPACT_TRIE_MAGIC_1 0x44696301 |
| 389 // version 2: uses new header (more than 2^16 nodes), no values |
| 390 #define COMPACT_TRIE_MAGIC_2 0x44696302 |
| 391 // version 3: uses new header, includes values |
| 392 #define COMPACT_TRIE_MAGIC_3 0x44696303 |
| 393 |
| 357 struct CompactTrieHeader { | 394 struct CompactTrieHeader { |
| 358 uint32_t size; // Size of the data in bytes | 395 uint32_t size; // Size of the data in bytes |
| 359 uint32_t magic; // Magic number (including version) | 396 uint32_t magic; // Magic number (including version) |
| 397 uint32_t nodeCount; // Number of entries in offsets[] |
| 398 uint32_t root; // Node number of the root node |
| 399 uint32_t offsets[1]; // Offsets to nodes from start of data |
| 400 }; |
| 401 |
| 402 // old version of CompactTrieHeader kept for backwards compatibility |
| 403 struct CompactTrieHeaderV1 { |
| 404 uint32_t size; // Size of the data in bytes |
| 405 uint32_t magic; // Magic number (including version) |
| 360 uint16_t nodeCount; // Number of entries in offsets[] | 406 uint16_t nodeCount; // Number of entries in offsets[] |
| 361 uint16_t root; // Node number of the root node | 407 uint16_t root; // Node number of the root node |
| 362 uint32_t offsets[1]; // Offsets to nodes from start of data | 408 uint32_t offsets[1]; // Offsets to nodes from start of data |
| 409 }; |
| 410 |
| 411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 |
| 412 struct CompactTrieInfo { |
| 413 uint32_t size; // Size of the data in bytes |
| 414 uint32_t magic; // Magic number (including version) |
| 415 uint32_t nodeCount; // Number of entries in offsets[] |
| 416 uint32_t root; // Node number of the root node |
| 417 uint32_t *offsets; // Offsets to nodes from start of data |
| 418 uint8_t *address; // pointer to header bytes in memory |
| 419 |
| 420 CompactTrieInfo(const void *data, UErrorCode &status){ |
| 421 CompactTrieHeader *header = (CompactTrieHeader *) data; |
| 422 if (header->magic != COMPACT_TRIE_MAGIC_1 && |
| 423 header->magic != COMPACT_TRIE_MAGIC_2 && |
| 424 header->magic != COMPACT_TRIE_MAGIC_3) { |
| 425 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 426 } else { |
| 427 size = header->size; |
| 428 magic = header->magic; |
| 429 |
| 430 if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| 431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; |
| 432 nodeCount = headerV1->nodeCount; |
| 433 root = headerV1->root; |
| 434 offsets = &(headerV1->offsets[0]); |
| 435 address = (uint8_t *)headerV1; |
| 436 } else { |
| 437 nodeCount = header->nodeCount; |
| 438 root = header->root; |
| 439 offsets = &(header->offsets[0]); |
| 440 address = (uint8_t *)header; |
| 441 } |
| 442 } |
| 443 } |
| 444 |
| 445 ~CompactTrieInfo(){} |
| 363 }; | 446 }; |
| 364 | 447 |
| 365 // Note that to avoid platform-specific alignment issues, all members of the nod
e | 448 // Note that to avoid platform-specific alignment issues, all members of the nod
e |
| 366 // structures should be the same size, or should contain explicit padding to | 449 // structures should be the same size, or should contain explicit padding to |
| 367 // natural alignment boundaries. | 450 // natural alignment boundaries. |
| 368 | 451 |
| 369 // We can't use a bitfield for the flags+count field, because the layout of thos
e | 452 // We can't use a bitfield for the flags+count field, because the layout of thos
e |
| 370 // is not portable. 12 bits of count allows for up to 4096 entries in a node. | 453 // is not portable. 12 bits of count allows for up to 4096 entries in a node. |
| 371 struct CompactTrieNode { | 454 struct CompactTrieNode { |
| 372 uint16_t flagscount; // Count of sub-entries, plus flags | 455 uint16_t flagscount; // Count of sub-entries, plus flags |
| 373 }; | 456 }; |
| 374 | 457 |
| 375 enum CompactTrieNodeFlags { | 458 enum CompactTrieNodeFlags { |
| 376 kVerticalNode = 0x1000, // This is a vertical node | 459 kVerticalNode = 0x1000, // This is a vertical node |
| 377 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word | 460 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word |
| 378 kReservedFlag1 = 0x4000, | 461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kRe
servedFlag1 |
| 379 kReservedFlag2 = 0x8000, | 462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedF
lag2 |
| 380 kCountMask = 0x0FFF, // The count portion of flagscount | 463 kCountMask = 0x0FFF, // The count portion of flagscount |
| 381 kFlagMask = 0xF000 // The flags portion of flagscount | 464 kFlagMask = 0xF000, // The flags portion of flagscount |
| 465 kRootCountMask = 0x7FFF // The count portion of flagscount in the ro
ot node |
| 466 |
| 467 //offset flags: |
| 468 //kOffsetContainsValue = 0x80000000 // Offset contains value for paren
t node |
| 382 }; | 469 }; |
| 383 | 470 |
| 384 // The two node types are distinguished by the kVerticalNode flag. | 471 // The two node types are distinguished by the kVerticalNode flag. |
| 385 | 472 |
| 386 struct CompactTrieHorizontalEntry { | 473 struct CompactTrieHorizontalEntry { |
| 387 uint16_t ch; // UChar | 474 uint16_t ch; // UChar |
| 388 uint16_t equal; // Equal link node index | 475 uint16_t equal; // Equal link node index |
| 389 }; | 476 }; |
| 390 | 477 |
| 391 // We don't use inheritance here because C++ does not guarantee that the | 478 // We don't use inheritance here because C++ does not guarantee that the |
| 392 // base class comes first in memory!! | 479 // base class comes first in memory!! |
| 393 | 480 |
| 394 struct CompactTrieHorizontalNode { | 481 struct CompactTrieHorizontalNode { |
| 395 uint16_t flagscount; // Count of sub-entries, plus flags | 482 uint16_t flagscount; // Count of sub-entries, plus flags |
| 396 CompactTrieHorizontalEntry entries[1]; | 483 CompactTrieHorizontalEntry entries[1]; |
| 397 }; | 484 }; |
| 398 | 485 |
| 399 struct CompactTrieVerticalNode { | 486 struct CompactTrieVerticalNode { |
| 400 uint16_t flagscount; // Count of sub-entries, plus flags | 487 uint16_t flagscount; // Count of sub-entries, plus flags |
| 401 uint16_t equal; // Equal link node index | 488 uint16_t equal; // Equal link node index |
| 402 uint16_t chars[1]; // Code units | 489 uint16_t chars[1]; // Code units |
| 403 }; | 490 }; |
| 404 | 491 |
| 405 // {'Dic', 1}, version 1 | |
| 406 #define COMPACT_TRIE_MAGIC_1 0x44696301 | |
| 407 | |
| 408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, | 492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
| 409 UErrorCode &status ) | 493 UErrorCode &status ) |
| 410 : fUData(dataObj) | 494 : fUData(dataObj) |
| 411 { | 495 { |
| 412 fData = (const CompactTrieHeader *) udata_getMemory(dataObj); | 496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); |
| 413 fOwnData = FALSE; | 498 fOwnData = FALSE; |
| 414 if (fData->magic != COMPACT_TRIE_MAGIC_1) { | 499 } |
| 415 status = U_ILLEGAL_ARGUMENT_ERROR; | 500 |
| 416 fData = NULL; | |
| 417 } | |
| 418 } | |
| 419 CompactTrieDictionary::CompactTrieDictionary( const void *data, | 501 CompactTrieDictionary::CompactTrieDictionary( const void *data, |
| 420 UErrorCode &status ) | 502 UErrorCode &status ) |
| 421 : fUData(NULL) | 503 : fUData(NULL) |
| 422 { | 504 { |
| 423 fData = (const CompactTrieHeader *) data; | 505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 506 *fInfo = CompactTrieInfo(data, status); |
| 424 fOwnData = FALSE; | 507 fOwnData = FALSE; |
| 425 if (fData->magic != COMPACT_TRIE_MAGIC_1) { | |
| 426 status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 427 fData = NULL; | |
| 428 } | |
| 429 } | 508 } |
| 430 | 509 |
| 431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, | 510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, |
| 432 UErrorCode &status ) | 511 UErrorCode &status ) |
| 433 : fUData(NULL) | 512 : fUData(NULL) |
| 434 { | 513 { |
| 435 fData = compactMutableTrieDictionary(dict, status); | 514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status)
; |
| 515 if (U_SUCCESS(status)) { |
| 516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 517 *fInfo = CompactTrieInfo(header, status); |
| 518 } |
| 519 |
| 436 fOwnData = !U_FAILURE(status); | 520 fOwnData = !U_FAILURE(status); |
| 437 } | 521 } |
| 438 | 522 |
| 439 CompactTrieDictionary::~CompactTrieDictionary() { | 523 CompactTrieDictionary::~CompactTrieDictionary() { |
| 440 if (fOwnData) { | 524 if (fOwnData) { |
| 441 uprv_free((void *)fData); | 525 uprv_free((void *)(fInfo->address)); |
| 442 } | 526 } |
| 527 uprv_free((void *)fInfo); |
| 528 |
| 443 if (fUData) { | 529 if (fUData) { |
| 444 udata_close(fUData); | 530 udata_close(fUData); |
| 445 } | 531 } |
| 446 } | 532 } |
| 447 | 533 |
| 534 UBool CompactTrieDictionary::getValued() const{ |
| 535 return fInfo->magic == COMPACT_TRIE_MAGIC_3; |
| 536 } |
| 537 |
| 448 uint32_t | 538 uint32_t |
| 449 CompactTrieDictionary::dataSize() const { | 539 CompactTrieDictionary::dataSize() const { |
| 450 return fData->size; | 540 return fInfo->size; |
| 451 } | 541 } |
| 452 | 542 |
| 453 const void * | 543 const void * |
| 454 CompactTrieDictionary::data() const { | 544 CompactTrieDictionary::data() const { |
| 455 return fData; | 545 return fInfo->address; |
| 456 } | 546 } |
| 457 | 547 |
| 458 // This function finds the address of a node for us, given its node ID | 548 //This function finds the address of a node for us, given its node ID |
| 459 static inline const CompactTrieNode * | 549 static inline const CompactTrieNode * |
| 460 getCompactNode(const CompactTrieHeader *header, uint16_t node) { | 550 getCompactNode(const CompactTrieInfo *info, uint32_t node) { |
| 461 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[n
ode]); | 551 if(node < info->root-1) { |
| 552 return (const CompactTrieNode *)(&info->offsets[node]); |
| 553 } else { |
| 554 return (const CompactTrieNode *)(info->address + info->offsets[node]); |
| 555 } |
| 556 } |
| 557 |
| 558 //this version of getCompactNode is currently only used in compactMutableTrieDic
tionary() |
| 559 static inline const CompactTrieNode * |
| 560 getCompactNode(const CompactTrieHeader *header, uint32_t node) { |
| 561 if(node < header->root-1) { |
| 562 return (const CompactTrieNode *)(&header->offsets[node]); |
| 563 } else { |
| 564 return (const CompactTrieNode *)((const uint8_t *)header + header->offse
ts[node]); |
| 565 } |
| 566 } |
| 567 |
| 568 |
| 569 /** |
| 570 * Calculates the number of links in a node |
| 571 * @node The specified node |
| 572 */ |
| 573 static inline const uint16_t |
| 574 getCount(const CompactTrieNode *node){ |
| 575 return (node->flagscount & kCountMask); |
| 576 //use the code below if number of links ever exceed 4096 |
| 577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCoun
t) >> 2); |
| 578 } |
| 579 |
| 580 /** |
| 581 * calculates an equal link node ID of a horizontal node |
| 582 * @hnode The horizontal node containing the equal link |
| 583 * @param index The index into hnode->entries[] |
| 584 * @param nodeCount The length of hnode->entries[] |
| 585 */ |
| 586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ |
| 587 if(vnode->flagscount & kEqualOverflows){ |
| 588 // treat overflow bits as an extension of chars[] |
| 589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNod
e*)vnode)]; |
| 590 return vnode->equal + (((uint32_t)*overflow) << 16); |
| 591 }else{ |
| 592 return vnode->equal; |
| 593 } |
| 594 } |
| 595 |
| 596 /** |
| 597 * calculates an equal link node ID of a horizontal node |
| 598 * @hnode The horizontal node containing the equal link |
| 599 * @param index The index into hnode->entries[] |
| 600 * @param nodeCount The length of hnode->entries[] |
| 601 */ |
| 602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uin
t16_t index, uint16_t nodeCount){ |
| 603 if(hnode->flagscount & kEqualOverflows){ |
| 604 //set overflow to point to the uint16_t containing the overflow bits |
| 605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; |
| 606 overflow += index/4; |
| 607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; |
| 608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); |
| 609 } else { |
| 610 return hnode->entries[index].equal; |
| 611 } |
| 612 } |
| 613 |
| 614 /** |
| 615 * Returns the value stored in the specified node which is associated with its |
| 616 * parent node. |
| 617 * TODO: how to tell that value is stored in node or in offset? check whether |
| 618 * node ID < fInfo->root! |
| 619 */ |
| 620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ |
| 621 uint16_t count = getCount((CompactTrieNode *)hnode); |
| 622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes |
| 623 |
| 624 if(hnode->flagscount & kEqualOverflows) |
| 625 overflowSize = (count + 3) / 4 * sizeof(uint16_t); |
| 626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); |
| 627 } |
| 628 |
| 629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ |
| 630 // calculate size of total node ID overflow storage in bytes |
| 631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16
_t) : 0; |
| 632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)v
node)] + overflowSize)); |
| 633 } |
| 634 |
| 635 static inline uint16_t getValue(const CompactTrieNode *node){ |
| 636 if(node->flagscount & kVerticalNode) |
| 637 return getValue((const CompactTrieVerticalNode *)node); |
| 638 else |
| 639 return getValue((const CompactTrieHorizontalNode *)node); |
| 640 } |
| 641 |
| 642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary sea
rch |
| 643 inline int16_t |
| 644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, |
| 645 UChar uc, uint16_t nodeCount){ |
| 646 int low = 0; |
| 647 int high = nodeCount-1; |
| 648 int middle; |
| 649 while (high >= low) { |
| 650 middle = (high+low)/2; |
| 651 if (uc == entries[middle].ch) { |
| 652 return middle; |
| 653 } |
| 654 else if (uc < entries[middle].ch) { |
| 655 high = middle-1; |
| 656 } |
| 657 else { |
| 658 low = middle+1; |
| 659 } |
| 660 } |
| 661 |
| 662 return -1; |
| 462 } | 663 } |
| 463 | 664 |
| 464 int32_t | 665 int32_t |
| 465 CompactTrieDictionary::matches( UText *text, | 666 CompactTrieDictionary::matches( UText *text, |
| 466 int32_t maxLength, | 667 int32_t maxLength, |
| 467 int32_t *lengths, | 668 int32_t *lengths, |
| 468 int &count, | 669 int &count, |
| 469 int limit ) const { | 670 int limit, |
| 671 uint16_t *values /*= NULL*/) const { |
| 672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2) |
| 673 values = NULL; |
| 674 |
| 470 // TODO: current implementation works in UTF-16 space | 675 // TODO: current implementation works in UTF-16 space |
| 471 const CompactTrieNode *node = getCompactNode(fData, fData->root); | 676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); |
| 472 int mycount = 0; | 677 int mycount = 0; |
| 473 | 678 |
| 474 UChar uc = utext_current32(text); | 679 UChar uc = utext_current32(text); |
| 475 int i = 0; | 680 int i = 0; |
| 476 | 681 |
| 682 // handle root node with only kEqualOverflows flag: assume horizontal node w
ithout parent |
| 683 if(node != NULL){ |
| 684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode
*) node; |
| 685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount
& kRootCountMask); |
| 686 if(index > -1){ |
| 687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagsc
ount & kRootCountMask)); |
| 688 utext_next32(text); |
| 689 uc = utext_current32(text); |
| 690 ++i; |
| 691 }else{ |
| 692 node = NULL; |
| 693 } |
| 694 } |
| 695 |
| 477 while (node != NULL) { | 696 while (node != NULL) { |
| 478 // Check if the node we just exited ends a word | 697 // Check if the node we just exited ends a word |
| 479 if (limit > 0 && (node->flagscount & kParentEndsWord)) { | 698 if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
| 699 if(values != NULL){ |
| 700 values[mycount] = getValue(node); |
| 701 } |
| 480 lengths[mycount++] = i; | 702 lengths[mycount++] = i; |
| 481 --limit; | 703 --limit; |
| 482 } | 704 } |
| 483 // Check that we haven't exceeded the maximum number of input characters
. | 705 // Check that we haven't exceeded the maximum number of input characters
. |
| 484 // We have to do that here rather than in the while condition so that | 706 // We have to do that here rather than in the while condition so that |
| 485 // we can check for ending a word, above. | 707 // we can check for ending a word, above. |
| 486 if (i >= maxLength) { | 708 if (i >= maxLength) { |
| 487 break; | 709 break; |
| 488 } | 710 } |
| 489 | 711 |
| 490 int nodeCount = (node->flagscount & kCountMask); | 712 int nodeCount = getCount(node); |
| 491 if (nodeCount == 0) { | 713 if (nodeCount == 0) { |
| 492 // Special terminal node; return now | 714 // Special terminal node; return now |
| 493 break; | 715 break; |
| 494 } | 716 } |
| 495 if (node->flagscount & kVerticalNode) { | 717 if (node->flagscount & kVerticalNode) { |
| 496 // Vertical node; check all the characters in it | 718 // Vertical node; check all the characters in it |
| 497 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; | 719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
| 498 for (int j = 0; j < nodeCount && i < maxLength; ++j) { | 720 for (int j = 0; j < nodeCount && i < maxLength; ++j) { |
| 499 if (uc != vnode->chars[j]) { | 721 if (uc != vnode->chars[j]) { |
| 500 // We hit a non-equal character; return | 722 // We hit a non-equal character; return |
| 501 goto exit; | 723 goto exit; |
| 502 } | 724 } |
| 503 utext_next32(text); | 725 utext_next32(text); |
| 504 uc = utext_current32(text); | 726 uc = utext_current32(text); |
| 505 ++i; | 727 ++i; |
| 506 } | 728 } |
| 507 // To get here we must have come through the whole list successfully
; | 729 // To get here we must have come through the whole list successfully
; |
| 508 // go on to the next node. Note that a word cannot end in the middle | 730 // go on to the next node. Note that a word cannot end in the middle |
| 509 // of a vertical node. | 731 // of a vertical node. |
| 510 node = getCompactNode(fData, vnode->equal); | 732 node = getCompactNode(fInfo, calcEqualLink(vnode)); |
| 511 } | 733 } |
| 512 else { | 734 else { |
| 513 // Horizontal node; do binary search | 735 // Horizontal node; do binary search |
| 514 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; | 736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
| 515 int low = 0; | 737 const CompactTrieHorizontalEntry *entries; |
| 516 int high = nodeCount-1; | 738 entries = hnode->entries; |
| 517 int middle; | 739 |
| 518 node = NULL; // If we don't find a match, we'll fall out of the l
oop | 740 int index = searchHorizontalEntries(entries, uc, nodeCount); |
| 519 while (high >= low) { | 741 if(index > -1){ // |
| 520 middle = (high+low)/2; | 742 // We hit a match; get the next node and next character |
| 521 if (uc == hnode->entries[middle].ch) { | 743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCou
nt)); |
| 522 // We hit a match; get the next node and next character | 744 utext_next32(text); |
| 523 node = getCompactNode(fData, hnode->entries[middle].equal); | 745 uc = utext_current32(text); |
| 524 utext_next32(text); | 746 ++i; |
| 525 uc = utext_current32(text); | 747 }else{ |
| 526 ++i; | 748 node = NULL; // If we don't find a match, we'll fall out of t
he loop |
| 527 break; | |
| 528 } | |
| 529 else if (uc < hnode->entries[middle].ch) { | |
| 530 high = middle-1; | |
| 531 } | |
| 532 else { | |
| 533 low = middle+1; | |
| 534 } | |
| 535 } | 749 } |
| 536 } | 750 } |
| 537 } | 751 } |
| 538 exit: | 752 exit: |
| 539 count = mycount; | 753 count = mycount; |
| 540 return i; | 754 return i; |
| 541 } | 755 } |
| 542 | 756 |
| 543 // Implementation of iteration for CompactTrieDictionary | 757 // Implementation of iteration for CompactTrieDictionary |
| 544 class CompactTrieEnumeration : public StringEnumeration { | 758 class CompactTrieEnumeration : public StringEnumeration { |
| 545 private: | 759 private: |
| 546 UVector32 fNodeStack; // Stack of nodes to process | 760 UVector32 fNodeStack; // Stack of nodes to process |
| 547 UVector32 fIndexStack; // Stack of where in node we are | 761 UVector32 fIndexStack; // Stack of where in node we are |
| 548 const CompactTrieHeader *fHeader; // Trie data | 762 const CompactTrieInfo *fInfo; // Trie data |
| 549 | 763 |
| 550 public: | 764 public: |
| 551 static UClassID U_EXPORT2 getStaticClassID(void); | 765 static UClassID U_EXPORT2 getStaticClassID(void); |
| 552 virtual UClassID getDynamicClassID(void) const; | 766 virtual UClassID getDynamicClassID(void) const; |
| 553 public: | 767 public: |
| 554 CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) | 768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) |
| 555 : fNodeStack(status), fIndexStack(status) { | 769 : fNodeStack(status), fIndexStack(status) { |
| 556 fHeader = header; | 770 fInfo = info; |
| 557 fNodeStack.push(header->root, status); | 771 fNodeStack.push(info->root, status); |
| 558 fIndexStack.push(0, status); | 772 fIndexStack.push(0, status); |
| 559 unistr.remove(); | 773 unistr.remove(); |
| 560 } | 774 } |
| 561 | 775 |
| 562 virtual ~CompactTrieEnumeration() { | 776 virtual ~CompactTrieEnumeration() { |
| 563 } | 777 } |
| 564 | 778 |
| 565 virtual StringEnumeration *clone() const { | 779 virtual StringEnumeration *clone() const { |
| 566 UErrorCode status = U_ZERO_ERROR; | 780 UErrorCode status = U_ZERO_ERROR; |
| 567 return new CompactTrieEnumeration(fHeader, status); | 781 return new CompactTrieEnumeration(fInfo, status); |
| 568 } | 782 } |
| 569 | 783 |
| 570 virtual const UnicodeString * snext(UErrorCode &status); | 784 virtual const UnicodeString * snext(UErrorCode &status); |
| 571 | 785 |
| 572 // Very expensive, but this should never be used. | 786 // Very expensive, but this should never be used. |
| 573 virtual int32_t count(UErrorCode &status) const { | 787 virtual int32_t count(UErrorCode &status) const { |
| 574 CompactTrieEnumeration counter(fHeader, status); | 788 CompactTrieEnumeration counter(fInfo, status); |
| 575 int32_t result = 0; | 789 int32_t result = 0; |
| 576 while (counter.snext(status) != NULL && U_SUCCESS(status)) { | 790 while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
| 577 ++result; | 791 ++result; |
| 578 } | 792 } |
| 579 return result; | 793 return result; |
| 580 } | 794 } |
| 581 | 795 |
| 582 virtual void reset(UErrorCode &status) { | 796 virtual void reset(UErrorCode &status) { |
| 583 fNodeStack.removeAllElements(); | 797 fNodeStack.removeAllElements(); |
| 584 fIndexStack.removeAllElements(); | 798 fIndexStack.removeAllElements(); |
| 585 fNodeStack.push(fHeader->root, status); | 799 fNodeStack.push(fInfo->root, status); |
| 586 fIndexStack.push(0, status); | 800 fIndexStack.push(0, status); |
| 587 unistr.remove(); | 801 unistr.remove(); |
| 588 } | 802 } |
| 589 }; | 803 }; |
| 590 | 804 |
| 591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) | 805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) |
| 592 | 806 |
| 593 const UnicodeString * | 807 const UnicodeString * |
| 594 CompactTrieEnumeration::snext(UErrorCode &status) { | 808 CompactTrieEnumeration::snext(UErrorCode &status) { |
| 595 if (fNodeStack.empty() || U_FAILURE(status)) { | 809 if (fNodeStack.empty() || U_FAILURE(status)) { |
| 596 return NULL; | 810 return NULL; |
| 597 } | 811 } |
| 598 const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); | 812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 599 int where = fIndexStack.peeki(); | 813 int where = fIndexStack.peeki(); |
| 600 while (!fNodeStack.empty() && U_SUCCESS(status)) { | 814 while (!fNodeStack.empty() && U_SUCCESS(status)) { |
| 601 int nodeCount = (node->flagscount & kCountMask); | 815 int nodeCount; |
| 816 |
| 817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); |
| 818 if(isRoot){ |
| 819 nodeCount = node->flagscount & kRootCountMask; |
| 820 } else { |
| 821 nodeCount = getCount(node); |
| 822 } |
| 823 |
| 602 UBool goingDown = FALSE; | 824 UBool goingDown = FALSE; |
| 603 if (nodeCount == 0) { | 825 if (nodeCount == 0) { |
| 604 // Terminal node; go up immediately | 826 // Terminal node; go up immediately |
| 605 fNodeStack.popi(); | 827 fNodeStack.popi(); |
| 606 fIndexStack.popi(); | 828 fIndexStack.popi(); |
| 607 node = getCompactNode(fHeader, fNodeStack.peeki()); | 829 node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 608 where = fIndexStack.peeki(); | 830 where = fIndexStack.peeki(); |
| 609 } | 831 } |
| 610 else if (node->flagscount & kVerticalNode) { | 832 else if ((node->flagscount & kVerticalNode) && !isRoot) { |
| 611 // Vertical node | 833 // Vertical node |
| 612 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; | 834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
| 613 if (where == 0) { | 835 if (where == 0) { |
| 614 // Going down | 836 // Going down |
| 615 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); | 837 unistr.append((const UChar *)vnode->chars, nodeCount); |
| 616 fIndexStack.setElementAt(1, fIndexStack.size()-1); | 838 fIndexStack.setElementAt(1, fIndexStack.size()-1); |
| 617 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, sta
tus)); | 839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode
), status)); |
| 618 where = fIndexStack.push(0, status); | 840 where = fIndexStack.push(0, status); |
| 619 goingDown = TRUE; | 841 goingDown = TRUE; |
| 620 } | 842 } |
| 621 else { | 843 else { |
| 622 // Going up | 844 // Going up |
| 623 unistr.truncate(unistr.length()-nodeCount); | 845 unistr.truncate(unistr.length()-nodeCount); |
| 624 fNodeStack.popi(); | 846 fNodeStack.popi(); |
| 625 fIndexStack.popi(); | 847 fIndexStack.popi(); |
| 626 node = getCompactNode(fHeader, fNodeStack.peeki()); | 848 node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 627 where = fIndexStack.peeki(); | 849 where = fIndexStack.peeki(); |
| 628 } | 850 } |
| 629 } | 851 } |
| 630 else { | 852 else { |
| 631 // Horizontal node | 853 // Horizontal node |
| 632 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; | 854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
| 633 if (where > 0) { | 855 if (where > 0) { |
| 634 // Pop previous char | 856 // Pop previous char |
| 635 unistr.truncate(unistr.length()-1); | 857 unistr.truncate(unistr.length()-1); |
| 636 } | 858 } |
| 637 if (where < nodeCount) { | 859 if (where < nodeCount) { |
| 638 // Push on next node | 860 // Push on next node |
| 639 unistr.append((UChar)hnode->entries[where].ch); | 861 unistr.append((UChar)hnode->entries[where].ch); |
| 640 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); | 862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
| 641 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[wh
ere].equal, status)); | 863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode
, where, nodeCount), status)); |
| 642 where = fIndexStack.push(0, status); | 864 where = fIndexStack.push(0, status); |
| 643 goingDown = TRUE; | 865 goingDown = TRUE; |
| 644 } | 866 } |
| 645 else { | 867 else { |
| 646 // Going up | 868 // Going up |
| 647 fNodeStack.popi(); | 869 fNodeStack.popi(); |
| 648 fIndexStack.popi(); | 870 fIndexStack.popi(); |
| 649 node = getCompactNode(fHeader, fNodeStack.peeki()); | 871 node = getCompactNode(fInfo, fNodeStack.peeki()); |
| 650 where = fIndexStack.peeki(); | 872 where = fIndexStack.peeki(); |
| 651 } | 873 } |
| 652 } | 874 } |
| 875 |
| 653 // Check if the parent of the node we've just gone down to ends a | 876 // Check if the parent of the node we've just gone down to ends a |
| 654 // word. If so, return it. | 877 // word. If so, return it. |
| 878 // The root node should never end up here. |
| 655 if (goingDown && (node->flagscount & kParentEndsWord)) { | 879 if (goingDown && (node->flagscount & kParentEndsWord)) { |
| 656 return &unistr; | 880 return &unistr; |
| 657 } | 881 } |
| 658 } | 882 } |
| 659 return NULL; | 883 return NULL; |
| 660 } | 884 } |
| 661 | 885 |
| 662 StringEnumeration * | 886 StringEnumeration * |
| 663 CompactTrieDictionary::openWords( UErrorCode &status ) const { | 887 CompactTrieDictionary::openWords( UErrorCode &status ) const { |
| 664 if (U_FAILURE(status)) { | 888 if (U_FAILURE(status)) { |
| 665 return NULL; | 889 return NULL; |
| 666 } | 890 } |
| 667 return new CompactTrieEnumeration(fData, status); | 891 return new CompactTrieEnumeration(fInfo, status); |
| 668 } | 892 } |
| 669 | 893 |
| 670 // | 894 // |
| 671 // Below here is all code related to converting a ternary trie to a compact trie | 895 // Below here is all code related to converting a ternary trie to a compact trie |
| 672 // and back again | 896 // and back again |
| 673 // | 897 // |
| 674 | 898 |
| 675 // Helper classes to construct the compact trie | 899 enum CompactTrieNodeType { |
| 900 kHorizontalType = 0, |
| 901 kVerticalType = 1, |
| 902 kValueType = 2 |
| 903 }; |
| 904 |
| 905 /** |
| 906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to |
| 907 * construct the compact trie by storing information for each node and later |
| 908 * writing the node to memory in a sequential format. |
| 909 */ |
| 676 class BuildCompactTrieNode: public UMemory { | 910 class BuildCompactTrieNode: public UMemory { |
| 677 public: | 911 public: |
| 678 UBool fParentEndsWord; | 912 UBool fParentEndsWord; |
| 679 UBool fVertical; | 913 CompactTrieNodeType fNodeType; |
| 680 UBool fHasDuplicate; | 914 UBool fHasDuplicate; |
| 915 UBool fEqualOverflows; |
| 681 int32_t fNodeID; | 916 int32_t fNodeID; |
| 682 UnicodeString fChars; | 917 UnicodeString fChars; |
| 918 uint16_t fValue; |
| 683 | 919 |
| 684 public: | 920 public: |
| 685 BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UE
rrorCode &status) { | 921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, |
| 922 UStack &nodes, UErrorCode &status, uint16_t value = 0) { |
| 686 fParentEndsWord = parentEndsWord; | 923 fParentEndsWord = parentEndsWord; |
| 687 fHasDuplicate = FALSE; | 924 fHasDuplicate = FALSE; |
| 688 fVertical = vertical; | 925 fNodeType = nodeType; |
| 926 fEqualOverflows = FALSE; |
| 689 fNodeID = nodes.size(); | 927 fNodeID = nodes.size(); |
| 928 fValue = parentEndsWord? value : 0; |
| 690 nodes.push(this, status); | 929 nodes.push(this, status); |
| 691 } | 930 } |
| 692 | 931 |
| 693 virtual ~BuildCompactTrieNode() { | 932 virtual ~BuildCompactTrieNode() { |
| 694 } | 933 } |
| 695 | 934 |
| 696 virtual uint32_t size() { | 935 virtual uint32_t size() { |
| 697 return sizeof(uint16_t); | 936 if(fValue > 0) |
| 937 return sizeof(uint16_t) * 2; |
| 938 else |
| 939 return sizeof(uint16_t); |
| 698 } | 940 } |
| 699 | 941 |
| 700 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran
slate*/) { | 942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran
slate*/) { |
| 701 // Write flag/count | 943 // Write flag/count |
| 702 *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) | 944 |
| 703 | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWo
rd : 0 ); | 945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be |
| 946 // used as a 5th MSB. |
| 947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2); |
| 948 |
| 949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
|
| 950 ((fNodeID == 2)? (fChars.length() & kRootCountMask): |
| 951 ( |
| 952 (fChars.length() & kCountMask) | |
| 953 //((fChars.length() << 2) & kExceedsCount) | |
| 954 (fNodeType == kVerticalType ? kVerticalNode : 0) | |
| 955 (fParentEndsWord ? kParentEndsWord : 0 ) |
| 956 ) |
| 957 ); |
| 704 offset += sizeof(uint16_t); | 958 offset += sizeof(uint16_t); |
| 705 } | 959 } |
| 960 |
| 961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) { |
| 962 if(fValue > 0){ |
| 963 *((uint16_t *)(bytes+offset)) = fValue; |
| 964 offset += sizeof(uint16_t); |
| 965 } |
| 966 } |
| 967 |
| 968 }; |
| 969 |
| 970 /** |
| 971 * Stores value of parent terminating nodes that have no more subtries. |
| 972 */ |
| 973 class BuildCompactTrieValueNode: public BuildCompactTrieNode { |
| 974 public: |
| 975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) |
| 976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ |
| 977 } |
| 978 |
| 979 virtual ~BuildCompactTrieValueNode(){ |
| 980 } |
| 981 |
| 982 virtual uint32_t size() { |
| 983 return sizeof(uint16_t) * 2; |
| 984 } |
| 985 |
| 986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 987 // don't write value directly to memory but store it in offset to be wri
tten later |
| 988 //offset = fValue & kOffsetContainsValue; |
| 989 BuildCompactTrieNode::write(bytes, offset, translate); |
| 990 BuildCompactTrieNode::writeValue(bytes, offset); |
| 991 } |
| 706 }; | 992 }; |
| 707 | 993 |
| 708 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { | 994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { |
| 709 public: | 995 public: |
| 710 UStack fLinks; | 996 UStack fLinks; |
| 997 UBool fMayOverflow; //intermediate value for fEqualOverflows |
| 711 | 998 |
| 712 public: | 999 public: |
| 713 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo
de &status) | 1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo
de &status, uint16_t value = 0) |
| 714 : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(sta
tus) { | 1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value
), fLinks(status) { |
| 1002 fMayOverflow = FALSE; |
| 715 } | 1003 } |
| 716 | 1004 |
| 717 virtual ~BuildCompactTrieHorizontalNode() { | 1005 virtual ~BuildCompactTrieHorizontalNode() { |
| 718 } | 1006 } |
| 719 | 1007 |
| 720 virtual uint32_t size() { | 1008 // It is impossible to know beforehand exactly how much space the node will |
| 721 return offsetof(CompactTrieHorizontalNode,entries) + | 1009 // need in memory before being written, because the node IDs in the equal |
| 722 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); | 1010 // links may or may not overflow after node coalescing. Therefore, this meth
od |
| 1011 // returns the maximum size possible for the node. |
| 1012 virtual uint32_t size() { |
| 1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + |
| 1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
| 1015 |
| 1016 if(fValue > 0) |
| 1017 estimatedSize += sizeof(uint16_t); |
| 1018 |
| 1019 //estimate extra space needed to store overflow for node ID links |
| 1020 //may be more than what is actually needed |
| 1021 for(int i=0; i < fChars.length(); i++){ |
| 1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ |
| 1023 fMayOverflow = TRUE; |
| 1024 break; |
| 1025 } |
| 1026 } |
| 1027 if(fMayOverflow) // added space for overflow should be same as ceil(fCha
rs.length()/4) * sizeof(uint16_t) |
| 1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; |
| 1029 |
| 1030 return estimatedSize; |
| 723 } | 1031 } |
| 724 | 1032 |
| 725 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { | 1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 1034 int32_t count = fChars.length(); |
| 1035 |
| 1036 //if largest nodeID > 2^16, set flag |
| 1037 //large node IDs are more likely to be at the back of the array |
| 1038 for (int32_t i = count-1; i >= 0; --i) { |
| 1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID
) > 0xFFFF){ |
| 1040 fEqualOverflows = TRUE; |
| 1041 break; |
| 1042 } |
| 1043 } |
| 1044 |
| 726 BuildCompactTrieNode::write(bytes, offset, translate); | 1045 BuildCompactTrieNode::write(bytes, offset, translate); |
| 727 int32_t count = fChars.length(); | 1046 |
| 1047 // write entries[] to memory |
| 728 for (int32_t i = 0; i < count; ++i) { | 1048 for (int32_t i = 0; i < count; ++i) { |
| 729 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b
ytes+offset); | 1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b
ytes+offset); |
| 730 entry->ch = fChars[i]; | 1050 entry->ch = fChars[i]; |
| 731 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[
i])->fNodeID); | 1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[
i])->fNodeID); |
| 732 #ifdef DEBUG_TRIE_DICT | 1052 #ifdef DEBUG_TRIE_DICT |
| 733 if (entry->equal == 0) { | 1053 |
| 1054 if ((entry->equal == 0) && !fEqualOverflows) { |
| 734 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps
to physical node zero\n", | 1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps
to physical node zero\n", |
| 735 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); | 1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| 736 } | 1057 } |
| 737 #endif | 1058 #endif |
| 738 offset += sizeof(CompactTrieHorizontalEntry); | 1059 offset += sizeof(CompactTrieHorizontalEntry); |
| 739 } | 1060 } |
| 1061 |
| 1062 // append extra bits of equal nodes to end if fEqualOverflows |
| 1063 if (fEqualOverflows) { |
| 1064 uint16_t leftmostBits = 0; |
| 1065 for (int16_t i = 0; i < count; i++) { |
| 1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate,
i); |
| 1067 |
| 1068 // write filled uint16_t to memory |
| 1069 if(i % 4 == 3){ |
| 1070 *((uint16_t *)(bytes+offset)) = leftmostBits; |
| 1071 leftmostBits = 0; |
| 1072 offset += sizeof(uint16_t); |
| 1073 } |
| 1074 } |
| 1075 |
| 1076 // pad last uint16_t with zeroes if necessary |
| 1077 int remainder = count % 4; |
| 1078 if (remainder > 0) { |
| 1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remai
nder)); |
| 1080 offset += sizeof(uint16_t); |
| 1081 } |
| 1082 } |
| 1083 |
| 1084 BuildCompactTrieNode::writeValue(bytes, offset); |
| 1085 } |
| 1086 |
| 1087 // returns leftmost bits of physical node link |
| 1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ |
| 1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactT
rieNode *)fLinks[i])->fNodeID) >> 16); |
| 1090 #ifdef DEBUG_TRIE_DICT |
| 1091 if (leftmostBits > 0xF) { |
| 1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds
maximum possible node ID value\n", |
| 1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| 1094 } |
| 1095 #endif |
| 1096 return leftmostBits; |
| 740 } | 1097 } |
| 741 | 1098 |
| 742 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { | 1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { |
| 743 fChars.append(ch); | 1100 fChars.append(ch); |
| 744 fLinks.push(link, status); | 1101 fLinks.push(link, status); |
| 745 } | 1102 } |
| 1103 |
| 746 }; | 1104 }; |
| 747 | 1105 |
| 748 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { | 1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { |
| 749 public: | 1107 public: |
| 750 BuildCompactTrieNode *fEqual; | 1108 BuildCompactTrieNode *fEqual; |
| 751 | 1109 |
| 752 public: | 1110 public: |
| 753 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode
&status) | 1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode
&status, uint16_t value = 0) |
| 754 : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { | 1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value)
{ |
| 755 fEqual = NULL; | 1113 fEqual = NULL; |
| 756 } | 1114 } |
| 757 | 1115 |
| 758 virtual ~BuildCompactTrieVerticalNode() { | 1116 virtual ~BuildCompactTrieVerticalNode() { |
| 759 } | 1117 } |
| 760 | 1118 |
| 761 virtual uint32_t size() { | 1119 // Returns the maximum possible size of this node. See comment in |
| 762 return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof
(uint16_t)); | 1120 // BuildCompactTrieHorizontal node for more information. |
| 1121 virtual uint32_t size() { |
| 1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCha
rs.length()*sizeof(uint16_t)); |
| 1123 if(fValue > 0){ |
| 1124 estimatedSize += sizeof(uint16_t); |
| 1125 } |
| 1126 |
| 1127 if(fEqual->fNodeID > 0xFFFF){ |
| 1128 estimatedSize += sizeof(uint16_t); |
| 1129 } |
| 1130 return estimatedSize; |
| 763 } | 1131 } |
| 764 | 1132 |
| 765 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { | 1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 766 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset
); | 1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset
); |
| 1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); |
| 767 BuildCompactTrieNode::write(bytes, offset, translate); | 1136 BuildCompactTrieNode::write(bytes, offset, translate); |
| 768 node->equal = translate.elementAti(fEqual->fNodeID); | 1137 node->equal = translate.elementAti(fEqual->fNodeID); |
| 769 offset += sizeof(node->equal); | 1138 offset += sizeof(node->equal); |
| 770 #ifdef DEBUG_TRIE_DICT | 1139 #ifdef DEBUG_TRIE_DICT |
| 771 if (node->equal == 0) { | 1140 if ((node->equal == 0) && !fEqualOverflows) { |
| 772 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi
cal node zero\n", | 1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi
cal node zero\n", |
| 773 fEqual->fNodeID); | 1142 fEqual->fNodeID); |
| 774 } | 1143 } |
| 775 #endif | 1144 #endif |
| 776 fChars.extract(0, fChars.length(), (UChar *)node->chars); | 1145 fChars.extract(0, fChars.length(), (UChar *)node->chars); |
| 777 offset += sizeof(uint16_t)*fChars.length(); | 1146 offset += sizeof(UChar)*fChars.length(); |
| 1147 |
| 1148 // append 16 bits of to end for equal node if fEqualOverflows |
| 1149 if (fEqualOverflows) { |
| 1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeI
D) >> 16); |
| 1151 offset += sizeof(uint16_t); |
| 1152 } |
| 1153 |
| 1154 BuildCompactTrieNode::writeValue(bytes, offset); |
| 778 } | 1155 } |
| 779 | 1156 |
| 780 void addChar(UChar ch) { | 1157 void addChar(UChar ch) { |
| 781 fChars.append(ch); | 1158 fChars.append(ch); |
| 782 } | 1159 } |
| 783 | 1160 |
| 784 void setLink(BuildCompactTrieNode *node) { | 1161 void setLink(BuildCompactTrieNode *node) { |
| 785 fEqual = node; | 1162 fEqual = node; |
| 786 } | 1163 } |
| 1164 |
| 787 }; | 1165 }; |
| 788 | 1166 |
| 789 // Forward declaration | 1167 // Forward declaration |
| 790 static void walkHorizontal(const TernaryNode *node, | 1168 static void walkHorizontal(const TernaryNode *node, |
| 791 BuildCompactTrieHorizontalNode *building, | 1169 BuildCompactTrieHorizontalNode *building, |
| 792 UStack &nodes, | 1170 UStack &nodes, |
| 793 UErrorCode &status); | 1171 UErrorCode &status, |
| 1172 Hashtable *values); |
| 794 | 1173 |
| 795 // Convert one node. Uses recursion. | 1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. |
| 796 | 1175 |
| 797 static BuildCompactTrieNode * | 1176 static BuildCompactTrieNode * |
| 798 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UEr
rorCode &status) { | 1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, |
| 1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0)
{ |
| 799 if (U_FAILURE(status)) { | 1179 if (U_FAILURE(status)) { |
| 800 return NULL; | 1180 return NULL; |
| 801 } | 1181 } |
| 802 BuildCompactTrieNode *result = NULL; | 1182 BuildCompactTrieNode *result = NULL; |
| 803 UBool horizontal = (node->low != NULL || node->high != NULL); | 1183 UBool horizontal = (node->low != NULL || node->high != NULL); |
| 804 if (horizontal) { | 1184 if (horizontal) { |
| 805 BuildCompactTrieHorizontalNode *hResult = | 1185 BuildCompactTrieHorizontalNode *hResult; |
| 806 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status
); | 1186 if(values != NULL){ |
| 1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status, parentValue); |
| 1188 } else { |
| 1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status); |
| 1190 } |
| 1191 |
| 807 if (hResult == NULL) { | 1192 if (hResult == NULL) { |
| 808 status = U_MEMORY_ALLOCATION_ERROR; | 1193 status = U_MEMORY_ALLOCATION_ERROR; |
| 809 return NULL; | 1194 return NULL; |
| 810 } | 1195 } |
| 811 if (U_SUCCESS(status)) { | 1196 if (U_SUCCESS(status)) { |
| 812 walkHorizontal(node, hResult, nodes, status); | 1197 walkHorizontal(node, hResult, nodes, status, values); |
| 813 result = hResult; | 1198 result = hResult; |
| 814 } | 1199 } |
| 815 } | 1200 } |
| 816 else { | 1201 else { |
| 817 BuildCompactTrieVerticalNode *vResult = | 1202 BuildCompactTrieVerticalNode *vResult; |
| 818 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); | 1203 if(values != NULL){ |
| 1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st
atus, parentValue); |
| 1205 } else { |
| 1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st
atus); |
| 1207 } |
| 1208 |
| 819 if (vResult == NULL) { | 1209 if (vResult == NULL) { |
| 820 status = U_MEMORY_ALLOCATION_ERROR; | 1210 status = U_MEMORY_ALLOCATION_ERROR; |
| 1211 return NULL; |
| 821 } | 1212 } |
| 822 else if (U_SUCCESS(status)) { | 1213 else if (U_SUCCESS(status)) { |
| 823 UBool endsWord = FALSE; | 1214 uint16_t value = 0; |
| 1215 UBool endsWord = FALSE; |
| 824 // Take up nodes until we end a word, or hit a node with < or > link
s | 1216 // Take up nodes until we end a word, or hit a node with < or > link
s |
| 825 do { | 1217 do { |
| 826 vResult->addChar(node->ch); | 1218 vResult->addChar(node->ch); |
| 827 endsWord = (node->flags & kEndsWord) != 0; | 1219 value = node->flags; |
| 1220 endsWord = value > 0; |
| 828 node = node->equal; | 1221 node = node->equal; |
| 829 } | 1222 } |
| 830 while(node != NULL && !endsWord && node->low == NULL && node->high =
= NULL); | 1223 while(node != NULL && !endsWord && node->low == NULL && node->high =
= NULL); |
| 1224 |
| 831 if (node == NULL) { | 1225 if (node == NULL) { |
| 832 if (!endsWord) { | 1226 if (!endsWord) { |
| 833 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie | 1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie |
| 834 } | 1228 } |
| 835 else { | 1229 else if(values != NULL){ |
| 1230 UnicodeString key(value); //store value as a single-char Uni
codeString |
| 1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode
*) values->get(key); |
| 1232 if(link == NULL){ |
| 1233 link = new BuildCompactTrieValueNode(nodes, status, valu
e); //take out nodes? |
| 1234 values->put(key, link, status); |
| 1235 } |
| 1236 vResult->setLink(link); |
| 1237 } else { |
| 836 vResult->setLink((BuildCompactTrieNode *)nodes[1]); | 1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
| 837 } | 1239 } |
| 838 } | 1240 } |
| 839 else { | 1241 else { |
| 840 vResult->setLink(compactOneNode(node, endsWord, nodes, status)); | 1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, v
alues, value)); |
| 841 } | 1243 } |
| 842 result = vResult; | 1244 result = vResult; |
| 843 } | 1245 } |
| 844 } | 1246 } |
| 845 return result; | 1247 return result; |
| 846 } | 1248 } |
| 847 | 1249 |
| 848 // Walk the set of peers at the same level, to build a horizontal node. | 1250 // Walk the set of peers at the same level, to build a horizontal node. |
| 849 // Uses recursion. | 1251 // Uses recursion. |
| 850 | 1252 |
| 851 static void walkHorizontal(const TernaryNode *node, | 1253 static void walkHorizontal(const TernaryNode *node, |
| 852 BuildCompactTrieHorizontalNode *building, | 1254 BuildCompactTrieHorizontalNode *building, |
| 853 UStack &nodes, | 1255 UStack &nodes, |
| 854 UErrorCode &status) { | 1256 UErrorCode &status, Hashtable *values = NULL) { |
| 855 while (U_SUCCESS(status) && node != NULL) { | 1257 while (U_SUCCESS(status) && node != NULL) { |
| 856 if (node->low != NULL) { | 1258 if (node->low != NULL) { |
| 857 walkHorizontal(node->low, building, nodes, status); | 1259 walkHorizontal(node->low, building, nodes, status, values); |
| 858 } | 1260 } |
| 859 BuildCompactTrieNode *link = NULL; | 1261 BuildCompactTrieNode *link = NULL; |
| 860 if (node->equal != NULL) { | 1262 if (node->equal != NULL) { |
| 861 link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, n
odes, status); | 1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, v
alues, node->flags); |
| 862 } | 1264 } |
| 863 else if (node->flags & kEndsWord) { | 1265 else if (node->flags > 0) { |
| 864 link = (BuildCompactTrieNode *)nodes[1]; | 1266 if(values != NULL) { |
| 1267 UnicodeString key(node->flags); //store value as a single-char U
nicodeString |
| 1268 link = (BuildCompactTrieValueNode *) values->get(key); |
| 1269 if(link == NULL) { |
| 1270 link = new BuildCompactTrieValueNode(nodes, status, node->fl
ags); //take out nodes? |
| 1271 values->put(key, link, status); |
| 1272 } |
| 1273 } else { |
| 1274 link = (BuildCompactTrieNode *)nodes[1]; |
| 1275 } |
| 865 } | 1276 } |
| 866 if (U_SUCCESS(status) && link != NULL) { | 1277 if (U_SUCCESS(status) && link != NULL) { |
| 867 building->addNode(node->ch, link, status); | 1278 building->addNode(node->ch, link, status); |
| 868 } | 1279 } |
| 869 // Tail recurse manually instead of leaving it to the compiler. | 1280 // Tail recurse manually instead of leaving it to the compiler. |
| 870 //if (node->high != NULL) { | 1281 //if (node->high != NULL) { |
| 871 // walkHorizontal(node->high, building, nodes, status); | 1282 // walkHorizontal(node->high, building, nodes, status); |
| 872 //} | 1283 //} |
| 873 node = node->high; | 1284 node = node->high; |
| 874 } | 1285 } |
| 875 } | 1286 } |
| 876 | 1287 |
| 877 U_NAMESPACE_END | 1288 U_NAMESPACE_END |
| 878 U_NAMESPACE_USE | 1289 U_NAMESPACE_USE |
| 879 U_CDECL_BEGIN | 1290 U_CDECL_BEGIN |
| 880 static int32_t U_CALLCONV | 1291 static int32_t U_CALLCONV |
| 881 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ | 1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ |
| 882 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; | 1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
| 883 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; | 1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
| 1295 |
| 884 // Check for comparing a node to itself, to avoid spurious duplicates | 1296 // Check for comparing a node to itself, to avoid spurious duplicates |
| 885 if (left == right) { | 1297 if (left == right) { |
| 886 return 0; | 1298 return 0; |
| 887 } | 1299 } |
| 1300 |
| 888 // Most significant is type of node. Can never coalesce. | 1301 // Most significant is type of node. Can never coalesce. |
| 889 if (left->fVertical != right->fVertical) { | 1302 if (left->fNodeType != right->fNodeType) { |
| 890 return left->fVertical - right->fVertical; | 1303 return left->fNodeType - right->fNodeType; |
| 891 } | 1304 } |
| 892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. | 1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
| 893 if (left->fParentEndsWord != right->fParentEndsWord) { | 1306 if (left->fParentEndsWord != right->fParentEndsWord) { |
| 894 return left->fParentEndsWord - right->fParentEndsWord; | 1307 return left->fParentEndsWord - right->fParentEndsWord; |
| 895 } | 1308 } |
| 896 // Next, the string. If that differs, we can never coalesce. | 1309 // Next, the string. If that differs, we can never coalesce. |
| 897 int32_t result = left->fChars.compare(right->fChars); | 1310 int32_t result = left->fChars.compare(right->fChars); |
| 898 if (result != 0) { | 1311 if (result != 0) { |
| 899 return result; | 1312 return result; |
| 900 } | 1313 } |
| 1314 |
| 1315 // If the node value differs, we should not coalesce. |
| 1316 // If values aren't stored, all fValues should be 0. |
| 1317 if (left->fValue != right->fValue) { |
| 1318 return left->fValue - right->fValue; |
| 1319 } |
| 1320 |
| 901 // We know they're both the same node type, so branch for the two cases. | 1321 // We know they're both the same node type, so branch for the two cases. |
| 902 if (left->fVertical) { | 1322 if (left->fNodeType == kVerticalType) { |
| 903 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID | 1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
| 904 - ((BuildCompactTrieVerticalNode *)right)->fEqual->f
NodeID; | 1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
| 905 } | 1325 } |
| 906 else { | 1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){ |
| 907 // We need to compare the links vectors. They should be the | 1327 // We need to compare the links vectors. They should be the |
| 908 // same size because the strings were equal. | 1328 // same size because the strings were equal. |
| 909 // We compare the node IDs instead of the pointers, to handle | 1329 // We compare the node IDs instead of the pointers, to handle |
| 910 // coalesced nodes. | 1330 // coalesced nodes. |
| 911 BuildCompactTrieHorizontalNode *hleft, *hright; | 1331 BuildCompactTrieHorizontalNode *hleft, *hright; |
| 912 hleft = (BuildCompactTrieHorizontalNode *)left; | 1332 hleft = (BuildCompactTrieHorizontalNode *)left; |
| 913 hright = (BuildCompactTrieHorizontalNode *)right; | 1333 hright = (BuildCompactTrieHorizontalNode *)right; |
| 914 int32_t count = hleft->fLinks.size(); | 1334 int32_t count = hleft->fLinks.size(); |
| 915 for (int32_t i = 0; i < count && result == 0; ++i) { | 1335 for (int32_t i = 0; i < count && result == 0; ++i) { |
| 916 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - | 1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
| 917 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | 1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| 918 } | 1338 } |
| 919 } | 1339 } |
| 1340 |
| 920 // If they are equal to each other, mark them (speeds coalescing) | 1341 // If they are equal to each other, mark them (speeds coalescing) |
| 921 if (result == 0) { | 1342 if (result == 0) { |
| 922 left->fHasDuplicate = TRUE; | 1343 left->fHasDuplicate = TRUE; |
| 923 right->fHasDuplicate = TRUE; | 1344 right->fHasDuplicate = TRUE; |
| 924 } | 1345 } |
| 925 return result; | 1346 return result; |
| 926 } | 1347 } |
| 927 U_CDECL_END | 1348 U_CDECL_END |
| 928 U_NAMESPACE_BEGIN | 1349 U_NAMESPACE_BEGIN |
| 929 | 1350 |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1024 #ifdef DEBUG_TRIE_DICT | 1445 #ifdef DEBUG_TRIE_DICT |
| 1025 struct tms timing; | 1446 struct tms timing; |
| 1026 struct tms previous; | 1447 struct tms previous; |
| 1027 (void) ::times(&previous); | 1448 (void) ::times(&previous); |
| 1028 #endif | 1449 #endif |
| 1029 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes | 1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes |
| 1030 | 1451 |
| 1031 // Add node 0, used as the NULL pointer/sentinel. | 1452 // Add node 0, used as the NULL pointer/sentinel. |
| 1032 nodes.addElement((int32_t)0, status); | 1453 nodes.addElement((int32_t)0, status); |
| 1033 | 1454 |
| 1455 Hashtable *values = NULL; // Index of (unique) val
ues |
| 1456 if (dict.fValued) { |
| 1457 values = new Hashtable(status); |
| 1458 } |
| 1459 |
| 1034 // Start by creating the special empty node we use to indicate that the pare
nt | 1460 // Start by creating the special empty node we use to indicate that the pare
nt |
| 1035 // terminates a word. This must be node 1, because the builder assumes | 1461 // terminates a word. This must be node 1, because the builder assumes |
| 1036 // that. | 1462 // that. This node will never be used for tries storing numerical values. |
| 1037 if (U_FAILURE(status)) { | 1463 if (U_FAILURE(status)) { |
| 1038 return NULL; | 1464 return NULL; |
| 1039 } | 1465 } |
| 1040 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes
, status); | 1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalT
ype, nodes, status); |
| 1041 if (terminal == NULL) { | 1467 if (terminal == NULL) { |
| 1042 status = U_MEMORY_ALLOCATION_ERROR; | 1468 status = U_MEMORY_ALLOCATION_ERROR; |
| 1043 } | 1469 } |
| 1044 | 1470 |
| 1045 // This call does all the work of building the new trie structure. The root | 1471 // This call does all the work of building the new trie structure. The root |
| 1046 // will be node 2. | 1472 // will have node ID 2 before writing to memory. |
| 1047 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status
); | 1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status
, values); |
| 1048 #ifdef DEBUG_TRIE_DICT | 1474 #ifdef DEBUG_TRIE_DICT |
| 1049 (void) ::times(&timing); | 1475 (void) ::times(&timing); |
| 1050 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", | 1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
| 1051 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1052 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1053 previous = timing; | 1479 previous = timing; |
| 1054 #endif | 1480 #endif |
| 1055 | 1481 |
| 1056 // Now coalesce all duplicate nodes. | 1482 // Now coalesce all duplicate nodes. |
| 1057 coalesceDuplicates(nodes, status); | 1483 coalesceDuplicates(nodes, status); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1070 int32_t nodeCount = 1; // The sentinel node we already have | 1496 int32_t nodeCount = 1; // The sentinel node we already have |
| 1071 BuildCompactTrieNode *node; | 1497 BuildCompactTrieNode *node; |
| 1072 int32_t i; | 1498 int32_t i; |
| 1073 UVector32 translate(count, status); // Should be no growth needed after this | 1499 UVector32 translate(count, status); // Should be no growth needed after this |
| 1074 translate.push(0, status); // The sentinel node | 1500 translate.push(0, status); // The sentinel node |
| 1075 | 1501 |
| 1076 if (U_FAILURE(status)) { | 1502 if (U_FAILURE(status)) { |
| 1077 return NULL; | 1503 return NULL; |
| 1078 } | 1504 } |
| 1079 | 1505 |
| 1506 //map terminal value nodes |
| 1507 int valueCount = 0; |
| 1508 UVector valueNodes(status); |
| 1509 if(values != NULL) { |
| 1510 valueCount = values->count(); //number of unique terminal value nodes |
| 1511 } |
| 1512 |
| 1513 // map non-terminal nodes |
| 1514 int valuePos = 1;//, nodePos = valueCount + valuePos; |
| 1515 nodeCount = valueCount + valuePos; |
| 1080 for (i = 1; i < count; ++i) { | 1516 for (i = 1; i < count; ++i) { |
| 1081 node = (BuildCompactTrieNode *)nodes[i]; | 1517 node = (BuildCompactTrieNode *)nodes[i]; |
| 1082 if (node->fNodeID == i) { | 1518 if (node->fNodeID == i) { |
| 1083 // Only one node out of each duplicate set is used | 1519 // Only one node out of each duplicate set is used |
| 1084 if (i >= translate.size()) { | 1520 if (node->fNodeID >= translate.size()) { |
| 1085 // Logically extend the mapping table | 1521 // Logically extend the mapping table |
| 1086 translate.setSize(i+1); | 1522 translate.setSize(i + 1); |
| 1087 } | 1523 } |
| 1088 translate.setElementAt(nodeCount++, i); | 1524 //translate.setElementAt(object, index)! |
| 1525 if(node->fNodeType == kValueType) { |
| 1526 valueNodes.addElement(node, status); |
| 1527 translate.setElementAt(valuePos++, i); |
| 1528 } else { |
| 1529 translate.setElementAt(nodeCount++, i); |
| 1530 } |
| 1089 totalSize += node->size(); | 1531 totalSize += node->size(); |
| 1090 } | 1532 } |
| 1091 } | 1533 } |
| 1092 | 1534 |
| 1093 // Check for overflowing 16 bits worth of nodes. | 1535 // Check for overflowing 20 bits worth of nodes. |
| 1094 if (nodeCount > 0x10000) { | 1536 if (nodeCount > 0x100000) { |
| 1095 status = U_ILLEGAL_ARGUMENT_ERROR; | 1537 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1096 return NULL; | 1538 return NULL; |
| 1097 } | 1539 } |
| 1098 | 1540 |
| 1099 // Add enough room for the offsets. | 1541 // Add enough room for the offsets. |
| 1100 totalSize += nodeCount*sizeof(uint32_t); | 1542 totalSize += nodeCount*sizeof(uint32_t); |
| 1101 #ifdef DEBUG_TRIE_DICT | 1543 #ifdef DEBUG_TRIE_DICT |
| 1102 (void) ::times(&timing); | 1544 (void) ::times(&timing); |
| 1103 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", | 1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", |
| 1104 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1105 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1106 previous = timing; | 1548 previous = timing; |
| 1107 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount,
totalSize); | 1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount,
totalSize); |
| 1108 #endif | 1550 #endif |
| 1109 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); | 1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); |
| 1110 if (bytes == NULL) { | 1552 if (bytes == NULL) { |
| 1111 status = U_MEMORY_ALLOCATION_ERROR; | 1553 status = U_MEMORY_ALLOCATION_ERROR; |
| 1112 return NULL; | 1554 return NULL; |
| 1113 } | 1555 } |
| 1114 | 1556 |
| 1115 CompactTrieHeader *header = (CompactTrieHeader *)bytes; | 1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
| 1116 header->size = totalSize; | 1558 //header->size = totalSize; |
| 1559 if(dict.fValued){ |
| 1560 header->magic = COMPACT_TRIE_MAGIC_3; |
| 1561 } else { |
| 1562 header->magic = COMPACT_TRIE_MAGIC_2; |
| 1563 } |
| 1117 header->nodeCount = nodeCount; | 1564 header->nodeCount = nodeCount; |
| 1118 header->offsets[0] = 0; // Sentinel | 1565 header->offsets[0] = 0; // Sentinel |
| 1119 header->root = translate.elementAti(root->fNodeID); | 1566 header->root = translate.elementAti(root->fNodeID); |
| 1120 #ifdef DEBUG_TRIE_DICT | 1567 #ifdef DEBUG_TRIE_DICT |
| 1121 if (header->root == 0) { | 1568 if (header->root == 0) { |
| 1122 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root
->fNodeID); | 1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root
->fNodeID); |
| 1123 } | 1570 } |
| 1124 #endif | 1571 #endif |
| 1125 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint
32_t)); | 1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint
32_t)); |
| 1126 nodeCount = 1; | 1573 nodeCount = valueCount + 1; |
| 1574 |
| 1575 // Write terminal value nodes to memory |
| 1576 for (i=0; i < valueNodes.size(); i++) { |
| 1577 //header->offsets[i + 1] = offset; |
| 1578 uint32_t tmpOffset = 0; |
| 1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i); |
| 1580 //header->offsets[i + 1] = (uint32_t)node->fValue; |
| 1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); |
| 1582 } |
| 1583 |
| 1127 // Now write the data | 1584 // Now write the data |
| 1128 for (i = 1; i < count; ++i) { | 1585 for (i = 1; i < count; ++i) { |
| 1129 node = (BuildCompactTrieNode *)nodes[i]; | 1586 node = (BuildCompactTrieNode *)nodes[i]; |
| 1130 if (node->fNodeID == i) { | 1587 if (node->fNodeID == i && node->fNodeType != kValueType) { |
| 1131 header->offsets[nodeCount++] = offset; | 1588 header->offsets[nodeCount++] = offset; |
| 1132 node->write(bytes, offset, translate); | 1589 node->write(bytes, offset, translate); |
| 1133 } | 1590 } |
| 1134 } | 1591 } |
| 1592 |
| 1593 //free all extra space |
| 1594 uprv_realloc(bytes, offset); |
| 1595 header->size = offset; |
| 1596 |
| 1135 #ifdef DEBUG_TRIE_DICT | 1597 #ifdef DEBUG_TRIE_DICT |
| 1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset); |
| 1599 |
| 1136 (void) ::times(&timing); | 1600 (void) ::times(&timing); |
| 1137 fprintf(stderr, "Trie built, time user %f system %f\n", | 1601 fprintf(stderr, "Trie built, time user %f system %f\n", |
| 1138 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| 1139 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| 1140 previous = timing; | 1604 previous = timing; |
| 1141 fprintf(stderr, "Final offset is %d\n", offset); | 1605 fprintf(stderr, "Final offset is %d\n", offset); |
| 1142 | 1606 |
| 1143 // Collect statistics on node types and sizes | 1607 // Collect statistics on node types and sizes |
| 1144 int hCount = 0; | 1608 int hCount = 0; |
| 1145 int vCount = 0; | 1609 int vCount = 0; |
| 1146 size_t hSize = 0; | 1610 size_t hSize = 0; |
| 1147 size_t vSize = 0; | 1611 size_t vSize = 0; |
| 1148 size_t hItemCount = 0; | 1612 size_t hItemCount = 0; |
| 1149 size_t vItemCount = 0; | 1613 size_t vItemCount = 0; |
| 1150 uint32_t previousOff = offset; | 1614 uint32_t previousOff = offset; |
| 1151 for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | 1615 uint32_t numOverflow = 0; |
| 1616 uint32_t valueSpace = 0; |
| 1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| 1152 const CompactTrieNode *node = getCompactNode(header, nodeIdx); | 1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
| 1153 if (node->flagscount & kVerticalNode) { | 1619 int itemCount; |
| 1620 if(nodeIdx == header->root) |
| 1621 itemCount = node->flagscount & kRootCountMask; |
| 1622 else |
| 1623 itemCount = getCount(node); |
| 1624 if(node->flagscount & kEqualOverflows){ |
| 1625 numOverflow++; |
| 1626 } |
| 1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) { |
| 1154 vCount += 1; | 1628 vCount += 1; |
| 1155 vItemCount += (node->flagscount & kCountMask); | 1629 vItemCount += itemCount; |
| 1156 vSize += previousOff-header->offsets[nodeIdx]; | 1630 vSize += previousOff-header->offsets[nodeIdx]; |
| 1157 } | 1631 } |
| 1158 else { | 1632 else { |
| 1159 hCount += 1; | 1633 hCount += 1; |
| 1160 hItemCount += (node->flagscount & kCountMask); | 1634 hItemCount += itemCount; |
| 1161 hSize += previousOff-header->offsets[nodeIdx]; | 1635 if(nodeIdx >= header->root) { |
| 1636 hSize += previousOff-header->offsets[nodeIdx]; |
| 1637 } |
| 1162 } | 1638 } |
| 1639 |
| 1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEn
dsWord) |
| 1641 valueSpace += sizeof(uint16_t); |
| 1163 previousOff = header->offsets[nodeIdx]; | 1642 previousOff = header->offsets[nodeIdx]; |
| 1164 } | 1643 } |
| 1165 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\
n", hCount, | 1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\
n", hCount, |
| 1166 (double)hSize/hCount, (double)hItemCount/hCount); | 1645 (double)hSize/hCount, (double)hItemCount/hCount); |
| 1167 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n"
, vCount, | 1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n"
, vCount, |
| 1168 (double)vSize/vCount, (double)vItemCount/vCount); | 1647 (double)vSize/vCount, (double)vItemCount/vCount); |
| 1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverfl
ow); |
| 1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace); |
| 1169 #endif | 1650 #endif |
| 1170 | 1651 |
| 1171 if (U_FAILURE(status)) { | 1652 if (U_FAILURE(status)) { |
| 1172 uprv_free(bytes); | 1653 uprv_free(bytes); |
| 1173 header = NULL; | 1654 header = NULL; |
| 1174 } | 1655 } |
| 1175 else { | |
| 1176 header->magic = COMPACT_TRIE_MAGIC_1; | |
| 1177 } | |
| 1178 return header; | 1656 return header; |
| 1179 } | 1657 } |
| 1180 | 1658 |
| 1181 // Forward declaration | 1659 // Forward declaration |
| 1182 static TernaryNode * | 1660 static TernaryNode * |
| 1183 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ); | 1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC
ode &status ); |
| 1184 | |
| 1185 | 1662 |
| 1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie | 1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie |
| 1187 static TernaryNode * | 1664 static TernaryNode * |
| 1188 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizon
talEntry *array, | 1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalN
ode *hnode, |
| 1189 int low, int high, UErrorCode &status ) { | 1666 int low, int high, int nodeCount, UErrorCode &status) { |
| 1190 if (U_FAILURE(status) || low > high) { | 1667 if (U_FAILURE(status) || low > high) { |
| 1191 return NULL; | 1668 return NULL; |
| 1192 } | 1669 } |
| 1193 int middle = (low+high)/2; | 1670 int middle = (low+high)/2; |
| 1194 TernaryNode *result = new TernaryNode(array[middle].ch); | 1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); |
| 1195 if (result == NULL) { | 1672 if (result == NULL) { |
| 1196 status = U_MEMORY_ALLOCATION_ERROR; | 1673 status = U_MEMORY_ALLOCATION_ERROR; |
| 1197 return NULL; | 1674 return NULL; |
| 1198 } | 1675 } |
| 1199 const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); | 1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mid
dle, nodeCount)); |
| 1200 if (equal->flagscount & kParentEndsWord) { | 1677 if (equal->flagscount & kParentEndsWord) { |
| 1201 result->flags |= kEndsWord; | 1678 if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| 1679 result->flags = getValue(equal); |
| 1680 }else{ |
| 1681 result->flags |= kEndsWord; |
| 1682 } |
| 1202 } | 1683 } |
| 1203 result->low = unpackHorizontalArray(header, array, low, middle-1, status); | 1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, s
tatus); |
| 1204 result->high = unpackHorizontalArray(header, array, middle+1, high, status); | 1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount,
status); |
| 1205 result->equal = unpackOneNode(header, equal, status); | 1686 result->equal = unpackOneNode(info, equal, status); |
| 1206 return result; | 1687 return result; |
| 1207 } | 1688 } |
| 1208 | 1689 |
| 1209 // Convert one compact trie node into a ternary subtrie | 1690 // Convert one compact trie node into a ternary subtrie |
| 1210 static TernaryNode * | 1691 static TernaryNode * |
| 1211 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ) { | 1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC
ode &status ) { |
| 1212 int nodeCount = (node->flagscount & kCountMask); | 1693 int nodeCount = getCount(node); |
| 1213 if (nodeCount == 0 || U_FAILURE(status)) { | 1694 if (nodeCount == 0 || U_FAILURE(status)) { |
| 1214 // Failure, or terminal node | 1695 // Failure, or terminal node |
| 1215 return NULL; | 1696 return NULL; |
| 1216 } | 1697 } |
| 1217 if (node->flagscount & kVerticalNode) { | 1698 if (node->flagscount & kVerticalNode) { |
| 1218 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)
node; | 1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)
node; |
| 1219 TernaryNode *head = NULL; | 1700 TernaryNode *head = NULL; |
| 1220 TernaryNode *previous = NULL; | 1701 TernaryNode *previous = NULL; |
| 1221 TernaryNode *latest = NULL; | 1702 TernaryNode *latest = NULL; |
| 1222 for (int i = 0; i < nodeCount; ++i) { | 1703 for (int i = 0; i < nodeCount; ++i) { |
| 1223 latest = new TernaryNode(vnode->chars[i]); | 1704 latest = new TernaryNode(vnode->chars[i]); |
| 1224 if (latest == NULL) { | 1705 if (latest == NULL) { |
| 1225 status = U_MEMORY_ALLOCATION_ERROR; | 1706 status = U_MEMORY_ALLOCATION_ERROR; |
| 1226 break; | 1707 break; |
| 1227 } | 1708 } |
| 1228 if (head == NULL) { | 1709 if (head == NULL) { |
| 1229 head = latest; | 1710 head = latest; |
| 1230 } | 1711 } |
| 1231 if (previous != NULL) { | 1712 if (previous != NULL) { |
| 1232 previous->equal = latest; | 1713 previous->equal = latest; |
| 1233 } | 1714 } |
| 1234 previous = latest; | 1715 previous = latest; |
| 1235 } | 1716 } |
| 1236 if (latest != NULL) { | 1717 if (latest != NULL) { |
| 1237 const CompactTrieNode *equal = getCompactNode(header, vnode->equal); | 1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vn
ode)); |
| 1238 if (equal->flagscount & kParentEndsWord) { | 1719 if (equal->flagscount & kParentEndsWord) { |
| 1239 latest->flags |= kEndsWord; | 1720 if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| 1721 latest->flags = getValue(equal); |
| 1722 } else { |
| 1723 latest->flags |= kEndsWord; |
| 1724 } |
| 1240 } | 1725 } |
| 1241 latest->equal = unpackOneNode(header, equal, status); | 1726 latest->equal = unpackOneNode(info, equal, status); |
| 1242 } | 1727 } |
| 1243 return head; | 1728 return head; |
| 1244 } | 1729 } |
| 1245 else { | 1730 else { |
| 1246 // Horizontal node | 1731 // Horizontal node |
| 1247 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod
e *)node; | 1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod
e *)node; |
| 1248 return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1,
status); | 1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, sta
tus); |
| 1249 } | 1734 } |
| 1250 } | 1735 } |
| 1251 | 1736 |
| 1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary |
| 1252 MutableTrieDictionary * | 1738 MutableTrieDictionary * |
| 1253 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { | 1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { |
| 1254 MutableTrieDictionary *result = new MutableTrieDictionary( status ); | 1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->ma
gic == COMPACT_TRIE_MAGIC_3 ); |
| 1255 if (result == NULL) { | 1741 if (result == NULL) { |
| 1256 status = U_MEMORY_ALLOCATION_ERROR; | 1742 status = U_MEMORY_ALLOCATION_ERROR; |
| 1257 return NULL; | 1743 return NULL; |
| 1258 } | 1744 } |
| 1259 TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root),
status); | 1745 // treat root node as special case: don't call unpackOneNode() or unpackHori
zontalArray() directly |
| 1746 // because only kEqualOverflows flag should be checked in root's flagscount |
| 1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
|
| 1748 getCompactNode(fInfo, fInfo->root); |
| 1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask; |
| 1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, |
| 1751 nodeCount, status); |
| 1752 |
| 1260 if (U_FAILURE(status)) { | 1753 if (U_FAILURE(status)) { |
| 1261 delete root; // Clean up | 1754 delete root; // Clean up |
| 1262 delete result; | 1755 delete result; |
| 1263 return NULL; | 1756 return NULL; |
| 1264 } | 1757 } |
| 1265 result->fTrie = root; | 1758 result->fTrie = root; |
| 1266 return result; | 1759 return result; |
| 1267 } | 1760 } |
| 1268 | 1761 |
| 1269 U_NAMESPACE_END | 1762 U_NAMESPACE_END |
| 1270 | 1763 |
| 1271 U_CAPI int32_t U_EXPORT2 | 1764 U_CAPI int32_t U_EXPORT2 |
| 1272 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *
outData, | 1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *
outData, |
| 1273 UErrorCode *status) { | 1766 UErrorCode *status) { |
| 1274 | 1767 |
| 1275 if (status == NULL || U_FAILURE(*status)) { | 1768 if (status == NULL || U_FAILURE(*status)) { |
| 1276 return 0; | 1769 return 0; |
| 1277 } | 1770 } |
| 1278 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { | 1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
| 1279 *status=U_ILLEGAL_ARGUMENT_ERROR; | 1772 *status=U_ILLEGAL_ARGUMENT_ERROR; |
| 1280 return 0; | 1773 return 0; |
| 1281 } | 1774 } |
| 1282 | 1775 |
| 1283 // | 1776 // |
| 1284 // Check that the data header is for for dictionary data. | 1777 // Check that the data header is for for dictionary data. |
| 1285 // (Header contents are defined in genxxx.cpp) | 1778 // (Header contents are defined in genxxx.cpp) |
| 1286 // | 1779 // |
| 1287 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); | 1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
| 1288 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ | 1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
| 1289 pInfo->dataFormat[1]==0x72 && | 1782 pInfo->dataFormat[1]==0x72 && |
| 1290 pInfo->dataFormat[2]==0x44 && | 1783 pInfo->dataFormat[2]==0x44 && |
| 1291 pInfo->dataFormat[3]==0x63 && | 1784 pInfo->dataFormat[3]==0x63 && |
| 1292 pInfo->formatVersion[0]==1 )) { | 1785 pInfo->formatVersion[0]==1 )) { |
| 1293 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (
format version %02x) is not recognized\n", | 1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (
format version %02x) is not recognized\n", |
| 1294 pInfo->dataFormat[0], pInfo->dataFormat[1], | 1787 pInfo->dataFormat[0], pInfo->dataFormat[1], |
| 1295 pInfo->dataFormat[2], pInfo->dataFormat[3], | 1788 pInfo->dataFormat[2], pInfo->dataFormat[3], |
| 1296 pInfo->formatVersion[0]); | 1789 pInfo->formatVersion[0]); |
| 1297 *status=U_UNSUPPORTED_ERROR; | 1790 *status=U_UNSUPPORTED_ERROR; |
| 1298 return 0; | 1791 return 0; |
| 1299 } | 1792 } |
| 1300 | 1793 |
| 1301 // | 1794 // |
| 1302 // Swap the data header. (This is the generic ICU Data Header, not the | 1795 // Swap the data header. (This is the generic ICU Data Header, not the |
| 1303 // CompactTrieHeader). This swap also conveniently
gets us | 1796 // CompactTrieHeader). This swap also conveniently
gets us |
| 1304 // the size of the ICU d.h., which lets us locate th
e start | 1797 // the size of the ICU d.h., which lets us locate th
e start |
| 1305 // of the RBBI specific data. | 1798 // of the RBBI specific data. |
| 1306 // | 1799 // |
| 1307 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status)
; | 1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status)
; |
| 1308 | 1801 |
| 1309 // | 1802 // |
| 1310 // Get the CompactTrieHeader, and check that it appears to be OK. | 1803 // Get the CompactTrieHeader, and check that it appears to be OK. |
| 1311 // | 1804 // |
| 1312 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | 1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
| 1313 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; | 1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
| 1314 if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 | 1807 uint32_t magic = ds->readUInt32(header->magic); |
| 1315 || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) | 1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic
!= COMPACT_TRIE_MAGIC_3 |
| 1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s
izeof(CompactTrieHeaderV1) |
| 1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s
izeof(CompactTrieHeader)) |
| 1316 { | 1811 { |
| 1317 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n")
; | 1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n")
; |
| 1318 *status=U_UNSUPPORTED_ERROR; | 1813 *status=U_UNSUPPORTED_ERROR; |
| 1319 return 0; | 1814 return 0; |
| 1320 } | 1815 } |
| 1321 | 1816 |
| 1322 // | 1817 // |
| 1323 // Prefight operation? Just return the size | 1818 // Prefight operation? Just return the size |
| 1324 // | 1819 // |
| 1325 uint32_t totalSize = ds->readUInt32(header->size); | 1820 uint32_t totalSize = ds->readUInt32(header->size); |
| 1326 int32_t sizeWithUData = (int32_t)totalSize + headerSize; | 1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize; |
| 1327 if (length < 0) { | 1822 if (length < 0) { |
| 1328 return sizeWithUData; | 1823 return sizeWithUData; |
| 1329 } | 1824 } |
| 1330 | 1825 |
| 1331 // | 1826 // |
| 1332 // Check that length passed in is consistent with length from RBBI data head
er. | 1827 // Check that length passed in is consistent with length from RBBI data head
er. |
| 1333 // | 1828 // |
| 1334 if (length < sizeWithUData) { | 1829 if (length < sizeWithUData) { |
| 1335 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", | 1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", |
| 1336 totalSize); | 1831 totalSize); |
| 1337 *status=U_INDEX_OUTOFBOUNDS_ERROR; | 1832 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1338 return 0; | 1833 return 0; |
| 1339 } | 1834 } |
| 1340 | 1835 |
| 1341 // | 1836 // |
| 1342 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec
ause | 1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec
ause |
| 1343 // we need to reference the header to locate the data, and a
n | 1838 // we need to reference the header to locate the data, and a
n |
| 1344 // inplace swap of the header leaves it unusable. | 1839 // inplace swap of the header leaves it unusable. |
| 1345 // | 1840 // |
| 1346 uint8_t *outBytes = (uint8_t *)outData + headerSize; | 1841 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
| 1347 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; | 1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; |
| 1348 | 1843 |
| 1349 #if 0 | 1844 #if 0 |
| 1350 // | 1845 // |
| 1351 // If not swapping in place, zero out the output buffer before starting. | 1846 // If not swapping in place, zero out the output buffer before starting. |
| 1352 // | 1847 // |
| 1353 if (inBytes != outBytes) { | 1848 if (inBytes != outBytes) { |
| 1354 uprv_memset(outBytes, 0, totalSize); | 1849 uprv_memset(outBytes, 0, totalSize); |
| 1355 } | 1850 } |
| 1356 | 1851 |
| 1357 // We need to loop through all the nodes in the offset table, and swap each
one. | 1852 // We need to loop through all the nodes in the offset table, and swap each
one. |
| 1358 uint16_t nodeCount = ds->readUInt16(header->nodeCount); | 1853 uint32_t nodeCount, rootId; |
| 1854 if(header->magic == COMPACT_TRIE_MAGIC_1) { |
| 1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); |
| 1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); |
| 1857 } else { |
| 1858 nodeCount = ds->readUInt32(header->nodeCount); |
| 1859 rootId = ds->readUInt32(header->root); |
| 1860 } |
| 1861 |
| 1359 // Skip node 0, which should always be 0. | 1862 // Skip node 0, which should always be 0. |
| 1360 for (int i = 1; i < nodeCount; ++i) { | 1863 for (uint32_t i = 1; i < nodeCount; ++i) { |
| 1361 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); | 1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); |
| 1362 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node
Off); | 1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node
Off); |
| 1363 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); | 1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); |
| 1364 uint16_t flagscount = ds->readUInt16(inNode->flagscount); | 1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount); |
| 1365 uint16_t itemCount = flagscount & kCountMask; | 1868 uint16_t itemCount = getCount(inNode); |
| 1869 //uint16_t itemCount = flagscount & kCountMask; |
| 1366 ds->writeUInt16(&outNode->flagscount, flagscount); | 1870 ds->writeUInt16(&outNode->flagscount, flagscount); |
| 1367 if (itemCount > 0) { | 1871 if (itemCount > 0) { |
| 1368 if (flagscount & kVerticalNode) { | 1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swap
ped |
| 1873 if (flagscount & kVerticalNode && i != rootId) { |
| 1874 if(flagscount & kEqualOverflows){ |
| 1875 // include overflow bits |
| 1876 overflow += 1; |
| 1877 } |
| 1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsP
arentWord) { |
| 1879 //include values |
| 1880 overflow += 1; |
| 1881 } |
| 1369 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical
Node,chars), | 1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical
Node,chars), |
| 1370 itemCount*sizeof(uint16_t), | 1883 (itemCount + overflow)*sizeof(uint16_t), |
| 1371 outBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), status); | 1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars)
, status); |
| 1372 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact
TrieVerticalNode,equal); | 1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact
TrieVerticalNode,equal); |
| 1373 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod
e,equal)); | 1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod
e,equal)); |
| 1374 } | 1887 } |
| 1375 else { | 1888 else { |
| 1376 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor
izontalNode *)inNode; | 1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor
izontalNode *)inNode; |
| 1377 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode
*)outNode; | 1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode
*)outNode; |
| 1378 for (int j = 0; j < itemCount; ++j) { | 1891 for (int j = 0; j < itemCount; ++j) { |
| 1379 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); | 1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); |
| 1380 ds->writeUInt16(&outHNode->entries[j].ch, word); | 1893 ds->writeUInt16(&outHNode->entries[j].ch, word); |
| 1381 word = ds->readUInt16(inHNode->entries[j].equal); | 1894 word = ds->readUInt16(inHNode->entries[j].equal); |
| 1382 ds->writeUInt16(&outHNode->entries[j].equal, word); | 1895 ds->writeUInt16(&outHNode->entries[j].equal, word); |
| 1383 } | 1896 } |
| 1897 |
| 1898 // swap overflow/value information |
| 1899 if(flagscount & kEqualOverflows){ |
| 1900 overflow += (itemCount + 3) / 4; |
| 1901 } |
| 1902 |
| 1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flag
scount & kEndsParentWord) { |
| 1904 //include values |
| 1905 overflow += 1; |
| 1906 } |
| 1907 |
| 1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount]
; |
| 1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCoun
t]; |
| 1910 for(int j = 0; j<overflow; j++){ |
| 1911 uint16_t extraInfo = ds->readUInt16(*inOverflow); |
| 1912 ds->writeUInt16(outOverflow, extraInfo); |
| 1913 |
| 1914 inOverflow++; |
| 1915 outOverflow++; |
| 1916 } |
| 1384 } | 1917 } |
| 1385 } | 1918 } |
| 1386 } | 1919 } |
| 1387 #endif | 1920 #endif |
| 1388 | 1921 |
| 1922 // Swap the header |
| 1923 ds->writeUInt32(&outputHeader->size, totalSize); |
| 1924 ds->writeUInt32(&outputHeader->magic, magic); |
| 1925 |
| 1926 uint32_t nodeCount; |
| 1927 uint32_t offsetPos; |
| 1928 if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| 1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; |
| 1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeade
r; |
| 1931 |
| 1932 nodeCount = ds->readUInt16(headerV1->nodeCount); |
| 1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); |
| 1934 uint16_t root = ds->readUInt16(headerV1->root); |
| 1935 ds->writeUInt16(&outputHeaderV1->root, root); |
| 1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets); |
| 1937 } else { |
| 1938 nodeCount = ds->readUInt32(header->nodeCount); |
| 1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount); |
| 1940 uint32_t root = ds->readUInt32(header->root); |
| 1941 ds->writeUInt32(&outputHeader->root, root); |
| 1942 offsetPos = offsetof(CompactTrieHeader,offsets); |
| 1943 } |
| 1944 |
| 1389 // All the data in all the nodes consist of 16 bit items. Swap them all at o
nce. | 1945 // All the data in all the nodes consist of 16 bit items. Swap them all at o
nce. |
| 1390 uint16_t nodeCount = ds->readUInt16(header->nodeCount); | 1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); |
| 1391 uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount
*sizeof(uint32_t)); | |
| 1392 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff,
status); | 1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff,
status); |
| 1393 | 1948 |
| 1394 // Swap the header | 1949 //swap offsets |
| 1395 ds->writeUInt32(&outputHeader->size, totalSize); | 1950 ds->swapArray32(ds, inBytes+offsetPos, |
| 1396 uint32_t magic = ds->readUInt32(header->magic); | 1951 sizeof(uint32_t)*(uint32_t)nodeCount, |
| 1397 ds->writeUInt32(&outputHeader->magic, magic); | 1952 outBytes+offsetPos, status); |
| 1398 ds->writeUInt16(&outputHeader->nodeCount, nodeCount); | |
| 1399 uint16_t root = ds->readUInt16(header->root); | |
| 1400 ds->writeUInt16(&outputHeader->root, root); | |
| 1401 ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), | |
| 1402 sizeof(uint32_t)*(int32_t)nodeCount, | |
| 1403 outBytes+offsetof(CompactTrieHeader,offsets), status); | |
| 1404 | 1953 |
| 1405 return sizeWithUData; | 1954 return sizeWithUData; |
| 1406 } | 1955 } |
| 1407 | 1956 |
| 1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| OLD | NEW |