OLD | NEW |
1 /** | 1 /** |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2006-2008, International Business Machines Corporation * | 3 * Copyright (C) 2006-2008, International Business Machines Corporation * |
4 * and others. All Rights Reserved. * | 4 * and others. All Rights Reserved. * |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_BREAK_ITERATION | 10 #if !UCONFIG_NO_BREAK_ITERATION |
11 | 11 |
12 #include "triedict.h" | 12 #include "triedict.h" |
13 #include "unicode/chariter.h" | 13 #include "unicode/chariter.h" |
14 #include "unicode/uchriter.h" | 14 #include "unicode/uchriter.h" |
15 #include "unicode/strenum.h" | 15 #include "unicode/strenum.h" |
16 #include "unicode/uenum.h" | 16 #include "unicode/uenum.h" |
17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
18 #include "cmemory.h" | 18 #include "cmemory.h" |
19 #include "udataswp.h" | 19 #include "udataswp.h" |
20 #include "uvector.h" | 20 #include "uvector.h" |
21 #include "uvectr32.h" | 21 #include "uvectr32.h" |
22 #include "uarrsort.h" | 22 #include "uarrsort.h" |
| 23 #include "hash.h" |
23 | 24 |
24 //#define DEBUG_TRIE_DICT 1 | 25 //#define DEBUG_TRIE_DICT 1 |
25 | 26 |
26 #ifdef DEBUG_TRIE_DICT | 27 #ifdef DEBUG_TRIE_DICT |
27 #include <sys/times.h> | 28 #include <sys/times.h> |
28 #include <limits.h> | 29 #include <limits.h> |
29 #include <stdio.h> | 30 #include <stdio.h> |
| 31 #include <time.h> |
| 32 #ifndef CLK_TCK |
| 33 #define CLK_TCK CLOCKS_PER_SEC |
| 34 #endif |
| 35 |
30 #endif | 36 #endif |
31 | 37 |
32 U_NAMESPACE_BEGIN | 38 U_NAMESPACE_BEGIN |
33 | 39 |
34 /******************************************************************* | 40 /******************************************************************* |
35 * TrieWordDictionary | 41 * TrieWordDictionary |
36 */ | 42 */ |
37 | 43 |
38 TrieWordDictionary::TrieWordDictionary() { | 44 TrieWordDictionary::TrieWordDictionary() { |
39 } | 45 } |
40 | 46 |
41 TrieWordDictionary::~TrieWordDictionary() { | 47 TrieWordDictionary::~TrieWordDictionary() { |
42 } | 48 } |
43 | 49 |
44 /******************************************************************* | 50 /******************************************************************* |
45 * MutableTrieDictionary | 51 * MutableTrieDictionary |
46 */ | 52 */ |
47 | 53 |
| 54 //#define MAX_VALUE 65535 |
| 55 |
| 56 // forward declaration |
| 57 inline uint16_t scaleLogProbabilities(double logprob); |
| 58 |
48 // Node structure for the ternary, uncompressed trie | 59 // Node structure for the ternary, uncompressed trie |
49 struct TernaryNode : public UMemory { | 60 struct TernaryNode : public UMemory { |
50 UChar ch; // UTF-16 code unit | 61 UChar ch; // UTF-16 code unit |
51 uint16_t flags; // Flag word | 62 uint16_t flags; // Flag word |
52 TernaryNode *low; // Less-than link | 63 TernaryNode *low; // Less-than link |
53 TernaryNode *equal; // Equal link | 64 TernaryNode *equal; // Equal link |
54 TernaryNode *high; // Greater-than link | 65 TernaryNode *high; // Greater-than link |
55 | 66 |
56 TernaryNode(UChar uc); | 67 TernaryNode(UChar uc); |
57 ~TernaryNode(); | 68 ~TernaryNode(); |
(...skipping 12 matching lines...) Expand all Loading... |
70 high = NULL; | 81 high = NULL; |
71 } | 82 } |
72 | 83 |
73 // Not inline since it's recursive | 84 // Not inline since it's recursive |
74 TernaryNode::~TernaryNode() { | 85 TernaryNode::~TernaryNode() { |
75 delete low; | 86 delete low; |
76 delete equal; | 87 delete equal; |
77 delete high; | 88 delete high; |
78 } | 89 } |
79 | 90 |
80 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status )
{ | 91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, |
| 92 UBool containsValue /* = FALSE */
) { |
81 // Start the trie off with something. Having the root node already present | 93 // Start the trie off with something. Having the root node already present |
82 // cuts a special case out of the search/insertion functions. | 94 // cuts a special case out of the search/insertion functions. |
83 // Making it a median character cuts the worse case for searches from | 95 // Making it a median character cuts the worse case for searches from |
84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something | 96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something |
85 // that starts a word that is midway in the list. | 97 // that starts a word that is midway in the list. |
86 fTrie = new TernaryNode(median); | 98 fTrie = new TernaryNode(median); |
87 if (fTrie == NULL) { | 99 if (fTrie == NULL) { |
88 status = U_MEMORY_ALLOCATION_ERROR; | 100 status = U_MEMORY_ALLOCATION_ERROR; |
89 } | 101 } |
90 fIter = utext_openUChars(NULL, NULL, 0, &status); | 102 fIter = utext_openUChars(NULL, NULL, 0, &status); |
91 if (U_SUCCESS(status) && fIter == NULL) { | 103 if (U_SUCCESS(status) && fIter == NULL) { |
92 status = U_MEMORY_ALLOCATION_ERROR; | 104 status = U_MEMORY_ALLOCATION_ERROR; |
93 } | 105 } |
| 106 |
| 107 fValued = containsValue; |
94 } | 108 } |
95 | 109 |
96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { | 110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, |
| 111 UBool containsValue /* = false */
) { |
97 fTrie = NULL; | 112 fTrie = NULL; |
98 fIter = utext_openUChars(NULL, NULL, 0, &status); | 113 fIter = utext_openUChars(NULL, NULL, 0, &status); |
99 if (U_SUCCESS(status) && fIter == NULL) { | 114 if (U_SUCCESS(status) && fIter == NULL) { |
100 status = U_MEMORY_ALLOCATION_ERROR; | 115 status = U_MEMORY_ALLOCATION_ERROR; |
101 } | 116 } |
| 117 |
| 118 fValued = containsValue; |
102 } | 119 } |
103 | 120 |
104 MutableTrieDictionary::~MutableTrieDictionary() { | 121 MutableTrieDictionary::~MutableTrieDictionary() { |
105 delete fTrie; | 122 delete fTrie; |
106 utext_close(fIter); | 123 utext_close(fIter); |
107 } | 124 } |
108 | 125 |
109 int32_t | 126 int32_t |
110 MutableTrieDictionary::search( UText *text, | 127 MutableTrieDictionary::search( UText *text, |
111 int32_t maxLength, | 128 int32_t maxLength, |
112 int32_t *lengths, | 129 int32_t *lengths, |
113 int &count, | 130 int &count, |
114 int limit, | 131 int limit, |
115 TernaryNode *&parent, | 132 TernaryNode *&parent, |
116 UBool &pMatched ) const { | 133 UBool &pMatched, |
| 134 uint16_t *values /*=NULL*/) const { |
117 // TODO: current implementation works in UTF-16 space | 135 // TODO: current implementation works in UTF-16 space |
118 const TernaryNode *up = NULL; | 136 const TernaryNode *up = NULL; |
119 const TernaryNode *p = fTrie; | 137 const TernaryNode *p = fTrie; |
120 int mycount = 0; | 138 int mycount = 0; |
121 pMatched = TRUE; | 139 pMatched = TRUE; |
122 int i; | 140 int i; |
123 | 141 |
| 142 if (!fValued) { |
| 143 values = NULL; |
| 144 } |
| 145 |
124 UChar uc = utext_current32(text); | 146 UChar uc = utext_current32(text); |
125 for (i = 0; i < maxLength && p != NULL; ++i) { | 147 for (i = 0; i < maxLength && p != NULL; ++i) { |
126 while (p != NULL) { | 148 while (p != NULL) { |
127 if (uc < p->ch) { | 149 if (uc < p->ch) { |
128 up = p; | 150 up = p; |
129 p = p->low; | 151 p = p->low; |
130 } | 152 } |
131 else if (uc == p->ch) { | 153 else if (uc == p->ch) { |
132 break; | 154 break; |
133 } | 155 } |
134 else { | 156 else { |
135 up = p; | 157 up = p; |
136 p = p->high; | 158 p = p->high; |
137 } | 159 } |
138 } | 160 } |
139 if (p == NULL) { | 161 if (p == NULL) { |
140 pMatched = FALSE; | 162 pMatched = FALSE; |
141 break; | 163 break; |
142 } | 164 } |
143 // Must be equal to get here | 165 // Must be equal to get here |
144 if (limit > 0 && (p->flags & kEndsWord)) { | 166 if (limit > 0 && (p->flags > 0)) { |
| 167 //is there a more efficient way to add values? ie. remove if stmt |
| 168 if(values != NULL) { |
| 169 values[mycount] = p->flags; |
| 170 } |
145 lengths[mycount++] = i+1; | 171 lengths[mycount++] = i+1; |
146 --limit; | 172 --limit; |
147 } | 173 } |
148 up = p; | 174 up = p; |
149 p = p->equal; | 175 p = p->equal; |
150 uc = utext_next32(text); | 176 uc = utext_next32(text); |
151 uc = utext_current32(text); | 177 uc = utext_current32(text); |
152 } | 178 } |
153 | 179 |
154 // Note that there is no way to reach here with up == 0 unless | 180 // Note that there is no way to reach here with up == 0 unless |
155 // maxLength is 0 coming in. | 181 // maxLength is 0 coming in. |
156 parent = (TernaryNode *)up; | 182 parent = (TernaryNode *)up; |
157 count = mycount; | 183 count = mycount; |
158 return i; | 184 return i; |
159 } | 185 } |
160 | 186 |
161 void | 187 void |
162 MutableTrieDictionary::addWord( const UChar *word, | 188 MutableTrieDictionary::addWord( const UChar *word, |
163 int32_t length, | 189 int32_t length, |
164 UErrorCode &status ) { | 190 UErrorCode &status, |
165 #if 0 | 191 uint16_t value /* = 0 */ ) { |
166 if (length <= 0) { | 192 // dictionary cannot store zero values, would interfere with flags |
| 193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { |
167 status = U_ILLEGAL_ARGUMENT_ERROR; | 194 status = U_ILLEGAL_ARGUMENT_ERROR; |
168 return; | 195 return; |
169 } | 196 } |
170 #endif | 197 |
171 TernaryNode *parent; | 198 TernaryNode *parent; |
172 UBool pMatched; | 199 UBool pMatched; |
173 int count; | 200 int count; |
174 fIter = utext_openUChars(fIter, word, length, &status); | 201 fIter = utext_openUChars(fIter, word, length, &status); |
175 | 202 |
176 int matched; | 203 int matched; |
177 matched = search(fIter, length, NULL, count, 0, parent, pMatched); | 204 matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
178 | 205 |
179 while (matched++ < length) { | 206 while (matched++ < length) { |
180 UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? | 207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support? |
181 U_ASSERT(uc != U_SENTINEL); | 208 U_ASSERT(uc != U_SENTINEL); |
182 TernaryNode *newNode = new TernaryNode(uc); | 209 TernaryNode *newNode = new TernaryNode(uc); |
183 if (newNode == NULL) { | 210 if (newNode == NULL) { |
184 status = U_MEMORY_ALLOCATION_ERROR; | 211 status = U_MEMORY_ALLOCATION_ERROR; |
185 return; | 212 return; |
186 } | 213 } |
187 if (pMatched) { | 214 if (pMatched) { |
188 parent->equal = newNode; | 215 parent->equal = newNode; |
189 } | 216 } |
190 else { | 217 else { |
191 pMatched = TRUE; | 218 pMatched = TRUE; |
192 if (uc < parent->ch) { | 219 if (uc < parent->ch) { |
193 parent->low = newNode; | 220 parent->low = newNode; |
194 } | 221 } |
195 else { | 222 else { |
196 parent->high = newNode; | 223 parent->high = newNode; |
197 } | 224 } |
198 } | 225 } |
199 parent = newNode; | 226 parent = newNode; |
200 } | 227 } |
201 | 228 |
202 parent->flags |= kEndsWord; | 229 if(fValued && value > 0){ |
203 } | 230 parent->flags = value; |
204 | 231 } else { |
205 #if 0 | 232 parent->flags |= kEndsWord; |
206 void | |
207 MutableTrieDictionary::addWords( UEnumeration *words, | |
208 UErrorCode &status ) { | |
209 int32_t length; | |
210 const UChar *word; | |
211 while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { | |
212 addWord(word, length, status); | |
213 } | 233 } |
214 } | 234 } |
215 #endif | |
216 | 235 |
217 int32_t | 236 int32_t |
218 MutableTrieDictionary::matches( UText *text, | 237 MutableTrieDictionary::matches( UText *text, |
219 int32_t maxLength, | 238 int32_t maxLength, |
220 int32_t *lengths, | 239 int32_t *lengths, |
221 int &count, | 240 int &count, |
222 int limit ) const { | 241 int limit, |
| 242 uint16_t *values /*=NULL*/) const { |
223 TernaryNode *parent; | 243 TernaryNode *parent; |
224 UBool pMatched; | 244 UBool pMatched; |
225 return search(text, maxLength, lengths, count, limit, parent, pMatched); | 245 return search(text, maxLength, lengths, count, limit, parent, pMatched, valu
es); |
226 } | 246 } |
227 | 247 |
228 // Implementation of iteration for MutableTrieDictionary | 248 // Implementation of iteration for MutableTrieDictionary |
229 class MutableTrieEnumeration : public StringEnumeration { | 249 class MutableTrieEnumeration : public StringEnumeration { |
230 private: | 250 private: |
231 UStack fNodeStack; // Stack of nodes to process | 251 UStack fNodeStack; // Stack of nodes to process |
232 UVector32 fBranchStack; // Stack of which branch we are working on | 252 UVector32 fBranchStack; // Stack of which branch we are working on |
233 TernaryNode *fRoot; // Root node | 253 TernaryNode *fRoot; // Root node |
234 enum StackBranch { | 254 enum StackBranch { |
235 kLessThan, | 255 kLessThan, |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
270 | 290 |
271 switch (where) { | 291 switch (where) { |
272 case kLessThan: | 292 case kLessThan: |
273 if (node->low != NULL) { | 293 if (node->low != NULL) { |
274 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); | 294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); |
275 node = (TernaryNode *) fNodeStack.push(node->low, status); | 295 node = (TernaryNode *) fNodeStack.push(node->low, status); |
276 where = (StackBranch) fBranchStack.push(kLessThan, status); | 296 where = (StackBranch) fBranchStack.push(kLessThan, status); |
277 break; | 297 break; |
278 } | 298 } |
279 case kEqual: | 299 case kEqual: |
280 emit = (node->flags & kEndsWord) != 0; | 300 emit = node->flags > 0; |
281 equal = (node->equal != NULL); | 301 equal = (node->equal != NULL); |
282 // If this node should be part of the next emitted string, appen
d | 302 // If this node should be part of the next emitted string, appen
d |
283 // the UChar to the string, and make sure we pop it when we come | 303 // the UChar to the string, and make sure we pop it when we come |
284 // back to this node. The character should only be in the string | 304 // back to this node. The character should only be in the string |
285 // for as long as we're traversing the equal subtree of this nod
e | 305 // for as long as we're traversing the equal subtree of this nod
e |
286 if (equal || emit) { | 306 if (equal || emit) { |
287 unistr.append(node->ch); | 307 unistr.append(node->ch); |
288 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-
1); | 308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-
1); |
289 } | 309 } |
290 if (equal) { | 310 if (equal) { |
291 node = (TernaryNode *) fNodeStack.push(node->equal, status); | 311 node = (TernaryNode *) fNodeStack.push(node->equal, status); |
292 where = (StackBranch) fBranchStack.push(kLessThan, status); | 312 where = (StackBranch) fBranchStack.push(kLessThan, status); |
293 } | 313 } |
294 if (emit) { | 314 if (emit) { |
295 return &unistr; | 315 return &unistr; |
296 } | 316 } |
297 if (equal) { | 317 if (equal) { |
298 break; | 318 break; |
299 } | 319 } |
300 case kGreaterThan: | 320 case kGreaterThan: |
301 // If this node's character is in the string, remove it. | 321 // If this node's character is in the string, remove it. |
302 if (node->equal != NULL || (node->flags & kEndsWord)) { | 322 if (node->equal != NULL || node->flags > 0) { |
303 unistr.truncate(unistr.length()-1); | 323 unistr.truncate(unistr.length()-1); |
304 } | 324 } |
305 if (node->high != NULL) { | 325 if (node->high != NULL) { |
306 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); | 326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); |
307 node = (TernaryNode *) fNodeStack.push(node->high, status); | 327 node = (TernaryNode *) fNodeStack.push(node->high, status); |
308 where = (StackBranch) fBranchStack.push(kLessThan, status); | 328 where = (StackBranch) fBranchStack.push(kLessThan, status); |
309 break; | 329 break; |
310 } | 330 } |
311 case kDone: | 331 case kDone: |
312 fNodeStack.pop(); | 332 fNodeStack.pop(); |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
347 if (U_FAILURE(status)) { | 367 if (U_FAILURE(status)) { |
348 return NULL; | 368 return NULL; |
349 } | 369 } |
350 return new MutableTrieEnumeration(fTrie, status); | 370 return new MutableTrieEnumeration(fTrie, status); |
351 } | 371 } |
352 | 372 |
353 /******************************************************************* | 373 /******************************************************************* |
354 * CompactTrieDictionary | 374 * CompactTrieDictionary |
355 */ | 375 */ |
356 | 376 |
| 377 //TODO further optimization: |
| 378 // minimise size of trie with logprobs by storing values |
| 379 // for terminal nodes directly in offsets[] |
| 380 // --> calculating from next offset *might* be simpler, but would have to add |
| 381 // one last offset for logprob of last node |
| 382 // --> if calculate from current offset, need to factor in possible overflow |
| 383 // as well. |
| 384 // idea: store in offset, set first bit to indicate logprob storage-->won't |
| 385 // have to access additional node |
| 386 |
| 387 // {'Dic', 1}, version 1: uses old header, no values |
| 388 #define COMPACT_TRIE_MAGIC_1 0x44696301 |
| 389 // version 2: uses new header (more than 2^16 nodes), no values |
| 390 #define COMPACT_TRIE_MAGIC_2 0x44696302 |
| 391 // version 3: uses new header, includes values |
| 392 #define COMPACT_TRIE_MAGIC_3 0x44696303 |
| 393 |
357 struct CompactTrieHeader { | 394 struct CompactTrieHeader { |
358 uint32_t size; // Size of the data in bytes | 395 uint32_t size; // Size of the data in bytes |
359 uint32_t magic; // Magic number (including version) | 396 uint32_t magic; // Magic number (including version) |
| 397 uint32_t nodeCount; // Number of entries in offsets[] |
| 398 uint32_t root; // Node number of the root node |
| 399 uint32_t offsets[1]; // Offsets to nodes from start of data |
| 400 }; |
| 401 |
| 402 // old version of CompactTrieHeader kept for backwards compatibility |
| 403 struct CompactTrieHeaderV1 { |
| 404 uint32_t size; // Size of the data in bytes |
| 405 uint32_t magic; // Magic number (including version) |
360 uint16_t nodeCount; // Number of entries in offsets[] | 406 uint16_t nodeCount; // Number of entries in offsets[] |
361 uint16_t root; // Node number of the root node | 407 uint16_t root; // Node number of the root node |
362 uint32_t offsets[1]; // Offsets to nodes from start of data | 408 uint32_t offsets[1]; // Offsets to nodes from start of data |
| 409 }; |
| 410 |
| 411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 |
| 412 struct CompactTrieInfo { |
| 413 uint32_t size; // Size of the data in bytes |
| 414 uint32_t magic; // Magic number (including version) |
| 415 uint32_t nodeCount; // Number of entries in offsets[] |
| 416 uint32_t root; // Node number of the root node |
| 417 uint32_t *offsets; // Offsets to nodes from start of data |
| 418 uint8_t *address; // pointer to header bytes in memory |
| 419 |
| 420 CompactTrieInfo(const void *data, UErrorCode &status){ |
| 421 CompactTrieHeader *header = (CompactTrieHeader *) data; |
| 422 if (header->magic != COMPACT_TRIE_MAGIC_1 && |
| 423 header->magic != COMPACT_TRIE_MAGIC_2 && |
| 424 header->magic != COMPACT_TRIE_MAGIC_3) { |
| 425 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 426 } else { |
| 427 size = header->size; |
| 428 magic = header->magic; |
| 429 |
| 430 if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| 431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; |
| 432 nodeCount = headerV1->nodeCount; |
| 433 root = headerV1->root; |
| 434 offsets = &(headerV1->offsets[0]); |
| 435 address = (uint8_t *)headerV1; |
| 436 } else { |
| 437 nodeCount = header->nodeCount; |
| 438 root = header->root; |
| 439 offsets = &(header->offsets[0]); |
| 440 address = (uint8_t *)header; |
| 441 } |
| 442 } |
| 443 } |
| 444 |
| 445 ~CompactTrieInfo(){} |
363 }; | 446 }; |
364 | 447 |
365 // Note that to avoid platform-specific alignment issues, all members of the nod
e | 448 // Note that to avoid platform-specific alignment issues, all members of the nod
e |
366 // structures should be the same size, or should contain explicit padding to | 449 // structures should be the same size, or should contain explicit padding to |
367 // natural alignment boundaries. | 450 // natural alignment boundaries. |
368 | 451 |
369 // We can't use a bitfield for the flags+count field, because the layout of thos
e | 452 // We can't use a bitfield for the flags+count field, because the layout of thos
e |
370 // is not portable. 12 bits of count allows for up to 4096 entries in a node. | 453 // is not portable. 12 bits of count allows for up to 4096 entries in a node. |
371 struct CompactTrieNode { | 454 struct CompactTrieNode { |
372 uint16_t flagscount; // Count of sub-entries, plus flags | 455 uint16_t flagscount; // Count of sub-entries, plus flags |
373 }; | 456 }; |
374 | 457 |
375 enum CompactTrieNodeFlags { | 458 enum CompactTrieNodeFlags { |
376 kVerticalNode = 0x1000, // This is a vertical node | 459 kVerticalNode = 0x1000, // This is a vertical node |
377 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word | 460 kParentEndsWord = 0x2000, // The node whose equal link points to this
ends a word |
378 kReservedFlag1 = 0x4000, | 461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kRe
servedFlag1 |
379 kReservedFlag2 = 0x8000, | 462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedF
lag2 |
380 kCountMask = 0x0FFF, // The count portion of flagscount | 463 kCountMask = 0x0FFF, // The count portion of flagscount |
381 kFlagMask = 0xF000 // The flags portion of flagscount | 464 kFlagMask = 0xF000, // The flags portion of flagscount |
| 465 kRootCountMask = 0x7FFF // The count portion of flagscount in the ro
ot node |
| 466 |
| 467 //offset flags: |
| 468 //kOffsetContainsValue = 0x80000000 // Offset contains value for paren
t node |
382 }; | 469 }; |
383 | 470 |
384 // The two node types are distinguished by the kVerticalNode flag. | 471 // The two node types are distinguished by the kVerticalNode flag. |
385 | 472 |
386 struct CompactTrieHorizontalEntry { | 473 struct CompactTrieHorizontalEntry { |
387 uint16_t ch; // UChar | 474 uint16_t ch; // UChar |
388 uint16_t equal; // Equal link node index | 475 uint16_t equal; // Equal link node index |
389 }; | 476 }; |
390 | 477 |
391 // We don't use inheritance here because C++ does not guarantee that the | 478 // We don't use inheritance here because C++ does not guarantee that the |
392 // base class comes first in memory!! | 479 // base class comes first in memory!! |
393 | 480 |
394 struct CompactTrieHorizontalNode { | 481 struct CompactTrieHorizontalNode { |
395 uint16_t flagscount; // Count of sub-entries, plus flags | 482 uint16_t flagscount; // Count of sub-entries, plus flags |
396 CompactTrieHorizontalEntry entries[1]; | 483 CompactTrieHorizontalEntry entries[1]; |
397 }; | 484 }; |
398 | 485 |
399 struct CompactTrieVerticalNode { | 486 struct CompactTrieVerticalNode { |
400 uint16_t flagscount; // Count of sub-entries, plus flags | 487 uint16_t flagscount; // Count of sub-entries, plus flags |
401 uint16_t equal; // Equal link node index | 488 uint16_t equal; // Equal link node index |
402 uint16_t chars[1]; // Code units | 489 uint16_t chars[1]; // Code units |
403 }; | 490 }; |
404 | 491 |
405 // {'Dic', 1}, version 1 | |
406 #define COMPACT_TRIE_MAGIC_1 0x44696301 | |
407 | |
408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, | 492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
409 UErrorCode &status ) | 493 UErrorCode &status ) |
410 : fUData(dataObj) | 494 : fUData(dataObj) |
411 { | 495 { |
412 fData = (const CompactTrieHeader *) udata_getMemory(dataObj); | 496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); |
413 fOwnData = FALSE; | 498 fOwnData = FALSE; |
414 if (fData->magic != COMPACT_TRIE_MAGIC_1) { | 499 } |
415 status = U_ILLEGAL_ARGUMENT_ERROR; | 500 |
416 fData = NULL; | |
417 } | |
418 } | |
419 CompactTrieDictionary::CompactTrieDictionary( const void *data, | 501 CompactTrieDictionary::CompactTrieDictionary( const void *data, |
420 UErrorCode &status ) | 502 UErrorCode &status ) |
421 : fUData(NULL) | 503 : fUData(NULL) |
422 { | 504 { |
423 fData = (const CompactTrieHeader *) data; | 505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 506 *fInfo = CompactTrieInfo(data, status); |
424 fOwnData = FALSE; | 507 fOwnData = FALSE; |
425 if (fData->magic != COMPACT_TRIE_MAGIC_1) { | |
426 status = U_ILLEGAL_ARGUMENT_ERROR; | |
427 fData = NULL; | |
428 } | |
429 } | 508 } |
430 | 509 |
431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, | 510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, |
432 UErrorCode &status ) | 511 UErrorCode &status ) |
433 : fUData(NULL) | 512 : fUData(NULL) |
434 { | 513 { |
435 fData = compactMutableTrieDictionary(dict, status); | 514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status)
; |
| 515 if (U_SUCCESS(status)) { |
| 516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| 517 *fInfo = CompactTrieInfo(header, status); |
| 518 } |
| 519 |
436 fOwnData = !U_FAILURE(status); | 520 fOwnData = !U_FAILURE(status); |
437 } | 521 } |
438 | 522 |
439 CompactTrieDictionary::~CompactTrieDictionary() { | 523 CompactTrieDictionary::~CompactTrieDictionary() { |
440 if (fOwnData) { | 524 if (fOwnData) { |
441 uprv_free((void *)fData); | 525 uprv_free((void *)(fInfo->address)); |
442 } | 526 } |
| 527 uprv_free((void *)fInfo); |
| 528 |
443 if (fUData) { | 529 if (fUData) { |
444 udata_close(fUData); | 530 udata_close(fUData); |
445 } | 531 } |
446 } | 532 } |
447 | 533 |
| 534 UBool CompactTrieDictionary::getValued() const{ |
| 535 return fInfo->magic == COMPACT_TRIE_MAGIC_3; |
| 536 } |
| 537 |
448 uint32_t | 538 uint32_t |
449 CompactTrieDictionary::dataSize() const { | 539 CompactTrieDictionary::dataSize() const { |
450 return fData->size; | 540 return fInfo->size; |
451 } | 541 } |
452 | 542 |
453 const void * | 543 const void * |
454 CompactTrieDictionary::data() const { | 544 CompactTrieDictionary::data() const { |
455 return fData; | 545 return fInfo->address; |
456 } | 546 } |
457 | 547 |
458 // This function finds the address of a node for us, given its node ID | 548 //This function finds the address of a node for us, given its node ID |
459 static inline const CompactTrieNode * | 549 static inline const CompactTrieNode * |
460 getCompactNode(const CompactTrieHeader *header, uint16_t node) { | 550 getCompactNode(const CompactTrieInfo *info, uint32_t node) { |
461 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[n
ode]); | 551 if(node < info->root-1) { |
| 552 return (const CompactTrieNode *)(&info->offsets[node]); |
| 553 } else { |
| 554 return (const CompactTrieNode *)(info->address + info->offsets[node]); |
| 555 } |
| 556 } |
| 557 |
| 558 //this version of getCompactNode is currently only used in compactMutableTrieDic
tionary() |
| 559 static inline const CompactTrieNode * |
| 560 getCompactNode(const CompactTrieHeader *header, uint32_t node) { |
| 561 if(node < header->root-1) { |
| 562 return (const CompactTrieNode *)(&header->offsets[node]); |
| 563 } else { |
| 564 return (const CompactTrieNode *)((const uint8_t *)header + header->offse
ts[node]); |
| 565 } |
| 566 } |
| 567 |
| 568 |
| 569 /** |
| 570 * Calculates the number of links in a node |
| 571 * @node The specified node |
| 572 */ |
| 573 static inline const uint16_t |
| 574 getCount(const CompactTrieNode *node){ |
| 575 return (node->flagscount & kCountMask); |
| 576 //use the code below if number of links ever exceed 4096 |
| 577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCoun
t) >> 2); |
| 578 } |
| 579 |
| 580 /** |
| 581 * calculates an equal link node ID of a horizontal node |
| 582 * @hnode The horizontal node containing the equal link |
| 583 * @param index The index into hnode->entries[] |
| 584 * @param nodeCount The length of hnode->entries[] |
| 585 */ |
| 586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ |
| 587 if(vnode->flagscount & kEqualOverflows){ |
| 588 // treat overflow bits as an extension of chars[] |
| 589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNod
e*)vnode)]; |
| 590 return vnode->equal + (((uint32_t)*overflow) << 16); |
| 591 }else{ |
| 592 return vnode->equal; |
| 593 } |
| 594 } |
| 595 |
| 596 /** |
| 597 * calculates an equal link node ID of a horizontal node |
| 598 * @hnode The horizontal node containing the equal link |
| 599 * @param index The index into hnode->entries[] |
| 600 * @param nodeCount The length of hnode->entries[] |
| 601 */ |
| 602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uin
t16_t index, uint16_t nodeCount){ |
| 603 if(hnode->flagscount & kEqualOverflows){ |
| 604 //set overflow to point to the uint16_t containing the overflow bits |
| 605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; |
| 606 overflow += index/4; |
| 607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; |
| 608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); |
| 609 } else { |
| 610 return hnode->entries[index].equal; |
| 611 } |
| 612 } |
| 613 |
| 614 /** |
| 615 * Returns the value stored in the specified node which is associated with its |
| 616 * parent node. |
| 617 * TODO: how to tell that value is stored in node or in offset? check whether |
| 618 * node ID < fInfo->root! |
| 619 */ |
| 620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ |
| 621 uint16_t count = getCount((CompactTrieNode *)hnode); |
| 622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes |
| 623 |
| 624 if(hnode->flagscount & kEqualOverflows) |
| 625 overflowSize = (count + 3) / 4 * sizeof(uint16_t); |
| 626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); |
| 627 } |
| 628 |
| 629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ |
| 630 // calculate size of total node ID overflow storage in bytes |
| 631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16
_t) : 0; |
| 632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)v
node)] + overflowSize)); |
| 633 } |
| 634 |
| 635 static inline uint16_t getValue(const CompactTrieNode *node){ |
| 636 if(node->flagscount & kVerticalNode) |
| 637 return getValue((const CompactTrieVerticalNode *)node); |
| 638 else |
| 639 return getValue((const CompactTrieHorizontalNode *)node); |
| 640 } |
| 641 |
| 642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary sea
rch |
| 643 inline int16_t |
| 644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, |
| 645 UChar uc, uint16_t nodeCount){ |
| 646 int low = 0; |
| 647 int high = nodeCount-1; |
| 648 int middle; |
| 649 while (high >= low) { |
| 650 middle = (high+low)/2; |
| 651 if (uc == entries[middle].ch) { |
| 652 return middle; |
| 653 } |
| 654 else if (uc < entries[middle].ch) { |
| 655 high = middle-1; |
| 656 } |
| 657 else { |
| 658 low = middle+1; |
| 659 } |
| 660 } |
| 661 |
| 662 return -1; |
462 } | 663 } |
463 | 664 |
464 int32_t | 665 int32_t |
465 CompactTrieDictionary::matches( UText *text, | 666 CompactTrieDictionary::matches( UText *text, |
466 int32_t maxLength, | 667 int32_t maxLength, |
467 int32_t *lengths, | 668 int32_t *lengths, |
468 int &count, | 669 int &count, |
469 int limit ) const { | 670 int limit, |
| 671 uint16_t *values /*= NULL*/) const { |
| 672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2) |
| 673 values = NULL; |
| 674 |
470 // TODO: current implementation works in UTF-16 space | 675 // TODO: current implementation works in UTF-16 space |
471 const CompactTrieNode *node = getCompactNode(fData, fData->root); | 676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); |
472 int mycount = 0; | 677 int mycount = 0; |
473 | 678 |
474 UChar uc = utext_current32(text); | 679 UChar uc = utext_current32(text); |
475 int i = 0; | 680 int i = 0; |
476 | 681 |
| 682 // handle root node with only kEqualOverflows flag: assume horizontal node w
ithout parent |
| 683 if(node != NULL){ |
| 684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode
*) node; |
| 685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount
& kRootCountMask); |
| 686 if(index > -1){ |
| 687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagsc
ount & kRootCountMask)); |
| 688 utext_next32(text); |
| 689 uc = utext_current32(text); |
| 690 ++i; |
| 691 }else{ |
| 692 node = NULL; |
| 693 } |
| 694 } |
| 695 |
477 while (node != NULL) { | 696 while (node != NULL) { |
478 // Check if the node we just exited ends a word | 697 // Check if the node we just exited ends a word |
479 if (limit > 0 && (node->flagscount & kParentEndsWord)) { | 698 if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
| 699 if(values != NULL){ |
| 700 values[mycount] = getValue(node); |
| 701 } |
480 lengths[mycount++] = i; | 702 lengths[mycount++] = i; |
481 --limit; | 703 --limit; |
482 } | 704 } |
483 // Check that we haven't exceeded the maximum number of input characters
. | 705 // Check that we haven't exceeded the maximum number of input characters
. |
484 // We have to do that here rather than in the while condition so that | 706 // We have to do that here rather than in the while condition so that |
485 // we can check for ending a word, above. | 707 // we can check for ending a word, above. |
486 if (i >= maxLength) { | 708 if (i >= maxLength) { |
487 break; | 709 break; |
488 } | 710 } |
489 | 711 |
490 int nodeCount = (node->flagscount & kCountMask); | 712 int nodeCount = getCount(node); |
491 if (nodeCount == 0) { | 713 if (nodeCount == 0) { |
492 // Special terminal node; return now | 714 // Special terminal node; return now |
493 break; | 715 break; |
494 } | 716 } |
495 if (node->flagscount & kVerticalNode) { | 717 if (node->flagscount & kVerticalNode) { |
496 // Vertical node; check all the characters in it | 718 // Vertical node; check all the characters in it |
497 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; | 719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
498 for (int j = 0; j < nodeCount && i < maxLength; ++j) { | 720 for (int j = 0; j < nodeCount && i < maxLength; ++j) { |
499 if (uc != vnode->chars[j]) { | 721 if (uc != vnode->chars[j]) { |
500 // We hit a non-equal character; return | 722 // We hit a non-equal character; return |
501 goto exit; | 723 goto exit; |
502 } | 724 } |
503 utext_next32(text); | 725 utext_next32(text); |
504 uc = utext_current32(text); | 726 uc = utext_current32(text); |
505 ++i; | 727 ++i; |
506 } | 728 } |
507 // To get here we must have come through the whole list successfully
; | 729 // To get here we must have come through the whole list successfully
; |
508 // go on to the next node. Note that a word cannot end in the middle | 730 // go on to the next node. Note that a word cannot end in the middle |
509 // of a vertical node. | 731 // of a vertical node. |
510 node = getCompactNode(fData, vnode->equal); | 732 node = getCompactNode(fInfo, calcEqualLink(vnode)); |
511 } | 733 } |
512 else { | 734 else { |
513 // Horizontal node; do binary search | 735 // Horizontal node; do binary search |
514 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; | 736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
515 int low = 0; | 737 const CompactTrieHorizontalEntry *entries; |
516 int high = nodeCount-1; | 738 entries = hnode->entries; |
517 int middle; | 739 |
518 node = NULL; // If we don't find a match, we'll fall out of the l
oop | 740 int index = searchHorizontalEntries(entries, uc, nodeCount); |
519 while (high >= low) { | 741 if(index > -1){ // |
520 middle = (high+low)/2; | 742 // We hit a match; get the next node and next character |
521 if (uc == hnode->entries[middle].ch) { | 743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCou
nt)); |
522 // We hit a match; get the next node and next character | 744 utext_next32(text); |
523 node = getCompactNode(fData, hnode->entries[middle].equal); | 745 uc = utext_current32(text); |
524 utext_next32(text); | 746 ++i; |
525 uc = utext_current32(text); | 747 }else{ |
526 ++i; | 748 node = NULL; // If we don't find a match, we'll fall out of t
he loop |
527 break; | |
528 } | |
529 else if (uc < hnode->entries[middle].ch) { | |
530 high = middle-1; | |
531 } | |
532 else { | |
533 low = middle+1; | |
534 } | |
535 } | 749 } |
536 } | 750 } |
537 } | 751 } |
538 exit: | 752 exit: |
539 count = mycount; | 753 count = mycount; |
540 return i; | 754 return i; |
541 } | 755 } |
542 | 756 |
543 // Implementation of iteration for CompactTrieDictionary | 757 // Implementation of iteration for CompactTrieDictionary |
544 class CompactTrieEnumeration : public StringEnumeration { | 758 class CompactTrieEnumeration : public StringEnumeration { |
545 private: | 759 private: |
546 UVector32 fNodeStack; // Stack of nodes to process | 760 UVector32 fNodeStack; // Stack of nodes to process |
547 UVector32 fIndexStack; // Stack of where in node we are | 761 UVector32 fIndexStack; // Stack of where in node we are |
548 const CompactTrieHeader *fHeader; // Trie data | 762 const CompactTrieInfo *fInfo; // Trie data |
549 | 763 |
550 public: | 764 public: |
551 static UClassID U_EXPORT2 getStaticClassID(void); | 765 static UClassID U_EXPORT2 getStaticClassID(void); |
552 virtual UClassID getDynamicClassID(void) const; | 766 virtual UClassID getDynamicClassID(void) const; |
553 public: | 767 public: |
554 CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) | 768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) |
555 : fNodeStack(status), fIndexStack(status) { | 769 : fNodeStack(status), fIndexStack(status) { |
556 fHeader = header; | 770 fInfo = info; |
557 fNodeStack.push(header->root, status); | 771 fNodeStack.push(info->root, status); |
558 fIndexStack.push(0, status); | 772 fIndexStack.push(0, status); |
559 unistr.remove(); | 773 unistr.remove(); |
560 } | 774 } |
561 | 775 |
562 virtual ~CompactTrieEnumeration() { | 776 virtual ~CompactTrieEnumeration() { |
563 } | 777 } |
564 | 778 |
565 virtual StringEnumeration *clone() const { | 779 virtual StringEnumeration *clone() const { |
566 UErrorCode status = U_ZERO_ERROR; | 780 UErrorCode status = U_ZERO_ERROR; |
567 return new CompactTrieEnumeration(fHeader, status); | 781 return new CompactTrieEnumeration(fInfo, status); |
568 } | 782 } |
569 | 783 |
570 virtual const UnicodeString * snext(UErrorCode &status); | 784 virtual const UnicodeString * snext(UErrorCode &status); |
571 | 785 |
572 // Very expensive, but this should never be used. | 786 // Very expensive, but this should never be used. |
573 virtual int32_t count(UErrorCode &status) const { | 787 virtual int32_t count(UErrorCode &status) const { |
574 CompactTrieEnumeration counter(fHeader, status); | 788 CompactTrieEnumeration counter(fInfo, status); |
575 int32_t result = 0; | 789 int32_t result = 0; |
576 while (counter.snext(status) != NULL && U_SUCCESS(status)) { | 790 while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
577 ++result; | 791 ++result; |
578 } | 792 } |
579 return result; | 793 return result; |
580 } | 794 } |
581 | 795 |
582 virtual void reset(UErrorCode &status) { | 796 virtual void reset(UErrorCode &status) { |
583 fNodeStack.removeAllElements(); | 797 fNodeStack.removeAllElements(); |
584 fIndexStack.removeAllElements(); | 798 fIndexStack.removeAllElements(); |
585 fNodeStack.push(fHeader->root, status); | 799 fNodeStack.push(fInfo->root, status); |
586 fIndexStack.push(0, status); | 800 fIndexStack.push(0, status); |
587 unistr.remove(); | 801 unistr.remove(); |
588 } | 802 } |
589 }; | 803 }; |
590 | 804 |
591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) | 805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) |
592 | 806 |
593 const UnicodeString * | 807 const UnicodeString * |
594 CompactTrieEnumeration::snext(UErrorCode &status) { | 808 CompactTrieEnumeration::snext(UErrorCode &status) { |
595 if (fNodeStack.empty() || U_FAILURE(status)) { | 809 if (fNodeStack.empty() || U_FAILURE(status)) { |
596 return NULL; | 810 return NULL; |
597 } | 811 } |
598 const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); | 812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); |
599 int where = fIndexStack.peeki(); | 813 int where = fIndexStack.peeki(); |
600 while (!fNodeStack.empty() && U_SUCCESS(status)) { | 814 while (!fNodeStack.empty() && U_SUCCESS(status)) { |
601 int nodeCount = (node->flagscount & kCountMask); | 815 int nodeCount; |
| 816 |
| 817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); |
| 818 if(isRoot){ |
| 819 nodeCount = node->flagscount & kRootCountMask; |
| 820 } else { |
| 821 nodeCount = getCount(node); |
| 822 } |
| 823 |
602 UBool goingDown = FALSE; | 824 UBool goingDown = FALSE; |
603 if (nodeCount == 0) { | 825 if (nodeCount == 0) { |
604 // Terminal node; go up immediately | 826 // Terminal node; go up immediately |
605 fNodeStack.popi(); | 827 fNodeStack.popi(); |
606 fIndexStack.popi(); | 828 fIndexStack.popi(); |
607 node = getCompactNode(fHeader, fNodeStack.peeki()); | 829 node = getCompactNode(fInfo, fNodeStack.peeki()); |
608 where = fIndexStack.peeki(); | 830 where = fIndexStack.peeki(); |
609 } | 831 } |
610 else if (node->flagscount & kVerticalNode) { | 832 else if ((node->flagscount & kVerticalNode) && !isRoot) { |
611 // Vertical node | 833 // Vertical node |
612 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; | 834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod
e *)node; |
613 if (where == 0) { | 835 if (where == 0) { |
614 // Going down | 836 // Going down |
615 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); | 837 unistr.append((const UChar *)vnode->chars, nodeCount); |
616 fIndexStack.setElementAt(1, fIndexStack.size()-1); | 838 fIndexStack.setElementAt(1, fIndexStack.size()-1); |
617 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, sta
tus)); | 839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode
), status)); |
618 where = fIndexStack.push(0, status); | 840 where = fIndexStack.push(0, status); |
619 goingDown = TRUE; | 841 goingDown = TRUE; |
620 } | 842 } |
621 else { | 843 else { |
622 // Going up | 844 // Going up |
623 unistr.truncate(unistr.length()-nodeCount); | 845 unistr.truncate(unistr.length()-nodeCount); |
624 fNodeStack.popi(); | 846 fNodeStack.popi(); |
625 fIndexStack.popi(); | 847 fIndexStack.popi(); |
626 node = getCompactNode(fHeader, fNodeStack.peeki()); | 848 node = getCompactNode(fInfo, fNodeStack.peeki()); |
627 where = fIndexStack.peeki(); | 849 where = fIndexStack.peeki(); |
628 } | 850 } |
629 } | 851 } |
630 else { | 852 else { |
631 // Horizontal node | 853 // Horizontal node |
632 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; | 854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta
lNode *)node; |
633 if (where > 0) { | 855 if (where > 0) { |
634 // Pop previous char | 856 // Pop previous char |
635 unistr.truncate(unistr.length()-1); | 857 unistr.truncate(unistr.length()-1); |
636 } | 858 } |
637 if (where < nodeCount) { | 859 if (where < nodeCount) { |
638 // Push on next node | 860 // Push on next node |
639 unistr.append((UChar)hnode->entries[where].ch); | 861 unistr.append((UChar)hnode->entries[where].ch); |
640 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); | 862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
641 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[wh
ere].equal, status)); | 863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode
, where, nodeCount), status)); |
642 where = fIndexStack.push(0, status); | 864 where = fIndexStack.push(0, status); |
643 goingDown = TRUE; | 865 goingDown = TRUE; |
644 } | 866 } |
645 else { | 867 else { |
646 // Going up | 868 // Going up |
647 fNodeStack.popi(); | 869 fNodeStack.popi(); |
648 fIndexStack.popi(); | 870 fIndexStack.popi(); |
649 node = getCompactNode(fHeader, fNodeStack.peeki()); | 871 node = getCompactNode(fInfo, fNodeStack.peeki()); |
650 where = fIndexStack.peeki(); | 872 where = fIndexStack.peeki(); |
651 } | 873 } |
652 } | 874 } |
| 875 |
653 // Check if the parent of the node we've just gone down to ends a | 876 // Check if the parent of the node we've just gone down to ends a |
654 // word. If so, return it. | 877 // word. If so, return it. |
| 878 // The root node should never end up here. |
655 if (goingDown && (node->flagscount & kParentEndsWord)) { | 879 if (goingDown && (node->flagscount & kParentEndsWord)) { |
656 return &unistr; | 880 return &unistr; |
657 } | 881 } |
658 } | 882 } |
659 return NULL; | 883 return NULL; |
660 } | 884 } |
661 | 885 |
662 StringEnumeration * | 886 StringEnumeration * |
663 CompactTrieDictionary::openWords( UErrorCode &status ) const { | 887 CompactTrieDictionary::openWords( UErrorCode &status ) const { |
664 if (U_FAILURE(status)) { | 888 if (U_FAILURE(status)) { |
665 return NULL; | 889 return NULL; |
666 } | 890 } |
667 return new CompactTrieEnumeration(fData, status); | 891 return new CompactTrieEnumeration(fInfo, status); |
668 } | 892 } |
669 | 893 |
670 // | 894 // |
671 // Below here is all code related to converting a ternary trie to a compact trie | 895 // Below here is all code related to converting a ternary trie to a compact trie |
672 // and back again | 896 // and back again |
673 // | 897 // |
674 | 898 |
675 // Helper classes to construct the compact trie | 899 enum CompactTrieNodeType { |
| 900 kHorizontalType = 0, |
| 901 kVerticalType = 1, |
| 902 kValueType = 2 |
| 903 }; |
| 904 |
| 905 /** |
| 906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to |
| 907 * construct the compact trie by storing information for each node and later |
| 908 * writing the node to memory in a sequential format. |
| 909 */ |
676 class BuildCompactTrieNode: public UMemory { | 910 class BuildCompactTrieNode: public UMemory { |
677 public: | 911 public: |
678 UBool fParentEndsWord; | 912 UBool fParentEndsWord; |
679 UBool fVertical; | 913 CompactTrieNodeType fNodeType; |
680 UBool fHasDuplicate; | 914 UBool fHasDuplicate; |
| 915 UBool fEqualOverflows; |
681 int32_t fNodeID; | 916 int32_t fNodeID; |
682 UnicodeString fChars; | 917 UnicodeString fChars; |
| 918 uint16_t fValue; |
683 | 919 |
684 public: | 920 public: |
685 BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UE
rrorCode &status) { | 921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, |
| 922 UStack &nodes, UErrorCode &status, uint16_t value = 0) { |
686 fParentEndsWord = parentEndsWord; | 923 fParentEndsWord = parentEndsWord; |
687 fHasDuplicate = FALSE; | 924 fHasDuplicate = FALSE; |
688 fVertical = vertical; | 925 fNodeType = nodeType; |
| 926 fEqualOverflows = FALSE; |
689 fNodeID = nodes.size(); | 927 fNodeID = nodes.size(); |
| 928 fValue = parentEndsWord? value : 0; |
690 nodes.push(this, status); | 929 nodes.push(this, status); |
691 } | 930 } |
692 | 931 |
693 virtual ~BuildCompactTrieNode() { | 932 virtual ~BuildCompactTrieNode() { |
694 } | 933 } |
695 | 934 |
696 virtual uint32_t size() { | 935 virtual uint32_t size() { |
697 return sizeof(uint16_t); | 936 if(fValue > 0) |
| 937 return sizeof(uint16_t) * 2; |
| 938 else |
| 939 return sizeof(uint16_t); |
698 } | 940 } |
699 | 941 |
700 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran
slate*/) { | 942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran
slate*/) { |
701 // Write flag/count | 943 // Write flag/count |
702 *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) | 944 |
703 | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWo
rd : 0 ); | 945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be |
| 946 // used as a 5th MSB. |
| 947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2); |
| 948 |
| 949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
|
| 950 ((fNodeID == 2)? (fChars.length() & kRootCountMask): |
| 951 ( |
| 952 (fChars.length() & kCountMask) | |
| 953 //((fChars.length() << 2) & kExceedsCount) | |
| 954 (fNodeType == kVerticalType ? kVerticalNode : 0) | |
| 955 (fParentEndsWord ? kParentEndsWord : 0 ) |
| 956 ) |
| 957 ); |
704 offset += sizeof(uint16_t); | 958 offset += sizeof(uint16_t); |
705 } | 959 } |
| 960 |
| 961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) { |
| 962 if(fValue > 0){ |
| 963 *((uint16_t *)(bytes+offset)) = fValue; |
| 964 offset += sizeof(uint16_t); |
| 965 } |
| 966 } |
| 967 |
| 968 }; |
| 969 |
| 970 /** |
| 971 * Stores value of parent terminating nodes that have no more subtries. |
| 972 */ |
| 973 class BuildCompactTrieValueNode: public BuildCompactTrieNode { |
| 974 public: |
| 975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) |
| 976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ |
| 977 } |
| 978 |
| 979 virtual ~BuildCompactTrieValueNode(){ |
| 980 } |
| 981 |
| 982 virtual uint32_t size() { |
| 983 return sizeof(uint16_t) * 2; |
| 984 } |
| 985 |
| 986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 987 // don't write value directly to memory but store it in offset to be wri
tten later |
| 988 //offset = fValue & kOffsetContainsValue; |
| 989 BuildCompactTrieNode::write(bytes, offset, translate); |
| 990 BuildCompactTrieNode::writeValue(bytes, offset); |
| 991 } |
706 }; | 992 }; |
707 | 993 |
708 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { | 994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { |
709 public: | 995 public: |
710 UStack fLinks; | 996 UStack fLinks; |
| 997 UBool fMayOverflow; //intermediate value for fEqualOverflows |
711 | 998 |
712 public: | 999 public: |
713 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo
de &status) | 1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo
de &status, uint16_t value = 0) |
714 : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(sta
tus) { | 1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value
), fLinks(status) { |
| 1002 fMayOverflow = FALSE; |
715 } | 1003 } |
716 | 1004 |
717 virtual ~BuildCompactTrieHorizontalNode() { | 1005 virtual ~BuildCompactTrieHorizontalNode() { |
718 } | 1006 } |
719 | 1007 |
720 virtual uint32_t size() { | 1008 // It is impossible to know beforehand exactly how much space the node will |
721 return offsetof(CompactTrieHorizontalNode,entries) + | 1009 // need in memory before being written, because the node IDs in the equal |
722 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); | 1010 // links may or may not overflow after node coalescing. Therefore, this meth
od |
| 1011 // returns the maximum size possible for the node. |
| 1012 virtual uint32_t size() { |
| 1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + |
| 1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
| 1015 |
| 1016 if(fValue > 0) |
| 1017 estimatedSize += sizeof(uint16_t); |
| 1018 |
| 1019 //estimate extra space needed to store overflow for node ID links |
| 1020 //may be more than what is actually needed |
| 1021 for(int i=0; i < fChars.length(); i++){ |
| 1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ |
| 1023 fMayOverflow = TRUE; |
| 1024 break; |
| 1025 } |
| 1026 } |
| 1027 if(fMayOverflow) // added space for overflow should be same as ceil(fCha
rs.length()/4) * sizeof(uint16_t) |
| 1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; |
| 1029 |
| 1030 return estimatedSize; |
723 } | 1031 } |
724 | 1032 |
725 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { | 1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
| 1034 int32_t count = fChars.length(); |
| 1035 |
| 1036 //if largest nodeID > 2^16, set flag |
| 1037 //large node IDs are more likely to be at the back of the array |
| 1038 for (int32_t i = count-1; i >= 0; --i) { |
| 1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID
) > 0xFFFF){ |
| 1040 fEqualOverflows = TRUE; |
| 1041 break; |
| 1042 } |
| 1043 } |
| 1044 |
726 BuildCompactTrieNode::write(bytes, offset, translate); | 1045 BuildCompactTrieNode::write(bytes, offset, translate); |
727 int32_t count = fChars.length(); | 1046 |
| 1047 // write entries[] to memory |
728 for (int32_t i = 0; i < count; ++i) { | 1048 for (int32_t i = 0; i < count; ++i) { |
729 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b
ytes+offset); | 1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b
ytes+offset); |
730 entry->ch = fChars[i]; | 1050 entry->ch = fChars[i]; |
731 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[
i])->fNodeID); | 1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[
i])->fNodeID); |
732 #ifdef DEBUG_TRIE_DICT | 1052 #ifdef DEBUG_TRIE_DICT |
733 if (entry->equal == 0) { | 1053 |
| 1054 if ((entry->equal == 0) && !fEqualOverflows) { |
734 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps
to physical node zero\n", | 1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps
to physical node zero\n", |
735 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); | 1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
736 } | 1057 } |
737 #endif | 1058 #endif |
738 offset += sizeof(CompactTrieHorizontalEntry); | 1059 offset += sizeof(CompactTrieHorizontalEntry); |
739 } | 1060 } |
| 1061 |
| 1062 // append extra bits of equal nodes to end if fEqualOverflows |
| 1063 if (fEqualOverflows) { |
| 1064 uint16_t leftmostBits = 0; |
| 1065 for (int16_t i = 0; i < count; i++) { |
| 1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate,
i); |
| 1067 |
| 1068 // write filled uint16_t to memory |
| 1069 if(i % 4 == 3){ |
| 1070 *((uint16_t *)(bytes+offset)) = leftmostBits; |
| 1071 leftmostBits = 0; |
| 1072 offset += sizeof(uint16_t); |
| 1073 } |
| 1074 } |
| 1075 |
| 1076 // pad last uint16_t with zeroes if necessary |
| 1077 int remainder = count % 4; |
| 1078 if (remainder > 0) { |
| 1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remai
nder)); |
| 1080 offset += sizeof(uint16_t); |
| 1081 } |
| 1082 } |
| 1083 |
| 1084 BuildCompactTrieNode::writeValue(bytes, offset); |
| 1085 } |
| 1086 |
| 1087 // returns leftmost bits of physical node link |
| 1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ |
| 1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactT
rieNode *)fLinks[i])->fNodeID) >> 16); |
| 1090 #ifdef DEBUG_TRIE_DICT |
| 1091 if (leftmostBits > 0xF) { |
| 1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds
maximum possible node ID value\n", |
| 1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| 1094 } |
| 1095 #endif |
| 1096 return leftmostBits; |
740 } | 1097 } |
741 | 1098 |
742 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { | 1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { |
743 fChars.append(ch); | 1100 fChars.append(ch); |
744 fLinks.push(link, status); | 1101 fLinks.push(link, status); |
745 } | 1102 } |
| 1103 |
746 }; | 1104 }; |
747 | 1105 |
748 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { | 1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { |
749 public: | 1107 public: |
750 BuildCompactTrieNode *fEqual; | 1108 BuildCompactTrieNode *fEqual; |
751 | 1109 |
752 public: | 1110 public: |
753 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode
&status) | 1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode
&status, uint16_t value = 0) |
754 : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { | 1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value)
{ |
755 fEqual = NULL; | 1113 fEqual = NULL; |
756 } | 1114 } |
757 | 1115 |
758 virtual ~BuildCompactTrieVerticalNode() { | 1116 virtual ~BuildCompactTrieVerticalNode() { |
759 } | 1117 } |
760 | 1118 |
761 virtual uint32_t size() { | 1119 // Returns the maximum possible size of this node. See comment in |
762 return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof
(uint16_t)); | 1120 // BuildCompactTrieHorizontal node for more information. |
| 1121 virtual uint32_t size() { |
| 1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCha
rs.length()*sizeof(uint16_t)); |
| 1123 if(fValue > 0){ |
| 1124 estimatedSize += sizeof(uint16_t); |
| 1125 } |
| 1126 |
| 1127 if(fEqual->fNodeID > 0xFFFF){ |
| 1128 estimatedSize += sizeof(uint16_t); |
| 1129 } |
| 1130 return estimatedSize; |
763 } | 1131 } |
764 | 1132 |
765 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { | 1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl
ate) { |
766 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset
); | 1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset
); |
| 1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); |
767 BuildCompactTrieNode::write(bytes, offset, translate); | 1136 BuildCompactTrieNode::write(bytes, offset, translate); |
768 node->equal = translate.elementAti(fEqual->fNodeID); | 1137 node->equal = translate.elementAti(fEqual->fNodeID); |
769 offset += sizeof(node->equal); | 1138 offset += sizeof(node->equal); |
770 #ifdef DEBUG_TRIE_DICT | 1139 #ifdef DEBUG_TRIE_DICT |
771 if (node->equal == 0) { | 1140 if ((node->equal == 0) && !fEqualOverflows) { |
772 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi
cal node zero\n", | 1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi
cal node zero\n", |
773 fEqual->fNodeID); | 1142 fEqual->fNodeID); |
774 } | 1143 } |
775 #endif | 1144 #endif |
776 fChars.extract(0, fChars.length(), (UChar *)node->chars); | 1145 fChars.extract(0, fChars.length(), (UChar *)node->chars); |
777 offset += sizeof(uint16_t)*fChars.length(); | 1146 offset += sizeof(UChar)*fChars.length(); |
| 1147 |
| 1148 // append 16 bits of to end for equal node if fEqualOverflows |
| 1149 if (fEqualOverflows) { |
| 1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeI
D) >> 16); |
| 1151 offset += sizeof(uint16_t); |
| 1152 } |
| 1153 |
| 1154 BuildCompactTrieNode::writeValue(bytes, offset); |
778 } | 1155 } |
779 | 1156 |
780 void addChar(UChar ch) { | 1157 void addChar(UChar ch) { |
781 fChars.append(ch); | 1158 fChars.append(ch); |
782 } | 1159 } |
783 | 1160 |
784 void setLink(BuildCompactTrieNode *node) { | 1161 void setLink(BuildCompactTrieNode *node) { |
785 fEqual = node; | 1162 fEqual = node; |
786 } | 1163 } |
| 1164 |
787 }; | 1165 }; |
788 | 1166 |
789 // Forward declaration | 1167 // Forward declaration |
790 static void walkHorizontal(const TernaryNode *node, | 1168 static void walkHorizontal(const TernaryNode *node, |
791 BuildCompactTrieHorizontalNode *building, | 1169 BuildCompactTrieHorizontalNode *building, |
792 UStack &nodes, | 1170 UStack &nodes, |
793 UErrorCode &status); | 1171 UErrorCode &status, |
| 1172 Hashtable *values); |
794 | 1173 |
795 // Convert one node. Uses recursion. | 1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. |
796 | 1175 |
797 static BuildCompactTrieNode * | 1176 static BuildCompactTrieNode * |
798 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UEr
rorCode &status) { | 1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, |
| 1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0)
{ |
799 if (U_FAILURE(status)) { | 1179 if (U_FAILURE(status)) { |
800 return NULL; | 1180 return NULL; |
801 } | 1181 } |
802 BuildCompactTrieNode *result = NULL; | 1182 BuildCompactTrieNode *result = NULL; |
803 UBool horizontal = (node->low != NULL || node->high != NULL); | 1183 UBool horizontal = (node->low != NULL || node->high != NULL); |
804 if (horizontal) { | 1184 if (horizontal) { |
805 BuildCompactTrieHorizontalNode *hResult = | 1185 BuildCompactTrieHorizontalNode *hResult; |
806 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status
); | 1186 if(values != NULL){ |
| 1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status, parentValue); |
| 1188 } else { |
| 1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes,
status); |
| 1190 } |
| 1191 |
807 if (hResult == NULL) { | 1192 if (hResult == NULL) { |
808 status = U_MEMORY_ALLOCATION_ERROR; | 1193 status = U_MEMORY_ALLOCATION_ERROR; |
809 return NULL; | 1194 return NULL; |
810 } | 1195 } |
811 if (U_SUCCESS(status)) { | 1196 if (U_SUCCESS(status)) { |
812 walkHorizontal(node, hResult, nodes, status); | 1197 walkHorizontal(node, hResult, nodes, status, values); |
813 result = hResult; | 1198 result = hResult; |
814 } | 1199 } |
815 } | 1200 } |
816 else { | 1201 else { |
817 BuildCompactTrieVerticalNode *vResult = | 1202 BuildCompactTrieVerticalNode *vResult; |
818 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); | 1203 if(values != NULL){ |
| 1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st
atus, parentValue); |
| 1205 } else { |
| 1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st
atus); |
| 1207 } |
| 1208 |
819 if (vResult == NULL) { | 1209 if (vResult == NULL) { |
820 status = U_MEMORY_ALLOCATION_ERROR; | 1210 status = U_MEMORY_ALLOCATION_ERROR; |
| 1211 return NULL; |
821 } | 1212 } |
822 else if (U_SUCCESS(status)) { | 1213 else if (U_SUCCESS(status)) { |
823 UBool endsWord = FALSE; | 1214 uint16_t value = 0; |
| 1215 UBool endsWord = FALSE; |
824 // Take up nodes until we end a word, or hit a node with < or > link
s | 1216 // Take up nodes until we end a word, or hit a node with < or > link
s |
825 do { | 1217 do { |
826 vResult->addChar(node->ch); | 1218 vResult->addChar(node->ch); |
827 endsWord = (node->flags & kEndsWord) != 0; | 1219 value = node->flags; |
| 1220 endsWord = value > 0; |
828 node = node->equal; | 1221 node = node->equal; |
829 } | 1222 } |
830 while(node != NULL && !endsWord && node->low == NULL && node->high =
= NULL); | 1223 while(node != NULL && !endsWord && node->low == NULL && node->high =
= NULL); |
| 1224 |
831 if (node == NULL) { | 1225 if (node == NULL) { |
832 if (!endsWord) { | 1226 if (!endsWord) { |
833 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie | 1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie |
834 } | 1228 } |
835 else { | 1229 else if(values != NULL){ |
| 1230 UnicodeString key(value); //store value as a single-char Uni
codeString |
| 1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode
*) values->get(key); |
| 1232 if(link == NULL){ |
| 1233 link = new BuildCompactTrieValueNode(nodes, status, valu
e); //take out nodes? |
| 1234 values->put(key, link, status); |
| 1235 } |
| 1236 vResult->setLink(link); |
| 1237 } else { |
836 vResult->setLink((BuildCompactTrieNode *)nodes[1]); | 1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
837 } | 1239 } |
838 } | 1240 } |
839 else { | 1241 else { |
840 vResult->setLink(compactOneNode(node, endsWord, nodes, status)); | 1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, v
alues, value)); |
841 } | 1243 } |
842 result = vResult; | 1244 result = vResult; |
843 } | 1245 } |
844 } | 1246 } |
845 return result; | 1247 return result; |
846 } | 1248 } |
847 | 1249 |
848 // Walk the set of peers at the same level, to build a horizontal node. | 1250 // Walk the set of peers at the same level, to build a horizontal node. |
849 // Uses recursion. | 1251 // Uses recursion. |
850 | 1252 |
851 static void walkHorizontal(const TernaryNode *node, | 1253 static void walkHorizontal(const TernaryNode *node, |
852 BuildCompactTrieHorizontalNode *building, | 1254 BuildCompactTrieHorizontalNode *building, |
853 UStack &nodes, | 1255 UStack &nodes, |
854 UErrorCode &status) { | 1256 UErrorCode &status, Hashtable *values = NULL) { |
855 while (U_SUCCESS(status) && node != NULL) { | 1257 while (U_SUCCESS(status) && node != NULL) { |
856 if (node->low != NULL) { | 1258 if (node->low != NULL) { |
857 walkHorizontal(node->low, building, nodes, status); | 1259 walkHorizontal(node->low, building, nodes, status, values); |
858 } | 1260 } |
859 BuildCompactTrieNode *link = NULL; | 1261 BuildCompactTrieNode *link = NULL; |
860 if (node->equal != NULL) { | 1262 if (node->equal != NULL) { |
861 link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, n
odes, status); | 1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, v
alues, node->flags); |
862 } | 1264 } |
863 else if (node->flags & kEndsWord) { | 1265 else if (node->flags > 0) { |
864 link = (BuildCompactTrieNode *)nodes[1]; | 1266 if(values != NULL) { |
| 1267 UnicodeString key(node->flags); //store value as a single-char U
nicodeString |
| 1268 link = (BuildCompactTrieValueNode *) values->get(key); |
| 1269 if(link == NULL) { |
| 1270 link = new BuildCompactTrieValueNode(nodes, status, node->fl
ags); //take out nodes? |
| 1271 values->put(key, link, status); |
| 1272 } |
| 1273 } else { |
| 1274 link = (BuildCompactTrieNode *)nodes[1]; |
| 1275 } |
865 } | 1276 } |
866 if (U_SUCCESS(status) && link != NULL) { | 1277 if (U_SUCCESS(status) && link != NULL) { |
867 building->addNode(node->ch, link, status); | 1278 building->addNode(node->ch, link, status); |
868 } | 1279 } |
869 // Tail recurse manually instead of leaving it to the compiler. | 1280 // Tail recurse manually instead of leaving it to the compiler. |
870 //if (node->high != NULL) { | 1281 //if (node->high != NULL) { |
871 // walkHorizontal(node->high, building, nodes, status); | 1282 // walkHorizontal(node->high, building, nodes, status); |
872 //} | 1283 //} |
873 node = node->high; | 1284 node = node->high; |
874 } | 1285 } |
875 } | 1286 } |
876 | 1287 |
877 U_NAMESPACE_END | 1288 U_NAMESPACE_END |
878 U_NAMESPACE_USE | 1289 U_NAMESPACE_USE |
879 U_CDECL_BEGIN | 1290 U_CDECL_BEGIN |
880 static int32_t U_CALLCONV | 1291 static int32_t U_CALLCONV |
881 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ | 1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr)
{ |
882 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; | 1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
883 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; | 1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
| 1295 |
884 // Check for comparing a node to itself, to avoid spurious duplicates | 1296 // Check for comparing a node to itself, to avoid spurious duplicates |
885 if (left == right) { | 1297 if (left == right) { |
886 return 0; | 1298 return 0; |
887 } | 1299 } |
| 1300 |
888 // Most significant is type of node. Can never coalesce. | 1301 // Most significant is type of node. Can never coalesce. |
889 if (left->fVertical != right->fVertical) { | 1302 if (left->fNodeType != right->fNodeType) { |
890 return left->fVertical - right->fVertical; | 1303 return left->fNodeType - right->fNodeType; |
891 } | 1304 } |
892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. | 1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
893 if (left->fParentEndsWord != right->fParentEndsWord) { | 1306 if (left->fParentEndsWord != right->fParentEndsWord) { |
894 return left->fParentEndsWord - right->fParentEndsWord; | 1307 return left->fParentEndsWord - right->fParentEndsWord; |
895 } | 1308 } |
896 // Next, the string. If that differs, we can never coalesce. | 1309 // Next, the string. If that differs, we can never coalesce. |
897 int32_t result = left->fChars.compare(right->fChars); | 1310 int32_t result = left->fChars.compare(right->fChars); |
898 if (result != 0) { | 1311 if (result != 0) { |
899 return result; | 1312 return result; |
900 } | 1313 } |
| 1314 |
| 1315 // If the node value differs, we should not coalesce. |
| 1316 // If values aren't stored, all fValues should be 0. |
| 1317 if (left->fValue != right->fValue) { |
| 1318 return left->fValue - right->fValue; |
| 1319 } |
| 1320 |
901 // We know they're both the same node type, so branch for the two cases. | 1321 // We know they're both the same node type, so branch for the two cases. |
902 if (left->fVertical) { | 1322 if (left->fNodeType == kVerticalType) { |
903 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID | 1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
904 - ((BuildCompactTrieVerticalNode *)right)->fEqual->f
NodeID; | 1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
905 } | 1325 } |
906 else { | 1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){ |
907 // We need to compare the links vectors. They should be the | 1327 // We need to compare the links vectors. They should be the |
908 // same size because the strings were equal. | 1328 // same size because the strings were equal. |
909 // We compare the node IDs instead of the pointers, to handle | 1329 // We compare the node IDs instead of the pointers, to handle |
910 // coalesced nodes. | 1330 // coalesced nodes. |
911 BuildCompactTrieHorizontalNode *hleft, *hright; | 1331 BuildCompactTrieHorizontalNode *hleft, *hright; |
912 hleft = (BuildCompactTrieHorizontalNode *)left; | 1332 hleft = (BuildCompactTrieHorizontalNode *)left; |
913 hright = (BuildCompactTrieHorizontalNode *)right; | 1333 hright = (BuildCompactTrieHorizontalNode *)right; |
914 int32_t count = hleft->fLinks.size(); | 1334 int32_t count = hleft->fLinks.size(); |
915 for (int32_t i = 0; i < count && result == 0; ++i) { | 1335 for (int32_t i = 0; i < count && result == 0; ++i) { |
916 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - | 1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
917 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; | 1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
918 } | 1338 } |
919 } | 1339 } |
| 1340 |
920 // If they are equal to each other, mark them (speeds coalescing) | 1341 // If they are equal to each other, mark them (speeds coalescing) |
921 if (result == 0) { | 1342 if (result == 0) { |
922 left->fHasDuplicate = TRUE; | 1343 left->fHasDuplicate = TRUE; |
923 right->fHasDuplicate = TRUE; | 1344 right->fHasDuplicate = TRUE; |
924 } | 1345 } |
925 return result; | 1346 return result; |
926 } | 1347 } |
927 U_CDECL_END | 1348 U_CDECL_END |
928 U_NAMESPACE_BEGIN | 1349 U_NAMESPACE_BEGIN |
929 | 1350 |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1024 #ifdef DEBUG_TRIE_DICT | 1445 #ifdef DEBUG_TRIE_DICT |
1025 struct tms timing; | 1446 struct tms timing; |
1026 struct tms previous; | 1447 struct tms previous; |
1027 (void) ::times(&previous); | 1448 (void) ::times(&previous); |
1028 #endif | 1449 #endif |
1029 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes | 1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes |
1030 | 1451 |
1031 // Add node 0, used as the NULL pointer/sentinel. | 1452 // Add node 0, used as the NULL pointer/sentinel. |
1032 nodes.addElement((int32_t)0, status); | 1453 nodes.addElement((int32_t)0, status); |
1033 | 1454 |
| 1455 Hashtable *values = NULL; // Index of (unique) val
ues |
| 1456 if (dict.fValued) { |
| 1457 values = new Hashtable(status); |
| 1458 } |
| 1459 |
1034 // Start by creating the special empty node we use to indicate that the pare
nt | 1460 // Start by creating the special empty node we use to indicate that the pare
nt |
1035 // terminates a word. This must be node 1, because the builder assumes | 1461 // terminates a word. This must be node 1, because the builder assumes |
1036 // that. | 1462 // that. This node will never be used for tries storing numerical values. |
1037 if (U_FAILURE(status)) { | 1463 if (U_FAILURE(status)) { |
1038 return NULL; | 1464 return NULL; |
1039 } | 1465 } |
1040 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes
, status); | 1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalT
ype, nodes, status); |
1041 if (terminal == NULL) { | 1467 if (terminal == NULL) { |
1042 status = U_MEMORY_ALLOCATION_ERROR; | 1468 status = U_MEMORY_ALLOCATION_ERROR; |
1043 } | 1469 } |
1044 | 1470 |
1045 // This call does all the work of building the new trie structure. The root | 1471 // This call does all the work of building the new trie structure. The root |
1046 // will be node 2. | 1472 // will have node ID 2 before writing to memory. |
1047 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status
); | 1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status
, values); |
1048 #ifdef DEBUG_TRIE_DICT | 1474 #ifdef DEBUG_TRIE_DICT |
1049 (void) ::times(&timing); | 1475 (void) ::times(&timing); |
1050 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", | 1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
1051 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
1052 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
1053 previous = timing; | 1479 previous = timing; |
1054 #endif | 1480 #endif |
1055 | 1481 |
1056 // Now coalesce all duplicate nodes. | 1482 // Now coalesce all duplicate nodes. |
1057 coalesceDuplicates(nodes, status); | 1483 coalesceDuplicates(nodes, status); |
(...skipping 12 matching lines...) Expand all Loading... |
1070 int32_t nodeCount = 1; // The sentinel node we already have | 1496 int32_t nodeCount = 1; // The sentinel node we already have |
1071 BuildCompactTrieNode *node; | 1497 BuildCompactTrieNode *node; |
1072 int32_t i; | 1498 int32_t i; |
1073 UVector32 translate(count, status); // Should be no growth needed after this | 1499 UVector32 translate(count, status); // Should be no growth needed after this |
1074 translate.push(0, status); // The sentinel node | 1500 translate.push(0, status); // The sentinel node |
1075 | 1501 |
1076 if (U_FAILURE(status)) { | 1502 if (U_FAILURE(status)) { |
1077 return NULL; | 1503 return NULL; |
1078 } | 1504 } |
1079 | 1505 |
| 1506 //map terminal value nodes |
| 1507 int valueCount = 0; |
| 1508 UVector valueNodes(status); |
| 1509 if(values != NULL) { |
| 1510 valueCount = values->count(); //number of unique terminal value nodes |
| 1511 } |
| 1512 |
| 1513 // map non-terminal nodes |
| 1514 int valuePos = 1;//, nodePos = valueCount + valuePos; |
| 1515 nodeCount = valueCount + valuePos; |
1080 for (i = 1; i < count; ++i) { | 1516 for (i = 1; i < count; ++i) { |
1081 node = (BuildCompactTrieNode *)nodes[i]; | 1517 node = (BuildCompactTrieNode *)nodes[i]; |
1082 if (node->fNodeID == i) { | 1518 if (node->fNodeID == i) { |
1083 // Only one node out of each duplicate set is used | 1519 // Only one node out of each duplicate set is used |
1084 if (i >= translate.size()) { | 1520 if (node->fNodeID >= translate.size()) { |
1085 // Logically extend the mapping table | 1521 // Logically extend the mapping table |
1086 translate.setSize(i+1); | 1522 translate.setSize(i + 1); |
1087 } | 1523 } |
1088 translate.setElementAt(nodeCount++, i); | 1524 //translate.setElementAt(object, index)! |
| 1525 if(node->fNodeType == kValueType) { |
| 1526 valueNodes.addElement(node, status); |
| 1527 translate.setElementAt(valuePos++, i); |
| 1528 } else { |
| 1529 translate.setElementAt(nodeCount++, i); |
| 1530 } |
1089 totalSize += node->size(); | 1531 totalSize += node->size(); |
1090 } | 1532 } |
1091 } | 1533 } |
1092 | 1534 |
1093 // Check for overflowing 16 bits worth of nodes. | 1535 // Check for overflowing 20 bits worth of nodes. |
1094 if (nodeCount > 0x10000) { | 1536 if (nodeCount > 0x100000) { |
1095 status = U_ILLEGAL_ARGUMENT_ERROR; | 1537 status = U_ILLEGAL_ARGUMENT_ERROR; |
1096 return NULL; | 1538 return NULL; |
1097 } | 1539 } |
1098 | 1540 |
1099 // Add enough room for the offsets. | 1541 // Add enough room for the offsets. |
1100 totalSize += nodeCount*sizeof(uint32_t); | 1542 totalSize += nodeCount*sizeof(uint32_t); |
1101 #ifdef DEBUG_TRIE_DICT | 1543 #ifdef DEBUG_TRIE_DICT |
1102 (void) ::times(&timing); | 1544 (void) ::times(&timing); |
1103 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", | 1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", |
1104 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
1105 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
1106 previous = timing; | 1548 previous = timing; |
1107 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount,
totalSize); | 1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount,
totalSize); |
1108 #endif | 1550 #endif |
1109 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); | 1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); |
1110 if (bytes == NULL) { | 1552 if (bytes == NULL) { |
1111 status = U_MEMORY_ALLOCATION_ERROR; | 1553 status = U_MEMORY_ALLOCATION_ERROR; |
1112 return NULL; | 1554 return NULL; |
1113 } | 1555 } |
1114 | 1556 |
1115 CompactTrieHeader *header = (CompactTrieHeader *)bytes; | 1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
1116 header->size = totalSize; | 1558 //header->size = totalSize; |
| 1559 if(dict.fValued){ |
| 1560 header->magic = COMPACT_TRIE_MAGIC_3; |
| 1561 } else { |
| 1562 header->magic = COMPACT_TRIE_MAGIC_2; |
| 1563 } |
1117 header->nodeCount = nodeCount; | 1564 header->nodeCount = nodeCount; |
1118 header->offsets[0] = 0; // Sentinel | 1565 header->offsets[0] = 0; // Sentinel |
1119 header->root = translate.elementAti(root->fNodeID); | 1566 header->root = translate.elementAti(root->fNodeID); |
1120 #ifdef DEBUG_TRIE_DICT | 1567 #ifdef DEBUG_TRIE_DICT |
1121 if (header->root == 0) { | 1568 if (header->root == 0) { |
1122 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root
->fNodeID); | 1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root
->fNodeID); |
1123 } | 1570 } |
1124 #endif | 1571 #endif |
1125 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint
32_t)); | 1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint
32_t)); |
1126 nodeCount = 1; | 1573 nodeCount = valueCount + 1; |
| 1574 |
| 1575 // Write terminal value nodes to memory |
| 1576 for (i=0; i < valueNodes.size(); i++) { |
| 1577 //header->offsets[i + 1] = offset; |
| 1578 uint32_t tmpOffset = 0; |
| 1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i); |
| 1580 //header->offsets[i + 1] = (uint32_t)node->fValue; |
| 1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); |
| 1582 } |
| 1583 |
1127 // Now write the data | 1584 // Now write the data |
1128 for (i = 1; i < count; ++i) { | 1585 for (i = 1; i < count; ++i) { |
1129 node = (BuildCompactTrieNode *)nodes[i]; | 1586 node = (BuildCompactTrieNode *)nodes[i]; |
1130 if (node->fNodeID == i) { | 1587 if (node->fNodeID == i && node->fNodeType != kValueType) { |
1131 header->offsets[nodeCount++] = offset; | 1588 header->offsets[nodeCount++] = offset; |
1132 node->write(bytes, offset, translate); | 1589 node->write(bytes, offset, translate); |
1133 } | 1590 } |
1134 } | 1591 } |
| 1592 |
| 1593 //free all extra space |
| 1594 uprv_realloc(bytes, offset); |
| 1595 header->size = offset; |
| 1596 |
1135 #ifdef DEBUG_TRIE_DICT | 1597 #ifdef DEBUG_TRIE_DICT |
| 1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset); |
| 1599 |
1136 (void) ::times(&timing); | 1600 (void) ::times(&timing); |
1137 fprintf(stderr, "Trie built, time user %f system %f\n", | 1601 fprintf(stderr, "Trie built, time user %f system %f\n", |
1138 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, | 1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
1139 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); | 1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
1140 previous = timing; | 1604 previous = timing; |
1141 fprintf(stderr, "Final offset is %d\n", offset); | 1605 fprintf(stderr, "Final offset is %d\n", offset); |
1142 | 1606 |
1143 // Collect statistics on node types and sizes | 1607 // Collect statistics on node types and sizes |
1144 int hCount = 0; | 1608 int hCount = 0; |
1145 int vCount = 0; | 1609 int vCount = 0; |
1146 size_t hSize = 0; | 1610 size_t hSize = 0; |
1147 size_t vSize = 0; | 1611 size_t vSize = 0; |
1148 size_t hItemCount = 0; | 1612 size_t hItemCount = 0; |
1149 size_t vItemCount = 0; | 1613 size_t vItemCount = 0; |
1150 uint32_t previousOff = offset; | 1614 uint32_t previousOff = offset; |
1151 for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { | 1615 uint32_t numOverflow = 0; |
| 1616 uint32_t valueSpace = 0; |
| 1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
1152 const CompactTrieNode *node = getCompactNode(header, nodeIdx); | 1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
1153 if (node->flagscount & kVerticalNode) { | 1619 int itemCount; |
| 1620 if(nodeIdx == header->root) |
| 1621 itemCount = node->flagscount & kRootCountMask; |
| 1622 else |
| 1623 itemCount = getCount(node); |
| 1624 if(node->flagscount & kEqualOverflows){ |
| 1625 numOverflow++; |
| 1626 } |
| 1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) { |
1154 vCount += 1; | 1628 vCount += 1; |
1155 vItemCount += (node->flagscount & kCountMask); | 1629 vItemCount += itemCount; |
1156 vSize += previousOff-header->offsets[nodeIdx]; | 1630 vSize += previousOff-header->offsets[nodeIdx]; |
1157 } | 1631 } |
1158 else { | 1632 else { |
1159 hCount += 1; | 1633 hCount += 1; |
1160 hItemCount += (node->flagscount & kCountMask); | 1634 hItemCount += itemCount; |
1161 hSize += previousOff-header->offsets[nodeIdx]; | 1635 if(nodeIdx >= header->root) { |
| 1636 hSize += previousOff-header->offsets[nodeIdx]; |
| 1637 } |
1162 } | 1638 } |
| 1639 |
| 1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEn
dsWord) |
| 1641 valueSpace += sizeof(uint16_t); |
1163 previousOff = header->offsets[nodeIdx]; | 1642 previousOff = header->offsets[nodeIdx]; |
1164 } | 1643 } |
1165 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\
n", hCount, | 1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\
n", hCount, |
1166 (double)hSize/hCount, (double)hItemCount/hCount); | 1645 (double)hSize/hCount, (double)hItemCount/hCount); |
1167 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n"
, vCount, | 1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n"
, vCount, |
1168 (double)vSize/vCount, (double)vItemCount/vCount); | 1647 (double)vSize/vCount, (double)vItemCount/vCount); |
| 1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverfl
ow); |
| 1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace); |
1169 #endif | 1650 #endif |
1170 | 1651 |
1171 if (U_FAILURE(status)) { | 1652 if (U_FAILURE(status)) { |
1172 uprv_free(bytes); | 1653 uprv_free(bytes); |
1173 header = NULL; | 1654 header = NULL; |
1174 } | 1655 } |
1175 else { | |
1176 header->magic = COMPACT_TRIE_MAGIC_1; | |
1177 } | |
1178 return header; | 1656 return header; |
1179 } | 1657 } |
1180 | 1658 |
1181 // Forward declaration | 1659 // Forward declaration |
1182 static TernaryNode * | 1660 static TernaryNode * |
1183 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ); | 1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC
ode &status ); |
1184 | |
1185 | 1662 |
1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie | 1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie |
1187 static TernaryNode * | 1664 static TernaryNode * |
1188 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizon
talEntry *array, | 1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalN
ode *hnode, |
1189 int low, int high, UErrorCode &status ) { | 1666 int low, int high, int nodeCount, UErrorCode &status) { |
1190 if (U_FAILURE(status) || low > high) { | 1667 if (U_FAILURE(status) || low > high) { |
1191 return NULL; | 1668 return NULL; |
1192 } | 1669 } |
1193 int middle = (low+high)/2; | 1670 int middle = (low+high)/2; |
1194 TernaryNode *result = new TernaryNode(array[middle].ch); | 1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); |
1195 if (result == NULL) { | 1672 if (result == NULL) { |
1196 status = U_MEMORY_ALLOCATION_ERROR; | 1673 status = U_MEMORY_ALLOCATION_ERROR; |
1197 return NULL; | 1674 return NULL; |
1198 } | 1675 } |
1199 const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); | 1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mid
dle, nodeCount)); |
1200 if (equal->flagscount & kParentEndsWord) { | 1677 if (equal->flagscount & kParentEndsWord) { |
1201 result->flags |= kEndsWord; | 1678 if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| 1679 result->flags = getValue(equal); |
| 1680 }else{ |
| 1681 result->flags |= kEndsWord; |
| 1682 } |
1202 } | 1683 } |
1203 result->low = unpackHorizontalArray(header, array, low, middle-1, status); | 1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, s
tatus); |
1204 result->high = unpackHorizontalArray(header, array, middle+1, high, status); | 1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount,
status); |
1205 result->equal = unpackOneNode(header, equal, status); | 1686 result->equal = unpackOneNode(info, equal, status); |
1206 return result; | 1687 return result; |
1207 } | 1688 } |
1208 | 1689 |
1209 // Convert one compact trie node into a ternary subtrie | 1690 // Convert one compact trie node into a ternary subtrie |
1210 static TernaryNode * | 1691 static TernaryNode * |
1211 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr
rorCode &status ) { | 1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC
ode &status ) { |
1212 int nodeCount = (node->flagscount & kCountMask); | 1693 int nodeCount = getCount(node); |
1213 if (nodeCount == 0 || U_FAILURE(status)) { | 1694 if (nodeCount == 0 || U_FAILURE(status)) { |
1214 // Failure, or terminal node | 1695 // Failure, or terminal node |
1215 return NULL; | 1696 return NULL; |
1216 } | 1697 } |
1217 if (node->flagscount & kVerticalNode) { | 1698 if (node->flagscount & kVerticalNode) { |
1218 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)
node; | 1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)
node; |
1219 TernaryNode *head = NULL; | 1700 TernaryNode *head = NULL; |
1220 TernaryNode *previous = NULL; | 1701 TernaryNode *previous = NULL; |
1221 TernaryNode *latest = NULL; | 1702 TernaryNode *latest = NULL; |
1222 for (int i = 0; i < nodeCount; ++i) { | 1703 for (int i = 0; i < nodeCount; ++i) { |
1223 latest = new TernaryNode(vnode->chars[i]); | 1704 latest = new TernaryNode(vnode->chars[i]); |
1224 if (latest == NULL) { | 1705 if (latest == NULL) { |
1225 status = U_MEMORY_ALLOCATION_ERROR; | 1706 status = U_MEMORY_ALLOCATION_ERROR; |
1226 break; | 1707 break; |
1227 } | 1708 } |
1228 if (head == NULL) { | 1709 if (head == NULL) { |
1229 head = latest; | 1710 head = latest; |
1230 } | 1711 } |
1231 if (previous != NULL) { | 1712 if (previous != NULL) { |
1232 previous->equal = latest; | 1713 previous->equal = latest; |
1233 } | 1714 } |
1234 previous = latest; | 1715 previous = latest; |
1235 } | 1716 } |
1236 if (latest != NULL) { | 1717 if (latest != NULL) { |
1237 const CompactTrieNode *equal = getCompactNode(header, vnode->equal); | 1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vn
ode)); |
1238 if (equal->flagscount & kParentEndsWord) { | 1719 if (equal->flagscount & kParentEndsWord) { |
1239 latest->flags |= kEndsWord; | 1720 if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| 1721 latest->flags = getValue(equal); |
| 1722 } else { |
| 1723 latest->flags |= kEndsWord; |
| 1724 } |
1240 } | 1725 } |
1241 latest->equal = unpackOneNode(header, equal, status); | 1726 latest->equal = unpackOneNode(info, equal, status); |
1242 } | 1727 } |
1243 return head; | 1728 return head; |
1244 } | 1729 } |
1245 else { | 1730 else { |
1246 // Horizontal node | 1731 // Horizontal node |
1247 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod
e *)node; | 1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod
e *)node; |
1248 return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1,
status); | 1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, sta
tus); |
1249 } | 1734 } |
1250 } | 1735 } |
1251 | 1736 |
| 1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary |
1252 MutableTrieDictionary * | 1738 MutableTrieDictionary * |
1253 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { | 1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { |
1254 MutableTrieDictionary *result = new MutableTrieDictionary( status ); | 1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->ma
gic == COMPACT_TRIE_MAGIC_3 ); |
1255 if (result == NULL) { | 1741 if (result == NULL) { |
1256 status = U_MEMORY_ALLOCATION_ERROR; | 1742 status = U_MEMORY_ALLOCATION_ERROR; |
1257 return NULL; | 1743 return NULL; |
1258 } | 1744 } |
1259 TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root),
status); | 1745 // treat root node as special case: don't call unpackOneNode() or unpackHori
zontalArray() directly |
| 1746 // because only kEqualOverflows flag should be checked in root's flagscount |
| 1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
|
| 1748 getCompactNode(fInfo, fInfo->root); |
| 1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask; |
| 1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, |
| 1751 nodeCount, status); |
| 1752 |
1260 if (U_FAILURE(status)) { | 1753 if (U_FAILURE(status)) { |
1261 delete root; // Clean up | 1754 delete root; // Clean up |
1262 delete result; | 1755 delete result; |
1263 return NULL; | 1756 return NULL; |
1264 } | 1757 } |
1265 result->fTrie = root; | 1758 result->fTrie = root; |
1266 return result; | 1759 return result; |
1267 } | 1760 } |
1268 | 1761 |
1269 U_NAMESPACE_END | 1762 U_NAMESPACE_END |
1270 | 1763 |
1271 U_CAPI int32_t U_EXPORT2 | 1764 U_CAPI int32_t U_EXPORT2 |
1272 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *
outData, | 1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *
outData, |
1273 UErrorCode *status) { | 1766 UErrorCode *status) { |
1274 | 1767 |
1275 if (status == NULL || U_FAILURE(*status)) { | 1768 if (status == NULL || U_FAILURE(*status)) { |
1276 return 0; | 1769 return 0; |
1277 } | 1770 } |
1278 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { | 1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
1279 *status=U_ILLEGAL_ARGUMENT_ERROR; | 1772 *status=U_ILLEGAL_ARGUMENT_ERROR; |
1280 return 0; | 1773 return 0; |
1281 } | 1774 } |
1282 | 1775 |
1283 // | 1776 // |
1284 // Check that the data header is for for dictionary data. | 1777 // Check that the data header is for for dictionary data. |
1285 // (Header contents are defined in genxxx.cpp) | 1778 // (Header contents are defined in genxxx.cpp) |
1286 // | 1779 // |
1287 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); | 1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
1288 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ | 1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
1289 pInfo->dataFormat[1]==0x72 && | 1782 pInfo->dataFormat[1]==0x72 && |
1290 pInfo->dataFormat[2]==0x44 && | 1783 pInfo->dataFormat[2]==0x44 && |
1291 pInfo->dataFormat[3]==0x63 && | 1784 pInfo->dataFormat[3]==0x63 && |
1292 pInfo->formatVersion[0]==1 )) { | 1785 pInfo->formatVersion[0]==1 )) { |
1293 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (
format version %02x) is not recognized\n", | 1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (
format version %02x) is not recognized\n", |
1294 pInfo->dataFormat[0], pInfo->dataFormat[1], | 1787 pInfo->dataFormat[0], pInfo->dataFormat[1], |
1295 pInfo->dataFormat[2], pInfo->dataFormat[3], | 1788 pInfo->dataFormat[2], pInfo->dataFormat[3], |
1296 pInfo->formatVersion[0]); | 1789 pInfo->formatVersion[0]); |
1297 *status=U_UNSUPPORTED_ERROR; | 1790 *status=U_UNSUPPORTED_ERROR; |
1298 return 0; | 1791 return 0; |
1299 } | 1792 } |
1300 | 1793 |
1301 // | 1794 // |
1302 // Swap the data header. (This is the generic ICU Data Header, not the | 1795 // Swap the data header. (This is the generic ICU Data Header, not the |
1303 // CompactTrieHeader). This swap also conveniently
gets us | 1796 // CompactTrieHeader). This swap also conveniently
gets us |
1304 // the size of the ICU d.h., which lets us locate th
e start | 1797 // the size of the ICU d.h., which lets us locate th
e start |
1305 // of the RBBI specific data. | 1798 // of the RBBI specific data. |
1306 // | 1799 // |
1307 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status)
; | 1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status)
; |
1308 | 1801 |
1309 // | 1802 // |
1310 // Get the CompactTrieHeader, and check that it appears to be OK. | 1803 // Get the CompactTrieHeader, and check that it appears to be OK. |
1311 // | 1804 // |
1312 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | 1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
1313 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; | 1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
1314 if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 | 1807 uint32_t magic = ds->readUInt32(header->magic); |
1315 || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) | 1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic
!= COMPACT_TRIE_MAGIC_3 |
| 1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s
izeof(CompactTrieHeaderV1) |
| 1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s
izeof(CompactTrieHeader)) |
1316 { | 1811 { |
1317 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n")
; | 1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n")
; |
1318 *status=U_UNSUPPORTED_ERROR; | 1813 *status=U_UNSUPPORTED_ERROR; |
1319 return 0; | 1814 return 0; |
1320 } | 1815 } |
1321 | 1816 |
1322 // | 1817 // |
1323 // Prefight operation? Just return the size | 1818 // Prefight operation? Just return the size |
1324 // | 1819 // |
1325 uint32_t totalSize = ds->readUInt32(header->size); | 1820 uint32_t totalSize = ds->readUInt32(header->size); |
1326 int32_t sizeWithUData = (int32_t)totalSize + headerSize; | 1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize; |
1327 if (length < 0) { | 1822 if (length < 0) { |
1328 return sizeWithUData; | 1823 return sizeWithUData; |
1329 } | 1824 } |
1330 | 1825 |
1331 // | 1826 // |
1332 // Check that length passed in is consistent with length from RBBI data head
er. | 1827 // Check that length passed in is consistent with length from RBBI data head
er. |
1333 // | 1828 // |
1334 if (length < sizeWithUData) { | 1829 if (length < sizeWithUData) { |
1335 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", | 1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data
header) for trie data.\n", |
1336 totalSize); | 1831 totalSize); |
1337 *status=U_INDEX_OUTOFBOUNDS_ERROR; | 1832 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
1338 return 0; | 1833 return 0; |
1339 } | 1834 } |
1340 | 1835 |
1341 // | 1836 // |
1342 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec
ause | 1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec
ause |
1343 // we need to reference the header to locate the data, and a
n | 1838 // we need to reference the header to locate the data, and a
n |
1344 // inplace swap of the header leaves it unusable. | 1839 // inplace swap of the header leaves it unusable. |
1345 // | 1840 // |
1346 uint8_t *outBytes = (uint8_t *)outData + headerSize; | 1841 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
1347 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; | 1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; |
1348 | 1843 |
1349 #if 0 | 1844 #if 0 |
1350 // | 1845 // |
1351 // If not swapping in place, zero out the output buffer before starting. | 1846 // If not swapping in place, zero out the output buffer before starting. |
1352 // | 1847 // |
1353 if (inBytes != outBytes) { | 1848 if (inBytes != outBytes) { |
1354 uprv_memset(outBytes, 0, totalSize); | 1849 uprv_memset(outBytes, 0, totalSize); |
1355 } | 1850 } |
1356 | 1851 |
1357 // We need to loop through all the nodes in the offset table, and swap each
one. | 1852 // We need to loop through all the nodes in the offset table, and swap each
one. |
1358 uint16_t nodeCount = ds->readUInt16(header->nodeCount); | 1853 uint32_t nodeCount, rootId; |
| 1854 if(header->magic == COMPACT_TRIE_MAGIC_1) { |
| 1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); |
| 1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); |
| 1857 } else { |
| 1858 nodeCount = ds->readUInt32(header->nodeCount); |
| 1859 rootId = ds->readUInt32(header->root); |
| 1860 } |
| 1861 |
1359 // Skip node 0, which should always be 0. | 1862 // Skip node 0, which should always be 0. |
1360 for (int i = 1; i < nodeCount; ++i) { | 1863 for (uint32_t i = 1; i < nodeCount; ++i) { |
1361 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); | 1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); |
1362 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node
Off); | 1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node
Off); |
1363 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); | 1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); |
1364 uint16_t flagscount = ds->readUInt16(inNode->flagscount); | 1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount); |
1365 uint16_t itemCount = flagscount & kCountMask; | 1868 uint16_t itemCount = getCount(inNode); |
| 1869 //uint16_t itemCount = flagscount & kCountMask; |
1366 ds->writeUInt16(&outNode->flagscount, flagscount); | 1870 ds->writeUInt16(&outNode->flagscount, flagscount); |
1367 if (itemCount > 0) { | 1871 if (itemCount > 0) { |
1368 if (flagscount & kVerticalNode) { | 1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swap
ped |
| 1873 if (flagscount & kVerticalNode && i != rootId) { |
| 1874 if(flagscount & kEqualOverflows){ |
| 1875 // include overflow bits |
| 1876 overflow += 1; |
| 1877 } |
| 1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsP
arentWord) { |
| 1879 //include values |
| 1880 overflow += 1; |
| 1881 } |
1369 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical
Node,chars), | 1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical
Node,chars), |
1370 itemCount*sizeof(uint16_t), | 1883 (itemCount + overflow)*sizeof(uint16_t), |
1371 outBytes+nodeOff+offsetof(CompactTrieVertica
lNode,chars), status); | 1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars)
, status); |
1372 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact
TrieVerticalNode,equal); | 1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact
TrieVerticalNode,equal); |
1373 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod
e,equal)); | 1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod
e,equal)); |
1374 } | 1887 } |
1375 else { | 1888 else { |
1376 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor
izontalNode *)inNode; | 1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor
izontalNode *)inNode; |
1377 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode
*)outNode; | 1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode
*)outNode; |
1378 for (int j = 0; j < itemCount; ++j) { | 1891 for (int j = 0; j < itemCount; ++j) { |
1379 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); | 1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); |
1380 ds->writeUInt16(&outHNode->entries[j].ch, word); | 1893 ds->writeUInt16(&outHNode->entries[j].ch, word); |
1381 word = ds->readUInt16(inHNode->entries[j].equal); | 1894 word = ds->readUInt16(inHNode->entries[j].equal); |
1382 ds->writeUInt16(&outHNode->entries[j].equal, word); | 1895 ds->writeUInt16(&outHNode->entries[j].equal, word); |
1383 } | 1896 } |
| 1897 |
| 1898 // swap overflow/value information |
| 1899 if(flagscount & kEqualOverflows){ |
| 1900 overflow += (itemCount + 3) / 4; |
| 1901 } |
| 1902 |
| 1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flag
scount & kEndsParentWord) { |
| 1904 //include values |
| 1905 overflow += 1; |
| 1906 } |
| 1907 |
| 1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount]
; |
| 1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCoun
t]; |
| 1910 for(int j = 0; j<overflow; j++){ |
| 1911 uint16_t extraInfo = ds->readUInt16(*inOverflow); |
| 1912 ds->writeUInt16(outOverflow, extraInfo); |
| 1913 |
| 1914 inOverflow++; |
| 1915 outOverflow++; |
| 1916 } |
1384 } | 1917 } |
1385 } | 1918 } |
1386 } | 1919 } |
1387 #endif | 1920 #endif |
1388 | 1921 |
| 1922 // Swap the header |
| 1923 ds->writeUInt32(&outputHeader->size, totalSize); |
| 1924 ds->writeUInt32(&outputHeader->magic, magic); |
| 1925 |
| 1926 uint32_t nodeCount; |
| 1927 uint32_t offsetPos; |
| 1928 if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| 1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; |
| 1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeade
r; |
| 1931 |
| 1932 nodeCount = ds->readUInt16(headerV1->nodeCount); |
| 1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); |
| 1934 uint16_t root = ds->readUInt16(headerV1->root); |
| 1935 ds->writeUInt16(&outputHeaderV1->root, root); |
| 1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets); |
| 1937 } else { |
| 1938 nodeCount = ds->readUInt32(header->nodeCount); |
| 1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount); |
| 1940 uint32_t root = ds->readUInt32(header->root); |
| 1941 ds->writeUInt32(&outputHeader->root, root); |
| 1942 offsetPos = offsetof(CompactTrieHeader,offsets); |
| 1943 } |
| 1944 |
1389 // All the data in all the nodes consist of 16 bit items. Swap them all at o
nce. | 1945 // All the data in all the nodes consist of 16 bit items. Swap them all at o
nce. |
1390 uint16_t nodeCount = ds->readUInt16(header->nodeCount); | 1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); |
1391 uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount
*sizeof(uint32_t)); | |
1392 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff,
status); | 1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff,
status); |
1393 | 1948 |
1394 // Swap the header | 1949 //swap offsets |
1395 ds->writeUInt32(&outputHeader->size, totalSize); | 1950 ds->swapArray32(ds, inBytes+offsetPos, |
1396 uint32_t magic = ds->readUInt32(header->magic); | 1951 sizeof(uint32_t)*(uint32_t)nodeCount, |
1397 ds->writeUInt32(&outputHeader->magic, magic); | 1952 outBytes+offsetPos, status); |
1398 ds->writeUInt16(&outputHeader->nodeCount, nodeCount); | |
1399 uint16_t root = ds->readUInt16(header->root); | |
1400 ds->writeUInt16(&outputHeader->root, root); | |
1401 ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), | |
1402 sizeof(uint32_t)*(int32_t)nodeCount, | |
1403 outBytes+offsetof(CompactTrieHeader,offsets), status); | |
1404 | 1953 |
1405 return sizeWithUData; | 1954 return sizeWithUData; |
1406 } | 1955 } |
1407 | 1956 |
1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |