Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(36)

Side by Side Diff: icu46/source/common/triedict.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/common/triedict.h ('k') | icu46/source/data/Makefile.in » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /** 1 /**
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation * 3 * Copyright (C) 2006-2008, International Business Machines Corporation *
4 * and others. All Rights Reserved. * 4 * and others. All Rights Reserved. *
5 ******************************************************************************* 5 *******************************************************************************
6 */ 6 */
7 7
8 #include "unicode/utypes.h" 8 #include "unicode/utypes.h"
9 9
10 #if !UCONFIG_NO_BREAK_ITERATION 10 #if !UCONFIG_NO_BREAK_ITERATION
11 11
12 #include "triedict.h" 12 #include "triedict.h"
13 #include "unicode/chariter.h" 13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h" 14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h" 15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h" 16 #include "unicode/uenum.h"
17 #include "unicode/udata.h" 17 #include "unicode/udata.h"
18 #include "cmemory.h" 18 #include "cmemory.h"
19 #include "udataswp.h" 19 #include "udataswp.h"
20 #include "uvector.h" 20 #include "uvector.h"
21 #include "uvectr32.h" 21 #include "uvectr32.h"
22 #include "uarrsort.h" 22 #include "uarrsort.h"
23 #include "hash.h"
23 24
24 //#define DEBUG_TRIE_DICT 1 25 //#define DEBUG_TRIE_DICT 1
25 26
26 #ifdef DEBUG_TRIE_DICT 27 #ifdef DEBUG_TRIE_DICT
27 #include <sys/times.h> 28 #include <sys/times.h>
28 #include <limits.h> 29 #include <limits.h>
29 #include <stdio.h> 30 #include <stdio.h>
31 #include <time.h>
32 #ifndef CLK_TCK
33 #define CLK_TCK CLOCKS_PER_SEC
34 #endif
35
30 #endif 36 #endif
31 37
32 U_NAMESPACE_BEGIN 38 U_NAMESPACE_BEGIN
33 39
34 /******************************************************************* 40 /*******************************************************************
35 * TrieWordDictionary 41 * TrieWordDictionary
36 */ 42 */
37 43
38 TrieWordDictionary::TrieWordDictionary() { 44 TrieWordDictionary::TrieWordDictionary() {
39 } 45 }
40 46
41 TrieWordDictionary::~TrieWordDictionary() { 47 TrieWordDictionary::~TrieWordDictionary() {
42 } 48 }
43 49
44 /******************************************************************* 50 /*******************************************************************
45 * MutableTrieDictionary 51 * MutableTrieDictionary
46 */ 52 */
47 53
54 //#define MAX_VALUE 65535
55
56 // forward declaration
57 inline uint16_t scaleLogProbabilities(double logprob);
58
48 // Node structure for the ternary, uncompressed trie 59 // Node structure for the ternary, uncompressed trie
49 struct TernaryNode : public UMemory { 60 struct TernaryNode : public UMemory {
50 UChar ch; // UTF-16 code unit 61 UChar ch; // UTF-16 code unit
51 uint16_t flags; // Flag word 62 uint16_t flags; // Flag word
52 TernaryNode *low; // Less-than link 63 TernaryNode *low; // Less-than link
53 TernaryNode *equal; // Equal link 64 TernaryNode *equal; // Equal link
54 TernaryNode *high; // Greater-than link 65 TernaryNode *high; // Greater-than link
55 66
56 TernaryNode(UChar uc); 67 TernaryNode(UChar uc);
57 ~TernaryNode(); 68 ~TernaryNode();
(...skipping 12 matching lines...) Expand all
70 high = NULL; 81 high = NULL;
71 } 82 }
72 83
73 // Not inline since it's recursive 84 // Not inline since it's recursive
74 TernaryNode::~TernaryNode() { 85 TernaryNode::~TernaryNode() {
75 delete low; 86 delete low;
76 delete equal; 87 delete equal;
77 delete high; 88 delete high;
78 } 89 }
79 90
80 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { 91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
92 UBool containsValue /* = FALSE */ ) {
81 // Start the trie off with something. Having the root node already present 93 // Start the trie off with something. Having the root node already present
82 // cuts a special case out of the search/insertion functions. 94 // cuts a special case out of the search/insertion functions.
83 // Making it a median character cuts the worse case for searches from 95 // Making it a median character cuts the worse case for searches from
84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something 96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85 // that starts a word that is midway in the list. 97 // that starts a word that is midway in the list.
86 fTrie = new TernaryNode(median); 98 fTrie = new TernaryNode(median);
87 if (fTrie == NULL) { 99 if (fTrie == NULL) {
88 status = U_MEMORY_ALLOCATION_ERROR; 100 status = U_MEMORY_ALLOCATION_ERROR;
89 } 101 }
90 fIter = utext_openUChars(NULL, NULL, 0, &status); 102 fIter = utext_openUChars(NULL, NULL, 0, &status);
91 if (U_SUCCESS(status) && fIter == NULL) { 103 if (U_SUCCESS(status) && fIter == NULL) {
92 status = U_MEMORY_ALLOCATION_ERROR; 104 status = U_MEMORY_ALLOCATION_ERROR;
93 } 105 }
106
107 fValued = containsValue;
94 } 108 }
95 109
96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { 110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
111 UBool containsValue /* = false */ ) {
97 fTrie = NULL; 112 fTrie = NULL;
98 fIter = utext_openUChars(NULL, NULL, 0, &status); 113 fIter = utext_openUChars(NULL, NULL, 0, &status);
99 if (U_SUCCESS(status) && fIter == NULL) { 114 if (U_SUCCESS(status) && fIter == NULL) {
100 status = U_MEMORY_ALLOCATION_ERROR; 115 status = U_MEMORY_ALLOCATION_ERROR;
101 } 116 }
117
118 fValued = containsValue;
102 } 119 }
103 120
104 MutableTrieDictionary::~MutableTrieDictionary() { 121 MutableTrieDictionary::~MutableTrieDictionary() {
105 delete fTrie; 122 delete fTrie;
106 utext_close(fIter); 123 utext_close(fIter);
107 } 124 }
108 125
109 int32_t 126 int32_t
110 MutableTrieDictionary::search( UText *text, 127 MutableTrieDictionary::search( UText *text,
111 int32_t maxLength, 128 int32_t maxLength,
112 int32_t *lengths, 129 int32_t *lengths,
113 int &count, 130 int &count,
114 int limit, 131 int limit,
115 TernaryNode *&parent, 132 TernaryNode *&parent,
116 UBool &pMatched ) const { 133 UBool &pMatched,
134 uint16_t *values /*=NULL*/) const {
117 // TODO: current implementation works in UTF-16 space 135 // TODO: current implementation works in UTF-16 space
118 const TernaryNode *up = NULL; 136 const TernaryNode *up = NULL;
119 const TernaryNode *p = fTrie; 137 const TernaryNode *p = fTrie;
120 int mycount = 0; 138 int mycount = 0;
121 pMatched = TRUE; 139 pMatched = TRUE;
122 int i; 140 int i;
123 141
142 if (!fValued) {
143 values = NULL;
144 }
145
124 UChar uc = utext_current32(text); 146 UChar uc = utext_current32(text);
125 for (i = 0; i < maxLength && p != NULL; ++i) { 147 for (i = 0; i < maxLength && p != NULL; ++i) {
126 while (p != NULL) { 148 while (p != NULL) {
127 if (uc < p->ch) { 149 if (uc < p->ch) {
128 up = p; 150 up = p;
129 p = p->low; 151 p = p->low;
130 } 152 }
131 else if (uc == p->ch) { 153 else if (uc == p->ch) {
132 break; 154 break;
133 } 155 }
134 else { 156 else {
135 up = p; 157 up = p;
136 p = p->high; 158 p = p->high;
137 } 159 }
138 } 160 }
139 if (p == NULL) { 161 if (p == NULL) {
140 pMatched = FALSE; 162 pMatched = FALSE;
141 break; 163 break;
142 } 164 }
143 // Must be equal to get here 165 // Must be equal to get here
144 if (limit > 0 && (p->flags & kEndsWord)) { 166 if (limit > 0 && (p->flags > 0)) {
167 //is there a more efficient way to add values? ie. remove if stmt
168 if(values != NULL) {
169 values[mycount] = p->flags;
170 }
145 lengths[mycount++] = i+1; 171 lengths[mycount++] = i+1;
146 --limit; 172 --limit;
147 } 173 }
148 up = p; 174 up = p;
149 p = p->equal; 175 p = p->equal;
150 uc = utext_next32(text); 176 uc = utext_next32(text);
151 uc = utext_current32(text); 177 uc = utext_current32(text);
152 } 178 }
153 179
154 // Note that there is no way to reach here with up == 0 unless 180 // Note that there is no way to reach here with up == 0 unless
155 // maxLength is 0 coming in. 181 // maxLength is 0 coming in.
156 parent = (TernaryNode *)up; 182 parent = (TernaryNode *)up;
157 count = mycount; 183 count = mycount;
158 return i; 184 return i;
159 } 185 }
160 186
161 void 187 void
162 MutableTrieDictionary::addWord( const UChar *word, 188 MutableTrieDictionary::addWord( const UChar *word,
163 int32_t length, 189 int32_t length,
164 UErrorCode &status ) { 190 UErrorCode &status,
165 #if 0 191 uint16_t value /* = 0 */ ) {
166 if (length <= 0) { 192 // dictionary cannot store zero values, would interfere with flags
193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
167 status = U_ILLEGAL_ARGUMENT_ERROR; 194 status = U_ILLEGAL_ARGUMENT_ERROR;
168 return; 195 return;
169 } 196 }
170 #endif 197
171 TernaryNode *parent; 198 TernaryNode *parent;
172 UBool pMatched; 199 UBool pMatched;
173 int count; 200 int count;
174 fIter = utext_openUChars(fIter, word, length, &status); 201 fIter = utext_openUChars(fIter, word, length, &status);
175 202
176 int matched; 203 int matched;
177 matched = search(fIter, length, NULL, count, 0, parent, pMatched); 204 matched = search(fIter, length, NULL, count, 0, parent, pMatched);
178 205
179 while (matched++ < length) { 206 while (matched++ < length) {
180 UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? 207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
181 U_ASSERT(uc != U_SENTINEL); 208 U_ASSERT(uc != U_SENTINEL);
182 TernaryNode *newNode = new TernaryNode(uc); 209 TernaryNode *newNode = new TernaryNode(uc);
183 if (newNode == NULL) { 210 if (newNode == NULL) {
184 status = U_MEMORY_ALLOCATION_ERROR; 211 status = U_MEMORY_ALLOCATION_ERROR;
185 return; 212 return;
186 } 213 }
187 if (pMatched) { 214 if (pMatched) {
188 parent->equal = newNode; 215 parent->equal = newNode;
189 } 216 }
190 else { 217 else {
191 pMatched = TRUE; 218 pMatched = TRUE;
192 if (uc < parent->ch) { 219 if (uc < parent->ch) {
193 parent->low = newNode; 220 parent->low = newNode;
194 } 221 }
195 else { 222 else {
196 parent->high = newNode; 223 parent->high = newNode;
197 } 224 }
198 } 225 }
199 parent = newNode; 226 parent = newNode;
200 } 227 }
201 228
202 parent->flags |= kEndsWord; 229 if(fValued && value > 0){
203 } 230 parent->flags = value;
204 231 } else {
205 #if 0 232 parent->flags |= kEndsWord;
206 void
207 MutableTrieDictionary::addWords( UEnumeration *words,
208 UErrorCode &status ) {
209 int32_t length;
210 const UChar *word;
211 while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
212 addWord(word, length, status);
213 } 233 }
214 } 234 }
215 #endif
216 235
217 int32_t 236 int32_t
218 MutableTrieDictionary::matches( UText *text, 237 MutableTrieDictionary::matches( UText *text,
219 int32_t maxLength, 238 int32_t maxLength,
220 int32_t *lengths, 239 int32_t *lengths,
221 int &count, 240 int &count,
222 int limit ) const { 241 int limit,
242 uint16_t *values /*=NULL*/) const {
223 TernaryNode *parent; 243 TernaryNode *parent;
224 UBool pMatched; 244 UBool pMatched;
225 return search(text, maxLength, lengths, count, limit, parent, pMatched); 245 return search(text, maxLength, lengths, count, limit, parent, pMatched, valu es);
226 } 246 }
227 247
228 // Implementation of iteration for MutableTrieDictionary 248 // Implementation of iteration for MutableTrieDictionary
229 class MutableTrieEnumeration : public StringEnumeration { 249 class MutableTrieEnumeration : public StringEnumeration {
230 private: 250 private:
231 UStack fNodeStack; // Stack of nodes to process 251 UStack fNodeStack; // Stack of nodes to process
232 UVector32 fBranchStack; // Stack of which branch we are working on 252 UVector32 fBranchStack; // Stack of which branch we are working on
233 TernaryNode *fRoot; // Root node 253 TernaryNode *fRoot; // Root node
234 enum StackBranch { 254 enum StackBranch {
235 kLessThan, 255 kLessThan,
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
270 290
271 switch (where) { 291 switch (where) {
272 case kLessThan: 292 case kLessThan:
273 if (node->low != NULL) { 293 if (node->low != NULL) {
274 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); 294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
275 node = (TernaryNode *) fNodeStack.push(node->low, status); 295 node = (TernaryNode *) fNodeStack.push(node->low, status);
276 where = (StackBranch) fBranchStack.push(kLessThan, status); 296 where = (StackBranch) fBranchStack.push(kLessThan, status);
277 break; 297 break;
278 } 298 }
279 case kEqual: 299 case kEqual:
280 emit = (node->flags & kEndsWord) != 0; 300 emit = node->flags > 0;
281 equal = (node->equal != NULL); 301 equal = (node->equal != NULL);
282 // If this node should be part of the next emitted string, appen d 302 // If this node should be part of the next emitted string, appen d
283 // the UChar to the string, and make sure we pop it when we come 303 // the UChar to the string, and make sure we pop it when we come
284 // back to this node. The character should only be in the string 304 // back to this node. The character should only be in the string
285 // for as long as we're traversing the equal subtree of this nod e 305 // for as long as we're traversing the equal subtree of this nod e
286 if (equal || emit) { 306 if (equal || emit) {
287 unistr.append(node->ch); 307 unistr.append(node->ch);
288 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()- 1); 308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()- 1);
289 } 309 }
290 if (equal) { 310 if (equal) {
291 node = (TernaryNode *) fNodeStack.push(node->equal, status); 311 node = (TernaryNode *) fNodeStack.push(node->equal, status);
292 where = (StackBranch) fBranchStack.push(kLessThan, status); 312 where = (StackBranch) fBranchStack.push(kLessThan, status);
293 } 313 }
294 if (emit) { 314 if (emit) {
295 return &unistr; 315 return &unistr;
296 } 316 }
297 if (equal) { 317 if (equal) {
298 break; 318 break;
299 } 319 }
300 case kGreaterThan: 320 case kGreaterThan:
301 // If this node's character is in the string, remove it. 321 // If this node's character is in the string, remove it.
302 if (node->equal != NULL || (node->flags & kEndsWord)) { 322 if (node->equal != NULL || node->flags > 0) {
303 unistr.truncate(unistr.length()-1); 323 unistr.truncate(unistr.length()-1);
304 } 324 }
305 if (node->high != NULL) { 325 if (node->high != NULL) {
306 fBranchStack.setElementAt(kDone, fBranchStack.size()-1); 326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
307 node = (TernaryNode *) fNodeStack.push(node->high, status); 327 node = (TernaryNode *) fNodeStack.push(node->high, status);
308 where = (StackBranch) fBranchStack.push(kLessThan, status); 328 where = (StackBranch) fBranchStack.push(kLessThan, status);
309 break; 329 break;
310 } 330 }
311 case kDone: 331 case kDone:
312 fNodeStack.pop(); 332 fNodeStack.pop();
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
347 if (U_FAILURE(status)) { 367 if (U_FAILURE(status)) {
348 return NULL; 368 return NULL;
349 } 369 }
350 return new MutableTrieEnumeration(fTrie, status); 370 return new MutableTrieEnumeration(fTrie, status);
351 } 371 }
352 372
353 /******************************************************************* 373 /*******************************************************************
354 * CompactTrieDictionary 374 * CompactTrieDictionary
355 */ 375 */
356 376
377 //TODO further optimization:
378 // minimise size of trie with logprobs by storing values
379 // for terminal nodes directly in offsets[]
380 // --> calculating from next offset *might* be simpler, but would have to add
381 // one last offset for logprob of last node
382 // --> if calculate from current offset, need to factor in possible overflow
383 // as well.
384 // idea: store in offset, set first bit to indicate logprob storage-->won't
385 // have to access additional node
386
387 // {'Dic', 1}, version 1: uses old header, no values
388 #define COMPACT_TRIE_MAGIC_1 0x44696301
389 // version 2: uses new header (more than 2^16 nodes), no values
390 #define COMPACT_TRIE_MAGIC_2 0x44696302
391 // version 3: uses new header, includes values
392 #define COMPACT_TRIE_MAGIC_3 0x44696303
393
357 struct CompactTrieHeader { 394 struct CompactTrieHeader {
358 uint32_t size; // Size of the data in bytes 395 uint32_t size; // Size of the data in bytes
359 uint32_t magic; // Magic number (including version) 396 uint32_t magic; // Magic number (including version)
397 uint32_t nodeCount; // Number of entries in offsets[]
398 uint32_t root; // Node number of the root node
399 uint32_t offsets[1]; // Offsets to nodes from start of data
400 };
401
402 // old version of CompactTrieHeader kept for backwards compatibility
403 struct CompactTrieHeaderV1 {
404 uint32_t size; // Size of the data in bytes
405 uint32_t magic; // Magic number (including version)
360 uint16_t nodeCount; // Number of entries in offsets[] 406 uint16_t nodeCount; // Number of entries in offsets[]
361 uint16_t root; // Node number of the root node 407 uint16_t root; // Node number of the root node
362 uint32_t offsets[1]; // Offsets to nodes from start of data 408 uint32_t offsets[1]; // Offsets to nodes from start of data
409 };
410
411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
412 struct CompactTrieInfo {
413 uint32_t size; // Size of the data in bytes
414 uint32_t magic; // Magic number (including version)
415 uint32_t nodeCount; // Number of entries in offsets[]
416 uint32_t root; // Node number of the root node
417 uint32_t *offsets; // Offsets to nodes from start of data
418 uint8_t *address; // pointer to header bytes in memory
419
420 CompactTrieInfo(const void *data, UErrorCode &status){
421 CompactTrieHeader *header = (CompactTrieHeader *) data;
422 if (header->magic != COMPACT_TRIE_MAGIC_1 &&
423 header->magic != COMPACT_TRIE_MAGIC_2 &&
424 header->magic != COMPACT_TRIE_MAGIC_3) {
425 status = U_ILLEGAL_ARGUMENT_ERROR;
426 } else {
427 size = header->size;
428 magic = header->magic;
429
430 if (header->magic == COMPACT_TRIE_MAGIC_1) {
431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
432 nodeCount = headerV1->nodeCount;
433 root = headerV1->root;
434 offsets = &(headerV1->offsets[0]);
435 address = (uint8_t *)headerV1;
436 } else {
437 nodeCount = header->nodeCount;
438 root = header->root;
439 offsets = &(header->offsets[0]);
440 address = (uint8_t *)header;
441 }
442 }
443 }
444
445 ~CompactTrieInfo(){}
363 }; 446 };
364 447
365 // Note that to avoid platform-specific alignment issues, all members of the nod e 448 // Note that to avoid platform-specific alignment issues, all members of the nod e
366 // structures should be the same size, or should contain explicit padding to 449 // structures should be the same size, or should contain explicit padding to
367 // natural alignment boundaries. 450 // natural alignment boundaries.
368 451
369 // We can't use a bitfield for the flags+count field, because the layout of thos e 452 // We can't use a bitfield for the flags+count field, because the layout of thos e
370 // is not portable. 12 bits of count allows for up to 4096 entries in a node. 453 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
371 struct CompactTrieNode { 454 struct CompactTrieNode {
372 uint16_t flagscount; // Count of sub-entries, plus flags 455 uint16_t flagscount; // Count of sub-entries, plus flags
373 }; 456 };
374 457
375 enum CompactTrieNodeFlags { 458 enum CompactTrieNodeFlags {
376 kVerticalNode = 0x1000, // This is a vertical node 459 kVerticalNode = 0x1000, // This is a vertical node
377 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word 460 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
378 kReservedFlag1 = 0x4000, 461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kRe servedFlag1
379 kReservedFlag2 = 0x8000, 462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedF lag2
380 kCountMask = 0x0FFF, // The count portion of flagscount 463 kCountMask = 0x0FFF, // The count portion of flagscount
381 kFlagMask = 0xF000 // The flags portion of flagscount 464 kFlagMask = 0xF000, // The flags portion of flagscount
465 kRootCountMask = 0x7FFF // The count portion of flagscount in the ro ot node
466
467 //offset flags:
468 //kOffsetContainsValue = 0x80000000 // Offset contains value for paren t node
382 }; 469 };
383 470
384 // The two node types are distinguished by the kVerticalNode flag. 471 // The two node types are distinguished by the kVerticalNode flag.
385 472
386 struct CompactTrieHorizontalEntry { 473 struct CompactTrieHorizontalEntry {
387 uint16_t ch; // UChar 474 uint16_t ch; // UChar
388 uint16_t equal; // Equal link node index 475 uint16_t equal; // Equal link node index
389 }; 476 };
390 477
391 // We don't use inheritance here because C++ does not guarantee that the 478 // We don't use inheritance here because C++ does not guarantee that the
392 // base class comes first in memory!! 479 // base class comes first in memory!!
393 480
394 struct CompactTrieHorizontalNode { 481 struct CompactTrieHorizontalNode {
395 uint16_t flagscount; // Count of sub-entries, plus flags 482 uint16_t flagscount; // Count of sub-entries, plus flags
396 CompactTrieHorizontalEntry entries[1]; 483 CompactTrieHorizontalEntry entries[1];
397 }; 484 };
398 485
399 struct CompactTrieVerticalNode { 486 struct CompactTrieVerticalNode {
400 uint16_t flagscount; // Count of sub-entries, plus flags 487 uint16_t flagscount; // Count of sub-entries, plus flags
401 uint16_t equal; // Equal link node index 488 uint16_t equal; // Equal link node index
402 uint16_t chars[1]; // Code units 489 uint16_t chars[1]; // Code units
403 }; 490 };
404 491
405 // {'Dic', 1}, version 1
406 #define COMPACT_TRIE_MAGIC_1 0x44696301
407
408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, 492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
409 UErrorCode &status ) 493 UErrorCode &status )
410 : fUData(dataObj) 494 : fUData(dataObj)
411 { 495 {
412 fData = (const CompactTrieHeader *) udata_getMemory(dataObj); 496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
413 fOwnData = FALSE; 498 fOwnData = FALSE;
414 if (fData->magic != COMPACT_TRIE_MAGIC_1) { 499 }
415 status = U_ILLEGAL_ARGUMENT_ERROR; 500
416 fData = NULL;
417 }
418 }
419 CompactTrieDictionary::CompactTrieDictionary( const void *data, 501 CompactTrieDictionary::CompactTrieDictionary( const void *data,
420 UErrorCode &status ) 502 UErrorCode &status )
421 : fUData(NULL) 503 : fUData(NULL)
422 { 504 {
423 fData = (const CompactTrieHeader *) data; 505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
506 *fInfo = CompactTrieInfo(data, status);
424 fOwnData = FALSE; 507 fOwnData = FALSE;
425 if (fData->magic != COMPACT_TRIE_MAGIC_1) {
426 status = U_ILLEGAL_ARGUMENT_ERROR;
427 fData = NULL;
428 }
429 } 508 }
430 509
431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, 510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
432 UErrorCode &status ) 511 UErrorCode &status )
433 : fUData(NULL) 512 : fUData(NULL)
434 { 513 {
435 fData = compactMutableTrieDictionary(dict, status); 514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status) ;
515 if (U_SUCCESS(status)) {
516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
517 *fInfo = CompactTrieInfo(header, status);
518 }
519
436 fOwnData = !U_FAILURE(status); 520 fOwnData = !U_FAILURE(status);
437 } 521 }
438 522
439 CompactTrieDictionary::~CompactTrieDictionary() { 523 CompactTrieDictionary::~CompactTrieDictionary() {
440 if (fOwnData) { 524 if (fOwnData) {
441 uprv_free((void *)fData); 525 uprv_free((void *)(fInfo->address));
442 } 526 }
527 uprv_free((void *)fInfo);
528
443 if (fUData) { 529 if (fUData) {
444 udata_close(fUData); 530 udata_close(fUData);
445 } 531 }
446 } 532 }
447 533
534 UBool CompactTrieDictionary::getValued() const{
535 return fInfo->magic == COMPACT_TRIE_MAGIC_3;
536 }
537
448 uint32_t 538 uint32_t
449 CompactTrieDictionary::dataSize() const { 539 CompactTrieDictionary::dataSize() const {
450 return fData->size; 540 return fInfo->size;
451 } 541 }
452 542
453 const void * 543 const void *
454 CompactTrieDictionary::data() const { 544 CompactTrieDictionary::data() const {
455 return fData; 545 return fInfo->address;
456 } 546 }
457 547
458 // This function finds the address of a node for us, given its node ID 548 //This function finds the address of a node for us, given its node ID
459 static inline const CompactTrieNode * 549 static inline const CompactTrieNode *
460 getCompactNode(const CompactTrieHeader *header, uint16_t node) { 550 getCompactNode(const CompactTrieInfo *info, uint32_t node) {
461 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[n ode]); 551 if(node < info->root-1) {
552 return (const CompactTrieNode *)(&info->offsets[node]);
553 } else {
554 return (const CompactTrieNode *)(info->address + info->offsets[node]);
555 }
556 }
557
558 //this version of getCompactNode is currently only used in compactMutableTrieDic tionary()
559 static inline const CompactTrieNode *
560 getCompactNode(const CompactTrieHeader *header, uint32_t node) {
561 if(node < header->root-1) {
562 return (const CompactTrieNode *)(&header->offsets[node]);
563 } else {
564 return (const CompactTrieNode *)((const uint8_t *)header + header->offse ts[node]);
565 }
566 }
567
568
569 /**
570 * Calculates the number of links in a node
571 * @node The specified node
572 */
573 static inline const uint16_t
574 getCount(const CompactTrieNode *node){
575 return (node->flagscount & kCountMask);
576 //use the code below if number of links ever exceed 4096
577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCoun t) >> 2);
578 }
579
580 /**
581 * calculates an equal link node ID of a horizontal node
582 * @hnode The horizontal node containing the equal link
583 * @param index The index into hnode->entries[]
584 * @param nodeCount The length of hnode->entries[]
585 */
586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
587 if(vnode->flagscount & kEqualOverflows){
588 // treat overflow bits as an extension of chars[]
589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNod e*)vnode)];
590 return vnode->equal + (((uint32_t)*overflow) << 16);
591 }else{
592 return vnode->equal;
593 }
594 }
595
596 /**
597 * calculates an equal link node ID of a horizontal node
598 * @hnode The horizontal node containing the equal link
599 * @param index The index into hnode->entries[]
600 * @param nodeCount The length of hnode->entries[]
601 */
602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uin t16_t index, uint16_t nodeCount){
603 if(hnode->flagscount & kEqualOverflows){
604 //set overflow to point to the uint16_t containing the overflow bits
605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
606 overflow += index/4;
607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
609 } else {
610 return hnode->entries[index].equal;
611 }
612 }
613
614 /**
615 * Returns the value stored in the specified node which is associated with its
616 * parent node.
617 * TODO: how to tell that value is stored in node or in offset? check whether
618 * node ID < fInfo->root!
619 */
620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
621 uint16_t count = getCount((CompactTrieNode *)hnode);
622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
623
624 if(hnode->flagscount & kEqualOverflows)
625 overflowSize = (count + 3) / 4 * sizeof(uint16_t);
626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
627 }
628
629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
630 // calculate size of total node ID overflow storage in bytes
631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16 _t) : 0;
632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)v node)] + overflowSize));
633 }
634
635 static inline uint16_t getValue(const CompactTrieNode *node){
636 if(node->flagscount & kVerticalNode)
637 return getValue((const CompactTrieVerticalNode *)node);
638 else
639 return getValue((const CompactTrieHorizontalNode *)node);
640 }
641
642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary sea rch
643 inline int16_t
644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
645 UChar uc, uint16_t nodeCount){
646 int low = 0;
647 int high = nodeCount-1;
648 int middle;
649 while (high >= low) {
650 middle = (high+low)/2;
651 if (uc == entries[middle].ch) {
652 return middle;
653 }
654 else if (uc < entries[middle].ch) {
655 high = middle-1;
656 }
657 else {
658 low = middle+1;
659 }
660 }
661
662 return -1;
462 } 663 }
463 664
464 int32_t 665 int32_t
465 CompactTrieDictionary::matches( UText *text, 666 CompactTrieDictionary::matches( UText *text,
466 int32_t maxLength, 667 int32_t maxLength,
467 int32_t *lengths, 668 int32_t *lengths,
468 int &count, 669 int &count,
469 int limit ) const { 670 int limit,
671 uint16_t *values /*= NULL*/) const {
672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
673 values = NULL;
674
470 // TODO: current implementation works in UTF-16 space 675 // TODO: current implementation works in UTF-16 space
471 const CompactTrieNode *node = getCompactNode(fData, fData->root); 676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
472 int mycount = 0; 677 int mycount = 0;
473 678
474 UChar uc = utext_current32(text); 679 UChar uc = utext_current32(text);
475 int i = 0; 680 int i = 0;
476 681
682 // handle root node with only kEqualOverflows flag: assume horizontal node w ithout parent
683 if(node != NULL){
684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
686 if(index > -1){
687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagsc ount & kRootCountMask));
688 utext_next32(text);
689 uc = utext_current32(text);
690 ++i;
691 }else{
692 node = NULL;
693 }
694 }
695
477 while (node != NULL) { 696 while (node != NULL) {
478 // Check if the node we just exited ends a word 697 // Check if the node we just exited ends a word
479 if (limit > 0 && (node->flagscount & kParentEndsWord)) { 698 if (limit > 0 && (node->flagscount & kParentEndsWord)) {
699 if(values != NULL){
700 values[mycount] = getValue(node);
701 }
480 lengths[mycount++] = i; 702 lengths[mycount++] = i;
481 --limit; 703 --limit;
482 } 704 }
483 // Check that we haven't exceeded the maximum number of input characters . 705 // Check that we haven't exceeded the maximum number of input characters .
484 // We have to do that here rather than in the while condition so that 706 // We have to do that here rather than in the while condition so that
485 // we can check for ending a word, above. 707 // we can check for ending a word, above.
486 if (i >= maxLength) { 708 if (i >= maxLength) {
487 break; 709 break;
488 } 710 }
489 711
490 int nodeCount = (node->flagscount & kCountMask); 712 int nodeCount = getCount(node);
491 if (nodeCount == 0) { 713 if (nodeCount == 0) {
492 // Special terminal node; return now 714 // Special terminal node; return now
493 break; 715 break;
494 } 716 }
495 if (node->flagscount & kVerticalNode) { 717 if (node->flagscount & kVerticalNode) {
496 // Vertical node; check all the characters in it 718 // Vertical node; check all the characters in it
497 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod e *)node; 719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod e *)node;
498 for (int j = 0; j < nodeCount && i < maxLength; ++j) { 720 for (int j = 0; j < nodeCount && i < maxLength; ++j) {
499 if (uc != vnode->chars[j]) { 721 if (uc != vnode->chars[j]) {
500 // We hit a non-equal character; return 722 // We hit a non-equal character; return
501 goto exit; 723 goto exit;
502 } 724 }
503 utext_next32(text); 725 utext_next32(text);
504 uc = utext_current32(text); 726 uc = utext_current32(text);
505 ++i; 727 ++i;
506 } 728 }
507 // To get here we must have come through the whole list successfully ; 729 // To get here we must have come through the whole list successfully ;
508 // go on to the next node. Note that a word cannot end in the middle 730 // go on to the next node. Note that a word cannot end in the middle
509 // of a vertical node. 731 // of a vertical node.
510 node = getCompactNode(fData, vnode->equal); 732 node = getCompactNode(fInfo, calcEqualLink(vnode));
511 } 733 }
512 else { 734 else {
513 // Horizontal node; do binary search 735 // Horizontal node; do binary search
514 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta lNode *)node; 736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta lNode *)node;
515 int low = 0; 737 const CompactTrieHorizontalEntry *entries;
516 int high = nodeCount-1; 738 entries = hnode->entries;
517 int middle; 739
518 node = NULL; // If we don't find a match, we'll fall out of the l oop 740 int index = searchHorizontalEntries(entries, uc, nodeCount);
519 while (high >= low) { 741 if(index > -1){ //
520 middle = (high+low)/2; 742 // We hit a match; get the next node and next character
521 if (uc == hnode->entries[middle].ch) { 743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCou nt));
522 // We hit a match; get the next node and next character 744 utext_next32(text);
523 node = getCompactNode(fData, hnode->entries[middle].equal); 745 uc = utext_current32(text);
524 utext_next32(text); 746 ++i;
525 uc = utext_current32(text); 747 }else{
526 ++i; 748 node = NULL; // If we don't find a match, we'll fall out of t he loop
527 break;
528 }
529 else if (uc < hnode->entries[middle].ch) {
530 high = middle-1;
531 }
532 else {
533 low = middle+1;
534 }
535 } 749 }
536 } 750 }
537 } 751 }
538 exit: 752 exit:
539 count = mycount; 753 count = mycount;
540 return i; 754 return i;
541 } 755 }
542 756
543 // Implementation of iteration for CompactTrieDictionary 757 // Implementation of iteration for CompactTrieDictionary
544 class CompactTrieEnumeration : public StringEnumeration { 758 class CompactTrieEnumeration : public StringEnumeration {
545 private: 759 private:
546 UVector32 fNodeStack; // Stack of nodes to process 760 UVector32 fNodeStack; // Stack of nodes to process
547 UVector32 fIndexStack; // Stack of where in node we are 761 UVector32 fIndexStack; // Stack of where in node we are
548 const CompactTrieHeader *fHeader; // Trie data 762 const CompactTrieInfo *fInfo; // Trie data
549 763
550 public: 764 public:
551 static UClassID U_EXPORT2 getStaticClassID(void); 765 static UClassID U_EXPORT2 getStaticClassID(void);
552 virtual UClassID getDynamicClassID(void) const; 766 virtual UClassID getDynamicClassID(void) const;
553 public: 767 public:
554 CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) 768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
555 : fNodeStack(status), fIndexStack(status) { 769 : fNodeStack(status), fIndexStack(status) {
556 fHeader = header; 770 fInfo = info;
557 fNodeStack.push(header->root, status); 771 fNodeStack.push(info->root, status);
558 fIndexStack.push(0, status); 772 fIndexStack.push(0, status);
559 unistr.remove(); 773 unistr.remove();
560 } 774 }
561 775
562 virtual ~CompactTrieEnumeration() { 776 virtual ~CompactTrieEnumeration() {
563 } 777 }
564 778
565 virtual StringEnumeration *clone() const { 779 virtual StringEnumeration *clone() const {
566 UErrorCode status = U_ZERO_ERROR; 780 UErrorCode status = U_ZERO_ERROR;
567 return new CompactTrieEnumeration(fHeader, status); 781 return new CompactTrieEnumeration(fInfo, status);
568 } 782 }
569 783
570 virtual const UnicodeString * snext(UErrorCode &status); 784 virtual const UnicodeString * snext(UErrorCode &status);
571 785
572 // Very expensive, but this should never be used. 786 // Very expensive, but this should never be used.
573 virtual int32_t count(UErrorCode &status) const { 787 virtual int32_t count(UErrorCode &status) const {
574 CompactTrieEnumeration counter(fHeader, status); 788 CompactTrieEnumeration counter(fInfo, status);
575 int32_t result = 0; 789 int32_t result = 0;
576 while (counter.snext(status) != NULL && U_SUCCESS(status)) { 790 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
577 ++result; 791 ++result;
578 } 792 }
579 return result; 793 return result;
580 } 794 }
581 795
582 virtual void reset(UErrorCode &status) { 796 virtual void reset(UErrorCode &status) {
583 fNodeStack.removeAllElements(); 797 fNodeStack.removeAllElements();
584 fIndexStack.removeAllElements(); 798 fIndexStack.removeAllElements();
585 fNodeStack.push(fHeader->root, status); 799 fNodeStack.push(fInfo->root, status);
586 fIndexStack.push(0, status); 800 fIndexStack.push(0, status);
587 unistr.remove(); 801 unistr.remove();
588 } 802 }
589 }; 803 };
590 804
591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) 805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
592 806
593 const UnicodeString * 807 const UnicodeString *
594 CompactTrieEnumeration::snext(UErrorCode &status) { 808 CompactTrieEnumeration::snext(UErrorCode &status) {
595 if (fNodeStack.empty() || U_FAILURE(status)) { 809 if (fNodeStack.empty() || U_FAILURE(status)) {
596 return NULL; 810 return NULL;
597 } 811 }
598 const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); 812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
599 int where = fIndexStack.peeki(); 813 int where = fIndexStack.peeki();
600 while (!fNodeStack.empty() && U_SUCCESS(status)) { 814 while (!fNodeStack.empty() && U_SUCCESS(status)) {
601 int nodeCount = (node->flagscount & kCountMask); 815 int nodeCount;
816
817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
818 if(isRoot){
819 nodeCount = node->flagscount & kRootCountMask;
820 } else {
821 nodeCount = getCount(node);
822 }
823
602 UBool goingDown = FALSE; 824 UBool goingDown = FALSE;
603 if (nodeCount == 0) { 825 if (nodeCount == 0) {
604 // Terminal node; go up immediately 826 // Terminal node; go up immediately
605 fNodeStack.popi(); 827 fNodeStack.popi();
606 fIndexStack.popi(); 828 fIndexStack.popi();
607 node = getCompactNode(fHeader, fNodeStack.peeki()); 829 node = getCompactNode(fInfo, fNodeStack.peeki());
608 where = fIndexStack.peeki(); 830 where = fIndexStack.peeki();
609 } 831 }
610 else if (node->flagscount & kVerticalNode) { 832 else if ((node->flagscount & kVerticalNode) && !isRoot) {
611 // Vertical node 833 // Vertical node
612 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod e *)node; 834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNod e *)node;
613 if (where == 0) { 835 if (where == 0) {
614 // Going down 836 // Going down
615 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); 837 unistr.append((const UChar *)vnode->chars, nodeCount);
616 fIndexStack.setElementAt(1, fIndexStack.size()-1); 838 fIndexStack.setElementAt(1, fIndexStack.size()-1);
617 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, sta tus)); 839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode ), status));
618 where = fIndexStack.push(0, status); 840 where = fIndexStack.push(0, status);
619 goingDown = TRUE; 841 goingDown = TRUE;
620 } 842 }
621 else { 843 else {
622 // Going up 844 // Going up
623 unistr.truncate(unistr.length()-nodeCount); 845 unistr.truncate(unistr.length()-nodeCount);
624 fNodeStack.popi(); 846 fNodeStack.popi();
625 fIndexStack.popi(); 847 fIndexStack.popi();
626 node = getCompactNode(fHeader, fNodeStack.peeki()); 848 node = getCompactNode(fInfo, fNodeStack.peeki());
627 where = fIndexStack.peeki(); 849 where = fIndexStack.peeki();
628 } 850 }
629 } 851 }
630 else { 852 else {
631 // Horizontal node 853 // Horizontal node
632 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta lNode *)node; 854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizonta lNode *)node;
633 if (where > 0) { 855 if (where > 0) {
634 // Pop previous char 856 // Pop previous char
635 unistr.truncate(unistr.length()-1); 857 unistr.truncate(unistr.length()-1);
636 } 858 }
637 if (where < nodeCount) { 859 if (where < nodeCount) {
638 // Push on next node 860 // Push on next node
639 unistr.append((UChar)hnode->entries[where].ch); 861 unistr.append((UChar)hnode->entries[where].ch);
640 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); 862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
641 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[wh ere].equal, status)); 863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode , where, nodeCount), status));
642 where = fIndexStack.push(0, status); 864 where = fIndexStack.push(0, status);
643 goingDown = TRUE; 865 goingDown = TRUE;
644 } 866 }
645 else { 867 else {
646 // Going up 868 // Going up
647 fNodeStack.popi(); 869 fNodeStack.popi();
648 fIndexStack.popi(); 870 fIndexStack.popi();
649 node = getCompactNode(fHeader, fNodeStack.peeki()); 871 node = getCompactNode(fInfo, fNodeStack.peeki());
650 where = fIndexStack.peeki(); 872 where = fIndexStack.peeki();
651 } 873 }
652 } 874 }
875
653 // Check if the parent of the node we've just gone down to ends a 876 // Check if the parent of the node we've just gone down to ends a
654 // word. If so, return it. 877 // word. If so, return it.
878 // The root node should never end up here.
655 if (goingDown && (node->flagscount & kParentEndsWord)) { 879 if (goingDown && (node->flagscount & kParentEndsWord)) {
656 return &unistr; 880 return &unistr;
657 } 881 }
658 } 882 }
659 return NULL; 883 return NULL;
660 } 884 }
661 885
662 StringEnumeration * 886 StringEnumeration *
663 CompactTrieDictionary::openWords( UErrorCode &status ) const { 887 CompactTrieDictionary::openWords( UErrorCode &status ) const {
664 if (U_FAILURE(status)) { 888 if (U_FAILURE(status)) {
665 return NULL; 889 return NULL;
666 } 890 }
667 return new CompactTrieEnumeration(fData, status); 891 return new CompactTrieEnumeration(fInfo, status);
668 } 892 }
669 893
670 // 894 //
671 // Below here is all code related to converting a ternary trie to a compact trie 895 // Below here is all code related to converting a ternary trie to a compact trie
672 // and back again 896 // and back again
673 // 897 //
674 898
675 // Helper classes to construct the compact trie 899 enum CompactTrieNodeType {
900 kHorizontalType = 0,
901 kVerticalType = 1,
902 kValueType = 2
903 };
904
905 /**
906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
907 * construct the compact trie by storing information for each node and later
908 * writing the node to memory in a sequential format.
909 */
676 class BuildCompactTrieNode: public UMemory { 910 class BuildCompactTrieNode: public UMemory {
677 public: 911 public:
678 UBool fParentEndsWord; 912 UBool fParentEndsWord;
679 UBool fVertical; 913 CompactTrieNodeType fNodeType;
680 UBool fHasDuplicate; 914 UBool fHasDuplicate;
915 UBool fEqualOverflows;
681 int32_t fNodeID; 916 int32_t fNodeID;
682 UnicodeString fChars; 917 UnicodeString fChars;
918 uint16_t fValue;
683 919
684 public: 920 public:
685 BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UE rrorCode &status) { 921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
922 UStack &nodes, UErrorCode &status, uint16_t value = 0) {
686 fParentEndsWord = parentEndsWord; 923 fParentEndsWord = parentEndsWord;
687 fHasDuplicate = FALSE; 924 fHasDuplicate = FALSE;
688 fVertical = vertical; 925 fNodeType = nodeType;
926 fEqualOverflows = FALSE;
689 fNodeID = nodes.size(); 927 fNodeID = nodes.size();
928 fValue = parentEndsWord? value : 0;
690 nodes.push(this, status); 929 nodes.push(this, status);
691 } 930 }
692 931
693 virtual ~BuildCompactTrieNode() { 932 virtual ~BuildCompactTrieNode() {
694 } 933 }
695 934
696 virtual uint32_t size() { 935 virtual uint32_t size() {
697 return sizeof(uint16_t); 936 if(fValue > 0)
937 return sizeof(uint16_t) * 2;
938 else
939 return sizeof(uint16_t);
698 } 940 }
699 941
700 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran slate*/) { 942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*tran slate*/) {
701 // Write flag/count 943 // Write flag/count
702 *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) 944
703 | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWo rd : 0 ); 945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
946 // used as a 5th MSB.
947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
948
949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
950 ((fNodeID == 2)? (fChars.length() & kRootCountMask):
951 (
952 (fChars.length() & kCountMask) |
953 //((fChars.length() << 2) & kExceedsCount) |
954 (fNodeType == kVerticalType ? kVerticalNode : 0) |
955 (fParentEndsWord ? kParentEndsWord : 0 )
956 )
957 );
704 offset += sizeof(uint16_t); 958 offset += sizeof(uint16_t);
705 } 959 }
960
961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
962 if(fValue > 0){
963 *((uint16_t *)(bytes+offset)) = fValue;
964 offset += sizeof(uint16_t);
965 }
966 }
967
968 };
969
970 /**
971 * Stores value of parent terminating nodes that have no more subtries.
972 */
973 class BuildCompactTrieValueNode: public BuildCompactTrieNode {
974 public:
975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
977 }
978
979 virtual ~BuildCompactTrieValueNode(){
980 }
981
982 virtual uint32_t size() {
983 return sizeof(uint16_t) * 2;
984 }
985
986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl ate) {
987 // don't write value directly to memory but store it in offset to be wri tten later
988 //offset = fValue & kOffsetContainsValue;
989 BuildCompactTrieNode::write(bytes, offset, translate);
990 BuildCompactTrieNode::writeValue(bytes, offset);
991 }
706 }; 992 };
707 993
708 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { 994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
709 public: 995 public:
710 UStack fLinks; 996 UStack fLinks;
997 UBool fMayOverflow; //intermediate value for fEqualOverflows
711 998
712 public: 999 public:
713 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo de &status) 1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCo de &status, uint16_t value = 0)
714 : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(sta tus) { 1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value ), fLinks(status) {
1002 fMayOverflow = FALSE;
715 } 1003 }
716 1004
717 virtual ~BuildCompactTrieHorizontalNode() { 1005 virtual ~BuildCompactTrieHorizontalNode() {
718 } 1006 }
719 1007
720 virtual uint32_t size() { 1008 // It is impossible to know beforehand exactly how much space the node will
721 return offsetof(CompactTrieHorizontalNode,entries) + 1009 // need in memory before being written, because the node IDs in the equal
722 (fChars.length()*sizeof(CompactTrieHorizontalEntry)); 1010 // links may or may not overflow after node coalescing. Therefore, this meth od
1011 // returns the maximum size possible for the node.
1012 virtual uint32_t size() {
1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1015
1016 if(fValue > 0)
1017 estimatedSize += sizeof(uint16_t);
1018
1019 //estimate extra space needed to store overflow for node ID links
1020 //may be more than what is actually needed
1021 for(int i=0; i < fChars.length(); i++){
1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
1023 fMayOverflow = TRUE;
1024 break;
1025 }
1026 }
1027 if(fMayOverflow) // added space for overflow should be same as ceil(fCha rs.length()/4) * sizeof(uint16_t)
1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
1029
1030 return estimatedSize;
723 } 1031 }
724 1032
725 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl ate) { 1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl ate) {
1034 int32_t count = fChars.length();
1035
1036 //if largest nodeID > 2^16, set flag
1037 //large node IDs are more likely to be at the back of the array
1038 for (int32_t i = count-1; i >= 0; --i) {
1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID ) > 0xFFFF){
1040 fEqualOverflows = TRUE;
1041 break;
1042 }
1043 }
1044
726 BuildCompactTrieNode::write(bytes, offset, translate); 1045 BuildCompactTrieNode::write(bytes, offset, translate);
727 int32_t count = fChars.length(); 1046
1047 // write entries[] to memory
728 for (int32_t i = 0; i < count; ++i) { 1048 for (int32_t i = 0; i < count; ++i) {
729 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b ytes+offset); 1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(b ytes+offset);
730 entry->ch = fChars[i]; 1050 entry->ch = fChars[i];
731 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[ i])->fNodeID); 1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[ i])->fNodeID);
732 #ifdef DEBUG_TRIE_DICT 1052 #ifdef DEBUG_TRIE_DICT
733 if (entry->equal == 0) { 1053
1054 if ((entry->equal == 0) && !fEqualOverflows) {
734 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", 1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
735 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
736 } 1057 }
737 #endif 1058 #endif
738 offset += sizeof(CompactTrieHorizontalEntry); 1059 offset += sizeof(CompactTrieHorizontalEntry);
739 } 1060 }
1061
1062 // append extra bits of equal nodes to end if fEqualOverflows
1063 if (fEqualOverflows) {
1064 uint16_t leftmostBits = 0;
1065 for (int16_t i = 0; i < count; i++) {
1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
1067
1068 // write filled uint16_t to memory
1069 if(i % 4 == 3){
1070 *((uint16_t *)(bytes+offset)) = leftmostBits;
1071 leftmostBits = 0;
1072 offset += sizeof(uint16_t);
1073 }
1074 }
1075
1076 // pad last uint16_t with zeroes if necessary
1077 int remainder = count % 4;
1078 if (remainder > 0) {
1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remai nder));
1080 offset += sizeof(uint16_t);
1081 }
1082 }
1083
1084 BuildCompactTrieNode::writeValue(bytes, offset);
1085 }
1086
1087 // returns leftmost bits of physical node link
1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactT rieNode *)fLinks[i])->fNodeID) >> 16);
1090 #ifdef DEBUG_TRIE_DICT
1091 if (leftmostBits > 0xF) {
1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1094 }
1095 #endif
1096 return leftmostBits;
740 } 1097 }
741 1098
742 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { 1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
743 fChars.append(ch); 1100 fChars.append(ch);
744 fLinks.push(link, status); 1101 fLinks.push(link, status);
745 } 1102 }
1103
746 }; 1104 };
747 1105
748 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { 1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
749 public: 1107 public:
750 BuildCompactTrieNode *fEqual; 1108 BuildCompactTrieNode *fEqual;
751 1109
752 public: 1110 public:
753 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) 1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
754 : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { 1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
755 fEqual = NULL; 1113 fEqual = NULL;
756 } 1114 }
757 1115
758 virtual ~BuildCompactTrieVerticalNode() { 1116 virtual ~BuildCompactTrieVerticalNode() {
759 } 1117 }
760 1118
761 virtual uint32_t size() { 1119 // Returns the maximum possible size of this node. See comment in
762 return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof (uint16_t)); 1120 // BuildCompactTrieHorizontal node for more information.
1121 virtual uint32_t size() {
1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fCha rs.length()*sizeof(uint16_t));
1123 if(fValue > 0){
1124 estimatedSize += sizeof(uint16_t);
1125 }
1126
1127 if(fEqual->fNodeID > 0xFFFF){
1128 estimatedSize += sizeof(uint16_t);
1129 }
1130 return estimatedSize;
763 } 1131 }
764 1132
765 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl ate) { 1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &transl ate) {
766 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset ); 1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset );
1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
767 BuildCompactTrieNode::write(bytes, offset, translate); 1136 BuildCompactTrieNode::write(bytes, offset, translate);
768 node->equal = translate.elementAti(fEqual->fNodeID); 1137 node->equal = translate.elementAti(fEqual->fNodeID);
769 offset += sizeof(node->equal); 1138 offset += sizeof(node->equal);
770 #ifdef DEBUG_TRIE_DICT 1139 #ifdef DEBUG_TRIE_DICT
771 if (node->equal == 0) { 1140 if ((node->equal == 0) && !fEqualOverflows) {
772 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi cal node zero\n", 1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physi cal node zero\n",
773 fEqual->fNodeID); 1142 fEqual->fNodeID);
774 } 1143 }
775 #endif 1144 #endif
776 fChars.extract(0, fChars.length(), (UChar *)node->chars); 1145 fChars.extract(0, fChars.length(), (UChar *)node->chars);
777 offset += sizeof(uint16_t)*fChars.length(); 1146 offset += sizeof(UChar)*fChars.length();
1147
1148 // append 16 bits of to end for equal node if fEqualOverflows
1149 if (fEqualOverflows) {
1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeI D) >> 16);
1151 offset += sizeof(uint16_t);
1152 }
1153
1154 BuildCompactTrieNode::writeValue(bytes, offset);
778 } 1155 }
779 1156
780 void addChar(UChar ch) { 1157 void addChar(UChar ch) {
781 fChars.append(ch); 1158 fChars.append(ch);
782 } 1159 }
783 1160
784 void setLink(BuildCompactTrieNode *node) { 1161 void setLink(BuildCompactTrieNode *node) {
785 fEqual = node; 1162 fEqual = node;
786 } 1163 }
1164
787 }; 1165 };
788 1166
789 // Forward declaration 1167 // Forward declaration
790 static void walkHorizontal(const TernaryNode *node, 1168 static void walkHorizontal(const TernaryNode *node,
791 BuildCompactTrieHorizontalNode *building, 1169 BuildCompactTrieHorizontalNode *building,
792 UStack &nodes, 1170 UStack &nodes,
793 UErrorCode &status); 1171 UErrorCode &status,
1172 Hashtable *values);
794 1173
795 // Convert one node. Uses recursion. 1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
796 1175
797 static BuildCompactTrieNode * 1176 static BuildCompactTrieNode *
798 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UEr rorCode &status) { 1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
799 if (U_FAILURE(status)) { 1179 if (U_FAILURE(status)) {
800 return NULL; 1180 return NULL;
801 } 1181 }
802 BuildCompactTrieNode *result = NULL; 1182 BuildCompactTrieNode *result = NULL;
803 UBool horizontal = (node->low != NULL || node->high != NULL); 1183 UBool horizontal = (node->low != NULL || node->high != NULL);
804 if (horizontal) { 1184 if (horizontal) {
805 BuildCompactTrieHorizontalNode *hResult = 1185 BuildCompactTrieHorizontalNode *hResult;
806 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status ); 1186 if(values != NULL){
1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
1188 } else {
1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1190 }
1191
807 if (hResult == NULL) { 1192 if (hResult == NULL) {
808 status = U_MEMORY_ALLOCATION_ERROR; 1193 status = U_MEMORY_ALLOCATION_ERROR;
809 return NULL; 1194 return NULL;
810 } 1195 }
811 if (U_SUCCESS(status)) { 1196 if (U_SUCCESS(status)) {
812 walkHorizontal(node, hResult, nodes, status); 1197 walkHorizontal(node, hResult, nodes, status, values);
813 result = hResult; 1198 result = hResult;
814 } 1199 }
815 } 1200 }
816 else { 1201 else {
817 BuildCompactTrieVerticalNode *vResult = 1202 BuildCompactTrieVerticalNode *vResult;
818 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); 1203 if(values != NULL){
1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st atus, parentValue);
1205 } else {
1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, st atus);
1207 }
1208
819 if (vResult == NULL) { 1209 if (vResult == NULL) {
820 status = U_MEMORY_ALLOCATION_ERROR; 1210 status = U_MEMORY_ALLOCATION_ERROR;
1211 return NULL;
821 } 1212 }
822 else if (U_SUCCESS(status)) { 1213 else if (U_SUCCESS(status)) {
823 UBool endsWord = FALSE; 1214 uint16_t value = 0;
1215 UBool endsWord = FALSE;
824 // Take up nodes until we end a word, or hit a node with < or > link s 1216 // Take up nodes until we end a word, or hit a node with < or > link s
825 do { 1217 do {
826 vResult->addChar(node->ch); 1218 vResult->addChar(node->ch);
827 endsWord = (node->flags & kEndsWord) != 0; 1219 value = node->flags;
1220 endsWord = value > 0;
828 node = node->equal; 1221 node = node->equal;
829 } 1222 }
830 while(node != NULL && !endsWord && node->low == NULL && node->high = = NULL); 1223 while(node != NULL && !endsWord && node->low == NULL && node->high = = NULL);
1224
831 if (node == NULL) { 1225 if (node == NULL) {
832 if (!endsWord) { 1226 if (!endsWord) {
833 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie 1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie
834 } 1228 }
835 else { 1229 else if(values != NULL){
1230 UnicodeString key(value); //store value as a single-char Uni codeString
1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
1232 if(link == NULL){
1233 link = new BuildCompactTrieValueNode(nodes, status, valu e); //take out nodes?
1234 values->put(key, link, status);
1235 }
1236 vResult->setLink(link);
1237 } else {
836 vResult->setLink((BuildCompactTrieNode *)nodes[1]); 1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]);
837 } 1239 }
838 } 1240 }
839 else { 1241 else {
840 vResult->setLink(compactOneNode(node, endsWord, nodes, status)); 1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, v alues, value));
841 } 1243 }
842 result = vResult; 1244 result = vResult;
843 } 1245 }
844 } 1246 }
845 return result; 1247 return result;
846 } 1248 }
847 1249
848 // Walk the set of peers at the same level, to build a horizontal node. 1250 // Walk the set of peers at the same level, to build a horizontal node.
849 // Uses recursion. 1251 // Uses recursion.
850 1252
851 static void walkHorizontal(const TernaryNode *node, 1253 static void walkHorizontal(const TernaryNode *node,
852 BuildCompactTrieHorizontalNode *building, 1254 BuildCompactTrieHorizontalNode *building,
853 UStack &nodes, 1255 UStack &nodes,
854 UErrorCode &status) { 1256 UErrorCode &status, Hashtable *values = NULL) {
855 while (U_SUCCESS(status) && node != NULL) { 1257 while (U_SUCCESS(status) && node != NULL) {
856 if (node->low != NULL) { 1258 if (node->low != NULL) {
857 walkHorizontal(node->low, building, nodes, status); 1259 walkHorizontal(node->low, building, nodes, status, values);
858 } 1260 }
859 BuildCompactTrieNode *link = NULL; 1261 BuildCompactTrieNode *link = NULL;
860 if (node->equal != NULL) { 1262 if (node->equal != NULL) {
861 link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, n odes, status); 1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, v alues, node->flags);
862 } 1264 }
863 else if (node->flags & kEndsWord) { 1265 else if (node->flags > 0) {
864 link = (BuildCompactTrieNode *)nodes[1]; 1266 if(values != NULL) {
1267 UnicodeString key(node->flags); //store value as a single-char U nicodeString
1268 link = (BuildCompactTrieValueNode *) values->get(key);
1269 if(link == NULL) {
1270 link = new BuildCompactTrieValueNode(nodes, status, node->fl ags); //take out nodes?
1271 values->put(key, link, status);
1272 }
1273 } else {
1274 link = (BuildCompactTrieNode *)nodes[1];
1275 }
865 } 1276 }
866 if (U_SUCCESS(status) && link != NULL) { 1277 if (U_SUCCESS(status) && link != NULL) {
867 building->addNode(node->ch, link, status); 1278 building->addNode(node->ch, link, status);
868 } 1279 }
869 // Tail recurse manually instead of leaving it to the compiler. 1280 // Tail recurse manually instead of leaving it to the compiler.
870 //if (node->high != NULL) { 1281 //if (node->high != NULL) {
871 // walkHorizontal(node->high, building, nodes, status); 1282 // walkHorizontal(node->high, building, nodes, status);
872 //} 1283 //}
873 node = node->high; 1284 node = node->high;
874 } 1285 }
875 } 1286 }
876 1287
877 U_NAMESPACE_END 1288 U_NAMESPACE_END
878 U_NAMESPACE_USE 1289 U_NAMESPACE_USE
879 U_CDECL_BEGIN 1290 U_CDECL_BEGIN
880 static int32_t U_CALLCONV 1291 static int32_t U_CALLCONV
881 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { 1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
882 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; 1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
883 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; 1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1295
884 // Check for comparing a node to itself, to avoid spurious duplicates 1296 // Check for comparing a node to itself, to avoid spurious duplicates
885 if (left == right) { 1297 if (left == right) {
886 return 0; 1298 return 0;
887 } 1299 }
1300
888 // Most significant is type of node. Can never coalesce. 1301 // Most significant is type of node. Can never coalesce.
889 if (left->fVertical != right->fVertical) { 1302 if (left->fNodeType != right->fNodeType) {
890 return left->fVertical - right->fVertical; 1303 return left->fNodeType - right->fNodeType;
891 } 1304 }
892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. 1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
893 if (left->fParentEndsWord != right->fParentEndsWord) { 1306 if (left->fParentEndsWord != right->fParentEndsWord) {
894 return left->fParentEndsWord - right->fParentEndsWord; 1307 return left->fParentEndsWord - right->fParentEndsWord;
895 } 1308 }
896 // Next, the string. If that differs, we can never coalesce. 1309 // Next, the string. If that differs, we can never coalesce.
897 int32_t result = left->fChars.compare(right->fChars); 1310 int32_t result = left->fChars.compare(right->fChars);
898 if (result != 0) { 1311 if (result != 0) {
899 return result; 1312 return result;
900 } 1313 }
1314
1315 // If the node value differs, we should not coalesce.
1316 // If values aren't stored, all fValues should be 0.
1317 if (left->fValue != right->fValue) {
1318 return left->fValue - right->fValue;
1319 }
1320
901 // We know they're both the same node type, so branch for the two cases. 1321 // We know they're both the same node type, so branch for the two cases.
902 if (left->fVertical) { 1322 if (left->fNodeType == kVerticalType) {
903 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID 1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
904 - ((BuildCompactTrieVerticalNode *)right)->fEqual->f NodeID; 1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
905 } 1325 }
906 else { 1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){
907 // We need to compare the links vectors. They should be the 1327 // We need to compare the links vectors. They should be the
908 // same size because the strings were equal. 1328 // same size because the strings were equal.
909 // We compare the node IDs instead of the pointers, to handle 1329 // We compare the node IDs instead of the pointers, to handle
910 // coalesced nodes. 1330 // coalesced nodes.
911 BuildCompactTrieHorizontalNode *hleft, *hright; 1331 BuildCompactTrieHorizontalNode *hleft, *hright;
912 hleft = (BuildCompactTrieHorizontalNode *)left; 1332 hleft = (BuildCompactTrieHorizontalNode *)left;
913 hright = (BuildCompactTrieHorizontalNode *)right; 1333 hright = (BuildCompactTrieHorizontalNode *)right;
914 int32_t count = hleft->fLinks.size(); 1334 int32_t count = hleft->fLinks.size();
915 for (int32_t i = 0; i < count && result == 0; ++i) { 1335 for (int32_t i = 0; i < count && result == 0; ++i) {
916 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - 1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
917 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
918 } 1338 }
919 } 1339 }
1340
920 // If they are equal to each other, mark them (speeds coalescing) 1341 // If they are equal to each other, mark them (speeds coalescing)
921 if (result == 0) { 1342 if (result == 0) {
922 left->fHasDuplicate = TRUE; 1343 left->fHasDuplicate = TRUE;
923 right->fHasDuplicate = TRUE; 1344 right->fHasDuplicate = TRUE;
924 } 1345 }
925 return result; 1346 return result;
926 } 1347 }
927 U_CDECL_END 1348 U_CDECL_END
928 U_NAMESPACE_BEGIN 1349 U_NAMESPACE_BEGIN
929 1350
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
1024 #ifdef DEBUG_TRIE_DICT 1445 #ifdef DEBUG_TRIE_DICT
1025 struct tms timing; 1446 struct tms timing;
1026 struct tms previous; 1447 struct tms previous;
1027 (void) ::times(&previous); 1448 (void) ::times(&previous);
1028 #endif 1449 #endif
1029 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes 1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes
1030 1451
1031 // Add node 0, used as the NULL pointer/sentinel. 1452 // Add node 0, used as the NULL pointer/sentinel.
1032 nodes.addElement((int32_t)0, status); 1453 nodes.addElement((int32_t)0, status);
1033 1454
1455 Hashtable *values = NULL; // Index of (unique) val ues
1456 if (dict.fValued) {
1457 values = new Hashtable(status);
1458 }
1459
1034 // Start by creating the special empty node we use to indicate that the pare nt 1460 // Start by creating the special empty node we use to indicate that the pare nt
1035 // terminates a word. This must be node 1, because the builder assumes 1461 // terminates a word. This must be node 1, because the builder assumes
1036 // that. 1462 // that. This node will never be used for tries storing numerical values.
1037 if (U_FAILURE(status)) { 1463 if (U_FAILURE(status)) {
1038 return NULL; 1464 return NULL;
1039 } 1465 }
1040 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes , status); 1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalT ype, nodes, status);
1041 if (terminal == NULL) { 1467 if (terminal == NULL) {
1042 status = U_MEMORY_ALLOCATION_ERROR; 1468 status = U_MEMORY_ALLOCATION_ERROR;
1043 } 1469 }
1044 1470
1045 // This call does all the work of building the new trie structure. The root 1471 // This call does all the work of building the new trie structure. The root
1046 // will be node 2. 1472 // will have node ID 2 before writing to memory.
1047 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status ); 1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status , values);
1048 #ifdef DEBUG_TRIE_DICT 1474 #ifdef DEBUG_TRIE_DICT
1049 (void) ::times(&timing); 1475 (void) ::times(&timing);
1050 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", 1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1051 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1052 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1053 previous = timing; 1479 previous = timing;
1054 #endif 1480 #endif
1055 1481
1056 // Now coalesce all duplicate nodes. 1482 // Now coalesce all duplicate nodes.
1057 coalesceDuplicates(nodes, status); 1483 coalesceDuplicates(nodes, status);
(...skipping 12 matching lines...) Expand all
1070 int32_t nodeCount = 1; // The sentinel node we already have 1496 int32_t nodeCount = 1; // The sentinel node we already have
1071 BuildCompactTrieNode *node; 1497 BuildCompactTrieNode *node;
1072 int32_t i; 1498 int32_t i;
1073 UVector32 translate(count, status); // Should be no growth needed after this 1499 UVector32 translate(count, status); // Should be no growth needed after this
1074 translate.push(0, status); // The sentinel node 1500 translate.push(0, status); // The sentinel node
1075 1501
1076 if (U_FAILURE(status)) { 1502 if (U_FAILURE(status)) {
1077 return NULL; 1503 return NULL;
1078 } 1504 }
1079 1505
1506 //map terminal value nodes
1507 int valueCount = 0;
1508 UVector valueNodes(status);
1509 if(values != NULL) {
1510 valueCount = values->count(); //number of unique terminal value nodes
1511 }
1512
1513 // map non-terminal nodes
1514 int valuePos = 1;//, nodePos = valueCount + valuePos;
1515 nodeCount = valueCount + valuePos;
1080 for (i = 1; i < count; ++i) { 1516 for (i = 1; i < count; ++i) {
1081 node = (BuildCompactTrieNode *)nodes[i]; 1517 node = (BuildCompactTrieNode *)nodes[i];
1082 if (node->fNodeID == i) { 1518 if (node->fNodeID == i) {
1083 // Only one node out of each duplicate set is used 1519 // Only one node out of each duplicate set is used
1084 if (i >= translate.size()) { 1520 if (node->fNodeID >= translate.size()) {
1085 // Logically extend the mapping table 1521 // Logically extend the mapping table
1086 translate.setSize(i+1); 1522 translate.setSize(i + 1);
1087 } 1523 }
1088 translate.setElementAt(nodeCount++, i); 1524 //translate.setElementAt(object, index)!
1525 if(node->fNodeType == kValueType) {
1526 valueNodes.addElement(node, status);
1527 translate.setElementAt(valuePos++, i);
1528 } else {
1529 translate.setElementAt(nodeCount++, i);
1530 }
1089 totalSize += node->size(); 1531 totalSize += node->size();
1090 } 1532 }
1091 } 1533 }
1092 1534
1093 // Check for overflowing 16 bits worth of nodes. 1535 // Check for overflowing 20 bits worth of nodes.
1094 if (nodeCount > 0x10000) { 1536 if (nodeCount > 0x100000) {
1095 status = U_ILLEGAL_ARGUMENT_ERROR; 1537 status = U_ILLEGAL_ARGUMENT_ERROR;
1096 return NULL; 1538 return NULL;
1097 } 1539 }
1098 1540
1099 // Add enough room for the offsets. 1541 // Add enough room for the offsets.
1100 totalSize += nodeCount*sizeof(uint32_t); 1542 totalSize += nodeCount*sizeof(uint32_t);
1101 #ifdef DEBUG_TRIE_DICT 1543 #ifdef DEBUG_TRIE_DICT
1102 (void) ::times(&timing); 1544 (void) ::times(&timing);
1103 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", 1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
1104 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1105 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1106 previous = timing; 1548 previous = timing;
1107 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize); 1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
1108 #endif 1550 #endif
1109 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); 1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
1110 if (bytes == NULL) { 1552 if (bytes == NULL) {
1111 status = U_MEMORY_ALLOCATION_ERROR; 1553 status = U_MEMORY_ALLOCATION_ERROR;
1112 return NULL; 1554 return NULL;
1113 } 1555 }
1114 1556
1115 CompactTrieHeader *header = (CompactTrieHeader *)bytes; 1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1116 header->size = totalSize; 1558 //header->size = totalSize;
1559 if(dict.fValued){
1560 header->magic = COMPACT_TRIE_MAGIC_3;
1561 } else {
1562 header->magic = COMPACT_TRIE_MAGIC_2;
1563 }
1117 header->nodeCount = nodeCount; 1564 header->nodeCount = nodeCount;
1118 header->offsets[0] = 0; // Sentinel 1565 header->offsets[0] = 0; // Sentinel
1119 header->root = translate.elementAti(root->fNodeID); 1566 header->root = translate.elementAti(root->fNodeID);
1120 #ifdef DEBUG_TRIE_DICT 1567 #ifdef DEBUG_TRIE_DICT
1121 if (header->root == 0) { 1568 if (header->root == 0) {
1122 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root ->fNodeID); 1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root ->fNodeID);
1123 } 1570 }
1124 #endif 1571 #endif
1125 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint 32_t)); 1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint 32_t));
1126 nodeCount = 1; 1573 nodeCount = valueCount + 1;
1574
1575 // Write terminal value nodes to memory
1576 for (i=0; i < valueNodes.size(); i++) {
1577 //header->offsets[i + 1] = offset;
1578 uint32_t tmpOffset = 0;
1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
1580 //header->offsets[i + 1] = (uint32_t)node->fValue;
1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
1582 }
1583
1127 // Now write the data 1584 // Now write the data
1128 for (i = 1; i < count; ++i) { 1585 for (i = 1; i < count; ++i) {
1129 node = (BuildCompactTrieNode *)nodes[i]; 1586 node = (BuildCompactTrieNode *)nodes[i];
1130 if (node->fNodeID == i) { 1587 if (node->fNodeID == i && node->fNodeType != kValueType) {
1131 header->offsets[nodeCount++] = offset; 1588 header->offsets[nodeCount++] = offset;
1132 node->write(bytes, offset, translate); 1589 node->write(bytes, offset, translate);
1133 } 1590 }
1134 } 1591 }
1592
1593 //free all extra space
1594 uprv_realloc(bytes, offset);
1595 header->size = offset;
1596
1135 #ifdef DEBUG_TRIE_DICT 1597 #ifdef DEBUG_TRIE_DICT
1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset);
1599
1136 (void) ::times(&timing); 1600 (void) ::times(&timing);
1137 fprintf(stderr, "Trie built, time user %f system %f\n", 1601 fprintf(stderr, "Trie built, time user %f system %f\n",
1138 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1139 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1140 previous = timing; 1604 previous = timing;
1141 fprintf(stderr, "Final offset is %d\n", offset); 1605 fprintf(stderr, "Final offset is %d\n", offset);
1142 1606
1143 // Collect statistics on node types and sizes 1607 // Collect statistics on node types and sizes
1144 int hCount = 0; 1608 int hCount = 0;
1145 int vCount = 0; 1609 int vCount = 0;
1146 size_t hSize = 0; 1610 size_t hSize = 0;
1147 size_t vSize = 0; 1611 size_t vSize = 0;
1148 size_t hItemCount = 0; 1612 size_t hItemCount = 0;
1149 size_t vItemCount = 0; 1613 size_t vItemCount = 0;
1150 uint32_t previousOff = offset; 1614 uint32_t previousOff = offset;
1151 for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1615 uint32_t numOverflow = 0;
1616 uint32_t valueSpace = 0;
1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1152 const CompactTrieNode *node = getCompactNode(header, nodeIdx); 1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1153 if (node->flagscount & kVerticalNode) { 1619 int itemCount;
1620 if(nodeIdx == header->root)
1621 itemCount = node->flagscount & kRootCountMask;
1622 else
1623 itemCount = getCount(node);
1624 if(node->flagscount & kEqualOverflows){
1625 numOverflow++;
1626 }
1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
1154 vCount += 1; 1628 vCount += 1;
1155 vItemCount += (node->flagscount & kCountMask); 1629 vItemCount += itemCount;
1156 vSize += previousOff-header->offsets[nodeIdx]; 1630 vSize += previousOff-header->offsets[nodeIdx];
1157 } 1631 }
1158 else { 1632 else {
1159 hCount += 1; 1633 hCount += 1;
1160 hItemCount += (node->flagscount & kCountMask); 1634 hItemCount += itemCount;
1161 hSize += previousOff-header->offsets[nodeIdx]; 1635 if(nodeIdx >= header->root) {
1636 hSize += previousOff-header->offsets[nodeIdx];
1637 }
1162 } 1638 }
1639
1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEn dsWord)
1641 valueSpace += sizeof(uint16_t);
1163 previousOff = header->offsets[nodeIdx]; 1642 previousOff = header->offsets[nodeIdx];
1164 } 1643 }
1165 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\ n", hCount, 1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\ n", hCount,
1166 (double)hSize/hCount, (double)hItemCount/hCount); 1645 (double)hSize/hCount, (double)hItemCount/hCount);
1167 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n" , vCount, 1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n" , vCount,
1168 (double)vSize/vCount, (double)vItemCount/vCount); 1647 (double)vSize/vCount, (double)vItemCount/vCount);
1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverfl ow);
1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
1169 #endif 1650 #endif
1170 1651
1171 if (U_FAILURE(status)) { 1652 if (U_FAILURE(status)) {
1172 uprv_free(bytes); 1653 uprv_free(bytes);
1173 header = NULL; 1654 header = NULL;
1174 } 1655 }
1175 else {
1176 header->magic = COMPACT_TRIE_MAGIC_1;
1177 }
1178 return header; 1656 return header;
1179 } 1657 }
1180 1658
1181 // Forward declaration 1659 // Forward declaration
1182 static TernaryNode * 1660 static TernaryNode *
1183 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr rorCode &status ); 1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC ode &status );
1184
1185 1662
1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie 1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1187 static TernaryNode * 1664 static TernaryNode *
1188 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizon talEntry *array, 1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalN ode *hnode,
1189 int low, int high, UErrorCode &status ) { 1666 int low, int high, int nodeCount, UErrorCode &status) {
1190 if (U_FAILURE(status) || low > high) { 1667 if (U_FAILURE(status) || low > high) {
1191 return NULL; 1668 return NULL;
1192 } 1669 }
1193 int middle = (low+high)/2; 1670 int middle = (low+high)/2;
1194 TernaryNode *result = new TernaryNode(array[middle].ch); 1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
1195 if (result == NULL) { 1672 if (result == NULL) {
1196 status = U_MEMORY_ALLOCATION_ERROR; 1673 status = U_MEMORY_ALLOCATION_ERROR;
1197 return NULL; 1674 return NULL;
1198 } 1675 }
1199 const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); 1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, mid dle, nodeCount));
1200 if (equal->flagscount & kParentEndsWord) { 1677 if (equal->flagscount & kParentEndsWord) {
1201 result->flags |= kEndsWord; 1678 if(info->magic == COMPACT_TRIE_MAGIC_3){
1679 result->flags = getValue(equal);
1680 }else{
1681 result->flags |= kEndsWord;
1682 }
1202 } 1683 }
1203 result->low = unpackHorizontalArray(header, array, low, middle-1, status); 1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, s tatus);
1204 result->high = unpackHorizontalArray(header, array, middle+1, high, status); 1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
1205 result->equal = unpackOneNode(header, equal, status); 1686 result->equal = unpackOneNode(info, equal, status);
1206 return result; 1687 return result;
1207 } 1688 }
1208 1689
1209 // Convert one compact trie node into a ternary subtrie 1690 // Convert one compact trie node into a ternary subtrie
1210 static TernaryNode * 1691 static TernaryNode *
1211 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UEr rorCode &status ) { 1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorC ode &status ) {
1212 int nodeCount = (node->flagscount & kCountMask); 1693 int nodeCount = getCount(node);
1213 if (nodeCount == 0 || U_FAILURE(status)) { 1694 if (nodeCount == 0 || U_FAILURE(status)) {
1214 // Failure, or terminal node 1695 // Failure, or terminal node
1215 return NULL; 1696 return NULL;
1216 } 1697 }
1217 if (node->flagscount & kVerticalNode) { 1698 if (node->flagscount & kVerticalNode) {
1218 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *) node; 1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *) node;
1219 TernaryNode *head = NULL; 1700 TernaryNode *head = NULL;
1220 TernaryNode *previous = NULL; 1701 TernaryNode *previous = NULL;
1221 TernaryNode *latest = NULL; 1702 TernaryNode *latest = NULL;
1222 for (int i = 0; i < nodeCount; ++i) { 1703 for (int i = 0; i < nodeCount; ++i) {
1223 latest = new TernaryNode(vnode->chars[i]); 1704 latest = new TernaryNode(vnode->chars[i]);
1224 if (latest == NULL) { 1705 if (latest == NULL) {
1225 status = U_MEMORY_ALLOCATION_ERROR; 1706 status = U_MEMORY_ALLOCATION_ERROR;
1226 break; 1707 break;
1227 } 1708 }
1228 if (head == NULL) { 1709 if (head == NULL) {
1229 head = latest; 1710 head = latest;
1230 } 1711 }
1231 if (previous != NULL) { 1712 if (previous != NULL) {
1232 previous->equal = latest; 1713 previous->equal = latest;
1233 } 1714 }
1234 previous = latest; 1715 previous = latest;
1235 } 1716 }
1236 if (latest != NULL) { 1717 if (latest != NULL) {
1237 const CompactTrieNode *equal = getCompactNode(header, vnode->equal); 1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vn ode));
1238 if (equal->flagscount & kParentEndsWord) { 1719 if (equal->flagscount & kParentEndsWord) {
1239 latest->flags |= kEndsWord; 1720 if(info->magic == COMPACT_TRIE_MAGIC_3){
1721 latest->flags = getValue(equal);
1722 } else {
1723 latest->flags |= kEndsWord;
1724 }
1240 } 1725 }
1241 latest->equal = unpackOneNode(header, equal, status); 1726 latest->equal = unpackOneNode(info, equal, status);
1242 } 1727 }
1243 return head; 1728 return head;
1244 } 1729 }
1245 else { 1730 else {
1246 // Horizontal node 1731 // Horizontal node
1247 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod e *)node; 1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNod e *)node;
1248 return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status); 1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, sta tus);
1249 } 1734 }
1250 } 1735 }
1251 1736
1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary
1252 MutableTrieDictionary * 1738 MutableTrieDictionary *
1253 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { 1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1254 MutableTrieDictionary *result = new MutableTrieDictionary( status ); 1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->ma gic == COMPACT_TRIE_MAGIC_3 );
1255 if (result == NULL) { 1741 if (result == NULL) {
1256 status = U_MEMORY_ALLOCATION_ERROR; 1742 status = U_MEMORY_ALLOCATION_ERROR;
1257 return NULL; 1743 return NULL;
1258 } 1744 }
1259 TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status); 1745 // treat root node as special case: don't call unpackOneNode() or unpackHori zontalArray() directly
1746 // because only kEqualOverflows flag should be checked in root's flagscount
1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
1748 getCompactNode(fInfo, fInfo->root);
1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask;
1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
1751 nodeCount, status);
1752
1260 if (U_FAILURE(status)) { 1753 if (U_FAILURE(status)) {
1261 delete root; // Clean up 1754 delete root; // Clean up
1262 delete result; 1755 delete result;
1263 return NULL; 1756 return NULL;
1264 } 1757 }
1265 result->fTrie = root; 1758 result->fTrie = root;
1266 return result; 1759 return result;
1267 } 1760 }
1268 1761
1269 U_NAMESPACE_END 1762 U_NAMESPACE_END
1270 1763
1271 U_CAPI int32_t U_EXPORT2 1764 U_CAPI int32_t U_EXPORT2
1272 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void * outData, 1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void * outData,
1273 UErrorCode *status) { 1766 UErrorCode *status) {
1274 1767
1275 if (status == NULL || U_FAILURE(*status)) { 1768 if (status == NULL || U_FAILURE(*status)) {
1276 return 0; 1769 return 0;
1277 } 1770 }
1278 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
1279 *status=U_ILLEGAL_ARGUMENT_ERROR; 1772 *status=U_ILLEGAL_ARGUMENT_ERROR;
1280 return 0; 1773 return 0;
1281 } 1774 }
1282 1775
1283 // 1776 //
1284 // Check that the data header is for for dictionary data. 1777 // Check that the data header is for for dictionary data.
1285 // (Header contents are defined in genxxx.cpp) 1778 // (Header contents are defined in genxxx.cpp)
1286 // 1779 //
1287 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); 1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
1288 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ 1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
1289 pInfo->dataFormat[1]==0x72 && 1782 pInfo->dataFormat[1]==0x72 &&
1290 pInfo->dataFormat[2]==0x44 && 1783 pInfo->dataFormat[2]==0x44 &&
1291 pInfo->dataFormat[3]==0x63 && 1784 pInfo->dataFormat[3]==0x63 &&
1292 pInfo->formatVersion[0]==1 )) { 1785 pInfo->formatVersion[0]==1 )) {
1293 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x ( format version %02x) is not recognized\n", 1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x ( format version %02x) is not recognized\n",
1294 pInfo->dataFormat[0], pInfo->dataFormat[1], 1787 pInfo->dataFormat[0], pInfo->dataFormat[1],
1295 pInfo->dataFormat[2], pInfo->dataFormat[3], 1788 pInfo->dataFormat[2], pInfo->dataFormat[3],
1296 pInfo->formatVersion[0]); 1789 pInfo->formatVersion[0]);
1297 *status=U_UNSUPPORTED_ERROR; 1790 *status=U_UNSUPPORTED_ERROR;
1298 return 0; 1791 return 0;
1299 } 1792 }
1300 1793
1301 // 1794 //
1302 // Swap the data header. (This is the generic ICU Data Header, not the 1795 // Swap the data header. (This is the generic ICU Data Header, not the
1303 // CompactTrieHeader). This swap also conveniently gets us 1796 // CompactTrieHeader). This swap also conveniently gets us
1304 // the size of the ICU d.h., which lets us locate th e start 1797 // the size of the ICU d.h., which lets us locate th e start
1305 // of the RBBI specific data. 1798 // of the RBBI specific data.
1306 // 1799 //
1307 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status) ; 1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status) ;
1308 1801
1309 // 1802 //
1310 // Get the CompactTrieHeader, and check that it appears to be OK. 1803 // Get the CompactTrieHeader, and check that it appears to be OK.
1311 // 1804 //
1312 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
1313 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; 1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
1314 if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 1807 uint32_t magic = ds->readUInt32(header->magic);
1315 || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s izeof(CompactTrieHeaderV1)
1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < s izeof(CompactTrieHeader))
1316 { 1811 {
1317 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n") ; 1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n") ;
1318 *status=U_UNSUPPORTED_ERROR; 1813 *status=U_UNSUPPORTED_ERROR;
1319 return 0; 1814 return 0;
1320 } 1815 }
1321 1816
1322 // 1817 //
1323 // Prefight operation? Just return the size 1818 // Prefight operation? Just return the size
1324 // 1819 //
1325 uint32_t totalSize = ds->readUInt32(header->size); 1820 uint32_t totalSize = ds->readUInt32(header->size);
1326 int32_t sizeWithUData = (int32_t)totalSize + headerSize; 1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize;
1327 if (length < 0) { 1822 if (length < 0) {
1328 return sizeWithUData; 1823 return sizeWithUData;
1329 } 1824 }
1330 1825
1331 // 1826 //
1332 // Check that length passed in is consistent with length from RBBI data head er. 1827 // Check that length passed in is consistent with length from RBBI data head er.
1333 // 1828 //
1334 if (length < sizeWithUData) { 1829 if (length < sizeWithUData) {
1335 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", 1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1336 totalSize); 1831 totalSize);
1337 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1832 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1338 return 0; 1833 return 0;
1339 } 1834 }
1340 1835
1341 // 1836 //
1342 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec ause 1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, bec ause
1343 // we need to reference the header to locate the data, and a n 1838 // we need to reference the header to locate the data, and a n
1344 // inplace swap of the header leaves it unusable. 1839 // inplace swap of the header leaves it unusable.
1345 // 1840 //
1346 uint8_t *outBytes = (uint8_t *)outData + headerSize; 1841 uint8_t *outBytes = (uint8_t *)outData + headerSize;
1347 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; 1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes;
1348 1843
1349 #if 0 1844 #if 0
1350 // 1845 //
1351 // If not swapping in place, zero out the output buffer before starting. 1846 // If not swapping in place, zero out the output buffer before starting.
1352 // 1847 //
1353 if (inBytes != outBytes) { 1848 if (inBytes != outBytes) {
1354 uprv_memset(outBytes, 0, totalSize); 1849 uprv_memset(outBytes, 0, totalSize);
1355 } 1850 }
1356 1851
1357 // We need to loop through all the nodes in the offset table, and swap each one. 1852 // We need to loop through all the nodes in the offset table, and swap each one.
1358 uint16_t nodeCount = ds->readUInt16(header->nodeCount); 1853 uint32_t nodeCount, rootId;
1854 if(header->magic == COMPACT_TRIE_MAGIC_1) {
1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
1857 } else {
1858 nodeCount = ds->readUInt32(header->nodeCount);
1859 rootId = ds->readUInt32(header->root);
1860 }
1861
1359 // Skip node 0, which should always be 0. 1862 // Skip node 0, which should always be 0.
1360 for (int i = 1; i < nodeCount; ++i) { 1863 for (uint32_t i = 1; i < nodeCount; ++i) {
1361 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); 1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
1362 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node Off); 1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + node Off);
1363 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); 1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
1364 uint16_t flagscount = ds->readUInt16(inNode->flagscount); 1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount);
1365 uint16_t itemCount = flagscount & kCountMask; 1868 uint16_t itemCount = getCount(inNode);
1869 //uint16_t itemCount = flagscount & kCountMask;
1366 ds->writeUInt16(&outNode->flagscount, flagscount); 1870 ds->writeUInt16(&outNode->flagscount, flagscount);
1367 if (itemCount > 0) { 1871 if (itemCount > 0) {
1368 if (flagscount & kVerticalNode) { 1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swap ped
1873 if (flagscount & kVerticalNode && i != rootId) {
1874 if(flagscount & kEqualOverflows){
1875 // include overflow bits
1876 overflow += 1;
1877 }
1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsP arentWord) {
1879 //include values
1880 overflow += 1;
1881 }
1369 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical Node,chars), 1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVertical Node,chars),
1370 itemCount*sizeof(uint16_t), 1883 (itemCount + overflow)*sizeof(uint16_t),
1371 outBytes+nodeOff+offsetof(CompactTrieVertica lNode,chars), status); 1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars) , status);
1372 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact TrieVerticalNode,equal); 1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(Compact TrieVerticalNode,equal);
1373 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod e,equal)); 1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNod e,equal));
1374 } 1887 }
1375 else { 1888 else {
1376 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor izontalNode *)inNode; 1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHor izontalNode *)inNode;
1377 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode; 1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
1378 for (int j = 0; j < itemCount; ++j) { 1891 for (int j = 0; j < itemCount; ++j) {
1379 uint16_t word = ds->readUInt16(inHNode->entries[j].ch); 1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
1380 ds->writeUInt16(&outHNode->entries[j].ch, word); 1893 ds->writeUInt16(&outHNode->entries[j].ch, word);
1381 word = ds->readUInt16(inHNode->entries[j].equal); 1894 word = ds->readUInt16(inHNode->entries[j].equal);
1382 ds->writeUInt16(&outHNode->entries[j].equal, word); 1895 ds->writeUInt16(&outHNode->entries[j].equal, word);
1383 } 1896 }
1897
1898 // swap overflow/value information
1899 if(flagscount & kEqualOverflows){
1900 overflow += (itemCount + 3) / 4;
1901 }
1902
1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flag scount & kEndsParentWord) {
1904 //include values
1905 overflow += 1;
1906 }
1907
1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount] ;
1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCoun t];
1910 for(int j = 0; j<overflow; j++){
1911 uint16_t extraInfo = ds->readUInt16(*inOverflow);
1912 ds->writeUInt16(outOverflow, extraInfo);
1913
1914 inOverflow++;
1915 outOverflow++;
1916 }
1384 } 1917 }
1385 } 1918 }
1386 } 1919 }
1387 #endif 1920 #endif
1388 1921
1922 // Swap the header
1923 ds->writeUInt32(&outputHeader->size, totalSize);
1924 ds->writeUInt32(&outputHeader->magic, magic);
1925
1926 uint32_t nodeCount;
1927 uint32_t offsetPos;
1928 if (header->magic == COMPACT_TRIE_MAGIC_1) {
1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeade r;
1931
1932 nodeCount = ds->readUInt16(headerV1->nodeCount);
1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
1934 uint16_t root = ds->readUInt16(headerV1->root);
1935 ds->writeUInt16(&outputHeaderV1->root, root);
1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets);
1937 } else {
1938 nodeCount = ds->readUInt32(header->nodeCount);
1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
1940 uint32_t root = ds->readUInt32(header->root);
1941 ds->writeUInt32(&outputHeader->root, root);
1942 offsetPos = offsetof(CompactTrieHeader,offsets);
1943 }
1944
1389 // All the data in all the nodes consist of 16 bit items. Swap them all at o nce. 1945 // All the data in all the nodes consist of 16 bit items. Swap them all at o nce.
1390 uint16_t nodeCount = ds->readUInt16(header->nodeCount); 1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
1391 uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount *sizeof(uint32_t));
1392 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); 1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
1393 1948
1394 // Swap the header 1949 //swap offsets
1395 ds->writeUInt32(&outputHeader->size, totalSize); 1950 ds->swapArray32(ds, inBytes+offsetPos,
1396 uint32_t magic = ds->readUInt32(header->magic); 1951 sizeof(uint32_t)*(uint32_t)nodeCount,
1397 ds->writeUInt32(&outputHeader->magic, magic); 1952 outBytes+offsetPos, status);
1398 ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
1399 uint16_t root = ds->readUInt16(header->root);
1400 ds->writeUInt16(&outputHeader->root, root);
1401 ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
1402 sizeof(uint32_t)*(int32_t)nodeCount,
1403 outBytes+offsetof(CompactTrieHeader,offsets), status);
1404 1953
1405 return sizeWithUData; 1954 return sizeWithUData;
1406 } 1955 }
1407 1956
1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
OLDNEW
« no previous file with comments | « icu46/source/common/triedict.h ('k') | icu46/source/data/Makefile.in » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698