Index: icu46/source/common/triedict.cpp |
=================================================================== |
--- icu46/source/common/triedict.cpp (revision 0) |
+++ icu46/source/common/triedict.cpp (revision 0) |
@@ -0,0 +1,1408 @@ |
+/** |
+ ******************************************************************************* |
+ * Copyright (C) 2006-2008, International Business Machines Corporation * |
+ * and others. All Rights Reserved. * |
+ ******************************************************************************* |
+ */ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_BREAK_ITERATION |
+ |
+#include "triedict.h" |
+#include "unicode/chariter.h" |
+#include "unicode/uchriter.h" |
+#include "unicode/strenum.h" |
+#include "unicode/uenum.h" |
+#include "unicode/udata.h" |
+#include "cmemory.h" |
+#include "udataswp.h" |
+#include "uvector.h" |
+#include "uvectr32.h" |
+#include "uarrsort.h" |
+ |
+//#define DEBUG_TRIE_DICT 1 |
+ |
+#ifdef DEBUG_TRIE_DICT |
+#include <sys/times.h> |
+#include <limits.h> |
+#include <stdio.h> |
+#endif |
+ |
+U_NAMESPACE_BEGIN |
+ |
+/******************************************************************* |
+ * TrieWordDictionary |
+ */ |
+ |
+TrieWordDictionary::TrieWordDictionary() { |
+} |
+ |
+TrieWordDictionary::~TrieWordDictionary() { |
+} |
+ |
+/******************************************************************* |
+ * MutableTrieDictionary |
+ */ |
+ |
+// Node structure for the ternary, uncompressed trie |
+struct TernaryNode : public UMemory { |
+ UChar ch; // UTF-16 code unit |
+ uint16_t flags; // Flag word |
+ TernaryNode *low; // Less-than link |
+ TernaryNode *equal; // Equal link |
+ TernaryNode *high; // Greater-than link |
+ |
+ TernaryNode(UChar uc); |
+ ~TernaryNode(); |
+}; |
+ |
+enum MutableTrieNodeFlags { |
+ kEndsWord = 0x0001 // This node marks the end of a valid word |
+}; |
+ |
+inline |
+TernaryNode::TernaryNode(UChar uc) { |
+ ch = uc; |
+ flags = 0; |
+ low = NULL; |
+ equal = NULL; |
+ high = NULL; |
+} |
+ |
+// Not inline since it's recursive |
+TernaryNode::~TernaryNode() { |
+ delete low; |
+ delete equal; |
+ delete high; |
+} |
+ |
+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { |
+ // Start the trie off with something. Having the root node already present |
+ // cuts a special case out of the search/insertion functions. |
+ // Making it a median character cuts the worse case for searches from |
+ // 4x a balanced trie to 2x a balanced trie. It's best to choose something |
+ // that starts a word that is midway in the list. |
+ fTrie = new TernaryNode(median); |
+ if (fTrie == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+ fIter = utext_openUChars(NULL, NULL, 0, &status); |
+ if (U_SUCCESS(status) && fIter == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+} |
+ |
+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { |
+ fTrie = NULL; |
+ fIter = utext_openUChars(NULL, NULL, 0, &status); |
+ if (U_SUCCESS(status) && fIter == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+} |
+ |
+MutableTrieDictionary::~MutableTrieDictionary() { |
+ delete fTrie; |
+ utext_close(fIter); |
+} |
+ |
+int32_t |
+MutableTrieDictionary::search( UText *text, |
+ int32_t maxLength, |
+ int32_t *lengths, |
+ int &count, |
+ int limit, |
+ TernaryNode *&parent, |
+ UBool &pMatched ) const { |
+ // TODO: current implementation works in UTF-16 space |
+ const TernaryNode *up = NULL; |
+ const TernaryNode *p = fTrie; |
+ int mycount = 0; |
+ pMatched = TRUE; |
+ int i; |
+ |
+ UChar uc = utext_current32(text); |
+ for (i = 0; i < maxLength && p != NULL; ++i) { |
+ while (p != NULL) { |
+ if (uc < p->ch) { |
+ up = p; |
+ p = p->low; |
+ } |
+ else if (uc == p->ch) { |
+ break; |
+ } |
+ else { |
+ up = p; |
+ p = p->high; |
+ } |
+ } |
+ if (p == NULL) { |
+ pMatched = FALSE; |
+ break; |
+ } |
+ // Must be equal to get here |
+ if (limit > 0 && (p->flags & kEndsWord)) { |
+ lengths[mycount++] = i+1; |
+ --limit; |
+ } |
+ up = p; |
+ p = p->equal; |
+ uc = utext_next32(text); |
+ uc = utext_current32(text); |
+ } |
+ |
+ // Note that there is no way to reach here with up == 0 unless |
+ // maxLength is 0 coming in. |
+ parent = (TernaryNode *)up; |
+ count = mycount; |
+ return i; |
+} |
+ |
+void |
+MutableTrieDictionary::addWord( const UChar *word, |
+ int32_t length, |
+ UErrorCode &status ) { |
+#if 0 |
+ if (length <= 0) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; |
+ return; |
+ } |
+#endif |
+ TernaryNode *parent; |
+ UBool pMatched; |
+ int count; |
+ fIter = utext_openUChars(fIter, word, length, &status); |
+ |
+ int matched; |
+ matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
+ |
+ while (matched++ < length) { |
+ UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? |
+ U_ASSERT(uc != U_SENTINEL); |
+ TernaryNode *newNode = new TernaryNode(uc); |
+ if (newNode == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return; |
+ } |
+ if (pMatched) { |
+ parent->equal = newNode; |
+ } |
+ else { |
+ pMatched = TRUE; |
+ if (uc < parent->ch) { |
+ parent->low = newNode; |
+ } |
+ else { |
+ parent->high = newNode; |
+ } |
+ } |
+ parent = newNode; |
+ } |
+ |
+ parent->flags |= kEndsWord; |
+} |
+ |
+#if 0 |
+void |
+MutableTrieDictionary::addWords( UEnumeration *words, |
+ UErrorCode &status ) { |
+ int32_t length; |
+ const UChar *word; |
+ while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { |
+ addWord(word, length, status); |
+ } |
+} |
+#endif |
+ |
+int32_t |
+MutableTrieDictionary::matches( UText *text, |
+ int32_t maxLength, |
+ int32_t *lengths, |
+ int &count, |
+ int limit ) const { |
+ TernaryNode *parent; |
+ UBool pMatched; |
+ return search(text, maxLength, lengths, count, limit, parent, pMatched); |
+} |
+ |
+// Implementation of iteration for MutableTrieDictionary |
+class MutableTrieEnumeration : public StringEnumeration { |
+private: |
+ UStack fNodeStack; // Stack of nodes to process |
+ UVector32 fBranchStack; // Stack of which branch we are working on |
+ TernaryNode *fRoot; // Root node |
+ enum StackBranch { |
+ kLessThan, |
+ kEqual, |
+ kGreaterThan, |
+ kDone |
+ }; |
+ |
+public: |
+ static UClassID U_EXPORT2 getStaticClassID(void); |
+ virtual UClassID getDynamicClassID(void) const; |
+public: |
+ MutableTrieEnumeration(TernaryNode *root, UErrorCode &status) |
+ : fNodeStack(status), fBranchStack(status) { |
+ fRoot = root; |
+ fNodeStack.push(root, status); |
+ fBranchStack.push(kLessThan, status); |
+ unistr.remove(); |
+ } |
+ |
+ virtual ~MutableTrieEnumeration() { |
+ } |
+ |
+ virtual StringEnumeration *clone() const { |
+ UErrorCode status = U_ZERO_ERROR; |
+ return new MutableTrieEnumeration(fRoot, status); |
+ } |
+ |
+ virtual const UnicodeString *snext(UErrorCode &status) { |
+ if (fNodeStack.empty() || U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ TernaryNode *node = (TernaryNode *) fNodeStack.peek(); |
+ StackBranch where = (StackBranch) fBranchStack.peeki(); |
+ while (!fNodeStack.empty() && U_SUCCESS(status)) { |
+ UBool emit; |
+ UBool equal; |
+ |
+ switch (where) { |
+ case kLessThan: |
+ if (node->low != NULL) { |
+ fBranchStack.setElementAt(kEqual, fBranchStack.size()-1); |
+ node = (TernaryNode *) fNodeStack.push(node->low, status); |
+ where = (StackBranch) fBranchStack.push(kLessThan, status); |
+ break; |
+ } |
+ case kEqual: |
+ emit = (node->flags & kEndsWord) != 0; |
+ equal = (node->equal != NULL); |
+ // If this node should be part of the next emitted string, append |
+ // the UChar to the string, and make sure we pop it when we come |
+ // back to this node. The character should only be in the string |
+ // for as long as we're traversing the equal subtree of this node |
+ if (equal || emit) { |
+ unistr.append(node->ch); |
+ fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1); |
+ } |
+ if (equal) { |
+ node = (TernaryNode *) fNodeStack.push(node->equal, status); |
+ where = (StackBranch) fBranchStack.push(kLessThan, status); |
+ } |
+ if (emit) { |
+ return &unistr; |
+ } |
+ if (equal) { |
+ break; |
+ } |
+ case kGreaterThan: |
+ // If this node's character is in the string, remove it. |
+ if (node->equal != NULL || (node->flags & kEndsWord)) { |
+ unistr.truncate(unistr.length()-1); |
+ } |
+ if (node->high != NULL) { |
+ fBranchStack.setElementAt(kDone, fBranchStack.size()-1); |
+ node = (TernaryNode *) fNodeStack.push(node->high, status); |
+ where = (StackBranch) fBranchStack.push(kLessThan, status); |
+ break; |
+ } |
+ case kDone: |
+ fNodeStack.pop(); |
+ fBranchStack.popi(); |
+ node = (TernaryNode *) fNodeStack.peek(); |
+ where = (StackBranch) fBranchStack.peeki(); |
+ break; |
+ default: |
+ return NULL; |
+ } |
+ } |
+ return NULL; |
+ } |
+ |
+ // Very expensive, but this should never be used. |
+ virtual int32_t count(UErrorCode &status) const { |
+ MutableTrieEnumeration counter(fRoot, status); |
+ int32_t result = 0; |
+ while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
+ ++result; |
+ } |
+ return result; |
+ } |
+ |
+ virtual void reset(UErrorCode &status) { |
+ fNodeStack.removeAllElements(); |
+ fBranchStack.removeAllElements(); |
+ fNodeStack.push(fRoot, status); |
+ fBranchStack.push(kLessThan, status); |
+ unistr.remove(); |
+ } |
+}; |
+ |
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration) |
+ |
+StringEnumeration * |
+MutableTrieDictionary::openWords( UErrorCode &status ) const { |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ return new MutableTrieEnumeration(fTrie, status); |
+} |
+ |
+/******************************************************************* |
+ * CompactTrieDictionary |
+ */ |
+ |
+struct CompactTrieHeader { |
+ uint32_t size; // Size of the data in bytes |
+ uint32_t magic; // Magic number (including version) |
+ uint16_t nodeCount; // Number of entries in offsets[] |
+ uint16_t root; // Node number of the root node |
+ uint32_t offsets[1]; // Offsets to nodes from start of data |
+}; |
+ |
+// Note that to avoid platform-specific alignment issues, all members of the node |
+// structures should be the same size, or should contain explicit padding to |
+// natural alignment boundaries. |
+ |
+// We can't use a bitfield for the flags+count field, because the layout of those |
+// is not portable. 12 bits of count allows for up to 4096 entries in a node. |
+struct CompactTrieNode { |
+ uint16_t flagscount; // Count of sub-entries, plus flags |
+}; |
+ |
+enum CompactTrieNodeFlags { |
+ kVerticalNode = 0x1000, // This is a vertical node |
+ kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word |
+ kReservedFlag1 = 0x4000, |
+ kReservedFlag2 = 0x8000, |
+ kCountMask = 0x0FFF, // The count portion of flagscount |
+ kFlagMask = 0xF000 // The flags portion of flagscount |
+}; |
+ |
+// The two node types are distinguished by the kVerticalNode flag. |
+ |
+struct CompactTrieHorizontalEntry { |
+ uint16_t ch; // UChar |
+ uint16_t equal; // Equal link node index |
+}; |
+ |
+// We don't use inheritance here because C++ does not guarantee that the |
+// base class comes first in memory!! |
+ |
+struct CompactTrieHorizontalNode { |
+ uint16_t flagscount; // Count of sub-entries, plus flags |
+ CompactTrieHorizontalEntry entries[1]; |
+}; |
+ |
+struct CompactTrieVerticalNode { |
+ uint16_t flagscount; // Count of sub-entries, plus flags |
+ uint16_t equal; // Equal link node index |
+ uint16_t chars[1]; // Code units |
+}; |
+ |
+// {'Dic', 1}, version 1 |
+#define COMPACT_TRIE_MAGIC_1 0x44696301 |
+ |
+CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
+ UErrorCode &status ) |
+: fUData(dataObj) |
+{ |
+ fData = (const CompactTrieHeader *) udata_getMemory(dataObj); |
+ fOwnData = FALSE; |
+ if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; |
+ fData = NULL; |
+ } |
+} |
+CompactTrieDictionary::CompactTrieDictionary( const void *data, |
+ UErrorCode &status ) |
+: fUData(NULL) |
+{ |
+ fData = (const CompactTrieHeader *) data; |
+ fOwnData = FALSE; |
+ if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; |
+ fData = NULL; |
+ } |
+} |
+ |
+CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, |
+ UErrorCode &status ) |
+: fUData(NULL) |
+{ |
+ fData = compactMutableTrieDictionary(dict, status); |
+ fOwnData = !U_FAILURE(status); |
+} |
+ |
+CompactTrieDictionary::~CompactTrieDictionary() { |
+ if (fOwnData) { |
+ uprv_free((void *)fData); |
+ } |
+ if (fUData) { |
+ udata_close(fUData); |
+ } |
+} |
+ |
+uint32_t |
+CompactTrieDictionary::dataSize() const { |
+ return fData->size; |
+} |
+ |
+const void * |
+CompactTrieDictionary::data() const { |
+ return fData; |
+} |
+ |
+// This function finds the address of a node for us, given its node ID |
+static inline const CompactTrieNode * |
+getCompactNode(const CompactTrieHeader *header, uint16_t node) { |
+ return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); |
+} |
+ |
+int32_t |
+CompactTrieDictionary::matches( UText *text, |
+ int32_t maxLength, |
+ int32_t *lengths, |
+ int &count, |
+ int limit ) const { |
+ // TODO: current implementation works in UTF-16 space |
+ const CompactTrieNode *node = getCompactNode(fData, fData->root); |
+ int mycount = 0; |
+ |
+ UChar uc = utext_current32(text); |
+ int i = 0; |
+ |
+ while (node != NULL) { |
+ // Check if the node we just exited ends a word |
+ if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
+ lengths[mycount++] = i; |
+ --limit; |
+ } |
+ // Check that we haven't exceeded the maximum number of input characters. |
+ // We have to do that here rather than in the while condition so that |
+ // we can check for ending a word, above. |
+ if (i >= maxLength) { |
+ break; |
+ } |
+ |
+ int nodeCount = (node->flagscount & kCountMask); |
+ if (nodeCount == 0) { |
+ // Special terminal node; return now |
+ break; |
+ } |
+ if (node->flagscount & kVerticalNode) { |
+ // Vertical node; check all the characters in it |
+ const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; |
+ for (int j = 0; j < nodeCount && i < maxLength; ++j) { |
+ if (uc != vnode->chars[j]) { |
+ // We hit a non-equal character; return |
+ goto exit; |
+ } |
+ utext_next32(text); |
+ uc = utext_current32(text); |
+ ++i; |
+ } |
+ // To get here we must have come through the whole list successfully; |
+ // go on to the next node. Note that a word cannot end in the middle |
+ // of a vertical node. |
+ node = getCompactNode(fData, vnode->equal); |
+ } |
+ else { |
+ // Horizontal node; do binary search |
+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; |
+ int low = 0; |
+ int high = nodeCount-1; |
+ int middle; |
+ node = NULL; // If we don't find a match, we'll fall out of the loop |
+ while (high >= low) { |
+ middle = (high+low)/2; |
+ if (uc == hnode->entries[middle].ch) { |
+ // We hit a match; get the next node and next character |
+ node = getCompactNode(fData, hnode->entries[middle].equal); |
+ utext_next32(text); |
+ uc = utext_current32(text); |
+ ++i; |
+ break; |
+ } |
+ else if (uc < hnode->entries[middle].ch) { |
+ high = middle-1; |
+ } |
+ else { |
+ low = middle+1; |
+ } |
+ } |
+ } |
+ } |
+exit: |
+ count = mycount; |
+ return i; |
+} |
+ |
+// Implementation of iteration for CompactTrieDictionary |
+class CompactTrieEnumeration : public StringEnumeration { |
+private: |
+ UVector32 fNodeStack; // Stack of nodes to process |
+ UVector32 fIndexStack; // Stack of where in node we are |
+ const CompactTrieHeader *fHeader; // Trie data |
+ |
+public: |
+ static UClassID U_EXPORT2 getStaticClassID(void); |
+ virtual UClassID getDynamicClassID(void) const; |
+public: |
+ CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) |
+ : fNodeStack(status), fIndexStack(status) { |
+ fHeader = header; |
+ fNodeStack.push(header->root, status); |
+ fIndexStack.push(0, status); |
+ unistr.remove(); |
+ } |
+ |
+ virtual ~CompactTrieEnumeration() { |
+ } |
+ |
+ virtual StringEnumeration *clone() const { |
+ UErrorCode status = U_ZERO_ERROR; |
+ return new CompactTrieEnumeration(fHeader, status); |
+ } |
+ |
+ virtual const UnicodeString * snext(UErrorCode &status); |
+ |
+ // Very expensive, but this should never be used. |
+ virtual int32_t count(UErrorCode &status) const { |
+ CompactTrieEnumeration counter(fHeader, status); |
+ int32_t result = 0; |
+ while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
+ ++result; |
+ } |
+ return result; |
+ } |
+ |
+ virtual void reset(UErrorCode &status) { |
+ fNodeStack.removeAllElements(); |
+ fIndexStack.removeAllElements(); |
+ fNodeStack.push(fHeader->root, status); |
+ fIndexStack.push(0, status); |
+ unistr.remove(); |
+ } |
+}; |
+ |
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) |
+ |
+const UnicodeString * |
+CompactTrieEnumeration::snext(UErrorCode &status) { |
+ if (fNodeStack.empty() || U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); |
+ int where = fIndexStack.peeki(); |
+ while (!fNodeStack.empty() && U_SUCCESS(status)) { |
+ int nodeCount = (node->flagscount & kCountMask); |
+ UBool goingDown = FALSE; |
+ if (nodeCount == 0) { |
+ // Terminal node; go up immediately |
+ fNodeStack.popi(); |
+ fIndexStack.popi(); |
+ node = getCompactNode(fHeader, fNodeStack.peeki()); |
+ where = fIndexStack.peeki(); |
+ } |
+ else if (node->flagscount & kVerticalNode) { |
+ // Vertical node |
+ const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; |
+ if (where == 0) { |
+ // Going down |
+ unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); |
+ fIndexStack.setElementAt(1, fIndexStack.size()-1); |
+ node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status)); |
+ where = fIndexStack.push(0, status); |
+ goingDown = TRUE; |
+ } |
+ else { |
+ // Going up |
+ unistr.truncate(unistr.length()-nodeCount); |
+ fNodeStack.popi(); |
+ fIndexStack.popi(); |
+ node = getCompactNode(fHeader, fNodeStack.peeki()); |
+ where = fIndexStack.peeki(); |
+ } |
+ } |
+ else { |
+ // Horizontal node |
+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; |
+ if (where > 0) { |
+ // Pop previous char |
+ unistr.truncate(unistr.length()-1); |
+ } |
+ if (where < nodeCount) { |
+ // Push on next node |
+ unistr.append((UChar)hnode->entries[where].ch); |
+ fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
+ node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status)); |
+ where = fIndexStack.push(0, status); |
+ goingDown = TRUE; |
+ } |
+ else { |
+ // Going up |
+ fNodeStack.popi(); |
+ fIndexStack.popi(); |
+ node = getCompactNode(fHeader, fNodeStack.peeki()); |
+ where = fIndexStack.peeki(); |
+ } |
+ } |
+ // Check if the parent of the node we've just gone down to ends a |
+ // word. If so, return it. |
+ if (goingDown && (node->flagscount & kParentEndsWord)) { |
+ return &unistr; |
+ } |
+ } |
+ return NULL; |
+} |
+ |
+StringEnumeration * |
+CompactTrieDictionary::openWords( UErrorCode &status ) const { |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ return new CompactTrieEnumeration(fData, status); |
+} |
+ |
+// |
+// Below here is all code related to converting a ternary trie to a compact trie |
+// and back again |
+// |
+ |
+// Helper classes to construct the compact trie |
+class BuildCompactTrieNode: public UMemory { |
+ public: |
+ UBool fParentEndsWord; |
+ UBool fVertical; |
+ UBool fHasDuplicate; |
+ int32_t fNodeID; |
+ UnicodeString fChars; |
+ |
+ public: |
+ BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) { |
+ fParentEndsWord = parentEndsWord; |
+ fHasDuplicate = FALSE; |
+ fVertical = vertical; |
+ fNodeID = nodes.size(); |
+ nodes.push(this, status); |
+ } |
+ |
+ virtual ~BuildCompactTrieNode() { |
+ } |
+ |
+ virtual uint32_t size() { |
+ return sizeof(uint16_t); |
+ } |
+ |
+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) { |
+ // Write flag/count |
+ *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) |
+ | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 ); |
+ offset += sizeof(uint16_t); |
+ } |
+}; |
+ |
+class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { |
+ public: |
+ UStack fLinks; |
+ |
+ public: |
+ BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) |
+ : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) { |
+ } |
+ |
+ virtual ~BuildCompactTrieHorizontalNode() { |
+ } |
+ |
+ virtual uint32_t size() { |
+ return offsetof(CompactTrieHorizontalNode,entries) + |
+ (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
+ } |
+ |
+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { |
+ BuildCompactTrieNode::write(bytes, offset, translate); |
+ int32_t count = fChars.length(); |
+ for (int32_t i = 0; i < count; ++i) { |
+ CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset); |
+ entry->ch = fChars[i]; |
+ entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
+#ifdef DEBUG_TRIE_DICT |
+ if (entry->equal == 0) { |
+ fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", |
+ i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
+ } |
+#endif |
+ offset += sizeof(CompactTrieHorizontalEntry); |
+ } |
+ } |
+ |
+ void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { |
+ fChars.append(ch); |
+ fLinks.push(link, status); |
+ } |
+}; |
+ |
+class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { |
+ public: |
+ BuildCompactTrieNode *fEqual; |
+ |
+ public: |
+ BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) |
+ : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { |
+ fEqual = NULL; |
+ } |
+ |
+ virtual ~BuildCompactTrieVerticalNode() { |
+ } |
+ |
+ virtual uint32_t size() { |
+ return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); |
+ } |
+ |
+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { |
+ CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset); |
+ BuildCompactTrieNode::write(bytes, offset, translate); |
+ node->equal = translate.elementAti(fEqual->fNodeID); |
+ offset += sizeof(node->equal); |
+#ifdef DEBUG_TRIE_DICT |
+ if (node->equal == 0) { |
+ fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n", |
+ fEqual->fNodeID); |
+ } |
+#endif |
+ fChars.extract(0, fChars.length(), (UChar *)node->chars); |
+ offset += sizeof(uint16_t)*fChars.length(); |
+ } |
+ |
+ void addChar(UChar ch) { |
+ fChars.append(ch); |
+ } |
+ |
+ void setLink(BuildCompactTrieNode *node) { |
+ fEqual = node; |
+ } |
+}; |
+ |
+// Forward declaration |
+static void walkHorizontal(const TernaryNode *node, |
+ BuildCompactTrieHorizontalNode *building, |
+ UStack &nodes, |
+ UErrorCode &status); |
+ |
+// Convert one node. Uses recursion. |
+ |
+static BuildCompactTrieNode * |
+compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) { |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ BuildCompactTrieNode *result = NULL; |
+ UBool horizontal = (node->low != NULL || node->high != NULL); |
+ if (horizontal) { |
+ BuildCompactTrieHorizontalNode *hResult = |
+ new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); |
+ if (hResult == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ if (U_SUCCESS(status)) { |
+ walkHorizontal(node, hResult, nodes, status); |
+ result = hResult; |
+ } |
+ } |
+ else { |
+ BuildCompactTrieVerticalNode *vResult = |
+ new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); |
+ if (vResult == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+ else if (U_SUCCESS(status)) { |
+ UBool endsWord = FALSE; |
+ // Take up nodes until we end a word, or hit a node with < or > links |
+ do { |
+ vResult->addChar(node->ch); |
+ endsWord = (node->flags & kEndsWord) != 0; |
+ node = node->equal; |
+ } |
+ while(node != NULL && !endsWord && node->low == NULL && node->high == NULL); |
+ if (node == NULL) { |
+ if (!endsWord) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie |
+ } |
+ else { |
+ vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
+ } |
+ } |
+ else { |
+ vResult->setLink(compactOneNode(node, endsWord, nodes, status)); |
+ } |
+ result = vResult; |
+ } |
+ } |
+ return result; |
+} |
+ |
+// Walk the set of peers at the same level, to build a horizontal node. |
+// Uses recursion. |
+ |
+static void walkHorizontal(const TernaryNode *node, |
+ BuildCompactTrieHorizontalNode *building, |
+ UStack &nodes, |
+ UErrorCode &status) { |
+ while (U_SUCCESS(status) && node != NULL) { |
+ if (node->low != NULL) { |
+ walkHorizontal(node->low, building, nodes, status); |
+ } |
+ BuildCompactTrieNode *link = NULL; |
+ if (node->equal != NULL) { |
+ link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status); |
+ } |
+ else if (node->flags & kEndsWord) { |
+ link = (BuildCompactTrieNode *)nodes[1]; |
+ } |
+ if (U_SUCCESS(status) && link != NULL) { |
+ building->addNode(node->ch, link, status); |
+ } |
+ // Tail recurse manually instead of leaving it to the compiler. |
+ //if (node->high != NULL) { |
+ // walkHorizontal(node->high, building, nodes, status); |
+ //} |
+ node = node->high; |
+ } |
+} |
+ |
+U_NAMESPACE_END |
+U_NAMESPACE_USE |
+U_CDECL_BEGIN |
+static int32_t U_CALLCONV |
+_sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { |
+ BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
+ BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
+ // Check for comparing a node to itself, to avoid spurious duplicates |
+ if (left == right) { |
+ return 0; |
+ } |
+ // Most significant is type of node. Can never coalesce. |
+ if (left->fVertical != right->fVertical) { |
+ return left->fVertical - right->fVertical; |
+ } |
+ // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
+ if (left->fParentEndsWord != right->fParentEndsWord) { |
+ return left->fParentEndsWord - right->fParentEndsWord; |
+ } |
+ // Next, the string. If that differs, we can never coalesce. |
+ int32_t result = left->fChars.compare(right->fChars); |
+ if (result != 0) { |
+ return result; |
+ } |
+ // We know they're both the same node type, so branch for the two cases. |
+ if (left->fVertical) { |
+ result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
+ - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
+ } |
+ else { |
+ // We need to compare the links vectors. They should be the |
+ // same size because the strings were equal. |
+ // We compare the node IDs instead of the pointers, to handle |
+ // coalesced nodes. |
+ BuildCompactTrieHorizontalNode *hleft, *hright; |
+ hleft = (BuildCompactTrieHorizontalNode *)left; |
+ hright = (BuildCompactTrieHorizontalNode *)right; |
+ int32_t count = hleft->fLinks.size(); |
+ for (int32_t i = 0; i < count && result == 0; ++i) { |
+ result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
+ ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
+ } |
+ } |
+ // If they are equal to each other, mark them (speeds coalescing) |
+ if (result == 0) { |
+ left->fHasDuplicate = TRUE; |
+ right->fHasDuplicate = TRUE; |
+ } |
+ return result; |
+} |
+U_CDECL_END |
+U_NAMESPACE_BEGIN |
+ |
+static void coalesceDuplicates(UStack &nodes, UErrorCode &status) { |
+ // We sort the array of nodes to place duplicates next to each other |
+ if (U_FAILURE(status)) { |
+ return; |
+ } |
+ int32_t size = nodes.size(); |
+ void **array = (void **)uprv_malloc(sizeof(void *)*size); |
+ if (array == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return; |
+ } |
+ (void) nodes.toArray(array); |
+ |
+ // Now repeatedly identify duplicates until there are no more |
+ int32_t dupes = 0; |
+ long passCount = 0; |
+#ifdef DEBUG_TRIE_DICT |
+ long totalDupes = 0; |
+#endif |
+ do { |
+ BuildCompactTrieNode *node; |
+ BuildCompactTrieNode *first = NULL; |
+ BuildCompactTrieNode **p; |
+ BuildCompactTrieNode **pFirst = NULL; |
+ int32_t counter = size - 2; |
+ // Sort the array, skipping nodes 0 and 1. Use quicksort for the first |
+ // pass for speed. For the second and subsequent passes, we use stable |
+ // (insertion) sort for two reasons: |
+ // 1. The array is already mostly ordered, so we get better performance. |
+ // 2. The way we find one and only one instance of a set of duplicates is to |
+ // check that the node ID equals the array index. If we used an unstable |
+ // sort for the second or later passes, it's possible that none of the |
+ // duplicates would wind up with a node ID equal to its array index. |
+ // The sort stability guarantees that, because as we coalesce more and |
+ // more groups, the first element of the resultant group will be one of |
+ // the first elements of the groups being coalesced. |
+ // To use quicksort for the second and subsequent passes, we would have to |
+ // find the minimum of the node numbers in a group, and set all the nodes |
+ // in the group to that node number. |
+ uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status); |
+ dupes = 0; |
+ for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) { |
+ node = *p; |
+ if (node->fHasDuplicate) { |
+ if (first == NULL) { |
+ first = node; |
+ pFirst = p; |
+ } |
+ else if (_sortBuildNodes(NULL, pFirst, p) != 0) { |
+ // Starting a new run of dupes |
+ first = node; |
+ pFirst = p; |
+ } |
+ else if (node->fNodeID != first->fNodeID) { |
+ // Slave one to the other, note duplicate |
+ node->fNodeID = first->fNodeID; |
+ dupes += 1; |
+ } |
+ } |
+ else { |
+ // This node has no dupes |
+ first = NULL; |
+ pFirst = NULL; |
+ } |
+ } |
+ passCount += 1; |
+#ifdef DEBUG_TRIE_DICT |
+ totalDupes += dupes; |
+ fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes); |
+#endif |
+ } |
+ while (dupes > 0); |
+#ifdef DEBUG_TRIE_DICT |
+ fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount); |
+#endif |
+ |
+ // We no longer need the temporary array, as the nodes have all been marked appropriately. |
+ uprv_free(array); |
+} |
+ |
+U_NAMESPACE_END |
+U_CDECL_BEGIN |
+static void U_CALLCONV _deleteBuildNode(void *obj) { |
+ delete (BuildCompactTrieNode *) obj; |
+} |
+U_CDECL_END |
+U_NAMESPACE_BEGIN |
+ |
+CompactTrieHeader * |
+CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict, |
+ UErrorCode &status ) { |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+#ifdef DEBUG_TRIE_DICT |
+ struct tms timing; |
+ struct tms previous; |
+ (void) ::times(&previous); |
+#endif |
+ UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes |
+ |
+ // Add node 0, used as the NULL pointer/sentinel. |
+ nodes.addElement((int32_t)0, status); |
+ |
+ // Start by creating the special empty node we use to indicate that the parent |
+ // terminates a word. This must be node 1, because the builder assumes |
+ // that. |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status); |
+ if (terminal == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+ |
+ // This call does all the work of building the new trie structure. The root |
+ // will be node 2. |
+ BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status); |
+#ifdef DEBUG_TRIE_DICT |
+ (void) ::times(&timing); |
+ fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
+ nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
+ (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
+ previous = timing; |
+#endif |
+ |
+ // Now coalesce all duplicate nodes. |
+ coalesceDuplicates(nodes, status); |
+#ifdef DEBUG_TRIE_DICT |
+ (void) ::times(&timing); |
+ fprintf(stderr, "Duplicates coalesced, time user %f system %f\n", |
+ (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
+ (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
+ previous = timing; |
+#endif |
+ |
+ // Next, build the output trie. |
+ // First we compute all the sizes and build the node ID translation table. |
+ uint32_t totalSize = offsetof(CompactTrieHeader,offsets); |
+ int32_t count = nodes.size(); |
+ int32_t nodeCount = 1; // The sentinel node we already have |
+ BuildCompactTrieNode *node; |
+ int32_t i; |
+ UVector32 translate(count, status); // Should be no growth needed after this |
+ translate.push(0, status); // The sentinel node |
+ |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ |
+ for (i = 1; i < count; ++i) { |
+ node = (BuildCompactTrieNode *)nodes[i]; |
+ if (node->fNodeID == i) { |
+ // Only one node out of each duplicate set is used |
+ if (i >= translate.size()) { |
+ // Logically extend the mapping table |
+ translate.setSize(i+1); |
+ } |
+ translate.setElementAt(nodeCount++, i); |
+ totalSize += node->size(); |
+ } |
+ } |
+ |
+ // Check for overflowing 16 bits worth of nodes. |
+ if (nodeCount > 0x10000) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ // Add enough room for the offsets. |
+ totalSize += nodeCount*sizeof(uint32_t); |
+#ifdef DEBUG_TRIE_DICT |
+ (void) ::times(&timing); |
+ fprintf(stderr, "Sizes/mapping done, time user %f system %f\n", |
+ (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
+ (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
+ previous = timing; |
+ fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize); |
+#endif |
+ uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize); |
+ if (bytes == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ |
+ CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
+ header->size = totalSize; |
+ header->nodeCount = nodeCount; |
+ header->offsets[0] = 0; // Sentinel |
+ header->root = translate.elementAti(root->fNodeID); |
+#ifdef DEBUG_TRIE_DICT |
+ if (header->root == 0) { |
+ fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID); |
+ } |
+#endif |
+ uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t)); |
+ nodeCount = 1; |
+ // Now write the data |
+ for (i = 1; i < count; ++i) { |
+ node = (BuildCompactTrieNode *)nodes[i]; |
+ if (node->fNodeID == i) { |
+ header->offsets[nodeCount++] = offset; |
+ node->write(bytes, offset, translate); |
+ } |
+ } |
+#ifdef DEBUG_TRIE_DICT |
+ (void) ::times(&timing); |
+ fprintf(stderr, "Trie built, time user %f system %f\n", |
+ (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
+ (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
+ previous = timing; |
+ fprintf(stderr, "Final offset is %d\n", offset); |
+ |
+ // Collect statistics on node types and sizes |
+ int hCount = 0; |
+ int vCount = 0; |
+ size_t hSize = 0; |
+ size_t vSize = 0; |
+ size_t hItemCount = 0; |
+ size_t vItemCount = 0; |
+ uint32_t previousOff = offset; |
+ for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
+ const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
+ if (node->flagscount & kVerticalNode) { |
+ vCount += 1; |
+ vItemCount += (node->flagscount & kCountMask); |
+ vSize += previousOff-header->offsets[nodeIdx]; |
+ } |
+ else { |
+ hCount += 1; |
+ hItemCount += (node->flagscount & kCountMask); |
+ hSize += previousOff-header->offsets[nodeIdx]; |
+ } |
+ previousOff = header->offsets[nodeIdx]; |
+ } |
+ fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount, |
+ (double)hSize/hCount, (double)hItemCount/hCount); |
+ fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount, |
+ (double)vSize/vCount, (double)vItemCount/vCount); |
+#endif |
+ |
+ if (U_FAILURE(status)) { |
+ uprv_free(bytes); |
+ header = NULL; |
+ } |
+ else { |
+ header->magic = COMPACT_TRIE_MAGIC_1; |
+ } |
+ return header; |
+} |
+ |
+// Forward declaration |
+static TernaryNode * |
+unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ); |
+ |
+ |
+// Convert a horizontal node (or subarray thereof) into a ternary subtrie |
+static TernaryNode * |
+unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array, |
+ int low, int high, UErrorCode &status ) { |
+ if (U_FAILURE(status) || low > high) { |
+ return NULL; |
+ } |
+ int middle = (low+high)/2; |
+ TernaryNode *result = new TernaryNode(array[middle].ch); |
+ if (result == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); |
+ if (equal->flagscount & kParentEndsWord) { |
+ result->flags |= kEndsWord; |
+ } |
+ result->low = unpackHorizontalArray(header, array, low, middle-1, status); |
+ result->high = unpackHorizontalArray(header, array, middle+1, high, status); |
+ result->equal = unpackOneNode(header, equal, status); |
+ return result; |
+} |
+ |
+// Convert one compact trie node into a ternary subtrie |
+static TernaryNode * |
+unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) { |
+ int nodeCount = (node->flagscount & kCountMask); |
+ if (nodeCount == 0 || U_FAILURE(status)) { |
+ // Failure, or terminal node |
+ return NULL; |
+ } |
+ if (node->flagscount & kVerticalNode) { |
+ const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; |
+ TernaryNode *head = NULL; |
+ TernaryNode *previous = NULL; |
+ TernaryNode *latest = NULL; |
+ for (int i = 0; i < nodeCount; ++i) { |
+ latest = new TernaryNode(vnode->chars[i]); |
+ if (latest == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ break; |
+ } |
+ if (head == NULL) { |
+ head = latest; |
+ } |
+ if (previous != NULL) { |
+ previous->equal = latest; |
+ } |
+ previous = latest; |
+ } |
+ if (latest != NULL) { |
+ const CompactTrieNode *equal = getCompactNode(header, vnode->equal); |
+ if (equal->flagscount & kParentEndsWord) { |
+ latest->flags |= kEndsWord; |
+ } |
+ latest->equal = unpackOneNode(header, equal, status); |
+ } |
+ return head; |
+ } |
+ else { |
+ // Horizontal node |
+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; |
+ return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status); |
+ } |
+} |
+ |
+MutableTrieDictionary * |
+CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { |
+ MutableTrieDictionary *result = new MutableTrieDictionary( status ); |
+ if (result == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status); |
+ if (U_FAILURE(status)) { |
+ delete root; // Clean up |
+ delete result; |
+ return NULL; |
+ } |
+ result->fTrie = root; |
+ return result; |
+} |
+ |
+U_NAMESPACE_END |
+ |
+U_CAPI int32_t U_EXPORT2 |
+triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
+ UErrorCode *status) { |
+ |
+ if (status == NULL || U_FAILURE(*status)) { |
+ return 0; |
+ } |
+ if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
+ *status=U_ILLEGAL_ARGUMENT_ERROR; |
+ return 0; |
+ } |
+ |
+ // |
+ // Check that the data header is for for dictionary data. |
+ // (Header contents are defined in genxxx.cpp) |
+ // |
+ const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
+ if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
+ pInfo->dataFormat[1]==0x72 && |
+ pInfo->dataFormat[2]==0x44 && |
+ pInfo->dataFormat[3]==0x63 && |
+ pInfo->formatVersion[0]==1 )) { |
+ udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", |
+ pInfo->dataFormat[0], pInfo->dataFormat[1], |
+ pInfo->dataFormat[2], pInfo->dataFormat[3], |
+ pInfo->formatVersion[0]); |
+ *status=U_UNSUPPORTED_ERROR; |
+ return 0; |
+ } |
+ |
+ // |
+ // Swap the data header. (This is the generic ICU Data Header, not the |
+ // CompactTrieHeader). This swap also conveniently gets us |
+ // the size of the ICU d.h., which lets us locate the start |
+ // of the RBBI specific data. |
+ // |
+ int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); |
+ |
+ // |
+ // Get the CompactTrieHeader, and check that it appears to be OK. |
+ // |
+ const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
+ const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
+ if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 |
+ || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) |
+ { |
+ udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"); |
+ *status=U_UNSUPPORTED_ERROR; |
+ return 0; |
+ } |
+ |
+ // |
+ // Prefight operation? Just return the size |
+ // |
+ uint32_t totalSize = ds->readUInt32(header->size); |
+ int32_t sizeWithUData = (int32_t)totalSize + headerSize; |
+ if (length < 0) { |
+ return sizeWithUData; |
+ } |
+ |
+ // |
+ // Check that length passed in is consistent with length from RBBI data header. |
+ // |
+ if (length < sizeWithUData) { |
+ udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", |
+ totalSize); |
+ *status=U_INDEX_OUTOFBOUNDS_ERROR; |
+ return 0; |
+ } |
+ |
+ // |
+ // Swap the Data. Do the data itself first, then the CompactTrieHeader, because |
+ // we need to reference the header to locate the data, and an |
+ // inplace swap of the header leaves it unusable. |
+ // |
+ uint8_t *outBytes = (uint8_t *)outData + headerSize; |
+ CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes; |
+ |
+#if 0 |
+ // |
+ // If not swapping in place, zero out the output buffer before starting. |
+ // |
+ if (inBytes != outBytes) { |
+ uprv_memset(outBytes, 0, totalSize); |
+ } |
+ |
+ // We need to loop through all the nodes in the offset table, and swap each one. |
+ uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
+ // Skip node 0, which should always be 0. |
+ for (int i = 1; i < nodeCount; ++i) { |
+ uint32_t nodeOff = ds->readUInt32(header->offsets[i]); |
+ const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff); |
+ CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); |
+ uint16_t flagscount = ds->readUInt16(inNode->flagscount); |
+ uint16_t itemCount = flagscount & kCountMask; |
+ ds->writeUInt16(&outNode->flagscount, flagscount); |
+ if (itemCount > 0) { |
+ if (flagscount & kVerticalNode) { |
+ ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), |
+ itemCount*sizeof(uint16_t), |
+ outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); |
+ uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal); |
+ ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal)); |
+ } |
+ else { |
+ const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode; |
+ CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode; |
+ for (int j = 0; j < itemCount; ++j) { |
+ uint16_t word = ds->readUInt16(inHNode->entries[j].ch); |
+ ds->writeUInt16(&outHNode->entries[j].ch, word); |
+ word = ds->readUInt16(inHNode->entries[j].equal); |
+ ds->writeUInt16(&outHNode->entries[j].equal, word); |
+ } |
+ } |
+ } |
+ } |
+#endif |
+ |
+ // All the data in all the nodes consist of 16 bit items. Swap them all at once. |
+ uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
+ uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t)); |
+ ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); |
+ |
+ // Swap the header |
+ ds->writeUInt32(&outputHeader->size, totalSize); |
+ uint32_t magic = ds->readUInt32(header->magic); |
+ ds->writeUInt32(&outputHeader->magic, magic); |
+ ds->writeUInt16(&outputHeader->nodeCount, nodeCount); |
+ uint16_t root = ds->readUInt16(header->root); |
+ ds->writeUInt16(&outputHeader->root, root); |
+ ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), |
+ sizeof(uint32_t)*(int32_t)nodeCount, |
+ outBytes+offsetof(CompactTrieHeader,offsets), status); |
+ |
+ return sizeWithUData; |
+} |
+ |
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
Property changes on: icu46/source/common/triedict.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |