Index: source/i18n/collationdatabuilder.h |
diff --git a/source/i18n/collationdatabuilder.h b/source/i18n/collationdatabuilder.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..cfa46fd5506a0ccfc3cb6592ebce2e34d4ac0833 |
--- /dev/null |
+++ b/source/i18n/collationdatabuilder.h |
@@ -0,0 +1,257 @@ |
+/* |
+******************************************************************************* |
+* Copyright (C) 2012-2014, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+******************************************************************************* |
+* collationdatabuilder.h |
+* |
+* created on: 2012apr01 |
+* created by: Markus W. Scherer |
+*/ |
+ |
+#ifndef __COLLATIONDATABUILDER_H__ |
+#define __COLLATIONDATABUILDER_H__ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_COLLATION |
+ |
+#include "unicode/uniset.h" |
+#include "unicode/unistr.h" |
+#include "unicode/uversion.h" |
+#include "collation.h" |
+#include "collationdata.h" |
+#include "collationsettings.h" |
+#include "normalizer2impl.h" |
+#include "utrie2.h" |
+#include "uvectr32.h" |
+#include "uvectr64.h" |
+#include "uvector.h" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+struct ConditionalCE32; |
+ |
+class CollationFastLatinBuilder; |
+class CopyHelper; |
+class DataBuilderCollationIterator; |
+class UCharsTrieBuilder; |
+ |
+/** |
+ * Low-level CollationData builder. |
+ * Takes (character, CE) pairs and builds them into runtime data structures. |
+ * Supports characters with context prefixes and contraction suffixes. |
+ */ |
+class U_I18N_API CollationDataBuilder : public UObject { |
+public: |
+ /** |
+ * Collation element modifier. Interface class for a modifier |
+ * that changes a tailoring builder's temporary CEs to final CEs. |
+ * Called for every non-special CE32 and every expansion CE. |
+ */ |
+ class CEModifier : public UObject { |
+ public: |
+ virtual ~CEModifier(); |
+ /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ |
+ virtual int64_t modifyCE32(uint32_t ce32) const = 0; |
+ /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ |
+ virtual int64_t modifyCE(int64_t ce) const = 0; |
+ }; |
+ |
+ CollationDataBuilder(UErrorCode &errorCode); |
+ |
+ virtual ~CollationDataBuilder(); |
+ |
+ void initForTailoring(const CollationData *b, UErrorCode &errorCode); |
+ |
+ virtual UBool isCompressibleLeadByte(uint32_t b) const; |
+ |
+ inline UBool isCompressiblePrimary(uint32_t p) const { |
+ return isCompressibleLeadByte(p >> 24); |
+ } |
+ |
+ /** |
+ * @return TRUE if this builder has mappings (e.g., add() has been called) |
+ */ |
+ UBool hasMappings() const { return modified; } |
+ |
+ /** |
+ * @return TRUE if c has CEs in this builder |
+ */ |
+ UBool isAssigned(UChar32 c) const; |
+ |
+ /** |
+ * @return the three-byte primary if c maps to a single such CE and has no context data, |
+ * otherwise returns 0. |
+ */ |
+ uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; |
+ |
+ /** |
+ * @return the single CE for c. |
+ * Sets an error code if c does not have a single CE. |
+ */ |
+ int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; |
+ |
+ void add(const UnicodeString &prefix, const UnicodeString &s, |
+ const int64_t ces[], int32_t cesLength, |
+ UErrorCode &errorCode); |
+ |
+ /** |
+ * Encodes the ces as either the returned ce32 by itself, |
+ * or by storing an expansion, with the returned ce32 referring to that. |
+ * |
+ * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) |
+ */ |
+ virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); |
+ void addCE32(const UnicodeString &prefix, const UnicodeString &s, |
+ uint32_t ce32, UErrorCode &errorCode); |
+ |
+ /** |
+ * Sets three-byte-primary CEs for a range of code points in code point order, |
+ * if it is worth doing; otherwise no change is made. |
+ * None of the code points in the range should have complex mappings so far |
+ * (expansions/contractions/prefixes). |
+ * @param start first code point |
+ * @param end last code point (inclusive) |
+ * @param primary primary weight for 'start' |
+ * @param step per-code point primary-weight increment |
+ * @param errorCode ICU in/out error code |
+ * @return TRUE if an OFFSET_TAG range was used for start..end |
+ */ |
+ UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, |
+ uint32_t primary, int32_t step, |
+ UErrorCode &errorCode); |
+ |
+ /** |
+ * Sets three-byte-primary CEs for a range of code points in code point order. |
+ * Sets range values if that is worth doing, or else individual values. |
+ * None of the code points in the range should have complex mappings so far |
+ * (expansions/contractions/prefixes). |
+ * @param start first code point |
+ * @param end last code point (inclusive) |
+ * @param primary primary weight for 'start' |
+ * @param step per-code point primary-weight increment |
+ * @param errorCode ICU in/out error code |
+ * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step |
+ */ |
+ uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, |
+ uint32_t primary, int32_t step, |
+ UErrorCode &errorCode); |
+ |
+ /** |
+ * Copies all mappings from the src builder, with modifications. |
+ * This builder here must not be built yet, and should be empty. |
+ */ |
+ void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, |
+ UErrorCode &errorCode); |
+ |
+ void optimize(const UnicodeSet &set, UErrorCode &errorCode); |
+ void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); |
+ |
+ void enableFastLatin() { fastLatinEnabled = TRUE; } |
+ virtual void build(CollationData &data, UErrorCode &errorCode); |
+ |
+ /** |
+ * Looks up CEs for s and appends them to the ces array. |
+ * Does not handle normalization: s should be in FCD form. |
+ * |
+ * Does not write completely ignorable CEs. |
+ * Does not write beyond Collation::MAX_EXPANSION_LENGTH. |
+ * |
+ * @return incremented cesLength |
+ */ |
+ int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); |
+ int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, |
+ int64_t ces[], int32_t cesLength); |
+ |
+protected: |
+ friend class CopyHelper; |
+ friend class DataBuilderCollationIterator; |
+ |
+ uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; |
+ |
+ int32_t addCE(int64_t ce, UErrorCode &errorCode); |
+ int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); |
+ int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); |
+ |
+ inline ConditionalCE32 *getConditionalCE32(int32_t index) const { |
+ return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); |
+ } |
+ inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { |
+ return getConditionalCE32(Collation::indexFromCE32(ce32)); |
+ } |
+ |
+ static uint32_t makeBuilderContextCE32(int32_t index) { |
+ return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); |
+ } |
+ static inline UBool isBuilderContextCE32(uint32_t ce32) { |
+ return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); |
+ } |
+ |
+ static uint32_t encodeOneCEAsCE32(int64_t ce); |
+ uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); |
+ uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); |
+ uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); |
+ |
+ uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); |
+ /** |
+ * Copies base contractions to a list of ConditionalCE32. |
+ * Sets cond->next to the index of the first new item |
+ * and returns the index of the last new item. |
+ */ |
+ int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, |
+ ConditionalCE32 *cond, UErrorCode &errorCode); |
+ |
+ UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); |
+ void setDigitTags(UErrorCode &errorCode); |
+ void setLeadSurrogates(UErrorCode &errorCode); |
+ |
+ void buildMappings(CollationData &data, UErrorCode &errorCode); |
+ |
+ void clearContexts(); |
+ void buildContexts(UErrorCode &errorCode); |
+ uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); |
+ int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, |
+ UErrorCode &errorCode); |
+ |
+ void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); |
+ |
+ int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); |
+ |
+ static UChar32 jamoCpFromIndex(int32_t i) { |
+ // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 |
+ if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } |
+ i -= Hangul::JAMO_L_COUNT; |
+ if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } |
+ i -= Hangul::JAMO_V_COUNT; |
+ // i < 27 |
+ return Hangul::JAMO_T_BASE + 1 + i; |
+ } |
+ |
+ /** @see Collation::BUILDER_DATA_TAG */ |
+ static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; |
+ |
+ const Normalizer2Impl &nfcImpl; |
+ const CollationData *base; |
+ const CollationSettings *baseSettings; |
+ UTrie2 *trie; |
+ UVector32 ce32s; |
+ UVector64 ce64s; |
+ UVector conditionalCE32s; // vector of ConditionalCE32 |
+ // Characters that have context (prefixes or contraction suffixes). |
+ UnicodeSet contextChars; |
+ // Serialized UCharsTrie structures for finalized contexts. |
+ UnicodeString contexts; |
+ UnicodeSet unsafeBackwardSet; |
+ UBool modified; |
+ |
+ UBool fastLatinEnabled; |
+ CollationFastLatinBuilder *fastLatinBuilder; |
+ |
+ DataBuilderCollationIterator *collIter; |
+}; |
+ |
+U_NAMESPACE_END |
+ |
+#endif // !UCONFIG_NO_COLLATION |
+#endif // __COLLATIONDATABUILDER_H__ |