Index: source/i18n/collationdatareader.h |
diff --git a/source/i18n/collationdatareader.h b/source/i18n/collationdatareader.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..df65b862ab4049459287d9d56016166275995c50 |
--- /dev/null |
+++ b/source/i18n/collationdatareader.h |
@@ -0,0 +1,208 @@ |
+/* |
+******************************************************************************* |
+* Copyright (C) 2013-2014, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+******************************************************************************* |
+* collationdatareader.h |
+* |
+* created on: 2013feb07 |
+* created by: Markus W. Scherer |
+*/ |
+ |
+#ifndef __COLLATIONDATAREADER_H__ |
+#define __COLLATIONDATAREADER_H__ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_COLLATION |
+ |
+#include "unicode/udata.h" |
+ |
+struct UDataMemory; |
+ |
+U_NAMESPACE_BEGIN |
+ |
+struct CollationTailoring; |
+ |
+/** |
+ * Collation binary data reader. |
+ */ |
+struct U_I18N_API CollationDataReader /* all static */ { |
+ // The following constants are also copied into source/common/ucol_swp.cpp. |
+ // Keep them in sync! |
+ enum { |
+ /** |
+ * Number of int32_t indexes. |
+ * |
+ * Can be 2 if there are only options. |
+ * Can be 7 or 8 if there are only options and a script reordering. |
+ * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. |
+ */ |
+ IX_INDEXES_LENGTH, // 0 |
+ /** |
+ * Bits 31..24: numericPrimary, for numeric collation |
+ * 23..16: fast Latin format version (0 = no fast Latin table) |
+ * 15.. 0: options bit set |
+ */ |
+ IX_OPTIONS, |
+ IX_RESERVED2, |
+ IX_RESERVED3, |
+ |
+ /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ |
+ IX_JAMO_CE32S_START, // 4 |
+ |
+ // Byte offsets from the start of the data, after the generic header. |
+ // The indexes[] are at byte offset 0, other data follows. |
+ // Each data item is aligned properly. |
+ // The data items should be in descending order of unit size, |
+ // to minimize the need for padding. |
+ // Each item's byte length is given by the difference between its offset and |
+ // the next index/offset value. |
+ /** Byte offset to int32_t reorderCodes[]. */ |
+ IX_REORDER_CODES_OFFSET, |
+ /** |
+ * Byte offset to uint8_t reorderTable[]. |
+ * Empty table if <256 bytes (padding only). |
+ * Otherwise 256 bytes or more (with padding). |
+ */ |
+ IX_REORDER_TABLE_OFFSET, |
+ /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ |
+ IX_TRIE_OFFSET, |
+ |
+ IX_RESERVED8_OFFSET, // 8 |
+ /** Byte offset to int64_t ces[]. */ |
+ IX_CES_OFFSET, |
+ IX_RESERVED10_OFFSET, |
+ /** Byte offset to uint32_t ce32s[]. */ |
+ IX_CE32S_OFFSET, |
+ |
+ /** Byte offset to uint32_t rootElements[]. */ |
+ IX_ROOT_ELEMENTS_OFFSET, // 12 |
+ /** Byte offset to UChar *contexts[]. */ |
+ IX_CONTEXTS_OFFSET, |
+ /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ |
+ IX_UNSAFE_BWD_OFFSET, |
+ /** Byte offset to uint16_t fastLatinTable[]. */ |
+ IX_FAST_LATIN_TABLE_OFFSET, |
+ |
+ /** Byte offset to uint16_t scripts[]. */ |
+ IX_SCRIPTS_OFFSET, // 16 |
+ /** |
+ * Byte offset to UBool compressibleBytes[]. |
+ * Empty table if <256 bytes (padding only). |
+ * Otherwise 256 bytes or more (with padding). |
+ */ |
+ IX_COMPRESSIBLE_BYTES_OFFSET, |
+ IX_RESERVED18_OFFSET, |
+ IX_TOTAL_SIZE |
+ }; |
+ |
+ static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, |
+ CollationTailoring &tailoring, UErrorCode &errorCode); |
+ |
+ static UBool U_CALLCONV |
+ isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); |
+ |
+private: |
+ CollationDataReader(); // no constructor |
+}; |
+ |
+/* |
+ * Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
+ * Format version 4.0. |
+ * |
+ * The root collation data is stored in the ucadata.icu file. |
+ * Tailorings are stored inside .res resource bundle files, with a complete file header. |
+ * |
+ * Collation data begins with a standard ICU data file header |
+ * (DataHeader, see ucmndata.h and unicode/udata.h). |
+ * The UDataInfo.dataVersion field contains the UCA and other version numbers, |
+ * see the comments for CollationTailoring.version. |
+ * |
+ * After the header, the file contains the following parts. |
+ * Constants are defined as enum values of the CollationDataReader class. |
+ * See also the Collation class. |
+ * |
+ * int32_t indexes[indexesLength]; |
+ * The indexes array has variable length. |
+ * Some tailorings only need the length and the options, |
+ * others only add reorderCodes and the reorderTable, |
+ * some need to store mappings. |
+ * Only as many indexes are stored as needed to read all of the data. |
+ * |
+ * Index 0: indexesLength |
+ * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS |
+ * Index 2..3: Unused/reserved/0. |
+ * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo |
+ * are stored in a short, contiguous part of the ce32s array. |
+ * |
+ * Indexes 5..19 are byte offsets in ascending order. |
+ * Each byte offset marks the start of the next part in the data file, |
+ * and the end of the previous one. |
+ * When two consecutive byte offsets are the same (or too short), |
+ * then the corresponding part is empty. |
+ * Byte offsets are offsets from after the header, |
+ * that is, from the beginning of the indexes[]. |
+ * Each part starts at an offset with proper alignment for its data. |
+ * If necessary, the previous part may include padding bytes to achieve this alignment. |
+ * The last byte offset that is stored in the indexes indicates the total size of the data |
+ * (starting with the indexes). |
+ * |
+ * int32_t reorderCodes[]; -- empty in root |
+ * The list of script and reordering codes. |
+ * |
+ * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes |
+ * Primary-weight lead byte permutation table. |
+ * Normally present when the reorderCodes are, but can be built at load time. |
+ * |
+ * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
+ * The trie holds the main collation data. Each code point is mapped to a 32-bit value. |
+ * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, |
+ * in which case it is a special CE32 and contains a 4-bit tag and further data. |
+ * See the Collation class for details. |
+ * |
+ * The trie has a value for each lead surrogate code unit with some bits encoding |
+ * collective properties of the 1024 supplementary characters whose UTF-16 form starts with |
+ * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. |
+ * |
+ * int64_t ces[]; |
+ * 64-bit CEs and expansions that cannot be stored in a more compact form. |
+ * |
+ * uint32_t ce32s[]; |
+ * CE32s for expansions in compact form, and for characters whose trie values |
+ * contain special data. |
+ * |
+ * uint32_t rootElements[]; -- empty in all tailorings |
+ * Compact storage for all of the CEs that occur in the root collation. |
+ * See the CollationRootElements class. |
+ * |
+ * UChar *contexts[]; |
+ * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. |
+ * |
+ * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() |
+ * Serialized form of characters that are unsafe when iterating backwards, |
+ * and at the end of an identical string prefix. |
+ * Back up to a safe character. |
+ * Lead surrogates are "unsafe" when any of their corresponding supplementary |
+ * code points are unsafe. |
+ * Does not include [:^lccc=0:][:^tccc=0:]. |
+ * For each tailoring, the root unsafeBackwardSet is subtracted. |
+ * (As a result, in many tailorings no set needs to be stored.) |
+ * |
+ * uint16_t fastLatinTable[]; |
+ * Optional optimization for Latin text. |
+ * See the CollationFastLatin class. |
+ * |
+ * uint16_t scripts[]; -- empty in all tailorings |
+ * Table of the reordering groups with their first and last lead bytes, |
+ * and their script and reordering codes. |
+ * See CollationData::scripts. |
+ * |
+ * UBool compressibleBytes[]; -- empty in all tailorings |
+ * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. |
+ */ |
+ |
+U_NAMESPACE_END |
+ |
+#endif // !UCONFIG_NO_COLLATION |
+#endif // __COLLATIONDATAREADER_H__ |