OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationdatareader.h |
| 7 * |
| 8 * created on: 2013feb07 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #ifndef __COLLATIONDATAREADER_H__ |
| 13 #define __COLLATIONDATAREADER_H__ |
| 14 |
| 15 #include "unicode/utypes.h" |
| 16 |
| 17 #if !UCONFIG_NO_COLLATION |
| 18 |
| 19 #include "unicode/udata.h" |
| 20 |
| 21 struct UDataMemory; |
| 22 |
| 23 U_NAMESPACE_BEGIN |
| 24 |
| 25 struct CollationTailoring; |
| 26 |
| 27 /** |
| 28 * Collation binary data reader: a static-only holder of the indexes[] slot constants (IX_*) |
|    * and of the entry points read() and isAcceptable(). |
|    * The enum constants are positional: each one is the slot number in the int32_t indexes[] |
|    * array at the start of the data, so inserting or reordering entries changes the format. |
| 29 */ |
| 30 struct U_I18N_API CollationDataReader /* all static */ { |
| 31 // The following constants are also copied into source/common/ucol_swp.cpp. |
| 32 // Keep them in sync! |
| 33 enum { |
| 34 /** |
| 35 * Number of int32_t indexes. |
| 36 * |
| 37 * Can be 2 if there are only options. |
| 38 * Can be 7 or 8 if there are only options and a script reordering. |
| 39 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. |
| 40 */ |
| 41 IX_INDEXES_LENGTH, // 0 |
| 42 /** |
| 43 * Bits 31..24: numericPrimary, for numeric collation |
| 44 * 23..16: fast Latin format version (0 = no fast Latin table) |
| 45 * 15.. 0: options bit set |
| 46 */ |
| 47 IX_OPTIONS, /* 1 */ |
| 48 IX_RESERVED2, /* 2: unused/reserved */ |
| 49 IX_RESERVED3, /* 3: unused/reserved */ |
| 50 |
| 51 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ |
| 52 IX_JAMO_CE32S_START, // 4 |
| 53 |
| 54 // Byte offsets from the start of the data, after the generic header. |
| 55 // The indexes[] are at byte offset 0, other data follows. |
| 56 // Each data item is aligned properly. |
| 57 // The data items should be in descending order of unit size, |
| 58 // to minimize the need for padding. |
| 59 // Each item's byte length is given by the difference between its offset
and |
| 60 // the next index/offset value. |
| 61 /** Byte offset to int32_t reorderCodes[]. */ |
| 62 IX_REORDER_CODES_OFFSET, /* 5: first of the byte-offset slots */ |
| 63 /** |
| 64 * Byte offset to uint8_t reorderTable[]. |
| 65 * Empty table if <256 bytes (padding only). |
| 66 * Otherwise 256 bytes or more (with padding). |
| 67 */ |
| 68 IX_REORDER_TABLE_OFFSET, /* 6 */ |
| 69 /** Byte offset to the collation trie. Its length is a multiple of 8 byt
es. */ |
| 70 IX_TRIE_OFFSET, /* 7 */ |
| 71 |
| 72 IX_RESERVED8_OFFSET, // 8: reserved |
| 73 /** Byte offset to int64_t ces[]. */ |
| 74 IX_CES_OFFSET, /* 9 */ |
| 75 IX_RESERVED10_OFFSET, /* 10: reserved */ |
| 76 /** Byte offset to uint32_t ce32s[]. */ |
| 77 IX_CE32S_OFFSET, /* 11 */ |
| 78 |
| 79 /** Byte offset to uint32_t rootElements[]. */ |
| 80 IX_ROOT_ELEMENTS_OFFSET, // 12 |
| 81 /** Byte offset to UChar *contexts[]. */ |
| 82 IX_CONTEXTS_OFFSET, /* 13 */ |
| 83 /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ |
| 84 IX_UNSAFE_BWD_OFFSET, /* 14 */ |
| 85 /** Byte offset to uint16_t fastLatinTable[]. */ |
| 86 IX_FAST_LATIN_TABLE_OFFSET, /* 15 */ |
| 87 |
| 88 /** Byte offset to uint16_t scripts[]. */ |
| 89 IX_SCRIPTS_OFFSET, // 16 |
| 90 /** |
| 91 * Byte offset to UBool compressibleBytes[]. |
| 92 * Empty table if <256 bytes (padding only). |
| 93 * Otherwise 256 bytes or more (with padding). |
| 94 */ |
| 95 IX_COMPRESSIBLE_BYTES_OFFSET, /* 17 */ |
| 96 IX_RESERVED18_OFFSET, /* 18: reserved */ |
| 97 IX_TOTAL_SIZE /* 19: final byte-offset slot; when stored, it is the total size of the data, counted from the start of the indexes */ |
| 98 }; |
| 99 /** |
|    * Entry point for reading binary collation data. |
|    * NOTE(review): only the declaration is visible in this header. Presumably this parses the |
|    * inLength bytes at inBytes (laid out as in the format description below) into tailoring, |
|    * with base as the tailoring it builds upon, and reports failures through errorCode -- |
|    * confirm against the implementation. |
|    */ |
| 100 static void read(const CollationTailoring *base, const uint8_t *inBytes, int
32_t inLength, |
| 101 CollationTailoring &tailoring, UErrorCode &errorCode); |
| 102 /** |
|     * Acceptance predicate for a data item, given its type, name and UDataInfo header. |
|     * NOTE(review): the signature matches the UDataMemoryIsAcceptable callback type from |
|     * unicode/udata.h, so this is presumably handed to udata_openChoice() when loading the |
|     * collation data file -- confirm at the call site. How context is used is not visible here. |
|     */ |
| 103 static UBool U_CALLCONV |
| 104 isAcceptable(void *context, const char *type, const char *name, const UDataI
nfo *pInfo); |
| 105 |
| 106 private: |
| 107 CollationDataReader(); // no constructor: kept private because everything here is static |
| 108 }; |
| 109 |
| 110 /* |
| 111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
| 112 * Format version 4.0. |
| 113 * |
| 114 * The root collation data is stored in the ucadata.icu file. |
| 115 * Tailorings are stored inside .res resource bundle files, with a complete file
header. |
| 116 * |
| 117 * Collation data begins with a standard ICU data file header |
| 118 * (DataHeader, see ucmndata.h and unicode/udata.h). |
| 119 * The UDataInfo.dataVersion field contains the UCA and other version numbers, |
| 120 * see the comments for CollationTailoring.version. |
| 121 * |
| 122 * After the header, the file contains the following parts. |
| 123 * Constants are defined as enum values of the CollationDataReader class. |
| 124 * See also the Collation class. |
| 125 * |
| 126 * int32_t indexes[indexesLength]; |
| 127 * The indexes array has variable length. |
| 128 * Some tailorings only need the length and the options, |
| 129 * others only add reorderCodes and the reorderTable, |
| 130 * some need to store mappings. |
| 131 * Only as many indexes are stored as needed to read all of the data. |
| 132 * |
| 133 * Index 0: indexesLength |
| 134 * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see I
X_OPTIONS |
| 135 * Index 2..3: Unused/reserved/0. |
| 136 * Index 4: Index into the ce32s array where the CE32s of the conjoining Ja
mo |
| 137 * are stored in a short, contiguous part of the ce32s array. |
| 138 * |
| 139 * Indexes 5..19 are byte offsets in ascending order. |
| 140 * Each byte offset marks the start of the next part in the data file, |
| 141 * and the end of the previous one. |
| 142 * When two consecutive byte offsets are the same (or too short), |
| 143 * then the corresponding part is empty. |
| 144 * Byte offsets are offsets from after the header, |
| 145 * that is, from the beginning of the indexes[]. |
| 146 * Each part starts at an offset with proper alignment for its data. |
| 147 * If necessary, the previous part may include padding bytes to achieve thi
s alignment. |
| 148 * The last byte offset that is stored in the indexes indicates the total s
ize of the data |
| 149 * (starting with the indexes). |
| 150 * |
| 151 * int32_t reorderCodes[]; -- empty in root |
| 152 * The list of script and reordering codes. |
| 153 * |
| 154 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding
bytes |
| 155 * Primary-weight lead byte permutation table. |
| 156 * Normally present when the reorderCodes are, but can be built at load tim
e. |
| 157 * |
| 158 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
| 159 * The trie holds the main collation data. Each code point is mapped to a 3
2-bit value. |
| 160 * It encodes a simple collation element (CE) in compact form, unless bits
7..6 are both set, |
| 161 * in which case it is a special CE32 and contains a 4-bit tag and further
data. |
| 162 * See the Collation class for details. |
| 163 * |
| 164 * The trie has a value for each lead surrogate code unit with some bits en
coding |
| 165 * collective properties of the 1024 supplementary characters whose UTF-16
form starts with |
| 166 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. |
| 167 * |
| 168 * int64_t ces[]; |
| 169 * 64-bit CEs and expansions that cannot be stored in a more compact form. |
| 170 * |
| 171 * uint32_t ce32s[]; |
| 172 * CE32s for expansions in compact form, and for characters whose trie valu
es |
| 173 * contain special data. |
| 174 * |
| 175 * uint32_t rootElements[]; -- empty in all tailorings |
| 176 * Compact storage for all of the CEs that occur in the root collation. |
| 177 * See the CollationRootElements class. |
| 178 * |
| 179 * UChar *contexts[]; |
| 180 * Serialized UCharsTrie structures with prefix (pre-context) and contracti
on mappings. |
| 181 * |
| 182 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() |
| 183 * Serialized form of characters that are unsafe when iterating backwards, |
| 184 * and at the end of an identical string prefix. |
| 185 * Back up to a safe character. |
| 186 * Lead surrogates are "unsafe" when any of their corresponding supplementa
ry |
| 187 * code points are unsafe. |
| 188 * Does not include [:^lccc=0:][:^tccc=0:]. |
| 189 * For each tailoring, the root unsafeBackwardSet is subtracted. |
| 190 * (As a result, in many tailorings no set needs to be stored.) |
| 191 * |
| 192 * uint16_t fastLatinTable[]; |
| 193 * Optional optimization for Latin text. |
| 194 * See the CollationFastLatin class. |
| 195 * |
| 196 * uint16_t scripts[]; -- empty in all tailorings |
| 197 * Table of the reordering groups with their first and last lead bytes, |
| 198 * and their script and reordering codes. |
| 199 * See CollationData::scripts. |
| 200 * |
| 201 * UBool compressibleBytes[]; -- empty in all tailorings |
| 202 * Flag for getSortKey(), indicating primary weight lead bytes that are com
pressible. |
| 203 */ |
| 204 |
| 205 U_NAMESPACE_END |
| 206 |
| 207 #endif // !UCONFIG_NO_COLLATION |
| 208 #endif // __COLLATIONDATAREADER_H__ |
OLD | NEW |