OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationdatareader.h | 6 * collationdatareader.h |
7 * | 7 * |
8 * created on: 2013feb07 | 8 * created on: 2013feb07 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #ifndef __COLLATIONDATAREADER_H__ | 12 #ifndef __COLLATIONDATAREADER_H__ |
13 #define __COLLATIONDATAREADER_H__ | 13 #define __COLLATIONDATAREADER_H__ |
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
102 | 102 |
103 static UBool U_CALLCONV | 103 static UBool U_CALLCONV |
104 isAcceptable(void *context, const char *type, const char *name, const UDataI
nfo *pInfo); | 104 isAcceptable(void *context, const char *type, const char *name, const UDataI
nfo *pInfo); |
105 | 105 |
106 private: | 106 private: |
107 CollationDataReader(); // no constructor | 107 CollationDataReader(); // no constructor |
108 }; | 108 }; |
109 | 109 |
110 /* | 110 /* |
111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files). | 111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
112 * Format version 4.0. | 112 * Format version 5. |
113 * | 113 * |
114 * The root collation data is stored in the ucadata.icu file. | 114 * The root collation data is stored in the ucadata.icu file. |
115 * Tailorings are stored inside .res resource bundle files, with a complete file
header. | 115 * Tailorings are stored inside .res resource bundle files, with a complete file
header. |
116 * | 116 * |
117 * Collation data begins with a standard ICU data file header | 117 * Collation data begins with a standard ICU data file header |
118 * (DataHeader, see ucmndata.h and unicode/udata.h). | 118 * (DataHeader, see ucmndata.h and unicode/udata.h). |
119 * The UDataInfo.dataVersion field contains the UCA and other version numbers, | 119 * The UDataInfo.dataVersion field contains the UCA and other version numbers, |
120 * see the comments for CollationTailoring.version. | 120 * see the comments for CollationTailoring.version. |
121 * | 121 * |
122 * After the header, the file contains the following parts. | 122 * After the header, the file contains the following parts. |
(...skipping 21 matching lines...) Expand all Loading... |
144 * Byte offsets are offsets from after the header, | 144 * Byte offsets are offsets from after the header, |
145 * that is, from the beginning of the indexes[]. | 145 * that is, from the beginning of the indexes[]. |
146 * Each part starts at an offset with proper alignment for its data. | 146 * Each part starts at an offset with proper alignment for its data. |
147 * If necessary, the previous part may include padding bytes to achieve thi
s alignment. | 147 * If necessary, the previous part may include padding bytes to achieve thi
s alignment. |
148 * The last byte offset that is stored in the indexes indicates the total s
ize of the data | 148 * The last byte offset that is stored in the indexes indicates the total s
ize of the data |
149 * (starting with the indexes). | 149 * (starting with the indexes). |
150 * | 150 * |
151 * int32_t reorderCodes[]; -- empty in root | 151 * int32_t reorderCodes[]; -- empty in root |
152 * The list of script and reordering codes. | 152 * The list of script and reordering codes. |
153 * | 153 * |
| 154 * Beginning with format version 5, this array may optionally |
| 155 * have trailing entries with a full list of reorder ranges |
| 156 * as described for CollationSettings::reorderRanges. |
| 157 * |
| 158 * Script or reorder codes are first and do not exceed 16-bit values. |
| 159 * Range limits are stored in the upper 16 bits, and are never 0. |
| 160 * Split this array into reorder codes and ranges at the first entry |
| 161 * with non-zero upper 16 bits. |
| 162 * |
| 163 * If the ranges are missing but needed for split-reordered primary lead by
tes, |
| 164 * then they are regenerated at load time. |
| 165 * |
154 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding
bytes | 166 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding
bytes |
155 * Primary-weight lead byte permutation table. | 167 * Primary-weight lead byte permutation table. |
156 * Normally present when the reorderCodes are, but can be built at load tim
e. | 168 * Normally present when the reorderCodes are, but can be built at load tim
e. |
157 * | 169 * |
| 170 * Beginning with format version 5, a 0 entry at a non-zero index |
| 171 * (which is otherwise an illegal value) |
| 172 * means that the primary lead byte is "split" |
| 173 * (there are different offsets for primaries that share that lead byte) |
| 174 * and the reordering offset must be determined via the reorder ranges |
| 175 * that are either stored as part of the reorderCodes array |
| 176 * or regenerated at load time. |
| 177 * |
158 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h | 178 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
159 * The trie holds the main collation data. Each code point is mapped to a 3
2-bit value. | 179 * The trie holds the main collation data. Each code point is mapped to a 3
2-bit value. |
160 * It encodes a simple collation element (CE) in compact form, unless bits
7..6 are both set, | 180 * It encodes a simple collation element (CE) in compact form, unless bits
7..6 are both set, |
161 * in which case it is a special CE32 and contains a 4-bit tag and further
data. | 181 * in which case it is a special CE32 and contains a 4-bit tag and further
data. |
162 * See the Collation class for details. | 182 * See the Collation class for details. |
163 * | 183 * |
164 * The trie has a value for each lead surrogate code unit with some bits en
coding | 184 * The trie has a value for each lead surrogate code unit with some bits en
coding |
165 * collective properties of the 1024 supplementary characters whose UTF-16
form starts with | 185 * collective properties of the 1024 supplementary characters whose UTF-16
form starts with |
166 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. | 186 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. |
167 * | 187 * |
(...skipping 19 matching lines...) Expand all Loading... |
187 * code points are unsafe. | 207 * code points are unsafe. |
188 * Does not include [:^lccc=0:][:^tccc=0:]. | 208 * Does not include [:^lccc=0:][:^tccc=0:]. |
189 * For each tailoring, the root unsafeBackwardSet is subtracted. | 209 * For each tailoring, the root unsafeBackwardSet is subtracted. |
190 * (As a result, in many tailorings no set needs to be stored.) | 210 * (As a result, in many tailorings no set needs to be stored.) |
191 * | 211 * |
192 * uint16_t fastLatinTable[]; | 212 * uint16_t fastLatinTable[]; |
193 * Optional optimization for Latin text. | 213 * Optional optimization for Latin text. |
194 * See the CollationFastLatin class. | 214 * See the CollationFastLatin class. |
195 * | 215 * |
196 * uint16_t scripts[]; -- empty in all tailorings | 216 * uint16_t scripts[]; -- empty in all tailorings |
| 217 * Format version 5: |
| 218 * uint16_t numScripts; |
| 219 * uint16_t scriptsIndex[numScripts+16]; |
| 220 * uint16_t scriptStarts[]; |
| 221 * See CollationData::numScripts etc. |
| 222 * |
| 223 * Format version 4: |
197 * Table of the reordering groups with their first and last lead bytes, | 224 * Table of the reordering groups with their first and last lead bytes, |
198 * and their script and reordering codes. | 225 * and their script and reordering codes. |
199 * See CollationData::scripts. | 226 * See CollationData::scripts. |
200 * | 227 * |
201 * UBool compressibleBytes[]; -- empty in all tailorings | 228 * UBool compressibleBytes[]; -- empty in all tailorings |
202 * Flag for getSortKey(), indicating primary weight lead bytes that are com
pressible. | 229 * Flag for getSortKey(), indicating primary weight lead bytes that are com
pressible. |
| 230 * |
| 231 * ----------------- |
| 232 * Changes for formatVersion 5 (ICU 55) |
| 233 * |
| 234 * Reordering moves single scripts, not groups of scripts. |
| 235 * Reorder ranges are optionally appended to the reorderCodes, |
| 236 * and a 0 entry in the reorderTable indicates a split lead byte. |
| 237 * The scripts data has a new format. |
| 238 * |
| 239 * The rootElements may contain secondary and tertiary weights below common=05. |
| 240 * (Used for small Hiragana letters.) |
| 241 * Where this occurs, there is also an explicit unit with common secondary & terti
ary weights. |
| 242 * There are no other data structure changes, but builder code needs to be able
to handle such data. |
| 243 * |
| 244 * The collation element for the merge separator code point U+FFFE |
| 245 * does not necessarily have special, unique secondary/tertiary weights any more
. |
203 */ | 246 */ |
204 | 247 |
205 U_NAMESPACE_END | 248 U_NAMESPACE_END |
206 | 249 |
207 #endif // !UCONFIG_NO_COLLATION | 250 #endif // !UCONFIG_NO_COLLATION |
208 #endif // __COLLATIONDATAREADER_H__ | 251 #endif // __COLLATIONDATAREADER_H__ |
OLD | NEW |