OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationfastlatin.h | 6 * collationfastlatin.h |
7 * | 7 * |
8 * created on: 2013aug09 | 8 * created on: 2013aug09 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #ifndef __COLLATIONFASTLATIN_H__ | 12 #ifndef __COLLATIONFASTLATIN_H__ |
13 #define __COLLATIONFASTLATIN_H__ | 13 #define __COLLATIONFASTLATIN_H__ |
(...skipping 10 matching lines...) Expand all Loading... |
24 class U_I18N_API CollationFastLatin /* all static */ { | 24 class U_I18N_API CollationFastLatin /* all static */ { |
25 public: | 25 public: |
26 /** | 26 /** |
27 * Fast Latin format version (one byte 1..FF). | 27 * Fast Latin format version (one byte 1..FF). |
28 * Must be incremented for any runtime-incompatible changes, | 28 * Must be incremented for any runtime-incompatible changes, |
29 * in particular, for changes to any of the following constants. | 29 * in particular, for changes to any of the following constants. |
30 * | 30 * |
31 * When the major version number of the main data format changes, | 31 * When the major version number of the main data format changes, |
32 * we can reset this fast Latin version to 1. | 32 * we can reset this fast Latin version to 1. |
33 */ | 33 */ |
34 static const uint16_t VERSION = 1; | 34 static const uint16_t VERSION = 2; |
35 | 35 |
36 static const int32_t LATIN_MAX = 0x17f; | 36 static const int32_t LATIN_MAX = 0x17f; |
37 static const int32_t LATIN_LIMIT = LATIN_MAX + 1; | 37 static const int32_t LATIN_LIMIT = LATIN_MAX + 1; |
38 | 38 |
39 static const int32_t LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX | 39 static const int32_t LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX |
40 | 40 |
41 static const int32_t PUNCT_START = 0x2000; | 41 static const int32_t PUNCT_START = 0x2000; |
42 static const int32_t PUNCT_LIMIT = 0x2040; | 42 static const int32_t PUNCT_LIMIT = 0x2040; |
43 | 43 |
44 // excludes U+FFFE & U+FFFF | 44 // excludes U+FFFE & U+FFFF |
(...skipping 200 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
245 static uint32_t getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair); | 245 static uint32_t getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair); |
246 static uint32_t getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair); | 246 static uint32_t getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair); |
247 static uint32_t getQuaternaries(uint32_t variableTop, uint32_t pair); | 247 static uint32_t getQuaternaries(uint32_t variableTop, uint32_t pair); |
248 | 248 |
249 private: | 249 private: |
250 CollationFastLatin(); // no constructor | 250 CollationFastLatin(); // no constructor |
251 }; | 251 }; |
252 | 252 |
253 /* | 253 /* |
254 * Format of the CollationFastLatin data table. | 254 * Format of the CollationFastLatin data table. |
255 * CollationFastLatin::VERSION = 1. | 255 * CollationFastLatin::VERSION = 2. |
256 * | 256 * |
257 * This table contains data for a Latin-text collation fastpath. | 257 * This table contains data for a Latin-text collation fastpath. |
258 * The data is stored as an array of uint16_t which contains the following parts. | 258 * The data is stored as an array of uint16_t which contains the following parts. |
259 * | 259 * |
260 * uint16_t -- version & header length | 260 * uint16_t -- version & header length |
261 * Bits 15..8: version, must match the VERSION | 261 * Bits 15..8: version, must match the VERSION |
262 * 7..0: length of the header | 262 * 7..0: length of the header |
263 * | 263 * |
264 * uint16_t varTops[header length - 1] | 264 * uint16_t varTops[header length - 1] |
| 265 * Version 2: |
| 266 * varTops[m] is the highest CollationFastLatin long-primary weight |
| 267 * of supported maxVariable group m |
| 268 * (special reorder group space, punct, symbol, currency). |
| 269 * |
| 270 * Version 1: |
265 * Each of these values maps the variable top lead byte of a supported maxVariable group | 271 * Each of these values maps the variable top lead byte of a supported maxVariable group |
266 * to the highest CollationFastLatin long-primary weight. | 272 * to the highest CollationFastLatin long-primary weight. |
267 * The values are stored in ascending order. | 273 * The values are stored in ascending order. |
268 * Bits 15..7: max fast-Latin long-primary weight (bits 11..3 shifted left by 4 bits) | 274 * Bits 15..7: max fast-Latin long-primary weight (bits 11..3 shifted left by 4 bits) |
269 * 6..0: regular primary lead byte | 275 * 6..0: regular primary lead byte |
270 * | 276 * |
271 * uint16_t miniCEs[0x1c0] | 277 * uint16_t miniCEs[0x1c0] |
272 * A mini collation element for each character U+0000..U+017F and U+2000..U+203F. | 278 * A mini collation element for each character U+0000..U+017F and U+2000..U+203F. |
273 * Each value encodes one or two mini CEs (two are possible if the first one | 279 * Each value encodes one or two mini CEs (two are possible if the first one |
274 * has a short mini primary and the second one is a secondary CE, i.e., primary == 0), | 280 * has a short mini primary and the second one is a secondary CE, i.e., primary == 0), |
(...skipping 11 matching lines...) Expand all Loading... |
286 * uint16_t contractions[variable length]; | 292 * uint16_t contractions[variable length]; |
287 * Contraction mini CEs contain an offset relative to just after the miniCEs table. | 293 * Contraction mini CEs contain an offset relative to just after the miniCEs table. |
288 * It points to a list of tuples which map from a contraction suffix character to a result. | 294 * It points to a list of tuples which map from a contraction suffix character to a result. |
289 * First uint16_t of each tuple: | 295 * First uint16_t of each tuple: |
290 * Bits 10..9: Length of the result (1..3), see comments on CONTR_LENGTH_SHIFT. | 296 * Bits 10..9: Length of the result (1..3), see comments on CONTR_LENGTH_SHIFT. |
291 * Bits 8..0: Contraction character, see comments on CONTR_CHAR_MASK. | 297 * Bits 8..0: Contraction character, see comments on CONTR_CHAR_MASK. |
292 * This is followed by 0, 1, or 2 uint16_t according to the length. | 298 * This is followed by 0, 1, or 2 uint16_t according to the length. |
293 * Each list is terminated by an entry with CONTR_CHAR_MASK. | 299 * Each list is terminated by an entry with CONTR_CHAR_MASK. |
294 * Each list starts with such an entry which also contains the default result | 300 * Each list starts with such an entry which also contains the default result |
295 * for when there is no contraction match. | 301 * for when there is no contraction match. |
| 302 * |
| 303 * ----------------- |
| 304 * Changes for version 2 (ICU 55) |
| 305 * |
| 306 * Special reorder groups do not necessarily start on whole primary lead bytes any more. |
| 307 * Therefore, the varTops data has a new format: |
| 308 * Version 1 stored the lead bytes of the highest root primaries for |
| 309 * the maxVariable-supported special reorder groups. |
| 310 * Now the top 16 bits would need to be stored, |
| 311 * and it is simpler to store only the fast-Latin weights. |
296 */ | 312 */ |
297 | 313 |
298 U_NAMESPACE_END | 314 U_NAMESPACE_END |
299 | 315 |
300 #endif // !UCONFIG_NO_COLLATION | 316 #endif // !UCONFIG_NO_COLLATION |
301 #endif // __COLLATIONFASTLATIN_H__ | 317 #endif // __COLLATIONFASTLATIN_H__ |
OLD | NEW |