OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationfastlatin.cpp | 6 * collationfastlatin.cpp |
7 * | 7 * |
8 * created on: 2013aug18 | 8 * created on: 2013aug18 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_COLLATION | 14 #if !UCONFIG_NO_COLLATION |
15 | 15 |
16 #include "unicode/ucol.h" | 16 #include "unicode/ucol.h" |
17 #include "collationdata.h" | 17 #include "collationdata.h" |
18 #include "collationfastlatin.h" | 18 #include "collationfastlatin.h" |
19 #include "collationsettings.h" | 19 #include "collationsettings.h" |
20 #include "putilimp.h" // U_ALIGN_CODE | |
21 #include "uassert.h" | 20 #include "uassert.h" |
22 | 21 |
23 U_NAMESPACE_BEGIN | 22 U_NAMESPACE_BEGIN |
24 | 23 |
25 int32_t | 24 int32_t |
26 CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
s &settings, | 25 CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
s &settings, |
27 uint16_t *primaries, int32_t capacity) { | 26 uint16_t *primaries, int32_t capacity) { |
28 const uint16_t *table = data->fastLatinTable; | 27 const uint16_t *table = data->fastLatinTable; |
29 if(table == NULL) { return -1; } | 28 if(table == NULL) { return -1; } |
30 U_ASSERT(capacity == LATIN_LIMIT); | 29 U_ASSERT(capacity == LATIN_LIMIT); |
31 if(capacity != LATIN_LIMIT) { return -1; } | 30 if(capacity != LATIN_LIMIT) { return -1; } |
32 | 31 |
33 uint32_t miniVarTop; | 32 uint32_t miniVarTop; |
34 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { | 33 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { |
35 // No mini primaries are variable, set a variableTop just below the | 34 // No mini primaries are variable, set a variableTop just below the |
36 // lowest long mini primary. | 35 // lowest long mini primary. |
37 miniVarTop = MIN_LONG - 1; | 36 miniVarTop = MIN_LONG - 1; |
38 } else { | 37 } else { |
39 uint32_t v1 = settings.variableTop >> 24; | |
40 int32_t headerLength = *table & 0xff; | 38 int32_t headerLength = *table & 0xff; |
41 int32_t i = headerLength - 1; | 39 int32_t i = 1 + settings.getMaxVariable(); |
42 if(i <= 0 || v1 > (table[i] & 0x7fu)) { | 40 if(i >= headerLength) { |
43 return -1; // variableTop >= digits, should not occur | 41 return -1; // variableTop >= digits, should not occur |
44 } | 42 } |
45 while(i > 1 && v1 <= (table[i - 1] & 0x7fu)) { --i; } | 43 miniVarTop = table[i]; |
46 // In the table header, the miniVarTop is in bits 15..7, with 4 zero bit
s 19..16 implied. | |
47 // Shift right to make it comparable with long mini primaries in bits 15
..3. | |
48 miniVarTop = (table[i] & 0xff80) >> 4; | |
49 } | 44 } |
50 | 45 |
51 const uint8_t *reorderTable = settings.reorderTable; | 46 UBool digitsAreReordered = FALSE; |
52 if(reorderTable != NULL) { | 47 if(settings.hasReordering()) { |
53 const uint16_t *scripts = data->scripts; | 48 uint32_t prevStart = 0; |
54 int32_t length = data->scriptsLength; | 49 uint32_t beforeDigitStart = 0; |
55 uint32_t prevLastByte = 0; | 50 uint32_t digitStart = 0; |
56 for(int32_t i = 0; i < length;) { | 51 uint32_t afterDigitStart = 0; |
57 // reordered last byte of the group | 52 for(int32_t group = UCOL_REORDER_CODE_FIRST; |
58 uint32_t lastByte = reorderTable[scripts[i] & 0xff]; | 53 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL
_REORDER_CODES; |
59 if(lastByte < prevLastByte) { | 54 ++group) { |
60 // The permutation affects the groups up to Latin. | 55 uint32_t start = data->getFirstPrimaryForGroup(group); |
61 return -1; | 56 start = settings.reorder(start); |
| 57 if(group == UCOL_REORDER_CODE_DIGIT) { |
| 58 beforeDigitStart = prevStart; |
| 59 digitStart = start; |
| 60 } else if(start != 0) { |
| 61 if(start < prevStart) { |
| 62 // The permutation affects the groups up to Latin. |
| 63 return -1; |
| 64 } |
| 65 // In the future, there might be a special group between digits
& Latin. |
| 66 if(digitStart != 0 && afterDigitStart == 0 && prevStart == befor
eDigitStart) { |
| 67 afterDigitStart = start; |
| 68 } |
| 69 prevStart = start; |
62 } | 70 } |
63 if(scripts[i + 2] == USCRIPT_LATIN) { break; } | 71 } |
64 i = i + 2 + scripts[i + 1]; | 72 uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); |
65 prevLastByte = lastByte; | 73 latinStart = settings.reorder(latinStart); |
| 74 if(latinStart < prevStart) { |
| 75 return -1; |
| 76 } |
| 77 if(afterDigitStart == 0) { |
| 78 afterDigitStart = latinStart; |
| 79 } |
| 80 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { |
| 81 digitsAreReordered = TRUE; |
66 } | 82 } |
67 } | 83 } |
68 | 84 |
69 table += (table[0] & 0xff); // skip the header | 85 table += (table[0] & 0xff); // skip the header |
70 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { | 86 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { |
71 uint32_t p = table[c]; | 87 uint32_t p = table[c]; |
72 if(p >= MIN_SHORT) { | 88 if(p >= MIN_SHORT) { |
73 p &= SHORT_PRIMARY_MASK; | 89 p &= SHORT_PRIMARY_MASK; |
74 } else if(p > miniVarTop) { | 90 } else if(p > miniVarTop) { |
75 p &= LONG_PRIMARY_MASK; | 91 p &= LONG_PRIMARY_MASK; |
76 } else { | 92 } else { |
77 p = 0; | 93 p = 0; |
78 } | 94 } |
79 primaries[c] = (uint16_t)p; | 95 primaries[c] = (uint16_t)p; |
80 } | 96 } |
81 if((settings.options & CollationSettings::NUMERIC) != 0) { | 97 if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) !=
0) { |
82 // Bail out for digits. | 98 // Bail out for digits. |
83 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } | 99 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } |
84 } | 100 } |
85 | 101 |
86 // Shift the miniVarTop above other options. | 102 // Shift the miniVarTop above other options. |
87 return ((int32_t)miniVarTop << 16) | settings.options; | 103 return ((int32_t)miniVarTop << 16) | settings.options; |
88 } | 104 } |
89 | 105 |
90 int32_t | 106 int32_t |
91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primarie
s, int32_t options, | 107 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primarie
s, int32_t options, |
92 const UChar *left, int32_t leftLength, | 108 const UChar *left, int32_t leftLength, |
93 const UChar *right, int32_t rightLength) { | 109 const UChar *right, int32_t rightLength) { |
94 // This is a modified copy of CollationCompare::compareUpToQuaternary(), | 110 // This is a modified copy of CollationCompare::compareUpToQuaternary(), |
95 // optimized for common Latin text. | 111 // optimized for common Latin text. |
96 // Keep them in sync! | 112 // Keep them in sync! |
97 // Keep compareUTF16() and compareUTF8() in sync very closely! | 113 // Keep compareUTF16() and compareUTF8() in sync very closely! |
98 | 114 |
99 U_ASSERT((table[0] >> 8) == VERSION); | 115 U_ASSERT((table[0] >> 8) == VERSION); |
100 table += (table[0] & 0xff); // skip the header | 116 table += (table[0] & 0xff); // skip the header |
101 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() | 117 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() |
102 options &= 0xffff; // needed for CollationSettings::getStrength() to work | 118 options &= 0xffff; // needed for CollationSettings::getStrength() to work |
103 | 119 |
104 // Check for supported characters, fetch mini CEs, and compare primaries. | 120 // Check for supported characters, fetch mini CEs, and compare primaries. |
105 U_ALIGN_CODE(16); | |
106 int32_t leftIndex = 0, rightIndex = 0; | 121 int32_t leftIndex = 0, rightIndex = 0; |
107 /** | 122 /** |
108 * Single mini CE or a pair. | 123 * Single mini CE or a pair. |
109 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. | 124 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. |
110 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. | 125 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. |
111 */ | 126 */ |
112 uint32_t leftPair = 0, rightPair = 0; | 127 uint32_t leftPair = 0, rightPair = 0; |
113 for(;;) { | 128 for(;;) { |
114 // We fetch CEs until we get a non-ignorable primary or reach the end. | 129 // We fetch CEs until we get a non-ignorable primary or reach the end. |
115 while(leftPair == 0) { | 130 while(leftPair == 0) { |
(...skipping 316 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
432 const uint8_t *left, int32_t leftLength, | 447 const uint8_t *left, int32_t leftLength, |
433 const uint8_t *right, int32_t rightLength) { | 448 const uint8_t *right, int32_t rightLength) { |
434 // Keep compareUTF16() and compareUTF8() in sync very closely! | 449 // Keep compareUTF16() and compareUTF8() in sync very closely! |
435 | 450 |
436 U_ASSERT((table[0] >> 8) == VERSION); | 451 U_ASSERT((table[0] >> 8) == VERSION); |
437 table += (table[0] & 0xff); // skip the header | 452 table += (table[0] & 0xff); // skip the header |
438 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::g
etFastLatinOptions() | 453 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::g
etFastLatinOptions() |
439 options &= 0xffff; // needed for CollationSettings::getStrength() to work | 454 options &= 0xffff; // needed for CollationSettings::getStrength() to work |
440 | 455 |
441 // Check for supported characters, fetch mini CEs, and compare primaries. | 456 // Check for supported characters, fetch mini CEs, and compare primaries. |
442 U_ALIGN_CODE(16); | |
443 int32_t leftIndex = 0, rightIndex = 0; | 457 int32_t leftIndex = 0, rightIndex = 0; |
444 /** | 458 /** |
445 * Single mini CE or a pair. | 459 * Single mini CE or a pair. |
446 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. | 460 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. |
447 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. | 461 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. |
448 */ | 462 */ |
449 uint32_t leftPair = 0, rightPair = 0; | 463 uint32_t leftPair = 0, rightPair = 0; |
450 // Note: There is no need to assemble the code point. | 464 // Note: There is no need to assemble the code point. |
451 // We only need to look up the table entry for the character, | 465 // We only need to look up the table entry for the character, |
452 // and nextPair() looks for whether c==0. | 466 // and nextPair() looks for whether c==0. |
(...skipping 621 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1074 U_ASSERT(ce >= MIN_LONG); | 1088 U_ASSERT(ce >= MIN_LONG); |
1075 pair &= TWO_LONG_PRIMARIES_MASK; // variable | 1089 pair &= TWO_LONG_PRIMARIES_MASK; // variable |
1076 } | 1090 } |
1077 } | 1091 } |
1078 return pair; | 1092 return pair; |
1079 } | 1093 } |
1080 | 1094 |
1081 U_NAMESPACE_END | 1095 U_NAMESPACE_END |
1082 | 1096 |
1083 #endif // !UCONFIG_NO_COLLATION | 1097 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |