OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationdata.cpp |
| 7 * |
| 8 * created on: 2012jul28 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/ucol.h" |
| 17 #include "unicode/udata.h" |
| 18 #include "unicode/uscript.h" |
| 19 #include "cmemory.h" |
| 20 #include "collation.h" |
| 21 #include "collationdata.h" |
| 22 #include "uassert.h" |
| 23 #include "utrie2.h" |
| 24 |
| 25 U_NAMESPACE_BEGIN |
| 26 |
| 27 uint32_t |
| 28 CollationData::getIndirectCE32(uint32_t ce32) const { |
| 29 U_ASSERT(Collation::isSpecialCE32(ce32)); |
| 30 int32_t tag = Collation::tagFromCE32(ce32); |
| 31 if(tag == Collation::DIGIT_TAG) { |
| 32 // Fetch the non-numeric-collation CE32. |
| 33 ce32 = ce32s[Collation::indexFromCE32(ce32)]; |
| 34 } else if(tag == Collation::LEAD_SURROGATE_TAG) { |
| 35 ce32 = Collation::UNASSIGNED_CE32; |
| 36 } else if(tag == Collation::U0000_TAG) { |
| 37 // Fetch the normal ce32 for U+0000. |
| 38 ce32 = ce32s[0]; |
| 39 } |
| 40 return ce32; |
| 41 } |
| 42 |
| 43 uint32_t |
| 44 CollationData::getFinalCE32(uint32_t ce32) const { |
| 45 if(Collation::isSpecialCE32(ce32)) { |
| 46 ce32 = getIndirectCE32(ce32); |
| 47 } |
| 48 return ce32; |
| 49 } |
| 50 |
| 51 int64_t |
| 52 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const { |
| 53 if(U_FAILURE(errorCode)) { return 0; } |
| 54 // Keep parallel with CollationDataBuilder::getSingleCE(). |
| 55 const CollationData *d; |
| 56 uint32_t ce32 = getCE32(c); |
| 57 if(ce32 == Collation::FALLBACK_CE32) { |
| 58 d = base; |
| 59 ce32 = base->getCE32(c); |
| 60 } else { |
| 61 d = this; |
| 62 } |
| 63 while(Collation::isSpecialCE32(ce32)) { |
| 64 switch(Collation::tagFromCE32(ce32)) { |
| 65 case Collation::LATIN_EXPANSION_TAG: |
| 66 case Collation::BUILDER_DATA_TAG: |
| 67 case Collation::PREFIX_TAG: |
| 68 case Collation::CONTRACTION_TAG: |
| 69 case Collation::HANGUL_TAG: |
| 70 case Collation::LEAD_SURROGATE_TAG: |
| 71 errorCode = U_UNSUPPORTED_ERROR; |
| 72 return 0; |
| 73 case Collation::FALLBACK_TAG: |
| 74 case Collation::RESERVED_TAG_3: |
| 75 errorCode = U_INTERNAL_PROGRAM_ERROR; |
| 76 return 0; |
| 77 case Collation::LONG_PRIMARY_TAG: |
| 78 return Collation::ceFromLongPrimaryCE32(ce32); |
| 79 case Collation::LONG_SECONDARY_TAG: |
| 80 return Collation::ceFromLongSecondaryCE32(ce32); |
| 81 case Collation::EXPANSION32_TAG: |
| 82 if(Collation::lengthFromCE32(ce32) == 1) { |
| 83 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; |
| 84 break; |
| 85 } else { |
| 86 errorCode = U_UNSUPPORTED_ERROR; |
| 87 return 0; |
| 88 } |
| 89 case Collation::EXPANSION_TAG: { |
| 90 if(Collation::lengthFromCE32(ce32) == 1) { |
| 91 return d->ces[Collation::indexFromCE32(ce32)]; |
| 92 } else { |
| 93 errorCode = U_UNSUPPORTED_ERROR; |
| 94 return 0; |
| 95 } |
| 96 } |
| 97 case Collation::DIGIT_TAG: |
| 98 // Fetch the non-numeric-collation CE32 and continue. |
| 99 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; |
| 100 break; |
| 101 case Collation::U0000_TAG: |
| 102 U_ASSERT(c == 0); |
| 103 // Fetch the normal ce32 for U+0000 and continue. |
| 104 ce32 = d->ce32s[0]; |
| 105 break; |
| 106 case Collation::OFFSET_TAG: |
| 107 return d->getCEFromOffsetCE32(c, ce32); |
| 108 case Collation::IMPLICIT_TAG: |
| 109 return Collation::unassignedCEFromCodePoint(c); |
| 110 } |
| 111 } |
| 112 return Collation::ceFromSimpleCE32(ce32); |
| 113 } |
| 114 |
| 115 uint32_t |
| 116 CollationData::getFirstPrimaryForGroup(int32_t script) const { |
| 117 int32_t index = findScript(script); |
| 118 if(index < 0) { |
| 119 return 0; |
| 120 } |
| 121 uint32_t head = scripts[index]; |
| 122 return (head & 0xff00) << 16; |
| 123 } |
| 124 |
| 125 uint32_t |
| 126 CollationData::getLastPrimaryForGroup(int32_t script) const { |
| 127 int32_t index = findScript(script); |
| 128 if(index < 0) { |
| 129 return 0; |
| 130 } |
| 131 uint32_t head = scripts[index]; |
| 132 uint32_t lastByte = head & 0xff; |
| 133 return ((lastByte + 1) << 24) - 1; |
| 134 } |
| 135 |
| 136 int32_t |
| 137 CollationData::getGroupForPrimary(uint32_t p) const { |
| 138 p >>= 24; // Reordering groups are distinguished by primary lead bytes. |
| 139 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { |
| 140 uint32_t lastByte = scripts[i] & 0xff; |
| 141 if(p <= lastByte) { |
| 142 return scripts[i + 2]; |
| 143 } |
| 144 } |
| 145 return -1; |
| 146 } |
| 147 |
| 148 int32_t |
| 149 CollationData::findScript(int32_t script) const { |
| 150 if(script < 0 || 0xffff < script) { return -1; } |
| 151 for(int32_t i = 0; i < scriptsLength;) { |
| 152 int32_t limit = i + 2 + scripts[i + 1]; |
| 153 for(int32_t j = i + 2; j < limit; ++j) { |
| 154 if(script == scripts[j]) { return i; } |
| 155 } |
| 156 i = limit; |
| 157 } |
| 158 return -1; |
| 159 } |
| 160 |
| 161 int32_t |
| 162 CollationData::getEquivalentScripts(int32_t script, |
| 163 int32_t dest[], int32_t capacity, |
| 164 UErrorCode &errorCode) const { |
| 165 if(U_FAILURE(errorCode)) { return 0; } |
| 166 int32_t i = findScript(script); |
| 167 if(i < 0) { return 0; } |
| 168 int32_t length = scripts[i + 1]; |
| 169 U_ASSERT(length != 0); |
| 170 if(length > capacity) { |
| 171 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 172 return length; |
| 173 } |
| 174 i += 2; |
| 175 dest[0] = scripts[i++]; |
| 176 for(int32_t j = 1; j < length; ++j) { |
| 177 script = scripts[i++]; |
| 178 // Sorted insertion. |
| 179 for(int32_t k = j;; --k) { |
| 180 // Invariant: dest[k] is free to receive either script or dest[k - 1
]. |
| 181 if(k > 0 && script < dest[k - 1]) { |
| 182 dest[k] = dest[k - 1]; |
| 183 } else { |
| 184 dest[k] = script; |
| 185 break; |
| 186 } |
| 187 } |
| 188 } |
| 189 return length; |
| 190 } |
| 191 |
| 192 void |
| 193 CollationData::makeReorderTable(const int32_t *reorder, int32_t length, |
| 194 uint8_t table[256], UErrorCode &errorCode) const
{ |
| 195 if(U_FAILURE(errorCode)) { return; } |
| 196 |
| 197 // Initialize the table. |
| 198 // Never reorder special low and high primary lead bytes. |
| 199 int32_t lowByte; |
| 200 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { |
| 201 table[lowByte] = lowByte; |
| 202 } |
| 203 // lowByte == 03 |
| 204 |
| 205 int32_t highByte; |
| 206 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { |
| 207 table[highByte] = highByte; |
| 208 } |
| 209 // highByte == FE |
| 210 |
| 211 // Set intermediate bytes to 0 to indicate that they have not been set yet. |
| 212 for(int32_t i = lowByte; i <= highByte; ++i) { |
| 213 table[i] = 0; |
| 214 } |
| 215 |
| 216 // Get the set of special reorder codes in the input list. |
| 217 // This supports up to 32 special reorder codes; |
| 218 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. |
| 219 uint32_t specials = 0; |
| 220 for(int32_t i = 0; i < length; ++i) { |
| 221 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; |
| 222 if(0 <= reorderCode && reorderCode <= 31) { |
| 223 specials |= (uint32_t)1 << reorderCode; |
| 224 } |
| 225 } |
| 226 |
| 227 // Start the reordering with the special low reorder codes that do not occur
in the input. |
| 228 for(int32_t i = 0;; i += 3) { |
| 229 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code r
eorder codes. |
| 230 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; |
| 231 if(reorderCode < 0) { break; } // Went beyond special reorder codes. |
| 232 if((specials & ((uint32_t)1 << reorderCode)) == 0) { |
| 233 int32_t head = scripts[i]; |
| 234 int32_t firstByte = head >> 8; |
| 235 int32_t lastByte = head & 0xff; |
| 236 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
| 237 } |
| 238 } |
| 239 |
| 240 // Reorder according to the input scripts, continuing from the bottom of the
bytes range. |
| 241 for(int32_t i = 0; i < length;) { |
| 242 int32_t script = reorder[i++]; |
| 243 if(script == USCRIPT_UNKNOWN) { |
| 244 // Put the remaining scripts at the top. |
| 245 while(i < length) { |
| 246 script = reorder[--length]; |
| 247 if(script == USCRIPT_UNKNOWN || // Must occur at most once. |
| 248 script == UCOL_REORDER_CODE_DEFAULT) { |
| 249 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 250 return; |
| 251 } |
| 252 int32_t index = findScript(script); |
| 253 if(index < 0) { continue; } |
| 254 int32_t head = scripts[index]; |
| 255 int32_t firstByte = head >> 8; |
| 256 int32_t lastByte = head & 0xff; |
| 257 if(table[firstByte] != 0) { // Duplicate or equivalent script. |
| 258 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 259 return; |
| 260 } |
| 261 do { table[lastByte--] = highByte--; } while(firstByte <= lastBy
te); |
| 262 } |
| 263 break; |
| 264 } |
| 265 if(script == UCOL_REORDER_CODE_DEFAULT) { |
| 266 // The default code must be the only one in the list, and that is ha
ndled by the caller. |
| 267 // Otherwise it must not be used. |
| 268 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 269 return; |
| 270 } |
| 271 int32_t index = findScript(script); |
| 272 if(index < 0) { continue; } |
| 273 int32_t head = scripts[index]; |
| 274 int32_t firstByte = head >> 8; |
| 275 int32_t lastByte = head & 0xff; |
| 276 if(table[firstByte] != 0) { // Duplicate or equivalent script. |
| 277 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 278 return; |
| 279 } |
| 280 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
| 281 } |
| 282 |
| 283 // Put all remaining scripts into the middle. |
| 284 // Avoid table[0] which must remain 0. |
| 285 for(int32_t i = 1; i <= 0xff; ++i) { |
| 286 if(table[i] == 0) { table[i] = lowByte++; } |
| 287 } |
| 288 U_ASSERT(lowByte == highByte + 1); |
| 289 } |
| 290 |
| 291 U_NAMESPACE_END |
| 292 |
| 293 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |