| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines | 3 * Copyright (C) 2012-2015, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 * collationdata.cpp | 6 * collationdata.cpp |
| 7 * | 7 * |
| 8 * created on: 2012jul28 | 8 * created on: 2012jul28 |
| 9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
| 10 */ | 10 */ |
| 11 | 11 |
| 12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
| 13 | 13 |
| 14 #if !UCONFIG_NO_COLLATION | 14 #if !UCONFIG_NO_COLLATION |
| 15 | 15 |
| 16 #include "unicode/ucol.h" | 16 #include "unicode/ucol.h" |
| 17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
| 18 #include "unicode/uscript.h" | 18 #include "unicode/uscript.h" |
| 19 #include "cmemory.h" | 19 #include "cmemory.h" |
| 20 #include "collation.h" | 20 #include "collation.h" |
| 21 #include "collationdata.h" | 21 #include "collationdata.h" |
| 22 #include "uassert.h" | 22 #include "uassert.h" |
| 23 #include "utrie2.h" | 23 #include "utrie2.h" |
| 24 #include "uvectr32.h" |
| 24 | 25 |
| 25 U_NAMESPACE_BEGIN | 26 U_NAMESPACE_BEGIN |
| 26 | 27 |
| 27 uint32_t | 28 uint32_t |
| 28 CollationData::getIndirectCE32(uint32_t ce32) const { | 29 CollationData::getIndirectCE32(uint32_t ce32) const { |
| 29 U_ASSERT(Collation::isSpecialCE32(ce32)); | 30 U_ASSERT(Collation::isSpecialCE32(ce32)); |
| 30 int32_t tag = Collation::tagFromCE32(ce32); | 31 int32_t tag = Collation::tagFromCE32(ce32); |
| 31 if(tag == Collation::DIGIT_TAG) { | 32 if(tag == Collation::DIGIT_TAG) { |
| 32 // Fetch the non-numeric-collation CE32. | 33 // Fetch the non-numeric-collation CE32. |
| 33 ce32 = ce32s[Collation::indexFromCE32(ce32)]; | 34 ce32 = ce32s[Collation::indexFromCE32(ce32)]; |
| (...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 107 return d->getCEFromOffsetCE32(c, ce32); | 108 return d->getCEFromOffsetCE32(c, ce32); |
| 108 case Collation::IMPLICIT_TAG: | 109 case Collation::IMPLICIT_TAG: |
| 109 return Collation::unassignedCEFromCodePoint(c); | 110 return Collation::unassignedCEFromCodePoint(c); |
| 110 } | 111 } |
| 111 } | 112 } |
| 112 return Collation::ceFromSimpleCE32(ce32); | 113 return Collation::ceFromSimpleCE32(ce32); |
| 113 } | 114 } |
| 114 | 115 |
| 115 uint32_t | 116 uint32_t |
| 116 CollationData::getFirstPrimaryForGroup(int32_t script) const { | 117 CollationData::getFirstPrimaryForGroup(int32_t script) const { |
| 117 int32_t index = findScript(script); | 118 int32_t index = getScriptIndex(script); |
| 118 if(index < 0) { | 119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16; |
| 119 return 0; | |
| 120 } | |
| 121 uint32_t head = scripts[index]; | |
| 122 return (head & 0xff00) << 16; | |
| 123 } | 120 } |
| 124 | 121 |
| 125 uint32_t | 122 uint32_t |
| 126 CollationData::getLastPrimaryForGroup(int32_t script) const { | 123 CollationData::getLastPrimaryForGroup(int32_t script) const { |
| 127 int32_t index = findScript(script); | 124 int32_t index = getScriptIndex(script); |
| 128 if(index < 0) { | 125 if(index == 0) { |
| 129 return 0; | 126 return 0; |
| 130 } | 127 } |
| 131 uint32_t head = scripts[index]; | 128 uint32_t limit = scriptStarts[index + 1]; |
| 132 uint32_t lastByte = head & 0xff; | 129 return (limit << 16) - 1; |
| 133 return ((lastByte + 1) << 24) - 1; | |
| 134 } | 130 } |
| 135 | 131 |
| 136 int32_t | 132 int32_t |
| 137 CollationData::getGroupForPrimary(uint32_t p) const { | 133 CollationData::getGroupForPrimary(uint32_t p) const { |
| 138 p >>= 24; // Reordering groups are distinguished by primary lead bytes. | 134 p >>= 16; |
| 139 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { | 135 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) { |
| 140 uint32_t lastByte = scripts[i] & 0xff; | 136 return -1; |
| 141 if(p <= lastByte) { | 137 } |
| 142 return scripts[i + 2]; | 138 int32_t index = 1; |
| 139 while(p >= scriptStarts[index + 1]) { ++index; } |
| 140 for(int32_t i = 0; i < numScripts; ++i) { |
| 141 if(scriptsIndex[i] == index) { |
| 142 return i; |
| 143 } |
| 144 } |
| 145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
| 146 if(scriptsIndex[numScripts + i] == index) { |
| 147 return UCOL_REORDER_CODE_FIRST + i; |
| 143 } | 148 } |
| 144 } | 149 } |
| 145 return -1; | 150 return -1; |
| 146 } | 151 } |
| 147 | 152 |
| 148 int32_t | 153 int32_t |
| 149 CollationData::findScript(int32_t script) const { | 154 CollationData::getScriptIndex(int32_t script) const { |
| 150 if(script < 0 || 0xffff < script) { return -1; } | 155 if(script < 0) { |
| 151 for(int32_t i = 0; i < scriptsLength;) { | 156 return 0; |
| 152 int32_t limit = i + 2 + scripts[i + 1]; | 157 } else if(script < numScripts) { |
| 153 for(int32_t j = i + 2; j < limit; ++j) { | 158 return scriptsIndex[script]; |
| 154 if(script == scripts[j]) { return i; } | 159 } else if(script < UCOL_REORDER_CODE_FIRST) { |
| 160 return 0; |
| 161 } else { |
| 162 script -= UCOL_REORDER_CODE_FIRST; |
| 163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { |
| 164 return scriptsIndex[numScripts + script]; |
| 165 } else { |
| 166 return 0; |
| 155 } | 167 } |
| 156 i = limit; | |
| 157 } | 168 } |
| 158 return -1; | |
| 159 } | 169 } |
| 160 | 170 |
| 161 int32_t | 171 int32_t |
| 162 CollationData::getEquivalentScripts(int32_t script, | 172 CollationData::getEquivalentScripts(int32_t script, |
| 163 int32_t dest[], int32_t capacity, | 173 int32_t dest[], int32_t capacity, |
| 164 UErrorCode &errorCode) const { | 174 UErrorCode &errorCode) const { |
| 165 if(U_FAILURE(errorCode)) { return 0; } | 175 if(U_FAILURE(errorCode)) { return 0; } |
| 166 int32_t i = findScript(script); | 176 int32_t index = getScriptIndex(script); |
| 167 if(i < 0) { return 0; } | 177 if(index == 0) { return 0; } |
| 168 int32_t length = scripts[i + 1]; | 178 if(script >= UCOL_REORDER_CODE_FIRST) { |
| 169 U_ASSERT(length != 0); | 179 // Special groups have no aliases. |
| 180 if(capacity > 0) { |
| 181 dest[0] = script; |
| 182 } else { |
| 183 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 184 } |
| 185 return 1; |
| 186 } |
| 187 |
| 188 int32_t length = 0; |
| 189 for(int32_t i = 0; i < numScripts; ++i) { |
| 190 if(scriptsIndex[i] == index) { |
| 191 if(length < capacity) { |
| 192 dest[length] = i; |
| 193 } |
| 194 ++length; |
| 195 } |
| 196 } |
| 170 if(length > capacity) { | 197 if(length > capacity) { |
| 171 errorCode = U_BUFFER_OVERFLOW_ERROR; | 198 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 172 return length; | |
| 173 } | |
| 174 i += 2; | |
| 175 dest[0] = scripts[i++]; | |
| 176 for(int32_t j = 1; j < length; ++j) { | |
| 177 script = scripts[i++]; | |
| 178 // Sorted insertion. | |
| 179 for(int32_t k = j;; --k) { | |
| 180 // Invariant: dest[k] is free to receive either script or dest[k - 1
]. | |
| 181 if(k > 0 && script < dest[k - 1]) { | |
| 182 dest[k] = dest[k - 1]; | |
| 183 } else { | |
| 184 dest[k] = script; | |
| 185 break; | |
| 186 } | |
| 187 } | |
| 188 } | 199 } |
| 189 return length; | 200 return length; |
| 190 } | 201 } |
| 191 | 202 |
| 192 void | 203 void |
| 193 CollationData::makeReorderTable(const int32_t *reorder, int32_t length, | 204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
| 194 uint8_t table[256], UErrorCode &errorCode) const
{ | 205 UVector32 &ranges, UErrorCode &errorCode) const
{ |
| 206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode); |
| 207 } |
| 208 |
| 209 void |
| 210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
| 211 UBool latinMustMove, |
| 212 UVector32 &ranges, UErrorCode &errorCode) const
{ |
| 195 if(U_FAILURE(errorCode)) { return; } | 213 if(U_FAILURE(errorCode)) { return; } |
| 196 | 214 ranges.removeAllElements(); |
| 197 // Initialize the table. | 215 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) { |
| 198 // Never reorder special low and high primary lead bytes. | 216 return; |
| 199 int32_t lowByte; | |
| 200 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { | |
| 201 table[lowByte] = lowByte; | |
| 202 } | |
| 203 // lowByte == 03 | |
| 204 | |
| 205 int32_t highByte; | |
| 206 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { | |
| 207 table[highByte] = highByte; | |
| 208 } | |
| 209 // highByte == FE | |
| 210 | |
| 211 // Set intermediate bytes to 0 to indicate that they have not been set yet. | |
| 212 for(int32_t i = lowByte; i <= highByte; ++i) { | |
| 213 table[i] = 0; | |
| 214 } | 217 } |
| 215 | 218 |
| 219 // Maps each script-or-group range to a new lead byte. |
| 220 uint8_t table[MAX_NUM_SCRIPT_RANGES]; |
| 221 uprv_memset(table, 0, sizeof(table)); |
| 222 |
| 223 { |
| 224 // Set "don't care" values for reserved ranges. |
| 225 int32_t index = scriptsIndex[ |
| 226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_F
IRST]; |
| 227 if(index != 0) { |
| 228 table[index] = 0xff; |
| 229 } |
| 230 index = scriptsIndex[ |
| 231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FI
RST]; |
| 232 if(index != 0) { |
| 233 table[index] = 0xff; |
| 234 } |
| 235 } |
| 236 |
| 237 // Never reorder special low and high primary lead bytes. |
| 238 U_ASSERT(scriptStartsLength >= 2); |
| 239 U_ASSERT(scriptStarts[0] == 0); |
| 240 int32_t lowStart = scriptStarts[1]; |
| 241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8)); |
| 242 int32_t highLimit = scriptStarts[scriptStartsLength - 1]; |
| 243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8)); |
| 244 |
| 216 // Get the set of special reorder codes in the input list. | 245 // Get the set of special reorder codes in the input list. |
| 217 // This supports up to 32 special reorder codes; | 246 // This supports a fixed number of special reorder codes; |
| 218 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. | 247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. |
| 219 uint32_t specials = 0; | 248 uint32_t specials = 0; |
| 220 for(int32_t i = 0; i < length; ++i) { | 249 for(int32_t i = 0; i < length; ++i) { |
| 221 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; | 250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; |
| 222 if(0 <= reorderCode && reorderCode <= 31) { | 251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { |
| 223 specials |= (uint32_t)1 << reorderCode; | 252 specials |= (uint32_t)1 << reorderCode; |
| 224 } | 253 } |
| 225 } | 254 } |
| 226 | 255 |
| 227 // Start the reordering with the special low reorder codes that do not occur
in the input. | 256 // Start the reordering with the special low reorder codes that do not occur
in the input. |
| 228 for(int32_t i = 0;; i += 3) { | 257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
| 229 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code r
eorder codes. | 258 int32_t index = scriptsIndex[numScripts + i]; |
| 230 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; | 259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) { |
| 231 if(reorderCode < 0) { break; } // Went beyond special reorder codes. | 260 lowStart = addLowScriptRange(table, index, lowStart); |
| 232 if((specials & ((uint32_t)1 << reorderCode)) == 0) { | |
| 233 int32_t head = scripts[i]; | |
| 234 int32_t firstByte = head >> 8; | |
| 235 int32_t lastByte = head & 0xff; | |
| 236 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | |
| 237 } | 261 } |
| 238 } | 262 } |
| 239 | 263 |
| 240 // Reorder according to the input scripts, continuing from the bottom of the
bytes range. | 264 // Skip the reserved range before Latin if Latin is the first script, |
| 265 // so that we do not move it unnecessarily. |
| 266 int32_t skippedReserved = 0; |
| 267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) { |
| 268 int32_t index = scriptsIndex[USCRIPT_LATIN]; |
| 269 U_ASSERT(index != 0); |
| 270 int32_t start = scriptStarts[index]; |
| 271 U_ASSERT(lowStart <= start); |
| 272 skippedReserved = start - lowStart; |
| 273 lowStart = start; |
| 274 } |
| 275 |
| 276 // Reorder according to the input scripts, continuing from the bottom of the
primary range. |
| 277 int32_t originalLength = length; // length will be decremented if "others"
is in the list. |
| 278 UBool hasReorderToEnd = FALSE; |
| 241 for(int32_t i = 0; i < length;) { | 279 for(int32_t i = 0; i < length;) { |
| 242 int32_t script = reorder[i++]; | 280 int32_t script = reorder[i++]; |
| 243 if(script == USCRIPT_UNKNOWN) { | 281 if(script == USCRIPT_UNKNOWN) { |
| 244 // Put the remaining scripts at the top. | 282 // Put the remaining scripts at the top. |
| 283 hasReorderToEnd = TRUE; |
| 245 while(i < length) { | 284 while(i < length) { |
| 246 script = reorder[--length]; | 285 script = reorder[--length]; |
| 247 if(script == USCRIPT_UNKNOWN || // Must occur at most once. | 286 if(script == USCRIPT_UNKNOWN || // Must occur at most once. |
| 248 script == UCOL_REORDER_CODE_DEFAULT) { | 287 script == UCOL_REORDER_CODE_DEFAULT) { |
| 249 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 288 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 250 return; | 289 return; |
| 251 } | 290 } |
| 252 int32_t index = findScript(script); | 291 int32_t index = getScriptIndex(script); |
| 253 if(index < 0) { continue; } | 292 if(index == 0) { continue; } |
| 254 int32_t head = scripts[index]; | 293 if(table[index] != 0) { // Duplicate or equivalent script. |
| 255 int32_t firstByte = head >> 8; | |
| 256 int32_t lastByte = head & 0xff; | |
| 257 if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
| 258 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 294 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 259 return; | 295 return; |
| 260 } | 296 } |
| 261 do { table[lastByte--] = highByte--; } while(firstByte <= lastBy
te); | 297 highLimit = addHighScriptRange(table, index, highLimit); |
| 262 } | 298 } |
| 263 break; | 299 break; |
| 264 } | 300 } |
| 265 if(script == UCOL_REORDER_CODE_DEFAULT) { | 301 if(script == UCOL_REORDER_CODE_DEFAULT) { |
| 266 // The default code must be the only one in the list, and that is ha
ndled by the caller. | 302 // The default code must be the only one in the list, and that is ha
ndled by the caller. |
| 267 // Otherwise it must not be used. | 303 // Otherwise it must not be used. |
| 268 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 304 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 269 return; | 305 return; |
| 270 } | 306 } |
| 271 int32_t index = findScript(script); | 307 int32_t index = getScriptIndex(script); |
| 272 if(index < 0) { continue; } | 308 if(index == 0) { continue; } |
| 273 int32_t head = scripts[index]; | 309 if(table[index] != 0) { // Duplicate or equivalent script. |
| 274 int32_t firstByte = head >> 8; | |
| 275 int32_t lastByte = head & 0xff; | |
| 276 if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
| 277 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 310 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 278 return; | 311 return; |
| 279 } | 312 } |
| 280 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | 313 lowStart = addLowScriptRange(table, index, lowStart); |
| 281 } | 314 } |
| 282 | 315 |
| 283 // Put all remaining scripts into the middle. | 316 // Put all remaining scripts into the middle. |
| 284 // Avoid table[0] which must remain 0. | 317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) { |
| 285 for(int32_t i = 1; i <= 0xff; ++i) { | 318 int32_t leadByte = table[i]; |
| 286 if(table[i] == 0) { table[i] = lowByte++; } | 319 if(leadByte != 0) { continue; } |
| 320 int32_t start = scriptStarts[i]; |
| 321 if(!hasReorderToEnd && start > lowStart) { |
| 322 // No need to move this script. |
| 323 lowStart = start; |
| 324 } |
| 325 lowStart = addLowScriptRange(table, i, lowStart); |
| 287 } | 326 } |
| 288 U_ASSERT(lowByte == highByte + 1); | 327 if(lowStart > highLimit) { |
| 328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { |
| 329 // Try not skipping the before-Latin reserved range. |
| 330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode); |
| 331 return; |
| 332 } |
| 333 // We need more primary lead bytes than available, despite the reserved
ranges. |
| 334 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 335 return; |
| 336 } |
| 337 |
| 338 // Turn lead bytes into a list of (limit, offset) pairs. |
| 339 // Encode each pair in one list element: |
| 340 // Upper 16 bits = limit, lower 16 = signed lead byte offset. |
| 341 int32_t offset = 0; |
| 342 for(int32_t i = 1;; ++i) { |
| 343 int32_t nextOffset = offset; |
| 344 while(i < scriptStartsLength - 1) { |
| 345 int32_t newLeadByte = table[i]; |
| 346 if(newLeadByte == 0xff) { |
| 347 // "Don't care" lead byte for reserved range, continue with curr
ent offset. |
| 348 } else { |
| 349 nextOffset = newLeadByte - (scriptStarts[i] >> 8); |
| 350 if(nextOffset != offset) { break; } |
| 351 } |
| 352 ++i; |
| 353 } |
| 354 if(offset != 0 || i < scriptStartsLength - 1) { |
| 355 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xfff
f), errorCode); |
| 356 } |
| 357 if(i == scriptStartsLength - 1) { break; } |
| 358 offset = nextOffset; |
| 359 } |
| 360 } |
| 361 |
| 362 int32_t |
| 363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStar
t) const { |
| 364 int32_t start = scriptStarts[index]; |
| 365 if((start & 0xff) < (lowStart & 0xff)) { |
| 366 lowStart += 0x100; |
| 367 } |
| 368 table[index] = (uint8_t)(lowStart >> 8); |
| 369 int32_t limit = scriptStarts[index + 1]; |
| 370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (
limit & 0xff); |
| 371 return lowStart; |
| 372 } |
| 373 |
| 374 int32_t |
| 375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLi
mit) const { |
| 376 int32_t limit = scriptStarts[index + 1]; |
| 377 if((limit & 0xff) > (highLimit & 0xff)) { |
| 378 highLimit -= 0x100; |
| 379 } |
| 380 int32_t start = scriptStarts[index]; |
| 381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) |
(start & 0xff); |
| 382 table[index] = (uint8_t)(highLimit >> 8); |
| 383 return highLimit; |
| 289 } | 384 } |
| 290 | 385 |
| 291 U_NAMESPACE_END | 386 U_NAMESPACE_END |
| 292 | 387 |
| 293 #endif // !UCONFIG_NO_COLLATION | 388 #endif // !UCONFIG_NO_COLLATION |
| OLD | NEW |