| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 * collationdatareader.cpp | 6 * collationdatareader.cpp |
| 7 * | 7 * |
| 8 * created on: 2013feb07 | 8 * created on: 2013feb07 |
| 9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
| 10 */ | 10 */ |
| 11 | 11 |
| 12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
| 13 | 13 |
| 14 #if !UCONFIG_NO_COLLATION | 14 #if !UCONFIG_NO_COLLATION |
| 15 | 15 |
| 16 #include "unicode/ucol.h" | 16 #include "unicode/ucol.h" |
| 17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
| 18 #include "unicode/uscript.h" | 18 #include "unicode/uscript.h" |
| 19 #include "cmemory.h" | 19 #include "cmemory.h" |
| 20 #include "collation.h" | 20 #include "collation.h" |
| 21 #include "collationdata.h" | 21 #include "collationdata.h" |
| 22 #include "collationdatareader.h" | 22 #include "collationdatareader.h" |
| 23 #include "collationfastlatin.h" | 23 #include "collationfastlatin.h" |
| 24 #include "collationkeys.h" | 24 #include "collationkeys.h" |
| 25 #include "collationrootelements.h" | 25 #include "collationrootelements.h" |
| 26 #include "collationsettings.h" | 26 #include "collationsettings.h" |
| 27 #include "collationtailoring.h" | 27 #include "collationtailoring.h" |
| 28 #include "collunsafe.h" |
| 28 #include "normalizer2impl.h" | 29 #include "normalizer2impl.h" |
| 29 #include "uassert.h" | 30 #include "uassert.h" |
| 30 #include "ucmndata.h" | 31 #include "ucmndata.h" |
| 31 #include "utrie2.h" | 32 #include "utrie2.h" |
| 32 | 33 |
| 33 U_NAMESPACE_BEGIN | 34 U_NAMESPACE_BEGIN |
| 34 | 35 |
| 35 namespace { | 36 namespace { |
| 36 | 37 |
| 37 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) { | 38 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) { |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 95 length = 0; // only indexes, and inLength was already checked for them | 96 length = 0; // only indexes, and inLength was already checked for them |
| 96 } | 97 } |
| 97 if(0 <= inLength && inLength < length) { | 98 if(0 <= inLength && inLength < length) { |
| 98 errorCode = U_INVALID_FORMAT_ERROR; | 99 errorCode = U_INVALID_FORMAT_ERROR; |
| 99 return; | 100 return; |
| 100 } | 101 } |
| 101 | 102 |
| 102 const CollationData *baseData = base == NULL ? NULL : base->data; | 103 const CollationData *baseData = base == NULL ? NULL : base->data; |
| 103 const int32_t *reorderCodes = NULL; | 104 const int32_t *reorderCodes = NULL; |
| 104 int32_t reorderCodesLength = 0; | 105 int32_t reorderCodesLength = 0; |
| 106 const uint32_t *reorderRanges = NULL; |
| 107 int32_t reorderRangesLength = 0; |
| 105 index = IX_REORDER_CODES_OFFSET; | 108 index = IX_REORDER_CODES_OFFSET; |
| 106 offset = getIndex(inIndexes, indexesLength, index); | 109 offset = getIndex(inIndexes, indexesLength, index); |
| 107 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 110 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 108 if(length >= 4) { | 111 if(length >= 4) { |
| 109 if(baseData == NULL) { | 112 if(baseData == NULL) { |
| 110 // We assume for collation settings that | 113 // We assume for collation settings that |
| 111 // the base data does not have a reordering. | 114 // the base data does not have a reordering. |
| 112 errorCode = U_INVALID_FORMAT_ERROR; | 115 errorCode = U_INVALID_FORMAT_ERROR; |
| 113 return; | 116 return; |
| 114 } | 117 } |
| 115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); | 118 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); |
| 116 reorderCodesLength = length / 4; | 119 reorderCodesLength = length / 4; |
| 120 |
| 121 // The reorderRanges (if any) are the trailing reorderCodes entries. |
| 122 // Split the array at the boundary. |
| 123 // Script or reorder codes do not exceed 16-bit values. |
| 124 // Range limits are stored in the upper 16 bits, and are never 0. |
| 125 while(reorderRangesLength < reorderCodesLength && |
| 126 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0x
ffff0000) != 0) { |
| 127 ++reorderRangesLength; |
| 128 } |
| 129 U_ASSERT(reorderRangesLength < reorderCodesLength); |
| 130 if(reorderRangesLength != 0) { |
| 131 reorderCodesLength -= reorderRangesLength; |
| 132 reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + re
orderCodesLength); |
| 133 } |
| 117 } | 134 } |
| 118 | 135 |
| 119 // There should be a reorder table only if there are reorder codes. | 136 // There should be a reorder table only if there are reorder codes. |
| 120 // However, when there are reorder codes the reorder table may be omitted to
reduce | 137 // However, when there are reorder codes the reorder table may be omitted to
reduce |
| 121 // the data size. | 138 // the data size. |
| 122 const uint8_t *reorderTable = NULL; | 139 const uint8_t *reorderTable = NULL; |
| 123 index = IX_REORDER_TABLE_OFFSET; | 140 index = IX_REORDER_TABLE_OFFSET; |
| 124 offset = getIndex(inIndexes, indexesLength, index); | 141 offset = getIndex(inIndexes, indexesLength, index); |
| 125 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 142 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 126 if(length >= 256) { | 143 if(length >= 256) { |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 239 | 256 |
| 240 index = IX_UNSAFE_BWD_OFFSET; | 257 index = IX_UNSAFE_BWD_OFFSET; |
| 241 offset = getIndex(inIndexes, indexesLength, index); | 258 offset = getIndex(inIndexes, indexesLength, index); |
| 242 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 259 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 243 if(length >= 2) { | 260 if(length >= 2) { |
| 244 if(data == NULL) { | 261 if(data == NULL) { |
| 245 errorCode = U_INVALID_FORMAT_ERROR; | 262 errorCode = U_INVALID_FORMAT_ERROR; |
| 246 return; | 263 return; |
| 247 } | 264 } |
| 248 if(baseData == NULL) { | 265 if(baseData == NULL) { |
| 266 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE) |
| 267 tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, un
safe_serializedCount, UnicodeSet::kSerialized, errorCode); |
| 268 if(tailoring.unsafeBackwardSet == NULL) { |
| 269 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 270 return; |
| 271 } else if (U_FAILURE(errorCode)) { |
| 272 return; |
| 273 } |
| 274 #else |
| 249 // Create the unsafe-backward set for the root collator. | 275 // Create the unsafe-backward set for the root collator. |
| 250 // Include all non-zero combining marks and trail surrogates. | 276 // Include all non-zero combining marks and trail surrogates. |
| 251 // We do this at load time, rather than at build time, | 277 // We do this at load time, rather than at build time, |
| 252 // to simplify Unicode version bootstrapping: | 278 // to simplify Unicode version bootstrapping: |
| 253 // The root data builder only needs the new FractionalUCA.txt data, | 279 // The root data builder only needs the new FractionalUCA.txt data, |
| 254 // but it need not be built with a version of ICU already updated to | 280 // but it need not be built with a version of ICU already updated to |
| 255 // the corresponding new Unicode Character Database. | 281 // the corresponding new Unicode Character Database. |
| 256 // | 282 // |
| 257 // The following is an optimized version of | 283 // The following is an optimized version of |
| 258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). | 284 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). |
| 259 // It is faster and requires fewer code dependencies. | 285 // It is faster and requires fewer code dependencies. |
| 260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr
ail surrogates | 286 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr
ail surrogates |
| 261 if(tailoring.unsafeBackwardSet == NULL) { | 287 if(tailoring.unsafeBackwardSet == NULL) { |
| 262 errorCode = U_MEMORY_ALLOCATION_ERROR; | 288 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 263 return; | 289 return; |
| 264 } | 290 } |
| 265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); | 291 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); |
| 292 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION |
| 266 } else { | 293 } else { |
| 267 // Clone the root collator's set contents. | 294 // Clone the root collator's set contents. |
| 268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( | 295 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( |
| 269 baseData->unsafeBackwardSet->cloneAsThawed()); | 296 baseData->unsafeBackwardSet->cloneAsThawed()); |
| 270 if(tailoring.unsafeBackwardSet == NULL) { | 297 if(tailoring.unsafeBackwardSet == NULL) { |
| 271 errorCode = U_MEMORY_ALLOCATION_ERROR; | 298 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 272 return; | 299 return; |
| 273 } | 300 } |
| 274 } | 301 } |
| 275 // Add the ranges from the data file to the unsafe-backward set. | 302 // Add the ranges from the data file to the unsafe-backward set. |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 330 } | 357 } |
| 331 | 358 |
| 332 index = IX_SCRIPTS_OFFSET; | 359 index = IX_SCRIPTS_OFFSET; |
| 333 offset = getIndex(inIndexes, indexesLength, index); | 360 offset = getIndex(inIndexes, indexesLength, index); |
| 334 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 361 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 335 if(length >= 2) { | 362 if(length >= 2) { |
| 336 if(data == NULL) { | 363 if(data == NULL) { |
| 337 errorCode = U_INVALID_FORMAT_ERROR; | 364 errorCode = U_INVALID_FORMAT_ERROR; |
| 338 return; | 365 return; |
| 339 } | 366 } |
| 340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset); | 367 const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + o
ffset); |
| 341 data->scriptsLength = length / 2; | 368 int32_t scriptsLength = length / 2; |
| 369 data->numScripts = scripts[0]; |
| 370 // There must be enough entries for both arrays, including more than two
range starts. |
| 371 data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16); |
| 372 if(data->scriptStartsLength <= 2 || |
| 373 CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength)
{ |
| 374 errorCode = U_INVALID_FORMAT_ERROR; |
| 375 return; |
| 376 } |
| 377 data->scriptsIndex = scripts + 1; |
| 378 data->scriptStarts = scripts + 1 + data->numScripts + 16; |
| 379 if(!(data->scriptStarts[0] == 0 && |
| 380 data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1)
<< 8) && |
| 381 data->scriptStarts[data->scriptStartsLength - 1] == |
| 382 (Collation::TRAIL_WEIGHT_BYTE << 8))) { |
| 383 errorCode = U_INVALID_FORMAT_ERROR; |
| 384 return; |
| 385 } |
| 342 } else if(data == NULL) { | 386 } else if(data == NULL) { |
| 343 // Nothing to do. | 387 // Nothing to do. |
| 344 } else if(baseData != NULL) { | 388 } else if(baseData != NULL) { |
| 345 data->scripts = baseData->scripts; | 389 data->numScripts = baseData->numScripts; |
| 346 data->scriptsLength = baseData->scriptsLength; | 390 data->scriptsIndex = baseData->scriptsIndex; |
| 391 data->scriptStarts = baseData->scriptStarts; |
| 392 data->scriptStartsLength = baseData->scriptStartsLength; |
| 347 } | 393 } |
| 348 | 394 |
| 349 index = IX_COMPRESSIBLE_BYTES_OFFSET; | 395 index = IX_COMPRESSIBLE_BYTES_OFFSET; |
| 350 offset = getIndex(inIndexes, indexesLength, index); | 396 offset = getIndex(inIndexes, indexesLength, index); |
| 351 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 397 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 352 if(length >= 256) { | 398 if(length >= 256) { |
| 353 if(data == NULL) { | 399 if(data == NULL) { |
| 354 errorCode = U_INVALID_FORMAT_ERROR; | 400 errorCode = U_INVALID_FORMAT_ERROR; |
| 355 return; | 401 return; |
| 356 } | 402 } |
| (...skipping 29 matching lines...) Expand all Loading... |
| 386 } | 432 } |
| 387 settings->options = options; | 433 settings->options = options; |
| 388 // Set variableTop from options and scripts data. | 434 // Set variableTop from options and scripts data. |
| 389 settings->variableTop = tailoring.data->getLastPrimaryForGroup( | 435 settings->variableTop = tailoring.data->getLastPrimaryForGroup( |
| 390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); | 436 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); |
| 391 if(settings->variableTop == 0) { | 437 if(settings->variableTop == 0) { |
| 392 errorCode = U_INVALID_FORMAT_ERROR; | 438 errorCode = U_INVALID_FORMAT_ERROR; |
| 393 return; | 439 return; |
| 394 } | 440 } |
| 395 | 441 |
| 396 if(reorderCodesLength == 0 || reorderTable != NULL) { | 442 if(reorderCodesLength != 0) { |
| 397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable
); | 443 settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength, |
| 398 } else { | 444 reorderRanges, reorderRangesLength, |
| 399 uint8_t table[256]; | 445 reorderTable, errorCode); |
| 400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, erro
rCode); | |
| 401 if(U_FAILURE(errorCode)) { return; } | |
| 402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) { | |
| 403 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
| 404 return; | |
| 405 } | |
| 406 } | 446 } |
| 407 | 447 |
| 408 settings->fastLatinOptions = CollationFastLatin::getOptions( | 448 settings->fastLatinOptions = CollationFastLatin::getOptions( |
| 409 tailoring.data, *settings, | 449 tailoring.data, *settings, |
| 410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries
)); | 450 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries
)); |
| 411 } | 451 } |
| 412 | 452 |
| 413 UBool U_CALLCONV | 453 UBool U_CALLCONV |
| 414 CollationDataReader::isAcceptable(void *context, | 454 CollationDataReader::isAcceptable(void *context, |
| 415 const char * /* type */, const char * /*name*/
, | 455 const char * /* type */, const char * /*name*/
, |
| 416 const UDataInfo *pInfo) { | 456 const UDataInfo *pInfo) { |
| 417 if( | 457 if( |
| 418 pInfo->size >= 20 && | 458 pInfo->size >= 20 && |
| 419 pInfo->isBigEndian == U_IS_BIG_ENDIAN && | 459 pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
| 420 pInfo->charsetFamily == U_CHARSET_FAMILY && | 460 pInfo->charsetFamily == U_CHARSET_FAMILY && |
| 421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" | 461 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" |
| 422 pInfo->dataFormat[1] == 0x43 && | 462 pInfo->dataFormat[1] == 0x43 && |
| 423 pInfo->dataFormat[2] == 0x6f && | 463 pInfo->dataFormat[2] == 0x6f && |
| 424 pInfo->dataFormat[3] == 0x6c && | 464 pInfo->dataFormat[3] == 0x6c && |
| 425 pInfo->formatVersion[0] == 4 | 465 pInfo->formatVersion[0] == 5 |
| 426 ) { | 466 ) { |
| 427 UVersionInfo *version = static_cast<UVersionInfo *>(context); | 467 UVersionInfo *version = static_cast<UVersionInfo *>(context); |
| 428 if(version != NULL) { | 468 if(version != NULL) { |
| 429 uprv_memcpy(version, pInfo->dataVersion, 4); | 469 uprv_memcpy(version, pInfo->dataVersion, 4); |
| 430 } | 470 } |
| 431 return TRUE; | 471 return TRUE; |
| 432 } else { | 472 } else { |
| 433 return FALSE; | 473 return FALSE; |
| 434 } | 474 } |
| 435 } | 475 } |
| 436 | 476 |
| 437 U_NAMESPACE_END | 477 U_NAMESPACE_END |
| 438 | 478 |
| 439 #endif // !UCONFIG_NO_COLLATION | 479 #endif // !UCONFIG_NO_COLLATION |
| OLD | NEW |