OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationdatareader.cpp | 6 * collationdatareader.cpp |
7 * | 7 * |
8 * created on: 2013feb07 | 8 * created on: 2013feb07 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_COLLATION | 14 #if !UCONFIG_NO_COLLATION |
15 | 15 |
16 #include "unicode/ucol.h" | 16 #include "unicode/ucol.h" |
17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
18 #include "unicode/uscript.h" | 18 #include "unicode/uscript.h" |
19 #include "cmemory.h" | 19 #include "cmemory.h" |
20 #include "collation.h" | 20 #include "collation.h" |
21 #include "collationdata.h" | 21 #include "collationdata.h" |
22 #include "collationdatareader.h" | 22 #include "collationdatareader.h" |
23 #include "collationfastlatin.h" | 23 #include "collationfastlatin.h" |
24 #include "collationkeys.h" | 24 #include "collationkeys.h" |
25 #include "collationrootelements.h" | 25 #include "collationrootelements.h" |
26 #include "collationsettings.h" | 26 #include "collationsettings.h" |
27 #include "collationtailoring.h" | 27 #include "collationtailoring.h" |
| 28 #include "collunsafe.h" |
28 #include "normalizer2impl.h" | 29 #include "normalizer2impl.h" |
29 #include "uassert.h" | 30 #include "uassert.h" |
30 #include "ucmndata.h" | 31 #include "ucmndata.h" |
31 #include "utrie2.h" | 32 #include "utrie2.h" |
32 | 33 |
33 U_NAMESPACE_BEGIN | 34 U_NAMESPACE_BEGIN |
34 | 35 |
35 namespace { | 36 namespace { |
36 | 37 |
37 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) { | 38 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) { |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 length = 0; // only indexes, and inLength was already checked for them | 96 length = 0; // only indexes, and inLength was already checked for them |
96 } | 97 } |
97 if(0 <= inLength && inLength < length) { | 98 if(0 <= inLength && inLength < length) { |
98 errorCode = U_INVALID_FORMAT_ERROR; | 99 errorCode = U_INVALID_FORMAT_ERROR; |
99 return; | 100 return; |
100 } | 101 } |
101 | 102 |
102 const CollationData *baseData = base == NULL ? NULL : base->data; | 103 const CollationData *baseData = base == NULL ? NULL : base->data; |
103 const int32_t *reorderCodes = NULL; | 104 const int32_t *reorderCodes = NULL; |
104 int32_t reorderCodesLength = 0; | 105 int32_t reorderCodesLength = 0; |
| 106 const uint32_t *reorderRanges = NULL; |
| 107 int32_t reorderRangesLength = 0; |
105 index = IX_REORDER_CODES_OFFSET; | 108 index = IX_REORDER_CODES_OFFSET; |
106 offset = getIndex(inIndexes, indexesLength, index); | 109 offset = getIndex(inIndexes, indexesLength, index); |
107 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 110 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
108 if(length >= 4) { | 111 if(length >= 4) { |
109 if(baseData == NULL) { | 112 if(baseData == NULL) { |
110 // We assume for collation settings that | 113 // We assume for collation settings that |
111 // the base data does not have a reordering. | 114 // the base data does not have a reordering. |
112 errorCode = U_INVALID_FORMAT_ERROR; | 115 errorCode = U_INVALID_FORMAT_ERROR; |
113 return; | 116 return; |
114 } | 117 } |
115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); | 118 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); |
116 reorderCodesLength = length / 4; | 119 reorderCodesLength = length / 4; |
| 120 |
| 121 // The reorderRanges (if any) are the trailing reorderCodes entries. |
| 122 // Split the array at the boundary. |
| 123 // Script or reorder codes do not exceed 16-bit values. |
| 124 // Range limits are stored in the upper 16 bits, and are never 0. |
| 125 while(reorderRangesLength < reorderCodesLength && |
| 126 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0x
ffff0000) != 0) { |
| 127 ++reorderRangesLength; |
| 128 } |
| 129 U_ASSERT(reorderRangesLength < reorderCodesLength); |
| 130 if(reorderRangesLength != 0) { |
| 131 reorderCodesLength -= reorderRangesLength; |
| 132 reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + re
orderCodesLength); |
| 133 } |
117 } | 134 } |
118 | 135 |
119 // There should be a reorder table only if there are reorder codes. | 136 // There should be a reorder table only if there are reorder codes. |
120 // However, when there are reorder codes the reorder table may be omitted to
reduce | 137 // However, when there are reorder codes the reorder table may be omitted to
reduce |
121 // the data size. | 138 // the data size. |
122 const uint8_t *reorderTable = NULL; | 139 const uint8_t *reorderTable = NULL; |
123 index = IX_REORDER_TABLE_OFFSET; | 140 index = IX_REORDER_TABLE_OFFSET; |
124 offset = getIndex(inIndexes, indexesLength, index); | 141 offset = getIndex(inIndexes, indexesLength, index); |
125 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 142 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
126 if(length >= 256) { | 143 if(length >= 256) { |
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
239 | 256 |
240 index = IX_UNSAFE_BWD_OFFSET; | 257 index = IX_UNSAFE_BWD_OFFSET; |
241 offset = getIndex(inIndexes, indexesLength, index); | 258 offset = getIndex(inIndexes, indexesLength, index); |
242 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 259 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
243 if(length >= 2) { | 260 if(length >= 2) { |
244 if(data == NULL) { | 261 if(data == NULL) { |
245 errorCode = U_INVALID_FORMAT_ERROR; | 262 errorCode = U_INVALID_FORMAT_ERROR; |
246 return; | 263 return; |
247 } | 264 } |
248 if(baseData == NULL) { | 265 if(baseData == NULL) { |
| 266 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE) |
| 267 tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, un
safe_serializedCount, UnicodeSet::kSerialized, errorCode); |
| 268 if(tailoring.unsafeBackwardSet == NULL) { |
| 269 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 270 return; |
| 271 } else if (U_FAILURE(errorCode)) { |
| 272 return; |
| 273 } |
| 274 #else |
249 // Create the unsafe-backward set for the root collator. | 275 // Create the unsafe-backward set for the root collator. |
250 // Include all non-zero combining marks and trail surrogates. | 276 // Include all non-zero combining marks and trail surrogates. |
251 // We do this at load time, rather than at build time, | 277 // We do this at load time, rather than at build time, |
252 // to simplify Unicode version bootstrapping: | 278 // to simplify Unicode version bootstrapping: |
253 // The root data builder only needs the new FractionalUCA.txt data, | 279 // The root data builder only needs the new FractionalUCA.txt data, |
254 // but it need not be built with a version of ICU already updated to | 280 // but it need not be built with a version of ICU already updated to |
255 // the corresponding new Unicode Character Database. | 281 // the corresponding new Unicode Character Database. |
256 // | 282 // |
257 // The following is an optimized version of | 283 // The following is an optimized version of |
258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). | 284 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). |
259 // It is faster and requires fewer code dependencies. | 285 // It is faster and requires fewer code dependencies. |
260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr
ail surrogates | 286 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr
ail surrogates |
261 if(tailoring.unsafeBackwardSet == NULL) { | 287 if(tailoring.unsafeBackwardSet == NULL) { |
262 errorCode = U_MEMORY_ALLOCATION_ERROR; | 288 errorCode = U_MEMORY_ALLOCATION_ERROR; |
263 return; | 289 return; |
264 } | 290 } |
265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); | 291 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); |
| 292 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION |
266 } else { | 293 } else { |
267 // Clone the root collator's set contents. | 294 // Clone the root collator's set contents. |
268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( | 295 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( |
269 baseData->unsafeBackwardSet->cloneAsThawed()); | 296 baseData->unsafeBackwardSet->cloneAsThawed()); |
270 if(tailoring.unsafeBackwardSet == NULL) { | 297 if(tailoring.unsafeBackwardSet == NULL) { |
271 errorCode = U_MEMORY_ALLOCATION_ERROR; | 298 errorCode = U_MEMORY_ALLOCATION_ERROR; |
272 return; | 299 return; |
273 } | 300 } |
274 } | 301 } |
275 // Add the ranges from the data file to the unsafe-backward set. | 302 // Add the ranges from the data file to the unsafe-backward set. |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
330 } | 357 } |
331 | 358 |
332 index = IX_SCRIPTS_OFFSET; | 359 index = IX_SCRIPTS_OFFSET; |
333 offset = getIndex(inIndexes, indexesLength, index); | 360 offset = getIndex(inIndexes, indexesLength, index); |
334 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 361 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
335 if(length >= 2) { | 362 if(length >= 2) { |
336 if(data == NULL) { | 363 if(data == NULL) { |
337 errorCode = U_INVALID_FORMAT_ERROR; | 364 errorCode = U_INVALID_FORMAT_ERROR; |
338 return; | 365 return; |
339 } | 366 } |
340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset); | 367 const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + o
ffset); |
341 data->scriptsLength = length / 2; | 368 int32_t scriptsLength = length / 2; |
| 369 data->numScripts = scripts[0]; |
| 370 // There must be enough entries for both arrays, including more than two
range starts. |
| 371 data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16); |
| 372 if(data->scriptStartsLength <= 2 || |
| 373 CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength)
{ |
| 374 errorCode = U_INVALID_FORMAT_ERROR; |
| 375 return; |
| 376 } |
| 377 data->scriptsIndex = scripts + 1; |
| 378 data->scriptStarts = scripts + 1 + data->numScripts + 16; |
| 379 if(!(data->scriptStarts[0] == 0 && |
| 380 data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1)
<< 8) && |
| 381 data->scriptStarts[data->scriptStartsLength - 1] == |
| 382 (Collation::TRAIL_WEIGHT_BYTE << 8))) { |
| 383 errorCode = U_INVALID_FORMAT_ERROR; |
| 384 return; |
| 385 } |
342 } else if(data == NULL) { | 386 } else if(data == NULL) { |
343 // Nothing to do. | 387 // Nothing to do. |
344 } else if(baseData != NULL) { | 388 } else if(baseData != NULL) { |
345 data->scripts = baseData->scripts; | 389 data->numScripts = baseData->numScripts; |
346 data->scriptsLength = baseData->scriptsLength; | 390 data->scriptsIndex = baseData->scriptsIndex; |
| 391 data->scriptStarts = baseData->scriptStarts; |
| 392 data->scriptStartsLength = baseData->scriptStartsLength; |
347 } | 393 } |
348 | 394 |
349 index = IX_COMPRESSIBLE_BYTES_OFFSET; | 395 index = IX_COMPRESSIBLE_BYTES_OFFSET; |
350 offset = getIndex(inIndexes, indexesLength, index); | 396 offset = getIndex(inIndexes, indexesLength, index); |
351 length = getIndex(inIndexes, indexesLength, index + 1) - offset; | 397 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
352 if(length >= 256) { | 398 if(length >= 256) { |
353 if(data == NULL) { | 399 if(data == NULL) { |
354 errorCode = U_INVALID_FORMAT_ERROR; | 400 errorCode = U_INVALID_FORMAT_ERROR; |
355 return; | 401 return; |
356 } | 402 } |
(...skipping 29 matching lines...) Expand all Loading... |
386 } | 432 } |
387 settings->options = options; | 433 settings->options = options; |
388 // Set variableTop from options and scripts data. | 434 // Set variableTop from options and scripts data. |
389 settings->variableTop = tailoring.data->getLastPrimaryForGroup( | 435 settings->variableTop = tailoring.data->getLastPrimaryForGroup( |
390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); | 436 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); |
391 if(settings->variableTop == 0) { | 437 if(settings->variableTop == 0) { |
392 errorCode = U_INVALID_FORMAT_ERROR; | 438 errorCode = U_INVALID_FORMAT_ERROR; |
393 return; | 439 return; |
394 } | 440 } |
395 | 441 |
396 if(reorderCodesLength == 0 || reorderTable != NULL) { | 442 if(reorderCodesLength != 0) { |
397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable
); | 443 settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength, |
398 } else { | 444 reorderRanges, reorderRangesLength, |
399 uint8_t table[256]; | 445 reorderTable, errorCode); |
400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, erro
rCode); | |
401 if(U_FAILURE(errorCode)) { return; } | |
402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) { | |
403 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
404 return; | |
405 } | |
406 } | 446 } |
407 | 447 |
408 settings->fastLatinOptions = CollationFastLatin::getOptions( | 448 settings->fastLatinOptions = CollationFastLatin::getOptions( |
409 tailoring.data, *settings, | 449 tailoring.data, *settings, |
410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries
)); | 450 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries
)); |
411 } | 451 } |
412 | 452 |
// CollationDataReader::isAcceptable -- rows below are a side-by-side diff:
// "OLD-line old-text | NEW-line new-text |".
//
// Decides whether a data item's UDataInfo header describes collation data
// that this code can use.
// NOTE(review): the (context, type, name, pInfo) signature with U_CALLCONV
// suggests this is passed as the acceptance callback when opening the data
// via udata -- confirm at the call site, which is not visible here.
//
// Returns TRUE only when all of these hold:
//   - pInfo->size is at least 20,
//   - pInfo->isBigEndian equals U_IS_BIG_ENDIAN and
//     pInfo->charsetFamily equals U_CHARSET_FAMILY,
//   - pInfo->dataFormat is the four bytes 0x55 0x43 0x6f 0x6c ("UCol"),
//   - pInfo->formatVersion[0] is 4 in the OLD column / 5 in the NEW column
//     (the only behavioral difference between the two revisions here).
// Returns FALSE otherwise, without writing through context.
//
// Parameters:
//   context - may be NULL; otherwise it is cast to UVersionInfo* and, on
//             acceptance, receives 4 bytes copied from pInfo->dataVersion.
//   type, name - unused (their names are commented out in the signature).
//   pInfo   - header to inspect; dereferenced unconditionally, so it must be
//             non-NULL.
413 UBool U_CALLCONV | 453 UBool U_CALLCONV |
414 CollationDataReader::isAcceptable(void *context, | 454 CollationDataReader::isAcceptable(void *context, |
415 const char * /* type */, const char * /*name*/
, | 455 const char * /* type */, const char * /*name*/
, |
416 const UDataInfo *pInfo) { | 456 const UDataInfo *pInfo) { |
417 if( | 457 if( |
418 pInfo->size >= 20 && | 458 pInfo->size >= 20 && |
419 pInfo->isBigEndian == U_IS_BIG_ENDIAN && | 459 pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
420 pInfo->charsetFamily == U_CHARSET_FAMILY && | 460 pInfo->charsetFamily == U_CHARSET_FAMILY && |
421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" | 461 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" |
422 pInfo->dataFormat[1] == 0x43 && | 462 pInfo->dataFormat[1] == 0x43 && |
423 pInfo->dataFormat[2] == 0x6f && | 463 pInfo->dataFormat[2] == 0x6f && |
424 pInfo->dataFormat[3] == 0x6c && | 464 pInfo->dataFormat[3] == 0x6c && |
425 pInfo->formatVersion[0] == 4 | 465 pInfo->formatVersion[0] == 5 |
426 ) { | 466 ) { |
// Header accepted: hand the data version back to the caller if it asked for it.
427 UVersionInfo *version = static_cast<UVersionInfo *>(context); | 467 UVersionInfo *version = static_cast<UVersionInfo *>(context); |
428 if(version != NULL) { | 468 if(version != NULL) { |
// Copies exactly 4 bytes of pInfo->dataVersion into the caller's buffer.
429 uprv_memcpy(version, pInfo->dataVersion, 4); | 469 uprv_memcpy(version, pInfo->dataVersion, 4); |
430 } | 470 } |
431 return TRUE; | 471 return TRUE; |
432 } else { | 472 } else { |
433 return FALSE; | 473 return FALSE; |
434 } | 474 } |
435 } | 475 } |
436 | 476 |
437 U_NAMESPACE_END | 477 U_NAMESPACE_END |
438 | 478 |
439 #endif // !UCONFIG_NO_COLLATION | 479 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |