Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(142)

Side by Side Diff: source/i18n/collationdatareader.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/collationdatareader.h ('k') | source/i18n/collationdatawriter.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines 3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************* 5 *******************************************************************************
6 * collationdatareader.cpp 6 * collationdatareader.cpp
7 * 7 *
8 * created on: 2013feb07 8 * created on: 2013feb07
9 * created by: Markus W. Scherer 9 * created by: Markus W. Scherer
10 */ 10 */
11 11
12 #include "unicode/utypes.h" 12 #include "unicode/utypes.h"
13 13
14 #if !UCONFIG_NO_COLLATION 14 #if !UCONFIG_NO_COLLATION
15 15
16 #include "unicode/ucol.h" 16 #include "unicode/ucol.h"
17 #include "unicode/udata.h" 17 #include "unicode/udata.h"
18 #include "unicode/uscript.h" 18 #include "unicode/uscript.h"
19 #include "cmemory.h" 19 #include "cmemory.h"
20 #include "collation.h" 20 #include "collation.h"
21 #include "collationdata.h" 21 #include "collationdata.h"
22 #include "collationdatareader.h" 22 #include "collationdatareader.h"
23 #include "collationfastlatin.h" 23 #include "collationfastlatin.h"
24 #include "collationkeys.h" 24 #include "collationkeys.h"
25 #include "collationrootelements.h" 25 #include "collationrootelements.h"
26 #include "collationsettings.h" 26 #include "collationsettings.h"
27 #include "collationtailoring.h" 27 #include "collationtailoring.h"
28 #include "collunsafe.h"
28 #include "normalizer2impl.h" 29 #include "normalizer2impl.h"
29 #include "uassert.h" 30 #include "uassert.h"
30 #include "ucmndata.h" 31 #include "ucmndata.h"
31 #include "utrie2.h" 32 #include "utrie2.h"
32 33
33 U_NAMESPACE_BEGIN 34 U_NAMESPACE_BEGIN
34 35
35 namespace { 36 namespace {
36 37
37 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) { 38 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 length = 0; // only indexes, and inLength was already checked for them 96 length = 0; // only indexes, and inLength was already checked for them
96 } 97 }
97 if(0 <= inLength && inLength < length) { 98 if(0 <= inLength && inLength < length) {
98 errorCode = U_INVALID_FORMAT_ERROR; 99 errorCode = U_INVALID_FORMAT_ERROR;
99 return; 100 return;
100 } 101 }
101 102
102 const CollationData *baseData = base == NULL ? NULL : base->data; 103 const CollationData *baseData = base == NULL ? NULL : base->data;
103 const int32_t *reorderCodes = NULL; 104 const int32_t *reorderCodes = NULL;
104 int32_t reorderCodesLength = 0; 105 int32_t reorderCodesLength = 0;
106 const uint32_t *reorderRanges = NULL;
107 int32_t reorderRangesLength = 0;
105 index = IX_REORDER_CODES_OFFSET; 108 index = IX_REORDER_CODES_OFFSET;
106 offset = getIndex(inIndexes, indexesLength, index); 109 offset = getIndex(inIndexes, indexesLength, index);
107 length = getIndex(inIndexes, indexesLength, index + 1) - offset; 110 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
108 if(length >= 4) { 111 if(length >= 4) {
109 if(baseData == NULL) { 112 if(baseData == NULL) {
110 // We assume for collation settings that 113 // We assume for collation settings that
111 // the base data does not have a reordering. 114 // the base data does not have a reordering.
112 errorCode = U_INVALID_FORMAT_ERROR; 115 errorCode = U_INVALID_FORMAT_ERROR;
113 return; 116 return;
114 } 117 }
115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); 118 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
116 reorderCodesLength = length / 4; 119 reorderCodesLength = length / 4;
120
121 // The reorderRanges (if any) are the trailing reorderCodes entries.
122 // Split the array at the boundary.
123 // Script or reorder codes do not exceed 16-bit values.
124 // Range limits are stored in the upper 16 bits, and are never 0.
125 while(reorderRangesLength < reorderCodesLength &&
126 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
127 ++reorderRangesLength;
128 }
129 U_ASSERT(reorderRangesLength < reorderCodesLength);
130 if(reorderRangesLength != 0) {
131 reorderCodesLength -= reorderRangesLength;
132 reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
133 }
117 } 134 }
118 135
119 // There should be a reorder table only if there are reorder codes. 136 // There should be a reorder table only if there are reorder codes.
120 // However, when there are reorder codes the reorder table may be omitted to reduce 137 // However, when there are reorder codes the reorder table may be omitted to reduce
121 // the data size. 138 // the data size.
122 const uint8_t *reorderTable = NULL; 139 const uint8_t *reorderTable = NULL;
123 index = IX_REORDER_TABLE_OFFSET; 140 index = IX_REORDER_TABLE_OFFSET;
124 offset = getIndex(inIndexes, indexesLength, index); 141 offset = getIndex(inIndexes, indexesLength, index);
125 length = getIndex(inIndexes, indexesLength, index + 1) - offset; 142 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
126 if(length >= 256) { 143 if(length >= 256) {
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
239 256
240 index = IX_UNSAFE_BWD_OFFSET; 257 index = IX_UNSAFE_BWD_OFFSET;
241 offset = getIndex(inIndexes, indexesLength, index); 258 offset = getIndex(inIndexes, indexesLength, index);
242 length = getIndex(inIndexes, indexesLength, index + 1) - offset; 259 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
243 if(length >= 2) { 260 if(length >= 2) {
244 if(data == NULL) { 261 if(data == NULL) {
245 errorCode = U_INVALID_FORMAT_ERROR; 262 errorCode = U_INVALID_FORMAT_ERROR;
246 return; 263 return;
247 } 264 }
248 if(baseData == NULL) { 265 if(baseData == NULL) {
266 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
267 tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
268 if(tailoring.unsafeBackwardSet == NULL) {
269 errorCode = U_MEMORY_ALLOCATION_ERROR;
270 return;
271 } else if (U_FAILURE(errorCode)) {
272 return;
273 }
274 #else
249 // Create the unsafe-backward set for the root collator. 275 // Create the unsafe-backward set for the root collator.
250 // Include all non-zero combining marks and trail surrogates. 276 // Include all non-zero combining marks and trail surrogates.
251 // We do this at load time, rather than at build time, 277 // We do this at load time, rather than at build time,
252 // to simplify Unicode version bootstrapping: 278 // to simplify Unicode version bootstrapping:
253 // The root data builder only needs the new FractionalUCA.txt data, 279 // The root data builder only needs the new FractionalUCA.txt data,
254 // but it need not be built with a version of ICU already updated to 280 // but it need not be built with a version of ICU already updated to
255 // the corresponding new Unicode Character Database. 281 // the corresponding new Unicode Character Database.
256 // 282 //
257 // The following is an optimized version of 283 // The following is an optimized version of
258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). 284 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
259 // It is faster and requires fewer code dependencies. 285 // It is faster and requires fewer code dependencies.
260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates 286 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
261 if(tailoring.unsafeBackwardSet == NULL) { 287 if(tailoring.unsafeBackwardSet == NULL) {
262 errorCode = U_MEMORY_ALLOCATION_ERROR; 288 errorCode = U_MEMORY_ALLOCATION_ERROR;
263 return; 289 return;
264 } 290 }
265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); 291 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
292 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
266 } else { 293 } else {
267 // Clone the root collator's set contents. 294 // Clone the root collator's set contents.
268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( 295 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
269 baseData->unsafeBackwardSet->cloneAsThawed()); 296 baseData->unsafeBackwardSet->cloneAsThawed());
270 if(tailoring.unsafeBackwardSet == NULL) { 297 if(tailoring.unsafeBackwardSet == NULL) {
271 errorCode = U_MEMORY_ALLOCATION_ERROR; 298 errorCode = U_MEMORY_ALLOCATION_ERROR;
272 return; 299 return;
273 } 300 }
274 } 301 }
275 // Add the ranges from the data file to the unsafe-backward set. 302 // Add the ranges from the data file to the unsafe-backward set.
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
330 } 357 }
331 358
332 index = IX_SCRIPTS_OFFSET; 359 index = IX_SCRIPTS_OFFSET;
333 offset = getIndex(inIndexes, indexesLength, index); 360 offset = getIndex(inIndexes, indexesLength, index);
334 length = getIndex(inIndexes, indexesLength, index + 1) - offset; 361 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
335 if(length >= 2) { 362 if(length >= 2) {
336 if(data == NULL) { 363 if(data == NULL) {
337 errorCode = U_INVALID_FORMAT_ERROR; 364 errorCode = U_INVALID_FORMAT_ERROR;
338 return; 365 return;
339 } 366 }
340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset); 367 const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
341 data->scriptsLength = length / 2; 368 int32_t scriptsLength = length / 2;
369 data->numScripts = scripts[0];
370 // There must be enough entries for both arrays, including more than two range starts.
371 data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
372 if(data->scriptStartsLength <= 2 ||
373 CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
374 errorCode = U_INVALID_FORMAT_ERROR;
375 return;
376 }
377 data->scriptsIndex = scripts + 1;
378 data->scriptStarts = scripts + 1 + data->numScripts + 16;
379 if(!(data->scriptStarts[0] == 0 &&
380 data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
381 data->scriptStarts[data->scriptStartsLength - 1] ==
382 (Collation::TRAIL_WEIGHT_BYTE << 8))) {
383 errorCode = U_INVALID_FORMAT_ERROR;
384 return;
385 }
342 } else if(data == NULL) { 386 } else if(data == NULL) {
343 // Nothing to do. 387 // Nothing to do.
344 } else if(baseData != NULL) { 388 } else if(baseData != NULL) {
345 data->scripts = baseData->scripts; 389 data->numScripts = baseData->numScripts;
346 data->scriptsLength = baseData->scriptsLength; 390 data->scriptsIndex = baseData->scriptsIndex;
391 data->scriptStarts = baseData->scriptStarts;
392 data->scriptStartsLength = baseData->scriptStartsLength;
347 } 393 }
348 394
349 index = IX_COMPRESSIBLE_BYTES_OFFSET; 395 index = IX_COMPRESSIBLE_BYTES_OFFSET;
350 offset = getIndex(inIndexes, indexesLength, index); 396 offset = getIndex(inIndexes, indexesLength, index);
351 length = getIndex(inIndexes, indexesLength, index + 1) - offset; 397 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
352 if(length >= 256) { 398 if(length >= 256) {
353 if(data == NULL) { 399 if(data == NULL) {
354 errorCode = U_INVALID_FORMAT_ERROR; 400 errorCode = U_INVALID_FORMAT_ERROR;
355 return; 401 return;
356 } 402 }
(...skipping 29 matching lines...) Expand all
386 } 432 }
387 settings->options = options; 433 settings->options = options;
388 // Set variableTop from options and scripts data. 434 // Set variableTop from options and scripts data.
389 settings->variableTop = tailoring.data->getLastPrimaryForGroup( 435 settings->variableTop = tailoring.data->getLastPrimaryForGroup(
390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); 436 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
391 if(settings->variableTop == 0) { 437 if(settings->variableTop == 0) {
392 errorCode = U_INVALID_FORMAT_ERROR; 438 errorCode = U_INVALID_FORMAT_ERROR;
393 return; 439 return;
394 } 440 }
395 441
396 if(reorderCodesLength == 0 || reorderTable != NULL) { 442 if(reorderCodesLength != 0) {
397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable); 443 settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
398 } else { 444 reorderRanges, reorderRangesLength,
399 uint8_t table[256]; 445 reorderTable, errorCode);
400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
401 if(U_FAILURE(errorCode)) { return; }
402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
403 errorCode = U_MEMORY_ALLOCATION_ERROR;
404 return;
405 }
406 } 446 }
407 447
408 settings->fastLatinOptions = CollationFastLatin::getOptions( 448 settings->fastLatinOptions = CollationFastLatin::getOptions(
409 tailoring.data, *settings, 449 tailoring.data, *settings,
410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries)); 450 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
411 } 451 }
412 452
413 UBool U_CALLCONV 453 UBool U_CALLCONV
414 CollationDataReader::isAcceptable(void *context, 454 CollationDataReader::isAcceptable(void *context,
415 const char * /* type */, const char * /*name*/ , 455 const char * /* type */, const char * /*name*/ ,
416 const UDataInfo *pInfo) { 456 const UDataInfo *pInfo) {
417 if( 457 if(
418 pInfo->size >= 20 && 458 pInfo->size >= 20 &&
419 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 459 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
420 pInfo->charsetFamily == U_CHARSET_FAMILY && 460 pInfo->charsetFamily == U_CHARSET_FAMILY &&
421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" 461 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
422 pInfo->dataFormat[1] == 0x43 && 462 pInfo->dataFormat[1] == 0x43 &&
423 pInfo->dataFormat[2] == 0x6f && 463 pInfo->dataFormat[2] == 0x6f &&
424 pInfo->dataFormat[3] == 0x6c && 464 pInfo->dataFormat[3] == 0x6c &&
425 pInfo->formatVersion[0] == 4 465 pInfo->formatVersion[0] == 5
426 ) { 466 ) {
427 UVersionInfo *version = static_cast<UVersionInfo *>(context); 467 UVersionInfo *version = static_cast<UVersionInfo *>(context);
428 if(version != NULL) { 468 if(version != NULL) {
429 uprv_memcpy(version, pInfo->dataVersion, 4); 469 uprv_memcpy(version, pInfo->dataVersion, 4);
430 } 470 }
431 return TRUE; 471 return TRUE;
432 } else { 472 } else {
433 return FALSE; 473 return FALSE;
434 } 474 }
435 } 475 }
436 476
437 U_NAMESPACE_END 477 U_NAMESPACE_END
438 478
439 #endif // !UCONFIG_NO_COLLATION 479 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/collationdatareader.h ('k') | source/i18n/collationdatawriter.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698