OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationdatareader.cpp |
| 7 * |
| 8 * created on: 2013feb07 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/ucol.h" |
| 17 #include "unicode/udata.h" |
| 18 #include "unicode/uscript.h" |
| 19 #include "cmemory.h" |
| 20 #include "collation.h" |
| 21 #include "collationdata.h" |
| 22 #include "collationdatareader.h" |
| 23 #include "collationfastlatin.h" |
| 24 #include "collationkeys.h" |
| 25 #include "collationrootelements.h" |
| 26 #include "collationsettings.h" |
| 27 #include "collationtailoring.h" |
| 28 #include "normalizer2impl.h" |
| 29 #include "uassert.h" |
| 30 #include "ucmndata.h" |
| 31 #include "utrie2.h" |
| 32 |
| 33 U_NAMESPACE_BEGIN |
| 34 |
| 35 namespace { |
| 36 |
// Looks up slot i of the indexes[] array.
// Slots at or beyond `length` are not present in the data and read as -1.
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    if(i >= length) {
        return -1;
    }
    return indexes[i];
}
| 40 |
| 41 } // namespace |
| 42 |
| 43 void |
| 44 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
, int32_t inLength, |
| 45 CollationTailoring &tailoring, UErrorCode &errorCode)
{ |
| 46 if(U_FAILURE(errorCode)) { return; } |
| 47 if(base != NULL) { |
| 48 if(inBytes == NULL || (0 <= inLength && inLength < 24)) { |
| 49 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 50 return; |
| 51 } |
| 52 const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes)
; |
| 53 if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0
x27 && |
| 54 isAcceptable(tailoring.version, NULL, NULL, &header->info))) { |
| 55 errorCode = U_INVALID_FORMAT_ERROR; |
| 56 return; |
| 57 } |
| 58 if(base->getUCAVersion() != tailoring.getUCAVersion()) { |
| 59 errorCode = U_COLLATOR_VERSION_MISMATCH; |
| 60 return; |
| 61 } |
| 62 int32_t headerLength = header->dataHeader.headerSize; |
| 63 inBytes += headerLength; |
| 64 if(inLength >= 0) { |
| 65 inLength -= headerLength; |
| 66 } |
| 67 } |
| 68 |
| 69 if(inBytes == NULL || (0 <= inLength && inLength < 8)) { |
| 70 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 71 return; |
| 72 } |
| 73 const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes); |
| 74 int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH]; |
| 75 if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) { |
| 76 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. |
| 77 return; |
| 78 } |
| 79 |
| 80 // Assume that the tailoring data is in initial state, |
| 81 // with NULL pointers and 0 lengths. |
| 82 |
| 83 // Set pointers to non-empty data parts. |
| 84 // Do this in order of their byte offsets. (Should help porting to Java.) |
| 85 |
| 86 int32_t index; // one of the indexes[] slots |
| 87 int32_t offset; // byte offset for the index part |
| 88 int32_t length; // number of bytes in the index part |
| 89 |
| 90 if(indexesLength > IX_TOTAL_SIZE) { |
| 91 length = inIndexes[IX_TOTAL_SIZE]; |
| 92 } else if(indexesLength > IX_REORDER_CODES_OFFSET) { |
| 93 length = inIndexes[indexesLength - 1]; |
| 94 } else { |
| 95 length = 0; // only indexes, and inLength was already checked for them |
| 96 } |
| 97 if(0 <= inLength && inLength < length) { |
| 98 errorCode = U_INVALID_FORMAT_ERROR; |
| 99 return; |
| 100 } |
| 101 |
| 102 const CollationData *baseData = base == NULL ? NULL : base->data; |
| 103 const int32_t *reorderCodes = NULL; |
| 104 int32_t reorderCodesLength = 0; |
| 105 index = IX_REORDER_CODES_OFFSET; |
| 106 offset = getIndex(inIndexes, indexesLength, index); |
| 107 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 108 if(length >= 4) { |
| 109 if(baseData == NULL) { |
| 110 // We assume for collation settings that |
| 111 // the base data does not have a reordering. |
| 112 errorCode = U_INVALID_FORMAT_ERROR; |
| 113 return; |
| 114 } |
| 115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset); |
| 116 reorderCodesLength = length / 4; |
| 117 } |
| 118 |
| 119 // There should be a reorder table only if there are reorder codes. |
| 120 // However, when there are reorder codes the reorder table may be omitted to
reduce |
| 121 // the data size. |
| 122 const uint8_t *reorderTable = NULL; |
| 123 index = IX_REORDER_TABLE_OFFSET; |
| 124 offset = getIndex(inIndexes, indexesLength, index); |
| 125 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 126 if(length >= 256) { |
| 127 if(reorderCodesLength == 0) { |
| 128 errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reo
rdering codes. |
| 129 return; |
| 130 } |
| 131 reorderTable = inBytes + offset; |
| 132 } else { |
| 133 // If we have reorder codes, then build the reorderTable at the end, |
| 134 // when the CollationData is otherwise complete. |
| 135 } |
| 136 |
| 137 if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] &
0xff000000)) { |
| 138 errorCode = U_INVALID_FORMAT_ERROR; |
| 139 return; |
| 140 } |
| 141 CollationData *data = NULL; // Remains NULL if there are no mappings. |
| 142 |
| 143 index = IX_TRIE_OFFSET; |
| 144 offset = getIndex(inIndexes, indexesLength, index); |
| 145 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 146 if(length >= 8) { |
| 147 if(!tailoring.ensureOwnedData(errorCode)) { return; } |
| 148 data = tailoring.ownedData; |
| 149 data->base = baseData; |
| 150 data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000; |
| 151 data->trie = tailoring.trie = utrie2_openFromSerialized( |
| 152 UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL, |
| 153 &errorCode); |
| 154 if(U_FAILURE(errorCode)) { return; } |
| 155 } else if(baseData != NULL) { |
| 156 // Use the base data. Only the settings are tailored. |
| 157 tailoring.data = baseData; |
| 158 } else { |
| 159 errorCode = U_INVALID_FORMAT_ERROR; // No mappings. |
| 160 return; |
| 161 } |
| 162 |
| 163 index = IX_CES_OFFSET; |
| 164 offset = getIndex(inIndexes, indexesLength, index); |
| 165 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 166 if(length >= 8) { |
| 167 if(data == NULL) { |
| 168 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailore
d trie. |
| 169 return; |
| 170 } |
| 171 data->ces = reinterpret_cast<const int64_t *>(inBytes + offset); |
| 172 data->cesLength = length / 8; |
| 173 } |
| 174 |
| 175 index = IX_CE32S_OFFSET; |
| 176 offset = getIndex(inIndexes, indexesLength, index); |
| 177 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 178 if(length >= 4) { |
| 179 if(data == NULL) { |
| 180 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailo
red trie. |
| 181 return; |
| 182 } |
| 183 data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset); |
| 184 data->ce32sLength = length / 4; |
| 185 } |
| 186 |
| 187 int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_ST
ART); |
| 188 if(jamoCE32sStart >= 0) { |
| 189 if(data == NULL || data->ce32s == NULL) { |
| 190 errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32
s[]. |
| 191 return; |
| 192 } |
| 193 data->jamoCE32s = data->ce32s + jamoCE32sStart; |
| 194 } else if(data == NULL) { |
| 195 // Nothing to do. |
| 196 } else if(baseData != NULL) { |
| 197 data->jamoCE32s = baseData->jamoCE32s; |
| 198 } else { |
| 199 errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul process
ing. |
| 200 return; |
| 201 } |
| 202 |
| 203 index = IX_ROOT_ELEMENTS_OFFSET; |
| 204 offset = getIndex(inIndexes, indexesLength, index); |
| 205 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 206 if(length >= 4) { |
| 207 length /= 4; |
| 208 if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIE
S) { |
| 209 errorCode = U_INVALID_FORMAT_ERROR; |
| 210 return; |
| 211 } |
| 212 data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset
); |
| 213 data->rootElementsLength = length; |
| 214 uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COM
MON_SEC_AND_TER_CE]; |
| 215 if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) { |
| 216 errorCode = U_INVALID_FORMAT_ERROR; |
| 217 return; |
| 218 } |
| 219 uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX
_SEC_TER_BOUNDARIES]; |
| 220 if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) { |
| 221 // [fixed last secondary common byte] is too low, |
| 222 // and secondary weights would collide with compressed common second
aries. |
| 223 errorCode = U_INVALID_FORMAT_ERROR; |
| 224 return; |
| 225 } |
| 226 } |
| 227 |
| 228 index = IX_CONTEXTS_OFFSET; |
| 229 offset = getIndex(inIndexes, indexesLength, index); |
| 230 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 231 if(length >= 2) { |
| 232 if(data == NULL) { |
| 233 errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without ta
ilored trie. |
| 234 return; |
| 235 } |
| 236 data->contexts = reinterpret_cast<const UChar *>(inBytes + offset); |
| 237 data->contextsLength = length / 2; |
| 238 } |
| 239 |
| 240 index = IX_UNSAFE_BWD_OFFSET; |
| 241 offset = getIndex(inIndexes, indexesLength, index); |
| 242 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 243 if(length >= 2) { |
| 244 if(data == NULL) { |
| 245 errorCode = U_INVALID_FORMAT_ERROR; |
| 246 return; |
| 247 } |
| 248 if(baseData == NULL) { |
| 249 // Create the unsafe-backward set for the root collator. |
| 250 // Include all non-zero combining marks and trail surrogates. |
| 251 // We do this at load time, rather than at build time, |
| 252 // to simplify Unicode version bootstrapping: |
| 253 // The root data builder only needs the new FractionalUCA.txt data, |
| 254 // but it need not be built with a version of ICU already updated to |
| 255 // the corresponding new Unicode Character Database. |
| 256 // |
| 257 // The following is an optimized version of |
| 258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). |
| 259 // It is faster and requires fewer code dependencies. |
| 260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr
ail surrogates |
| 261 if(tailoring.unsafeBackwardSet == NULL) { |
| 262 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 263 return; |
| 264 } |
| 265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet); |
| 266 } else { |
| 267 // Clone the root collator's set contents. |
| 268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>( |
| 269 baseData->unsafeBackwardSet->cloneAsThawed()); |
| 270 if(tailoring.unsafeBackwardSet == NULL) { |
| 271 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 272 return; |
| 273 } |
| 274 } |
| 275 // Add the ranges from the data file to the unsafe-backward set. |
| 276 USerializedSet sset; |
| 277 const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes
+ offset); |
| 278 if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) { |
| 279 errorCode = U_INVALID_FORMAT_ERROR; |
| 280 return; |
| 281 } |
| 282 int32_t count = uset_getSerializedRangeCount(&sset); |
| 283 for(int32_t i = 0; i < count; ++i) { |
| 284 UChar32 start, end; |
| 285 uset_getSerializedRange(&sset, i, &start, &end); |
| 286 tailoring.unsafeBackwardSet->add(start, end); |
| 287 } |
| 288 // Mark each lead surrogate as "unsafe" |
| 289 // if any of its 1024 associated supplementary code points is "unsafe". |
| 290 UChar32 c = 0x10000; |
| 291 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { |
| 292 if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) { |
| 293 tailoring.unsafeBackwardSet->add(lead); |
| 294 } |
| 295 } |
| 296 tailoring.unsafeBackwardSet->freeze(); |
| 297 data->unsafeBackwardSet = tailoring.unsafeBackwardSet; |
| 298 } else if(data == NULL) { |
| 299 // Nothing to do. |
| 300 } else if(baseData != NULL) { |
| 301 // No tailoring-specific data: Alias the root collator's set. |
| 302 data->unsafeBackwardSet = baseData->unsafeBackwardSet; |
| 303 } else { |
| 304 errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet. |
| 305 return; |
| 306 } |
| 307 |
| 308 // If the fast Latin format version is different, |
| 309 // or the version is set to 0 for "no fast Latin table", |
| 310 // then just always use the normal string comparison path. |
| 311 if(data != NULL) { |
| 312 data->fastLatinTable = NULL; |
| 313 data->fastLatinTableLength = 0; |
| 314 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION
) { |
| 315 index = IX_FAST_LATIN_TABLE_OFFSET; |
| 316 offset = getIndex(inIndexes, indexesLength, index); |
| 317 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 318 if(length >= 2) { |
| 319 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inByte
s + offset); |
| 320 data->fastLatinTableLength = length / 2; |
| 321 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION)
{ |
| 322 errorCode = U_INVALID_FORMAT_ERROR; // header vs. table ver
sion mismatch |
| 323 return; |
| 324 } |
| 325 } else if(baseData != NULL) { |
| 326 data->fastLatinTable = baseData->fastLatinTable; |
| 327 data->fastLatinTableLength = baseData->fastLatinTableLength; |
| 328 } |
| 329 } |
| 330 } |
| 331 |
| 332 index = IX_SCRIPTS_OFFSET; |
| 333 offset = getIndex(inIndexes, indexesLength, index); |
| 334 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 335 if(length >= 2) { |
| 336 if(data == NULL) { |
| 337 errorCode = U_INVALID_FORMAT_ERROR; |
| 338 return; |
| 339 } |
| 340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset); |
| 341 data->scriptsLength = length / 2; |
| 342 } else if(data == NULL) { |
| 343 // Nothing to do. |
| 344 } else if(baseData != NULL) { |
| 345 data->scripts = baseData->scripts; |
| 346 data->scriptsLength = baseData->scriptsLength; |
| 347 } |
| 348 |
| 349 index = IX_COMPRESSIBLE_BYTES_OFFSET; |
| 350 offset = getIndex(inIndexes, indexesLength, index); |
| 351 length = getIndex(inIndexes, indexesLength, index + 1) - offset; |
| 352 if(length >= 256) { |
| 353 if(data == NULL) { |
| 354 errorCode = U_INVALID_FORMAT_ERROR; |
| 355 return; |
| 356 } |
| 357 data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offs
et); |
| 358 } else if(data == NULL) { |
| 359 // Nothing to do. |
| 360 } else if(baseData != NULL) { |
| 361 data->compressibleBytes = baseData->compressibleBytes; |
| 362 } else { |
| 363 errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[]. |
| 364 return; |
| 365 } |
| 366 |
| 367 const CollationSettings &ts = *tailoring.settings; |
| 368 int32_t options = inIndexes[IX_OPTIONS] & 0xffff; |
| 369 uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT]; |
| 370 int32_t fastLatinOptions = CollationFastLatin::getOptions( |
| 371 tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrima
ries)); |
| 372 if(options == ts.options && ts.variableTop != 0 && |
| 373 reorderCodesLength == ts.reorderCodesLength && |
| 374 uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) =
= 0 && |
| 375 fastLatinOptions == ts.fastLatinOptions && |
| 376 (fastLatinOptions < 0 || |
| 377 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries, |
| 378 sizeof(fastLatinPrimaries)) == 0)) { |
| 379 return; |
| 380 } |
| 381 |
| 382 CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings); |
| 383 if(settings == NULL) { |
| 384 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 385 return; |
| 386 } |
| 387 settings->options = options; |
| 388 // Set variableTop from options and scripts data. |
| 389 settings->variableTop = tailoring.data->getLastPrimaryForGroup( |
| 390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); |
| 391 if(settings->variableTop == 0) { |
| 392 errorCode = U_INVALID_FORMAT_ERROR; |
| 393 return; |
| 394 } |
| 395 |
| 396 if(reorderCodesLength == 0 || reorderTable != NULL) { |
| 397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable
); |
| 398 } else { |
| 399 uint8_t table[256]; |
| 400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, erro
rCode); |
| 401 if(U_FAILURE(errorCode)) { return; } |
| 402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) { |
| 403 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 404 return; |
| 405 } |
| 406 } |
| 407 |
| 408 settings->fastLatinOptions = CollationFastLatin::getOptions( |
| 409 tailoring.data, *settings, |
| 410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries
)); |
| 411 } |
| 412 |
| 413 UBool U_CALLCONV |
| 414 CollationDataReader::isAcceptable(void *context, |
| 415 const char * /* type */, const char * /*name*/
, |
| 416 const UDataInfo *pInfo) { |
| 417 if( |
| 418 pInfo->size >= 20 && |
| 419 pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
| 420 pInfo->charsetFamily == U_CHARSET_FAMILY && |
| 421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol" |
| 422 pInfo->dataFormat[1] == 0x43 && |
| 423 pInfo->dataFormat[2] == 0x6f && |
| 424 pInfo->dataFormat[3] == 0x6c && |
| 425 pInfo->formatVersion[0] == 4 |
| 426 ) { |
| 427 UVersionInfo *version = static_cast<UVersionInfo *>(context); |
| 428 if(version != NULL) { |
| 429 uprv_memcpy(version, pInfo->dataVersion, 4); |
| 430 } |
| 431 return TRUE; |
| 432 } else { |
| 433 return FALSE; |
| 434 } |
| 435 } |
| 436 |
| 437 U_NAMESPACE_END |
| 438 |
| 439 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |