Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(426)

Side by Side Diff: source/i18n/collationdatareader.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/collationdatareader.h ('k') | source/i18n/collationdatawriter.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.cpp
7 *
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "normalizer2impl.h"
29 #include "uassert.h"
30 #include "ucmndata.h"
31 #include "utrie2.h"
32
33 U_NAMESPACE_BEGIN
34
35 namespace {
36
/**
 * Bounds-checked read of one slot of an indexes[] array.
 *
 * @param indexes the array to read from
 * @param length  number of valid slots in indexes
 * @param i       slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Reject negative slots as well as slots past the end, so that a bad
    // index can never read before the start of the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
40
41 } // namespace
42
43 void
44 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes , int32_t inLength,
45 CollationTailoring &tailoring, UErrorCode &errorCode) {
46 if(U_FAILURE(errorCode)) { return; }
47 if(base != NULL) {
48 if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
49 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
50 return;
51 }
52 const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes) ;
53 if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0 x27 &&
54 isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
55 errorCode = U_INVALID_FORMAT_ERROR;
56 return;
57 }
58 if(base->getUCAVersion() != tailoring.getUCAVersion()) {
59 errorCode = U_COLLATOR_VERSION_MISMATCH;
60 return;
61 }
62 int32_t headerLength = header->dataHeader.headerSize;
63 inBytes += headerLength;
64 if(inLength >= 0) {
65 inLength -= headerLength;
66 }
67 }
68
69 if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
70 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
71 return;
72 }
73 const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
74 int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
75 if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
76 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
77 return;
78 }
79
80 // Assume that the tailoring data is in initial state,
81 // with NULL pointers and 0 lengths.
82
83 // Set pointers to non-empty data parts.
84 // Do this in order of their byte offsets. (Should help porting to Java.)
85
86 int32_t index; // one of the indexes[] slots
87 int32_t offset; // byte offset for the index part
88 int32_t length; // number of bytes in the index part
89
90 if(indexesLength > IX_TOTAL_SIZE) {
91 length = inIndexes[IX_TOTAL_SIZE];
92 } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
93 length = inIndexes[indexesLength - 1];
94 } else {
95 length = 0; // only indexes, and inLength was already checked for them
96 }
97 if(0 <= inLength && inLength < length) {
98 errorCode = U_INVALID_FORMAT_ERROR;
99 return;
100 }
101
102 const CollationData *baseData = base == NULL ? NULL : base->data;
103 const int32_t *reorderCodes = NULL;
104 int32_t reorderCodesLength = 0;
105 index = IX_REORDER_CODES_OFFSET;
106 offset = getIndex(inIndexes, indexesLength, index);
107 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
108 if(length >= 4) {
109 if(baseData == NULL) {
110 // We assume for collation settings that
111 // the base data does not have a reordering.
112 errorCode = U_INVALID_FORMAT_ERROR;
113 return;
114 }
115 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
116 reorderCodesLength = length / 4;
117 }
118
119 // There should be a reorder table only if there are reorder codes.
120 // However, when there are reorder codes the reorder table may be omitted to reduce
121 // the data size.
122 const uint8_t *reorderTable = NULL;
123 index = IX_REORDER_TABLE_OFFSET;
124 offset = getIndex(inIndexes, indexesLength, index);
125 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
126 if(length >= 256) {
127 if(reorderCodesLength == 0) {
128 errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reo rdering codes.
129 return;
130 }
131 reorderTable = inBytes + offset;
132 } else {
133 // If we have reorder codes, then build the reorderTable at the end,
134 // when the CollationData is otherwise complete.
135 }
136
137 if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
138 errorCode = U_INVALID_FORMAT_ERROR;
139 return;
140 }
141 CollationData *data = NULL; // Remains NULL if there are no mappings.
142
143 index = IX_TRIE_OFFSET;
144 offset = getIndex(inIndexes, indexesLength, index);
145 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
146 if(length >= 8) {
147 if(!tailoring.ensureOwnedData(errorCode)) { return; }
148 data = tailoring.ownedData;
149 data->base = baseData;
150 data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
151 data->trie = tailoring.trie = utrie2_openFromSerialized(
152 UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
153 &errorCode);
154 if(U_FAILURE(errorCode)) { return; }
155 } else if(baseData != NULL) {
156 // Use the base data. Only the settings are tailored.
157 tailoring.data = baseData;
158 } else {
159 errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
160 return;
161 }
162
163 index = IX_CES_OFFSET;
164 offset = getIndex(inIndexes, indexesLength, index);
165 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
166 if(length >= 8) {
167 if(data == NULL) {
168 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailore d trie.
169 return;
170 }
171 data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
172 data->cesLength = length / 8;
173 }
174
175 index = IX_CE32S_OFFSET;
176 offset = getIndex(inIndexes, indexesLength, index);
177 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
178 if(length >= 4) {
179 if(data == NULL) {
180 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailo red trie.
181 return;
182 }
183 data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
184 data->ce32sLength = length / 4;
185 }
186
187 int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_ST ART);
188 if(jamoCE32sStart >= 0) {
189 if(data == NULL || data->ce32s == NULL) {
190 errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32 s[].
191 return;
192 }
193 data->jamoCE32s = data->ce32s + jamoCE32sStart;
194 } else if(data == NULL) {
195 // Nothing to do.
196 } else if(baseData != NULL) {
197 data->jamoCE32s = baseData->jamoCE32s;
198 } else {
199 errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul process ing.
200 return;
201 }
202
203 index = IX_ROOT_ELEMENTS_OFFSET;
204 offset = getIndex(inIndexes, indexesLength, index);
205 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
206 if(length >= 4) {
207 length /= 4;
208 if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIE S) {
209 errorCode = U_INVALID_FORMAT_ERROR;
210 return;
211 }
212 data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset );
213 data->rootElementsLength = length;
214 uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COM MON_SEC_AND_TER_CE];
215 if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
216 errorCode = U_INVALID_FORMAT_ERROR;
217 return;
218 }
219 uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX _SEC_TER_BOUNDARIES];
220 if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
221 // [fixed last secondary common byte] is too low,
222 // and secondary weights would collide with compressed common second aries.
223 errorCode = U_INVALID_FORMAT_ERROR;
224 return;
225 }
226 }
227
228 index = IX_CONTEXTS_OFFSET;
229 offset = getIndex(inIndexes, indexesLength, index);
230 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
231 if(length >= 2) {
232 if(data == NULL) {
233 errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without ta ilored trie.
234 return;
235 }
236 data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
237 data->contextsLength = length / 2;
238 }
239
240 index = IX_UNSAFE_BWD_OFFSET;
241 offset = getIndex(inIndexes, indexesLength, index);
242 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
243 if(length >= 2) {
244 if(data == NULL) {
245 errorCode = U_INVALID_FORMAT_ERROR;
246 return;
247 }
248 if(baseData == NULL) {
249 // Create the unsafe-backward set for the root collator.
250 // Include all non-zero combining marks and trail surrogates.
251 // We do this at load time, rather than at build time,
252 // to simplify Unicode version bootstrapping:
253 // The root data builder only needs the new FractionalUCA.txt data,
254 // but it need not be built with a version of ICU already updated to
255 // the corresponding new Unicode Character Database.
256 //
257 // The following is an optimized version of
258 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
259 // It is faster and requires fewer code dependencies.
260 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // tr ail surrogates
261 if(tailoring.unsafeBackwardSet == NULL) {
262 errorCode = U_MEMORY_ALLOCATION_ERROR;
263 return;
264 }
265 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
266 } else {
267 // Clone the root collator's set contents.
268 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
269 baseData->unsafeBackwardSet->cloneAsThawed());
270 if(tailoring.unsafeBackwardSet == NULL) {
271 errorCode = U_MEMORY_ALLOCATION_ERROR;
272 return;
273 }
274 }
275 // Add the ranges from the data file to the unsafe-backward set.
276 USerializedSet sset;
277 const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
278 if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
279 errorCode = U_INVALID_FORMAT_ERROR;
280 return;
281 }
282 int32_t count = uset_getSerializedRangeCount(&sset);
283 for(int32_t i = 0; i < count; ++i) {
284 UChar32 start, end;
285 uset_getSerializedRange(&sset, i, &start, &end);
286 tailoring.unsafeBackwardSet->add(start, end);
287 }
288 // Mark each lead surrogate as "unsafe"
289 // if any of its 1024 associated supplementary code points is "unsafe".
290 UChar32 c = 0x10000;
291 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
292 if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
293 tailoring.unsafeBackwardSet->add(lead);
294 }
295 }
296 tailoring.unsafeBackwardSet->freeze();
297 data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
298 } else if(data == NULL) {
299 // Nothing to do.
300 } else if(baseData != NULL) {
301 // No tailoring-specific data: Alias the root collator's set.
302 data->unsafeBackwardSet = baseData->unsafeBackwardSet;
303 } else {
304 errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
305 return;
306 }
307
308 // If the fast Latin format version is different,
309 // or the version is set to 0 for "no fast Latin table",
310 // then just always use the normal string comparison path.
311 if(data != NULL) {
312 data->fastLatinTable = NULL;
313 data->fastLatinTableLength = 0;
314 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION ) {
315 index = IX_FAST_LATIN_TABLE_OFFSET;
316 offset = getIndex(inIndexes, indexesLength, index);
317 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
318 if(length >= 2) {
319 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inByte s + offset);
320 data->fastLatinTableLength = length / 2;
321 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
322 errorCode = U_INVALID_FORMAT_ERROR; // header vs. table ver sion mismatch
323 return;
324 }
325 } else if(baseData != NULL) {
326 data->fastLatinTable = baseData->fastLatinTable;
327 data->fastLatinTableLength = baseData->fastLatinTableLength;
328 }
329 }
330 }
331
332 index = IX_SCRIPTS_OFFSET;
333 offset = getIndex(inIndexes, indexesLength, index);
334 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
335 if(length >= 2) {
336 if(data == NULL) {
337 errorCode = U_INVALID_FORMAT_ERROR;
338 return;
339 }
340 data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
341 data->scriptsLength = length / 2;
342 } else if(data == NULL) {
343 // Nothing to do.
344 } else if(baseData != NULL) {
345 data->scripts = baseData->scripts;
346 data->scriptsLength = baseData->scriptsLength;
347 }
348
349 index = IX_COMPRESSIBLE_BYTES_OFFSET;
350 offset = getIndex(inIndexes, indexesLength, index);
351 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
352 if(length >= 256) {
353 if(data == NULL) {
354 errorCode = U_INVALID_FORMAT_ERROR;
355 return;
356 }
357 data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offs et);
358 } else if(data == NULL) {
359 // Nothing to do.
360 } else if(baseData != NULL) {
361 data->compressibleBytes = baseData->compressibleBytes;
362 } else {
363 errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
364 return;
365 }
366
367 const CollationSettings &ts = *tailoring.settings;
368 int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
369 uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
370 int32_t fastLatinOptions = CollationFastLatin::getOptions(
371 tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrima ries));
372 if(options == ts.options && ts.variableTop != 0 &&
373 reorderCodesLength == ts.reorderCodesLength &&
374 uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) = = 0 &&
375 fastLatinOptions == ts.fastLatinOptions &&
376 (fastLatinOptions < 0 ||
377 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
378 sizeof(fastLatinPrimaries)) == 0)) {
379 return;
380 }
381
382 CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
383 if(settings == NULL) {
384 errorCode = U_MEMORY_ALLOCATION_ERROR;
385 return;
386 }
387 settings->options = options;
388 // Set variableTop from options and scripts data.
389 settings->variableTop = tailoring.data->getLastPrimaryForGroup(
390 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
391 if(settings->variableTop == 0) {
392 errorCode = U_INVALID_FORMAT_ERROR;
393 return;
394 }
395
396 if(reorderCodesLength == 0 || reorderTable != NULL) {
397 settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable );
398 } else {
399 uint8_t table[256];
400 baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, erro rCode);
401 if(U_FAILURE(errorCode)) { return; }
402 if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
403 errorCode = U_MEMORY_ALLOCATION_ERROR;
404 return;
405 }
406 }
407
408 settings->fastLatinOptions = CollationFastLatin::getOptions(
409 tailoring.data, *settings,
410 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries ));
411 }
412
413 UBool U_CALLCONV
414 CollationDataReader::isAcceptable(void *context,
415 const char * /* type */, const char * /*name*/ ,
416 const UDataInfo *pInfo) {
417 if(
418 pInfo->size >= 20 &&
419 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
420 pInfo->charsetFamily == U_CHARSET_FAMILY &&
421 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
422 pInfo->dataFormat[1] == 0x43 &&
423 pInfo->dataFormat[2] == 0x6f &&
424 pInfo->dataFormat[3] == 0x6c &&
425 pInfo->formatVersion[0] == 4
426 ) {
427 UVersionInfo *version = static_cast<UVersionInfo *>(context);
428 if(version != NULL) {
429 uprv_memcpy(version, pInfo->dataVersion, 4);
430 }
431 return TRUE;
432 } else {
433 return FALSE;
434 }
435 }
436
437 U_NAMESPACE_END
438
439 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/collationdatareader.h ('k') | source/i18n/collationdatawriter.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698