OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 2009-2013, International Business Machines | 4 * Copyright (C) 2009-2014, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: normalizer2impl.cpp | 8 * file name: normalizer2impl.cpp |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 2009nov22 | 13 * created on: 2009nov22 |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
(...skipping 231 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
246 | 246 |
// Review note: this declaration is identical in the OLD and NEW columns.
// CanonIterData groups the two data members declared below: a UTrie2 pointer
// and a UVector whose elements are UnicodeSet pointers (per the member comment).
// The constructor, the destructor and addToStartSet() are only declared here;
// their definitions are outside this excerpt, so who frees `trie` and the
// stored sets cannot be confirmed from this view -- TODO confirm.
247 struct CanonIterData : public UMemory { | 247 struct CanonIterData : public UMemory { |
248 CanonIterData(UErrorCode &errorCode); | 248 CanonIterData(UErrorCode &errorCode); |
249 ~CanonIterData(); | 249 ~CanonIterData(); |
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode
); | 250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode
); |
251 UTrie2 *trie; | 251 UTrie2 *trie; |
252 UVector canonStartSets; // contains UnicodeSet * | 252 UVector canonStartSets; // contains UnicodeSet * |
253 }; | 253 }; |
254 | 254 |
// Destructor.
// OLD column: closes `memory` with udata_close(), closes `normTrie` with
// utrie2_close(), then deletes fCanonIterData.
// NEW column: only deletes fCanonIterData. In the NEW column init() assigns
// normTrie from its caller-supplied inTrie argument instead of opening the
// trie itself, so the trie is presumably owned by whoever calls init() --
// NOTE(review): confirm that every caller closes the trie and data memory.
255 Normalizer2Impl::~Normalizer2Impl() { | 255 Normalizer2Impl::~Normalizer2Impl() { |
256 udata_close(memory); | |
257 utrie2_close(normTrie); | |
258 delete fCanonIterData; | 256 delete fCanonIterData; |
259 } | 257 } |
260 | 258 |
// Review note: this function exists only in the OLD column; the NEW column
// removes it from this file.
// It is the acceptance callback that OLD load() hands to udata_openChoice(),
// with the Normalizer2Impl instance as `context`.
// It returns TRUE only when all of the following hold for pInfo:
//   - size is at least 20,
//   - isBigEndian and charsetFamily match U_IS_BIG_ENDIAN / U_CHARSET_FAMILY,
//   - dataFormat is the four bytes 0x4e 0x72 0x6d 0x32 ("Nrm2"),
//   - formatVersion[0] is 2.
// On acceptance it copies the 4-byte dataVersion into the instance passed as
// `context`; otherwise it returns FALSE and copies nothing.
// The type and name arguments are ignored.
261 UBool U_CALLCONV | |
262 Normalizer2Impl::isAcceptable(void *context, | |
263 const char * /* type */, const char * /*name*/, | |
264 const UDataInfo *pInfo) { | |
265 if( | |
266 pInfo->size>=20 && | |
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
268 pInfo->charsetFamily==U_CHARSET_FAMILY && | |
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ | |
270 pInfo->dataFormat[1]==0x72 && | |
271 pInfo->dataFormat[2]==0x6d && | |
272 pInfo->dataFormat[3]==0x32 && | |
273 pInfo->formatVersion[0]==2 | |
274 ) { | |
275 Normalizer2Impl *me=(Normalizer2Impl *)context; | |
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); | |
277 return TRUE; | |
278 } else { | |
279 return FALSE; | |
280 } | |
281 } | |
282 | |
283 void | 259 void |
284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &err
orCode) { | 260 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie, |
285 if(U_FAILURE(errorCode)) { | 261 const uint16_t *inExtraData, const uint8_t *inSmallFCD) { |
286 return; | |
287 } | |
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &error
Code); | |
289 if(U_FAILURE(errorCode)) { | |
290 return; | |
291 } | |
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); | |
293 const int32_t *inIndexes=(const int32_t *)inBytes; | |
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; | |
295 if(indexesLength<=IX_MIN_MAYBE_YES) { | |
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. | |
297 return; | |
298 } | |
299 | |
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; | 262 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; |
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; | 263 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; |
302 | 264 |
303 minYesNo=inIndexes[IX_MIN_YES_NO]; | 265 minYesNo=inIndexes[IX_MIN_YES_NO]; |
304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; | 266 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; |
305 minNoNo=inIndexes[IX_MIN_NO_NO]; | 267 minNoNo=inIndexes[IX_MIN_NO_NO]; |
306 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; | 268 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; |
307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; | 269 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; |
308 | 270 |
309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; | 271 normTrie=inTrie; |
310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; | |
311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, | |
312 inBytes+offset, nextOffset-offset, NULL, | |
313 &errorCode); | |
314 if(U_FAILURE(errorCode)) { | |
315 return; | |
316 } | |
317 | 272 |
318 offset=nextOffset; | 273 maybeYesCompositions=inExtraData; |
319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; | |
320 maybeYesCompositions=(const uint16_t *)(inBytes+offset); | |
321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); | 274 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); |
322 | 275 |
323 // smallFCD: new in formatVersion 2 | 276 smallFCD=inSmallFCD; |
324 offset=nextOffset; | |
325 smallFCD=inBytes+offset; | |
326 | 277 |
327 // Build tccc180[]. | 278 // Build tccc180[]. |
328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. | 279 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. |
329 uint8_t bits=0; | 280 uint8_t bits=0; |
330 for(UChar c=0; c<0x180; bits>>=1) { | 281 for(UChar c=0; c<0x180; bits>>=1) { |
331 if((c&0xff)==0) { | 282 if((c&0xff)==0) { |
332 bits=smallFCD[c>>8]; // one byte per 0x100 code points | 283 bits=smallFCD[c>>8]; // one byte per 0x100 code points |
333 } | 284 } |
334 if(bits&1) { | 285 if(bits&1) { |
335 for(int i=0; i<0x20; ++i, ++c) { | 286 for(int i=0; i<0x20; ++i, ++c) { |
(...skipping 14 matching lines...) Expand all Loading... |
350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); | 301 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); |
351 } | 302 } |
352 uint16_t prevNorm16=getNorm16(c); | 303 uint16_t prevNorm16=getNorm16(c); |
353 if(prevNorm16<=minYesNo) { | 304 if(prevNorm16<=minYesNo) { |
354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 | 305 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 |
355 } else { | 306 } else { |
356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo | 307 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo |
357 } | 308 } |
358 } | 309 } |
359 | 310 |
// Review note: the whole anonymous namespace is new in the NEW column.
// It holds the two context objects that the trie-enumeration callbacks
// (enumLcccRange and enumNorm16PropertyStartsRange) receive through their
// `const void *context` argument.
| 311 namespace { |
| 312 |
// LcccContext: pairs a Normalizer2Impl with the UnicodeSet being filled.
// Both are held by reference, so the context must not outlive either object.
| 313 class LcccContext { |
| 314 public: |
| 315 LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} |
| 316 |
// handleRange: called once per [start, end] range that shares one norm16 value.
// Adds to `set` the code points whose FCD16 value is greater than 0xff, i.e.
// whose high byte is non-zero. Going by the class and caller names, that high
// byte is presumably the lead combining class (lccc) -- confirm against the
// getFCD16() documentation.
// If isAlgorithmicNoNo(norm16) is true, each code point is tested on its own;
// otherwise the FCD16 of `start` decides for the whole range.
| 317 void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { |
| 318 if(impl.isAlgorithmicNoNo(norm16)) { |
| 319 // Range of code points with same-norm16-value algorithmic decomposi
tions. |
| 320 // They might have different non-zero FCD16 values. |
| 321 do { |
| 322 uint16_t fcd16=impl.getFCD16(start); |
| 323 if(fcd16>0xff) { set.add(start); } |
| 324 } while(++start<=end); |
| 325 } else { |
| 326 uint16_t fcd16=impl.getFCD16(start); |
| 327 if(fcd16>0xff) { set.add(start, end); } |
| 328 } |
| 329 } |
| 330 |
| 331 private: |
| 332 const Normalizer2Impl &impl; |
| 333 UnicodeSet &set; |
| 334 }; |
| 335 |
// PropertyStartsContext: plain aggregate that carries the Normalizer2Impl and
// the USetAdder into enumNorm16PropertyStartsRange. It is built on the stack
// in addPropertyStarts(). Neither member is owned by the context.
| 336 struct PropertyStartsContext { |
| 337 PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) |
| 338 : impl(ni), sa(adder) {} |
| 339 |
| 340 const Normalizer2Impl &impl; |
| 341 const USetAdder *sa; |
| 342 }; |
| 343 |
| 344 } // namespace |
| 345 |
360 U_CDECL_BEGIN | 346 U_CDECL_BEGIN |
361 | 347 |
// Review note: the first row below is shared by both columns. In the OLD
// column it opens enumPropertyStartsRange; in the NEW column it opens the new
// enumLcccRange.
// enumLcccRange is the range callback that addLcccChars() passes to
// utrie2_enum(). `context` must point to an LcccContext. The C-style cast
// drops the const qualifier because LcccContext::handleRange() is a non-const
// member function.
// `value` is narrowed to uint16_t and forwarded as the norm16 of the range.
// Always returns TRUE (presumably "continue enumerating" -- see the
// utrie2_enum callback contract).
362 static UBool U_CALLCONV | 348 static UBool U_CALLCONV |
| 349 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
| 350 ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); |
| 351 return TRUE; |
| 352 } |
| 353 |
// Review note: new in the NEW column. This is the range callback that
// addPropertyStarts() now passes to utrie2_enum(); `context` must point to a
// PropertyStartsContext.
// Behaviour, per same-value range [start, end]:
//   1. `start` is always added to the USet through the USetAdder.
//   2. If the range holds more than one code point and the range value
//      (narrowed to uint16_t) satisfies isAlgorithmicNoNo(), the remaining
//      code points are walked one by one, and each whose FCD16 differs from
//      its predecessor's is added as well.
// Always returns TRUE.
| 354 static UBool U_CALLCONV |
| 355 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, u
int32_t value) { |
| 356 /* add the start code point to the USet */ |
| 357 const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; |
| 358 const USetAdder *sa=ctx->sa; |
| 359 sa->add(sa->set, start); |
| 360 if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { |
| 361 // Range of code points with same-norm16-value algorithmic decomposition
s. |
| 362 // They might have different non-zero FCD16 values. |
| 363 uint16_t prevFCD16=ctx->impl.getFCD16(start); |
| 364 while(++start<=end) { |
| 365 uint16_t fcd16=ctx->impl.getFCD16(start); |
| 366 if(fcd16!=prevFCD16) { |
| 367 sa->add(sa->set, start); |
| 368 prevFCD16=fcd16; |
| 369 } |
| 370 } |
| 371 } |
| 372 return TRUE; |
| 373 } |
| 374 |
// enumPropertyStartsRange: range callback whose `context` is the USetAdder
// itself (not a PropertyStartsContext). It adds only the first code point of
// each range and ignores `end` and `value`. Always returns TRUE.
// Review note: in the NEW column addPropertyStarts() no longer passes this
// callback to utrie2_enum(). Whether other code outside this excerpt still
// uses it is not visible here -- TODO confirm.
| 375 static UBool U_CALLCONV |
363 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uin
t32_t /*value*/) { | 376 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uin
t32_t /*value*/) { |
364 /* add the start code point to the USet */ | 377 /* add the start code point to the USet */ |
365 const USetAdder *sa=(const USetAdder *)context; | 378 const USetAdder *sa=(const USetAdder *)context; |
366 sa->add(sa->set, start); | 379 sa->add(sa->set, start); |
367 return TRUE; | 380 return TRUE; |
368 } | 381 } |
369 | 382 |
// segmentStarterMapper: value-mapping callback, identical in both columns.
// It ignores `context` and returns `value` masked with
// CANON_NOT_SEGMENT_STARTER, so only that flag survives.
// Its caller is outside this excerpt.
370 static uint32_t U_CALLCONV | 383 static uint32_t U_CALLCONV |
371 segmentStarterMapper(const void * /*context*/, uint32_t value) { | 384 segmentStarterMapper(const void * /*context*/, uint32_t value) { |
372 return value&CANON_NOT_SEGMENT_STARTER; | 385 return value&CANON_NOT_SEGMENT_STARTER; |
373 } | 386 } |
374 | 387 |
375 U_CDECL_END | 388 U_CDECL_END |
376 | 389 |
// Review note: the first row below is shared by both columns. In the OLD
// column it opens addPropertyStarts; in the NEW column it opens the new
// addLcccChars.
// addLcccChars enumerates normTrie with enumLcccRange. Through
// LcccContext::handleRange() that adds to `set` the code points whose FCD16
// value is greater than 0xff.
// NOTE(review): the block comment inside the body reads as if copied from
// addPropertyStarts(). This function adds the selected code points and
// ranges, not range start points -- consider rewording it upstream.
377 void | 390 void |
| 391 Normalizer2Impl::addLcccChars(UnicodeSet &set) const { |
| 392 /* add the start code point of each same-value range of each trie */ |
| 393 LcccContext context(*this, set); |
| 394 utrie2_enum(normTrie, NULL, enumLcccRange, &context); |
| 395 } |
| 396 |
// addPropertyStarts: reports to the USetAdder `sa` the code points at which
// values may change.
// OLD column: enumerates normTrie with enumPropertyStartsRange and passes `sa`
// directly as the callback context, so only range starts are added.
// NEW column: wraps `*this` and `sa` in a stack PropertyStartsContext and
// enumerates with enumNorm16PropertyStartsRange, which also adds code points
// inside algorithmic-noNo ranges where the FCD16 value changes.
// Both columns then add, for every Hangul code point c from HANGUL_BASE up to
// (but excluding) HANGUL_LIMIT in steps of JAMO_T_COUNT, both c and c+1, and
// finally HANGUL_LIMIT itself.
// The UErrorCode parameter is unnamed and never read or written.
| 397 void |
378 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode
*/) const { | 398 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode
*/) const { |
379 /* add the start code point of each same-value range of each trie */ | 399 /* add the start code point of each same-value range of each trie */ |
380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); | 400 PropertyStartsContext context(*this, sa); |
| 401 utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); |
381 | 402 |
382 /* add Hangul LV syllables and LV+1 because of skippables */ | 403 /* add Hangul LV syllables and LV+1 because of skippables */ |
383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_C
OUNT) { | 404 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_C
OUNT) { |
384 sa->add(sa->set, c); | 405 sa->add(sa->set, c); |
385 sa->add(sa->set, c+1); | 406 sa->add(sa->set, c+1); |
386 } | 407 } |
387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with oth
er properties */ | 408 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with oth
er properties */ |
388 } | 409 } |
389 | 410 |
390 void | 411 void |
(...skipping 21 matching lines...) Expand all Loading... |
412 // Back out the last character for full processing. | 433 // Back out the last character for full processing. |
413 // Copy this prefix. | 434 // Copy this prefix. |
414 if(--src!=prevSrc) { | 435 if(--src!=prevSrc) { |
415 if(buffer!=NULL) { | 436 if(buffer!=NULL) { |
416 buffer->appendZeroCC(prevSrc, src, errorCode); | 437 buffer->appendZeroCC(prevSrc, src, errorCode); |
417 } | 438 } |
418 } | 439 } |
419 return src; | 440 return src; |
420 } | 441 } |
421 | 442 |
// Review note: new in the NEW column.
// decompose(src, dest, errorCode): UnicodeString convenience overload.
// Returns `dest` in every case.
//   - If errorCode already signals failure, dest is set to bogus and nothing
//     else happens.
//   - If dest and src are the same object, or src.getBuffer() returns NULL,
//     errorCode is set to U_ILLEGAL_ARGUMENT_ERROR and dest is set to bogus.
//   - Otherwise the work is delegated to the pointer-based overload, with
//     src.length() as the destination length estimate.
// NOTE(review): if the delegated call fails, errorCode carries the failure,
// but dest is not explicitly set to bogus on this path.
| 443 UnicodeString & |
| 444 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, |
| 445 UErrorCode &errorCode) const { |
| 446 if(U_FAILURE(errorCode)) { |
| 447 dest.setToBogus(); |
| 448 return dest; |
| 449 } |
| 450 const UChar *sArray=src.getBuffer(); |
| 451 if(&dest==&src || sArray==NULL) { |
| 452 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 453 dest.setToBogus(); |
| 454 return dest; |
| 455 } |
| 456 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); |
| 457 return dest; |
| 458 } |
| 459 |
// Review note: new in the NEW column.
// decompose(src, limit, dest, destLengthEstimate, errorCode): pointer-based
// overload that writes its result into `dest`.
//   - A negative destLengthEstimate is replaced by (limit-src) when limit is
//     non-NULL. With limit==NULL the negative estimate is passed on unchanged
//     to ReorderingBuffer::init() -- presumably init() accepts that; confirm.
//   - dest.remove() is called before the buffer is attached (presumably this
//     empties dest -- see the UnicodeString API).
//   - A ReorderingBuffer is constructed over dest; only if its init() succeeds
//     is the buffer-based decompose() overload invoked.
// limit==NULL is forwarded as-is. The buffer-based overload has a dedicated
// limit==NULL branch that calls copyLowPrefixFromNulTerminated().
// No explicit U_FAILURE(errorCode) check is made here; errorCode is passed
// straight to buffer.init().
| 460 void |
| 461 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, |
| 462 UnicodeString &dest, |
| 463 int32_t destLengthEstimate, |
| 464 UErrorCode &errorCode) const { |
| 465 if(destLengthEstimate<0 && limit!=NULL) { |
| 466 destLengthEstimate=(int32_t)(limit-src); |
| 467 } |
| 468 dest.remove(); |
| 469 ReorderingBuffer buffer(*this, dest); |
| 470 if(buffer.init(destLengthEstimate, errorCode)) { |
| 471 decompose(src, limit, &buffer, errorCode); |
| 472 } |
| 473 } |
| 474 |
422 // Dual functionality: | 475 // Dual functionality: |
423 // buffer!=NULL: normalize | 476 // buffer!=NULL: normalize |
424 // buffer==NULL: isNormalized/spanQuickCheckYes | 477 // buffer==NULL: isNormalized/spanQuickCheckYes |
425 const UChar * | 478 const UChar * |
426 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, | 479 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, |
427 ReorderingBuffer *buffer, | 480 ReorderingBuffer *buffer, |
428 UErrorCode &errorCode) const { | 481 UErrorCode &errorCode) const { |
429 UChar32 minNoCP=minDecompNoCP; | 482 UChar32 minNoCP=minDecompNoCP; |
430 if(limit==NULL) { | 483 if(limit==NULL) { |
431 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); | 484 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); |
(...skipping 1614 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2046 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; | 2099 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; |
2047 offset=nextOffset; | 2100 offset=nextOffset; |
2048 | 2101 |
2049 U_ASSERT(offset==size); | 2102 U_ASSERT(offset==size); |
2050 } | 2103 } |
2051 | 2104 |
2052 return headerSize+size; | 2105 return headerSize+size; |
2053 } | 2106 } |
2054 | 2107 |
2055 #endif // !UCONFIG_NO_NORMALIZATION | 2108 #endif // !UCONFIG_NO_NORMALIZATION |
OLD | NEW |