icu46/source/common/normalizer2impl.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/normalizer2impl.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 2009-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: normalizer2impl.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2009nov22

	14 * created by: Markus W. Scherer

	15 */

	16

	17 #include "unicode/utypes.h"

	18

	19 #if !UCONFIG_NO_NORMALIZATION

	20

	21 #include "unicode/normalizer2.h"

	22 #include "unicode/udata.h"

	23 #include "unicode/ustring.h"

	24 #include "cmemory.h"

	25 #include "mutex.h"

	26 #include "normalizer2impl.h"

	27 #include "uassert.h"

	28 #include "uhash.h"

	29 #include "uset_imp.h"

	30 #include "utrie2.h"

	31 #include "uvector.h"

	32

	33 U_NAMESPACE_BEGIN

	34

	35 // ReorderingBuffer -------------------------------------------------------- ***

	36

	37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {

	38 int32_t length=str.length();

	39 start=str.getBuffer(destCapacity);

	40 if(start==NULL) {

	41 // getBuffer() already did str.setToBogus()

	42 errorCode=U_MEMORY_ALLOCATION_ERROR;

	43 return FALSE;

	44 }

	45 limit=start+length;

	46 remainingCapacity=str.getCapacity()-length;

	47 reorderStart=start;

	48 if(start==limit) {

	49 lastCC=0;

	50 } else {

	51 setIterator();

	52 lastCC=previousCC();

	53 // Set reorderStart after the last code point with cc<=1 if there is one .

	54 if(lastCC>1) {

	55 while(previousCC()>1) {}

	56 }

	57 reorderStart=codePointLimit;

	58 }

	59 return TRUE;

	60 }

	61

	62 UBool ReorderingBuffer::equals(const UChar otherStart, const UChar otherLimit) const {

	63 int32_t length=(int32_t)(limit-start);

	64 return

	65 length==(int32_t)(otherLimit-otherStart) &&

	66 0==u_memcmp(start, otherStart, length);

	67 }

	68

	69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &e rrorCode) {

	70 if(remainingCapacity<2 && !resize(2, errorCode)) {

	71 return FALSE;

	72 }

	73 if(lastCC<=cc \|\| cc==0) {

	74 limit[0]=U16_LEAD(c);

	75 limit[1]=U16_TRAIL(c);

	76 limit+=2;

	77 lastCC=cc;

	78 if(cc<=1) {

	79 reorderStart=limit;

	80 }

	81 } else {

	82 insert(c, cc);

	83 }

	84 remainingCapacity-=2;

	85 return TRUE;

	86 }

	87

	88 UBool ReorderingBuffer::append(const UChar *s, int32_t length,

	89 uint8_t leadCC, uint8_t trailCC,

	90 UErrorCode &errorCode) {

	91 if(length==0) {

	92 return TRUE;

	93 }

	94 if(remainingCapacity<length && !resize(length, errorCode)) {

	95 return FALSE;

	96 }

	97 remainingCapacity-=length;

	98 if(lastCC<=leadCC \|\| leadCC==0) {

	99 if(trailCC<=1) {

	100 reorderStart=limit+length;

	101 } else if(leadCC<=1) {

	102 reorderStart=limit+1; // Ok if not a code point boundary.

	103 }

	104 const UChar *sLimit=s+length;

	105 do { limit++=s++; } while(s!=sLimit);

	106 lastCC=trailCC;

	107 } else {

	108 int32_t i=0;

	109 UChar32 c;

	110 U16_NEXT(s, i, length, c);

	111 insert(c, leadCC); // insert first code point

	112 while(i<length) {

	113 U16_NEXT(s, i, length, c);

	114 if(i<length) {

	115 // s must be in NFD, otherwise we need to use getCC().

	116 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));

	117 } else {

	118 leadCC=trailCC;

	119 }

	120 append(c, leadCC, errorCode);

	121 }

	122 }

	123 return TRUE;

	124 }

	125

	126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {

	127 int32_t cpLength=U16_LENGTH(c);

	128 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {

	129 return FALSE;

	130 }

	131 remainingCapacity-=cpLength;

	132 if(cpLength==1) {

	133 *limit++=(UChar)c;

	134 } else {

	135 limit[0]=U16_LEAD(c);

	136 limit[1]=U16_TRAIL(c);

	137 limit+=2;

	138 }

	139 lastCC=0;

	140 reorderStart=limit;

	141 return TRUE;

	142 }

	143

	144 UBool ReorderingBuffer::appendZeroCC(const UChar s, const UChar sLimit, UError Code &errorCode) {

	145 if(s==sLimit) {

	146 return TRUE;

	147 }

	148 int32_t length=(int32_t)(sLimit-s);

	149 if(remainingCapacity<length && !resize(length, errorCode)) {

	150 return FALSE;

	151 }

	152 u_memcpy(limit, s, length);

	153 limit+=length;

	154 remainingCapacity-=length;

	155 lastCC=0;

	156 reorderStart=limit;

	157 return TRUE;

	158 }

	159

	160 void ReorderingBuffer::remove() {

	161 reorderStart=limit=start;

	162 remainingCapacity=str.getCapacity();

	163 lastCC=0;

	164 }

	165

	166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {

	167 if(suffixLength<(limit-start)) {

	168 limit-=suffixLength;

	169 remainingCapacity+=suffixLength;

	170 } else {

	171 limit=start;

	172 remainingCapacity=str.getCapacity();

	173 }

	174 lastCC=0;

	175 reorderStart=limit;

	176 }

	177

	178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {

	179 int32_t reorderStartIndex=(int32_t)(reorderStart-start);

	180 int32_t length=(int32_t)(limit-start);

	181 str.releaseBuffer(length);

	182 int32_t newCapacity=length+appendLength;

	183 int32_t doubleCapacity=2*str.getCapacity();

	184 if(newCapacity<doubleCapacity) {

	185 newCapacity=doubleCapacity;

	186 }

	187 if(newCapacity<256) {

	188 newCapacity=256;

	189 }

	190 start=str.getBuffer(newCapacity);

	191 if(start==NULL) {

	192 // getBuffer() already did str.setToBogus()

	193 errorCode=U_MEMORY_ALLOCATION_ERROR;

	194 return FALSE;

	195 }

	196 reorderStart=start+reorderStartIndex;

	197 limit=start+length;

	198 remainingCapacity=str.getCapacity()-length;

	199 return TRUE;

	200 }

	201

	202 void ReorderingBuffer::skipPrevious() {

	203 codePointLimit=codePointStart;

	204 UChar c=*--codePointStart;

	205 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1 ))) {

	206 --codePointStart;

	207 }

	208 }

	209

	210 uint8_t ReorderingBuffer::previousCC() {

	211 codePointLimit=codePointStart;

	212 if(reorderStart>=codePointStart) {

	213 return 0;

	214 }

	215 UChar32 c=*--codePointStart;

	216 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {

	217 return 0;

	218 }

	219

	220 UChar c2;

	221 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStar t-1))) {

	222 --codePointStart;

	223 c=U16_GET_SUPPLEMENTARY(c2, c);

	224 }

	225 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));

	226 }

	227

	228 // Inserts c somewhere before the last character.

	229 // Requires 0<cc<lastCC which implies reorderStart<limit.

	230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {

	231 for(setIterator(), skipPrevious(); previousCC()>cc;) {}

	232 // insert c at codePointLimit, after the character with prevCC<=cc

	233 UChar *q=limit;

	234 UChar *r=limit+=U16_LENGTH(c);

	235 do {

	236 --r=--q;

	237 } while(codePointLimit!=q);

	238 writeCodePoint(q, c);

	239 if(cc<=1) {

	240 reorderStart=r;

	241 }

	242 }

	243

	244 // Normalizer2Impl --------------------------------------------------------- ***

	245

	246 struct CanonIterData : public UMemory {

	247 CanonIterData(UErrorCode &errorCode);

	248 ~CanonIterData();

	249 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode );

	250 UTrie2 *trie;

	251 UVector canonStartSets; // contains UnicodeSet *

	252 };

	253

	254 Normalizer2Impl::~Normalizer2Impl() {

	255 udata_close(memory);

	256 utrie2_close(normTrie);

	257 UTrie2Singleton(fcdTrieSingleton).deleteInstance();

	258 delete (CanonIterData *)canonIterDataSingleton.fInstance;

	259 }

	260

	261 UBool U_CALLCONV

	262 Normalizer2Impl::isAcceptable(void *context,

	263 const char * /* type /, const char /name/,

	264 const UDataInfo *pInfo) {

	265 if(

	266 pInfo->size>=20 &&

	267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&

	268 pInfo->charsetFamily==U_CHARSET_FAMILY &&

	269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */

	270 pInfo->dataFormat[1]==0x72 &&

	271 pInfo->dataFormat[2]==0x6d &&

	272 pInfo->dataFormat[3]==0x32 &&

	273 pInfo->formatVersion[0]==1

	274 ) {

	275 Normalizer2Impl me=(Normalizer2Impl )context;

	276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);

	277 return TRUE;

	278 } else {

	279 return FALSE;

	280 }

	281 }

	282

	283 void

	284 Normalizer2Impl::load(const char packageName, const char name, UErrorCode &err orCode) {

	285 if(U_FAILURE(errorCode)) {

	286 return;

	287 }

	288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &error Code);

	289 if(U_FAILURE(errorCode)) {

	290 return;

	291 }

	292 const uint8_t inBytes=(const uint8_t )udata_getMemory(memory);

	293 const int32_t inIndexes=(const int32_t )inBytes;

	294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;

	295 if(indexesLength<=IX_MIN_MAYBE_YES) {

	296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.

	297 return;

	298 }

	299

	300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];

	301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

	302

	303 minYesNo=inIndexes[IX_MIN_YES_NO];

	304 minNoNo=inIndexes[IX_MIN_NO_NO];

	305 limitNoNo=inIndexes[IX_LIMIT_NO_NO];

	306 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

	307

	308 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];

	309 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];

	310 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,

	311 inBytes+offset, nextOffset-offset, NULL,

	312 &errorCode);

	313 if(U_FAILURE(errorCode)) {

	314 return;

	315 }

	316

	317 offset=nextOffset;

	318 maybeYesCompositions=(const uint16_t *)(inBytes+offset);

	319 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);

	320 }

	321

	322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar cpStart, co nst UChar cpLimit) const {

	323 UChar32 c;

	324 if(cpStart==(cpLimit-1)) {

	325 c=*cpStart;

	326 } else {

	327 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);

	328 }

	329 uint16_t prevNorm16=getNorm16(c);

	330 if(prevNorm16<=minYesNo) {

	331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0

	332 } else {

	333 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo

	334 }

	335 }

	336

	337 U_CDECL_BEGIN

	338

	339 static UBool U_CALLCONV

	340 enumPropertyStartsRange(const void context, UChar32 start, UChar32 /end/, uin t32_t /value*/) {

	341 /* add the start code point to the USet */

	342 const USetAdder sa=(const USetAdder )context;

	343 sa->add(sa->set, start);

	344 return TRUE;

	345 }

	346

	347 static uint32_t U_CALLCONV

	348 segmentStarterMapper(const void * /context/, uint32_t value) {

	349 return value&CANON_NOT_SEGMENT_STARTER;

	350 }

	351

	352 U_CDECL_END

	353

	354 void

	355 Normalizer2Impl::addPropertyStarts(const USetAdder sa, UErrorCode & /errorCode */) const {

	356 /* add the start code point of each same-value range of each trie */

	357 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);

	358

	359 /* add Hangul LV syllables and LV+1 because of skippables */

	360 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_C OUNT) {

	361 sa->add(sa->set, c);

	362 sa->add(sa->set, c+1);

	363 }

	364 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with oth er properties */

	365 }

	366

	367 void

	368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &err orCode) const {

	369 /* add the start code point of each same-value range of the canonical iterat or data trie */

	370 if(ensureCanonIterData(errorCode)) {

	371 // currently only used for the SEGMENT_STARTER property

	372 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,

	373 segmentStarterMapper, enumPropertyStartsRange, sa);

	374 }

	375 }

	376

	377 const UChar *

	378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,

	379 UChar32 minNeedDataCP,

	380 ReorderingBuffer *buffer,

	381 UErrorCode &errorCode) const {

	382 // Make some effort to support NUL-terminated strings reasonably.

	383 // Take the part of the fast quick check loop that does not look up

	384 // data and check the first part of the string.

	385 // After this prefix, determine the string length to simplify the rest

	386 // of the code.

	387 const UChar *prevSrc=src;

	388 UChar c;

	389 while((c=*src++)<minNeedDataCP && c!=0) {}

	390 // Back out the last character for full processing.

	391 // Copy this prefix.

	392 if(--src!=prevSrc) {

	393 if(buffer!=NULL) {

	394 buffer->appendZeroCC(prevSrc, src, errorCode);

	395 }

	396 }

	397 return src;

	398 }

	399

	400 // Dual functionality:

	401 // buffer!=NULL: normalize

	402 // buffer==NULL: isNormalized/spanQuickCheckYes

	403 const UChar *

	404 Normalizer2Impl::decompose(const UChar src, const UChar limit,

	405 ReorderingBuffer *buffer,

	406 UErrorCode &errorCode) const {

	407 UChar32 minNoCP=minDecompNoCP;

	408 if(limit==NULL) {

	409 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);

	410 if(U_FAILURE(errorCode)) {

	411 return src;

	412 }

	413 limit=u_strchr(src, 0);

	414 }

	415

	416 const UChar *prevSrc;

	417 UChar32 c=0;

	418 uint16_t norm16=0;

	419

	420 // only for quick check

	421 const UChar *prevBoundary=src;

	422 uint8_t prevCC=0;

	423

	424 for(;;) {

	425 // count code units below the minimum or with irrelevant data for the qu ick check

	426 for(prevSrc=src; src!=limit;) {

	427 if( (c=*src)<minNoCP \|\|

	428 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEA D(normTrie, c))

	429 ) {

	430 ++src;

	431 } else if(!U16_IS_SURROGATE(c)) {

	432 break;

	433 } else {

	434 UChar c2;

	435 if(U16_IS_SURROGATE_LEAD(c)) {

	436 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

	437 c=U16_GET_SUPPLEMENTARY(c, c2);

	438 }

	439 } else /* trail surrogate */ {

	440 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

	441 --src;

	442 c=U16_GET_SUPPLEMENTARY(c2, c);

	443 }

	444 }

	445 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {

	446 src+=U16_LENGTH(c);

	447 } else {

	448 break;

	449 }

	450 }

	451 }

	452 // copy these code units all at once

	453 if(src!=prevSrc) {

	454 if(buffer!=NULL) {

	455 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {

	456 break;

	457 }

	458 } else {

	459 prevCC=0;

	460 prevBoundary=src;

	461 }

	462 }

	463 if(src==limit) {

	464 break;

	465 }

	466

	467 // Check one above-minimum, relevant code point.

	468 src+=U16_LENGTH(c);

	469 if(buffer!=NULL) {

	470 if(!decompose(c, norm16, *buffer, errorCode)) {

	471 break;

	472 }

	473 } else {

	474 if(isDecompYes(norm16)) {

	475 uint8_t cc=getCCFromYesOrMaybe(norm16);

	476 if(prevCC<=cc \|\| cc==0) {

	477 prevCC=cc;

	478 if(cc<=1) {

	479 prevBoundary=src;

	480 }

	481 continue;

	482 }

	483 }

	484 return prevBoundary; // "no" or cc out of order

	485 }

	486 }

	487 return src;

	488 }

	489

	490 // Decompose a short piece of text which is likely to contain characters that

	491 // fail the quick check loop and/or where the quick check loop's overhead

	492 // is unlikely to be amortized.

	493 // Called by the compose() and makeFCD() implementations.

	494 UBool Normalizer2Impl::decomposeShort(const UChar src, const UChar limit,

	495 ReorderingBuffer &buffer,

	496 UErrorCode &errorCode) const {

	497 while(src<limit) {

	498 UChar32 c;

	499 uint16_t norm16;

	500 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);

	501 if(!decompose(c, norm16, buffer, errorCode)) {

	502 return FALSE;

	503 }

	504 }

	505 return TRUE;

	506 }

	507

	508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,

	509 ReorderingBuffer &buffer,

	510 UErrorCode &errorCode) const {

	511 // Only loops for 1:1 algorithmic mappings.

	512 for(;;) {

	513 // get the decomposition and the lead and trail cc's

	514 if(isDecompYes(norm16)) {

	515 // c does not decompose

	516 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);

	517 } else if(isHangul(norm16)) {

	518 // Hangul syllable: decompose algorithmically

	519 UChar jamos[3];

	520 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);

	521 } else if(isDecompNoAlgorithmic(norm16)) {

	522 c=mapAlgorithmic(c, norm16);

	523 norm16=getNorm16(c);

	524 } else {

	525 // c decomposes, get everything from the variable-length extra data

	526 const uint16_t *mapping=getMapping(norm16);

	527 uint16_t firstUnit=*mapping++;

	528 int32_t length=firstUnit&MAPPING_LENGTH_MASK;

	529 uint8_t leadCC, trailCC;

	530 trailCC=(uint8_t)(firstUnit>>8);

	531 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

	532 leadCC=(uint8_t)(*mapping++>>8);

	533 } else {

	534 leadCC=0;

	535 }

	536 return buffer.append((const UChar *)mapping, length, leadCC, trailCC , errorCode);

	537 }

	538 }

	539 }

	540

	541 const UChar *

	542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) c onst {

	543 const UChar *decomp=NULL;

	544 uint16_t norm16;

	545 for(;;) {

	546 if(c<minDecompNoCP \|\| isDecompYes(norm16=getNorm16(c))) {

	547 // c does not decompose

	548 return decomp;

	549 } else if(isHangul(norm16)) {

	550 // Hangul syllable: decompose algorithmically

	551 length=Hangul::decompose(c, buffer);

	552 return buffer;

	553 } else if(isDecompNoAlgorithmic(norm16)) {

	554 c=mapAlgorithmic(c, norm16);

	555 decomp=buffer;

	556 length=0;

	557 U16_APPEND_UNSAFE(buffer, length, c);

	558 } else {

	559 // c decomposes, get everything from the variable-length extra data

	560 const uint16_t *mapping=getMapping(norm16);

	561 uint16_t firstUnit=*mapping++;

	562 length=firstUnit&MAPPING_LENGTH_MASK;

	563 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

	564 ++mapping;

	565 }

	566 return (const UChar *)mapping;

	567 }

	568 }

	569 }

	570

	571 void Normalizer2Impl::decomposeAndAppend(const UChar src, const UChar limit,

	572 UBool doDecompose,

	573 ReorderingBuffer &buffer,

	574 UErrorCode &errorCode) const {

	575 if(doDecompose) {

	576 decompose(src, limit, &buffer, errorCode);

	577 return;

	578 }

	579 // Just merge the strings at the boundary.

	580 ForwardUTrie2StringIterator iter(normTrie, src, limit);

	581 uint8_t firstCC, prevCC, cc;

	582 firstCC=prevCC=cc=getCC(iter.next16());

	583 while(cc!=0) {

	584 prevCC=cc;

	585 cc=getCC(iter.next16());

	586 };

	587 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, erro rCode) &&

	588 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);

	589 }

	590

	591 // Note: hasDecompBoundary() could be implemented as aliases to

	592 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()

	593 // at the cost of building the FCD trie for a decomposition normalizer.

	594 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {

	595 for(;;) {

	596 if(c<minDecompNoCP) {

	597 return TRUE;

	598 }

	599 uint16_t norm16=getNorm16(c);

	600 if(isHangul(norm16) \|\| isDecompYesAndZeroCC(norm16)) {

	601 return TRUE;

	602 } else if(norm16>MIN_NORMAL_MAYBE_YES) {

	603 return FALSE; // ccc!=0

	604 } else if(isDecompNoAlgorithmic(norm16)) {

	605 c=mapAlgorithmic(c, norm16);

	606 } else {

	607 // c decomposes, get everything from the variable-length extra data

	608 const uint16_t *mapping=getMapping(norm16);

	609 uint16_t firstUnit=*mapping++;

	610 if((firstUnit&MAPPING_LENGTH_MASK)==0) {

	611 return FALSE;

	612 }

	613 if(!before) {

	614 // decomp after-boundary: same as hasFCDBoundaryAfter(),

	615 // fcd16<=1 \|\| trailCC==0

	616 if(firstUnit>0x1ff) {

	617 return FALSE; // trailCC>1

	618 }

	619 if(firstUnit<=0xff) {

	620 return TRUE; // trailCC==0

	621 }

	622 // if(trailCC==1) test leadCC==0, same as checking for before-bo undary

	623 }

	624 // TRUE if leadCC==0 (hasFCDBoundaryBefore())

	625 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 \|\| (*mapping&0xff00) ==0;

	626 }

	627 }

	628 }

	629

	630 /*

	631 * Finds the recomposition result for

	632 * a forward-combining "lead" character,

	633 * specified with a pointer to its compositions list,

	634 * and a backward-combining "trail" character.

	635 *

	636 * If the lead and trail characters combine, then this function returns

	637 * the following "compositeAndFwd" value:

	638 * Bits 21..1 composite character

	639 * Bit 0 set if the composite is a forward-combining starter

	640 * otherwise it returns -1.

	641 *

	642 * The compositions list has (trail, compositeAndFwd) pair entries,

	643 * encoded as either pairs or triples of 16-bit units.

	644 * The last entry has the high bit of its first unit set.

	645 *

	646 * The list is sorted by ascending trail characters (there are no duplicates).

	647 * A linear search is used.

	648 *

	649 * See normalizer2impl.h for a more detailed description

	650 * of the compositions list format.

	651 */

	652 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {

	653 uint16_t key1, firstUnit;

	654 if(trail<COMP_1_TRAIL_LIMIT) {

	655 // trail character is 0..33FF

	656 // result entry may have 2 or 3 units

	657 key1=(uint16_t)(trail<<1);

	658 while(key1>(firstUnit=*list)) {

	659 list+=2+(firstUnit&COMP_1_TRIPLE);

	660 }

	661 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

	662 if(firstUnit&COMP_1_TRIPLE) {

	663 return ((int32_t)list[1]<<16)\|list[2];

	664 } else {

	665 return list[1];

	666 }

	667 }

	668 } else {

	669 // trail character is 3400..10FFFF

	670 // result entry has 3 units

	671 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+

	672 (((trail>>COMP_1_TRAIL_SHIFT))&

	673 ~COMP_1_TRIPLE));

	674 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);

	675 uint16_t secondUnit;

	676 for(;;) {

	677 if(key1>(firstUnit=*list)) {

	678 list+=2+(firstUnit&COMP_1_TRIPLE);

	679 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

	680 if(key2>(secondUnit=list[1])) {

	681 if(firstUnit&COMP_1_LAST_TUPLE) {

	682 break;

	683 } else {

	684 list+=3;

	685 }

	686 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {

	687 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)\|list[2 ];

	688 } else {

	689 break;

	690 }

	691 } else {

	692 break;

	693 }

	694 }

	695 }

	696 return -1;

	697 }

	698

	699 /**

	700 * @param list some character's compositions list

	701 * @param set recursively receives the composites from these compositions

	702 */

	703 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {

	704 uint16_t firstUnit;

	705 int32_t compositeAndFwd;

	706 do {

	707 firstUnit=*list;

	708 if((firstUnit&COMP_1_TRIPLE)==0) {

	709 compositeAndFwd=list[1];

	710 list+=2;

	711 } else {

	712 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)\|list[2];

	713 list+=3;

	714 }

	715 UChar32 composite=compositeAndFwd>>1;

	716 if((compositeAndFwd&1)!=0) {

	717 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);

	718 }

	719 set.add(composite);

	720 } while((firstUnit&COMP_1_LAST_TUPLE)==0);

	721 }

	722

	723 /*

	724 * Recomposes the buffer text starting at recomposeStartIndex

	725 * (which is in NFD - decomposed and canonically ordered),

	726 * and truncates the buffer contents.

	727 *

	728 * Note that recomposition never lengthens the text:

	729 * Any character consists of either one or two code units;

	730 * a composition may contain at most one more code unit than the original starte r,

	731 * while the combining mark that is removed has at least one code unit.

	732 */

	733 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart Index,

	734 UBool onlyContiguous) const {

	735 UChar *p=buffer.getStart()+recomposeStartIndex;

	736 UChar *limit=buffer.getLimit();

	737 if(p==limit) {

	738 return;

	739 }

	740

	741 UChar starter, pRemove, q, r;

	742 const uint16_t *compositionsList;

	743 UChar32 c, compositeAndFwd;

	744 uint16_t norm16;

	745 uint8_t cc, prevCC;

	746 UBool starterIsSupplementary;

	747

	748 // Some of the following variables are not used until we have a forward-comb ining starter

	749 // and are only initialized now to avoid compiler warnings.

	750 compositionsList=NULL; // used as indicator for whether we have a forward-c ombining starter

	751 starter=NULL;

	752 starterIsSupplementary=FALSE;

	753 prevCC=0;

	754

	755 for(;;) {

	756 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);

	757 cc=getCCFromYesOrMaybe(norm16);

	758 if( // this character combines backward and

	759 isMaybe(norm16) &&

	760 // we have seen a starter that combines forward and

	761 compositionsList!=NULL &&

	762 // the backward-combining character is not blocked

	763 (prevCC<cc \|\| prevCC==0)

	764 ) {

	765 if(isJamoVT(norm16)) {

	766 // c is a Jamo V/T, see if we can compose it with the previous c haracter.

	767 if(c<Hangul::JAMO_T_BASE) {

	768 // c is a Jamo Vowel, compose with previous Jamo L and follo wing Jamo T.

	769 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);

	770 if(prev<Hangul::JAMO_L_COUNT) {

	771 pRemove=p-1;

	772 UChar syllable=(UChar)

	773 (Hangul::HANGUL_BASE+

	774 (prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))

	775 Hangul::JAMO_T_COUNT);

	776 UChar t;

	777 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangu l::JAMO_T_COUNT) {

	778 ++p;

	779 syllable+=t; // The next character was a Jamo T.

	780 }

	781 *starter=syllable;

	782 // remove the Jamo V/T

	783 q=pRemove;

	784 r=p;

	785 while(r<limit) {

	786 q++=r++;

	787 }

	788 limit=q;

	789 p=pRemove;

	790 }

	791 }

	792 /*

	793 * No "else" for Jamo T:

	794 * Since the input is in NFD, there are no Hangul LV syllables t hat

	795 * a Jamo T could combine with.

	796 * All Jamo Ts are combined above when handling Jamo Vs.

	797 */

	798 if(p==limit) {

	799 break;

	800 }

	801 compositionsList=NULL;

	802 continue;

	803 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {

	804 // The starter and the combining mark (c) do combine.

	805 UChar32 composite=compositeAndFwd>>1;

	806

	807 // Replace the starter with the composite, remove the combining mark.

	808 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the c ombining mark

	809 if(starterIsSupplementary) {

	810 if(U_IS_SUPPLEMENTARY(composite)) {

	811 // both are supplementary

	812 starter[0]=U16_LEAD(composite);

	813 starter[1]=U16_TRAIL(composite);

	814 } else {

	815 *starter=(UChar)composite;

	816 // The composite is shorter than the starter,

	817 // move the intermediate characters forward one.

	818 starterIsSupplementary=FALSE;

	819 q=starter+1;

	820 r=q+1;

	821 while(r<pRemove) {

	822 q++=r++;

	823 }

	824 --pRemove;

	825 }

	826 } else if(U_IS_SUPPLEMENTARY(composite)) {

	827 // The composite is longer than the starter,

	828 // move the intermediate characters back one.

	829 starterIsSupplementary=TRUE;

	830 ++starter; // temporarily increment for the loop boundary

	831 q=pRemove;

	832 r=++pRemove;

	833 while(starter<q) {

	834 --r=--q;

	835 }

	836 *starter=U16_TRAIL(composite);

	837 *--starter=U16_LEAD(composite); // undo the temporary incre ment

	838 } else {

	839 // both are on the BMP

	840 *starter=(UChar)composite;

	841 }

	842

	843 /* remove the combining mark by moving the following text over i t */

	844 if(pRemove<p) {

	845 q=pRemove;

	846 r=p;

	847 while(r<limit) {

	848 q++=r++;

	849 }

	850 limit=q;

	851 p=pRemove;

	852 }

	853 // Keep prevCC because we removed the combining mark.

	854

	855 if(p==limit) {

	856 break;

	857 }

	858 // Is the composite a starter that combines forward?

	859 if(compositeAndFwd&1) {

	860 compositionsList=

	861 getCompositionsListForComposite(getNorm16(composite));

	862 } else {

	863 compositionsList=NULL;

	864 }

	865

	866 // We combined; continue with looking for compositions.

	867 continue;

	868 }

	869 }

	870

	871 // no combination this time

	872 prevCC=cc;

	873 if(p==limit) {

	874 break;

	875 }

	876

	877 // If c did not combine, then check if it is a starter.

	878 if(cc==0) {

	879 // Found a new starter.

	880 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {

	881 // It may combine with something, prepare for it.

	882 if(U_IS_BMP(c)) {

	883 starterIsSupplementary=FALSE;

	884 starter=p-1;

	885 } else {

	886 starterIsSupplementary=TRUE;

	887 starter=p-2;

	888 }

	889 }

	890 } else if(onlyContiguous) {

	891 // FCC: no discontiguous compositions; any intervening character blo cks.

	892 compositionsList=NULL;

	893 }

	894 }

	895 buffer.setReorderingLimit(limit);

	896 }

	897

	898 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.

	899 // doCompose: normalize

	900 // !doCompose: isNormalized (buffer must be empty and initialized)

	901 UBool

	902 Normalizer2Impl::compose(const UChar src, const UChar limit,

	903 UBool onlyContiguous,

	904 UBool doCompose,

	905 ReorderingBuffer &buffer,

	906 UErrorCode &errorCode) const {

	907 /*

	908 * prevBoundary points to the last character before the current one

	909 * that has a composition boundary before it with ccc==0 and quick check "ye s".

	910 * Keeping track of prevBoundary saves us looking for a composition boundary

	911 * when we find a "no" or "maybe".

	912 *

	913 * When we back out from prevSrc back to prevBoundary,

	914 * then we also remove those same characters (which had been simply copied

	915 * or canonically-order-inserted) from the ReorderingBuffer.

	916 * Therefore, at all times, the [prevBoundary..prevSrc[ source units

	917 * must correspond 1:1 to destination units at the end of the destination bu ffer.

	918 */

	919 const UChar *prevBoundary=src;

	920 UChar32 minNoMaybeCP=minCompNoMaybeCP;

	921 if(limit==NULL) {

	922 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,

	923 doCompose ? &buffer : NULL,

	924 errorCode);

	925 if(U_FAILURE(errorCode)) {

	926 return FALSE;

	927 }

	928 if(prevBoundary<src) {

	929 // Set prevBoundary to the last character in the prefix.

	930 prevBoundary=src-1;

	931 }

	932 limit=u_strchr(src, 0);

	933 }

	934

	935 const UChar *prevSrc;

	936 UChar32 c=0;

	937 uint16_t norm16=0;

	938

	939 // only for isNormalized

	940 uint8_t prevCC=0;

	941

	942 for(;;) {

	943 // count code units below the minimum or with irrelevant data for the qu ick check

	944 for(prevSrc=src; src!=limit;) {

	945 if( (c=*src)<minNoMaybeCP \|\|

	946 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(norm Trie, c))

	947 ) {

	948 ++src;

	949 } else if(!U16_IS_SURROGATE(c)) {

	950 break;

	951 } else {

	952 UChar c2;

	953 if(U16_IS_SURROGATE_LEAD(c)) {

	954 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

	955 c=U16_GET_SUPPLEMENTARY(c, c2);

	956 }

	957 } else /* trail surrogate */ {

	958 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

	959 --src;

	960 c=U16_GET_SUPPLEMENTARY(c2, c);

	961 }

	962 }

	963 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {

	964 src+=U16_LENGTH(c);

	965 } else {

	966 break;

	967 }

	968 }

	969 }

	970 // copy these code units all at once

	971 if(src!=prevSrc) {

	972 if(doCompose) {

	973 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {

	974 break;

	975 }

	976 } else {

	977 prevCC=0;

	978 }

	979 if(src==limit) {

	980 break;

	981 }

	982 // Set prevBoundary to the last character in the quick check loop.

	983 prevBoundary=src-1;

	984 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&

	985 U16_IS_LEAD(*(prevBoundary-1))

	986 ) {

	987 --prevBoundary;

	988 }

	989 // The start of the current character (c).

	990 prevSrc=src;

	991 } else if(src==limit) {

	992 break;

	993 }

	994

	995 src+=U16_LENGTH(c);

	996 /*

	997 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

	998 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backwa rd)

	999 * or has ccc!=0.

	1000 * Check for Jamo V/T, then for regular characters.

	1001 * c is not a Hangul syllable or Jamo L because those have "yes" propert ies.

	1002 */

	1003 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {

	1004 UChar prev=*(prevSrc-1);

	1005 UBool needToDecompose=FALSE;

	1006 if(c<Hangul::JAMO_T_BASE) {

	1007 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.

	1008 prev=(UChar)(prev-Hangul::JAMO_L_BASE);

	1009 if(prev<Hangul::JAMO_L_COUNT) {

	1010 if(!doCompose) {

	1011 return FALSE;

	1012 }

	1013 UChar syllable=(UChar)

	1014 (Hangul::HANGUL_BASE+

	1015 (prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))

	1016 Hangul::JAMO_T_COUNT);

	1017 UChar t;

	1018 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangu l::JAMO_T_COUNT) {

	1019 ++src;

	1020 syllable+=t; // The next character was a Jamo T.

	1021 prevBoundary=src;

	1022 buffer.setLastChar(syllable);

	1023 continue;

	1024 }

	1025 // If we see L+V+x where x!=T then we drop to the slow path,

	1026 // decompose and recompose.

	1027 // This is to deal with NFKC finding normal L and V but a

	1028 // compatibility variant of a T. We need to either fully com pose that

	1029 // combination here (which would complicate the code and may not work

	1030 // with strange custom data) or use the slow path -- or else our replacing

	1031 // two input characters (L+V) with one output character (LV syllable)

	1032 // would violate the invariant that [prevBoundary..prevSrc[ has the same

	1033 // length as what we appended to the buffer since prevBounda ry.

	1034 needToDecompose=TRUE;

	1035 }

	1036 } else if(Hangul::isHangulWithoutJamoT(prev)) {

	1037 // c is a Jamo Trailing consonant,

	1038 // compose with previous Hangul LV that does not contain a Jamo T.

	1039 if(!doCompose) {

	1040 return FALSE;

	1041 }

	1042 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));

	1043 prevBoundary=src;

	1044 continue;

	1045 }

	1046 if(!needToDecompose) {

	1047 // The Jamo V/T did not compose into a Hangul syllable.

	1048 if(doCompose) {

	1049 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {

	1050 break;

	1051 }

	1052 } else {

	1053 prevCC=0;

	1054 }

	1055 continue;

	1056 }

	1057 }

	1058 /*

	1059 * Source buffer pointers:

	1060 *

	1061 * all done quick check current char not yet

	1062 * "yes" but (c) processed

	1063 * may combine

	1064 * forward

	1065 * [-------------[-------------[-------------[-------------[

	1066 * \| \| \| \| \|

	1067 * orig. src prevBoundary prevSrc src limit

	1068 *

	1069 *

	1070 * Destination buffer pointers inside the ReorderingBuffer:

	1071 *

	1072 * all done might take not filled yet

	1073 * characters for

	1074 * reordering

	1075 * [-------------[-------------[-------------[

	1076 * \| \| \| \|

	1077 * start reorderStart limit \|

	1078 * +remainingCap.+

	1079 */

	1080 if(norm16>=MIN_YES_YES_WITH_CC) {

	1081 uint8_t cc=(uint8_t)norm16; // cc!=0

	1082 if( onlyContiguous && // FCC

	1083 (doCompose ? buffer.getLastCC() : prevCC)==0 &&

	1084 prevBoundary<prevSrc &&

	1085 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that

	1086 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)

	1087 // passed the quick check "yes && ccc==0" test.

	1088 // Check whether the last character was a "yesYes" or a "yesNo".

	1089 // If a "yesNo", then we get its trailing ccc from its

	1090 // mapping and check for canonical order.

	1091 // All other cases are ok.

	1092 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc

	1093 ) {

	1094 // Fails FCD test, need to decompose and contiguously recompose.

	1095 if(!doCompose) {

	1096 return FALSE;

	1097 }

	1098 } else if(doCompose) {

	1099 if(!buffer.append(c, cc, errorCode)) {

	1100 break;

	1101 }

	1102 continue;

	1103 } else if(prevCC<=cc) {

	1104 prevCC=cc;

	1105 continue;

	1106 } else {

	1107 return FALSE;

	1108 }

	1109 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {

	1110 return FALSE;

	1111 }

	1112

	1113 /*

	1114 * Find appropriate boundaries around this character,

	1115 * decompose the source text from between the boundaries,

	1116 * and recompose it.

	1117 *

	1118 * We may need to remove the last few characters from the ReorderingBuff er

	1119 * to account for source text that was copied or appended

	1120 * but needs to take part in the recomposition.

	1121 */

	1122

	1123 /*

	1124 * Find the last composition boundary in [prevBoundary..src[.

	1125 * It is either the decomposition of the current character (at prevSrc),

	1126 * or prevBoundary.

	1127 */

	1128 if(hasCompBoundaryBefore(c, norm16)) {

	1129 prevBoundary=prevSrc;

	1130 } else if(doCompose) {

	1131 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));

	1132 }

	1133

	1134 // Find the next composition boundary in [src..limit[ -

	1135 // modifies src to point to the next starter.

	1136 src=(UChar *)findNextCompBoundary(src, limit);

	1137

	1138 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.

	1139 int32_t recomposeStartIndex=buffer.length();

	1140 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {

	1141 break;

	1142 }

	1143 recompose(buffer, recomposeStartIndex, onlyContiguous);

	1144 if(!doCompose) {

	1145 if(!buffer.equals(prevBoundary, src)) {

	1146 return FALSE;

	1147 }

	1148 buffer.remove();

	1149 prevCC=0;

	1150 }

	1151

	1152 // Move to the next starter. We never need to look back before this poin t again.

	1153 prevBoundary=src;

	1154 }

	1155 return TRUE;

	1156 }

	1157

	1158 // Very similar to compose(): Make the same changes in both places if relevant.

	1159 // pQCResult==NULL: spanQuickCheckYes

	1160 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)

	1161 const UChar *

	1162 Normalizer2Impl::composeQuickCheck(const UChar src, const UChar limit,

	1163 UBool onlyContiguous,

	1164 UNormalizationCheckResult *pQCResult) const {

	1165 /*

	1166 * prevBoundary points to the last character before the current one

	1167 * that has a composition boundary before it with ccc==0 and quick check "ye s".

	1168 */

	1169 const UChar *prevBoundary=src;

	1170 UChar32 minNoMaybeCP=minCompNoMaybeCP;

	1171 if(limit==NULL) {

	1172 UErrorCode errorCode=U_ZERO_ERROR;

	1173 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);

	1174 if(prevBoundary<src) {

	1175 // Set prevBoundary to the last character in the prefix.

	1176 prevBoundary=src-1;

	1177 }

	1178 limit=u_strchr(src, 0);

	1179 }

	1180

	1181 const UChar *prevSrc;

	1182 UChar32 c=0;

	1183 uint16_t norm16=0;

	1184 uint8_t prevCC=0;

	1185

	1186 for(;;) {

	1187 // count code units below the minimum or with irrelevant data for the qu ick check

	1188 for(prevSrc=src;;) {

	1189 if(src==limit) {

	1190 return src;

	1191 }

	1192 if( (c=*src)<minNoMaybeCP \|\|

	1193 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(norm Trie, c))

	1194 ) {

	1195 ++src;

	1196 } else if(!U16_IS_SURROGATE(c)) {

	1197 break;

	1198 } else {

	1199 UChar c2;

	1200 if(U16_IS_SURROGATE_LEAD(c)) {

	1201 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

	1202 c=U16_GET_SUPPLEMENTARY(c, c2);

	1203 }

	1204 } else /* trail surrogate */ {

	1205 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

	1206 --src;

	1207 c=U16_GET_SUPPLEMENTARY(c2, c);

	1208 }

	1209 }

	1210 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {

	1211 src+=U16_LENGTH(c);

	1212 } else {

	1213 break;

	1214 }

	1215 }

	1216 }

	1217 if(src!=prevSrc) {

	1218 // Set prevBoundary to the last character in the quick check loop.

	1219 prevBoundary=src-1;

	1220 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&

	1221 U16_IS_LEAD(*(prevBoundary-1))

	1222 ) {

	1223 --prevBoundary;

	1224 }

	1225 prevCC=0;

	1226 // The start of the current character (c).

	1227 prevSrc=src;

	1228 }

	1229

	1230 src+=U16_LENGTH(c);

	1231 /*

	1232 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

	1233 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backwa rd)

	1234 * or has ccc!=0.

	1235 */

	1236 if(isMaybeOrNonZeroCC(norm16)) {

	1237 uint8_t cc=getCCFromYesOrMaybe(norm16);

	1238 if( onlyContiguous && // FCC

	1239 cc!=0 &&

	1240 prevCC==0 &&

	1241 prevBoundary<prevSrc &&

	1242 // prevCC==0 && prevBoundary<prevSrc tell us that

	1243 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)

	1244 // passed the quick check "yes && ccc==0" test.

	1245 // Check whether the last character was a "yesYes" or a "yesNo".

	1246 // If a "yesNo", then we get its trailing ccc from its

	1247 // mapping and check for canonical order.

	1248 // All other cases are ok.

	1249 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc

	1250 ) {

	1251 // Fails FCD test.

	1252 } else if(prevCC<=cc \|\| cc==0) {

	1253 prevCC=cc;

	1254 if(norm16<MIN_YES_YES_WITH_CC) {

	1255 if(pQCResult!=NULL) {

	1256 *pQCResult=UNORM_MAYBE;

	1257 } else {

	1258 return prevBoundary;

	1259 }

	1260 }

	1261 continue;

	1262 }

	1263 }

	1264 if(pQCResult!=NULL) {

	1265 *pQCResult=UNORM_NO;

	1266 }

	1267 return prevBoundary;

	1268 }

	1269 }

	1270

	1271 void Normalizer2Impl::composeAndAppend(const UChar src, const UChar limit,

	1272 UBool doCompose,

	1273 UBool onlyContiguous,

	1274 ReorderingBuffer &buffer,

	1275 UErrorCode &errorCode) const {

	1276 if(!buffer.isEmpty()) {

	1277 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);

	1278 if(src!=firstStarterInSrc) {

	1279 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getSt art(),

	1280 buffer.getLi mit());

	1281 UnicodeString middle(lastStarterInDest,

	1282 (int32_t)(buffer.getLimit()-lastStarterInDest)) ;

	1283 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));

	1284 middle.append(src, (int32_t)(firstStarterInSrc-src));

	1285 const UChar *middleStart=middle.getBuffer();

	1286 compose(middleStart, middleStart+middle.length(), onlyContiguous,

	1287 TRUE, buffer, errorCode);

	1288 if(U_FAILURE(errorCode)) {

	1289 return;

	1290 }

	1291 src=firstStarterInSrc;

	1292 }

	1293 }

	1294 if(doCompose) {

	1295 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);

	1296 } else {

	1297 buffer.appendZeroCC(src, limit, errorCode);

	1298 }

	1299 }

	1300

	1301 /**

	1302 * Does c have a composition boundary before it?

	1303 * True if its decomposition begins with a character that has

	1304 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).

	1305 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes

	1306 * (isCompYesAndZeroCC()) so we need not decompose.

	1307 */

	1308 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {

	1309 for(;;) {

	1310 if(isCompYesAndZeroCC(norm16)) {

	1311 return TRUE;

	1312 } else if(isMaybeOrNonZeroCC(norm16)) {

	1313 return FALSE;

	1314 } else if(isDecompNoAlgorithmic(norm16)) {

	1315 c=mapAlgorithmic(c, norm16);

	1316 norm16=getNorm16(c);

	1317 } else {

	1318 // c decomposes, get everything from the variable-length extra data

	1319 const uint16_t *mapping=getMapping(norm16);

	1320 uint16_t firstUnit=*mapping++;

	1321 if((firstUnit&MAPPING_LENGTH_MASK)==0) {

	1322 return FALSE;

	1323 }

	1324 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {

	1325 return FALSE; // non-zero leadCC

	1326 }

	1327 int32_t i=0;

	1328 UChar32 c;

	1329 U16_NEXT_UNSAFE(mapping, i, c);

	1330 return isCompYesAndZeroCC(getNorm16(c));

	1331 }

	1332 }

	1333 }

	1334

	1335 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo ol testInert) const {

	1336 for(;;) {

	1337 uint16_t norm16=getNorm16(c);

	1338 if(isInert(norm16)) {

	1339 return TRUE;

	1340 } else if(norm16<=minYesNo) {

	1341 // Hangul LVT (==minYesNo) has a boundary after it.

	1342 // Hangul LV and non-inert yesYes characters combine forward.

	1343 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);

	1344 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {

	1345 return FALSE;

	1346 } else if(isDecompNoAlgorithmic(norm16)) {

	1347 c=mapAlgorithmic(c, norm16);

	1348 } else {

	1349 // c decomposes, get everything from the variable-length extra data.

	1350 // If testInert, then c must be a yesNo character which has lccc=0,

	1351 // otherwise it could be a noNo.

	1352 const uint16_t *mapping=getMapping(norm16);

	1353 uint16_t firstUnit=*mapping;

	1354 // TRUE if

	1355 // c is not deleted, and

	1356 // it and its decomposition do not combine forward, and it has a starter, and

	1357 // if FCC then trailCC<=1

	1358 return

	1359 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&

	1360 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST\|MAPPING_NO_COMP_BOUNDA RY_AFTER))==0 &&

	1361 (!onlyContiguous \|\| firstUnit<=0x1ff);

	1362 }

	1363 }

	1364 }

	1365

	1366 const UChar Normalizer2Impl::findPreviousCompBoundary(const UChar start, const UChar *p) const {

	1367 BackwardUTrie2StringIterator iter(normTrie, start, p);

	1368 uint16_t norm16;

	1369 do {

	1370 norm16=iter.previous16();

	1371 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));

	1372 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,

	1373 // but that's probably not worth the extra cost.

	1374 return iter.codePointStart;

	1375 }

	1376

	1377 const UChar Normalizer2Impl::findNextCompBoundary(const UChar p, const UChar * limit) const {

	1378 ForwardUTrie2StringIterator iter(normTrie, p, limit);

	1379 uint16_t norm16;

	1380 do {

	1381 norm16=iter.next16();

	1382 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));

	1383 return iter.codePointStart;

	1384 }

	1385

	1386 class FCDTrieSingleton : public UTrie2Singleton {

	1387 public:

	1388 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :

	1389 UTrie2Singleton(s), impl(ni), errorCode(ec) {}

	1390 UTrie2 *getInstance(UErrorCode &errorCode) {

	1391 return UTrie2Singleton::getInstance(createInstance, this, errorCode);

	1392 }

	1393 static void createInstance(const void context, UErrorCode &errorCode);

	1394 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {

	1395 if(value!=0) {

	1396 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, err orCode);

	1397 }

	1398 return U_SUCCESS(errorCode);

	1399 }

	1400

	1401 Normalizer2Impl &impl;

	1402 UTrie2 *newFCDTrie;

	1403 UErrorCode &errorCode;

	1404 };

	1405

	1406 U_CDECL_BEGIN

	1407

	1408 // Set the FCD value for a range of same-norm16 characters.

	1409 static UBool U_CALLCONV

	1410 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value ) {

	1411 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);

	1412 }

	1413

	1414 // Collect (OR together) the FCD values for a range of supplementary characters,

	1415 // for their lead surrogate code unit.

	1416 static UBool U_CALLCONV

	1417 enumRangeOrValue(const void context, UChar32 /start/, UChar32 /end*/, uint32 _t value) {

	1418 ((uint32_t )context)\|=value;

	1419 return TRUE;

	1420 }

	1421

	1422 U_CDECL_END

	1423

	1424 void FCDTrieSingleton::createInstance(const void context, UErrorCode &errorCod e) {

	1425 FCDTrieSingleton me=(FCDTrieSingleton )context;

	1426 me->newFCDTrie=utrie2_open(0, 0, &errorCode);

	1427 if(U_SUCCESS(errorCode)) {

	1428 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);

	1429 for(UChar lead=0xd800; lead<0xdc00; ++lead) {

	1430 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);

	1431 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrV alue, &oredValue);

	1432 if(oredValue!=0) {

	1433 // Set a "bad" value for makeFCD() to break the quick check loop

	1434 // and look up the value for the supplementary code point.

	1435 // If there is any lccc, then set the worst-case lccc of 1.

	1436 // The ORed-together value's tccc is already the worst case.

	1437 if(oredValue>0xff) {

	1438 oredValue=0x100\|(oredValue&0xff);

	1439 }

	1440 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredV alue, &errorCode);

	1441 }

	1442 }

	1443 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);

	1444 if(U_SUCCESS(errorCode)) {

	1445 return me->newFCDTrie;

	1446 }

	1447 }

	1448 utrie2_close(me->newFCDTrie);

	1449 return NULL;

	1450 }

	1451

	1452 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t no rm16,

	1453 UTrie2 *newFCDTrie, UErrorCode &errorCo de) const {

	1454 // Only loops for 1:1 algorithmic mappings.

	1455 for(;;) {

	1456 if(norm16>=MIN_NORMAL_MAYBE_YES) {

	1457 norm16&=0xff;

	1458 norm16\|=norm16<<8;

	1459 } else if(norm16<=minYesNo \|\| minMaybeYes<=norm16) {

	1460 // no decomposition or Hangul syllable, all zeros

	1461 break;

	1462 } else if(limitNoNo<=norm16) {

	1463 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);

	1464 if(start==end) {

	1465 start+=delta;

	1466 norm16=getNorm16(start);

	1467 } else {

	1468 // the same delta leads from different original characters to di fferent mappings

	1469 do {

	1470 UChar32 c=start+delta;

	1471 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode );

	1472 } while(++start<=end);

	1473 break;

	1474 }

	1475 } else {

	1476 // c decomposes, get everything from the variable-length extra data

	1477 const uint16_t *mapping=getMapping(norm16);

	1478 uint16_t firstUnit=*mapping;

	1479 if((firstUnit&MAPPING_LENGTH_MASK)==0) {

	1480 // A character that is deleted (maps to an empty string) must

	1481 // get the worst-case lccc and tccc values because arbitrary

	1482 // characters on both sides will become adjacent.

	1483 norm16=0x1ff;

	1484 } else {

	1485 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

	1486 norm16=mapping[1]&0xff00; // lccc

	1487 } else {

	1488 norm16=0;

	1489 }

	1490 norm16\|=firstUnit>>8; // tccc

	1491 }

	1492 }

	1493 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);

	1494 break;

	1495 }

	1496 }

	1497

	1498 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {

	1499 // Logically const: Synchronized instantiation.

	1500 Normalizer2Impl me=const_cast<Normalizer2Impl >(this);

	1501 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(er rorCode);

	1502 }

	1503

	1504 // Dual functionality:

	1505 // buffer!=NULL: normalize

	1506 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes

	1507 const UChar *

	1508 Normalizer2Impl::makeFCD(const UChar src, const UChar limit,

	1509 ReorderingBuffer *buffer,

	1510 UErrorCode &errorCode) const {

	1511 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordere d tccc<=1.

	1512 // Similar to the prevBoundary in the compose() implementation.

	1513 const UChar *prevBoundary=src;

	1514 int32_t prevFCD16=0;

	1515 if(limit==NULL) {

	1516 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCo de);

	1517 if(U_FAILURE(errorCode)) {

	1518 return src;

	1519 }

	1520 if(prevBoundary<src) {

	1521 prevBoundary=src;

	1522 // We know that the previous character's lccc==0.

	1523 // Fetching the fcd16 value was deferred for this below-U+0300 code point.

	1524 prevFCD16=getFCD16FromSingleLead(*(src-1));

	1525 if(prevFCD16>1) {

	1526 --prevBoundary;

	1527 }

	1528 }

	1529 limit=u_strchr(src, 0);

	1530 }

	1531

	1532 // Note: In this function we use buffer->appendZeroCC() because we track

	1533 // the lead and trail combining classes here, rather than leaving it to

	1534 // the ReorderingBuffer.

	1535 // The exception is the call to decomposeShort() which uses the buffer

	1536 // in the normal way.

	1537

	1538 const UTrie2 *trie=fcdTrie();

	1539

	1540 const UChar *prevSrc;

	1541 UChar32 c=0;

	1542 uint16_t fcd16=0;

	1543

	1544 for(;;) {

	1545 // count code units with lccc==0

	1546 for(prevSrc=src; src!=limit;) {

	1547 if((c=*src)<MIN_CCC_LCCC_CP) {

	1548 prevFCD16=~c;

	1549 ++src;

	1550 } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {

	1551 prevFCD16=fcd16;

	1552 ++src;

	1553 } else if(!U16_IS_SURROGATE(c)) {

	1554 break;

	1555 } else {

	1556 UChar c2;

	1557 if(U16_IS_SURROGATE_LEAD(c)) {

	1558 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

	1559 c=U16_GET_SUPPLEMENTARY(c, c2);

	1560 }

	1561 } else /* trail surrogate */ {

	1562 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

	1563 --src;

	1564 c=U16_GET_SUPPLEMENTARY(c2, c);

	1565 }

	1566 }

	1567 if((fcd16=getFCD16(c))<=0xff) {

	1568 prevFCD16=fcd16;

	1569 src+=U16_LENGTH(c);

	1570 } else {

	1571 break;

	1572 }

	1573 }

	1574 }

	1575 // copy these code units all at once

	1576 if(src!=prevSrc) {

	1577 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {

	1578 break;

	1579 }

	1580 if(src==limit) {

	1581 break;

	1582 }

	1583 prevBoundary=src;

	1584 // We know that the previous character's lccc==0.

	1585 if(prevFCD16<0) {

	1586 // Fetching the fcd16 value was deferred for this below-U+0300 c ode point.

	1587 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);

	1588 if(prevFCD16>1) {

	1589 --prevBoundary;

	1590 }

	1591 } else {

	1592 const UChar *p=src-1;

	1593 if(U16_IS_TRAIL(p) && prevSrc<p && U16_IS_LEAD((p-1))) {

	1594 --p;

	1595 // Need to fetch the previous character's FCD value because

	1596 // prevFCD16 was just for the trail surrogate code point.

	1597 prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);

	1598 // Still known to have lccc==0 because its lead surrogate un it had lccc==0.

	1599 }

	1600 if(prevFCD16>1) {

	1601 prevBoundary=p;

	1602 }

	1603 }

	1604 // The start of the current character (c).

	1605 prevSrc=src;

	1606 } else if(src==limit) {

	1607 break;

	1608 }

	1609

	1610 src+=U16_LENGTH(c);

	1611 // The current character (c) at [prevSrc..src[ has a non-zero lead combi ning class.

	1612 // Check for proper order, and decompose locally if necessary.

	1613 if((prevFCD16&0xff)<=(fcd16>>8)) {

	1614 // proper order: prev tccc <= current lccc

	1615 if((fcd16&0xff)<=1) {

	1616 prevBoundary=src;

	1617 }

	1618 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {

	1619 break;

	1620 }

	1621 prevFCD16=fcd16;

	1622 continue;

	1623 } else if(buffer==NULL) {

	1624 return prevBoundary; // quick check "no"

	1625 } else {

	1626 /*

	1627 * Back out the part of the source that we copied or appended

	1628 * already but is now going to be decomposed.

	1629 * prevSrc is set to after what was copied/appended.

	1630 */

	1631 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));

	1632 /*

	1633 * Find the part of the source that needs to be decomposed,

	1634 * up to the next safe boundary.

	1635 */

	1636 src=findNextFCDBoundary(src, limit);

	1637 /*

	1638 * The source text does not fulfill the conditions for FCD.

	1639 * Decompose and reorder a limited piece of the text.

	1640 */

	1641 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {

	1642 break;

	1643 }

	1644 prevBoundary=src;

	1645 prevFCD16=0;

	1646 }

	1647 }

	1648 return src;

	1649 }

	1650

	1651 void Normalizer2Impl::makeFCDAndAppend(const UChar src, const UChar limit,

	1652 UBool doMakeFCD,

	1653 ReorderingBuffer &buffer,

	1654 UErrorCode &errorCode) const {

	1655 if(!buffer.isEmpty()) {

	1656 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);

	1657 if(src!=firstBoundaryInSrc) {

	1658 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getSt art(),

	1659 buffer.getLi mit());

	1660 UnicodeString middle(lastBoundaryInDest,

	1661 (int32_t)(buffer.getLimit()-lastBoundaryInDest) );

	1662 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest)) ;

	1663 middle.append(src, (int32_t)(firstBoundaryInSrc-src));

	1664 const UChar *middleStart=middle.getBuffer();

	1665 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode );

	1666 if(U_FAILURE(errorCode)) {

	1667 return;

	1668 }

	1669 src=firstBoundaryInSrc;

	1670 }

	1671 }

	1672 if(doMakeFCD) {

	1673 makeFCD(src, limit, &buffer, errorCode);

	1674 } else {

	1675 buffer.appendZeroCC(src, limit, errorCode);

	1676 }

	1677 }

	1678

	1679 const UChar Normalizer2Impl::findPreviousFCDBoundary(const UChar start, const UChar *p) const {

	1680 BackwardUTrie2StringIterator iter(fcdTrie(), start, p);

	1681 uint16_t fcd16;

	1682 do {

	1683 fcd16=iter.previous16();

	1684 } while(fcd16>0xff);

	1685 return iter.codePointStart;

	1686 }

	1687

	1688 const UChar Normalizer2Impl::findNextFCDBoundary(const UChar p, const UChar *l imit) const {

	1689 ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);

	1690 uint16_t fcd16;

	1691 do {

	1692 fcd16=iter.next16();

	1693 } while(fcd16>0xff);

	1694 return iter.codePointStart;

	1695 }

	1696

	1697 // CanonicalIterator data -------------------------------------------------- ***

	1698

	1699 CanonIterData::CanonIterData(UErrorCode &errorCode) :

	1700 trie(utrie2_open(0, 0, &errorCode)),

	1701 canonStartSets(uhash_deleteUObject, NULL, errorCode) {}

	1702

	1703 CanonIterData::~CanonIterData() {

	1704 utrie2_close(trie);

	1705 }

	1706

	1707 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {

	1708 uint32_t canonValue=utrie2_get32(trie, decompLead);

	1709 if((canonValue&(CANON_HAS_SET\|CANON_VALUE_MASK))==0 && origin!=0) {

	1710 // origin is the first character whose decomposition starts with

	1711 // the character for which we are setting the value.

	1712 utrie2_set32(trie, decompLead, canonValue\|origin, &errorCode);

	1713 } else {

	1714 // origin is not the first character, or it is U+0000.

	1715 UnicodeSet *set;

	1716 if((canonValue&CANON_HAS_SET)==0) {

	1717 set=new UnicodeSet;

	1718 if(set==NULL) {

	1719 errorCode=U_MEMORY_ALLOCATION_ERROR;

	1720 return;

	1721 }

	1722 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);

	1723 canonValue=(canonValue&~CANON_VALUE_MASK)\|CANON_HAS_SET\|(uint32_t)ca nonStartSets.size();

	1724 utrie2_set32(trie, decompLead, canonValue, &errorCode);

	1725 canonStartSets.addElement(set, errorCode);

	1726 if(firstOrigin!=0) {

	1727 set->add(firstOrigin);

	1728 }

	1729 } else {

	1730 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MA SK)];

	1731 }

	1732 set->add(origin);

	1733 }

	1734 }

	1735

	1736 class CanonIterDataSingleton {

	1737 public:

	1738 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode & ec) :

	1739 singleton(s), impl(ni), errorCode(ec) {}

	1740 CanonIterData *getInstance(UErrorCode &errorCode) {

	1741 void *duplicate;

	1742 CanonIterData *instance=

	1743 (CanonIterData *)singleton.getInstance(createInstance, this, duplica te, errorCode);

	1744 delete (CanonIterData *)duplicate;

	1745 return instance;

	1746 }

	1747 static void createInstance(const void context, UErrorCode &errorCode);

	1748 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {

	1749 if(value!=0) {

	1750 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newDa ta, errorCode);

	1751 }

	1752 return U_SUCCESS(errorCode);

	1753 }

	1754

	1755 private:

	1756 SimpleSingleton &singleton;

	1757 Normalizer2Impl &impl;

	1758 CanonIterData *newData;

	1759 UErrorCode &errorCode;

	1760 };

	1761

	1762 U_CDECL_BEGIN

	1763

	1764 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm1 6 characters.

	1765 static UBool U_CALLCONV

	1766 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t va lue) {

	1767 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);

	1768 }

	1769

	1770 U_CDECL_END

	1771

	1772 void CanonIterDataSingleton::createInstance(const void context, UErrorCode &er rorCode) {

	1773 CanonIterDataSingleton me=(CanonIterDataSingleton )context;

	1774 me->newData=new CanonIterData(errorCode);

	1775 if(me->newData==NULL) {

	1776 errorCode=U_MEMORY_ALLOCATION_ERROR;

	1777 return NULL;

	1778 }

	1779 if(U_SUCCESS(errorCode)) {

	1780 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);

	1781 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);

	1782 if(U_SUCCESS(errorCode)) {

	1783 return me->newData;

	1784 }

	1785 }

	1786 delete me->newData;

	1787 return NULL;

	1788 }

	1789

	1790 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, ui nt16_t norm16,

	1791 CanonIterData &newData,

	1792 UErrorCode &errorCode) const {

	1793 if(norm16==0 \|\| (minYesNo<=norm16 && norm16<minNoNo)) {

	1794 // Inert, or 2-way mapping (including Hangul syllable).

	1795 // We do not write a canonStartSet for any yesNo character.

	1796 // Composites from 2-way mappings are added at runtime from the

	1797 // starter's compositions list, and the other characters in

	1798 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are

	1799 // "maybe" characters.

	1800 return;

	1801 }

	1802 for(UChar32 c=start; c<=end; ++c) {

	1803 uint32_t oldValue=utrie2_get32(newData.trie, c);

	1804 uint32_t newValue=oldValue;

	1805 if(norm16>=minMaybeYes) {

	1806 // not a segment starter if it occurs in a decomposition or has cc!= 0

	1807 newValue\|=CANON_NOT_SEGMENT_STARTER;

	1808 if(norm16<MIN_NORMAL_MAYBE_YES) {

	1809 newValue\|=CANON_HAS_COMPOSITIONS;

	1810 }

	1811 } else if(norm16<minYesNo) {

	1812 newValue\|=CANON_HAS_COMPOSITIONS;

	1813 } else {

	1814 // c has a one-way decomposition

	1815 UChar32 c2=c;

	1816 uint16_t norm16_2=norm16;

	1817 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {

	1818 c2=mapAlgorithmic(c2, norm16_2);

	1819 norm16_2=getNorm16(c2);

	1820 }

	1821 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {

	1822 // c decomposes, get everything from the variable-length extra d ata

	1823 const uint16_t *mapping=getMapping(norm16_2);

	1824 uint16_t firstUnit=*mapping++;

	1825 int32_t length=firstUnit&MAPPING_LENGTH_MASK;

	1826 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {

	1827 if(c==c2 && (*mapping&0xff)!=0) {

	1828 newValue\|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0

	1829 }

	1830 ++mapping;

	1831 }

	1832 // Skip empty mappings (no characters in the decomposition).

	1833 if(length!=0) {

	1834 // add c to first code point's start set

	1835 int32_t i=0;

	1836 U16_NEXT_UNSAFE(mapping, i, c2);

	1837 newData.addToStartSet(c, c2, errorCode);

	1838 // Set CANON_NOT_SEGMENT_STARTER for each remaining code poi nt of a

	1839 // one-way mapping. A 2-way mapping is possible here after

	1840 // intermediate algorithmic mapping.

	1841 if(norm16_2>=minNoNo) {

	1842 while(i<length) {

	1843 U16_NEXT_UNSAFE(mapping, i, c2);

	1844 uint32_t c2Value=utrie2_get32(newData.trie, c2);

	1845 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {

	1846 utrie2_set32(newData.trie, c2, c2Value\|CANON_NOT _SEGMENT_STARTER,

	1847 &errorCode);

	1848 }

	1849 }

	1850 }

	1851 }

	1852 } else {

	1853 // c decomposed to c2 algorithmically; c has cc==0

	1854 newData.addToStartSet(c, c2, errorCode);

	1855 }

	1856 }

	1857 if(newValue!=oldValue) {

	1858 utrie2_set32(newData.trie, c, newValue, &errorCode);

	1859 }

	1860 }

	1861 }

	1862

	1863 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {

	1864 // Logically const: Synchronized instantiation.

	1865 Normalizer2Impl me=const_cast<Normalizer2Impl >(this);

	1866 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstan ce(errorCode);

	1867 return U_SUCCESS(errorCode);

	1868 }

	1869

	1870 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {

	1871 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInsta nce)->trie, c);

	1872 }

	1873

	1874 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {

	1875 return (const UnicodeSet )(

	1876 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);

	1877 }

	1878

	1879 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {

	1880 return getCanonValue(c)>=0;

	1881 }

	1882

	1883 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {

	1884 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;

	1885 if(canonValue==0) {

	1886 return FALSE;

	1887 }

	1888 set.clear();

	1889 int32_t value=canonValue&CANON_VALUE_MASK;

	1890 if((canonValue&CANON_HAS_SET)!=0) {

	1891 set.addAll(getCanonStartSet(value));

	1892 } else if(value!=0) {

	1893 set.add(value);

	1894 }

	1895 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {

	1896 uint16_t norm16=getNorm16(c);

	1897 if(norm16==JAMO_L) {

	1898 UChar32 syllable=

	1899 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JA MO_VT_COUNT);

	1900 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);

	1901 } else {

	1902 addComposites(getCompositionsList(norm16), set);

	1903 }

	1904 }

	1905 return TRUE;

	1906 }

	1907

	1908 U_NAMESPACE_END

	1909

	1910 // Normalizer2 data swapping ----------------------------------------------- ***

	1911

	1912 U_NAMESPACE_USE

	1913

	1914 U_CAPI int32_t U_EXPORT2

	1915 unorm2_swap(const UDataSwapper *ds,

	1916 const void inData, int32_t length, void outData,

	1917 UErrorCode *pErrorCode) {

	1918 const UDataInfo *pInfo;

	1919 int32_t headerSize;

	1920

	1921 const uint8_t *inBytes;

	1922 uint8_t *outBytes;

	1923

	1924 const int32_t *inIndexes;

	1925 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];

	1926

	1927 int32_t i, offset, nextOffset, size;

	1928

	1929 /* udata_swapDataHeader checks the arguments */

	1930 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);

	1931 if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {

	1932 return 0;

	1933 }

	1934

	1935 /* check data format and format version */

	1936 pInfo=(const UDataInfo )((const char )inData+4);

	1937 if(!(

	1938 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */

	1939 pInfo->dataFormat[1]==0x72 &&

	1940 pInfo->dataFormat[2]==0x6d &&

	1941 pInfo->dataFormat[3]==0x32 &&

	1942 pInfo->formatVersion[0]==1

	1943 )) {

	1944 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (fo rmat version %02x) is not recognized as Normalizer2 data\n",

	1945 pInfo->dataFormat[0], pInfo->dataFormat[1],

	1946 pInfo->dataFormat[2], pInfo->dataFormat[3],

	1947 pInfo->formatVersion[0]);

	1948 *pErrorCode=U_UNSUPPORTED_ERROR;

	1949 return 0;

	1950 }

	1951

	1952 inBytes=(const uint8_t *)inData+headerSize;

	1953 outBytes=(uint8_t *)outData+headerSize;

	1954

	1955 inIndexes=(const int32_t *)inBytes;

	1956

	1957 if(length>=0) {

	1958 length-=headerSize;

	1959 if(length<(int32_t)sizeof(indexes)) {

	1960 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",

	1961 length);

	1962 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

	1963 return 0;

	1964 }

	1965 }

	1966

	1967 /* read the first few indexes */

	1968 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {

	1969 indexes[i]=udata_readInt32(ds, inIndexes[i]);

	1970 }

	1971

	1972 /* get the total length of the data */

	1973 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];

	1974

	1975 if(length>=0) {

	1976 if(length<size) {

	1977 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",

	1978 length);

	1979 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

	1980 return 0;

	1981 }

	1982

	1983 /* copy the data for inaccessible bytes */

	1984 if(inBytes!=outBytes) {

	1985 uprv_memcpy(outBytes, inBytes, size);

	1986 }

	1987

	1988 offset=0;

	1989

	1990 /* swap the int32_t indexes[] */

	1991 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];

	1992 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);

	1993 offset=nextOffset;

	1994

	1995 /* swap the UTrie2 */

	1996 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];

	1997 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErr orCode);

	1998 offset=nextOffset;

	1999

	2000 /* swap the uint16_t extraData[] */

	2001 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];

	2002 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);

	2003 offset=nextOffset;

	2004

	2005 U_ASSERT(offset==size);

	2006 }

	2007

	2008 return headerSize+size;

	2009 }

	2010

	2011 #endif // !UCONFIG_NO_NORMALIZATION

OLD	NEW

« no previous file with comments | « icu46/source/common/normalizer2impl.h ('k') | icu46/source/common/normlzr.cpp » ('j') | no next file with comments »