icu46/source/common/ucase.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucase.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 2004-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: ucase.c

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2004aug30

	14 * created by: Markus W. Scherer

	15 *

	16 * Low-level Unicode character/string case mapping code.

	17 * Much code moved here (and modified) from uchar.c.

	18 */

	19

	20 #include "unicode/utypes.h"

	21 #include "unicode/uset.h"

	22 #include "unicode/udata.h" /* UDataInfo */

	23 #include "ucmndata.h" /* DataHeader */

	24 #include "udatamem.h"

	25 #include "umutex.h"

	26 #include "uassert.h"

	27 #include "cmemory.h"

	28 #include "utrie2.h"

	29 #include "ucase.h"

	30 #include "ucln_cmn.h"

	31

	32 struct UCaseProps {

	33 UDataMemory *mem;

	34 const int32_t *indexes;

	35 const uint16_t *exceptions;

	36 const UChar *unfold;

	37

	38 UTrie2 trie;

	39 uint8_t formatVersion[4];

	40 };

	41

	42 /* ucase_props_data.c is machine-generated by gencase --csource */

	43 #include "ucase_props_data.c"

	44

	45 /* UCaseProps singleton ----------------------------------------------------- */

	46

	47 U_CAPI const UCaseProps * U_EXPORT2

	48 ucase_getSingleton() {

	49 return &ucase_props_singleton;

	50 }

	51

	52 /* set of property starts for UnicodeSet ------------------------------------ */

	53

	54 static UBool U_CALLCONV

	55 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {

	56 /* add the start code point to the USet */

	57 const USetAdder *sa=(const USetAdder *)context;

	58 sa->add(sa->set, start);

	59 return TRUE;

	60 }

	61

	62 U_CFUNC void U_EXPORT2

	63 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {

	64 if(U_FAILURE(*pErrorCode)) {

	65 return;

	66 }

	67

	68 /* add the start code point of each same-value range of the trie */

	69 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);

	70

	71 /* add code points with hardcoded properties, plus the ones following them */

	72

	73 /* (none right now, see comment below) */

	74

	75 /*

	76 * Omit code points with hardcoded specialcasing properties

	77 * because we do not build property UnicodeSets for them right now.

	78 */

	79 }

	80

	81 /* data access primitives --------------------------------------------------- */

	82

	83 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT) )

	84

	85 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)

	86

	87 /* number of bits in an 8-bit integer value */

	88 static const uint8_t flagsOffset[256]={

	89 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,

	90 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

	91 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

	92 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	93 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

	94 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	95 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	96 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

	97 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

	98 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	99 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	100 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

	101 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

	102 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

	103 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

	104 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8

	105 };

	106

	107 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

	108 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

	109

	110 /*

	111 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).

	112 *

	113 * @param excWord (in) initial exceptions word

	114 * @param idx (in) desired slot index

	115 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;

	116 * moved to the last uint16_t of the value, use +1 for beginning of next slot

	117 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified

	118 */

	119 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \

	120 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \

	121 (pExc16)+=SLOT_OFFSET(excWord, idx); \

	122 (value)=*pExc16; \

	123 } else { \

	124 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \

	125 (value)=*pExc16++; \

	126 (value)=((value)<<16)|*pExc16; \

	127 }

	128

	129 /* simple case mappings ----------------------------------------------------- */

	130

	131 U_CAPI UChar32 U_EXPORT2

	132 ucase_tolower(const UCaseProps *csp, UChar32 c) {

	133 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	134 if(!PROPS_HAS_EXCEPTION(props)) {

	135 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

	136 c+=UCASE_GET_DELTA(props);

	137 }

	138 } else {

	139 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	140 uint16_t excWord=*pe++;

	141 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

	142 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);

	143 }

	144 }

	145 return c;

	146 }

	147

	148 U_CAPI UChar32 U_EXPORT2

	149 ucase_toupper(const UCaseProps *csp, UChar32 c) {

	150 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	151 if(!PROPS_HAS_EXCEPTION(props)) {

	152 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

	153 c+=UCASE_GET_DELTA(props);

	154 }

	155 } else {

	156 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	157 uint16_t excWord=*pe++;

	158 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

	159 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);

	160 }

	161 }

	162 return c;

	163 }

	164

	165 U_CAPI UChar32 U_EXPORT2

	166 ucase_totitle(const UCaseProps *csp, UChar32 c) {

	167 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	168 if(!PROPS_HAS_EXCEPTION(props)) {

	169 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

	170 c+=UCASE_GET_DELTA(props);

	171 }

	172 } else {

	173 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	174 uint16_t excWord=*pe++;

	175 int32_t idx;

	176 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {

	177 idx=UCASE_EXC_TITLE;

	178 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

	179 idx=UCASE_EXC_UPPER;

	180 } else {

	181 return c;

	182 }

	183 GET_SLOT_VALUE(excWord, idx, pe, c);

	184 }

	185 return c;

	186 }

	187

	188 static const UChar iDot[2] = { 0x69, 0x307 };

	189 static const UChar jDot[2] = { 0x6a, 0x307 };

	190 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };

	191 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };

	192 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };

	193 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };

	194

	195

	196 U_CFUNC void U_EXPORT2

	197 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {

	198 uint16_t props;

	199

	200 /*

	201 * Hardcode the case closure of i and its relatives and ignore the

	202 * data file data for these characters.

	203 * The Turkic dotless i and dotted I with their case mapping conditions

	204 * and case folding option make the related characters behave specially.

	205 * This code matches their closure behavior to their case folding behavior.

	206 */

	207

	208 switch(c) {

	209 case 0x49:

	210 /* regular i and I are in one equivalence class */

	211 sa->add(sa->set, 0x69);

	212 return;

	213 case 0x69:

	214 sa->add(sa->set, 0x49);

	215 return;

	216 case 0x130:

	217 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */

	218 sa->addString(sa->set, iDot, 2);

	219 return;

	220 case 0x131:

	221 /* dotless i is in a class by itself */

	222 return;

	223 default:

	224 /* otherwise use the data file data */

	225 break;

	226 }

	227

	228 props=UTRIE2_GET16(&csp->trie, c);

	229 if(!PROPS_HAS_EXCEPTION(props)) {

	230 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {

	231 /* add the one simple case mapping, no matter what type it is */

	232 int32_t delta=UCASE_GET_DELTA(props);

	233 if(delta!=0) {

	234 sa->add(sa->set, c+delta);

	235 }

	236 }

	237 } else {

	238 /*

	239 * c has exceptions, so there may be multiple simple and/or

	240 * full case mappings. Add them all.

	241 */

	242 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);

	243 const UChar *closure;

	244 uint16_t excWord=*pe++;

	245 int32_t idx, closureLength, fullLength, length;

	246

	247 pe0=pe;

	248

	249 /* add all simple case mappings */

	250 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {

	251 if(HAS_SLOT(excWord, idx)) {

	252 pe=pe0;

	253 GET_SLOT_VALUE(excWord, idx, pe, c);

	254 sa->add(sa->set, c);

	255 }

	256 }

	257

	258 /* get the closure string pointer & length */

	259 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {

	260 pe=pe0;

	261 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);

	262 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */

	263 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */

	264 } else {

	265 closureLength=0;

	266 closure=NULL;

	267 }

	268

	269 /* add the full case folding */

	270 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

	271 pe=pe0;

	272 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

	273

	274 /* start of full case mapping strings */

	275 ++pe;

	276

	277 fullLength&=0xffff; /* bits 16 and higher are reserved */

	278

	279 /* skip the lowercase result string */

	280 pe+=fullLength&UCASE_FULL_LOWER;

	281 fullLength>>=4;

	282

	283 /* add the full case folding string */

	284 length=fullLength&0xf;

	285 if(length!=0) {

	286 sa->addString(sa->set, (const UChar *)pe, length);

	287 pe+=length;

	288 }

	289

	290 /* skip the uppercase and titlecase strings */

	291 fullLength>>=4;

	292 pe+=fullLength&0xf;

	293 fullLength>>=4;

	294 pe+=fullLength;

	295

	296 closure=(const UChar *)pe; /* behind full case mappings */

	297 }

	298

	299 /* add each code point in the closure string */

	300 for(idx=0; idx<closureLength;) {

	301 U16_NEXT_UNSAFE(closure, idx, c);

	302 sa->add(sa->set, c);

	303 }

	304 }

	305 }

	306

	307 /*

	308 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated

	309 * must be length>0 and max>0 and length<=max

	310 */

	311 static U_INLINE int32_t

	312 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {

	313 int32_t c1, c2;

	314

	315 max-=length; /* we require length<=max, so no need to decrement max in the loop */

	316 do {

	317 c1=*s++;

	318 c2=*t++;

	319 if(c2==0) {

	320 return 1; /* reached the end of t but not of s */

	321 }

	322 c1-=c2;

	323 if(c1!=0) {

	324 return c1; /* return difference result */

	325 }

	326 } while(--length>0);

	327 /* ends with length==0 */

	328

	329 if(max==0 || *t==0) {

	330 return 0; /* equal to length of both strings */

	331 } else {

	332 return -max; /* return length difference */

	333 }

	334 }

	335

	336 U_CFUNC UBool U_EXPORT2

	337 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {

	338 const UChar *unfold, *p;

	339 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;

	340

	341 if(csp->unfold==NULL || s==NULL) {

	342 return FALSE; /* no reverse case folding data, or no string */

	343 }

	344 if(length<=1) {

	345 /* the string is too short to find any match */

	346 /*

	347 * more precise would be:

	348 * if(!u_strHasMoreChar32Than(s, length, 1))

	349 * but this does not make much practical difference because

	350 * a single supplementary code point would just not be found

	351 */

	352 return FALSE;

	353 }

	354

	355 unfold=csp->unfold;

	356 unfoldRows=unfold[UCASE_UNFOLD_ROWS];

	357 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];

	358 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];

	359 unfold+=unfoldRowWidth;

	360

	361 if(length>unfoldStringWidth) {

	362 /* the string is too long to find any match */

	363 return FALSE;

	364 }

	365

	366 /* do a binary search for the string */

	367 start=0;

	368 limit=unfoldRows;

	369 while(start<limit) {

	370 i=(start+limit)/2;

	371 p=unfold+(i*unfoldRowWidth);

	372 result=strcmpMax(s, length, p, unfoldStringWidth);

	373

	374 if(result==0) {

	375 /* found the string: add each code point, and its case closure */

	376 UChar32 c;

	377

	378 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {

	379 U16_NEXT_UNSAFE(p, i, c);

	380 sa->add(sa->set, c);

	381 ucase_addCaseClosure(csp, c, sa);

	382 }

	383 return TRUE;

	384 } else if(result<0) {

	385 limit=i;

	386 } else /* result>0 */ {

	387 start=i+1;

	388 }

	389 }

	390

	391 return FALSE; /* string not found */

	392 }

	393

	394 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */

	395 U_CAPI int32_t U_EXPORT2

	396 ucase_getType(const UCaseProps *csp, UChar32 c) {

	397 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	398 return UCASE_GET_TYPE(props);

	399 }

	400

	401 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */

	402 U_CAPI int32_t U_EXPORT2

	403 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {

	404 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	405 int32_t type=UCASE_GET_TYPE(props);

	406 if(props&UCASE_EXCEPTION) {

	407 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	408 if(*pe&UCASE_EXC_CASE_IGNORABLE) {

	409 type|=4;

	410 }

	411 } else if(type==UCASE_NONE && (props&UCASE_CASE_IGNORABLE)) {

	412 type|=4;

	413 }

	414 return type;

	415 }

	416

	417 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */

	418 static U_INLINE int32_t

	419 getDotType(const UCaseProps *csp, UChar32 c) {

	420 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	421 if(!PROPS_HAS_EXCEPTION(props)) {

	422 return props&UCASE_DOT_MASK;

	423 } else {

	424 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	425 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;

	426 }

	427 }

	428

	429 U_CAPI UBool U_EXPORT2

	430 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {

	431 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);

	432 }

	433

	434 U_CAPI UBool U_EXPORT2

	435 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {

	436 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	437 return (UBool)((props&UCASE_SENSITIVE)!=0);

	438 }

	439

	440 /* string casing ------------------------------------------------------------ */

	441

	442 /*

	443 * These internal functions form the core of string case mappings.

	444 * They map single code points to result code points or strings and take

	445 * all necessary conditions (context, locale ID, options) into account.

	446 *

	447 * They do not iterate over the source or write to the destination

	448 * so that the same functions are useful for non-standard string storage,

	449 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.

	450 * For the same reason, the "surrounding text" context is passed in as a

	451 * UCaseContextIterator which does not make any assumptions about

	452 * the underlying storage.

	453 *

	454 * This section contains helper functions that check for conditions

	455 * in the input text surrounding the current code point

	456 * according to SpecialCasing.txt.

	457 *

	458 * Each helper function gets the index

	459 * - after the current code point if it looks at following text

	460 * - before the current code point if it looks at preceding text

	461 *

	462 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:

	463 *

	464 * Final_Sigma

	465 * C is preceded by a sequence consisting of

	466 * a cased letter and a case-ignorable sequence,

	467 * and C is not followed by a sequence consisting of

	468 * an ignorable sequence and then a cased letter.

	469 *

	470 * More_Above

	471 * C is followed by one or more characters of combining class 230 (ABOVE)

	472 * in the combining character sequence.

	473 *

	474 * After_Soft_Dotted

	475 * The last preceding character with combining class of zero before C

	476 * was Soft_Dotted,

	477 * and there is no intervening combining character class 230 (ABOVE).

	478 *

	479 * Before_Dot

	480 * C is followed by combining dot above (U+0307).

	481 * Any sequence of characters with a combining class that is neither 0 nor 230

	482 * may intervene between the current character and the combining dot above.

	483 *

	484 * The erratum from 2002-10-31 adds the condition

	485 *

	486 * After_I

	487 * The last preceding base character was an uppercase I, and there is no

	488 * intervening combining character class 230 (ABOVE).

	489 *

	490 * (See Jitterbug 2344 and the comments on After_I below.)

	491 *

	492 * Helper definitions in Unicode 3.2 UAX 21:

	493 *

	494 * D1. A character C is defined to be cased

	495 * if it meets any of the following criteria:

	496 *

	497 * - The general category of C is Titlecase Letter (Lt)

	498 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase

	499 * - Given D = NFD(C), then it is not the case that:

	500 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)

	501 * (This third criterion does not add any characters to the list

	502 * for Unicode 3.2. Ignored.)

	503 *

	504 * D2. A character C is defined to be case-ignorable

	505 * if it meets either of the following criteria:

	506 *

	507 * - The general category of C is

	508 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or

	509 * Letter Modifier (Lm), or Symbol Modifier (Sk)

	510 * - C is one of the following characters

	511 * U+0027 APOSTROPHE

	512 * U+00AD SOFT HYPHEN (SHY)

	513 * U+2019 RIGHT SINGLE QUOTATION MARK

	514 * (the preferred character for apostrophe)

	515 *

	516 * D3. A case-ignorable sequence is a sequence of

	517 * zero or more case-ignorable characters.

	518 */

	519

	520 #define is_a(c) ((c)=='a' || (c)=='A')

	521 #define is_d(c) ((c)=='d' || (c)=='D')

	522 #define is_e(c) ((c)=='e' || (c)=='E')

	523 #define is_i(c) ((c)=='i' || (c)=='I')

	524 #define is_l(c) ((c)=='l' || (c)=='L')

	525 #define is_n(c) ((c)=='n' || (c)=='N')

	526 #define is_r(c) ((c)=='r' || (c)=='R')

	527 #define is_t(c) ((c)=='t' || (c)=='T')

	528 #define is_u(c) ((c)=='u' || (c)=='U')

	529 #define is_z(c) ((c)=='z' || (c)=='Z')

	530

	531 /* separator? */

	532 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)

	533

	534 /**

	535 * Requires non-NULL locale ID but otherwise does the equivalent of

	536 * checking for language codes as if uloc_getLanguage() were called:

	537 * Accepts both 2- and 3-letter codes and accepts case variants.

	538 */

	539 U_CFUNC int32_t

	540 ucase_getCaseLocale(const char *locale, int32_t *locCache) {

	541 int32_t result;

	542 char c;

	543

	544 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {

	545 return result;

	546 }

	547

	548 result=UCASE_LOC_ROOT;

	549

	550 /*

	551 * This function used to use uloc_getLanguage(), but the current code

	552 * removes the dependency of this low-level code on uloc implementation code

	553 * and is faster because not the whole locale ID has to be

	554 * examined and copied/transformed.

	555 *

	556 * Because this code does not want to depend on uloc, the caller must

	557 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().

	558 */

	559 c=*locale++;

	560 if(is_t(c)) {

	561 /* tr or tur? */

	562 c=*locale++;

	563 if(is_u(c)) {

	564 c=*locale++;

	565 }

	566 if(is_r(c)) {

	567 c=*locale;

	568 if(is_sep(c)) {

	569 result=UCASE_LOC_TURKISH;

	570 }

	571 }

	572 } else if(is_a(c)) {

	573 /* az or aze? */

	574 c=*locale++;

	575 if(is_z(c)) {

	576 c=*locale++;

	577 if(is_e(c)) {

	578 c=*locale;

	579 }

	580 if(is_sep(c)) {

	581 result=UCASE_LOC_TURKISH;

	582 }

	583 }

	584 } else if(is_l(c)) {

	585 /* lt or lit? */

	586 c=*locale++;

	587 if(is_i(c)) {

	588 c=*locale++;

	589 }

	590 if(is_t(c)) {

	591 c=*locale;

	592 if(is_sep(c)) {

	593 result=UCASE_LOC_LITHUANIAN;

	594 }

	595 }

	596 } else if(is_n(c)) {

	597 /* nl or nld? */

	598 c=*locale++;

	599 if(is_l(c)) {

	600 c=*locale++;

	601 if(is_d(c)) {

	602 c=*locale;

	603 }

	604 if(is_sep(c)) {

	605 result=UCASE_LOC_DUTCH;

	606 }

	607 }

	608 }

	609

	610 if(locCache!=NULL) {

	611 *locCache=result;

	612 }

	613 return result;

	614 }

	615

	616 /*

	617 * Is followed by

	618 * {case-ignorable}* cased

	619 * ?

	620 * (dir determines looking forward/backward)

	621 * If a character is case-ignorable, it is skipped regardless of whether

	622 * it is also cased or not.

	623 */

	624 static UBool

	625 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {

	626 UChar32 c;

	627

	628 if(iter==NULL) {

	629 return FALSE;

	630 }

	631

	632 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {

	633 int32_t type=ucase_getTypeOrIgnorable(csp, c);

	634 if(type&4) {

	635 /* case-ignorable, continue with the loop */

	636 } else if(type!=UCASE_NONE) {

	637 return TRUE; /* followed by cased letter */

	638 } else {

	639 return FALSE; /* uncased and not case-ignorable */

	640 }

	641 }

	642

	643 return FALSE; /* not followed by cased letter */

	644 }

	645

	646 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */

	647 static UBool

	648 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

	649 UChar32 c;

	650 int32_t dotType;

	651 int8_t dir;

	652

	653 if(iter==NULL) {

	654 return FALSE;

	655 }

	656

	657 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {

	658 dotType=getDotType(csp, c);

	659 if(dotType==UCASE_SOFT_DOTTED) {

	660 return TRUE; /* preceded by TYPE_i */

	661 } else if(dotType!=UCASE_OTHER_ACCENT) {

	662 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */

	663 }

	664 }

	665

	666 return FALSE; /* not preceded by TYPE_i */

	667 }

	668

	669 /*

	670 * See Jitterbug 2344:

	671 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above

	672 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because

	673 * we made those releases compatible with Unicode 3.2 which had not fixed

	674 * a related bug in SpecialCasing.txt.

	675 *

	676 * From the Jitterbug 2344 text:

	677 * ... this bug is listed as a Unicode erratum

	678 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html

	679 * <quote>

	680 * There are two errors in SpecialCasing.txt.

	681 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]

	682 * 2. An incorrect context definition. Correct as follows:

	683 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE

	684 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE

	685 * ---

	686 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE

	687 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE

	688 * where the context After_I is defined as:

	689 * The last preceding base character was an uppercase I, and there is no

	690 * intervening combining character class 230 (ABOVE).

	691 * </quote>

	692 *

	693 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:

	694 *

	695 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.

	696 * # This matches the behavior of the canonically equivalent I-dot_above

	697 *

	698 * See also the description in this place in older versions of uchar.c (revision 1.100).

	699 *

	700 * Markus W. Scherer 2003-feb-15

	701 */

	702

	703 /* Is preceded by base character 'I' with no intervening cc=230 ? */

	704 static UBool

	705 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

	706 UChar32 c;

	707 int32_t dotType;

	708 int8_t dir;

	709

	710 if(iter==NULL) {

	711 return FALSE;

	712 }

	713

	714 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {

	715 if(c==0x49) {

	716 return TRUE; /* preceded by I */

	717 }

	718 dotType=getDotType(csp, c);

	719 if(dotType!=UCASE_OTHER_ACCENT) {

	720 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */

	721 }

	722 }

	723

	724 return FALSE; /* not preceded by I */

	725 }

	726

	727 /* Is followed by one or more cc==230 ? */

	728 static UBool

	729 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

	730 UChar32 c;

	731 int32_t dotType;

	732 int8_t dir;

	733

	734 if(iter==NULL) {

	735 return FALSE;

	736 }

	737

	738 for(dir=1; (c=iter(context, dir))>=0; dir=0) {

	739 dotType=getDotType(csp, c);

	740 if(dotType==UCASE_ABOVE) {

	741 return TRUE; /* at least one cc==230 following */

	742 } else if(dotType!=UCASE_OTHER_ACCENT) {

	743 return FALSE; /* next base character, no more cc==230 following */

	744 }

	745 }

	746

	747 return FALSE; /* no more cc==230 following */

	748 }

	749

	750 /* Is followed by a dot above (without cc==230 in between) ? */

	751 static UBool

	752 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

	753 UChar32 c;

	754 int32_t dotType;

	755 int8_t dir;

	756

	757 if(iter==NULL) {

	758 return FALSE;

	759 }

	760

	761 for(dir=1; (c=iter(context, dir))>=0; dir=0) {

	762 if(c==0x307) {

	763 return TRUE;

	764 }

	765 dotType=getDotType(csp, c);

	766 if(dotType!=UCASE_OTHER_ACCENT) {

	767 return FALSE; /* next base character or cc==230 in between */

	768 }

	769 }

	770

	771 return FALSE; /* no dot above following */

	772 }

	773

	774 U_CAPI int32_t U_EXPORT2

	775 ucase_toFullLower(const UCaseProps *csp, UChar32 c,

	776 UCaseContextIterator *iter, void *context,

	777 const UChar **pString,

	778 const char *locale, int32_t *locCache)

	779 {

	780 UChar32 result=c;

	781 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	782 if(!PROPS_HAS_EXCEPTION(props)) {

	783 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

	784 result=c+UCASE_GET_DELTA(props);

	785 }

	786 } else {

	787 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;

	788 uint16_t excWord=*pe++;

	789 int32_t full;

	790

	791 pe2=pe;

	792

	793 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {

	794 /* use hardcoded conditions and mappings */

	795 int32_t loc=ucase_getCaseLocale(locale, locCache);

	796

	797 /*

	798 * Test for conditional mappings first

	799 * (otherwise the unconditional default mappings are always taken),

	800 * then test for characters that have unconditional mappings in SpecialCasing.txt,

	801 * then get the UnicodeData.txt mappings.

	802 */

	803 if( loc==UCASE_LOC_LITHUANIAN &&

	804 /* base characters, find accents above */

	805 (((c==0x49 || c==0x4a || c==0x12e) &&

	806 isFollowedByMoreAbove(csp, iter, context)) ||

	807 /* precomposed with accent above, no need to find one */

	808 (c==0xcc || c==0xcd || c==0x128))

	809 ) {

	810 /*

	811 # Lithuanian

	812

	813 # Lithuanian retains the dot in a lowercase i when followed by accents.

	814

	815 # Introduce an explicit dot above when lowercasing capital I's and J's

	816 # whenever there are more accents above.

	817 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

	818

	819 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I

	820 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J

	821 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK

	822 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE

	823 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE

	824 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE

	825 */

	826 switch(c) {

	827 case 0x49: /* LATIN CAPITAL LETTER I */

	828 *pString=iDot;

	829 return 2;

	830 case 0x4a: /* LATIN CAPITAL LETTER J */

	831 *pString=jDot;

	832 return 2;

	833 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */

	834 *pString=iOgonekDot;

	835 return 2;

	836 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */

	837 *pString=iDotGrave;

	838 return 3;

	839 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */

	840 *pString=iDotAcute;

	841 return 3;

	842 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */

	843 *pString=iDotTilde;

	844 return 3;

	845 default:

	846 return 0; /* will not occur */

	847 }

	848 /* # Turkish and Azeri */

	849 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {

	850 /*

	851 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

	852 # The following rules handle those cases.

	853

	854 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE

	855 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE

	856 */

	857 return 0x69;

	858 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {

	859 /*

	860 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.

	861 # This matches the behavior of the canonically equivalent I-dot_above

	862

	863 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE

	864 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE

	865 */

	866 return 0; /* remove the dot (continue without output) */

	867 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {

	868 /*

	869 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

	870

	871 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I

	872 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I

	873 */

	874 return 0x131;

	875 } else if(c==0x130) {

	876 /*

	877 # Preserve canonical equivalence for I with dot. Turkic is handled below.

	878

	879 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE

	880 */

	881 *pString=iDot;

	882 return 2;

	883 } else if( c==0x3a3 &&

	884 !isFollowedByCasedLetter(csp, iter, context, 1) &&

	885 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */

	886 ) {

	887 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */

	888 /*

	889 # Special case for final form of sigma

	890

	891 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA

	892 */

	893 return 0x3c2; /* greek small final sigma */

	894 } else {

	895 /* no known conditional special case mapping, use a normal mapping */

	896 }

	897 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

	898 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

	899 full&=UCASE_FULL_LOWER;

	900 if(full!=0) {

	901 /* set the output pointer to the lowercase mapping */

	902 *pString=pe+1;

	903

	904 /* return the string length */

	905 return full;

	906 }

	907 }

	908

	909 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

	910 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);

	911 }

	912 }

	913

	914 return (result==c) ? ~result : result;

	915 }

	916

	917 /* internal */

	918 static int32_t

	919 toUpperOrTitle(const UCaseProps *csp, UChar32 c,

	920 UCaseContextIterator iter, void context,

	921 const UChar **pString,

	922 const char locale, int32_t locCache,

	923 UBool upperNotTitle) {

	924 UChar32 result=c;

	925 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	926 if(!PROPS_HAS_EXCEPTION(props)) {

	927 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

	928 result=c+UCASE_GET_DELTA(props);

	929 }

	930 } else {

	931 const uint16_t pe=GET_EXCEPTIONS(csp, props), pe2;

	932 uint16_t excWord=*pe++;

	933 int32_t full, idx;

	934

	935 pe2=pe;

	936

	937 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {

	938 /* use hardcoded conditions and mappings */

	939 int32_t loc=ucase_getCaseLocale(locale, locCache);

	940

	941 if(loc==UCASE_LOC_TURKISH && c==0x69) {

	942 /*

	943 # Turkish and Azeri

	944

	945 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

	946 # The following rules handle those cases.

	947

	948 # When uppercasing, i turns into a dotted capital I

	949

	950 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I

	951 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I

	952 */

	953 return 0x130;

	954 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftD otted(csp, iter, context)) {

	955 /*

	956 # Lithuanian

	957

	958 # Lithuanian retains the dot in a lowercase i when followed by accents.

	959

	960 # Remove DOT ABOVE after "i" with upper or titlecase

	961

	962 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE

	963 */

	964 return 0; /* remove the dot (continue without output) */

	965 } else {

	966 /* no known conditional special case mapping, use a normal mappi ng */

	967 }

	968 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

	969 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

	970

	971 /* start of full case mapping strings */

	972 ++pe;

	973

	974 /* skip the lowercase and case-folding result strings */

	975 pe+=full&UCASE_FULL_LOWER;

	976 full>>=4;

	977 pe+=full&0xf;

	978 full>>=4;

	979

	980 if(upperNotTitle) {

	981 full&=0xf;

	982 } else {

	983 /* skip the uppercase result string */

	984 pe+=full&0xf;

	985 full=(full>>4)&0xf;

	986 }

	987

	988 if(full!=0) {

	989 /* set the output pointer to the result string */

	990 *pString=pe;

	991

	992 /* return the string length */

	993 return full;

	994 }

	995 }

	996

	997 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {

	998 idx=UCASE_EXC_TITLE;

	999 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

	1000 /* here, titlecase is same as uppercase */

	1001 idx=UCASE_EXC_UPPER;

	1002 } else {

	1003 return ~c;

	1004 }

	1005 GET_SLOT_VALUE(excWord, idx, pe2, result);

	1006 }

	1007

	1008 return (result==c) ? ~result : result;

	1009 }

	1010

	1011 U_CAPI int32_t U_EXPORT2

	1012 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,

	1013 UCaseContextIterator iter, void context,

	1014 const UChar **pString,

	1015 const char locale, int32_t locCache) {

	1016 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE );

	1017 }

	1018

	1019 U_CAPI int32_t U_EXPORT2

	1020 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,

	1021 UCaseContextIterator iter, void context,

	1022 const UChar **pString,

	1023 const char locale, int32_t locCache) {

	1024 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALS E);

	1025 }

	1026

	1027 /* case folding ------------------------------------------------------------- */

	1028

	1029 /*

	1030 * Case folding is similar to lowercasing.

	1031 * The result may be a simple mapping, i.e., a single code point, or

	1032 * a full mapping, i.e., a string.

	1033 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,

	1034 * then only the lowercase mapping is stored.

	1035 *

	1036 * Some special cases are hardcoded because their conditions cannot be

	1037 * parsed and processed from CaseFolding.txt.

	1038 *

	1039 * Unicode 3.2 CaseFolding.txt specifies for its status field:

	1040

	1041 # C: common case folding, common mappings shared by both simple and full mappings.

	1042 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.

	1043 # S: simple case folding, mappings to single characters where different from F.

	1044 # T: special case for uppercase I and dotted uppercase I

	1045 # - For non-Turkic languages, this mapping is normally not used.

	1046 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.

	1047 #

	1048 # Usage:

	1049 # A. To do a simple case folding, use the mappings with status C + S.

	1050 # B. To do a full case folding, use the mappings with status C + F.

	1051 #

	1052 # The mappings with status T can be used or omitted depending on the desired case-folding

	1053 # behavior. (The default option is to exclude them.)

	1054

	1055 * Unicode 3.2 has 'T' mappings as follows:

	1056

	1057 0049; T; 0131; # LATIN CAPITAL LETTER I

	1058 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

	1059

	1060 * while the default mappings for these code points are:

	1061

	1062 0049; C; 0069; # LATIN CAPITAL LETTER I

	1063 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

	1064

	1065 * U+0130 has no simple case folding (simple-case-folds to itself).

	1066 */

	1067

	1068 /* return the simple case folding mapping for c */

	1069 U_CAPI UChar32 U_EXPORT2

	1070 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {

	1071 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	1072 if(!PROPS_HAS_EXCEPTION(props)) {

	1073 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

	1074 c+=UCASE_GET_DELTA(props);

	1075 }

	1076 } else {

	1077 const uint16_t *pe=GET_EXCEPTIONS(csp, props);

	1078 uint16_t excWord=*pe++;

	1079 int32_t idx;

	1080 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {

	1081 /* special case folding mappings, hardcoded */

	1082 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {

	1083 /* default mappings */

	1084 if(c==0x49) {

	1085 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */

	1086 return 0x69;

	1087 } else if(c==0x130) {

	1088 /* no simple case folding for U+0130 */

	1089 return c;

	1090 }

	1091 } else {

	1092 /* Turkic mappings */

	1093 if(c==0x49) {

	1094 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */

	1095 return 0x131;

	1096 } else if(c==0x130) {

	1097 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */

	1098 return 0x69;

	1099 }

	1100 }

	1101 }

	1102 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {

	1103 idx=UCASE_EXC_FOLD;

	1104 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

	1105 idx=UCASE_EXC_LOWER;

	1106 } else {

	1107 return c;

	1108 }

	1109 GET_SLOT_VALUE(excWord, idx, pe, c);

	1110 }

	1111 return c;

	1112 }

	1113

	1114 /*

	1115 * Issue for canonical caseless match (UAX #21):

	1116 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve

	1117 * canonical equivalence, unlike default-option casefolding.

	1118 * For example, I-grave and I + grave fold to strings that are not canonically

	1119 * equivalent.

	1120 * For more details, see the comment in unorm_compare() in unorm.cpp

	1121 * and the intermediate prototype changes for Jitterbug 2021.

	1122 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)

	1123 *

	1124 * This did not get fixed because it appears that it is not possible to fix

	1125 * it for uppercase and lowercase characters (I-grave vs. i-grave)

	1126 * together in a way that they still fold to common result strings.

	1127 */

	1128

	1129 U_CAPI int32_t U_EXPORT2

	1130 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,

	1131 const UChar **pString,

	1132 uint32_t options)

	1133 {

	1134 UChar32 result=c;

	1135 uint16_t props=UTRIE2_GET16(&csp->trie, c);

	1136 if(!PROPS_HAS_EXCEPTION(props)) {

	1137 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

	1138 result=c+UCASE_GET_DELTA(props);

	1139 }

	1140 } else {

	1141 const uint16_t pe=GET_EXCEPTIONS(csp, props), pe2;

	1142 uint16_t excWord=*pe++;

	1143 int32_t full, idx;

	1144

	1145 pe2=pe;

	1146

	1147 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {

	1148 /* use hardcoded conditions and mappings */

	1149 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {

	1150 /* default mappings */

	1151 if(c==0x49) {

	1152 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */

	1153 return 0x69;

	1154 } else if(c==0x130) {

	1155 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABO VE */

	1156 *pString=iDot;

	1157 return 2;

	1158 }

	1159 } else {

	1160 /* Turkic mappings */

	1161 if(c==0x49) {

	1162 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */

	1163 return 0x131;

	1164 } else if(c==0x130) {

	1165 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */

	1166 return 0x69;

	1167 }

	1168 }

	1169 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

	1170 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

	1171

	1172 /* start of full case mapping strings */

	1173 ++pe;

	1174

	1175 /* skip the lowercase result string */

	1176 pe+=full&UCASE_FULL_LOWER;

	1177 full=(full>>4)&0xf;

	1178

	1179 if(full!=0) {

	1180 /* set the output pointer to the result string */

	1181 *pString=pe;

	1182

	1183 /* return the string length */

	1184 return full;

	1185 }

	1186 }

	1187

	1188 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {

	1189 idx=UCASE_EXC_FOLD;

	1190 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

	1191 idx=UCASE_EXC_LOWER;

	1192 } else {

	1193 return ~c;

	1194 }

	1195 GET_SLOT_VALUE(excWord, idx, pe2, result);

	1196 }

	1197

	1198 return (result==c) ? ~result : result;

	1199 }

	1200

	1201 /* case mapping properties API ---------------------------------------------- */

	1202

	/*
	 * Yields a pointer to the built-in case properties singleton.
	 * The expansion is parenthesized so that it stays a single operand when a
	 * postfix operator follows it, e.g. GET_CASE_PROPS()->trie.
	 */
	#define GET_CASE_PROPS() (&ucase_props_singleton)

	1204

	1205 /* public API (see uchar.h) */

	1206

	1207 U_CAPI UBool U_EXPORT2

	1208 u_isULowercase(UChar32 c) {

	1209 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));

	1210 }

	1211

	1212 U_CAPI UBool U_EXPORT2

	1213 u_isUUppercase(UChar32 c) {

	1214 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));

	1215 }

	1216

	1217 /* Transforms the Unicode character to its lower case equivalent.*/

	1218 U_CAPI UChar32 U_EXPORT2

	1219 u_tolower(UChar32 c) {

	1220 return ucase_tolower(GET_CASE_PROPS(), c);

	1221 }

	1222

	1223 /* Transforms the Unicode character to its upper case equivalent.*/

	1224 U_CAPI UChar32 U_EXPORT2

	1225 u_toupper(UChar32 c) {

	1226 return ucase_toupper(GET_CASE_PROPS(), c);

	1227 }

	1228

	1229 /* Transforms the Unicode character to its title case equivalent.*/

	1230 U_CAPI UChar32 U_EXPORT2

	1231 u_totitle(UChar32 c) {

	1232 return ucase_totitle(GET_CASE_PROPS(), c);

	1233 }

	1234

	1235 /* return the simple case folding mapping for c */

	1236 U_CAPI UChar32 U_EXPORT2

	1237 u_foldCase(UChar32 c, uint32_t options) {

	1238 return ucase_fold(GET_CASE_PROPS(), c, options);

	1239 }

	1240

	1241 U_CFUNC int32_t U_EXPORT2

	1242 ucase_hasBinaryProperty(UChar32 c, UProperty which) {

	1243 /* case mapping properties */

	1244 const UChar *resultString;

	1245 int32_t locCache;

	1246 const UCaseProps *csp=GET_CASE_PROPS();

	1247 if(csp==NULL) {

	1248 return FALSE;

	1249 }

	1250 switch(which) {

	1251 case UCHAR_LOWERCASE:

	1252 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));

	1253 case UCHAR_UPPERCASE:

	1254 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));

	1255 case UCHAR_SOFT_DOTTED:

	1256 return ucase_isSoftDotted(csp, c);

	1257 case UCHAR_CASE_SENSITIVE:

	1258 return ucase_isCaseSensitive(csp, c);

	1259 case UCHAR_CASED:

	1260 return (UBool)(UCASE_NONE!=ucase_getType(csp, c));

	1261 case UCHAR_CASE_IGNORABLE:

	1262 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);

	1263 /*

	1264 * Note: The following Changes_When_Xyz are defined as testing whether

	1265 * the NFD form of the input changes when Xyz-case-mapped.

	1266 * However, this simpler implementation of these properties,

	1267 * ignoring NFD, passes the tests.

	1268 * The implementation needs to be changed if the tests start failing.

	1269 * When that happens, optimizations should be used to work with the

	1270 * per-single-code point ucase_toFullXyz() functions unless

	1271 * the NFD form has more than one code point,

	1272 * and the property starts set needs to be the union of the

	1273 * start sets for normalization and case mappings.

	1274 */

	1275 case UCHAR_CHANGES_WHEN_LOWERCASED:

	1276 locCache=UCASE_LOC_ROOT;

	1277 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

	1278 case UCHAR_CHANGES_WHEN_UPPERCASED:

	1279 locCache=UCASE_LOC_ROOT;

	1280 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

	1281 case UCHAR_CHANGES_WHEN_TITLECASED:

	1282 locCache=UCASE_LOC_ROOT;

	1283 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

	1284 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */

	1285 case UCHAR_CHANGES_WHEN_CASEMAPPED:

	1286 locCache=UCASE_LOC_ROOT;

	1287 return (UBool)(

	1288 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)> =0 \|\|

	1289 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)> =0 \|\|

	1290 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)> =0);

	1291 default:

	1292 return FALSE;

	1293 }

	1294 }

OLD	NEW

« no previous file with comments | « icu46/source/common/ucase.h ('k') | icu46/source/common/ucase_props_data.c » ('j') | no next file with comments »