icu46/source/i18n/ucol_tok.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/ucol_tok.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 2001-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: ucol_tok.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created 02/22/2001

	14 * created by: Vladimir Weinstein

	15 *

	16 * This module reads a tailoring rule string and produces a list of

	17 * tokens that will be turned into collation elements

	18 *

	19 */

	20

	21 #include "unicode/utypes.h"

	22

	23 #if !UCONFIG_NO_COLLATION

	24

	25 #include "unicode/uscript.h"

	26 #include "unicode/ustring.h"

	27 #include "unicode/uchar.h"

	28 #include "unicode/uniset.h"

	29

	30 #include "cmemory.h"

	31 #include "cstring.h"

	32 #include "ucol_bld.h"

	33 #include "ucol_tok.h"

	34 #include "ulocimp.h"

	35 #include "uresimp.h"

	36 #include "util.h"

	37

	38 // Define this only for debugging.

	39 // #define DEBUG_FOR_COLL_RULES 1

	40

	41 #ifdef DEBUG_FOR_COLL_RULES

	42 #include <iostream>

	43 #endif

	44

	45 U_NAMESPACE_USE

	46

	47 U_CDECL_BEGIN

	48 static int32_t U_CALLCONV

	49 uhash_hashTokens(const UHashTok k)

	50 {

	51 int32_t hash = 0;

	52 //uint32_t key = (uint32_t)k.integer;

	53 UColToken key = (UColToken )k.pointer;

	54 if (key != 0) {

	55 int32_t len = (key->source & 0xFF000000)>>24;

	56 int32_t inc = ((len - 32) / 32) + 1;

	57

	58 const UChar p = (key->source & 0x00FFFFFF) + (key->rulesToParseHdl);

	59 const UChar *limit = p + len;

	60

	61 while (p<limit) {

	62 hash = (hash * 37) + *p;

	63 p += inc;

	64 }

	65 }

	66 return hash;

	67 }

	68

	69 static UBool U_CALLCONV

	70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)

	71 {

	72 //uint32_t p1 = (uint32_t) key1.integer;

	73 //uint32_t p2 = (uint32_t) key2.integer;

	74 UColToken p1 = (UColToken )key1.pointer;

	75 UColToken p2 = (UColToken )key2.pointer;

	76 const UChar s1 = (p1->source & 0x00FFFFFF) + (p1->rulesToParseHdl);

	77 const UChar s2 = (p2->source & 0x00FFFFFF) + (p2->rulesToParseHdl);

	78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);

	79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);

	80 const UChar *end = s1+s1L-1;

	81

	82 if (p1 == p2) {

	83 return TRUE;

	84 }

	85 if (p1->source == 0 \|\| p2->source == 0) {

	86 return FALSE;

	87 }

	88 if(s1L != s2L) {

	89 return FALSE;

	90 }

	91 if(p1->source == p2->source) {

	92 return TRUE;

	93 }

	94 while((s1 < end) && s1 == s2) {

	95 ++s1;

	96 ++s2;

	97 }

	98 if(s1 == s2) {

	99 return TRUE;

	100 } else {

	101 return FALSE;

	102 }

	103 }

	104 U_CDECL_END

	105

	106 /*

	107 * Debug messages used to pinpoint where a format error occurred.

	108 * A better way is to include context-sensitive information in the syntaxError() function.

	109 *

	110 * To turn this debugging on, either uncomment the following line, or define it with -DDEBUG_FOR_FORMAT_ERROR

	111 * in the compile line.

	112 */

	113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */

	114

	115 #ifdef DEBUG_FOR_FORMAT_ERROR

	116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__) ;}

	117 #else

	118 #define DBG_FORMAT_ERROR

	119 #endif

	120

	121

	122 /*

	123 * Controls debug messages so that the output can be compared before and after a

	124 * big change. Prints the information of every code point that comes out of the

	125 * collation parser and its strength into a file. When a big change in format

	126 * happens, the files before and after the change should be identical.

	127 *

	128 * To turn this debugging on, either uncomment the following line, or define it with -DDEBUG_FOR_CODE_POINTS

	129 * in the compile line.

	130 */

	131 // #define DEBUG_FOR_CODE_POINTS 1

	132

	133 #ifdef DEBUG_FOR_CODE_POINTS

	134 FILE* dfcp_fp = NULL;

	135 #endif

	136

	137

	138 /*static inline void U_CALLCONV

	139 uhash_freeBlockWrapper(void *obj) {

	140 uhash_freeBlock(obj);

	141 }*/

	142

	143

	144 typedef struct {

	145 uint32_t startCE;

	146 uint32_t startContCE;

	147 uint32_t limitCE;

	148 uint32_t limitContCE;

	149 } indirectBoundaries;

	150

	151 /* these values are used for finding CE values for indirect positioning. */

	152 /* Indirect positioning is a mechanism for allowing resets on symbolic */

	153 /* values. It only works for resets and you cannot tailor indirect names */

	154 /* An indirect name can define either an anchor point or a range. An */

	155 /* anchor point behaves in exactly the same way as a code point in reset */

	156 /* would, except that it cannot be tailored. A range (we currently only */

	157 /* know for the [top] range will explicitly set the upper bound for */

	158 /* generated CEs, thus allowing for better control over how many CEs can */

	159 /* be squeezed between in the range without performance penalty. */

	160 /* In that respect, we use [top] for tailoring of locales that use CJK */

	161 /* characters. Other indirect values are currently a pure convenience, */

	162 /* they can be used to assure that the CEs will be always positioned in */

	163 /* the same place relative to a point with known properties (e.g. first */

	164 /* primary ignorable). */

	165 static indirectBoundaries ucolIndirectBoundaries[15];

	166 /*

	167 static indirectBoundaries ucolIndirectBoundaries[11] = {

	168 { UCOL_RESET_TOP_VALUE, 0,

	169 UCOL_NEXT_TOP_VALUE, 0 },

	170 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,

	171 0, 0 },

	172 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,

	173 0, 0 },

	174 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,

	175 0, 0 },

	176 { UCOL_LAST_SECONDARY_IGNORABLE, 0,

	177 0, 0 },

	178 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,

	179 0, 0 },

	180 { UCOL_LAST_TERTIARY_IGNORABLE, 0,

	181 0, 0 },

	182 { UCOL_FIRST_VARIABLE, 0,

	183 0, 0 },

	184 { UCOL_LAST_VARIABLE, 0,

	185 0, 0 },

	186 { UCOL_FIRST_NON_VARIABLE, 0,

	187 0, 0 },

	188 { UCOL_LAST_NON_VARIABLE, 0,

	189 0, 0 },

	190 };

	191 */

	192

	193 static void setIndirectBoundaries(uint32_t indexR, uint32_t start, uint32_t en d) {

	194

	195 // Set values for the top - TODO: once we have values for all the indirects, we are going

	196 // to initalize here.

	197 ucolIndirectBoundaries[indexR].startCE = start[0];

	198 ucolIndirectBoundaries[indexR].startContCE = start[1];

	199 if(end) {

	200 ucolIndirectBoundaries[indexR].limitCE = end[0];

	201 ucolIndirectBoundaries[indexR].limitContCE = end[1];

	202 } else {

	203 ucolIndirectBoundaries[indexR].limitCE = 0;

	204 ucolIndirectBoundaries[indexR].limitContCE = 0;

	205 }

	206 }

	207

	208

	209 static inline

	210 void syntaxError(const UChar* rules,

	211 int32_t pos,

	212 int32_t rulesLen,

	213 UParseError* parseError)

	214 {

	215 parseError->offset = pos;

	216 parseError->line = 0 ; /* we are not using line numbers */

	217

	218 // for pre-context

	219 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN -1));

	220 int32_t stop = pos;

	221

	222 u_memcpy(parseError->preContext,rules+start,stop-start);

	223 //null terminate the buffer

	224 parseError->preContext[stop-start] = 0;

	225

	226 //for post-context

	227 start = pos+1;

	228 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1 )) :

	229 rulesLen;

	230

	231 if(start < stop) {

	232 u_memcpy(parseError->postContext,rules+start,stop-start);

	233 //null terminate the buffer

	234 parseError->postContext[stop-start]= 0;

	235 } else {

	236 parseError->postContext[0] = 0;

	237 }

	238 }

	239

	240 static

	241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U ColAttributeValue value) {

	242 switch(attrib) {

	243 case UCOL_HIRAGANA_QUATERNARY_MODE:

	244 opts->hiraganaQ = value;

	245 break;

	246 case UCOL_FRENCH_COLLATION:

	247 opts->frenchCollation = value;

	248 break;

	249 case UCOL_ALTERNATE_HANDLING:

	250 opts->alternateHandling = value;

	251 break;

	252 case UCOL_CASE_FIRST:

	253 opts->caseFirst = value;

	254 break;

	255 case UCOL_CASE_LEVEL:

	256 opts->caseLevel = value;

	257 break;

	258 case UCOL_NORMALIZATION_MODE:

	259 opts->normalizationMode = value;

	260 break;

	261 case UCOL_STRENGTH:

	262 opts->strength = value;

	263 break;

	264 case UCOL_NUMERIC_COLLATION:

	265 opts->numericCollation = value;

	266 break;

	267 case UCOL_ATTRIBUTE_COUNT:

	268 default:

	269 break;

	270 }

	271 }

	272

	273 #define UTOK_OPTION_COUNT 22

	274

	275 static UBool didInit = FALSE;

	276 /* we can be strict, or we can be lenient */

	277 /* I'd surely be lenient with the option arguments */

	278 /* maybe even with options */

	279 U_STRING_DECL(suboption_00, "non-ignorable", 13);

	280 U_STRING_DECL(suboption_01, "shifted", 7);

	281

	282 U_STRING_DECL(suboption_02, "lower", 5);

	283 U_STRING_DECL(suboption_03, "upper", 5);

	284 U_STRING_DECL(suboption_04, "off", 3);

	285 U_STRING_DECL(suboption_05, "on", 2);

	286 U_STRING_DECL(suboption_06, "1", 1);

	287 U_STRING_DECL(suboption_07, "2", 1);

	288 U_STRING_DECL(suboption_08, "3", 1);

	289 U_STRING_DECL(suboption_09, "4", 1);

	290 U_STRING_DECL(suboption_10, "I", 1);

	291

	292 U_STRING_DECL(suboption_11, "primary", 7);

	293 U_STRING_DECL(suboption_12, "secondary", 9);

	294 U_STRING_DECL(suboption_13, "tertiary", 8);

	295 U_STRING_DECL(suboption_14, "variable", 8);

	296 U_STRING_DECL(suboption_15, "regular", 7);

	297 U_STRING_DECL(suboption_16, "implicit", 8);

	298 U_STRING_DECL(suboption_17, "trailing", 8);

	299

	300

	301 U_STRING_DECL(option_00, "undefined", 9);

	302 U_STRING_DECL(option_01, "rearrange", 9);

	303 U_STRING_DECL(option_02, "alternate", 9);

	304 U_STRING_DECL(option_03, "backwards", 9);

	305 U_STRING_DECL(option_04, "variable top", 12);

	306 U_STRING_DECL(option_05, "top", 3);

	307 U_STRING_DECL(option_06, "normalization", 13);

	308 U_STRING_DECL(option_07, "caseLevel", 9);

	309 U_STRING_DECL(option_08, "caseFirst", 9);

	310 U_STRING_DECL(option_09, "scriptOrder", 11);

	311 U_STRING_DECL(option_10, "charsetname", 11);

	312 U_STRING_DECL(option_11, "charset", 7);

	313 U_STRING_DECL(option_12, "before", 6);

	314 U_STRING_DECL(option_13, "hiraganaQ", 9);

	315 U_STRING_DECL(option_14, "strength", 8);

	316 U_STRING_DECL(option_15, "first", 5);

	317 U_STRING_DECL(option_16, "last", 4);

	318 U_STRING_DECL(option_17, "optimize", 8);

	319 U_STRING_DECL(option_18, "suppressContractions", 20);

	320 U_STRING_DECL(option_19, "numericOrdering", 15);

	321 U_STRING_DECL(option_20, "import", 6);

	322 U_STRING_DECL(option_21, "reorder", 7);

	323

	324 /*

	325 [last variable] last variable value

	326 [last primary ignorable] largest CE for primary ignorable

	327 [last secondary ignorable] largest CE for secondary ignorable

	328 [last tertiary ignorable] largest CE for tertiary ignorable

	329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8 )

	330 */

	331

	332

	333 static const ucolTokSuboption alternateSub[2] = {

	334 {suboption_00, 13, UCOL_NON_IGNORABLE},

	335 {suboption_01, 7, UCOL_SHIFTED}

	336 };

	337

	338 static const ucolTokSuboption caseFirstSub[3] = {

	339 {suboption_02, 5, UCOL_LOWER_FIRST},

	340 {suboption_03, 5, UCOL_UPPER_FIRST},

	341 {suboption_04, 3, UCOL_OFF},

	342 };

	343

	344 static const ucolTokSuboption onOffSub[2] = {

	345 {suboption_04, 3, UCOL_OFF},

	346 {suboption_05, 2, UCOL_ON}

	347 };

	348

	349 static const ucolTokSuboption frenchSub[1] = {

	350 {suboption_07, 1, UCOL_ON}

	351 };

	352

	353 static const ucolTokSuboption beforeSub[3] = {

	354 {suboption_06, 1, UCOL_PRIMARY},

	355 {suboption_07, 1, UCOL_SECONDARY},

	356 {suboption_08, 1, UCOL_TERTIARY}

	357 };

	358

	359 static const ucolTokSuboption strengthSub[5] = {

	360 {suboption_06, 1, UCOL_PRIMARY},

	361 {suboption_07, 1, UCOL_SECONDARY},

	362 {suboption_08, 1, UCOL_TERTIARY},

	363 {suboption_09, 1, UCOL_QUATERNARY},

	364 {suboption_10, 1, UCOL_IDENTICAL},

	365 };

	366

	367 static const ucolTokSuboption firstLastSub[7] = {

	368 {suboption_11, 7, UCOL_PRIMARY},

	369 {suboption_12, 9, UCOL_PRIMARY},

	370 {suboption_13, 8, UCOL_PRIMARY},

	371 {suboption_14, 8, UCOL_PRIMARY},

	372 {suboption_15, 7, UCOL_PRIMARY},

	373 {suboption_16, 8, UCOL_PRIMARY},

	374 {suboption_17, 8, UCOL_PRIMARY},

	375 };

	376

	377 enum OptionNumber {

	378 OPTION_ALTERNATE_HANDLING = 0,

	379 OPTION_FRENCH_COLLATION,

	380 OPTION_CASE_LEVEL,

	381 OPTION_CASE_FIRST,

	382 OPTION_NORMALIZATION_MODE,

	383 OPTION_HIRAGANA_QUATERNARY,

	384 OPTION_STRENGTH,

	385 OPTION_NUMERIC_COLLATION,

	386 OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

	387 OPTION_VARIABLE_TOP,

	388 OPTION_REARRANGE,

	389 OPTION_BEFORE,

	390 OPTION_TOP,

	391 OPTION_FIRST,

	392 OPTION_LAST,

	393 OPTION_OPTIMIZE,

	394 OPTION_SUPPRESS_CONTRACTIONS,

	395 OPTION_UNDEFINED,

	396 OPTION_SCRIPT_ORDER,

	397 OPTION_CHARSET_NAME,

	398 OPTION_CHARSET,

	399 OPTION_IMPORT,

	400 OPTION_SCRIPTREORDER

	401 } ;

	402

	403 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {

	404 /00/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /"alterna te" /

	405 /01/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /"backwards" /

	406 /02/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /"caseLevel" /

	407 /03/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /"caseFirst" /

	408 /04/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /"normalizati on" /

	409 /05/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /"hiraga naQ" /

	410 /06/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /"strength" /

	411 /07/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /"numericOrde ring"/

	412 /08/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"variable top" /

	413 /09/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"rearrange" /

	414 /10/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /"before" /

	415 /11/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"top" /

	416 /12/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"first" /

	417 /13/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"last" /

	418 /14/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"optimize" /

	419 /15/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"suppressContractio ns" /

	420 /16/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"undefined" /

	421 /17/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"scriptOrder" /

	422 /18/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"charsetname" /

	423 /19/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"charset" /

	424 /20/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"import" /

	425 /21/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /"reorder" /

	426 };

	427

	428 static

	429 int32_t u_strncmpNoCase(const UChar *s1,

	430 const UChar *s2,

	431 int32_t n)

	432 {

	433 if(n > 0) {

	434 int32_t rc;

	435 for(;;) {

	436 rc = (int32_t)u_tolower(s1) - (int32_t)u_tolower(s2);

	437 if(rc != 0 \|\| *s1 == 0 \|\| --n == 0) {

	438 return rc;

	439 }

	440 ++s1;

	441 ++s2;

	442 }

	443 }

	444 return 0;

	445 }

	446

	447 static

	448 void ucol_uprv_tok_initData() {

	449 if(!didInit) {

	450 U_STRING_INIT(suboption_00, "non-ignorable", 13);

	451 U_STRING_INIT(suboption_01, "shifted", 7);

	452

	453 U_STRING_INIT(suboption_02, "lower", 5);

	454 U_STRING_INIT(suboption_03, "upper", 5);

	455 U_STRING_INIT(suboption_04, "off", 3);

	456 U_STRING_INIT(suboption_05, "on", 2);

	457

	458 U_STRING_INIT(suboption_06, "1", 1);

	459 U_STRING_INIT(suboption_07, "2", 1);

	460 U_STRING_INIT(suboption_08, "3", 1);

	461 U_STRING_INIT(suboption_09, "4", 1);

	462 U_STRING_INIT(suboption_10, "I", 1);

	463

	464 U_STRING_INIT(suboption_11, "primary", 7);

	465 U_STRING_INIT(suboption_12, "secondary", 9);

	466 U_STRING_INIT(suboption_13, "tertiary", 8);

	467 U_STRING_INIT(suboption_14, "variable", 8);

	468 U_STRING_INIT(suboption_15, "regular", 7);

	469 U_STRING_INIT(suboption_16, "implicit", 8);

	470 U_STRING_INIT(suboption_17, "trailing", 8);

	471

	472

	473 U_STRING_INIT(option_00, "undefined", 9);

	474 U_STRING_INIT(option_01, "rearrange", 9);

	475 U_STRING_INIT(option_02, "alternate", 9);

	476 U_STRING_INIT(option_03, "backwards", 9);

	477 U_STRING_INIT(option_04, "variable top", 12);

	478 U_STRING_INIT(option_05, "top", 3);

	479 U_STRING_INIT(option_06, "normalization", 13);

	480 U_STRING_INIT(option_07, "caseLevel", 9);

	481 U_STRING_INIT(option_08, "caseFirst", 9);

	482 U_STRING_INIT(option_09, "scriptOrder", 11);

	483 U_STRING_INIT(option_10, "charsetname", 11);

	484 U_STRING_INIT(option_11, "charset", 7);

	485 U_STRING_INIT(option_12, "before", 6);

	486 U_STRING_INIT(option_13, "hiraganaQ", 9);

	487 U_STRING_INIT(option_14, "strength", 8);

	488 U_STRING_INIT(option_15, "first", 5);

	489 U_STRING_INIT(option_16, "last", 4);

	490 U_STRING_INIT(option_17, "optimize", 8);

	491 U_STRING_INIT(option_18, "suppressContractions", 20);

	492 U_STRING_INIT(option_19, "numericOrdering", 15);

	493 U_STRING_INIT(option_20, "import ", 6);

	494 U_STRING_INIT(option_21, "reorder", 7);

	495 didInit = TRUE;

	496 }

	497 }

	498

	499

	500 // This function reads basic options to set in the runtime collator

	501 // used by data driven tests. Should not support build time options

	502 U_CAPI const UChar * U_EXPORT2

	503 ucol_tok_getNextArgument(const UChar start, const UChar end,

	504 UColAttribute attrib, UColAttributeValue value,

	505 UErrorCode *status)

	506 {

	507 uint32_t i = 0;

	508 int32_t j=0;

	509 UBool foundOption = FALSE;

	510 const UChar *optionArg = NULL;

	511

	512 ucol_uprv_tok_initData();

	513

	514 while(start < end && (u_isWhitespace(start) \|\| uprv_isRuleWhiteSpace(start ))) { /* eat whitespace */

	515 start++;

	516 }

	517 if(start >= end) {

	518 return NULL;

	519 }

	520 /* skip opening '[' */

	521 if(*start == 0x005b) {

	522 start++;

	523 } else {

	524 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['

	525 return NULL;

	526 }

	527

	528 while(i < UTOK_OPTION_COUNT) {

	529 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op tionLen) == 0) {

	530 foundOption = TRUE;

	531 if(end - start > rulesOptions[i].optionLen) {

	532 optionArg = start+rulesOptions[i].optionLen+1; /* start of the o ptions, skip space */

	533 while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(optio nArg)) { /* eat whitespace */

	534 optionArg++;

	535 }

	536 }

	537 break;

	538 }

	539 i++;

	540 }

	541

	542 if(!foundOption) {

	543 *status = U_ILLEGAL_ARGUMENT_ERROR;

	544 return NULL;

	545 }

	546

	547 if(optionArg) {

	548 for(j = 0; j<rulesOptions[i].subSize; j++) {

	549 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru lesOptions[i].subopts[j].subLen) == 0) {

	550 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr , rulesOptions[i].subopts[j].attrVal);

	551 *attrib = rulesOptions[i].attr;

	552 *value = rulesOptions[i].subopts[j].attrVal;

	553 optionArg += rulesOptions[i].subopts[j].subLen;

	554 while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(optio nArg)) { /* eat whitespace */

	555 optionArg++;

	556 }

	557 if(*optionArg == 0x005d) {

	558 optionArg++;

	559 return optionArg;

	560 } else {

	561 *status = U_ILLEGAL_ARGUMENT_ERROR;

	562 return NULL;

	563 }

	564 }

	565 }

	566 }

	567 *status = U_ILLEGAL_ARGUMENT_ERROR;

	568 return NULL;

	569 }

	570

	571 static

	572 USet ucol_uprv_tok_readAndSetUnicodeSet(const UChar start, const UChar end, U ErrorCode status) {

	573 while(start != 0x005b) { / advance while we find the first '[' */

	574 start++;

	575 }

	576 // now we need to get a balanced set of '[]'. The problem is that a set can have

	577 // many, and *end point to the first closing '['

	578 int32_t noOpenBraces = 1;

	579 int32_t current = 1; // skip the opening brace

	580 while(start+current < end && noOpenBraces != 0) {

	581 if(start[current] == 0x005b) {

	582 noOpenBraces++;

	583 } else if(start[current] == 0x005D) { // closing brace

	584 noOpenBraces--;

	585 }

	586 current++;

	587 }

	588

	589 if(noOpenBraces != 0 \|\| u_strchr(start+current, 0x005d /']'/) == NULL) {

	590 *status = U_ILLEGAL_ARGUMENT_ERROR;

	591 return NULL;

	592 }

	593 return uset_openPattern(start, current, status);

	594 }

	595

	596 /**

	597 * Reads an option and matches the option name with the predefined options. (Cas e-insensitive.)

	598 * @param start Pointer to the start UChar.

	599 * @param end Pointer to the last valid pointer beyond which the option will not extend.

	600 * @param optionArg Address of the pointer at which the options start (after the option name)

	601 * @return The index of the option, or -1 if the option is not valid.

	602 */

	603 static

	604 int32_t ucol_uprv_tok_readOption(const UChar start, const UChar end, const UCh ar **optionArg) {

	605 int32_t i = 0;

	606 ucol_uprv_tok_initData();

	607

	608 while(u_isWhitespace(start) \|\| uprv_isRuleWhiteSpace(start)) { /* eat whit espace */

	609 start++;

	610 }

	611 while(i < UTOK_OPTION_COUNT) {

	612 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op tionLen) == 0) {

	613 if(end - start > rulesOptions[i].optionLen) {

	614 optionArg = start+rulesOptions[i].optionLen; / End of option n ame; start of the options */

	615 while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(opt ionArg)) { /* eat whitespace */

	616 (*optionArg)++;

	617 }

	618 }

	619 break;

	620 }

	621 i++;

	622 }

	623 if(i == UTOK_OPTION_COUNT) {

	624 i = -1; // didn't find an option

	625 }

	626 return i;

	627 }

	628

	629

	630 static

	631 void ucol_tok_parseScriptReorder(UColTokenParser src, UErrorCode status) {

	632 int32_t codeCount = 0;

	633 int32_t codeIndex = 0;

	634 char conversion[64];

	635 int32_t tokenLength = 0;

	636 const UChar* space;

	637

	638 const UChar* current = src->current;

	639 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);

	640

	641 // eat leading whitespace

	642 while(current < end && u_isWhitespace(*current)) {

	643 current++;

	644 }

	645

	646 while(current < end) {

	647 space = u_memchr(current, 0x0020, end - current);

	648 space = space == 0 ? end : space;

	649 tokenLength = space - current;

	650 if (tokenLength < 4) {

	651 *status = U_INVALID_FORMAT_ERROR;

	652 return;

	653 }

	654 codeCount++;

	655 current += tokenLength;

	656 while(current < end && u_isWhitespace(current)) { / eat whitespace */

	657 ++current;

	658 }

	659 }

	660

	661 if (codeCount == 0) {

	662 *status = U_INVALID_FORMAT_ERROR;

	663 }

	664

	665 src->reorderCodesLength = codeCount;

	666 src->reorderCodes = (int32_t)uprv_malloc(codeCount sizeof(int32_t));

	667 current = src->current;

	668

	669 // eat leading whitespace

	670 while(current < end && u_isWhitespace(*current)) {

	671 current++;

	672 }

	673

	674 while(current < end) {

	675 space = u_memchr(current, 0x0020, end - current);

	676 space = space == 0 ? end : space;

	677 tokenLength = space - current;

	678 if (tokenLength < 4) {

	679 *status = U_ILLEGAL_ARGUMENT_ERROR;

	680 return;

	681 } else {

	682 u_UCharsToChars(current, conversion, tokenLength);

	683 conversion[tokenLength] = '\0';

	684 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);

	685 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

	686 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRI PT, conversion);

	687 }

	688 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

	689 *status = U_ILLEGAL_ARGUMENT_ERROR;

	690 }

	691 }

	692 codeIndex++;

	693 current += tokenLength;

	694 while(current < end && u_isWhitespace(current)) { / eat whitespace */

	695 ++current;

	696 }

	697 }

	698 }

	699

	700 // reads and conforms to various options in rules

	701 // end is the position of the first closing ']'

	702 // However, some of the options take an UnicodeSet definition

	703 // which needs to duplicate the closing ']'

	704 // for example: '[copy [\uAC00-\uD7FF]]'

	705 // These options will move end to the second ']' and the

	706 // caller will set the current to it.

	707 static

	708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser src, UErrorCode status) {

	709 const UChar* start = src->current;

	710 int32_t i = 0;

	711 int32_t j=0;

	712 const UChar *optionArg = NULL;

	713

	714 uint8_t result = 0;

	715

	716 start++; /skip opening '['/

	717 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);

	718 if(optionArg) {

	719 src->current = optionArg;

	720 }

	721

	722 if(i < 0) {

	723 *status = U_ILLEGAL_ARGUMENT_ERROR;

	724 } else {

	725 int32_t noOpenBraces = 1;

	726 switch(i) {

	727 case OPTION_ALTERNATE_HANDLING:

	728 case OPTION_FRENCH_COLLATION:

	729 case OPTION_CASE_LEVEL:

	730 case OPTION_CASE_FIRST:

	731 case OPTION_NORMALIZATION_MODE:

	732 case OPTION_HIRAGANA_QUATERNARY:

	733 case OPTION_STRENGTH:

	734 case OPTION_NUMERIC_COLLATION:

	735 if(optionArg) {

	736 for(j = 0; j<rulesOptions[i].subSize; j++) {

	737 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName , rulesOptions[i].subopts[j].subLen) == 0) {

	738 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].at tr, rulesOptions[i].subopts[j].attrVal);

	739 result = UCOL_TOK_SUCCESS;

	740 }

	741 }

	742 }

	743 if(result == 0) {

	744 *status = U_ILLEGAL_ARGUMENT_ERROR;

	745 }

	746 break;

	747 case OPTION_VARIABLE_TOP:

	748 result = UCOL_TOK_SUCCESS \| UCOL_TOK_VARIABLE_TOP;

	749 break;

	750 case OPTION_REARRANGE:

	751 result = UCOL_TOK_SUCCESS;

	752 break;

	753 case OPTION_BEFORE:

	754 if(optionArg) {

	755 for(j = 0; j<rulesOptions[i].subSize; j++) {

	756 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName , rulesOptions[i].subopts[j].subLen) == 0) {

	757 result = UCOL_TOK_SUCCESS \| (rulesOptions[i].subopts[j].attr Val + 1);

	758 }

	759 }

	760 }

	761 if(result == 0) {

	762 *status = U_ILLEGAL_ARGUMENT_ERROR;

	763 }

	764 break;

	765 case OPTION_TOP: /* we are going to have an array with structures of limit C Es */

	766 /* index to this array will be src->parsedToken.indirectIndex*/

	767 src->parsedToken.indirectIndex = 0;

	768 result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;

	769 break;

	770 case OPTION_FIRST:

	771 case OPTION_LAST: /* first, last */

	772 for(j = 0; j<rulesOptions[i].subSize; j++) {

	773 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru lesOptions[i].subopts[j].subLen) == 0) {

	774 // the calculation below assumes that OPTION_FIRST and OPTION_LA ST are at i and i+1 and that the first

	775 // element of indirect boundaries is reserved for top.

	776 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2 );

	777 result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;;

	778 }

	779 }

	780 if(result == 0) {

	781 *status = U_ILLEGAL_ARGUMENT_ERROR;

	782 }

	783 break;

	784 case OPTION_OPTIMIZE:

	785 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before no rmalization

	786 // we need to move end here

	787 src->current++; // skip opening brace

	788 while(src->current < src->end && noOpenBraces != 0) {

	789 if(*src->current == 0x005b) {

	790 noOpenBraces++;

	791 } else if(*src->current == 0x005D) { // closing brace

	792 noOpenBraces--;

	793 }

	794 src->current++;

	795 }

	796 result = UCOL_TOK_SUCCESS;

	797 break;

	798 case OPTION_SCRIPTREORDER:

	799 ucol_tok_parseScriptReorder(src, status);

	800 break;

	801 default:

	802 *status = U_UNSUPPORTED_ERROR;

	803 break;

	804 }

	805 }

	806 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->curren t));

	807 return result;

	808 }

	809

	810

	811 inline void ucol_tok_addToExtraCurrent(UColTokenParser src, const UChar stuff, int32_t len, UErrorCode *status) {

	812 if (stuff == NULL \|\| len <= 0) {

	813 return;

	814 }

	815 UnicodeString tempStuff(FALSE, stuff, len);

	816 if(src->extraCurrent+len >= src->extraEnd) {

	817 /* reallocate */

	818 if (stuff >= src->source && stuff <= src->end) {

	819 // Copy the "stuff" contents into tempStuff's own buffer.

	820 // UnicodeString is copy-on-write.

	821 if (len > 0) {

	822 tempStuff.setCharAt(0, tempStuff[0]);

	823 } else {

	824 tempStuff.remove();

	825 }

	826 }

	827 UChar newSrc = (UChar )uprv_realloc(src->source, (src->extraEnd-src->s ource)2sizeof(UChar));

	828 if(newSrc != NULL) {

	829 src->current = newSrc + (src->current - src->source);

	830 src->extraCurrent = newSrc + (src->extraCurrent - src->source);

	831 src->end = newSrc + (src->end - src->source);

	832 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;

	833 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);

	834 src->source = newSrc;

	835 } else {

	836 *status = U_MEMORY_ALLOCATION_ERROR;

	837 return;

	838 }

	839 }

	840 if(len == 1) {

	841 *src->extraCurrent++ = tempStuff[0];

	842 } else {

	843 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);

	844 src->extraCurrent += len;

	845 }

	846 }

	847

	848 inline UBool ucol_tok_doSetTop(UColTokenParser src, UErrorCode status) {

	849 /*

	850 top = TRUE;

	851 */

	852 UChar buff[5];

	853 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

	854 buff[0] = 0xFFFE;

	855 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta rtCE >> 16);

	856 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta rtCE & 0xFFFF);

	857 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {

	858 src->parsedToken.charsLen = 3;

	859 ucol_tok_addToExtraCurrent(src, buff, 3, status);

	860 } else {

	861 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex] .startContCE >> 16);

	862 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex] .startContCE & 0xFFFF);

	863 src->parsedToken.charsLen = 5;

	864 ucol_tok_addToExtraCurrent(src, buff, 5, status);

	865 }

	866 return TRUE;

	867 }

	868

	869 static UBool isCharNewLine(UChar c){

	870 switch(c){

	871 case 0x000A: /* LF */

	872 case 0x000D: /* CR */

	873 case 0x000C: /* FF */

	874 case 0x0085: /* NEL */

	875 case 0x2028: /* LS */

	876 case 0x2029: /* PS */

	877 return TRUE;

	878 default:

	879 return FALSE;

	880 }

	881 }

	882

	883 /*

	884 * This function is called several times when a range is processed. Each time, the next code point

	885 * is processed.

	886 * The following variables must be set before calling this function:

	887 * src->currentRangeCp: The current code point to process.

	888 * src->lastRangeCp: The last code point in the range.

	889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.

	890 */

	891 static const UChar*

	892 ucol_tok_processNextCodePointInRange(UColTokenParser *src,

	893 UErrorCode *status)

	894 {

	895 // Append current code point to source

	896 UChar buff[U16_MAX_LENGTH];

	897 uint32_t i = 0;

	898

	899 uint32_t nChars = U16_LENGTH(src->currentRangeCp);

	900 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

	901 src->parsedToken.charsLen = nChars;

	902

	903 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);

	904 ucol_tok_addToExtraCurrent(src, buff, nChars, status);

	905

	906 ++src->currentRangeCp;

	907 if (src->currentRangeCp > src->lastRangeCp) {

	908 src->inRange = FALSE;

	909

	910 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

	911 src->isStarred = FALSE;

	912 }

	913 } else {

	914 src->previousCp = src->currentRangeCp;

	915 }

	916 return src->current;

	917 }

	918

	919 /*

	920 * This function is called several times when a starred list is processed. Each time, the next code point

	921 * in the list is processed.

	922 * The following variables must be set before calling this function:

	923 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.

	924 * src->lastStarredCharIndex: Index to the last character in the list.

	925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.

	926 */

	927 static const UChar*

	928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)

	929 {

	930 // Extract the characters corresponding to the next code point.

	931 UChar32 cp;

	932 src->parsedToken.charsOffset = src->currentStarredCharIndex;

	933 int32_t prev = src->currentStarredCharIndex;

	934 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src- >source), cp);

	935 src->parsedToken.charsLen = src->currentStarredCharIndex - prev;

	936

	937 // When we are done parsing the starred string, turn the flag off so that

	938 // the normal processing is restored.

	939 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

	940 src->isStarred = FALSE;

	941 }

	942 src->previousCp = cp;

	943 return src->current;

	944 }

	945

	946 /*

	947 * Partially parses the next token, keeps the indices in src->parsedToken, and u pdates the counters.

	948 *

	949 * This routine parses and separates almost all tokens. The following are the sy ntax characters recognized.

	950 * # : Comment character

	951 * & : Reset operator

	952 * = : Equality

	953 * < : Primary collation

	954 * << : Secondary collation

	955 * <<< : Tertiary collation

	956 * ; : Secondary collation

	957 * , : Tertiary collation

	958 * / : Expansions

	959 * \| : Prefix

	960 * - : Range

	961

	962 * ! : Java Thai modifier, ignored

	963 * @ : French only

	964

	965 * [] : Options

	966 * '' : Quotes

	967 *

	968 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz

	969 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so & a*b-ex-z is equivalent to the above.

	970 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",

	971 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, wheth er in a range and the previous

	972 * character returned as cached so that the calling program can do further spli tting.

	973 */

	974 static const UChar*

	975 ucol_tok_parseNextTokenInternal(UColTokenParser *src,

	976 UBool startOfRules,

	977 UParseError *parseError,

	978 UErrorCode *status)

	979 {

	980 UBool variableTop = FALSE;

	981 UBool top = FALSE;

	982 UBool inChars = TRUE;

	983 UBool inQuote = FALSE;

	984 UBool wasInQuote = FALSE;

	985 uint8_t before = 0;

	986 UBool isEscaped = FALSE;

	987

	988 // TODO: replace these variables with src->parsedToken counterparts

	989 // no need to use them anymore since we have src->parsedToken.

	990 // Ideally, token parser would be a nice class... Once, when I have

	991 // more time (around 2020 probably).

	992 uint32_t newExtensionLen = 0;

	993 uint32_t extensionOffset = 0;

	994 uint32_t newStrength = UCOL_TOK_UNSET;

	995 UChar buff[10];

	996

	997 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;

	998 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;

	999 src->parsedToken.indirectIndex = 0;

	1000

	1001 while (src->current < src->end) {

	1002 UChar ch = *(src->current);

	1003

	1004 if (inQuote) {

	1005 if (ch == 0x0027/'\''/) {

	1006 inQuote = FALSE;

	1007 } else {

	1008 if ((src->parsedToken.charsLen == 0) \|\| inChars) {

	1009 if(src->parsedToken.charsLen == 0) {

	1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurr ent - src->source);

	1011 }

	1012 src->parsedToken.charsLen++;

	1013 } else {

	1014 if(newExtensionLen == 0) {

	1015 extensionOffset = (uint32_t)(src->extraCurrent - src->so urce);

	1016 }

	1017 newExtensionLen++;

	1018 }

	1019 }

	1020 }else if(isEscaped){

	1021 isEscaped =FALSE;

	1022 if (newStrength == UCOL_TOK_UNSET) {

	1023 *status = U_INVALID_FORMAT_ERROR;

	1024 syntaxError(src->source,(int32_t)(src->current-src->source),(int 32_t)(src->end-src->source),parseError);

	1025 DBG_FORMAT_ERROR

	1026 return NULL;

	1027 // enabling rules to start with non-tokens a < b

	1028 // newStrength = UCOL_TOK_RESET;

	1029 }

	1030 if(ch != 0x0000 && src->current != src->end) {

	1031 if (inChars) {

	1032 if(src->parsedToken.charsLen == 0) {

	1033 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);

	1034 }

	1035 src->parsedToken.charsLen++;

	1036 } else {

	1037 if(newExtensionLen == 0) {

	1038 extensionOffset = (uint32_t)(src->current - src->source) ;

	1039 }

	1040 newExtensionLen++;

	1041 }

	1042 }

	1043 }else {

	1044 if(!uprv_isRuleWhiteSpace(ch)) {

	1045 /* Sets the strength for this entry */

	1046 switch (ch) {

	1047 case 0x003D/'='/ :

	1048 if (newStrength != UCOL_TOK_UNSET) {

	1049 goto EndOfLoop;

	1050 }

	1051

	1052 /* if we start with strength, we'll reset to top */

	1053 if(startOfRules == TRUE) {

	1054 src->parsedToken.indirectIndex = 5;

	1055 top = ucol_tok_doSetTop(src, status);

	1056 newStrength = UCOL_TOK_RESET;

	1057 goto EndOfLoop;

	1058 }

	1059 newStrength = UCOL_IDENTICAL;

	1060 if((src->current+1) == 0x002A) {/''/

	1061 src->current++;

	1062 src->isStarred = TRUE;

	1063 }

	1064 break;

	1065

	1066 case 0x002C/','/:

	1067 if (newStrength != UCOL_TOK_UNSET) {

	1068 goto EndOfLoop;

	1069 }

	1070

	1071 /* if we start with strength, we'll reset to top */

	1072 if(startOfRules == TRUE) {

	1073 src->parsedToken.indirectIndex = 5;

	1074 top = ucol_tok_doSetTop(src, status);

	1075 newStrength = UCOL_TOK_RESET;

	1076 goto EndOfLoop;

	1077 }

	1078 newStrength = UCOL_TERTIARY;

	1079 break;

	1080

	1081 case 0x003B/';'/:

	1082 if (newStrength != UCOL_TOK_UNSET) {

	1083 goto EndOfLoop;

	1084 }

	1085

	1086 /* if we start with strength, we'll reset to top */

	1087 if(startOfRules == TRUE) {

	1088 src->parsedToken.indirectIndex = 5;

	1089 top = ucol_tok_doSetTop(src, status);

	1090 newStrength = UCOL_TOK_RESET;

	1091 goto EndOfLoop;

	1092 }

	1093 newStrength = UCOL_SECONDARY;

	1094 break;

	1095

	1096 case 0x003C/'<'/:

	1097 if (newStrength != UCOL_TOK_UNSET) {

	1098 goto EndOfLoop;

	1099 }

	1100

	1101 /* if we start with strength, we'll reset to top */

	1102 if(startOfRules == TRUE) {

	1103 src->parsedToken.indirectIndex = 5;

	1104 top = ucol_tok_doSetTop(src, status);

	1105 newStrength = UCOL_TOK_RESET;

	1106 goto EndOfLoop;

	1107 }

	1108 /* before this, do a scan to verify whether this is */

	1109 /* another strength */

	1110 if(*(src->current+1) == 0x003C) {

	1111 src->current++;

	1112 if(*(src->current+1) == 0x003C) {

	1113 src->current++; /* three in a row! */

	1114 newStrength = UCOL_TERTIARY;

	1115 } else { /* two in a row */

	1116 newStrength = UCOL_SECONDARY;

	1117 }

	1118 } else { /* just one */

	1119 newStrength = UCOL_PRIMARY;

	1120 }

	1121 if((src->current+1) == 0x002A) {/''/

	1122 src->current++;

	1123 src->isStarred = TRUE;

	1124 }

	1125 break;

	1126

	1127 case 0x0026/'&'/:

	1128 if (newStrength != UCOL_TOK_UNSET) {

	1129 /**/

	1130 goto EndOfLoop;

	1131 }

	1132

	1133 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */

	1134 break;

	1135

	1136 case 0x005b/'['/:

	1137 /* options - read an option, analyze it */

	1138 if(u_strchr(src->current, 0x005d /']'/) != NULL) {

	1139 uint8_t result = ucol_uprv_tok_readAndSetOption(src, sta tus);

	1140 if(U_SUCCESS(*status)) {

	1141 if(result & UCOL_TOK_TOP) {

	1142 if(newStrength == UCOL_TOK_RESET) {

	1143 top = ucol_tok_doSetTop(src, status);

	1144 if(before) { // This is a combination of bef ore and indirection like '&[before 2][first regular]<b'

	1145 src->parsedToken.charsLen+=2;

	1146 buff[0] = 0x002d;

	1147 buff[1] = before;

	1148 ucol_tok_addToExtraCurrent(src, buff, 2, status);

	1149 }

	1150

	1151 src->current++;

	1152 goto EndOfLoop;

	1153 } else {

	1154 *status = U_INVALID_FORMAT_ERROR;

	1155 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

	1156 DBG_FORMAT_ERROR

	1157 }

	1158 } else if(result & UCOL_TOK_VARIABLE_TOP) {

	1159 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {

	1160 variableTop = TRUE;

	1161 src->parsedToken.charsOffset = (uint32_t)(sr c->extraCurrent - src->source);

	1162 src->parsedToken.charsLen = 1;

	1163 buff[0] = 0xFFFF;

	1164 ucol_tok_addToExtraCurrent(src, buff, 1, sta tus);

	1165 src->current++;

	1166 goto EndOfLoop;

	1167 } else {

	1168 *status = U_INVALID_FORMAT_ERROR;

	1169 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

	1170 DBG_FORMAT_ERROR

	1171 }

	1172 } else if (result & UCOL_TOK_BEFORE){

	1173 if(newStrength == UCOL_TOK_RESET) {

	1174 before = result & UCOL_TOK_BEFORE;

	1175 } else {

	1176 *status = U_INVALID_FORMAT_ERROR;

	1177 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

	1178 DBG_FORMAT_ERROR

	1179 }

	1180 }

	1181 } else {

	1182 *status = U_INVALID_FORMAT_ERROR;

	1183 syntaxError(src->source,(int32_t)(src->current-src-> source),(int32_t)(src->end-src->source),parseError);

	1184 DBG_FORMAT_ERROR

	1185 return NULL;

	1186 }

	1187 }

	1188 break;

	1189 case 0x0021/! skip java thai modifier reordering/:

	1190 break;

	1191 case 0x002F/'/'/:

	1192 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */

	1193 inChars = FALSE; /* we're now processing expansion */

	1194 break;

	1195 case 0x005C /* back slash for escaped chars */:

	1196 isEscaped = TRUE;

	1197 break;

	1198 /* found a quote, we're gonna start copying */

	1199 case 0x0027/'\''/:

	1200 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal unt il we have a strength */

	1201 *status = U_INVALID_FORMAT_ERROR;

	1202 syntaxError(src->source,(int32_t)(src->current-src->source ),(int32_t)(src->end-src->source),parseError);

	1203 DBG_FORMAT_ERROR

	1204 return NULL;

	1205 // enabling rules to start with a non-token character a < b

	1206 // newStrength = UCOL_TOK_RESET;

	1207 }

	1208

	1209 inQuote = TRUE;

	1210

	1211 if(inChars) { /* we're doing characters */

	1212 if(wasInQuote == FALSE) {

	1213 src->parsedToken.charsOffset = (uint32_t)(src->extra Current - src->source);

	1214 }

	1215 if (src->parsedToken.charsLen != 0) {

	1216 ucol_tok_addToExtraCurrent(src, src->current - src-> parsedToken.charsLen, src->parsedToken.charsLen, status);

	1217 }

	1218 src->parsedToken.charsLen++;

	1219 } else { /* we're doing an expansion */

	1220 if(wasInQuote == FALSE) {

	1221 extensionOffset = (uint32_t)(src->extraCurrent - src ->source);

	1222 }

	1223 if (newExtensionLen != 0) {

	1224 ucol_tok_addToExtraCurrent(src, src->current - newEx tensionLen, newExtensionLen, status);

	1225 }

	1226 newExtensionLen++;

	1227 }

	1228

	1229 wasInQuote = TRUE;

	1230

	1231 ch = *(++(src->current));

	1232 if(ch == 0x0027) { /* copy the double quote */

	1233 ucol_tok_addToExtraCurrent(src, &ch, 1, status);

	1234 inQuote = FALSE;

	1235 }

	1236 break;

	1237

	1238 /* '@' is french only if the strength is not currently set * /

	1239 /* if it is, it's just a regular character in collation rule s */

	1240 case 0x0040/'@'/:

	1241 if (newStrength == UCOL_TOK_UNSET) {

	1242 src->opts->frenchCollation = UCOL_ON;

	1243 break;

	1244 }

	1245

	1246 case 0x007C /\|/: /* this means we have actually been reading p refix part */

	1247 // we want to store read characters to the prefix part and c ontinue reading

	1248 // the characters (proper way would be to restart reading th e chars, but in

	1249 // that case we would have to complicate the token hasher, w hich I do not

	1250 // intend to play with. Instead, we will do prefixes when pr efixes are due

	1251 // (before adding the elements).

	1252 src->parsedToken.prefixOffset = src->parsedToken.charsOffset ;

	1253 src->parsedToken.prefixLen = src->parsedToken.charsLen;

	1254

	1255 if(inChars) { /* we're doing characters */

	1256 if(wasInQuote == FALSE) {

	1257 src->parsedToken.charsOffset = (uint32_t)(src->extra Current - src->source);

	1258 }

	1259 if (src->parsedToken.charsLen != 0) {

	1260 ucol_tok_addToExtraCurrent(src, src->current - src-> parsedToken.charsLen, src->parsedToken.charsLen, status);

	1261 }

	1262 src->parsedToken.charsLen++;

	1263 }

	1264

	1265 wasInQuote = TRUE;

	1266

	1267 do {

	1268 ch = *(++(src->current));

	1269 // skip whitespace between '\|' and the character

	1270 } while (uprv_isRuleWhiteSpace(ch));

	1271 break;

	1272

	1273 //charsOffset = 0;

	1274 //newCharsLen = 0;

	1275 //break; // We want to store the whole prefix/character sequ ence. If we break

	1276 // the '\|' is going to get lost.

	1277

	1278 case 0x002D /-/: /* A range. */

	1279 if (newStrength != UCOL_TOK_UNSET) {

	1280 // While processing the pending token, the isStarred field

	1281 // is reset, so it needs to be saved for the next

	1282 // invocation.

	1283 src->savedIsStarred = src->isStarred;

	1284 goto EndOfLoop;

	1285 }

	1286 src->isStarred = src->savedIsStarred;

	1287

	1288 // Ranges are valid only in starred tokens.

	1289 if (!src->isStarred) {

	1290 *status = U_INVALID_FORMAT_ERROR;

	1291 syntaxError(src->source,(int32_t)(src->current-src->source) ,(int32_t)(src->end-src->source),parseError);

	1292 DBG_FORMAT_ERROR

	1293 return NULL;

	1294 }

	1295 newStrength = src->parsedToken.strength;

	1296 src->inRange = TRUE;

	1297 break;

	1298

	1299 case 0x0023 /#/: /* this is a comment, skip everything through the end of line */

	1300 do {

	1301 ch = *(++(src->current));

	1302 } while (!isCharNewLine(ch));

	1303

	1304 break;

	1305 default:

	1306 if (newStrength == UCOL_TOK_UNSET) {

	1307 *status = U_INVALID_FORMAT_ERROR;

	1308 syntaxError(src->source,(int32_t)(src->current-src->source ),(int32_t)(src->end-src->source),parseError);

	1309 DBG_FORMAT_ERROR

	1310 return NULL;

	1311 }

	1312

	1313 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {

	1314 *status = U_INVALID_FORMAT_ERROR;

	1315 syntaxError(src->source,(int32_t)(src->current-src->sour ce),(int32_t)(src->end-src->source),parseError);

	1316 DBG_FORMAT_ERROR

	1317 return NULL;

	1318 }

	1319

	1320 if(ch == 0x0000 && src->current+1 == src->end) {

	1321 break;

	1322 }

	1323

	1324 if (inChars) {

	1325 if(src->parsedToken.charsLen == 0) {

	1326 src->parsedToken.charsOffset = (uint32_t)(src->curre nt - src->source);

	1327 }

	1328 src->parsedToken.charsLen++;

	1329 } else {

	1330 if(newExtensionLen == 0) {

	1331 extensionOffset = (uint32_t)(src->current - src->sou rce);

	1332 }

	1333 newExtensionLen++;

	1334 }

	1335

	1336 break;

	1337 }

	1338 }

	1339 }

	1340

	1341 if(wasInQuote) {

	1342 if(ch != 0x27) {

	1343 if(inQuote \|\| !uprv_isRuleWhiteSpace(ch)) {

	1344 ucol_tok_addToExtraCurrent(src, &ch, 1, status);

	1345 }

	1346 }

	1347 }

	1348

	1349 src->current++;

	1350 }

	1351

	1352 EndOfLoop:

	1353 wasInQuote = FALSE;

	1354 if (newStrength == UCOL_TOK_UNSET) {

	1355 return NULL;

	1356 }

	1357

	1358 if (src->parsedToken.charsLen == 0 && top == FALSE) {

	1359 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(sr c->end-src->source),parseError);

	1360 *status = U_INVALID_FORMAT_ERROR;

	1361 DBG_FORMAT_ERROR

	1362 return NULL;

	1363 }

	1364

	1365 src->parsedToken.strength = newStrength;

	1366 src->parsedToken.extensionOffset = extensionOffset;

	1367 src->parsedToken.extensionLen = newExtensionLen;

	1368 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) \| (UCOL _TOK_TOP * (top?1:0)) \| before;

	1369

	1370 return src->current;

	1371 }

	1372

	1373 /*

	1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.

	1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.

	1376 *

	1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function doe s the following:

	1378 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. Thi s function separates

	1379 * it to separate tokens and returns one by one. In order to do that, the n ecessary states are

	1380 * cached as member variables of the token parser.

	1381 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes ch aracters up to the

	1382 * starting character as a single list token (which is separated into indivi dual characters here)

	1383 * and as another list token starting with the last character in the range. Before expanding it

	1384 * as a list of tokens, this function expands the range by filling the inter mediate characters and

	1385 * returns them one by one as separate tokens.

	1386 * Necessary checks are done for invalid combinations.

	1387 */

	1388 U_CAPI const UChar* U_EXPORT2

	1389 ucol_tok_parseNextToken(UColTokenParser *src,

	1390 UBool startOfRules,

	1391 UParseError *parseError,

	1392 UErrorCode *status)

	1393 {

	1394 const UChar *nextToken;

	1395

	1396 if (src->inRange) {

	1397 // We are not done processing a range. Continue it.

	1398 return ucol_tok_processNextCodePointInRange(src, status);

	1399 } else if (src->isStarred) {

	1400 // We are not done processing a starred token. Continue it.

	1401 return ucol_tok_processNextTokenInStarredList(src);

	1402 }

	1403

	1404 // Get the next token.

	1405 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, sta tus);

	1406

	1407 if (nextToken == NULL) {

	1408 return NULL;

	1409 }

	1410

	1411 if (src->inRange) {

	1412 // A new range has started.

	1413 // Check whether it is a chain of ranges with more than one hyphen.

	1414 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {

	1415 *status = U_INVALID_FORMAT_ERROR;

	1416 syntaxError(src->source,src->parsedToken.charsOffset-1,

	1417 src->parsedToken.charsOffset+src->parsedToken.charsLen, pars eError);

	1418 DBG_FORMAT_ERROR

	1419 return NULL;

	1420 }

	1421

	1422 // The current token indicates the second code point of the range.

	1423 // Process just that, and then proceed with the star.

	1424 src->currentStarredCharIndex = src->parsedToken.charsOffset;

	1425 U16_NEXT(src->source, src->currentStarredCharIndex,

	1426 (uint32_t)(src->end - src->source), src->lastRangeCp);

	1427 if (src->lastRangeCp <= src->previousCp) {

	1428 *status = U_INVALID_FORMAT_ERROR;

	1429 syntaxError(src->source,src->parsedToken.charsOffset-1,

	1430 src->parsedToken.charsOffset+src->parsedToken.charsLen,parse Error);

	1431 DBG_FORMAT_ERROR

	1432 return NULL;

	1433 }

	1434

	1435 // Set current range code point to process the range loop

	1436 src->currentRangeCp = src->previousCp + 1;

	1437

	1438 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken. charsLen - 1;

	1439

	1440 return ucol_tok_processNextCodePointInRange(src, status);

	1441 } else if (src->isStarred) {

	1442 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharInd ex_ so that

	1443 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be

	1444 // separated into several tokens and returned.

	1445 src->currentStarredCharIndex = src->parsedToken.charsOffset;

	1446 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken .charsLen - 1;

	1447

	1448 return ucol_tok_processNextTokenInStarredList(src);

	1449 } else {

	1450 // Set previous codepoint

	1451 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);

	1452 }

	1453 return nextToken;

	1454 }

	1455

	1456

	1457 /*

	1458 Processing Description

	1459 1 Build a ListList. Each list has a header, which contains two lists (positive

	1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and

	1461 reset may be null.

	1462 2 As you process, you keep a LAST pointer that points to the last token you

	1463 handled.

	1464

	1465 */

	1466

	1467 static UColToken ucol_tok_initAReset(UColTokenParser src, const UChar expand, uint32_t expandNext,

	1468 UParseError parseError, UErrorCode statu s)

	1469 {

	1470 if(src->resultLen == src->listCapacity) {

	1471 // Unfortunately, this won't work, as we store addresses of lhs in token

	1472 src->listCapacity *= 2;

	1473 src->lh = (UColTokListHeader )uprv_realloc(src->lh, src->listCapacitys izeof(UColTokListHeader));

	1474 if(src->lh == NULL) {

	1475 *status = U_MEMORY_ALLOCATION_ERROR;

	1476 return NULL;

	1477 }

	1478 }

	1479 /* do the reset thing */

	1480 UColToken sourceToken = (UColToken )uprv_malloc(sizeof(UColToken));

	1481 /* test for NULL */

	1482 if (sourceToken == NULL) {

	1483 *status = U_MEMORY_ALLOCATION_ERROR;

	1484 return NULL;

	1485 }

	1486 sourceToken->rulesToParseHdl = &(src->source);

	1487 sourceToken->source = src->parsedToken.charsLen << 24 \| src->parsedToken.cha rsOffset;

	1488 sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedTo ken.extensionOffset;

	1489

	1490 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

	1491 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffs et);

	1492

	1493 // keep the flags around so that we know about before

	1494 sourceToken->flags = src->parsedToken.flags;

	1495

	1496 if(src->parsedToken.prefixOffset != 0) {

	1497 // this is a syntax error

	1498 *status = U_INVALID_FORMAT_ERROR;

	1499 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken. charsOffset+src->parsedToken.charsLen,parseError);

	1500 DBG_FORMAT_ERROR

	1501 uprv_free(sourceToken);

	1502 return 0;

	1503 } else {

	1504 sourceToken->prefix = 0;

	1505 }

	1506

	1507 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should als o handle reverse */

	1508 sourceToken->strength = UCOL_TOK_RESET;

	1509 sourceToken->next = NULL;

	1510 sourceToken->previous = NULL;

	1511 sourceToken->noOfCEs = 0;

	1512 sourceToken->noOfExpCEs = 0;

	1513 sourceToken->listHeader = &src->lh[src->resultLen];

	1514

	1515 src->lh[src->resultLen].first = NULL;

	1516 src->lh[src->resultLen].last = NULL;

	1517 src->lh[src->resultLen].first = NULL;

	1518 src->lh[src->resultLen].last = NULL;

	1519

	1520 src->lh[src->resultLen].reset = sourceToken;

	1521

	1522 /*

	1523 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

	1524 First convert all expansions into normal form. Examples:

	1525 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *

	1526 d * ... into &x * c/y * d * ...

	1527 Note: reset values can never have expansions, although they can cause the

	1528 very next item to have one. They may be contractions, if they are found

	1529 earlier in the list.

	1530 */

	1531 *expandNext = 0;

	1532 if(expand != NULL) {

	1533 /* check to see if there is an expansion */

	1534 if(src->parsedToken.charsLen > 1) {

	1535 uint32_t resetCharsOffset;

	1536 resetCharsOffset = (uint32_t)(expand - src->source);

	1537 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOff set ) << 24) \| src->parsedToken.charsOffset;

	1538 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOf fset - resetCharsOffset)<<24) \| (resetCharsOffset);

	1539 }

	1540 }

	1541

	1542 src->resultLen++;

	1543

	1544 uhash_put(src->tailored, sourceToken, sourceToken, status);

	1545

	1546 return sourceToken;

	1547 }

	1548

	1549 static

	1550 inline UColToken getVirginBefore(UColTokenParser src, UColToken sourceToken, uint8_t strength, UParseError parseError, UErrorCode *status) {

	1551 if(U_FAILURE(*status)) {

	1552 return NULL;

	1553 }

	1554 /* this is a virgin before - we need to fish the anchor from the UCA */

	1555 collIterate s;

	1556 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;

	1557 uint32_t CE, SecondCE;

	1558 uint32_t invPos;

	1559 if(sourceToken != NULL) {

	1560 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFF FFF), 1, &s, status);

	1561 } else {

	1562 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /*charsOffset/, 1, &s, status);

	1563 }

	1564 if(U_FAILURE(*status)) {

	1565 return NULL;

	1566 }

	1567

	1568 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;

	1569 baseContCE = ucol_getNextCE(src->UCA, &s, status);

	1570 if(baseContCE == UCOL_NO_MORE_CES) {

	1571 baseContCE = 0;

	1572 }

	1573

	1574

	1575 UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);

	1576 uint32_t ch = 0;

	1577 uint32_t expandNext = 0;

	1578 UColToken key;

	1579

	1580 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseC E & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */

	1581 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) \| ((baseContCE & UCOL_PRI MARYMASK) >> 16);

	1582 uint32_t raw = uprv_uca_getRawFromImplicit(primary);

	1583 ch = uprv_uca_getCodePointFromRaw(raw-1);

	1584 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);

	1585 CE = (primaryCE & UCOL_PRIMARYMASK) \| 0x0505;

	1586 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) \| UCOL_CONTINUATION_MA RKER;

	1587

	1588 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc e);

	1589 *src->extraCurrent++ = 0xFFFE;

	1590 *src->extraCurrent++ = (UChar)ch;

	1591 src->parsedToken.charsLen++;

	1592

	1593 key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->pa rsedToken.charsOffset/*charsOffset/;

	1594 key.rulesToParseHdl = &(src->source);

	1595

	1596 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

	1597 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

	1598

	1599 if(sourceToken == NULL) {

	1600 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

	1601 if(isContinuation(SecondCE)) {

	1602 src->lh[src->resultLen].baseContCE = SecondCE;

	1603 } else {

	1604 src->lh[src->resultLen].baseContCE = 0;

	1605 }

	1606 src->lh[src->resultLen].nextCE = 0;

	1607 src->lh[src->resultLen].nextContCE = 0;

	1608 src->lh[src->resultLen].previousCE = 0;

	1609 src->lh[src->resultLen].previousContCE = 0;

	1610

	1611 src->lh[src->resultLen].indirect = FALSE;

	1612

	1613 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, s tatus);

	1614 }

	1615

	1616 } else {

	1617 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, str ength);

	1618

	1619 // we got the previous CE. Now we need to see if the difference between

	1620 // the two CEs is really of the requested strength.

	1621 // if it's a bigger difference (we asked for secondary and got primary), we

	1622 // need to modify the CE.

	1623 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < stre ngth) {

	1624 // adjust the strength

	1625 // now we are in the situation where our baseCE should actually be m odified in

	1626 // order to get the CE in the right position.

	1627 if(strength == UCOL_SECONDARY) {

	1628 CE = baseCE - 0x0200;

	1629 } else { // strength == UCOL_TERTIARY

	1630 CE = baseCE - 0x02;

	1631 }

	1632 if(baseContCE) {

	1633 if(strength == UCOL_SECONDARY) {

	1634 SecondCE = baseContCE - 0x0200;

	1635 } else { // strength == UCOL_TERTIARY

	1636 SecondCE = baseContCE - 0x02;

	1637 }

	1638 }

	1639 }

	1640

	1641 #if 0

	1642 // the code below relies on getting a code point from the inverse table, in order to be

	1643 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:

	1644 // 1. There are many code points that have the same CE

	1645 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2 ] are broken.

	1646 // Also, in case when there is no equivalent strength before an element, we have to actually

	1647 // construct one. For example, &[before 2]a << x won't result in x << a, because the element

	1648 // before a is a primary difference.

	1649

	1650 //uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->t able);

	1651

	1652

	1653 ch = CETable[3*invPos+2];

	1654

	1655 if((ch & UCOL_INV_SIZEMASK) != 0) {

	1656 uint16_t conts = (uint16_t )((uint8_t *)src->invUCA+src->invUCA->c onts);

	1657 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);

	1658 ch = conts[offset];

	1659 }

	1660

	1661 *src->extraCurrent++ = (UChar)ch;

	1662 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc e - 1);

	1663 src->parsedToken.charsLen = 1;

	1664

	1665 // We got an UCA before. However, this might have been tailored.

	1666 // example:

	1667 // &\u30ca = \u306a

	1668 // &[before 3]\u306a<<<\u306a\|\u309d

	1669

	1670

	1671 // uint32_t key = (newCharsLen << 24) \| charsOffset;

	1672 key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->pa rsedToken.charsOffset/*charsOffset/;

	1673 key.rulesToParseHdl = &(src->source);

	1674

	1675 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

	1676 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

	1677 #endif

	1678

	1679 // here is how it should be. The situation such as &[before 1]a < x, sho uld be

	1680 // resolved exactly as if we wrote &a > x.

	1681 // therefore, I don't really care if the UCA value before a has been cha nged.

	1682 // However, I do care if the strength between my element and the previou s element

	1683 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll

	1684 // have to construct the base CE.

	1685

	1686

	1687

	1688 // if we found a tailored thing, we have to use the UCA value and constr uct

	1689 // a new reset token with constructed name

	1690 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {

	1691 // character to which we want to anchor is already tailored.

	1692 // We need to construct a new token which will be the anchor

	1693 // point

	1694 //*(src->extraCurrent-1) = 0xFFFE;

	1695 //*src->extraCurrent++ = (UChar)ch;

	1696 // grab before

	1697 src->parsedToken.charsOffset -= 10;

	1698 src->parsedToken.charsLen += 10;

	1699 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

	1700 if(isContinuation(SecondCE)) {

	1701 src->lh[src->resultLen].baseContCE = SecondCE;

	1702 } else {

	1703 src->lh[src->resultLen].baseContCE = 0;

	1704 }

	1705 src->lh[src->resultLen].nextCE = 0;

	1706 src->lh[src->resultLen].nextContCE = 0;

	1707 src->lh[src->resultLen].previousCE = 0;

	1708 src->lh[src->resultLen].previousContCE = 0;

	1709

	1710 src->lh[src->resultLen].indirect = FALSE;

	1711

	1712 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, statu s);

	1713 //}

	1714 }

	1715

	1716 return sourceToken;

	1717

	1718 }

	1719

	1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser src, UParseError parseErro r, UErrorCode *status) {

	1721 UColToken *lastToken = NULL;

	1722 const UChar *parseEnd = NULL;

	1723 uint32_t expandNext = 0;

	1724 UBool variableTop = FALSE;

	1725 UBool top = FALSE;

	1726 uint16_t specs = 0;

	1727 UColTokListHeader *ListList = NULL;

	1728

	1729 src->parsedToken.strength = UCOL_TOK_UNSET;

	1730

	1731 ListList = src->lh;

	1732

	1733 if(U_FAILURE(*status)) {

	1734 return 0;

	1735 }

	1736 #ifdef DEBUG_FOR_CODE_POINTS

	1737 char filename[35];

	1738 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());

	1739 dfcp_fp = fopen(filename, "a");

	1740 fprintf(stdout, "Output is in the file %s.\n", filename);

	1741 #endif

	1742

	1743 #ifdef DEBUG_FOR_COLL_RULES

	1744 std::string s3;

	1745 UnicodeString(src->source).toUTF8String(s3);

	1746 std::cout << "src->source = " << s3 << std::endl;

	1747 #endif

	1748

	1749 while(src->current < src->end \|\| src->isStarred) {

	1750 src->parsedToken.prefixOffset = 0;

	1751

	1752 parseEnd = ucol_tok_parseNextToken(src,

	1753 (UBool)(lastToken == NULL),

	1754 parseError,

	1755 status);

	1756

	1757 specs = src->parsedToken.flags;

	1758

	1759

	1760 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);

	1761 top = ((specs & UCOL_TOK_TOP) != 0);

	1762

	1763 if(U_SUCCESS(*status) && parseEnd != NULL) {

	1764 UColToken *sourceToken = NULL;

	1765 //uint32_t key = 0;

	1766 uint32_t lastStrength = UCOL_TOK_UNSET;

	1767

	1768 if(lastToken != NULL ) {

	1769 lastStrength = lastToken->strength;

	1770 }

	1771

	1772 #ifdef DEBUG_FOR_CODE_POINTS

	1773 UChar32 cp;

	1774 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src ->extraEnd - src->source), cp);

	1775 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsed Token.strength);

	1776 #endif

	1777 //key = newCharsLen << 24 \| charsOffset;

	1778 UColToken key;

	1779 key.source = src->parsedToken.charsLen << 24 \| src->parsedToken.char sOffset;

	1780 key.rulesToParseHdl = &(src->source);

	1781

	1782 /* 4 Lookup each source in the CharsToToken map, and find a sourceT oken */

	1783 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

	1784

	1785 if(src->parsedToken.strength != UCOL_TOK_RESET) {

	1786 if(lastToken == NULL) { /* this means that rules haven't started properly */

	1787 *status = U_INVALID_FORMAT_ERROR;

	1788 syntaxError(src->source,0,(int32_t)(src->end-src->source),pa rseError);

	1789 DBG_FORMAT_ERROR

	1790 return 0;

	1791 }

	1792 /* 6 Otherwise (when relation != reset) */

	1793 if(sourceToken == NULL) {

	1794 /* If sourceToken is null, create new one, */

	1795 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));

	1796 /* test for NULL */

	1797 if (sourceToken == NULL) {

	1798 *status = U_MEMORY_ALLOCATION_ERROR;

	1799 return 0;

	1800 }

	1801 sourceToken->rulesToParseHdl = &(src->source);

	1802 sourceToken->source = src->parsedToken.charsLen << 24 \| src- >parsedToken.charsOffset;

	1803

	1804 sourceToken->debugSource = *(src->source + src->parsedToken. charsOffset);

	1805

	1806 sourceToken->prefix = src->parsedToken.prefixLen << 24 \| src ->parsedToken.prefixOffset;

	1807 sourceToken->debugPrefix = *(src->source + src->parsedToken. prefixOffset);

	1808

	1809 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */

	1810 sourceToken->next = NULL;

	1811 sourceToken->previous = NULL;

	1812 sourceToken->noOfCEs = 0;

	1813 sourceToken->noOfExpCEs = 0;

	1814 // keep the flags around so that we know about before

	1815 sourceToken->flags = src->parsedToken.flags;

	1816 uhash_put(src->tailored, sourceToken, sourceToken, status);

	1817 if(U_FAILURE(*status)) {

	1818 return 0;

	1819 }

	1820 } else {

	1821 /* we could have fished out a reset here */

	1822 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != s ourceToken) {

	1823 /* otherwise remove sourceToken from where it was. */

	1824 if(sourceToken->next != NULL) {

	1825 if(sourceToken->next->strength > sourceToken->streng th) {

	1826 sourceToken->next->strength = sourceToken->stren gth;

	1827 }

	1828 sourceToken->next->previous = sourceToken->previous;

	1829 } else {

	1830 sourceToken->listHeader->last = sourceToken->previou s;

	1831 }

	1832

	1833 if(sourceToken->previous != NULL) {

	1834 sourceToken->previous->next = sourceToken->next;

	1835 } else {

	1836 sourceToken->listHeader->first = sourceToken->next;

	1837 }

	1838 sourceToken->next = NULL;

	1839 sourceToken->previous = NULL;

	1840 }

	1841 }

	1842

	1843 sourceToken->strength = src->parsedToken.strength;

	1844 sourceToken->listHeader = lastToken->listHeader;

	1845

	1846 /*

	1847 1. Find the strongest strength in each list, and set strongestP and strongestN

	1848 accordingly in the headers.

	1849 */

	1850 if(lastStrength == UCOL_TOK_RESET

	1851 \|\| sourceToken->listHeader->first == 0) {

	1852 /* If LAST is a reset

	1853 insert sourceToken in the list. */

	1854 if(sourceToken->listHeader->first == 0) {

	1855 sourceToken->listHeader->first = sourceToken;

	1856 sourceToken->listHeader->last = sourceToken;

	1857 } else { /* we need to find a place for us */

	1858 /* and we'll get in front of the same strength */

	1859 if(sourceToken->listHeader->first->strength <= sourc eToken->strength) {

	1860 sourceToken->next = sourceToken->listHeader->fir st;

	1861 sourceToken->next->previous = sourceToken;

	1862 sourceToken->listHeader->first = sourceToken;

	1863 sourceToken->previous = NULL;

	1864 } else {

	1865 lastToken = sourceToken->listHeader->first;

	1866 while(lastToken->next != NULL && lastToken->next ->strength > sourceToken->strength) {

	1867 lastToken = lastToken->next;

	1868 }

	1869 if(lastToken->next != NULL) {

	1870 lastToken->next->previous = sourceToken;

	1871 } else {

	1872 sourceToken->listHeader->last = sourceToken;

	1873 }

	1874 sourceToken->previous = lastToken;

	1875 sourceToken->next = lastToken->next;

	1876 lastToken->next = sourceToken;

	1877 }

	1878 }

	1879 } else {

	1880 /* Otherwise (when LAST is not a reset)

	1881 if polarity (LAST) == polarity(relation), insert sourceT oken after LAST,

	1882 otherwise insert before.

	1883 when inserting after or before, search to the next posit ion with the same

	1884 strength in that direction. (This is called postpone ins ertion). */

	1885 if(sourceToken != lastToken) {

	1886 if(lastToken->polarity == sourceToken->polarity) {

	1887 while(lastToken->next != NULL && lastToken->next ->strength > sourceToken->strength) {

	1888 lastToken = lastToken->next;

	1889 }

	1890 sourceToken->previous = lastToken;

	1891 if(lastToken->next != NULL) {

	1892 lastToken->next->previous = sourceToken;

	1893 } else {

	1894 sourceToken->listHeader->last = sourceToken;

	1895 }

	1896

	1897 sourceToken->next = lastToken->next;

	1898 lastToken->next = sourceToken;

	1899 } else {

	1900 while(lastToken->previous != NULL && lastToken-> previous->strength > sourceToken->strength) {

	1901 lastToken = lastToken->previous;

	1902 }

	1903 sourceToken->next = lastToken;

	1904 if(lastToken->previous != NULL) {

	1905 lastToken->previous->next = sourceToken;

	1906 } else {

	1907 sourceToken->listHeader->first = sourceToken ;

	1908 }

	1909 sourceToken->previous = lastToken->previous;

	1910 lastToken->previous = sourceToken;

	1911 }

	1912 } else { /* repeated one thing twice in rules, stay with the stronger strength */

	1913 if(lastStrength < sourceToken->strength) {

	1914 sourceToken->strength = lastStrength;

	1915 }

	1916 }

	1917 }

	1918

	1919 /* if the token was a variable top, we're gonna put it in */

	1920 if(variableTop == TRUE && src->varTop == NULL) {

	1921 variableTop = FALSE;

	1922 src->varTop = sourceToken;

	1923 }

	1924

	1925 // Treat the expansions.

	1926 // There are two types of expansions: explicit (x / y) and r eset based propagating expansions

	1927 // (&abc * d * e <=> &ab * d / c * e / c)

	1928 // if both of them are in effect for a token, they are combi ned.

	1929

	1930 sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedToken.extensionOffset;

	1931

	1932 if(expandNext != 0) {

	1933 if(sourceToken->strength == UCOL_PRIMARY) { /* primary s trength kills off the implicit expansion */

	1934 expandNext = 0;

	1935 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */

	1936 sourceToken->expansion = expandNext;

	1937 } else { /* there is both explicit and implicit expansio n. We need to make a combination */

	1938 uprv_memcpy(src->extraCurrent, src->source + (expand Next & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));

	1939 uprv_memcpy(src->extraCurrent+(expandNext >> 24), sr c->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*size of(UChar));

	1940 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 \| (uint32_t)(src->extraCurrent - src->s ource));

	1941 src->extraCurrent += (expandNext >> 24) + src->parse dToken.extensionLen;

	1942 }

	1943 }

	1944

	1945 // This is just for debugging purposes

	1946 if(sourceToken->expansion != 0) {

	1947 sourceToken->debugExpansion = *(src->source + src->parse dToken.extensionOffset);

	1948 } else {

	1949 sourceToken->debugExpansion = 0;

	1950 }

	1951 // if the previous token was a reset before, the strength of this

	1952 // token must match the strength of before. Otherwise we hav e an

	1953 // undefined situation.

	1954 // In other words, we currently have a cludge which we use t o

	1955 // represent &a >> x. This is written as &[before 2]a << x.

	1956 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {

	1957 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BE FORE) - 1;

	1958 if(beforeStrength != sourceToken->strength) {

	1959 *status = U_INVALID_FORMAT_ERROR;

	1960 syntaxError(src->source,0,(int32_t)(src->end-src->so urce),parseError);

	1961 DBG_FORMAT_ERROR

	1962 return 0;

	1963 }

	1964 }

	1965 } else {

	1966 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {

	1967 /* if the previous token was also a reset, */

	1968 /this means that we have two consecutive resets /

	1969 /* and we want to remove the previous one if empty*/

	1970 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

	1971 src->resultLen--;

	1972 }

	1973 }

	1974

	1975 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */

	1976 uint32_t searchCharsLen = src->parsedToken.charsLen;

	1977 while(searchCharsLen > 1 && sourceToken == NULL) {

	1978 searchCharsLen--;

	1979 //key = searchCharsLen << 24 \| charsOffset;

	1980 UColToken key;

	1981 key.source = searchCharsLen << 24 \| src->parsedToken.cha rsOffset;

	1982 key.rulesToParseHdl = &(src->source);

	1983 sourceToken = (UColToken *)uhash_get(src->tailored, &key );

	1984 }

	1985 if(sourceToken != NULL) {

	1986 expandNext = (src->parsedToken.charsLen - searchCharsLen ) << 24 \| (src->parsedToken.charsOffset + searchCharsLen);

	1987 }

	1988 }

	1989

	1990 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */

	1991 if(top == FALSE) { /* there is no indirection */

	1992 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

	1993 if(sourceToken != NULL && sourceToken->strength != UCOL_ TOK_RESET) {

	1994 /* this is a before that is already ordered in the U CA - so we need to get the previous with good strength */

	1995 while(sourceToken->strength > strength && sourceToke n->previous != NULL) {

	1996 sourceToken = sourceToken->previous;

	1997 }

	1998 /* here, either we hit the strength or NULL */

	1999 if(sourceToken->strength == strength) {

	2000 if(sourceToken->previous != NULL) {

	2001 sourceToken = sourceToken->previous;

	2002 } else { /* start of list */

	2003 sourceToken = sourceToken->listHeader->reset ;

	2004 }

	2005 } else { /* we hit NULL */

	2006 /* we should be doing the else part */

	2007 sourceToken = sourceToken->listHeader->reset;

	2008 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);

	2009 }

	2010 } else {

	2011 sourceToken = getVirginBefore(src, sourceToken, stre ngth, parseError, status);

	2012 }

	2013 } else { /* this is both before and indirection */

	2014 top = FALSE;

	2015 ListList[src->resultLen].previousCE = 0;

	2016 ListList[src->resultLen].previousContCE = 0;

	2017 ListList[src->resultLen].indirect = TRUE;

	2018 /* we need to do slightly more work. we need to get the baseCE using the */

	2019 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */

	2020 /* in ucol_bld */

	2021 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

	2022 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToke n.indirectIndex].startCE;

	2023 uint32_t baseContCE = ucolIndirectBoundaries[src->parsed Token.indirectIndex].startContCE;//&0xFFFFFF3F;

	2024 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

	2025

	2026 UCAConstants consts = (UCAConstants )((uint8_t *)src-> UCA->image + src->UCA->image->UCAConsts);

	2027 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICI T_MIN<<24) &&

	2028 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICI T_MAX<<24) ) { /* implicits - */

	2029 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) \| ((b aseContCE & UCOL_PRIMARYMASK) >> 16);

	2030 uint32_t raw = uprv_uca_getRawFromImplicit(primary);

	2031 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw -1);

	2032 CE = (primaryCE & UCOL_PRIMARYMASK) \| 0x0505;

	2033 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) \| UCOL_CONTINUATION_MARKER;

	2034 } else {

	2035 /int32_t invPos = ucol_inv_getPrevCE(baseCE, baseCo ntCE, &CE, &SecondCE, strength);/

	2036 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &Se condCE, strength);

	2037 }

	2038

	2039 ListList[src->resultLen].baseCE = CE;

	2040 ListList[src->resultLen].baseContCE = SecondCE;

	2041 ListList[src->resultLen].nextCE = 0;

	2042 ListList[src->resultLen].nextContCE = 0;

	2043

	2044 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p arseError, status);

	2045 }

	2046 }

	2047

	2048

	2049 /* 5 If the relation is a reset:

	2050 If sourceToken is null

	2051 Create new list, create new sourceToken, make the baseCE from so urce, put

	2052 the sourceToken in ListHeader of the new list */

	2053 if(sourceToken == NULL) {

	2054 /*

	2055 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

	2056 First convert all expansions into normal form. Examples:

	2057 If "xy" doesn't occur earlier in the list or in the UCA, con vert &xy * c *

	2058 d * ... into &x * c/y * d * ...

	2059 Note: reset values can never have expansions, although they can cause the

	2060 very next item to have one. They may be contractions, if the y are found

	2061 earlier in the list.

	2062 */

	2063 if(top == FALSE) {

	2064 collIterate s;

	2065 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

	2066

	2067 uprv_init_collIterate(src->UCA, src->source+src->parsedT oken.charsOffset, src->parsedToken.charsLen, &s, status);

	2068

	2069 CE = ucol_getNextCE(src->UCA, &s, status);

	2070 const UChar *expand = s.pos;

	2071 SecondCE = ucol_getNextCE(src->UCA, &s, status);

	2072

	2073 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;

	2074 if(isContinuation(SecondCE)) {

	2075 ListList[src->resultLen].baseContCE = SecondCE;

	2076 } else {

	2077 ListList[src->resultLen].baseContCE = 0;

	2078 }

	2079 ListList[src->resultLen].nextCE = 0;

	2080 ListList[src->resultLen].nextContCE = 0;

	2081 ListList[src->resultLen].previousCE = 0;

	2082 ListList[src->resultLen].previousContCE = 0;

	2083 ListList[src->resultLen].indirect = FALSE;

	2084 sourceToken = ucol_tok_initAReset(src, expand, &expandNe xt, parseError, status);

	2085 } else { /* top == TRUE */

	2086 /* just use the supplied values */

	2087 top = FALSE;

	2088 ListList[src->resultLen].previousCE = 0;

	2089 ListList[src->resultLen].previousContCE = 0;

	2090 ListList[src->resultLen].indirect = TRUE;

	2091 ListList[src->resultLen].baseCE = ucolIndirectBoundaries [src->parsedToken.indirectIndex].startCE;

	2092 ListList[src->resultLen].baseContCE = ucolIndirectBounda ries[src->parsedToken.indirectIndex].startContCE;

	2093 ListList[src->resultLen].nextCE = ucolIndirectBoundaries [src->parsedToken.indirectIndex].limitCE;

	2094 ListList[src->resultLen].nextContCE = ucolIndirectBounda ries[src->parsedToken.indirectIndex].limitContCE;

	2095

	2096 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p arseError, status);

	2097

	2098 }

	2099 } else { /* reset to something already in rules */

	2100 top = FALSE;

	2101 }

	2102 }

	2103 /* 7 After all this, set LAST to point to sourceToken, and goto ste p 3. */

	2104 lastToken = sourceToken;

	2105 } else {

	2106 if(U_FAILURE(*status)) {

	2107 return 0;

	2108 }

	2109 }

	2110 }

	2111 #ifdef DEBUG_FOR_CODE_POINTS

	2112 fclose(dfcp_fp);

	2113 #endif

	2114

	2115

	2116 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

	2117 src->resultLen--;

	2118 }

	2119 return src->resultLen;

	2120 }

	2121

	2122 const UChar* ucol_tok_getRulesFromBundle(

	2123 void* /context/,

	2124 const char* locale,

	2125 const char* type,

	2126 int32_t* pLength,

	2127 UErrorCode* status)

	2128 {

	2129 const UChar* rules = NULL;

	2130 UResourceBundle* bundle;

	2131 UResourceBundle* collations;

	2132 UResourceBundle* collation;

	2133

	2134 *pLength = 0;

	2135

	2136 bundle = ures_open(U_ICUDATA_COLL, locale, status);

	2137 if(U_SUCCESS(*status)){

	2138 collations = ures_getByKey(bundle, "collations", NULL, status);

	2139 if(U_SUCCESS(*status)){

	2140 collation = ures_getByKey(collations, type, NULL, status);

	2141 if(U_SUCCESS(*status)){

	2142 rules = ures_getStringByKey(collation, "Sequence", pLength, stat us);

	2143 if(U_FAILURE(*status)){

	2144 *pLength = 0;

	2145 rules = NULL;

	2146 }

	2147 ures_close(collation);

	2148 }

	2149 ures_close(collations);

	2150 }

	2151 }

	2152

	2153 ures_close(bundle);

	2154

	2155 return rules;

	2156 }

	2157

	2158 void ucol_tok_initTokenList(

	2159 UColTokenParser *src,

	2160 const UChar *rules,

	2161 uint32_t rulesLength,

	2162 const UCollator *UCA,

	2163 GetCollationRulesFunction importFunc,

	2164 void* context,

	2165 UErrorCode *status) {

	2166 U_NAMESPACE_USE

	2167

	2168 uint32_t nSize = 0;

	2169 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);

	2170

	2171 bool needToDeallocRules = false;

	2172

	2173 if(U_FAILURE(*status)) {

	2174 return;

	2175 }

	2176

	2177 // set everything to zero, so that we can clean up gracefully

	2178 uprv_memset(src, 0, sizeof(UColTokenParser));

	2179

	2180 // first we need to find options that don't like to be normalized,

	2181 // like copy and remove...

	2182 //const UChar *openBrace = rules;

	2183 int32_t optionNumber = -1;

	2184 const UChar *setStart = NULL;

	2185 uint32_t i = 0;

	2186 while(i < rulesLength) {

	2187 if(rules[i] == 0x005B) { // '[': start of an option

	2188 /* Gets the following:

	2189 optionNumber: The index of the option.

	2190 setStart: The pointer at which the option arguments start.

	2191 */

	2192 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength , &setStart);

	2193

	2194 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tail oring */

	2195 // [optimize]

	2196 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule s+rulesLength, status);

	2197 if(U_SUCCESS(*status)) {

	2198 if(src->copySet == NULL) {

	2199 src->copySet = newSet;

	2200 } else {

	2201 uset_addAll(src->copySet, newSet);

	2202 uset_close(newSet);

	2203 }

	2204 } else {

	2205 return;

	2206 }

	2207 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {

	2208 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule s+rulesLength, status);

	2209 if(U_SUCCESS(*status)) {

	2210 if(src->removeSet == NULL) {

	2211 src->removeSet = newSet;

	2212 } else {

	2213 uset_addAll(src->removeSet, newSet);

	2214 uset_close(newSet);

	2215 }

	2216 } else {

	2217 return;

	2218 }

	2219 } else if(optionNumber == OPTION_IMPORT){

	2220 // [import <collation-name>]

	2221

	2222 // Find the address of the closing ].

	2223 UChar* import_end = u_strchr(setStart, 0x005D);

	2224 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);

	2225 // Ignore trailing whitespace.

	2226 while(uprv_isRuleWhiteSpace(*(import_end-1))) {

	2227 --import_end;

	2228 }

	2229

	2230 int32_t optionLength = (int32_t)(import_end - setStart);

	2231 char option[50];

	2232 if(optionLength >= (int32_t)sizeof(option)) {

	2233 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2234 return;

	2235 }

	2236 u_UCharsToChars(setStart, option, optionLength);

	2237 option[optionLength] = 0;

	2238

	2239 *status = U_ZERO_ERROR;

	2240 char locale[50];

	2241 int32_t templ;

	2242 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &te mpl, status);

	2243 if(U_FAILURE(*status)) {

	2244 *status = U_ILLEGAL_ARGUMENT_ERROR;

	2245 return;

	2246 }

	2247

	2248 char type[50];

	2249 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)siz eof(type), status) <= 0 \|\|

	2250 U_FAILURE(*status)

	2251 ) {

	2252 *status = U_ZERO_ERROR;

	2253 uprv_strcpy(type, "standard");

	2254 }

	2255

	2256 // TODO: Use public functions when available, see ticket #8134.

	2257 char keywords = (char )locale_getKeywordsStart(locale);

	2258 if(keywords != NULL) {

	2259 *keywords = 0;

	2260 }

	2261

	2262 int32_t importRulesLength = 0;

	2263 const UChar* importRules = importFunc(context, locale, type, &im portRulesLength, status);

	2264

	2265 #ifdef DEBUG_FOR_COLL_RULES

	2266 std::string s;

	2267 UnicodeString(importRules).toUTF8String(s);

	2268 std::cout << "Import rules = " << s << std::endl;

	2269 #endif

	2270

	2271 // Add the length of the imported rules to length of the origina l rules,

	2272 // and subtract the length of the import option.

	2273 uint32_t newRulesLength = rulesLength + importRulesLength - (opt ionEndOffset - i);

	2274

	2275 UChar* newRules = (UChar)uprv_malloc(newRulesLengthsizeof(UCha r));

	2276

	2277 #ifdef DEBUG_FOR_COLL_RULES

	2278 std::string s1;

	2279 UnicodeString(rules).toUTF8String(s1);

	2280 std::cout << "Original rules = " << s1 << std::endl;

	2281 #endif

	2282

	2283

	2284 // Copy the section of the original rules leading up to the impo rt

	2285 uprv_memcpy(newRules, rules, i*sizeof(UChar));

	2286 // Copy the imported rules

	2287 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UC har));

	2288 // Copy the rest of the original rules (minus the import option itself)

	2289 uprv_memcpy(newRules+i+importRulesLength,

	2290 rules+optionEndOffset,

	2291 (rulesLength-optionEndOffset)*sizeof(UChar));

	2292

	2293 #ifdef DEBUG_FOR_COLL_RULES

	2294 std::string s2;

	2295 UnicodeString(newRules).toUTF8String(s2);

	2296 std::cout << "Resulting rules = " << s2 << std::endl;

	2297 #endif

	2298

	2299 if(needToDeallocRules){

	2300 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free

	2301 uprv_free((void*)rules);

	2302 }

	2303 needToDeallocRules = true;

	2304 rules = newRules;

	2305 rulesLength = newRulesLength;

	2306

	2307 estimatedSize += importRulesLength*2;

	2308

	2309 // First character of the new rules needs to be processed

	2310 i--;

	2311 }

	2312 }

	2313 //openBrace++;

	2314 i++;

	2315 }

	2316

	2317 src->source = (UChar )uprv_malloc(estimatedSizesizeof(UChar));

	2318 /* test for NULL */

	2319 if (src->source == NULL) {

	2320 *status = U_MEMORY_ALLOCATION_ERROR;

	2321 return;

	2322 }

	2323 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));

	2324 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estim atedSize, status);

	2325 if(nSize > estimatedSize \|\| *status == U_BUFFER_OVERFLOW_ERROR) {

	2326 *status = U_ZERO_ERROR;

	2327 src->source = (UChar )uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_R ULE_SPACE_SIZE)sizeof(UChar));

	2328 /* test for NULL */

	2329 if (src->source == NULL) {

	2330 *status = U_MEMORY_ALLOCATION_ERROR;

	2331 return;

	2332 }

	2333 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, n Size+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);

	2334 }

	2335 if(needToDeallocRules){

	2336 // if needToDeallocRules is set, then we allocated rules, so it's safe t o cast and free

	2337 uprv_free((void*)rules);

	2338 }

	2339

	2340

	2341 src->current = src->source;

	2342 src->end = src->source+nSize;

	2343 src->sourceCurrent = src->source;

	2344 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule str ing so that option scanning works correctly

	2345 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SP ACE_SIZE;

	2346 src->varTop = NULL;

	2347 src->UCA = UCA;

	2348 src->invUCA = ucol_initInverseUCA(status);

	2349 src->parsedToken.charsLen = 0;

	2350 src->parsedToken.charsOffset = 0;

	2351 src->parsedToken.extensionLen = 0;

	2352 src->parsedToken.extensionOffset = 0;

	2353 src->parsedToken.prefixLen = 0;

	2354 src->parsedToken.prefixOffset = 0;

	2355 src->parsedToken.flags = 0;

	2356 src->parsedToken.strength = UCOL_TOK_UNSET;

	2357 src->buildCCTabFlag = FALSE;

	2358 src->isStarred = FALSE;

	2359 src->inRange = FALSE;

	2360 src->lastRangeCp = 0;

	2361 src->previousCp = 0;

	2362

	2363 if(U_FAILURE(*status)) {

	2364 return;

	2365 }

	2366 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, stat us);

	2367 if(U_FAILURE(*status)) {

	2368 return;

	2369 }

	2370 uhash_setValueDeleter(src->tailored, uhash_freeBlock);

	2371

	2372 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));

	2373 /* test for NULL */

	2374 if (src->opts == NULL) {

	2375 *status = U_MEMORY_ALLOCATION_ERROR;

	2376 return;

	2377 }

	2378

	2379 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

	2380

	2381 src->lh = 0;

	2382 src->listCapacity = 1024;

	2383 src->lh = (UColTokListHeader )uprv_malloc(src->listCapacitysizeof(UColTokL istHeader));

	2384 //Test for NULL

	2385 if (src->lh == NULL) {

	2386 *status = U_MEMORY_ALLOCATION_ERROR;

	2387 return;

	2388 }

	2389 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));

	2390 src->resultLen = 0;

	2391

	2392 UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);

	2393

	2394 // UCOL_RESET_TOP_VALUE

	2395 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IM PLICIT);

	2396 // UCOL_FIRST_PRIMARY_IGNORABLE

	2397 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);

	2398 // UCOL_LAST_PRIMARY_IGNORABLE

	2399 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);

	2400 // UCOL_FIRST_SECONDARY_IGNORABLE

	2401 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);

	2402 // UCOL_LAST_SECONDARY_IGNORABLE

	2403 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);

	2404 // UCOL_FIRST_TERTIARY_IGNORABLE

	2405 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);

	2406 // UCOL_LAST_TERTIARY_IGNORABLE

	2407 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);

	2408 // UCOL_FIRST_VARIABLE

	2409 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);

	2410 // UCOL_LAST_VARIABLE

	2411 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);

	2412 // UCOL_FIRST_NON_VARIABLE

	2413 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);

	2414 // UCOL_LAST_NON_VARIABLE

	2415 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_I MPLICIT);

	2416 // UCOL_FIRST_IMPLICIT

	2417 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);

	2418 // UCOL_LAST_IMPLICIT

	2419 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAIL ING);

	2420 // UCOL_FIRST_TRAILING

	2421 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);

	2422 // UCOL_LAST_TRAILING

	2423 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);

	2424 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);

	2425 }

	2426

	2427

	2428 void ucol_tok_closeTokenList(UColTokenParser *src) {

	2429 if(src->copySet != NULL) {

	2430 uset_close(src->copySet);

	2431 }

	2432 if(src->removeSet != NULL) {

	2433 uset_close(src->removeSet);

	2434 }

	2435 if(src->tailored != NULL) {

	2436 uhash_close(src->tailored);

	2437 }

	2438 if(src->lh != NULL) {

	2439 uprv_free(src->lh);

	2440 }

	2441 if(src->source != NULL) {

	2442 uprv_free(src->source);

	2443 }

	2444 if(src->opts != NULL) {

	2445 uprv_free(src->opts);

	2446 }

	2447 if (src->reorderCodes != NULL) {

	2448 uprv_free(src->reorderCodes);

	2449 }

	2450 }

	2451

	2452 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/ucol_tok.h ('k') | icu46/source/i18n/ucol_wgt.h » ('j') | no next file with comments »