icu46/source/i18n/ucol_tok.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/ucol_tok.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/ucol_tok.cpp

===================================================================

--- icu46/source/i18n/ucol_tok.cpp (revision 0)

+++ icu46/source/i18n/ucol_tok.cpp (revision 0)

@@ -0,0 +1,2452 @@

+/*

+*******************************************************************************

+* file name: ucol_tok.cpp

+* encoding: US-ASCII

+* tab size: 8 (not used)

+* indentation:4

+* created 02/22/2001

+* created by: Vladimir Weinstein

+* This module reads a tailoring rule string and produces a list of

+* tokens that will be turned into collation elements

+*/

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_COLLATION

+#include "unicode/uscript.h"

+#include "unicode/ustring.h"

+#include "unicode/uchar.h"

+#include "unicode/uniset.h"

+#include "cmemory.h"

+#include "cstring.h"

+#include "ucol_bld.h"

+#include "ucol_tok.h"

+#include "ulocimp.h"

+#include "uresimp.h"

+#include "util.h"

+// Define this only for debugging.

+// #define DEBUG_FOR_COLL_RULES 1

+#ifdef DEBUG_FOR_COLL_RULES

+#include <iostream>

+#endif

+U_NAMESPACE_USE

+U_CDECL_BEGIN

+static int32_t U_CALLCONV

+uhash_hashTokens(const UHashTok k)

+ int32_t hash = 0;

+ //uint32_t key = (uint32_t)k.integer;

+ UColToken *key = (UColToken *)k.pointer;

+ if (key != 0) {

+ int32_t len = (key->source & 0xFF000000)>>24;

+ int32_t inc = ((len - 32) / 32) + 1;

+ const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);

+ const UChar *limit = p + len;

+ while (p<limit) {

+ hash = (hash * 37) + *p;

+ p += inc;

+ }

+ return hash;

+static UBool U_CALLCONV

+uhash_compareTokens(const UHashTok key1, const UHashTok key2)

+ //uint32_t p1 = (uint32_t) key1.integer;

+ //uint32_t p2 = (uint32_t) key2.integer;

+ UColToken *p1 = (UColToken *)key1.pointer;

+ UColToken *p2 = (UColToken *)key2.pointer;

+ const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);

+ const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);

+ uint32_t s1L = ((p1->source & 0xFF000000) >> 24);

+ uint32_t s2L = ((p2->source & 0xFF000000) >> 24);

+ const UChar *end = s1+s1L-1;

+ if (p1 == p2) {

+ return TRUE;

+ }

+ if (p1->source == 0 || p2->source == 0) {

+ return FALSE;

+ }

+ if(s1L != s2L) {

+ return FALSE;

+ }

+ if(p1->source == p2->source) {

+ return TRUE;

+ }

+ while((s1 < end) && *s1 == *s2) {

+ ++s1;

+ ++s2;

+ }

+ if(*s1 == *s2) {

+ return TRUE;

+ } else {

+ return FALSE;

+ }

+U_CDECL_END

+/*

+ * Debug messages used to pinpoint where a format error occurred.

+ * A better way is to include context-sensitive information in syntaxError() function.

+ *

+ * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR

+ * in the compile line.

+ */

+/* #define DEBUG_FOR_FORMAT_ERROR 1 */

+#ifdef DEBUG_FOR_FORMAT_ERROR

+#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}

+#else

+#define DBG_FORMAT_ERROR

+#endif

+/*

+ * Controls debug messages so that the output can be compared before and after a

+ * big change. Prints the information of every code point that comes out of the

+ * collation parser and its strength into a file. When a big change in format

+ * happens, the files before and after the change should be identical.

+ *

+ * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS

+ * in the compile line.

+ */

+// #define DEBUG_FOR_CODE_POINTS 1

+#ifdef DEBUG_FOR_CODE_POINTS

+ FILE* dfcp_fp = NULL;

+#endif

+/*static inline void U_CALLCONV

+uhash_freeBlockWrapper(void *obj) {

+ uhash_freeBlock(obj);

+}*/

+typedef struct {

+ uint32_t startCE;

+ uint32_t startContCE;

+ uint32_t limitCE;

+ uint32_t limitContCE;

+} indirectBoundaries;

+/* these values are used for finding CE values for indirect positioning. */

+/* Indirect positioning is a mechanism for allowing resets on symbolic */

+/* values. It only works for resets and you cannot tailor indirect names */

+/* An indirect name can define either an anchor point or a range. An */

+/* anchor point behaves in exactly the same way as a code point in reset */

+/* would, except that it cannot be tailored. A range (we currently only */

+/* know for the [top] range will explicitly set the upper bound for */

+/* generated CEs, thus allowing for better control over how many CEs can */

+/* be squeezed between in the range without performance penalty. */

+/* In that respect, we use [top] for tailoring of locales that use CJK */

+/* characters. Other indirect values are currently a pure convenience, */

+/* they can be used to assure that the CEs will be always positioned in */

+/* the same place relative to a point with known properties (e.g. first */

+/* primary ignorable). */

+static indirectBoundaries ucolIndirectBoundaries[15];

+/*

+static indirectBoundaries ucolIndirectBoundaries[11] = {

+{ UCOL_RESET_TOP_VALUE, 0,

+UCOL_NEXT_TOP_VALUE, 0 },

+{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,

+0, 0 },

+{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,

+0, 0 },

+{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,

+0, 0 },

+{ UCOL_LAST_SECONDARY_IGNORABLE, 0,

+0, 0 },

+{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,

+0, 0 },

+{ UCOL_LAST_TERTIARY_IGNORABLE, 0,

+0, 0 },

+{ UCOL_FIRST_VARIABLE, 0,

+0, 0 },

+{ UCOL_LAST_VARIABLE, 0,

+0, 0 },

+{ UCOL_FIRST_NON_VARIABLE, 0,

+0, 0 },

+{ UCOL_LAST_NON_VARIABLE, 0,

+0, 0 },

+};

+*/

+static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {

+ // Set values for the top - TODO: once we have values for all the indirects, we are going

+ // to initalize here.

+ ucolIndirectBoundaries[indexR].startCE = start[0];

+ ucolIndirectBoundaries[indexR].startContCE = start[1];

+ if(end) {

+ ucolIndirectBoundaries[indexR].limitCE = end[0];

+ ucolIndirectBoundaries[indexR].limitContCE = end[1];

+ } else {

+ ucolIndirectBoundaries[indexR].limitCE = 0;

+ ucolIndirectBoundaries[indexR].limitContCE = 0;

+ }

+static inline

+void syntaxError(const UChar* rules,

+ int32_t pos,

+ int32_t rulesLen,

+ UParseError* parseError)

+ parseError->offset = pos;

+ parseError->line = 0 ; /* we are not using line numbers */

+ // for pre-context

+ int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));

+ int32_t stop = pos;

+ u_memcpy(parseError->preContext,rules+start,stop-start);

+ //null terminate the buffer

+ parseError->preContext[stop-start] = 0;

+ //for post-context

+ start = pos+1;

+ stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :

+ rulesLen;

+ if(start < stop) {

+ u_memcpy(parseError->postContext,rules+start,stop-start);

+ //null terminate the buffer

+ parseError->postContext[stop-start]= 0;

+ } else {

+ parseError->postContext[0] = 0;

+ }

+static

+void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {

+ switch(attrib) {

+ case UCOL_HIRAGANA_QUATERNARY_MODE:

+ opts->hiraganaQ = value;

+ break;

+ case UCOL_FRENCH_COLLATION:

+ opts->frenchCollation = value;

+ break;

+ case UCOL_ALTERNATE_HANDLING:

+ opts->alternateHandling = value;

+ break;

+ case UCOL_CASE_FIRST:

+ opts->caseFirst = value;

+ break;

+ case UCOL_CASE_LEVEL:

+ opts->caseLevel = value;

+ break;

+ case UCOL_NORMALIZATION_MODE:

+ opts->normalizationMode = value;

+ break;

+ case UCOL_STRENGTH:

+ opts->strength = value;

+ break;

+ case UCOL_NUMERIC_COLLATION:

+ opts->numericCollation = value;

+ break;

+ case UCOL_ATTRIBUTE_COUNT:

+ default:

+ break;

+ }

+#define UTOK_OPTION_COUNT 22

+static UBool didInit = FALSE;

+/* we can be strict, or we can be lenient */

+/* I'd surely be lenient with the option arguments */

+/* maybe even with options */

+U_STRING_DECL(suboption_00, "non-ignorable", 13);

+U_STRING_DECL(suboption_01, "shifted", 7);

+U_STRING_DECL(suboption_02, "lower", 5);

+U_STRING_DECL(suboption_03, "upper", 5);

+U_STRING_DECL(suboption_04, "off", 3);

+U_STRING_DECL(suboption_05, "on", 2);

+U_STRING_DECL(suboption_06, "1", 1);

+U_STRING_DECL(suboption_07, "2", 1);

+U_STRING_DECL(suboption_08, "3", 1);

+U_STRING_DECL(suboption_09, "4", 1);

+U_STRING_DECL(suboption_10, "I", 1);

+U_STRING_DECL(suboption_11, "primary", 7);

+U_STRING_DECL(suboption_12, "secondary", 9);

+U_STRING_DECL(suboption_13, "tertiary", 8);

+U_STRING_DECL(suboption_14, "variable", 8);

+U_STRING_DECL(suboption_15, "regular", 7);

+U_STRING_DECL(suboption_16, "implicit", 8);

+U_STRING_DECL(suboption_17, "trailing", 8);

+U_STRING_DECL(option_00, "undefined", 9);

+U_STRING_DECL(option_01, "rearrange", 9);

+U_STRING_DECL(option_02, "alternate", 9);

+U_STRING_DECL(option_03, "backwards", 9);

+U_STRING_DECL(option_04, "variable top", 12);

+U_STRING_DECL(option_05, "top", 3);

+U_STRING_DECL(option_06, "normalization", 13);

+U_STRING_DECL(option_07, "caseLevel", 9);

+U_STRING_DECL(option_08, "caseFirst", 9);

+U_STRING_DECL(option_09, "scriptOrder", 11);

+U_STRING_DECL(option_10, "charsetname", 11);

+U_STRING_DECL(option_11, "charset", 7);

+U_STRING_DECL(option_12, "before", 6);

+U_STRING_DECL(option_13, "hiraganaQ", 9);

+U_STRING_DECL(option_14, "strength", 8);

+U_STRING_DECL(option_15, "first", 5);

+U_STRING_DECL(option_16, "last", 4);

+U_STRING_DECL(option_17, "optimize", 8);

+U_STRING_DECL(option_18, "suppressContractions", 20);

+U_STRING_DECL(option_19, "numericOrdering", 15);

+U_STRING_DECL(option_20, "import", 6);

+U_STRING_DECL(option_21, "reorder", 7);

+/*

+[last variable] last variable value

+[last primary ignorable] largest CE for primary ignorable

+[last secondary ignorable] largest CE for secondary ignorable

+[last tertiary ignorable] largest CE for tertiary ignorable

+[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)

+*/

+static const ucolTokSuboption alternateSub[2] = {

+ {suboption_00, 13, UCOL_NON_IGNORABLE},

+ {suboption_01, 7, UCOL_SHIFTED}

+};

+static const ucolTokSuboption caseFirstSub[3] = {

+ {suboption_02, 5, UCOL_LOWER_FIRST},

+ {suboption_03, 5, UCOL_UPPER_FIRST},

+ {suboption_04, 3, UCOL_OFF},

+};

+static const ucolTokSuboption onOffSub[2] = {

+ {suboption_04, 3, UCOL_OFF},

+ {suboption_05, 2, UCOL_ON}

+};

+static const ucolTokSuboption frenchSub[1] = {

+ {suboption_07, 1, UCOL_ON}

+};

+static const ucolTokSuboption beforeSub[3] = {

+ {suboption_06, 1, UCOL_PRIMARY},

+ {suboption_07, 1, UCOL_SECONDARY},

+ {suboption_08, 1, UCOL_TERTIARY}

+};

+static const ucolTokSuboption strengthSub[5] = {

+ {suboption_06, 1, UCOL_PRIMARY},

+ {suboption_07, 1, UCOL_SECONDARY},

+ {suboption_08, 1, UCOL_TERTIARY},

+ {suboption_09, 1, UCOL_QUATERNARY},

+ {suboption_10, 1, UCOL_IDENTICAL},

+};

+static const ucolTokSuboption firstLastSub[7] = {

+ {suboption_11, 7, UCOL_PRIMARY},

+ {suboption_12, 9, UCOL_PRIMARY},

+ {suboption_13, 8, UCOL_PRIMARY},

+ {suboption_14, 8, UCOL_PRIMARY},

+ {suboption_15, 7, UCOL_PRIMARY},

+ {suboption_16, 8, UCOL_PRIMARY},

+ {suboption_17, 8, UCOL_PRIMARY},

+};

+enum OptionNumber {

+ OPTION_ALTERNATE_HANDLING = 0,

+ OPTION_FRENCH_COLLATION,

+ OPTION_CASE_LEVEL,

+ OPTION_CASE_FIRST,

+ OPTION_NORMALIZATION_MODE,

+ OPTION_HIRAGANA_QUATERNARY,

+ OPTION_STRENGTH,

+ OPTION_NUMERIC_COLLATION,

+ OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

+ OPTION_VARIABLE_TOP,

+ OPTION_REARRANGE,

+ OPTION_BEFORE,

+ OPTION_TOP,

+ OPTION_FIRST,

+ OPTION_LAST,

+ OPTION_OPTIMIZE,

+ OPTION_SUPPRESS_CONTRACTIONS,

+ OPTION_UNDEFINED,

+ OPTION_SCRIPT_ORDER,

+ OPTION_CHARSET_NAME,

+ OPTION_CHARSET,

+ OPTION_IMPORT,

+ OPTION_SCRIPTREORDER

+} ;

+static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {

+ /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */

+ /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */

+ /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */

+ /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */

+ /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */

+ /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */

+ /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */

+ /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/

+ /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */

+ /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */

+ /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */

+ /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */

+ /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */

+ /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */

+ /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */

+ /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */

+ /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */

+ /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */

+ /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */

+ /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */

+ /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */

+ /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */

+};

+static

+int32_t u_strncmpNoCase(const UChar *s1,

+ const UChar *s2,

+ int32_t n)

+ if(n > 0) {

+ int32_t rc;

+ for(;;) {

+ rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);

+ if(rc != 0 || *s1 == 0 || --n == 0) {

+ return rc;

+ }

+ ++s1;

+ ++s2;

+ }

+ return 0;

+static

+void ucol_uprv_tok_initData() {

+ if(!didInit) {

+ U_STRING_INIT(suboption_00, "non-ignorable", 13);

+ U_STRING_INIT(suboption_01, "shifted", 7);

+ U_STRING_INIT(suboption_02, "lower", 5);

+ U_STRING_INIT(suboption_03, "upper", 5);

+ U_STRING_INIT(suboption_04, "off", 3);

+ U_STRING_INIT(suboption_05, "on", 2);

+ U_STRING_INIT(suboption_06, "1", 1);

+ U_STRING_INIT(suboption_07, "2", 1);

+ U_STRING_INIT(suboption_08, "3", 1);

+ U_STRING_INIT(suboption_09, "4", 1);

+ U_STRING_INIT(suboption_10, "I", 1);

+ U_STRING_INIT(suboption_11, "primary", 7);

+ U_STRING_INIT(suboption_12, "secondary", 9);

+ U_STRING_INIT(suboption_13, "tertiary", 8);

+ U_STRING_INIT(suboption_14, "variable", 8);

+ U_STRING_INIT(suboption_15, "regular", 7);

+ U_STRING_INIT(suboption_16, "implicit", 8);

+ U_STRING_INIT(suboption_17, "trailing", 8);

+ U_STRING_INIT(option_00, "undefined", 9);

+ U_STRING_INIT(option_01, "rearrange", 9);

+ U_STRING_INIT(option_02, "alternate", 9);

+ U_STRING_INIT(option_03, "backwards", 9);

+ U_STRING_INIT(option_04, "variable top", 12);

+ U_STRING_INIT(option_05, "top", 3);

+ U_STRING_INIT(option_06, "normalization", 13);

+ U_STRING_INIT(option_07, "caseLevel", 9);

+ U_STRING_INIT(option_08, "caseFirst", 9);

+ U_STRING_INIT(option_09, "scriptOrder", 11);

+ U_STRING_INIT(option_10, "charsetname", 11);

+ U_STRING_INIT(option_11, "charset", 7);

+ U_STRING_INIT(option_12, "before", 6);

+ U_STRING_INIT(option_13, "hiraganaQ", 9);

+ U_STRING_INIT(option_14, "strength", 8);

+ U_STRING_INIT(option_15, "first", 5);

+ U_STRING_INIT(option_16, "last", 4);

+ U_STRING_INIT(option_17, "optimize", 8);

+ U_STRING_INIT(option_18, "suppressContractions", 20);

+ U_STRING_INIT(option_19, "numericOrdering", 15);

+ U_STRING_INIT(option_20, "import ", 6);

+ U_STRING_INIT(option_21, "reorder", 7);

+ didInit = TRUE;

+ }

+// This function reads basic options to set in the runtime collator

+// used by data driven tests. Should not support build time options

+U_CAPI const UChar * U_EXPORT2

+ucol_tok_getNextArgument(const UChar *start, const UChar *end,

+ UColAttribute *attrib, UColAttributeValue *value,

+ UErrorCode *status)

+ uint32_t i = 0;

+ int32_t j=0;

+ UBool foundOption = FALSE;

+ const UChar *optionArg = NULL;

+ ucol_uprv_tok_initData();

+ while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */

+ start++;

+ }

+ if(start >= end) {

+ return NULL;

+ }

+ /* skip opening '[' */

+ if(*start == 0x005b) {

+ start++;

+ } else {

+ *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['

+ return NULL;

+ }

+ while(i < UTOK_OPTION_COUNT) {

+ if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {

+ foundOption = TRUE;

+ if(end - start > rulesOptions[i].optionLen) {

+ optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */

+ while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */

+ optionArg++;

+ }

+ break;

+ }

+ i++;

+ }

+ if(!foundOption) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ if(optionArg) {

+ for(j = 0; j<rulesOptions[i].subSize; j++) {

+ if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {

+ //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);

+ *attrib = rulesOptions[i].attr;

+ *value = rulesOptions[i].subopts[j].attrVal;

+ optionArg += rulesOptions[i].subopts[j].subLen;

+ while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */

+ optionArg++;

+ }

+ if(*optionArg == 0x005d) {

+ optionArg++;

+ return optionArg;

+ } else {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+static

+USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {

+ while(*start != 0x005b) { /* advance while we find the first '[' */

+ start++;

+ }

+ // now we need to get a balanced set of '[]'. The problem is that a set can have

+ // many, and *end point to the first closing '['

+ int32_t noOpenBraces = 1;

+ int32_t current = 1; // skip the opening brace

+ while(start+current < end && noOpenBraces != 0) {

+ if(start[current] == 0x005b) {

+ noOpenBraces++;

+ } else if(start[current] == 0x005D) { // closing brace

+ noOpenBraces--;

+ }

+ current++;

+ }

+ if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ return uset_openPattern(start, current, status);

+/**

+ * Reads an option and matches the option name with the predefined options. (Case-insensitive.)

+ * @param start Pointer to the start UChar.

+ * @param end Pointer to the last valid pointer beyond which the option will not extend.

+ * @param optionArg Address of the pointer at which the options start (after the option name)

+ * @return The index of the option, or -1 if the option is not valid.

+ */

+static

+int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {

+ int32_t i = 0;

+ ucol_uprv_tok_initData();

+ while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */

+ start++;

+ }

+ while(i < UTOK_OPTION_COUNT) {

+ if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {

+ if(end - start > rulesOptions[i].optionLen) {

+ *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */

+ while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */

+ (*optionArg)++;

+ }

+ break;

+ }

+ i++;

+ }

+ if(i == UTOK_OPTION_COUNT) {

+ i = -1; // didn't find an option

+ }

+ return i;

+static

+void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {

+ int32_t codeCount = 0;

+ int32_t codeIndex = 0;

+ char conversion[64];

+ int32_t tokenLength = 0;

+ const UChar* space;

+ const UChar* current = src->current;

+ const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);

+ // eat leading whitespace

+ while(current < end && u_isWhitespace(*current)) {

+ current++;

+ }

+ while(current < end) {

+ space = u_memchr(current, 0x0020, end - current);

+ space = space == 0 ? end : space;

+ tokenLength = space - current;

+ if (tokenLength < 4) {

+ *status = U_INVALID_FORMAT_ERROR;

+ return;

+ }

+ codeCount++;

+ current += tokenLength;

+ while(current < end && u_isWhitespace(*current)) { /* eat whitespace */

+ ++current;

+ }

+ if (codeCount == 0) {

+ *status = U_INVALID_FORMAT_ERROR;

+ }

+ src->reorderCodesLength = codeCount;

+ src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));

+ current = src->current;

+ // eat leading whitespace

+ while(current < end && u_isWhitespace(*current)) {

+ current++;

+ }

+ while(current < end) {

+ space = u_memchr(current, 0x0020, end - current);

+ space = space == 0 ? end : space;

+ tokenLength = space - current;

+ if (tokenLength < 4) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ } else {

+ u_UCharsToChars(current, conversion, tokenLength);

+ conversion[tokenLength] = '\0';

+ src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);

+ if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

+ src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);

+ }

+ if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ codeIndex++;

+ current += tokenLength;

+ while(current < end && u_isWhitespace(*current)) { /* eat whitespace */

+ ++current;

+ }

+// reads and conforms to various options in rules

+// end is the position of the first closing ']'

+// However, some of the options take an UnicodeSet definition

+// which needs to duplicate the closing ']'

+// for example: '[copy [\uAC00-\uD7FF]]'

+// These options will move end to the second ']' and the

+// caller will set the current to it.

+static

+uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {

+ const UChar* start = src->current;

+ int32_t i = 0;

+ int32_t j=0;

+ const UChar *optionArg = NULL;

+ uint8_t result = 0;

+ start++; /*skip opening '['*/

+ i = ucol_uprv_tok_readOption(start, src->end, &optionArg);

+ if(optionArg) {

+ src->current = optionArg;

+ }

+ if(i < 0) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ } else {

+ int32_t noOpenBraces = 1;

+ switch(i) {

+ case OPTION_ALTERNATE_HANDLING:

+ case OPTION_FRENCH_COLLATION:

+ case OPTION_CASE_LEVEL:

+ case OPTION_CASE_FIRST:

+ case OPTION_NORMALIZATION_MODE:

+ case OPTION_HIRAGANA_QUATERNARY:

+ case OPTION_STRENGTH:

+ case OPTION_NUMERIC_COLLATION:

+ if(optionArg) {

+ for(j = 0; j<rulesOptions[i].subSize; j++) {

+ if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {

+ ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);

+ result = UCOL_TOK_SUCCESS;

+ }

+ if(result == 0) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ break;

+ case OPTION_VARIABLE_TOP:

+ result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;

+ break;

+ case OPTION_REARRANGE:

+ result = UCOL_TOK_SUCCESS;

+ break;

+ case OPTION_BEFORE:

+ if(optionArg) {

+ for(j = 0; j<rulesOptions[i].subSize; j++) {

+ if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {

+ result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);

+ }

+ if(result == 0) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ break;

+ case OPTION_TOP: /* we are going to have an array with structures of limit CEs */

+ /* index to this array will be src->parsedToken.indirectIndex*/

+ src->parsedToken.indirectIndex = 0;

+ result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;

+ break;

+ case OPTION_FIRST:

+ case OPTION_LAST: /* first, last */

+ for(j = 0; j<rulesOptions[i].subSize; j++) {

+ if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {

+ // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first

+ // element of indirect boundaries is reserved for top.

+ src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);

+ result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;

+ }

+ if(result == 0) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ }

+ break;

+ case OPTION_OPTIMIZE:

+ case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization

+ // we need to move end here

+ src->current++; // skip opening brace

+ while(src->current < src->end && noOpenBraces != 0) {

+ if(*src->current == 0x005b) {

+ noOpenBraces++;

+ } else if(*src->current == 0x005D) { // closing brace

+ noOpenBraces--;

+ }

+ src->current++;

+ }

+ result = UCOL_TOK_SUCCESS;

+ break;

+ case OPTION_SCRIPTREORDER:

+ ucol_tok_parseScriptReorder(src, status);

+ break;

+ default:

+ *status = U_UNSUPPORTED_ERROR;

+ break;

+ }

+ src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));

+ return result;

+inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {

+ if (stuff == NULL || len <= 0) {

+ return;

+ }

+ UnicodeString tempStuff(FALSE, stuff, len);

+ if(src->extraCurrent+len >= src->extraEnd) {

+ /* reallocate */

+ if (stuff >= src->source && stuff <= src->end) {

+ // Copy the "stuff" contents into tempStuff's own buffer.

+ // UnicodeString is copy-on-write.

+ if (len > 0) {

+ tempStuff.setCharAt(0, tempStuff[0]);

+ } else {

+ tempStuff.remove();

+ }

+ UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));

+ if(newSrc != NULL) {

+ src->current = newSrc + (src->current - src->source);

+ src->extraCurrent = newSrc + (src->extraCurrent - src->source);

+ src->end = newSrc + (src->end - src->source);

+ src->extraEnd = newSrc + (src->extraEnd-src->source)*2;

+ src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);

+ src->source = newSrc;

+ } else {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ if(len == 1) {

+ *src->extraCurrent++ = tempStuff[0];

+ } else {

+ u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);

+ src->extraCurrent += len;

+ }

+inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {

+ /*

+ top = TRUE;

+ */

+ UChar buff[5];

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ buff[0] = 0xFFFE;

+ buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);

+ buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);

+ if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {

+ src->parsedToken.charsLen = 3;

+ ucol_tok_addToExtraCurrent(src, buff, 3, status);

+ } else {

+ buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);

+ buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);

+ src->parsedToken.charsLen = 5;

+ ucol_tok_addToExtraCurrent(src, buff, 5, status);

+ }

+ return TRUE;

+static UBool isCharNewLine(UChar c){

+ switch(c){

+ case 0x000A: /* LF */

+ case 0x000D: /* CR */

+ case 0x000C: /* FF */

+ case 0x0085: /* NEL */

+ case 0x2028: /* LS */

+ case 0x2029: /* PS */

+ return TRUE;

+ default:

+ return FALSE;

+ }

+/*

+ * This function is called several times when a range is processed. Each time, the next code point

+ * is processed.

+ * The following variables must be set before calling this function:

+ * src->currentRangeCp: The current code point to process.

+ * src->lastRangeCp: The last code point in the range.

+ * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.

+ */

+static const UChar*

+ucol_tok_processNextCodePointInRange(UColTokenParser *src,

+ UErrorCode *status)

+ // Append current code point to source

+ UChar buff[U16_MAX_LENGTH];

+ uint32_t i = 0;

+ uint32_t nChars = U16_LENGTH(src->currentRangeCp);

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ src->parsedToken.charsLen = nChars;

+ U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);

+ ucol_tok_addToExtraCurrent(src, buff, nChars, status);

+ ++src->currentRangeCp;

+ if (src->currentRangeCp > src->lastRangeCp) {

+ src->inRange = FALSE;

+ if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

+ src->isStarred = FALSE;

+ }

+ } else {

+ src->previousCp = src->currentRangeCp;

+ }

+ return src->current;

+/*

+ * This function is called several times when a starred list is processed. Each time, the next code point

+ * in the list is processed.

+ * The following variables must be set before calling this function:

+ * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.

+ * src->lastStarredCharIndex: Index to the last character in the list.

+ * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.

+ */

+static const UChar*

+ucol_tok_processNextTokenInStarredList(UColTokenParser *src)

+ // Extract the characters corresponding to the next code point.

+ UChar32 cp;

+ src->parsedToken.charsOffset = src->currentStarredCharIndex;

+ int32_t prev = src->currentStarredCharIndex;

+ U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);

+ src->parsedToken.charsLen = src->currentStarredCharIndex - prev;

+ // When we are done parsing the starred string, turn the flag off so that

+ // the normal processing is restored.

+ if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

+ src->isStarred = FALSE;

+ }

+ src->previousCp = cp;

+ return src->current;

+/*

+ * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.

+ *

+ * This routine parses and separates almost all tokens. The following are the syntax characters recognized.

+ * # : Comment character

+ * & : Reset operator

+ * = : Equality

+ * < : Primary collation

+ * << : Secondary collation

+ * <<< : Tertiary collation

+ * ; : Secondary collation

+ * , : Tertiary collation

+ * / : Expansions

+ * | : Prefix

+ * - : Range

+ * ! : Java Thai modifier, ignored

+ * @ : French only

+ * [] : Options

+ * '' : Quotes

+ *

+ * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz

+ * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.

+ * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",

+ * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous

+ * character returned as cached so that the calling program can do further splitting.

+ */

+static const UChar*

+ucol_tok_parseNextTokenInternal(UColTokenParser *src,

+ UBool startOfRules,

+ UParseError *parseError,

+ UErrorCode *status)

+ UBool variableTop = FALSE;

+ UBool top = FALSE;

+ UBool inChars = TRUE;

+ UBool inQuote = FALSE;

+ UBool wasInQuote = FALSE;

+ uint8_t before = 0;

+ UBool isEscaped = FALSE;

+ // TODO: replace these variables with src->parsedToken counterparts

+ // no need to use them anymore since we have src->parsedToken.

+ // Ideally, token parser would be a nice class... Once, when I have

+ // more time (around 2020 probably).

+ uint32_t newExtensionLen = 0;

+ uint32_t extensionOffset = 0;

+ uint32_t newStrength = UCOL_TOK_UNSET;

+ UChar buff[10];

+ src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;

+ src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;

+ src->parsedToken.indirectIndex = 0;

+ while (src->current < src->end) {

+ UChar ch = *(src->current);

+ if (inQuote) {

+ if (ch == 0x0027/*'\''*/) {

+ inQuote = FALSE;

+ } else {

+ if ((src->parsedToken.charsLen == 0) || inChars) {

+ if(src->parsedToken.charsLen == 0) {

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ }

+ src->parsedToken.charsLen++;

+ } else {

+ if(newExtensionLen == 0) {

+ extensionOffset = (uint32_t)(src->extraCurrent - src->source);

+ }

+ newExtensionLen++;

+ }

+ }else if(isEscaped){

+ isEscaped =FALSE;

+ if (newStrength == UCOL_TOK_UNSET) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ // enabling rules to start with non-tokens a < b

+ // newStrength = UCOL_TOK_RESET;

+ }

+ if(ch != 0x0000 && src->current != src->end) {

+ if (inChars) {

+ if(src->parsedToken.charsLen == 0) {

+ src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);

+ }

+ src->parsedToken.charsLen++;

+ } else {

+ if(newExtensionLen == 0) {

+ extensionOffset = (uint32_t)(src->current - src->source);

+ }

+ newExtensionLen++;

+ }

+ }else {

+ if(!uprv_isRuleWhiteSpace(ch)) {

+ /* Sets the strength for this entry */

+ switch (ch) {

+ case 0x003D/*'='*/ :

+ if (newStrength != UCOL_TOK_UNSET) {

+ goto EndOfLoop;

+ }

+ /* if we start with strength, we'll reset to top */

+ if(startOfRules == TRUE) {

+ src->parsedToken.indirectIndex = 5;

+ top = ucol_tok_doSetTop(src, status);

+ newStrength = UCOL_TOK_RESET;

+ goto EndOfLoop;

+ }

+ newStrength = UCOL_IDENTICAL;

+ if(*(src->current+1) == 0x002A) {/*'*'*/

+ src->current++;

+ src->isStarred = TRUE;

+ }

+ break;

+ case 0x002C/*','*/:

+ if (newStrength != UCOL_TOK_UNSET) {

+ goto EndOfLoop;

+ }

+ /* if we start with strength, we'll reset to top */

+ if(startOfRules == TRUE) {

+ src->parsedToken.indirectIndex = 5;

+ top = ucol_tok_doSetTop(src, status);

+ newStrength = UCOL_TOK_RESET;

+ goto EndOfLoop;

+ }

+ newStrength = UCOL_TERTIARY;

+ break;

+ case 0x003B/*';'*/:

+ if (newStrength != UCOL_TOK_UNSET) {

+ goto EndOfLoop;

+ }

+ /* if we start with strength, we'll reset to top */

+ if(startOfRules == TRUE) {

+ src->parsedToken.indirectIndex = 5;

+ top = ucol_tok_doSetTop(src, status);

+ newStrength = UCOL_TOK_RESET;

+ goto EndOfLoop;

+ }

+ newStrength = UCOL_SECONDARY;

+ break;

+ case 0x003C/*'<'*/:

+ if (newStrength != UCOL_TOK_UNSET) {

+ goto EndOfLoop;

+ }

+ /* if we start with strength, we'll reset to top */

+ if(startOfRules == TRUE) {

+ src->parsedToken.indirectIndex = 5;

+ top = ucol_tok_doSetTop(src, status);

+ newStrength = UCOL_TOK_RESET;

+ goto EndOfLoop;

+ }

+ /* before this, do a scan to verify whether this is */

+ /* another strength */

+ if(*(src->current+1) == 0x003C) {

+ src->current++;

+ if(*(src->current+1) == 0x003C) {

+ src->current++; /* three in a row! */

+ newStrength = UCOL_TERTIARY;

+ } else { /* two in a row */

+ newStrength = UCOL_SECONDARY;

+ }

+ } else { /* just one */

+ newStrength = UCOL_PRIMARY;

+ }

+ if(*(src->current+1) == 0x002A) {/*'*'*/

+ src->current++;

+ src->isStarred = TRUE;

+ }

+ break;

+ case 0x0026/*'&'*/:

+ if (newStrength != UCOL_TOK_UNSET) {

+ /**/

+ goto EndOfLoop;

+ }

+ newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */

+ break;

+ case 0x005b/*'['*/:

+ /* options - read an option, analyze it */

+ if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {

+ uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);

+ if(U_SUCCESS(*status)) {

+ if(result & UCOL_TOK_TOP) {

+ if(newStrength == UCOL_TOK_RESET) {

+ top = ucol_tok_doSetTop(src, status);

+ if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'

+ src->parsedToken.charsLen+=2;

+ buff[0] = 0x002d;

+ buff[1] = before;

+ ucol_tok_addToExtraCurrent(src, buff, 2, status);

+ }

+ src->current++;

+ goto EndOfLoop;

+ } else {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ }

+ } else if(result & UCOL_TOK_VARIABLE_TOP) {

+ if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {

+ variableTop = TRUE;

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ src->parsedToken.charsLen = 1;

+ buff[0] = 0xFFFF;

+ ucol_tok_addToExtraCurrent(src, buff, 1, status);

+ src->current++;

+ goto EndOfLoop;

+ } else {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ }

+ } else if (result & UCOL_TOK_BEFORE){

+ if(newStrength == UCOL_TOK_RESET) {

+ before = result & UCOL_TOK_BEFORE;

+ } else {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ }

+ } else {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ break;

+ case 0x0021/*! skip java thai modifier reordering*/:

+ break;

+ case 0x002F/*'/'*/:

+ wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */

+ inChars = FALSE; /* we're now processing expansion */

+ break;

+ case 0x005C /* back slash for escaped chars */:

+ isEscaped = TRUE;

+ break;

+ /* found a quote, we're gonna start copying */

+ case 0x0027/*'\''*/:

+ if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ // enabling rules to start with a non-token character a < b

+ // newStrength = UCOL_TOK_RESET;

+ }

+ inQuote = TRUE;

+ if(inChars) { /* we're doing characters */

+ if(wasInQuote == FALSE) {

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ }

+ if (src->parsedToken.charsLen != 0) {

+ ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);

+ }

+ src->parsedToken.charsLen++;

+ } else { /* we're doing an expansion */

+ if(wasInQuote == FALSE) {

+ extensionOffset = (uint32_t)(src->extraCurrent - src->source);

+ }

+ if (newExtensionLen != 0) {

+ ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);

+ }

+ newExtensionLen++;

+ }

+ wasInQuote = TRUE;

+ ch = *(++(src->current));

+ if(ch == 0x0027) { /* copy the double quote */

+ ucol_tok_addToExtraCurrent(src, &ch, 1, status);

+ inQuote = FALSE;

+ }

+ break;

+ /* '@' is french only if the strength is not currently set */

+ /* if it is, it's just a regular character in collation rules */

+ case 0x0040/*'@'*/:

+ if (newStrength == UCOL_TOK_UNSET) {

+ src->opts->frenchCollation = UCOL_ON;

+ break;

+ }

+ case 0x007C /*|*/: /* this means we have actually been reading prefix part */

+ // we want to store read characters to the prefix part and continue reading

+ // the characters (proper way would be to restart reading the chars, but in

+ // that case we would have to complicate the token hasher, which I do not

+ // intend to play with. Instead, we will do prefixes when prefixes are due

+ // (before adding the elements).

+ src->parsedToken.prefixOffset = src->parsedToken.charsOffset;

+ src->parsedToken.prefixLen = src->parsedToken.charsLen;

+ if(inChars) { /* we're doing characters */

+ if(wasInQuote == FALSE) {

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ }

+ if (src->parsedToken.charsLen != 0) {

+ ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);

+ }

+ src->parsedToken.charsLen++;

+ }

+ wasInQuote = TRUE;

+ do {

+ ch = *(++(src->current));

+ // skip whitespace between '|' and the character

+ } while (uprv_isRuleWhiteSpace(ch));

+ break;

+ //charsOffset = 0;

+ //newCharsLen = 0;

+ //break; // We want to store the whole prefix/character sequence. If we break

+ // the '|' is going to get lost.

+ case 0x002D /*-*/: /* A range. */

+ if (newStrength != UCOL_TOK_UNSET) {

+ // While processing the pending token, the isStarred field

+ // is reset, so it needs to be saved for the next

+ // invocation.

+ src->savedIsStarred = src->isStarred;

+ goto EndOfLoop;

+ }

+ src->isStarred = src->savedIsStarred;

+ // Ranges are valid only in starred tokens.

+ if (!src->isStarred) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ newStrength = src->parsedToken.strength;

+ src->inRange = TRUE;

+ break;

+ case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */

+ do {

+ ch = *(++(src->current));

+ } while (!isCharNewLine(ch));

+ break;

+ default:

+ if (newStrength == UCOL_TOK_UNSET) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ if(ch == 0x0000 && src->current+1 == src->end) {

+ break;

+ }

+ if (inChars) {

+ if(src->parsedToken.charsLen == 0) {

+ src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);

+ }

+ src->parsedToken.charsLen++;

+ } else {

+ if(newExtensionLen == 0) {

+ extensionOffset = (uint32_t)(src->current - src->source);

+ }

+ newExtensionLen++;

+ }

+ break;

+ }

+ if(wasInQuote) {

+ if(ch != 0x27) {

+ if(inQuote || !uprv_isRuleWhiteSpace(ch)) {

+ ucol_tok_addToExtraCurrent(src, &ch, 1, status);

+ }

+ src->current++;

+ }

+EndOfLoop:

+ wasInQuote = FALSE;

+ if (newStrength == UCOL_TOK_UNSET) {

+ return NULL;

+ }

+ if (src->parsedToken.charsLen == 0 && top == FALSE) {

+ syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

+ *status = U_INVALID_FORMAT_ERROR;

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ src->parsedToken.strength = newStrength;

+ src->parsedToken.extensionOffset = extensionOffset;

+ src->parsedToken.extensionLen = newExtensionLen;

+ src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;

+ return src->current;

+/*

+ * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.

+ * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.

+ *

+ * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:

+ * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates

+ * it to separate tokens and returns one by one. In order to do that, the necessary states are

+ * cached as member variables of the token parser.

+ * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the

+ * starting character as a single list token (which is separated into individual characters here)

+ * and as another list token starting with the last character in the range. Before expanding it

+ * as a list of tokens, this function expands the range by filling the intermediate characters and

+ * returns them one by one as separate tokens.

+ * Necessary checks are done for invalid combinations.

+ */

+U_CAPI const UChar* U_EXPORT2

+ucol_tok_parseNextToken(UColTokenParser *src,

+ UBool startOfRules,

+ UParseError *parseError,

+ UErrorCode *status)

+ const UChar *nextToken;

+ if (src->inRange) {

+ // We are not done processing a range. Continue it.

+ return ucol_tok_processNextCodePointInRange(src, status);

+ } else if (src->isStarred) {

+ // We are not done processing a starred token. Continue it.

+ return ucol_tok_processNextTokenInStarredList(src);

+ }

+ // Get the next token.

+ nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);

+ if (nextToken == NULL) {

+ return NULL;

+ }

+ if (src->inRange) {

+ // A new range has started.

+ // Check whether it is a chain of ranges with more than one hyphen.

+ if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,src->parsedToken.charsOffset-1,

+ src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ // The current token indicates the second code point of the range.

+ // Process just that, and then proceed with the star.

+ src->currentStarredCharIndex = src->parsedToken.charsOffset;

+ U16_NEXT(src->source, src->currentStarredCharIndex,

+ (uint32_t)(src->end - src->source), src->lastRangeCp);

+ if (src->lastRangeCp <= src->previousCp) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,src->parsedToken.charsOffset-1,

+ src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);

+ DBG_FORMAT_ERROR

+ return NULL;

+ }

+ // Set current range code point to process the range loop

+ src->currentRangeCp = src->previousCp + 1;

+ src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;

+ return ucol_tok_processNextCodePointInRange(src, status);

+ } else if (src->isStarred) {

+ // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that

+ // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be

+ // separated into several tokens and returned.

+ src->currentStarredCharIndex = src->parsedToken.charsOffset;

+ src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;

+ return ucol_tok_processNextTokenInStarredList(src);

+ } else {

+ // Set previous codepoint

+ U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);

+ }

+ return nextToken;

+/*

+Processing Description

+1 Build a ListList. Each list has a header, which contains two lists (positive

+and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and

+reset may be null.

+2 As you process, you keep a LAST pointer that points to the last token you

+handled.

+*/

+static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,

+ UParseError *parseError, UErrorCode *status)

+ if(src->resultLen == src->listCapacity) {

+ // Unfortunately, this won't work, as we store addresses of lhs in token

+ src->listCapacity *= 2;

+ src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));

+ if(src->lh == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return NULL;

+ }

+ /* do the reset thing */

+ UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));

+ /* test for NULL */

+ if (sourceToken == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return NULL;

+ }

+ sourceToken->rulesToParseHdl = &(src->source);

+ sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;

+ sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

+ sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

+ sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

+ // keep the flags around so that we know about before

+ sourceToken->flags = src->parsedToken.flags;

+ if(src->parsedToken.prefixOffset != 0) {

+ // this is a syntax error

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);

+ DBG_FORMAT_ERROR

+ uprv_free(sourceToken);

+ return 0;

+ } else {

+ sourceToken->prefix = 0;

+ }

+ sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */

+ sourceToken->strength = UCOL_TOK_RESET;

+ sourceToken->next = NULL;

+ sourceToken->previous = NULL;

+ sourceToken->noOfCEs = 0;

+ sourceToken->noOfExpCEs = 0;

+ sourceToken->listHeader = &src->lh[src->resultLen];

+ src->lh[src->resultLen].first = NULL;

+ src->lh[src->resultLen].last = NULL;

+ src->lh[src->resultLen].first = NULL;

+ src->lh[src->resultLen].last = NULL;

+ src->lh[src->resultLen].reset = sourceToken;

+ /*

+ 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

+ First convert all expansions into normal form. Examples:

+ If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *

+ d * ... into &x * c/y * d * ...

+ Note: reset values can never have expansions, although they can cause the

+ very next item to have one. They may be contractions, if they are found

+ earlier in the list.

+ */

+ *expandNext = 0;

+ if(expand != NULL) {

+ /* check to see if there is an expansion */

+ if(src->parsedToken.charsLen > 1) {

+ uint32_t resetCharsOffset;

+ resetCharsOffset = (uint32_t)(expand - src->source);

+ sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;

+ *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);

+ }

+ src->resultLen++;

+ uhash_put(src->tailored, sourceToken, sourceToken, status);

+ return sourceToken;

+static

+inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {

+ if(U_FAILURE(*status)) {

+ return NULL;

+ }

+ /* this is a virgin before - we need to fish the anchor from the UCA */

+ collIterate s;

+ uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;

+ uint32_t CE, SecondCE;

+ uint32_t invPos;

+ if(sourceToken != NULL) {

+ uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);

+ } else {

+ uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);

+ }

+ if(U_FAILURE(*status)) {

+ return NULL;

+ }

+ baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;

+ baseContCE = ucol_getNextCE(src->UCA, &s, status);

+ if(baseContCE == UCOL_NO_MORE_CES) {

+ baseContCE = 0;

+ }

+ UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

+ uint32_t ch = 0;

+ uint32_t expandNext = 0;

+ UColToken key;

+ if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */

+ uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);

+ uint32_t raw = uprv_uca_getRawFromImplicit(primary);

+ ch = uprv_uca_getCodePointFromRaw(raw-1);

+ uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);

+ CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;

+ SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

+ *src->extraCurrent++ = 0xFFFE;

+ *src->extraCurrent++ = (UChar)ch;

+ src->parsedToken.charsLen++;

+ key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;

+ key.rulesToParseHdl = &(src->source);

+ //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

+ sourceToken = (UColToken *)uhash_get(src->tailored, &key);

+ if(sourceToken == NULL) {

+ src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

+ if(isContinuation(SecondCE)) {

+ src->lh[src->resultLen].baseContCE = SecondCE;

+ } else {

+ src->lh[src->resultLen].baseContCE = 0;

+ }

+ src->lh[src->resultLen].nextCE = 0;

+ src->lh[src->resultLen].nextContCE = 0;

+ src->lh[src->resultLen].previousCE = 0;

+ src->lh[src->resultLen].previousContCE = 0;

+ src->lh[src->resultLen].indirect = FALSE;

+ sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

+ }

+ } else {

+ invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

+ // we got the previous CE. Now we need to see if the difference between

+ // the two CEs is really of the requested strength.

+ // if it's a bigger difference (we asked for secondary and got primary), we

+ // need to modify the CE.

+ if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {

+ // adjust the strength

+ // now we are in the situation where our baseCE should actually be modified in

+ // order to get the CE in the right position.

+ if(strength == UCOL_SECONDARY) {

+ CE = baseCE - 0x0200;

+ } else { // strength == UCOL_TERTIARY

+ CE = baseCE - 0x02;

+ }

+ if(baseContCE) {

+ if(strength == UCOL_SECONDARY) {

+ SecondCE = baseContCE - 0x0200;

+ } else { // strength == UCOL_TERTIARY

+ SecondCE = baseContCE - 0x02;

+ }

+#if 0

+ // the code below relies on getting a code point from the inverse table, in order to be

+ // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:

+ // 1. There are many code points that have the same CE

+ // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.

+ // Also, in case when there is no equivalent strength before an element, we have to actually

+ // construct one. For example, &[before 2]a << x won't result in x << a, because the element

+ // before a is a primary difference.

+ //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);

+ ch = CETable[3*invPos+2];

+ if((ch & UCOL_INV_SIZEMASK) != 0) {

+ uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);

+ uint32_t offset = (ch & UCOL_INV_OFFSETMASK);

+ ch = conts[offset];

+ }

+ *src->extraCurrent++ = (UChar)ch;

+ src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);

+ src->parsedToken.charsLen = 1;

+ // We got an UCA before. However, this might have been tailored.

+ // example:

+ // &\u30ca = \u306a

+ // &[before 3]\u306a<<<\u306a|\u309d

+ // uint32_t key = (*newCharsLen << 24) | *charsOffset;

+ key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;

+ key.rulesToParseHdl = &(src->source);

+ //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

+ sourceToken = (UColToken *)uhash_get(src->tailored, &key);

+#endif

+ // here is how it should be. The situation such as &[before 1]a < x, should be

+ // resolved exactly as if we wrote &a > x.

+ // therefore, I don't really care if the UCA value before a has been changed.

+ // However, I do care if the strength between my element and the previous element

+ // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll

+ // have to construct the base CE.

+ // if we found a tailored thing, we have to use the UCA value and construct

+ // a new reset token with constructed name

+ //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {

+ // character to which we want to anchor is already tailored.

+ // We need to construct a new token which will be the anchor

+ // point

+ //*(src->extraCurrent-1) = 0xFFFE;

+ //*src->extraCurrent++ = (UChar)ch;

+ // grab before

+ src->parsedToken.charsOffset -= 10;

+ src->parsedToken.charsLen += 10;

+ src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

+ if(isContinuation(SecondCE)) {

+ src->lh[src->resultLen].baseContCE = SecondCE;

+ } else {

+ src->lh[src->resultLen].baseContCE = 0;

+ }

+ src->lh[src->resultLen].nextCE = 0;

+ src->lh[src->resultLen].nextContCE = 0;

+ src->lh[src->resultLen].previousCE = 0;

+ src->lh[src->resultLen].previousContCE = 0;

+ src->lh[src->resultLen].indirect = FALSE;

+ sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

+ //}

+ }

+ return sourceToken;

+uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {

+ UColToken *lastToken = NULL;

+ const UChar *parseEnd = NULL;

+ uint32_t expandNext = 0;

+ UBool variableTop = FALSE;

+ UBool top = FALSE;

+ uint16_t specs = 0;

+ UColTokListHeader *ListList = NULL;

+ src->parsedToken.strength = UCOL_TOK_UNSET;

+ ListList = src->lh;

+ if(U_FAILURE(*status)) {

+ return 0;

+ }

+#ifdef DEBUG_FOR_CODE_POINTS

+ char filename[35];

+ sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());

+ dfcp_fp = fopen(filename, "a");

+ fprintf(stdout, "Output is in the file %s.\n", filename);

+#endif

+#ifdef DEBUG_FOR_COLL_RULES

+ std::string s3;

+ UnicodeString(src->source).toUTF8String(s3);

+ std::cout << "src->source = " << s3 << std::endl;

+#endif

+ while(src->current < src->end || src->isStarred) {

+ src->parsedToken.prefixOffset = 0;

+ parseEnd = ucol_tok_parseNextToken(src,

+ (UBool)(lastToken == NULL),

+ parseError,

+ status);

+ specs = src->parsedToken.flags;

+ variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);

+ top = ((specs & UCOL_TOK_TOP) != 0);

+ if(U_SUCCESS(*status) && parseEnd != NULL) {

+ UColToken *sourceToken = NULL;

+ //uint32_t key = 0;

+ uint32_t lastStrength = UCOL_TOK_UNSET;

+ if(lastToken != NULL ) {

+ lastStrength = lastToken->strength;

+ }

+#ifdef DEBUG_FOR_CODE_POINTS

+ UChar32 cp;

+ U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);

+ fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);

+#endif

+ //key = newCharsLen << 24 | charsOffset;

+ UColToken key;

+ key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;

+ key.rulesToParseHdl = &(src->source);

+ /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */

+ sourceToken = (UColToken *)uhash_get(src->tailored, &key);

+ if(src->parsedToken.strength != UCOL_TOK_RESET) {

+ if(lastToken == NULL) { /* this means that rules haven't started properly */

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return 0;

+ }

+ /* 6 Otherwise (when relation != reset) */

+ if(sourceToken == NULL) {

+ /* If sourceToken is null, create new one, */

+ sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));

+ /* test for NULL */

+ if (sourceToken == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return 0;

+ }

+ sourceToken->rulesToParseHdl = &(src->source);

+ sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;

+ sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

+ sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;

+ sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);

+ sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */

+ sourceToken->next = NULL;

+ sourceToken->previous = NULL;

+ sourceToken->noOfCEs = 0;

+ sourceToken->noOfExpCEs = 0;

+ // keep the flags around so that we know about before

+ sourceToken->flags = src->parsedToken.flags;

+ uhash_put(src->tailored, sourceToken, sourceToken, status);

+ if(U_FAILURE(*status)) {

+ return 0;

+ }

+ } else {

+ /* we could have fished out a reset here */

+ if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {

+ /* otherwise remove sourceToken from where it was. */

+ if(sourceToken->next != NULL) {

+ if(sourceToken->next->strength > sourceToken->strength) {

+ sourceToken->next->strength = sourceToken->strength;

+ }

+ sourceToken->next->previous = sourceToken->previous;

+ } else {

+ sourceToken->listHeader->last = sourceToken->previous;

+ }

+ if(sourceToken->previous != NULL) {

+ sourceToken->previous->next = sourceToken->next;

+ } else {

+ sourceToken->listHeader->first = sourceToken->next;

+ }

+ sourceToken->next = NULL;

+ sourceToken->previous = NULL;

+ }

+ sourceToken->strength = src->parsedToken.strength;

+ sourceToken->listHeader = lastToken->listHeader;

+ /*

+ 1. Find the strongest strength in each list, and set strongestP and strongestN

+ accordingly in the headers.

+ */

+ if(lastStrength == UCOL_TOK_RESET

+ || sourceToken->listHeader->first == 0) {

+ /* If LAST is a reset

+ insert sourceToken in the list. */

+ if(sourceToken->listHeader->first == 0) {

+ sourceToken->listHeader->first = sourceToken;

+ sourceToken->listHeader->last = sourceToken;

+ } else { /* we need to find a place for us */

+ /* and we'll get in front of the same strength */

+ if(sourceToken->listHeader->first->strength <= sourceToken->strength) {

+ sourceToken->next = sourceToken->listHeader->first;

+ sourceToken->next->previous = sourceToken;

+ sourceToken->listHeader->first = sourceToken;

+ sourceToken->previous = NULL;

+ } else {

+ lastToken = sourceToken->listHeader->first;

+ while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {

+ lastToken = lastToken->next;

+ }

+ if(lastToken->next != NULL) {

+ lastToken->next->previous = sourceToken;

+ } else {

+ sourceToken->listHeader->last = sourceToken;

+ }

+ sourceToken->previous = lastToken;

+ sourceToken->next = lastToken->next;

+ lastToken->next = sourceToken;

+ }

+ } else {

+ /* Otherwise (when LAST is not a reset)

+ if polarity (LAST) == polarity(relation), insert sourceToken after LAST,

+ otherwise insert before.

+ when inserting after or before, search to the next position with the same

+ strength in that direction. (This is called postpone insertion). */

+ if(sourceToken != lastToken) {

+ if(lastToken->polarity == sourceToken->polarity) {

+ while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {

+ lastToken = lastToken->next;

+ }

+ sourceToken->previous = lastToken;

+ if(lastToken->next != NULL) {

+ lastToken->next->previous = sourceToken;

+ } else {

+ sourceToken->listHeader->last = sourceToken;

+ }

+ sourceToken->next = lastToken->next;

+ lastToken->next = sourceToken;

+ } else {

+ while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {

+ lastToken = lastToken->previous;

+ }

+ sourceToken->next = lastToken;

+ if(lastToken->previous != NULL) {

+ lastToken->previous->next = sourceToken;

+ } else {

+ sourceToken->listHeader->first = sourceToken;

+ }

+ sourceToken->previous = lastToken->previous;

+ lastToken->previous = sourceToken;

+ }

+ } else { /* repeated one thing twice in rules, stay with the stronger strength */

+ if(lastStrength < sourceToken->strength) {

+ sourceToken->strength = lastStrength;

+ }

+ /* if the token was a variable top, we're gonna put it in */

+ if(variableTop == TRUE && src->varTop == NULL) {

+ variableTop = FALSE;

+ src->varTop = sourceToken;

+ }

+ // Treat the expansions.

+ // There are two types of expansions: explicit (x / y) and reset based propagating expansions

+ // (&abc * d * e <=> &ab * d / c * e / c)

+ // if both of them are in effect for a token, they are combined.

+ sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

+ if(expandNext != 0) {

+ if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */

+ expandNext = 0;

+ } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */

+ sourceToken->expansion = expandNext;

+ } else { /* there is both explicit and implicit expansion. We need to make a combination */

+ uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));

+ uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));

+ sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));

+ src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;

+ }

+ // This is just for debugging purposes

+ if(sourceToken->expansion != 0) {

+ sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

+ } else {

+ sourceToken->debugExpansion = 0;

+ }

+ // if the previous token was a reset before, the strength of this

+ // token must match the strength of before. Otherwise we have an

+ // undefined situation.

+ // In other words, we currently have a cludge which we use to

+ // represent &a >> x. This is written as &[before 2]a << x.

+ if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {

+ uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;

+ if(beforeStrength != sourceToken->strength) {

+ *status = U_INVALID_FORMAT_ERROR;

+ syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);

+ DBG_FORMAT_ERROR

+ return 0;

+ }

+ } else {

+ if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {

+ /* if the previous token was also a reset, */

+ /*this means that we have two consecutive resets */

+ /* and we want to remove the previous one if empty*/

+ if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

+ src->resultLen--;

+ }

+ if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */

+ uint32_t searchCharsLen = src->parsedToken.charsLen;

+ while(searchCharsLen > 1 && sourceToken == NULL) {

+ searchCharsLen--;

+ //key = searchCharsLen << 24 | charsOffset;

+ UColToken key;

+ key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;

+ key.rulesToParseHdl = &(src->source);

+ sourceToken = (UColToken *)uhash_get(src->tailored, &key);

+ }

+ if(sourceToken != NULL) {

+ expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);

+ }

+ if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */

+ if(top == FALSE) { /* there is no indirection */

+ uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

+ if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {

+ /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */

+ while(sourceToken->strength > strength && sourceToken->previous != NULL) {

+ sourceToken = sourceToken->previous;

+ }

+ /* here, either we hit the strength or NULL */

+ if(sourceToken->strength == strength) {

+ if(sourceToken->previous != NULL) {

+ sourceToken = sourceToken->previous;

+ } else { /* start of list */

+ sourceToken = sourceToken->listHeader->reset;

+ }

+ } else { /* we hit NULL */

+ /* we should be doing the else part */

+ sourceToken = sourceToken->listHeader->reset;

+ sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);

+ }

+ } else {

+ sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);

+ }

+ } else { /* this is both before and indirection */

+ top = FALSE;

+ ListList[src->resultLen].previousCE = 0;

+ ListList[src->resultLen].previousContCE = 0;

+ ListList[src->resultLen].indirect = TRUE;

+ /* we need to do slightly more work. we need to get the baseCE using the */

+ /* inverse UCA & getPrevious. The next bound is not set, and will be decided */

+ /* in ucol_bld */

+ uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

+ uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;

+ uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;

+ uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

+ UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

+ if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&

+ (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */

+ uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);

+ uint32_t raw = uprv_uca_getRawFromImplicit(primary);

+ uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);

+ CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;

+ SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;

+ } else {

+ /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/

+ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

+ }

+ ListList[src->resultLen].baseCE = CE;

+ ListList[src->resultLen].baseContCE = SecondCE;

+ ListList[src->resultLen].nextCE = 0;

+ ListList[src->resultLen].nextContCE = 0;

+ sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

+ }

+ /* 5 If the relation is a reset:

+ If sourceToken is null

+ Create new list, create new sourceToken, make the baseCE from source, put

+ the sourceToken in ListHeader of the new list */

+ if(sourceToken == NULL) {

+ /*

+ 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

+ First convert all expansions into normal form. Examples:

+ If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *

+ d * ... into &x * c/y * d * ...

+ Note: reset values can never have expansions, although they can cause the

+ very next item to have one. They may be contractions, if they are found

+ earlier in the list.

+ */

+ if(top == FALSE) {

+ collIterate s;

+ uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

+ uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);

+ CE = ucol_getNextCE(src->UCA, &s, status);

+ const UChar *expand = s.pos;

+ SecondCE = ucol_getNextCE(src->UCA, &s, status);

+ ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;

+ if(isContinuation(SecondCE)) {

+ ListList[src->resultLen].baseContCE = SecondCE;

+ } else {

+ ListList[src->resultLen].baseContCE = 0;

+ }

+ ListList[src->resultLen].nextCE = 0;

+ ListList[src->resultLen].nextContCE = 0;

+ ListList[src->resultLen].previousCE = 0;

+ ListList[src->resultLen].previousContCE = 0;

+ ListList[src->resultLen].indirect = FALSE;

+ sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);

+ } else { /* top == TRUE */

+ /* just use the supplied values */

+ top = FALSE;

+ ListList[src->resultLen].previousCE = 0;

+ ListList[src->resultLen].previousContCE = 0;

+ ListList[src->resultLen].indirect = TRUE;

+ ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;

+ ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;

+ ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;

+ ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;

+ sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

+ }

+ } else { /* reset to something already in rules */

+ top = FALSE;

+ }

+ /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */

+ lastToken = sourceToken;

+ } else {

+ if(U_FAILURE(*status)) {

+ return 0;

+ }

+#ifdef DEBUG_FOR_CODE_POINTS

+ fclose(dfcp_fp);

+#endif

+ if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

+ src->resultLen--;

+ }

+ return src->resultLen;

+const UChar* ucol_tok_getRulesFromBundle(

+ void* /*context*/,

+ const char* locale,

+ const char* type,

+ int32_t* pLength,

+ UErrorCode* status)

+ const UChar* rules = NULL;

+ UResourceBundle* bundle;

+ UResourceBundle* collations;

+ UResourceBundle* collation;

+ *pLength = 0;

+ bundle = ures_open(U_ICUDATA_COLL, locale, status);

+ if(U_SUCCESS(*status)){

+ collations = ures_getByKey(bundle, "collations", NULL, status);

+ if(U_SUCCESS(*status)){

+ collation = ures_getByKey(collations, type, NULL, status);

+ if(U_SUCCESS(*status)){

+ rules = ures_getStringByKey(collation, "Sequence", pLength, status);

+ if(U_FAILURE(*status)){

+ *pLength = 0;

+ rules = NULL;

+ }

+ ures_close(collation);

+ }

+ ures_close(collations);

+ }

+ ures_close(bundle);

+ return rules;

+void ucol_tok_initTokenList(

+ UColTokenParser *src,

+ const UChar *rules,

+ uint32_t rulesLength,

+ const UCollator *UCA,

+ GetCollationRulesFunction importFunc,

+ void* context,

+ UErrorCode *status) {

+ U_NAMESPACE_USE

+ uint32_t nSize = 0;

+ uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);

+ bool needToDeallocRules = false;

+ if(U_FAILURE(*status)) {

+ return;

+ }

+ // set everything to zero, so that we can clean up gracefully

+ uprv_memset(src, 0, sizeof(UColTokenParser));

+ // first we need to find options that don't like to be normalized,

+ // like copy and remove...

+ //const UChar *openBrace = rules;

+ int32_t optionNumber = -1;

+ const UChar *setStart = NULL;

+ uint32_t i = 0;

+ while(i < rulesLength) {

+ if(rules[i] == 0x005B) { // '[': start of an option

+ /* Gets the following:

+ optionNumber: The index of the option.

+ setStart: The pointer at which the option arguments start.

+ */

+ optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);

+ if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */

+ // [optimize]

+ USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);

+ if(U_SUCCESS(*status)) {

+ if(src->copySet == NULL) {

+ src->copySet = newSet;

+ } else {

+ uset_addAll(src->copySet, newSet);

+ uset_close(newSet);

+ }

+ } else {

+ return;

+ }

+ } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {

+ USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);

+ if(U_SUCCESS(*status)) {

+ if(src->removeSet == NULL) {

+ src->removeSet = newSet;

+ } else {

+ uset_addAll(src->removeSet, newSet);

+ uset_close(newSet);

+ }

+ } else {

+ return;

+ }

+ } else if(optionNumber == OPTION_IMPORT){

+ // [import <collation-name>]

+ // Find the address of the closing ].

+ UChar* import_end = u_strchr(setStart, 0x005D);

+ int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);

+ // Ignore trailing whitespace.

+ while(uprv_isRuleWhiteSpace(*(import_end-1))) {

+ --import_end;

+ }

+ int32_t optionLength = (int32_t)(import_end - setStart);

+ char option[50];

+ if(optionLength >= (int32_t)sizeof(option)) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ u_UCharsToChars(setStart, option, optionLength);

+ option[optionLength] = 0;

+ *status = U_ZERO_ERROR;

+ char locale[50];

+ int32_t templ;

+ uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);

+ if(U_FAILURE(*status)) {

+ *status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ char type[50];

+ if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||

+ U_FAILURE(*status)

+ ) {

+ *status = U_ZERO_ERROR;

+ uprv_strcpy(type, "standard");

+ }

+ // TODO: Use public functions when available, see ticket #8134.

+ char *keywords = (char *)locale_getKeywordsStart(locale);

+ if(keywords != NULL) {

+ *keywords = 0;

+ }

+ int32_t importRulesLength = 0;

+ const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);

+#ifdef DEBUG_FOR_COLL_RULES

+ std::string s;

+ UnicodeString(importRules).toUTF8String(s);

+ std::cout << "Import rules = " << s << std::endl;

+#endif

+ // Add the length of the imported rules to length of the original rules,

+ // and subtract the length of the import option.

+ uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);

+ UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));

+#ifdef DEBUG_FOR_COLL_RULES

+ std::string s1;

+ UnicodeString(rules).toUTF8String(s1);

+ std::cout << "Original rules = " << s1 << std::endl;

+#endif

+ // Copy the section of the original rules leading up to the import

+ uprv_memcpy(newRules, rules, i*sizeof(UChar));

+ // Copy the imported rules

+ uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));

+ // Copy the rest of the original rules (minus the import option itself)

+ uprv_memcpy(newRules+i+importRulesLength,

+ rules+optionEndOffset,

+ (rulesLength-optionEndOffset)*sizeof(UChar));

+#ifdef DEBUG_FOR_COLL_RULES

+ std::string s2;

+ UnicodeString(newRules).toUTF8String(s2);

+ std::cout << "Resulting rules = " << s2 << std::endl;

+#endif

+ if(needToDeallocRules){

+ // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free

+ uprv_free((void*)rules);

+ }

+ needToDeallocRules = true;

+ rules = newRules;

+ rulesLength = newRulesLength;

+ estimatedSize += importRulesLength*2;

+ // First character of the new rules needs to be processed

+ i--;

+ }

+ //openBrace++;

+ i++;

+ }

+ src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));

+ /* test for NULL */

+ if (src->source == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));

+ nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);

+ if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {

+ *status = U_ZERO_ERROR;

+ src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));

+ /* test for NULL */

+ if (src->source == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);

+ }

+ if(needToDeallocRules){

+ // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free

+ uprv_free((void*)rules);

+ }

+ src->current = src->source;

+ src->end = src->source+nSize;

+ src->sourceCurrent = src->source;

+ src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly

+ src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;

+ src->varTop = NULL;

+ src->UCA = UCA;

+ src->invUCA = ucol_initInverseUCA(status);

+ src->parsedToken.charsLen = 0;

+ src->parsedToken.charsOffset = 0;

+ src->parsedToken.extensionLen = 0;

+ src->parsedToken.extensionOffset = 0;

+ src->parsedToken.prefixLen = 0;

+ src->parsedToken.prefixOffset = 0;

+ src->parsedToken.flags = 0;

+ src->parsedToken.strength = UCOL_TOK_UNSET;

+ src->buildCCTabFlag = FALSE;

+ src->isStarred = FALSE;

+ src->inRange = FALSE;

+ src->lastRangeCp = 0;

+ src->previousCp = 0;

+ if(U_FAILURE(*status)) {

+ return;

+ }

+ src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);

+ if(U_FAILURE(*status)) {

+ return;

+ }

+ uhash_setValueDeleter(src->tailored, uhash_freeBlock);

+ src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));

+ /* test for NULL */

+ if (src->opts == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

+ src->lh = 0;

+ src->listCapacity = 1024;

+ src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));

+ //Test for NULL

+ if (src->lh == NULL) {

+ *status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));

+ src->resultLen = 0;

+ UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

+ // UCOL_RESET_TOP_VALUE

+ setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);

+ // UCOL_FIRST_PRIMARY_IGNORABLE

+ setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);

+ // UCOL_LAST_PRIMARY_IGNORABLE

+ setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);

+ // UCOL_FIRST_SECONDARY_IGNORABLE

+ setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);

+ // UCOL_LAST_SECONDARY_IGNORABLE

+ setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);

+ // UCOL_FIRST_TERTIARY_IGNORABLE

+ setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);

+ // UCOL_LAST_TERTIARY_IGNORABLE

+ setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);

+ // UCOL_FIRST_VARIABLE

+ setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);

+ // UCOL_LAST_VARIABLE

+ setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);

+ // UCOL_FIRST_NON_VARIABLE

+ setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);

+ // UCOL_LAST_NON_VARIABLE

+ setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);

+ // UCOL_FIRST_IMPLICIT

+ setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);

+ // UCOL_LAST_IMPLICIT

+ setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);

+ // UCOL_FIRST_TRAILING

+ setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);

+ // UCOL_LAST_TRAILING

+ setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);

+ ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);

+void ucol_tok_closeTokenList(UColTokenParser *src) {

+ if(src->copySet != NULL) {

+ uset_close(src->copySet);

+ }

+ if(src->removeSet != NULL) {

+ uset_close(src->removeSet);

+ }

+ if(src->tailored != NULL) {

+ uhash_close(src->tailored);

+ }

+ if(src->lh != NULL) {

+ uprv_free(src->lh);

+ }

+ if(src->source != NULL) {

+ uprv_free(src->source);

+ }

+ if(src->opts != NULL) {

+ uprv_free(src->opts);

+ }

+ if (src->reorderCodes != NULL) {

+ uprv_free(src->reorderCodes);

+ }

+#endif /* #if !UCONFIG_NO_COLLATION */

Property changes on: icu46/source/i18n/ucol_tok.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/ucol_tok.h ('k') | icu46/source/i18n/ucol_wgt.h » ('j') | no next file with comments »