OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2001-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: ucol_tok.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created 02/22/2001 |
| 14 * created by: Vladimir Weinstein |
| 15 * |
| 16 * This module reads a tailoring rule string and produces a list of |
| 17 * tokens that will be turned into collation elements |
| 18 * |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 |
| 23 #if !UCONFIG_NO_COLLATION |
| 24 |
| 25 #include "unicode/uscript.h" |
| 26 #include "unicode/ustring.h" |
| 27 #include "unicode/uchar.h" |
| 28 #include "unicode/uniset.h" |
| 29 |
| 30 #include "cmemory.h" |
| 31 #include "cstring.h" |
| 32 #include "ucol_bld.h" |
| 33 #include "ucol_tok.h" |
| 34 #include "ulocimp.h" |
| 35 #include "uresimp.h" |
| 36 #include "util.h" |
| 37 |
| 38 // Define this only for debugging. |
| 39 // #define DEBUG_FOR_COLL_RULES 1 |
| 40 |
| 41 #ifdef DEBUG_FOR_COLL_RULES |
| 42 #include <iostream> |
| 43 #endif |
| 44 |
| 45 U_NAMESPACE_USE |
| 46 |
| 47 U_CDECL_BEGIN |
| 48 static int32_t U_CALLCONV |
| 49 uhash_hashTokens(const UHashTok k) |
| 50 { |
| 51 int32_t hash = 0; |
| 52 //uint32_t key = (uint32_t)k.integer; |
| 53 UColToken *key = (UColToken *)k.pointer; |
| 54 if (key != 0) { |
| 55 int32_t len = (key->source & 0xFF000000)>>24; |
| 56 int32_t inc = ((len - 32) / 32) + 1; |
| 57 |
| 58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); |
| 59 const UChar *limit = p + len; |
| 60 |
| 61 while (p<limit) { |
| 62 hash = (hash * 37) + *p; |
| 63 p += inc; |
| 64 } |
| 65 } |
| 66 return hash; |
| 67 } |
| 68 |
| 69 static UBool U_CALLCONV |
| 70 uhash_compareTokens(const UHashTok key1, const UHashTok key2) |
| 71 { |
| 72 //uint32_t p1 = (uint32_t) key1.integer; |
| 73 //uint32_t p2 = (uint32_t) key2.integer; |
| 74 UColToken *p1 = (UColToken *)key1.pointer; |
| 75 UColToken *p2 = (UColToken *)key2.pointer; |
| 76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); |
| 77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); |
| 78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24); |
| 79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24); |
| 80 const UChar *end = s1+s1L-1; |
| 81 |
| 82 if (p1 == p2) { |
| 83 return TRUE; |
| 84 } |
| 85 if (p1->source == 0 || p2->source == 0) { |
| 86 return FALSE; |
| 87 } |
| 88 if(s1L != s2L) { |
| 89 return FALSE; |
| 90 } |
| 91 if(p1->source == p2->source) { |
| 92 return TRUE; |
| 93 } |
| 94 while((s1 < end) && *s1 == *s2) { |
| 95 ++s1; |
| 96 ++s2; |
| 97 } |
| 98 if(*s1 == *s2) { |
| 99 return TRUE; |
| 100 } else { |
| 101 return FALSE; |
| 102 } |
| 103 } |
| 104 U_CDECL_END |
| 105 |
/*
 * Debug messages used to pinpoint where a format error occurred.
 * A better way is to include context-sensitive information in the
 * syntaxError() function.
 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
 */
/* #define DEBUG_FOR_FORMAT_ERROR 1 */

#ifdef DEBUG_FOR_FORMAT_ERROR
/* Prints the source line number at which the macro is expanded. */
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#else
/* Expands to nothing when the debugging aid is switched off. */
#define DBG_FORMAT_ERROR
#endif
| 120 |
| 121 |
/*
 * Controls debug messages so that the output can be compared before and after
 * a big change. Prints the information of every code point that comes out of
 * the collation parser, together with its strength, into a file. When a big
 * change in format happens, the files before and after the change should be
 * identical.
 *
 * To turn this debugging on, either uncomment the following line, or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
 */
// #define DEBUG_FOR_CODE_POINTS 1

#ifdef DEBUG_FOR_CODE_POINTS
    /* Output file for the code point dump; starts out NULL (not open). */
    FILE* dfcp_fp = NULL;
#endif
| 136 |
| 137 |
| 138 /*static inline void U_CALLCONV |
| 139 uhash_freeBlockWrapper(void *obj) { |
| 140 uhash_freeBlock(obj); |
| 141 }*/ |
| 142 |
| 143 |
/*
 * Boundary CEs for one indirect (symbolic) reset position.
 * Filled in by setIndirectBoundaries(); the limit pair is 0 when no upper
 * bound was supplied.
 */
typedef struct {
    uint32_t startCE;       /* first CE of the start of the position */
    uint32_t startContCE;   /* second (continuation) CE of the start; 0 if unused */
    uint32_t limitCE;       /* first CE of the limit, or 0 */
    uint32_t limitContCE;   /* second (continuation) CE of the limit, or 0 */
} indirectBoundaries;
| 150 |
/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic   */
/* values. It only works for resets, and you cannot tailor indirect      */
/* names. An indirect name can define either an anchor point or a range. */
/* An anchor point behaves in exactly the same way as a code point in a  */
/* reset would, except that it cannot be tailored. A range (we currently */
/* only know of the [top] range) will explicitly set the upper bound for */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed into the range without a performance penalty.             */
/* In that respect, we use [top] for tailoring of locales that use CJK   */
/* characters. Other indirect values are currently a pure convenience;   */
/* they can be used to ensure that the CEs will always be positioned in  */
/* the same place relative to a point with known properties (e.g. first  */
/* primary ignorable).                                                   */
static indirectBoundaries ucolIndirectBoundaries[15];
| 166 /* |
| 167 static indirectBoundaries ucolIndirectBoundaries[11] = { |
| 168 { UCOL_RESET_TOP_VALUE, 0, |
| 169 UCOL_NEXT_TOP_VALUE, 0 }, |
| 170 { UCOL_FIRST_PRIMARY_IGNORABLE, 0, |
| 171 0, 0 }, |
| 172 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, |
| 173 0, 0 }, |
| 174 { UCOL_FIRST_SECONDARY_IGNORABLE, 0, |
| 175 0, 0 }, |
| 176 { UCOL_LAST_SECONDARY_IGNORABLE, 0, |
| 177 0, 0 }, |
| 178 { UCOL_FIRST_TERTIARY_IGNORABLE, 0, |
| 179 0, 0 }, |
| 180 { UCOL_LAST_TERTIARY_IGNORABLE, 0, |
| 181 0, 0 }, |
| 182 { UCOL_FIRST_VARIABLE, 0, |
| 183 0, 0 }, |
| 184 { UCOL_LAST_VARIABLE, 0, |
| 185 0, 0 }, |
| 186 { UCOL_FIRST_NON_VARIABLE, 0, |
| 187 0, 0 }, |
| 188 { UCOL_LAST_NON_VARIABLE, 0, |
| 189 0, 0 }, |
| 190 }; |
| 191 */ |
| 192 |
| 193 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *en
d) { |
| 194 |
| 195 // Set values for the top - TODO: once we have values for all the indirects,
we are going |
| 196 // to initalize here. |
| 197 ucolIndirectBoundaries[indexR].startCE = start[0]; |
| 198 ucolIndirectBoundaries[indexR].startContCE = start[1]; |
| 199 if(end) { |
| 200 ucolIndirectBoundaries[indexR].limitCE = end[0]; |
| 201 ucolIndirectBoundaries[indexR].limitContCE = end[1]; |
| 202 } else { |
| 203 ucolIndirectBoundaries[indexR].limitCE = 0; |
| 204 ucolIndirectBoundaries[indexR].limitContCE = 0; |
| 205 } |
| 206 } |
| 207 |
| 208 |
| 209 static inline |
| 210 void syntaxError(const UChar* rules, |
| 211 int32_t pos, |
| 212 int32_t rulesLen, |
| 213 UParseError* parseError) |
| 214 { |
| 215 parseError->offset = pos; |
| 216 parseError->line = 0 ; /* we are not using line numbers */ |
| 217 |
| 218 // for pre-context |
| 219 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN
-1)); |
| 220 int32_t stop = pos; |
| 221 |
| 222 u_memcpy(parseError->preContext,rules+start,stop-start); |
| 223 //null terminate the buffer |
| 224 parseError->preContext[stop-start] = 0; |
| 225 |
| 226 //for post-context |
| 227 start = pos+1; |
| 228 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1
)) : |
| 229 rulesLen; |
| 230 |
| 231 if(start < stop) { |
| 232 u_memcpy(parseError->postContext,rules+start,stop-start); |
| 233 //null terminate the buffer |
| 234 parseError->postContext[stop-start]= 0; |
| 235 } else { |
| 236 parseError->postContext[0] = 0; |
| 237 } |
| 238 } |
| 239 |
| 240 static |
| 241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U
ColAttributeValue value) { |
| 242 switch(attrib) { |
| 243 case UCOL_HIRAGANA_QUATERNARY_MODE: |
| 244 opts->hiraganaQ = value; |
| 245 break; |
| 246 case UCOL_FRENCH_COLLATION: |
| 247 opts->frenchCollation = value; |
| 248 break; |
| 249 case UCOL_ALTERNATE_HANDLING: |
| 250 opts->alternateHandling = value; |
| 251 break; |
| 252 case UCOL_CASE_FIRST: |
| 253 opts->caseFirst = value; |
| 254 break; |
| 255 case UCOL_CASE_LEVEL: |
| 256 opts->caseLevel = value; |
| 257 break; |
| 258 case UCOL_NORMALIZATION_MODE: |
| 259 opts->normalizationMode = value; |
| 260 break; |
| 261 case UCOL_STRENGTH: |
| 262 opts->strength = value; |
| 263 break; |
| 264 case UCOL_NUMERIC_COLLATION: |
| 265 opts->numericCollation = value; |
| 266 break; |
| 267 case UCOL_ATTRIBUTE_COUNT: |
| 268 default: |
| 269 break; |
| 270 } |
| 271 } |
| 272 |
/* Number of entries in rulesOptions (and of option_NN strings below). */
#define UTOK_OPTION_COUNT 22

/* Set to TRUE by ucol_uprv_tok_initData() once the strings below are initialized. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */

/*
 * Option argument ("suboption") names. Each U_STRING_DECL here has a matching
 * U_STRING_INIT in ucol_uprv_tok_initData(); the two lists must be kept in sync.
 */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted", 7);

U_STRING_DECL(suboption_02, "lower", 5);
U_STRING_DECL(suboption_03, "upper", 5);
U_STRING_DECL(suboption_04, "off", 3);
U_STRING_DECL(suboption_05, "on", 2);
U_STRING_DECL(suboption_06, "1", 1);
U_STRING_DECL(suboption_07, "2", 1);
U_STRING_DECL(suboption_08, "3", 1);
U_STRING_DECL(suboption_09, "4", 1);
U_STRING_DECL(suboption_10, "I", 1);

U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "regular", 7);
U_STRING_DECL(suboption_16, "implicit", 8);
U_STRING_DECL(suboption_17, "trailing", 8);


/* Option names; referenced from the rulesOptions table. */
U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9);
U_STRING_DECL(option_02, "alternate", 9);
U_STRING_DECL(option_03, "backwards", 9);
U_STRING_DECL(option_04, "variable top", 12);
U_STRING_DECL(option_05, "top", 3);
U_STRING_DECL(option_06, "normalization", 13);
U_STRING_DECL(option_07, "caseLevel", 9);
U_STRING_DECL(option_08, "caseFirst", 9);
U_STRING_DECL(option_09, "scriptOrder", 11);
U_STRING_DECL(option_10, "charsetname", 11);
U_STRING_DECL(option_11, "charset", 7);
U_STRING_DECL(option_12, "before", 6);
U_STRING_DECL(option_13, "hiraganaQ", 9);
U_STRING_DECL(option_14, "strength", 8);
U_STRING_DECL(option_15, "first", 5);
U_STRING_DECL(option_16, "last", 4);
U_STRING_DECL(option_17, "optimize", 8);
U_STRING_DECL(option_18, "suppressContractions", 20);
U_STRING_DECL(option_19, "numericOrdering", 15);
U_STRING_DECL(option_20, "import", 6);
U_STRING_DECL(option_21, "reorder", 7);
| 323 |
| 324 /* |
| 325 [last variable] last variable value |
| 326 [last primary ignorable] largest CE for primary ignorable |
| 327 [last secondary ignorable] largest CE for secondary ignorable |
| 328 [last tertiary ignorable] largest CE for tertiary ignorable |
| 329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8
) |
| 330 */ |
| 331 |
| 332 |
/*
 * Suboption tables. Each entry is { name, name length, attribute value };
 * rulesOptions refers to these tables together with their sizes.
 */

/* Arguments of "alternate". */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01, 7, UCOL_SHIFTED}
};

/* Arguments of "caseFirst". */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03, 5, UCOL_UPPER_FIRST},
    {suboption_04, 3, UCOL_OFF},
};

/* Arguments shared by the plain on/off options. */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* Argument of "backwards": only "2" is accepted, and it maps to UCOL_ON. */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* Arguments of "before": strengths "1" to "3". */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* Arguments of "strength": "1" to "4" and "I". */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/*
 * Arguments of "first" and "last". Every entry carries UCOL_PRIMARY;
 * ucol_uprv_tok_readAndSetOption() uses only the position of the matched
 * entry, so the order of this table matters.
 */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
| 376 |
/*
 * Indexes into rulesOptions; the enumerators must stay in the same order as
 * the table rows. ucol_uprv_tok_readAndSetOption() switches on these values.
 */
enum OptionNumber {
    /* Options that map directly onto a collation attribute. */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* Alias of the last attribute-backed option; does not consume an index. */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /* Options whose table row carries UCOL_ATTRIBUTE_COUNT instead of an attribute. */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,    /* must directly follow OPTION_FIRST (see the indirectIndex calculation) */
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
} ;
| 402 |
/*
 * Table of recognized rule options, indexed by OptionNumber.
 * Each row is { option name, name length, suboption table or NULL,
 * number of suboptions, attribute }; rows that do not set a collation
 * attribute carry UCOL_ATTRIBUTE_COUNT.
 * Lookup is by prefix in table order, so "charsetname" has to come
 * before "charset".
 */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
    /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
    /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
    /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
    /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
    /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
    /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
    /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
    /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */
    /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */
    /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */
};
| 427 |
| 428 static |
| 429 int32_t u_strncmpNoCase(const UChar *s1, |
| 430 const UChar *s2, |
| 431 int32_t n) |
| 432 { |
| 433 if(n > 0) { |
| 434 int32_t rc; |
| 435 for(;;) { |
| 436 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); |
| 437 if(rc != 0 || *s1 == 0 || --n == 0) { |
| 438 return rc; |
| 439 } |
| 440 ++s1; |
| 441 ++s2; |
| 442 } |
| 443 } |
| 444 return 0; |
| 445 } |
| 446 |
| 447 static |
| 448 void ucol_uprv_tok_initData() { |
| 449 if(!didInit) { |
| 450 U_STRING_INIT(suboption_00, "non-ignorable", 13); |
| 451 U_STRING_INIT(suboption_01, "shifted", 7); |
| 452 |
| 453 U_STRING_INIT(suboption_02, "lower", 5); |
| 454 U_STRING_INIT(suboption_03, "upper", 5); |
| 455 U_STRING_INIT(suboption_04, "off", 3); |
| 456 U_STRING_INIT(suboption_05, "on", 2); |
| 457 |
| 458 U_STRING_INIT(suboption_06, "1", 1); |
| 459 U_STRING_INIT(suboption_07, "2", 1); |
| 460 U_STRING_INIT(suboption_08, "3", 1); |
| 461 U_STRING_INIT(suboption_09, "4", 1); |
| 462 U_STRING_INIT(suboption_10, "I", 1); |
| 463 |
| 464 U_STRING_INIT(suboption_11, "primary", 7); |
| 465 U_STRING_INIT(suboption_12, "secondary", 9); |
| 466 U_STRING_INIT(suboption_13, "tertiary", 8); |
| 467 U_STRING_INIT(suboption_14, "variable", 8); |
| 468 U_STRING_INIT(suboption_15, "regular", 7); |
| 469 U_STRING_INIT(suboption_16, "implicit", 8); |
| 470 U_STRING_INIT(suboption_17, "trailing", 8); |
| 471 |
| 472 |
| 473 U_STRING_INIT(option_00, "undefined", 9); |
| 474 U_STRING_INIT(option_01, "rearrange", 9); |
| 475 U_STRING_INIT(option_02, "alternate", 9); |
| 476 U_STRING_INIT(option_03, "backwards", 9); |
| 477 U_STRING_INIT(option_04, "variable top", 12); |
| 478 U_STRING_INIT(option_05, "top", 3); |
| 479 U_STRING_INIT(option_06, "normalization", 13); |
| 480 U_STRING_INIT(option_07, "caseLevel", 9); |
| 481 U_STRING_INIT(option_08, "caseFirst", 9); |
| 482 U_STRING_INIT(option_09, "scriptOrder", 11); |
| 483 U_STRING_INIT(option_10, "charsetname", 11); |
| 484 U_STRING_INIT(option_11, "charset", 7); |
| 485 U_STRING_INIT(option_12, "before", 6); |
| 486 U_STRING_INIT(option_13, "hiraganaQ", 9); |
| 487 U_STRING_INIT(option_14, "strength", 8); |
| 488 U_STRING_INIT(option_15, "first", 5); |
| 489 U_STRING_INIT(option_16, "last", 4); |
| 490 U_STRING_INIT(option_17, "optimize", 8); |
| 491 U_STRING_INIT(option_18, "suppressContractions", 20); |
| 492 U_STRING_INIT(option_19, "numericOrdering", 15); |
| 493 U_STRING_INIT(option_20, "import ", 6); |
| 494 U_STRING_INIT(option_21, "reorder", 7); |
| 495 didInit = TRUE; |
| 496 } |
| 497 } |
| 498 |
| 499 |
| 500 // This function reads basic options to set in the runtime collator |
| 501 // used by data driven tests. Should not support build time options |
| 502 U_CAPI const UChar * U_EXPORT2 |
| 503 ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
| 504 UColAttribute *attrib, UColAttributeValue *value, |
| 505 UErrorCode *status) |
| 506 { |
| 507 uint32_t i = 0; |
| 508 int32_t j=0; |
| 509 UBool foundOption = FALSE; |
| 510 const UChar *optionArg = NULL; |
| 511 |
| 512 ucol_uprv_tok_initData(); |
| 513 |
| 514 while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start
))) { /* eat whitespace */ |
| 515 start++; |
| 516 } |
| 517 if(start >= end) { |
| 518 return NULL; |
| 519 } |
| 520 /* skip opening '[' */ |
| 521 if(*start == 0x005b) { |
| 522 start++; |
| 523 } else { |
| 524 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' |
| 525 return NULL; |
| 526 } |
| 527 |
| 528 while(i < UTOK_OPTION_COUNT) { |
| 529 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op
tionLen) == 0) { |
| 530 foundOption = TRUE; |
| 531 if(end - start > rulesOptions[i].optionLen) { |
| 532 optionArg = start+rulesOptions[i].optionLen+1; /* start of the o
ptions, skip space */ |
| 533 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optio
nArg)) { /* eat whitespace */ |
| 534 optionArg++; |
| 535 } |
| 536 } |
| 537 break; |
| 538 } |
| 539 i++; |
| 540 } |
| 541 |
| 542 if(!foundOption) { |
| 543 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 544 return NULL; |
| 545 } |
| 546 |
| 547 if(optionArg) { |
| 548 for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 549 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru
lesOptions[i].subopts[j].subLen) == 0) { |
| 550 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr
, rulesOptions[i].subopts[j].attrVal); |
| 551 *attrib = rulesOptions[i].attr; |
| 552 *value = rulesOptions[i].subopts[j].attrVal; |
| 553 optionArg += rulesOptions[i].subopts[j].subLen; |
| 554 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optio
nArg)) { /* eat whitespace */ |
| 555 optionArg++; |
| 556 } |
| 557 if(*optionArg == 0x005d) { |
| 558 optionArg++; |
| 559 return optionArg; |
| 560 } else { |
| 561 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 562 return NULL; |
| 563 } |
| 564 } |
| 565 } |
| 566 } |
| 567 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 568 return NULL; |
| 569 } |
| 570 |
| 571 static |
| 572 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, U
ErrorCode *status) { |
| 573 while(*start != 0x005b) { /* advance while we find the first '[' */ |
| 574 start++; |
| 575 } |
| 576 // now we need to get a balanced set of '[]'. The problem is that a set can
have |
| 577 // many, and *end point to the first closing '[' |
| 578 int32_t noOpenBraces = 1; |
| 579 int32_t current = 1; // skip the opening brace |
| 580 while(start+current < end && noOpenBraces != 0) { |
| 581 if(start[current] == 0x005b) { |
| 582 noOpenBraces++; |
| 583 } else if(start[current] == 0x005D) { // closing brace |
| 584 noOpenBraces--; |
| 585 } |
| 586 current++; |
| 587 } |
| 588 |
| 589 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { |
| 590 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 591 return NULL; |
| 592 } |
| 593 return uset_openPattern(start, current, status); |
| 594 } |
| 595 |
| 596 /** |
| 597 * Reads an option and matches the option name with the predefined options. (Cas
e-insensitive.) |
| 598 * @param start Pointer to the start UChar. |
| 599 * @param end Pointer to the last valid pointer beyond which the option will not
extend. |
| 600 * @param optionArg Address of the pointer at which the options start (after the
option name) |
| 601 * @return The index of the option, or -1 if the option is not valid. |
| 602 */ |
| 603 static |
| 604 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UCh
ar **optionArg) { |
| 605 int32_t i = 0; |
| 606 ucol_uprv_tok_initData(); |
| 607 |
| 608 while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whit
espace */ |
| 609 start++; |
| 610 } |
| 611 while(i < UTOK_OPTION_COUNT) { |
| 612 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op
tionLen) == 0) { |
| 613 if(end - start > rulesOptions[i].optionLen) { |
| 614 *optionArg = start+rulesOptions[i].optionLen; /* End of option n
ame; start of the options */ |
| 615 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**opt
ionArg)) { /* eat whitespace */ |
| 616 (*optionArg)++; |
| 617 } |
| 618 } |
| 619 break; |
| 620 } |
| 621 i++; |
| 622 } |
| 623 if(i == UTOK_OPTION_COUNT) { |
| 624 i = -1; // didn't find an option |
| 625 } |
| 626 return i; |
| 627 } |
| 628 |
| 629 |
| 630 static |
| 631 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { |
| 632 int32_t codeCount = 0; |
| 633 int32_t codeIndex = 0; |
| 634 char conversion[64]; |
| 635 int32_t tokenLength = 0; |
| 636 const UChar* space; |
| 637 |
| 638 const UChar* current = src->current; |
| 639 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); |
| 640 |
| 641 // eat leading whitespace |
| 642 while(current < end && u_isWhitespace(*current)) { |
| 643 current++; |
| 644 } |
| 645 |
| 646 while(current < end) { |
| 647 space = u_memchr(current, 0x0020, end - current); |
| 648 space = space == 0 ? end : space; |
| 649 tokenLength = space - current; |
| 650 if (tokenLength < 4) { |
| 651 *status = U_INVALID_FORMAT_ERROR; |
| 652 return; |
| 653 } |
| 654 codeCount++; |
| 655 current += tokenLength; |
| 656 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
| 657 ++current; |
| 658 } |
| 659 } |
| 660 |
| 661 if (codeCount == 0) { |
| 662 *status = U_INVALID_FORMAT_ERROR; |
| 663 } |
| 664 |
| 665 src->reorderCodesLength = codeCount; |
| 666 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); |
| 667 current = src->current; |
| 668 |
| 669 // eat leading whitespace |
| 670 while(current < end && u_isWhitespace(*current)) { |
| 671 current++; |
| 672 } |
| 673 |
| 674 while(current < end) { |
| 675 space = u_memchr(current, 0x0020, end - current); |
| 676 space = space == 0 ? end : space; |
| 677 tokenLength = space - current; |
| 678 if (tokenLength < 4) { |
| 679 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 680 return; |
| 681 } else { |
| 682 u_UCharsToChars(current, conversion, tokenLength); |
| 683 conversion[tokenLength] = '\0'; |
| 684 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); |
| 685 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
| 686 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRI
PT, conversion); |
| 687 } |
| 688 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
| 689 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 690 } |
| 691 } |
| 692 codeIndex++; |
| 693 current += tokenLength; |
| 694 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
| 695 ++current; |
| 696 } |
| 697 } |
| 698 } |
| 699 |
| 700 // reads and conforms to various options in rules |
| 701 // end is the position of the first closing ']' |
| 702 // However, some of the options take an UnicodeSet definition |
| 703 // which needs to duplicate the closing ']' |
| 704 // for example: '[copy [\uAC00-\uD7FF]]' |
| 705 // These options will move end to the second ']' and the |
| 706 // caller will set the current to it. |
| 707 static |
| 708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status)
{ |
| 709 const UChar* start = src->current; |
| 710 int32_t i = 0; |
| 711 int32_t j=0; |
| 712 const UChar *optionArg = NULL; |
| 713 |
| 714 uint8_t result = 0; |
| 715 |
| 716 start++; /*skip opening '['*/ |
| 717 i = ucol_uprv_tok_readOption(start, src->end, &optionArg); |
| 718 if(optionArg) { |
| 719 src->current = optionArg; |
| 720 } |
| 721 |
| 722 if(i < 0) { |
| 723 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 724 } else { |
| 725 int32_t noOpenBraces = 1; |
| 726 switch(i) { |
| 727 case OPTION_ALTERNATE_HANDLING: |
| 728 case OPTION_FRENCH_COLLATION: |
| 729 case OPTION_CASE_LEVEL: |
| 730 case OPTION_CASE_FIRST: |
| 731 case OPTION_NORMALIZATION_MODE: |
| 732 case OPTION_HIRAGANA_QUATERNARY: |
| 733 case OPTION_STRENGTH: |
| 734 case OPTION_NUMERIC_COLLATION: |
| 735 if(optionArg) { |
| 736 for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 737 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName
, rulesOptions[i].subopts[j].subLen) == 0) { |
| 738 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].at
tr, rulesOptions[i].subopts[j].attrVal); |
| 739 result = UCOL_TOK_SUCCESS; |
| 740 } |
| 741 } |
| 742 } |
| 743 if(result == 0) { |
| 744 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 745 } |
| 746 break; |
| 747 case OPTION_VARIABLE_TOP: |
| 748 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
| 749 break; |
| 750 case OPTION_REARRANGE: |
| 751 result = UCOL_TOK_SUCCESS; |
| 752 break; |
| 753 case OPTION_BEFORE: |
| 754 if(optionArg) { |
| 755 for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 756 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName
, rulesOptions[i].subopts[j].subLen) == 0) { |
| 757 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attr
Val + 1); |
| 758 } |
| 759 } |
| 760 } |
| 761 if(result == 0) { |
| 762 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 763 } |
| 764 break; |
| 765 case OPTION_TOP: /* we are going to have an array with structures of limit C
Es */ |
| 766 /* index to this array will be src->parsedToken.indirectIndex*/ |
| 767 src->parsedToken.indirectIndex = 0; |
| 768 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; |
| 769 break; |
| 770 case OPTION_FIRST: |
| 771 case OPTION_LAST: /* first, last */ |
| 772 for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 773 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru
lesOptions[i].subopts[j].subLen) == 0) { |
| 774 // the calculation below assumes that OPTION_FIRST and OPTION_LA
ST are at i and i+1 and that the first |
| 775 // element of indirect boundaries is reserved for top. |
| 776 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2
); |
| 777 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; |
| 778 } |
| 779 } |
| 780 if(result == 0) { |
| 781 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 782 } |
| 783 break; |
| 784 case OPTION_OPTIMIZE: |
| 785 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before no
rmalization |
| 786 // we need to move end here |
| 787 src->current++; // skip opening brace |
| 788 while(src->current < src->end && noOpenBraces != 0) { |
| 789 if(*src->current == 0x005b) { |
| 790 noOpenBraces++; |
| 791 } else if(*src->current == 0x005D) { // closing brace |
| 792 noOpenBraces--; |
| 793 } |
| 794 src->current++; |
| 795 } |
| 796 result = UCOL_TOK_SUCCESS; |
| 797 break; |
| 798 case OPTION_SCRIPTREORDER: |
| 799 ucol_tok_parseScriptReorder(src, status); |
| 800 break; |
| 801 default: |
| 802 *status = U_UNSUPPORTED_ERROR; |
| 803 break; |
| 804 } |
| 805 } |
| 806 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->curren
t)); |
| 807 return result; |
| 808 } |
| 809 |
| 810 |
| 811 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff,
int32_t len, UErrorCode *status) { |
| 812 if (stuff == NULL || len <= 0) { |
| 813 return; |
| 814 } |
| 815 UnicodeString tempStuff(FALSE, stuff, len); |
| 816 if(src->extraCurrent+len >= src->extraEnd) { |
| 817 /* reallocate */ |
| 818 if (stuff >= src->source && stuff <= src->end) { |
| 819 // Copy the "stuff" contents into tempStuff's own buffer. |
| 820 // UnicodeString is copy-on-write. |
| 821 if (len > 0) { |
| 822 tempStuff.setCharAt(0, tempStuff[0]); |
| 823 } else { |
| 824 tempStuff.remove(); |
| 825 } |
| 826 } |
| 827 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->s
ource)*2*sizeof(UChar)); |
| 828 if(newSrc != NULL) { |
| 829 src->current = newSrc + (src->current - src->source); |
| 830 src->extraCurrent = newSrc + (src->extraCurrent - src->source); |
| 831 src->end = newSrc + (src->end - src->source); |
| 832 src->extraEnd = newSrc + (src->extraEnd-src->source)*2; |
| 833 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); |
| 834 src->source = newSrc; |
| 835 } else { |
| 836 *status = U_MEMORY_ALLOCATION_ERROR; |
| 837 return; |
| 838 } |
| 839 } |
| 840 if(len == 1) { |
| 841 *src->extraCurrent++ = tempStuff[0]; |
| 842 } else { |
| 843 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); |
| 844 src->extraCurrent += len; |
| 845 } |
| 846 } |
| 847 |
| 848 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { |
| 849 /* |
| 850 top = TRUE; |
| 851 */ |
| 852 UChar buff[5]; |
| 853 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 854 buff[0] = 0xFFFE; |
| 855 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta
rtCE >> 16); |
| 856 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta
rtCE & 0xFFFF); |
| 857 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0)
{ |
| 858 src->parsedToken.charsLen = 3; |
| 859 ucol_tok_addToExtraCurrent(src, buff, 3, status); |
| 860 } else { |
| 861 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex]
.startContCE >> 16); |
| 862 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex]
.startContCE & 0xFFFF); |
| 863 src->parsedToken.charsLen = 5; |
| 864 ucol_tok_addToExtraCurrent(src, buff, 5, status); |
| 865 } |
| 866 return TRUE; |
| 867 } |
| 868 |
| 869 static UBool isCharNewLine(UChar c){ |
| 870 switch(c){ |
| 871 case 0x000A: /* LF */ |
| 872 case 0x000D: /* CR */ |
| 873 case 0x000C: /* FF */ |
| 874 case 0x0085: /* NEL */ |
| 875 case 0x2028: /* LS */ |
| 876 case 0x2029: /* PS */ |
| 877 return TRUE; |
| 878 default: |
| 879 return FALSE; |
| 880 } |
| 881 } |
| 882 |
| 883 /* |
| 884 * This function is called several times when a range is processed. Each time,
the next code point |
| 885 * is processed. |
| 886 * The following variables must be set before calling this function: |
| 887 * src->currentRangeCp: The current code point to process. |
| 888 * src->lastRangeCp: The last code point in the range. |
| 889 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. |
| 890 */ |
| 891 static const UChar* |
| 892 ucol_tok_processNextCodePointInRange(UColTokenParser *src, |
| 893 UErrorCode *status) |
| 894 { |
| 895 // Append current code point to source |
| 896 UChar buff[U16_MAX_LENGTH]; |
| 897 uint32_t i = 0; |
| 898 |
| 899 uint32_t nChars = U16_LENGTH(src->currentRangeCp); |
| 900 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 901 src->parsedToken.charsLen = nChars; |
| 902 |
| 903 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); |
| 904 ucol_tok_addToExtraCurrent(src, buff, nChars, status); |
| 905 |
| 906 ++src->currentRangeCp; |
| 907 if (src->currentRangeCp > src->lastRangeCp) { |
| 908 src->inRange = FALSE; |
| 909 |
| 910 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
| 911 src->isStarred = FALSE; |
| 912 } |
| 913 } else { |
| 914 src->previousCp = src->currentRangeCp; |
| 915 } |
| 916 return src->current; |
| 917 } |
| 918 |
| 919 /* |
| 920 * This function is called several times when a starred list is processed. Each
time, the next code point |
| 921 * in the list is processed. |
| 922 * The following variables must be set before calling this function: |
| 923 * src->currentStarredCharIndex: Index (in src->source) of the first char of
the current code point. |
| 924 * src->lastStarredCharIndex: Index to the last character in the list. |
| 925 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. |
| 926 */ |
| 927 static const UChar* |
| 928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src) |
| 929 { |
| 930 // Extract the characters corresponding to the next code point. |
| 931 UChar32 cp; |
| 932 src->parsedToken.charsOffset = src->currentStarredCharIndex; |
| 933 int32_t prev = src->currentStarredCharIndex; |
| 934 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src-
>source), cp); |
| 935 src->parsedToken.charsLen = src->currentStarredCharIndex - prev; |
| 936 |
| 937 // When we are done parsing the starred string, turn the flag off so that |
| 938 // the normal processing is restored. |
| 939 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
| 940 src->isStarred = FALSE; |
| 941 } |
| 942 src->previousCp = cp; |
| 943 return src->current; |
| 944 } |
| 945 |
| 946 /* |
| 947 * Partially parses the next token, keeps the indices in src->parsedToken, and u
pdates the counters. |
| 948 * |
| 949 * This routine parses and separates almost all tokens. The following are the sy
ntax characters recognized. |
| 950 * # : Comment character |
| 951 * & : Reset operator |
| 952 * = : Equality |
| 953 * < : Primary collation |
| 954 * << : Secondary collation |
| 955 * <<< : Tertiary collation |
| 956 * ; : Secondary collation |
| 957 * , : Tertiary collation |
| 958 * / : Expansions |
| 959 * | : Prefix |
| 960 * - : Range |
| 961 |
| 962 * ! : Java Thai modifier, ignored |
| 963 * @ : French only |
| 964 |
| 965 * [] : Options |
| 966 * '' : Quotes |
| 967 * |
| 968 * Along with operators =, <, <<, <<<, the operator * is supported to indicate
a list. For example, &a<*bcdexyz |
| 969 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &
a*b-ex-z is equivalent to the above. |
| 970 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is
parsed as three tokens - "&a", |
| 971 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, wheth
er in a range and the previous |
| 972 * character returned as cached so that the calling program can do further spli
tting. |
| 973 */ |
| 974 static const UChar* |
| 975 ucol_tok_parseNextTokenInternal(UColTokenParser *src, |
| 976 UBool startOfRules, |
| 977 UParseError *parseError, |
| 978 UErrorCode *status) |
| 979 { |
| 980 UBool variableTop = FALSE; |
| 981 UBool top = FALSE; |
| 982 UBool inChars = TRUE; |
| 983 UBool inQuote = FALSE; |
| 984 UBool wasInQuote = FALSE; |
| 985 uint8_t before = 0; |
| 986 UBool isEscaped = FALSE; |
| 987 |
| 988 // TODO: replace these variables with src->parsedToken counterparts |
| 989 // no need to use them anymore since we have src->parsedToken. |
| 990 // Ideally, token parser would be a nice class... Once, when I have |
| 991 // more time (around 2020 probably). |
| 992 uint32_t newExtensionLen = 0; |
| 993 uint32_t extensionOffset = 0; |
| 994 uint32_t newStrength = UCOL_TOK_UNSET; |
| 995 UChar buff[10]; |
| 996 |
| 997 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; |
| 998 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; |
| 999 src->parsedToken.indirectIndex = 0; |
| 1000 |
| 1001 while (src->current < src->end) { |
| 1002 UChar ch = *(src->current); |
| 1003 |
| 1004 if (inQuote) { |
| 1005 if (ch == 0x0027/*'\''*/) { |
| 1006 inQuote = FALSE; |
| 1007 } else { |
| 1008 if ((src->parsedToken.charsLen == 0) || inChars) { |
| 1009 if(src->parsedToken.charsLen == 0) { |
| 1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurr
ent - src->source); |
| 1011 } |
| 1012 src->parsedToken.charsLen++; |
| 1013 } else { |
| 1014 if(newExtensionLen == 0) { |
| 1015 extensionOffset = (uint32_t)(src->extraCurrent - src->so
urce); |
| 1016 } |
| 1017 newExtensionLen++; |
| 1018 } |
| 1019 } |
| 1020 }else if(isEscaped){ |
| 1021 isEscaped =FALSE; |
| 1022 if (newStrength == UCOL_TOK_UNSET) { |
| 1023 *status = U_INVALID_FORMAT_ERROR; |
| 1024 syntaxError(src->source,(int32_t)(src->current-src->source),(int
32_t)(src->end-src->source),parseError); |
| 1025 DBG_FORMAT_ERROR |
| 1026 return NULL; |
| 1027 // enabling rules to start with non-tokens a < b |
| 1028 // newStrength = UCOL_TOK_RESET; |
| 1029 } |
| 1030 if(ch != 0x0000 && src->current != src->end) { |
| 1031 if (inChars) { |
| 1032 if(src->parsedToken.charsLen == 0) { |
| 1033 src->parsedToken.charsOffset = (uint32_t)(src->current -
src->source); |
| 1034 } |
| 1035 src->parsedToken.charsLen++; |
| 1036 } else { |
| 1037 if(newExtensionLen == 0) { |
| 1038 extensionOffset = (uint32_t)(src->current - src->source)
; |
| 1039 } |
| 1040 newExtensionLen++; |
| 1041 } |
| 1042 } |
| 1043 }else { |
| 1044 if(!uprv_isRuleWhiteSpace(ch)) { |
| 1045 /* Sets the strength for this entry */ |
| 1046 switch (ch) { |
| 1047 case 0x003D/*'='*/ : |
| 1048 if (newStrength != UCOL_TOK_UNSET) { |
| 1049 goto EndOfLoop; |
| 1050 } |
| 1051 |
| 1052 /* if we start with strength, we'll reset to top */ |
| 1053 if(startOfRules == TRUE) { |
| 1054 src->parsedToken.indirectIndex = 5; |
| 1055 top = ucol_tok_doSetTop(src, status); |
| 1056 newStrength = UCOL_TOK_RESET; |
| 1057 goto EndOfLoop; |
| 1058 } |
| 1059 newStrength = UCOL_IDENTICAL; |
| 1060 if(*(src->current+1) == 0x002A) {/*'*'*/ |
| 1061 src->current++; |
| 1062 src->isStarred = TRUE; |
| 1063 } |
| 1064 break; |
| 1065 |
| 1066 case 0x002C/*','*/: |
| 1067 if (newStrength != UCOL_TOK_UNSET) { |
| 1068 goto EndOfLoop; |
| 1069 } |
| 1070 |
| 1071 /* if we start with strength, we'll reset to top */ |
| 1072 if(startOfRules == TRUE) { |
| 1073 src->parsedToken.indirectIndex = 5; |
| 1074 top = ucol_tok_doSetTop(src, status); |
| 1075 newStrength = UCOL_TOK_RESET; |
| 1076 goto EndOfLoop; |
| 1077 } |
| 1078 newStrength = UCOL_TERTIARY; |
| 1079 break; |
| 1080 |
| 1081 case 0x003B/*';'*/: |
| 1082 if (newStrength != UCOL_TOK_UNSET) { |
| 1083 goto EndOfLoop; |
| 1084 } |
| 1085 |
| 1086 /* if we start with strength, we'll reset to top */ |
| 1087 if(startOfRules == TRUE) { |
| 1088 src->parsedToken.indirectIndex = 5; |
| 1089 top = ucol_tok_doSetTop(src, status); |
| 1090 newStrength = UCOL_TOK_RESET; |
| 1091 goto EndOfLoop; |
| 1092 } |
| 1093 newStrength = UCOL_SECONDARY; |
| 1094 break; |
| 1095 |
| 1096 case 0x003C/*'<'*/: |
| 1097 if (newStrength != UCOL_TOK_UNSET) { |
| 1098 goto EndOfLoop; |
| 1099 } |
| 1100 |
| 1101 /* if we start with strength, we'll reset to top */ |
| 1102 if(startOfRules == TRUE) { |
| 1103 src->parsedToken.indirectIndex = 5; |
| 1104 top = ucol_tok_doSetTop(src, status); |
| 1105 newStrength = UCOL_TOK_RESET; |
| 1106 goto EndOfLoop; |
| 1107 } |
| 1108 /* before this, do a scan to verify whether this is */ |
| 1109 /* another strength */ |
| 1110 if(*(src->current+1) == 0x003C) { |
| 1111 src->current++; |
| 1112 if(*(src->current+1) == 0x003C) { |
| 1113 src->current++; /* three in a row! */ |
| 1114 newStrength = UCOL_TERTIARY; |
| 1115 } else { /* two in a row */ |
| 1116 newStrength = UCOL_SECONDARY; |
| 1117 } |
| 1118 } else { /* just one */ |
| 1119 newStrength = UCOL_PRIMARY; |
| 1120 } |
| 1121 if(*(src->current+1) == 0x002A) {/*'*'*/ |
| 1122 src->current++; |
| 1123 src->isStarred = TRUE; |
| 1124 } |
| 1125 break; |
| 1126 |
| 1127 case 0x0026/*'&'*/: |
| 1128 if (newStrength != UCOL_TOK_UNSET) { |
| 1129 /**/ |
| 1130 goto EndOfLoop; |
| 1131 } |
| 1132 |
| 1133 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
| 1134 break; |
| 1135 |
| 1136 case 0x005b/*'['*/: |
| 1137 /* options - read an option, analyze it */ |
| 1138 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { |
| 1139 uint8_t result = ucol_uprv_tok_readAndSetOption(src, sta
tus); |
| 1140 if(U_SUCCESS(*status)) { |
| 1141 if(result & UCOL_TOK_TOP) { |
| 1142 if(newStrength == UCOL_TOK_RESET) { |
| 1143 top = ucol_tok_doSetTop(src, status); |
| 1144 if(before) { // This is a combination of bef
ore and indirection like '&[before 2][first regular]<b' |
| 1145 src->parsedToken.charsLen+=2; |
| 1146 buff[0] = 0x002d; |
| 1147 buff[1] = before; |
| 1148 ucol_tok_addToExtraCurrent(src, buff, 2,
status); |
| 1149 } |
| 1150 |
| 1151 src->current++; |
| 1152 goto EndOfLoop; |
| 1153 } else { |
| 1154 *status = U_INVALID_FORMAT_ERROR; |
| 1155 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); |
| 1156 DBG_FORMAT_ERROR |
| 1157 } |
| 1158 } else if(result & UCOL_TOK_VARIABLE_TOP) { |
| 1159 if(newStrength != UCOL_TOK_RESET && newStrength
!= UCOL_TOK_UNSET) { |
| 1160 variableTop = TRUE; |
| 1161 src->parsedToken.charsOffset = (uint32_t)(sr
c->extraCurrent - src->source); |
| 1162 src->parsedToken.charsLen = 1; |
| 1163 buff[0] = 0xFFFF; |
| 1164 ucol_tok_addToExtraCurrent(src, buff, 1, sta
tus); |
| 1165 src->current++; |
| 1166 goto EndOfLoop; |
| 1167 } else { |
| 1168 *status = U_INVALID_FORMAT_ERROR; |
| 1169 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); |
| 1170 DBG_FORMAT_ERROR |
| 1171 } |
| 1172 } else if (result & UCOL_TOK_BEFORE){ |
| 1173 if(newStrength == UCOL_TOK_RESET) { |
| 1174 before = result & UCOL_TOK_BEFORE; |
| 1175 } else { |
| 1176 *status = U_INVALID_FORMAT_ERROR; |
| 1177 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); |
| 1178 DBG_FORMAT_ERROR |
| 1179 } |
| 1180 } |
| 1181 } else { |
| 1182 *status = U_INVALID_FORMAT_ERROR; |
| 1183 syntaxError(src->source,(int32_t)(src->current-src->
source),(int32_t)(src->end-src->source),parseError); |
| 1184 DBG_FORMAT_ERROR |
| 1185 return NULL; |
| 1186 } |
| 1187 } |
| 1188 break; |
| 1189 case 0x0021/*! skip java thai modifier reordering*/: |
| 1190 break; |
| 1191 case 0x002F/*'/'*/: |
| 1192 wasInQuote = FALSE; /* if we were copying source characters,
we want to stop now */ |
| 1193 inChars = FALSE; /* we're now processing expansion */ |
| 1194 break; |
| 1195 case 0x005C /* back slash for escaped chars */: |
| 1196 isEscaped = TRUE; |
| 1197 break; |
| 1198 /* found a quote, we're gonna start copying */ |
| 1199 case 0x0027/*'\''*/: |
| 1200 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal unt
il we have a strength */ |
| 1201 *status = U_INVALID_FORMAT_ERROR; |
| 1202 syntaxError(src->source,(int32_t)(src->current-src->source
),(int32_t)(src->end-src->source),parseError); |
| 1203 DBG_FORMAT_ERROR |
| 1204 return NULL; |
| 1205 // enabling rules to start with a non-token character a <
b |
| 1206 // newStrength = UCOL_TOK_RESET; |
| 1207 } |
| 1208 |
| 1209 inQuote = TRUE; |
| 1210 |
| 1211 if(inChars) { /* we're doing characters */ |
| 1212 if(wasInQuote == FALSE) { |
| 1213 src->parsedToken.charsOffset = (uint32_t)(src->extra
Current - src->source); |
| 1214 } |
| 1215 if (src->parsedToken.charsLen != 0) { |
| 1216 ucol_tok_addToExtraCurrent(src, src->current - src->
parsedToken.charsLen, src->parsedToken.charsLen, status); |
| 1217 } |
| 1218 src->parsedToken.charsLen++; |
| 1219 } else { /* we're doing an expansion */ |
| 1220 if(wasInQuote == FALSE) { |
| 1221 extensionOffset = (uint32_t)(src->extraCurrent - src
->source); |
| 1222 } |
| 1223 if (newExtensionLen != 0) { |
| 1224 ucol_tok_addToExtraCurrent(src, src->current - newEx
tensionLen, newExtensionLen, status); |
| 1225 } |
| 1226 newExtensionLen++; |
| 1227 } |
| 1228 |
| 1229 wasInQuote = TRUE; |
| 1230 |
| 1231 ch = *(++(src->current)); |
| 1232 if(ch == 0x0027) { /* copy the double quote */ |
| 1233 ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
| 1234 inQuote = FALSE; |
| 1235 } |
| 1236 break; |
| 1237 |
| 1238 /* '@' is french only if the strength is not currently set *
/ |
| 1239 /* if it is, it's just a regular character in collation rule
s */ |
| 1240 case 0x0040/*'@'*/: |
| 1241 if (newStrength == UCOL_TOK_UNSET) { |
| 1242 src->opts->frenchCollation = UCOL_ON; |
| 1243 break; |
| 1244 } |
| 1245 |
| 1246 case 0x007C /*|*/: /* this means we have actually been reading p
refix part */ |
| 1247 // we want to store read characters to the prefix part and c
ontinue reading |
| 1248 // the characters (proper way would be to restart reading th
e chars, but in |
| 1249 // that case we would have to complicate the token hasher, w
hich I do not |
| 1250 // intend to play with. Instead, we will do prefixes when pr
efixes are due |
| 1251 // (before adding the elements). |
| 1252 src->parsedToken.prefixOffset = src->parsedToken.charsOffset
; |
| 1253 src->parsedToken.prefixLen = src->parsedToken.charsLen; |
| 1254 |
| 1255 if(inChars) { /* we're doing characters */ |
| 1256 if(wasInQuote == FALSE) { |
| 1257 src->parsedToken.charsOffset = (uint32_t)(src->extra
Current - src->source); |
| 1258 } |
| 1259 if (src->parsedToken.charsLen != 0) { |
| 1260 ucol_tok_addToExtraCurrent(src, src->current - src->
parsedToken.charsLen, src->parsedToken.charsLen, status); |
| 1261 } |
| 1262 src->parsedToken.charsLen++; |
| 1263 } |
| 1264 |
| 1265 wasInQuote = TRUE; |
| 1266 |
| 1267 do { |
| 1268 ch = *(++(src->current)); |
| 1269 // skip whitespace between '|' and the character |
| 1270 } while (uprv_isRuleWhiteSpace(ch)); |
| 1271 break; |
| 1272 |
| 1273 //charsOffset = 0; |
| 1274 //newCharsLen = 0; |
| 1275 //break; // We want to store the whole prefix/character sequ
ence. If we break |
| 1276 // the '|' is going to get lost. |
| 1277 |
| 1278 case 0x002D /*-*/: /* A range. */ |
| 1279 if (newStrength != UCOL_TOK_UNSET) { |
| 1280 // While processing the pending token, the isStarred field |
| 1281 // is reset, so it needs to be saved for the next |
| 1282 // invocation. |
| 1283 src->savedIsStarred = src->isStarred; |
| 1284 goto EndOfLoop; |
| 1285 } |
| 1286 src->isStarred = src->savedIsStarred; |
| 1287 |
| 1288 // Ranges are valid only in starred tokens. |
| 1289 if (!src->isStarred) { |
| 1290 *status = U_INVALID_FORMAT_ERROR; |
| 1291 syntaxError(src->source,(int32_t)(src->current-src->source)
,(int32_t)(src->end-src->source),parseError); |
| 1292 DBG_FORMAT_ERROR |
| 1293 return NULL; |
| 1294 } |
| 1295 newStrength = src->parsedToken.strength; |
| 1296 src->inRange = TRUE; |
| 1297 break; |
| 1298 |
| 1299 case 0x0023 /*#*/: /* this is a comment, skip everything through
the end of line */ |
| 1300 do { |
| 1301 ch = *(++(src->current)); |
| 1302 } while (!isCharNewLine(ch)); |
| 1303 |
| 1304 break; |
| 1305 default: |
| 1306 if (newStrength == UCOL_TOK_UNSET) { |
| 1307 *status = U_INVALID_FORMAT_ERROR; |
| 1308 syntaxError(src->source,(int32_t)(src->current-src->source
),(int32_t)(src->end-src->source),parseError); |
| 1309 DBG_FORMAT_ERROR |
| 1310 return NULL; |
| 1311 } |
| 1312 |
| 1313 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
| 1314 *status = U_INVALID_FORMAT_ERROR; |
| 1315 syntaxError(src->source,(int32_t)(src->current-src->sour
ce),(int32_t)(src->end-src->source),parseError); |
| 1316 DBG_FORMAT_ERROR |
| 1317 return NULL; |
| 1318 } |
| 1319 |
| 1320 if(ch == 0x0000 && src->current+1 == src->end) { |
| 1321 break; |
| 1322 } |
| 1323 |
| 1324 if (inChars) { |
| 1325 if(src->parsedToken.charsLen == 0) { |
| 1326 src->parsedToken.charsOffset = (uint32_t)(src->curre
nt - src->source); |
| 1327 } |
| 1328 src->parsedToken.charsLen++; |
| 1329 } else { |
| 1330 if(newExtensionLen == 0) { |
| 1331 extensionOffset = (uint32_t)(src->current - src->sou
rce); |
| 1332 } |
| 1333 newExtensionLen++; |
| 1334 } |
| 1335 |
| 1336 break; |
| 1337 } |
| 1338 } |
| 1339 } |
| 1340 |
| 1341 if(wasInQuote) { |
| 1342 if(ch != 0x27) { |
| 1343 if(inQuote || !uprv_isRuleWhiteSpace(ch)) { |
| 1344 ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
| 1345 } |
| 1346 } |
| 1347 } |
| 1348 |
| 1349 src->current++; |
| 1350 } |
| 1351 |
| 1352 EndOfLoop: |
| 1353 wasInQuote = FALSE; |
| 1354 if (newStrength == UCOL_TOK_UNSET) { |
| 1355 return NULL; |
| 1356 } |
| 1357 |
| 1358 if (src->parsedToken.charsLen == 0 && top == FALSE) { |
| 1359 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(sr
c->end-src->source),parseError); |
| 1360 *status = U_INVALID_FORMAT_ERROR; |
| 1361 DBG_FORMAT_ERROR |
| 1362 return NULL; |
| 1363 } |
| 1364 |
| 1365 src->parsedToken.strength = newStrength; |
| 1366 src->parsedToken.extensionOffset = extensionOffset; |
| 1367 src->parsedToken.extensionLen = newExtensionLen; |
| 1368 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL
_TOK_TOP * (top?1:0)) | before; |
| 1369 |
| 1370 return src->current; |
| 1371 } |
| 1372 |
| 1373 /* |
| 1374 * Parses the next token, keeps the indices in src->parsedToken, and updates the
counters. |
| 1375 * @see ucol_tok_parseNextTokenInternal() for the description of what operators
are supported. |
| 1376 * |
| 1377 * In addition to what ucol_tok_parseNextTokenInternal() does, this function doe
s the following: |
| 1378 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. Thi
s function separates |
| 1379 * it to separate tokens and returns one by one. In order to do that, the n
ecessary states are |
| 1380 * cached as member variables of the token parser. |
| 1381 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes ch
aracters up to the |
| 1382 * starting character as a single list token (which is separated into indivi
dual characters here) |
| 1383 * and as another list token starting with the last character in the range.
Before expanding it |
| 1384 * as a list of tokens, this function expands the range by filling the inter
mediate characters and |
| 1385 * returns them one by one as separate tokens. |
| 1386 * Necessary checks are done for invalid combinations. |
| 1387 */ |
| 1388 U_CAPI const UChar* U_EXPORT2 |
| 1389 ucol_tok_parseNextToken(UColTokenParser *src, |
| 1390 UBool startOfRules, |
| 1391 UParseError *parseError, |
| 1392 UErrorCode *status) |
| 1393 { |
| 1394 const UChar *nextToken; |
| 1395 |
| 1396 if (src->inRange) { |
| 1397 // We are not done processing a range. Continue it. |
| 1398 return ucol_tok_processNextCodePointInRange(src, status); |
| 1399 } else if (src->isStarred) { |
| 1400 // We are not done processing a starred token. Continue it. |
| 1401 return ucol_tok_processNextTokenInStarredList(src); |
| 1402 } |
| 1403 |
| 1404 // Get the next token. |
| 1405 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, sta
tus); |
| 1406 |
| 1407 if (nextToken == NULL) { |
| 1408 return NULL; |
| 1409 } |
| 1410 |
| 1411 if (src->inRange) { |
| 1412 // A new range has started. |
| 1413 // Check whether it is a chain of ranges with more than one hyphen. |
| 1414 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { |
| 1415 *status = U_INVALID_FORMAT_ERROR; |
| 1416 syntaxError(src->source,src->parsedToken.charsOffset-1, |
| 1417 src->parsedToken.charsOffset+src->parsedToken.charsLen, pars
eError); |
| 1418 DBG_FORMAT_ERROR |
| 1419 return NULL; |
| 1420 } |
| 1421 |
| 1422 // The current token indicates the second code point of the range. |
| 1423 // Process just that, and then proceed with the star. |
| 1424 src->currentStarredCharIndex = src->parsedToken.charsOffset; |
| 1425 U16_NEXT(src->source, src->currentStarredCharIndex, |
| 1426 (uint32_t)(src->end - src->source), src->lastRangeCp); |
| 1427 if (src->lastRangeCp <= src->previousCp) { |
| 1428 *status = U_INVALID_FORMAT_ERROR; |
| 1429 syntaxError(src->source,src->parsedToken.charsOffset-1, |
| 1430 src->parsedToken.charsOffset+src->parsedToken.charsLen,parse
Error); |
| 1431 DBG_FORMAT_ERROR |
| 1432 return NULL; |
| 1433 } |
| 1434 |
| 1435 // Set current range code point to process the range loop |
| 1436 src->currentRangeCp = src->previousCp + 1; |
| 1437 |
| 1438 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.
charsLen - 1; |
| 1439 |
| 1440 return ucol_tok_processNextCodePointInRange(src, status); |
| 1441 } else if (src->isStarred) { |
| 1442 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharInd
ex_ so that |
| 1443 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive,
need to be |
| 1444 // separated into several tokens and returned. |
| 1445 src->currentStarredCharIndex = src->parsedToken.charsOffset; |
| 1446 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken
.charsLen - 1; |
| 1447 |
| 1448 return ucol_tok_processNextTokenInStarredList(src); |
| 1449 } else { |
| 1450 // Set previous codepoint |
| 1451 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end -
src->source), src->previousCp); |
| 1452 } |
| 1453 return nextToken; |
| 1454 } |
| 1455 |
| 1456 |
| 1457 /* |
| 1458 Processing Description |
| 1459 1 Build a ListList. Each list has a header, which contains two lists (positive |
| 1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and |
| 1461 reset may be null. |
| 1462 2 As you process, you keep a LAST pointer that points to the last token you |
| 1463 handled. |
| 1464 |
| 1465 */ |
| 1466 |
| 1467 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand,
uint32_t *expandNext, |
| 1468 UParseError *parseError, UErrorCode *statu
s) |
| 1469 { |
| 1470 if(src->resultLen == src->listCapacity) { |
| 1471 // Unfortunately, this won't work, as we store addresses of lhs in token |
| 1472 src->listCapacity *= 2; |
| 1473 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*s
izeof(UColTokListHeader)); |
| 1474 if(src->lh == NULL) { |
| 1475 *status = U_MEMORY_ALLOCATION_ERROR; |
| 1476 return NULL; |
| 1477 } |
| 1478 } |
| 1479 /* do the reset thing */ |
| 1480 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| 1481 /* test for NULL */ |
| 1482 if (sourceToken == NULL) { |
| 1483 *status = U_MEMORY_ALLOCATION_ERROR; |
| 1484 return NULL; |
| 1485 } |
| 1486 sourceToken->rulesToParseHdl = &(src->source); |
| 1487 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.cha
rsOffset; |
| 1488 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedTo
ken.extensionOffset; |
| 1489 |
| 1490 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
| 1491 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffs
et); |
| 1492 |
| 1493 // keep the flags around so that we know about before |
| 1494 sourceToken->flags = src->parsedToken.flags; |
| 1495 |
| 1496 if(src->parsedToken.prefixOffset != 0) { |
| 1497 // this is a syntax error |
| 1498 *status = U_INVALID_FORMAT_ERROR; |
| 1499 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.
charsOffset+src->parsedToken.charsLen,parseError); |
| 1500 DBG_FORMAT_ERROR |
| 1501 uprv_free(sourceToken); |
| 1502 return 0; |
| 1503 } else { |
| 1504 sourceToken->prefix = 0; |
| 1505 } |
| 1506 |
| 1507 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should als
o handle reverse */ |
| 1508 sourceToken->strength = UCOL_TOK_RESET; |
| 1509 sourceToken->next = NULL; |
| 1510 sourceToken->previous = NULL; |
| 1511 sourceToken->noOfCEs = 0; |
| 1512 sourceToken->noOfExpCEs = 0; |
| 1513 sourceToken->listHeader = &src->lh[src->resultLen]; |
| 1514 |
| 1515 src->lh[src->resultLen].first = NULL; |
| 1516 src->lh[src->resultLen].last = NULL; |
| 1517 src->lh[src->resultLen].first = NULL; |
| 1518 src->lh[src->resultLen].last = NULL; |
| 1519 |
| 1520 src->lh[src->resultLen].reset = sourceToken; |
| 1521 |
| 1522 /* |
| 1523 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
| 1524 First convert all expansions into normal form. Examples: |
| 1525 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
| 1526 d * ... into &x * c/y * d * ... |
| 1527 Note: reset values can never have expansions, although they can cause the |
| 1528 very next item to have one. They may be contractions, if they are found |
| 1529 earlier in the list. |
| 1530 */ |
| 1531 *expandNext = 0; |
| 1532 if(expand != NULL) { |
| 1533 /* check to see if there is an expansion */ |
| 1534 if(src->parsedToken.charsLen > 1) { |
| 1535 uint32_t resetCharsOffset; |
| 1536 resetCharsOffset = (uint32_t)(expand - src->source); |
| 1537 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOff
set ) << 24) | src->parsedToken.charsOffset; |
| 1538 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOf
fset - resetCharsOffset)<<24) | (resetCharsOffset); |
| 1539 } |
| 1540 } |
| 1541 |
| 1542 src->resultLen++; |
| 1543 |
| 1544 uhash_put(src->tailored, sourceToken, sourceToken, status); |
| 1545 |
| 1546 return sourceToken; |
| 1547 } |
| 1548 |
| 1549 static |
| 1550 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken,
uint8_t strength, UParseError *parseError, UErrorCode *status) { |
| 1551 if(U_FAILURE(*status)) { |
| 1552 return NULL; |
| 1553 } |
| 1554 /* this is a virgin before - we need to fish the anchor from the UCA */ |
| 1555 collIterate s; |
| 1556 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; |
| 1557 uint32_t CE, SecondCE; |
| 1558 uint32_t invPos; |
| 1559 if(sourceToken != NULL) { |
| 1560 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFF
FFF), 1, &s, status); |
| 1561 } else { |
| 1562 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset
/**charsOffset*/, 1, &s, status); |
| 1563 } |
| 1564 if(U_FAILURE(*status)) { |
| 1565 return NULL; |
| 1566 } |
| 1567 |
| 1568 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; |
| 1569 baseContCE = ucol_getNextCE(src->UCA, &s, status); |
| 1570 if(baseContCE == UCOL_NO_MORE_CES) { |
| 1571 baseContCE = 0; |
| 1572 } |
| 1573 |
| 1574 |
| 1575 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); |
| 1576 uint32_t ch = 0; |
| 1577 uint32_t expandNext = 0; |
| 1578 UColToken key; |
| 1579 |
| 1580 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseC
E & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
| 1581 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRI
MARYMASK) >> 16); |
| 1582 uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
| 1583 ch = uprv_uca_getCodePointFromRaw(raw-1); |
| 1584 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
| 1585 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
| 1586 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MA
RKER; |
| 1587 |
| 1588 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc
e); |
| 1589 *src->extraCurrent++ = 0xFFFE; |
| 1590 *src->extraCurrent++ = (UChar)ch; |
| 1591 src->parsedToken.charsLen++; |
| 1592 |
| 1593 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->pa
rsedToken.charsOffset/**charsOffset*/; |
| 1594 key.rulesToParseHdl = &(src->source); |
| 1595 |
| 1596 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
| 1597 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1598 |
| 1599 if(sourceToken == NULL) { |
| 1600 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 1601 if(isContinuation(SecondCE)) { |
| 1602 src->lh[src->resultLen].baseContCE = SecondCE; |
| 1603 } else { |
| 1604 src->lh[src->resultLen].baseContCE = 0; |
| 1605 } |
| 1606 src->lh[src->resultLen].nextCE = 0; |
| 1607 src->lh[src->resultLen].nextContCE = 0; |
| 1608 src->lh[src->resultLen].previousCE = 0; |
| 1609 src->lh[src->resultLen].previousContCE = 0; |
| 1610 |
| 1611 src->lh[src->resultLen].indirect = FALSE; |
| 1612 |
| 1613 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, s
tatus); |
| 1614 } |
| 1615 |
| 1616 } else { |
| 1617 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, str
ength); |
| 1618 |
| 1619 // we got the previous CE. Now we need to see if the difference between |
| 1620 // the two CEs is really of the requested strength. |
| 1621 // if it's a bigger difference (we asked for secondary and got primary),
we |
| 1622 // need to modify the CE. |
| 1623 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < stre
ngth) { |
| 1624 // adjust the strength |
| 1625 // now we are in the situation where our baseCE should actually be m
odified in |
| 1626 // order to get the CE in the right position. |
| 1627 if(strength == UCOL_SECONDARY) { |
| 1628 CE = baseCE - 0x0200; |
| 1629 } else { // strength == UCOL_TERTIARY |
| 1630 CE = baseCE - 0x02; |
| 1631 } |
| 1632 if(baseContCE) { |
| 1633 if(strength == UCOL_SECONDARY) { |
| 1634 SecondCE = baseContCE - 0x0200; |
| 1635 } else { // strength == UCOL_TERTIARY |
| 1636 SecondCE = baseContCE - 0x02; |
| 1637 } |
| 1638 } |
| 1639 } |
| 1640 |
| 1641 #if 0 |
| 1642 // the code below relies on getting a code point from the inverse table,
in order to be |
| 1643 // able to merge the situations like &x < 9 &[before 1]a < d. This won't
work: |
| 1644 // 1. There are many code points that have the same CE |
| 1645 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2
] are broken. |
| 1646 // Also, in case when there is no equivalent strength before an element,
we have to actually |
| 1647 // construct one. For example, &[before 2]a << x won't result in x << a,
because the element |
| 1648 // before a is a primary difference. |
| 1649 |
| 1650 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->t
able); |
| 1651 |
| 1652 |
| 1653 ch = CETable[3*invPos+2]; |
| 1654 |
| 1655 if((ch & UCOL_INV_SIZEMASK) != 0) { |
| 1656 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->c
onts); |
| 1657 uint32_t offset = (ch & UCOL_INV_OFFSETMASK); |
| 1658 ch = conts[offset]; |
| 1659 } |
| 1660 |
| 1661 *src->extraCurrent++ = (UChar)ch; |
| 1662 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc
e - 1); |
| 1663 src->parsedToken.charsLen = 1; |
| 1664 |
| 1665 // We got an UCA before. However, this might have been tailored. |
| 1666 // example: |
| 1667 // &\u30ca = \u306a |
| 1668 // &[before 3]\u306a<<<\u306a|\u309d |
| 1669 |
| 1670 |
| 1671 // uint32_t key = (*newCharsLen << 24) | *charsOffset; |
| 1672 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->pa
rsedToken.charsOffset/**charsOffset*/; |
| 1673 key.rulesToParseHdl = &(src->source); |
| 1674 |
| 1675 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
| 1676 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1677 #endif |
| 1678 |
| 1679 // here is how it should be. The situation such as &[before 1]a < x, sho
uld be |
| 1680 // resolved exactly as if we wrote &a > x. |
| 1681 // therefore, I don't really care if the UCA value before a has been cha
nged. |
| 1682 // However, I do care if the strength between my element and the previou
s element |
| 1683 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2],
then i'll |
| 1684 // have to construct the base CE. |
| 1685 |
| 1686 |
| 1687 |
| 1688 // if we found a tailored thing, we have to use the UCA value and constr
uct |
| 1689 // a new reset token with constructed name |
| 1690 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
| 1691 // character to which we want to anchor is already tailored. |
| 1692 // We need to construct a new token which will be the anchor |
| 1693 // point |
| 1694 //*(src->extraCurrent-1) = 0xFFFE; |
| 1695 //*src->extraCurrent++ = (UChar)ch; |
| 1696 // grab before |
| 1697 src->parsedToken.charsOffset -= 10; |
| 1698 src->parsedToken.charsLen += 10; |
| 1699 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 1700 if(isContinuation(SecondCE)) { |
| 1701 src->lh[src->resultLen].baseContCE = SecondCE; |
| 1702 } else { |
| 1703 src->lh[src->resultLen].baseContCE = 0; |
| 1704 } |
| 1705 src->lh[src->resultLen].nextCE = 0; |
| 1706 src->lh[src->resultLen].nextContCE = 0; |
| 1707 src->lh[src->resultLen].previousCE = 0; |
| 1708 src->lh[src->resultLen].previousContCE = 0; |
| 1709 |
| 1710 src->lh[src->resultLen].indirect = FALSE; |
| 1711 |
| 1712 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, statu
s); |
| 1713 //} |
| 1714 } |
| 1715 |
| 1716 return sourceToken; |
| 1717 |
| 1718 } |
| 1719 |
/*
 * ucol_tok_assembleTokenList
 *
 * Drives the rule parser: calls ucol_tok_parseNextToken() repeatedly while
 * src->current < src->end (or src->isStarred is set) and files every parsed
 * token into the token lists headed by the entries of src->lh.
 *
 * For each successfully parsed token:
 *   - the token text is looked up in the src->tailored hash, using a key
 *     packed as (charsLen << 24 | charsOffset);
 *   - a relation (strength != UCOL_TOK_RESET) either allocates a new UColToken
 *     and puts it in the hash, or unlinks the existing one from its current
 *     list; the token is then linked into the list of lastToken's header at a
 *     position chosen by strength and polarity, and its expansion is set
 *     (combined with the pending expandNext, if any);
 *   - a reset first drops an immediately preceding empty reset, then looks for
 *     an already tailored token (also trying shorter prefixes, in which case
 *     the remainder becomes expandNext), handles [before n] and indirect
 *     ("top") positions, and otherwise fills in src->lh[src->resultLen] and
 *     calls ucol_tok_initAReset().
 * After each token, lastToken is set to the token just processed.
 *
 * Parameters:
 *   src        - parser state; src->lh, src->tailored, src->resultLen,
 *                src->varTop and src->extraCurrent are modified here.
 *   parseError - passed through to the parser and to syntaxError().
 *   status     - in/out error code; nothing is done if it already indicates
 *                failure.
 *
 * Returns src->resultLen after discarding a trailing list header whose
 * `first` is NULL, or 0 on any error. Errors set here:
 *   U_INVALID_FORMAT_ERROR     - a relation appears before any reset, or the
 *                                strength of the relation that follows a
 *                                [before n] reset does not equal n-1;
 *   U_MEMORY_ALLOCATION_ERROR  - a UColToken could not be allocated.
 */
| 1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
r, UErrorCode *status) { |
| 1721 UColToken *lastToken = NULL; |
| 1722 const UChar *parseEnd = NULL; |
| 1723 uint32_t expandNext = 0; |
| 1724 UBool variableTop = FALSE; |
| 1725 UBool top = FALSE; |
| 1726 uint16_t specs = 0; |
| 1727 UColTokListHeader *ListList = NULL; |
| 1728 |
| 1729 src->parsedToken.strength = UCOL_TOK_UNSET; |
| 1730 |
| 1731 ListList = src->lh; |
| 1732 |
| 1733 if(U_FAILURE(*status)) { |
| 1734 return 0; |
| 1735 } |
| 1736 #ifdef DEBUG_FOR_CODE_POINTS |
| 1737 char filename[35]; |
| 1738 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); |
| 1739 dfcp_fp = fopen(filename, "a"); |
| 1740 fprintf(stdout, "Output is in the file %s.\n", filename); |
| 1741 #endif |
| 1742 |
| 1743 #ifdef DEBUG_FOR_COLL_RULES |
| 1744 std::string s3; |
| 1745 UnicodeString(src->source).toUTF8String(s3); |
| 1746 std::cout << "src->source = " << s3 << std::endl; |
| 1747 #endif |
| 1748 |
| 1749 while(src->current < src->end || src->isStarred) { |
| 1750 src->parsedToken.prefixOffset = 0; |
| 1751 |
| 1752 parseEnd = ucol_tok_parseNextToken(src, |
| 1753 (UBool)(lastToken == NULL), |
| 1754 parseError, |
| 1755 status); |
| 1756 |
| 1757 specs = src->parsedToken.flags; |
| 1758 |
| 1759 |
| 1760 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |
| 1761 top = ((specs & UCOL_TOK_TOP) != 0); |
| 1762 |
| 1763 if(U_SUCCESS(*status) && parseEnd != NULL) { |
| 1764 UColToken *sourceToken = NULL; |
| 1765 //uint32_t key = 0; |
| 1766 uint32_t lastStrength = UCOL_TOK_UNSET; |
| 1767 |
| 1768 if(lastToken != NULL ) { |
| 1769 lastStrength = lastToken->strength; |
| 1770 } |
| 1771 |
| 1772 #ifdef DEBUG_FOR_CODE_POINTS |
| 1773 UChar32 cp; |
| 1774 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src
->extraEnd - src->source), cp); |
| 1775 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsed
Token.strength); |
| 1776 #endif |
| 1777 //key = newCharsLen << 24 | charsOffset; |
/* Stack-allocated lookup key: only `source` (length in the top 8 bits, offset in the low 24) and `rulesToParseHdl` are filled in, which are the fields uhash_hashTokens reads. */
| 1778 UColToken key; |
| 1779 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.char
sOffset; |
| 1780 key.rulesToParseHdl = &(src->source); |
| 1781 |
| 1782 /* 4 Lookup each source in the CharsToToken map, and find a sourceT
oken */ |
| 1783 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1784 |
| 1785 if(src->parsedToken.strength != UCOL_TOK_RESET) { |
| 1786 if(lastToken == NULL) { /* this means that rules haven't started
properly */ |
| 1787 *status = U_INVALID_FORMAT_ERROR; |
| 1788 syntaxError(src->source,0,(int32_t)(src->end-src->source),pa
rseError); |
| 1789 DBG_FORMAT_ERROR |
| 1790 return 0; |
| 1791 } |
| 1792 /* 6 Otherwise (when relation != reset) */ |
| 1793 if(sourceToken == NULL) { |
| 1794 /* If sourceToken is null, create new one, */ |
| 1795 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| 1796 /* test for NULL */ |
| 1797 if (sourceToken == NULL) { |
| 1798 *status = U_MEMORY_ALLOCATION_ERROR; |
| 1799 return 0; |
| 1800 } |
| 1801 sourceToken->rulesToParseHdl = &(src->source); |
| 1802 sourceToken->source = src->parsedToken.charsLen << 24 | src-
>parsedToken.charsOffset; |
| 1803 |
| 1804 sourceToken->debugSource = *(src->source + src->parsedToken.
charsOffset); |
| 1805 |
| 1806 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src
->parsedToken.prefixOffset; |
| 1807 sourceToken->debugPrefix = *(src->source + src->parsedToken.
prefixOffset); |
| 1808 |
| 1809 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO:
this should also handle reverse */ |
| 1810 sourceToken->next = NULL; |
| 1811 sourceToken->previous = NULL; |
| 1812 sourceToken->noOfCEs = 0; |
| 1813 sourceToken->noOfExpCEs = 0; |
| 1814 // keep the flags around so that we know about before |
| 1815 sourceToken->flags = src->parsedToken.flags; |
| 1816 uhash_put(src->tailored, sourceToken, sourceToken, status); |
| 1817 if(U_FAILURE(*status)) { |
| 1818 return 0; |
| 1819 } |
| 1820 } else { |
| 1821 /* we could have fished out a reset here */ |
| 1822 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != s
ourceToken) { |
| 1823 /* otherwise remove sourceToken from where it was. */ |
| 1824 if(sourceToken->next != NULL) { |
| 1825 if(sourceToken->next->strength > sourceToken->streng
th) { |
| 1826 sourceToken->next->strength = sourceToken->stren
gth; |
| 1827 } |
| 1828 sourceToken->next->previous = sourceToken->previous; |
| 1829 } else { |
| 1830 sourceToken->listHeader->last = sourceToken->previou
s; |
| 1831 } |
| 1832 |
| 1833 if(sourceToken->previous != NULL) { |
| 1834 sourceToken->previous->next = sourceToken->next; |
| 1835 } else { |
| 1836 sourceToken->listHeader->first = sourceToken->next; |
| 1837 } |
| 1838 sourceToken->next = NULL; |
| 1839 sourceToken->previous = NULL; |
| 1840 } |
| 1841 } |
| 1842 |
| 1843 sourceToken->strength = src->parsedToken.strength; |
| 1844 sourceToken->listHeader = lastToken->listHeader; |
| 1845 |
| 1846 /* |
| 1847 1. Find the strongest strength in each list, and set strongestP
and strongestN |
| 1848 accordingly in the headers. |
| 1849 */ |
| 1850 if(lastStrength == UCOL_TOK_RESET |
| 1851 || sourceToken->listHeader->first == 0) { |
| 1852 /* If LAST is a reset |
| 1853 insert sourceToken in the list. */ |
| 1854 if(sourceToken->listHeader->first == 0) { |
| 1855 sourceToken->listHeader->first = sourceToken; |
| 1856 sourceToken->listHeader->last = sourceToken; |
| 1857 } else { /* we need to find a place for us */ |
| 1858 /* and we'll get in front of the same strength */ |
| 1859 if(sourceToken->listHeader->first->strength <= sourc
eToken->strength) { |
| 1860 sourceToken->next = sourceToken->listHeader->fir
st; |
| 1861 sourceToken->next->previous = sourceToken; |
| 1862 sourceToken->listHeader->first = sourceToken; |
| 1863 sourceToken->previous = NULL; |
| 1864 } else { |
| 1865 lastToken = sourceToken->listHeader->first; |
| 1866 while(lastToken->next != NULL && lastToken->next
->strength > sourceToken->strength) { |
| 1867 lastToken = lastToken->next; |
| 1868 } |
| 1869 if(lastToken->next != NULL) { |
| 1870 lastToken->next->previous = sourceToken; |
| 1871 } else { |
| 1872 sourceToken->listHeader->last = sourceToken; |
| 1873 } |
| 1874 sourceToken->previous = lastToken; |
| 1875 sourceToken->next = lastToken->next; |
| 1876 lastToken->next = sourceToken; |
| 1877 } |
| 1878 } |
| 1879 } else { |
| 1880 /* Otherwise (when LAST is not a reset) |
| 1881 if polarity (LAST) == polarity(relation), insert sourceT
oken after LAST, |
| 1882 otherwise insert before. |
| 1883 when inserting after or before, search to the next posit
ion with the same |
| 1884 strength in that direction. (This is called postpone ins
ertion). */ |
| 1885 if(sourceToken != lastToken) { |
| 1886 if(lastToken->polarity == sourceToken->polarity) { |
| 1887 while(lastToken->next != NULL && lastToken->next
->strength > sourceToken->strength) { |
| 1888 lastToken = lastToken->next; |
| 1889 } |
| 1890 sourceToken->previous = lastToken; |
| 1891 if(lastToken->next != NULL) { |
| 1892 lastToken->next->previous = sourceToken; |
| 1893 } else { |
| 1894 sourceToken->listHeader->last = sourceToken; |
| 1895 } |
| 1896 |
| 1897 sourceToken->next = lastToken->next; |
| 1898 lastToken->next = sourceToken; |
| 1899 } else { |
| 1900 while(lastToken->previous != NULL && lastToken->
previous->strength > sourceToken->strength) { |
| 1901 lastToken = lastToken->previous; |
| 1902 } |
| 1903 sourceToken->next = lastToken; |
| 1904 if(lastToken->previous != NULL) { |
| 1905 lastToken->previous->next = sourceToken; |
| 1906 } else { |
| 1907 sourceToken->listHeader->first = sourceToken
; |
| 1908 } |
| 1909 sourceToken->previous = lastToken->previous; |
| 1910 lastToken->previous = sourceToken; |
| 1911 } |
| 1912 } else { /* repeated one thing twice in rules, stay with
the stronger strength */ |
| 1913 if(lastStrength < sourceToken->strength) { |
| 1914 sourceToken->strength = lastStrength; |
| 1915 } |
| 1916 } |
| 1917 } |
| 1918 |
| 1919 /* if the token was a variable top, we're gonna put it in */ |
| 1920 if(variableTop == TRUE && src->varTop == NULL) { |
| 1921 variableTop = FALSE; |
| 1922 src->varTop = sourceToken; |
| 1923 } |
| 1924 |
| 1925 // Treat the expansions. |
| 1926 // There are two types of expansions: explicit (x / y) and r
eset based propagating expansions |
| 1927 // (&abc * d * e <=> &ab * d / c * e / c) |
| 1928 // if both of them are in effect for a token, they are combi
ned. |
| 1929 |
| 1930 sourceToken->expansion = src->parsedToken.extensionLen << 24
| src->parsedToken.extensionOffset; |
| 1931 |
| 1932 if(expandNext != 0) { |
| 1933 if(sourceToken->strength == UCOL_PRIMARY) { /* primary s
trength kills off the implicit expansion */ |
| 1934 expandNext = 0; |
| 1935 } else if(sourceToken->expansion == 0) { /* if there is
no expansion, implicit is just added to the token */ |
| 1936 sourceToken->expansion = expandNext; |
| 1937 } else { /* there is both explicit and implicit expansio
n. We need to make a combination */ |
| 1938 uprv_memcpy(src->extraCurrent, src->source + (expand
Next & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); |
| 1939 uprv_memcpy(src->extraCurrent+(expandNext >> 24), sr
c->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*size
of(UChar)); |
| 1940 sourceToken->expansion = (uint32_t)(((expandNext >>
24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->s
ource)); |
| 1941 src->extraCurrent += (expandNext >> 24) + src->parse
dToken.extensionLen; |
| 1942 } |
| 1943 } |
| 1944 |
| 1945 // This is just for debugging purposes |
| 1946 if(sourceToken->expansion != 0) { |
| 1947 sourceToken->debugExpansion = *(src->source + src->parse
dToken.extensionOffset); |
| 1948 } else { |
| 1949 sourceToken->debugExpansion = 0; |
| 1950 } |
| 1951 // if the previous token was a reset before, the strength of
this |
| 1952 // token must match the strength of before. Otherwise we hav
e an |
| 1953 // undefined situation. |
| 1954 // In other words, we currently have a cludge which we use t
o |
| 1955 // represent &a >> x. This is written as &[before 2]a << x. |
/* NOTE(review): lastToken may have been advanced by the insertion-point scans above, so this is not necessarily the token parsed immediately before this one - confirm that is intended. */
| 1956 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { |
| 1957 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BE
FORE) - 1; |
| 1958 if(beforeStrength != sourceToken->strength) { |
| 1959 *status = U_INVALID_FORMAT_ERROR; |
| 1960 syntaxError(src->source,0,(int32_t)(src->end-src->so
urce),parseError); |
| 1961 DBG_FORMAT_ERROR |
| 1962 return 0; |
| 1963 } |
| 1964 } |
| 1965 } else { |
| 1966 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |
| 1967 /* if the previous token was also a reset, */ |
| 1968 /*this means that we have two consecutive resets */ |
| 1969 /* and we want to remove the previous one if empty*/ |
| 1970 if(src->resultLen > 0 && ListList[src->resultLen-1].first ==
NULL) { |
| 1971 src->resultLen--; |
| 1972 } |
| 1973 } |
| 1974 |
| 1975 if(sourceToken == NULL) { /* this is a reset, but it might still
be somewhere in the tailoring, in shorter form */ |
| 1976 uint32_t searchCharsLen = src->parsedToken.charsLen; |
| 1977 while(searchCharsLen > 1 && sourceToken == NULL) { |
| 1978 searchCharsLen--; |
| 1979 //key = searchCharsLen << 24 | charsOffset; |
| 1980 UColToken key; |
| 1981 key.source = searchCharsLen << 24 | src->parsedToken.cha
rsOffset; |
| 1982 key.rulesToParseHdl = &(src->source); |
| 1983 sourceToken = (UColToken *)uhash_get(src->tailored, &key
); |
| 1984 } |
| 1985 if(sourceToken != NULL) { |
| 1986 expandNext = (src->parsedToken.charsLen - searchCharsLen
) << 24 | (src->parsedToken.charsOffset + searchCharsLen); |
| 1987 } |
| 1988 } |
| 1989 |
| 1990 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |
| 1991 if(top == FALSE) { /* there is no indirection */ |
| 1992 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
| 1993 if(sourceToken != NULL && sourceToken->strength != UCOL_
TOK_RESET) { |
| 1994 /* this is a before that is already ordered in the U
CA - so we need to get the previous with good strength */ |
| 1995 while(sourceToken->strength > strength && sourceToke
n->previous != NULL) { |
| 1996 sourceToken = sourceToken->previous; |
| 1997 } |
| 1998 /* here, either we hit the strength or NULL */ |
| 1999 if(sourceToken->strength == strength) { |
| 2000 if(sourceToken->previous != NULL) { |
| 2001 sourceToken = sourceToken->previous; |
| 2002 } else { /* start of list */ |
| 2003 sourceToken = sourceToken->listHeader->reset
; |
| 2004 } |
| 2005 } else { /* we hit NULL */ |
| 2006 /* we should be doing the else part */ |
| 2007 sourceToken = sourceToken->listHeader->reset; |
| 2008 sourceToken = getVirginBefore(src, sourceToken,
strength, parseError, status); |
| 2009 } |
| 2010 } else { |
| 2011 sourceToken = getVirginBefore(src, sourceToken, stre
ngth, parseError, status); |
| 2012 } |
| 2013 } else { /* this is both before and indirection */ |
| 2014 top = FALSE; |
| 2015 ListList[src->resultLen].previousCE = 0; |
| 2016 ListList[src->resultLen].previousContCE = 0; |
| 2017 ListList[src->resultLen].indirect = TRUE; |
| 2018 /* we need to do slightly more work. we need to get the
baseCE using the */ |
| 2019 /* inverse UCA & getPrevious. The next bound is not set,
and will be decided */ |
| 2020 /* in ucol_bld */ |
| 2021 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
| 2022 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToke
n.indirectIndex].startCE; |
| 2023 uint32_t baseContCE = ucolIndirectBoundaries[src->parsed
Token.indirectIndex].startContCE;//&0xFFFFFF3F; |
| 2024 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
| 2025 |
| 2026 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->
UCA->image + src->UCA->image->UCAConsts); |
| 2027 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICI
T_MIN<<24) && |
| 2028 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICI
T_MAX<<24) ) { /* implicits - */ |
| 2029 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((b
aseContCE & UCOL_PRIMARYMASK) >> 16); |
| 2030 uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
| 2031 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw
-1); |
| 2032 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
| 2033 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) |
UCOL_CONTINUATION_MARKER; |
| 2034 } else { |
| 2035 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseCo
ntCE, &CE, &SecondCE, strength);*/ |
| 2036 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &Se
condCE, strength); |
| 2037 } |
| 2038 |
| 2039 ListList[src->resultLen].baseCE = CE; |
| 2040 ListList[src->resultLen].baseContCE = SecondCE; |
| 2041 ListList[src->resultLen].nextCE = 0; |
| 2042 ListList[src->resultLen].nextContCE = 0; |
| 2043 |
| 2044 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p
arseError, status); |
| 2045 } |
| 2046 } |
| 2047 |
| 2048 |
| 2049 /* 5 If the relation is a reset: |
| 2050 If sourceToken is null |
| 2051 Create new list, create new sourceToken, make the baseCE from so
urce, put |
| 2052 the sourceToken in ListHeader of the new list */ |
| 2053 if(sourceToken == NULL) { |
| 2054 /* |
| 2055 3 Consider each item: relation, source, and expansion: e.g.
...< x / y ... |
| 2056 First convert all expansions into normal form. Examples: |
| 2057 If "xy" doesn't occur earlier in the list or in the UCA, con
vert &xy * c * |
| 2058 d * ... into &x * c/y * d * ... |
| 2059 Note: reset values can never have expansions, although they
can cause the |
| 2060 very next item to have one. They may be contractions, if the
y are found |
| 2061 earlier in the list. |
| 2062 */ |
| 2063 if(top == FALSE) { |
| 2064 collIterate s; |
| 2065 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
| 2066 |
| 2067 uprv_init_collIterate(src->UCA, src->source+src->parsedT
oken.charsOffset, src->parsedToken.charsLen, &s, status); |
| 2068 |
| 2069 CE = ucol_getNextCE(src->UCA, &s, status); |
| 2070 const UChar *expand = s.pos; |
| 2071 SecondCE = ucol_getNextCE(src->UCA, &s, status); |
| 2072 |
| 2073 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 2074 if(isContinuation(SecondCE)) { |
| 2075 ListList[src->resultLen].baseContCE = SecondCE; |
| 2076 } else { |
| 2077 ListList[src->resultLen].baseContCE = 0; |
| 2078 } |
| 2079 ListList[src->resultLen].nextCE = 0; |
| 2080 ListList[src->resultLen].nextContCE = 0; |
| 2081 ListList[src->resultLen].previousCE = 0; |
| 2082 ListList[src->resultLen].previousContCE = 0; |
| 2083 ListList[src->resultLen].indirect = FALSE; |
| 2084 sourceToken = ucol_tok_initAReset(src, expand, &expandNe
xt, parseError, status); |
| 2085 } else { /* top == TRUE */ |
| 2086 /* just use the supplied values */ |
| 2087 top = FALSE; |
| 2088 ListList[src->resultLen].previousCE = 0; |
| 2089 ListList[src->resultLen].previousContCE = 0; |
| 2090 ListList[src->resultLen].indirect = TRUE; |
| 2091 ListList[src->resultLen].baseCE = ucolIndirectBoundaries
[src->parsedToken.indirectIndex].startCE; |
| 2092 ListList[src->resultLen].baseContCE = ucolIndirectBounda
ries[src->parsedToken.indirectIndex].startContCE; |
| 2093 ListList[src->resultLen].nextCE = ucolIndirectBoundaries
[src->parsedToken.indirectIndex].limitCE; |
| 2094 ListList[src->resultLen].nextContCE = ucolIndirectBounda
ries[src->parsedToken.indirectIndex].limitContCE; |
| 2095 |
| 2096 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p
arseError, status); |
| 2097 |
| 2098 } |
| 2099 } else { /* reset to something already in rules */ |
| 2100 top = FALSE; |
| 2101 } |
| 2102 } |
| 2103 /* 7 After all this, set LAST to point to sourceToken, and goto ste
p 3. */ |
| 2104 lastToken = sourceToken; |
| 2105 } else { |
| 2106 if(U_FAILURE(*status)) { |
| 2107 return 0; |
| 2108 } |
| 2109 } |
| 2110 } |
| 2111 #ifdef DEBUG_FOR_CODE_POINTS |
| 2112 fclose(dfcp_fp); |
| 2113 #endif |
| 2114 |
| 2115 |
| 2116 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
| 2117 src->resultLen--; |
| 2118 } |
| 2119 return src->resultLen; |
| 2120 } |
| 2121 |
| 2122 const UChar* ucol_tok_getRulesFromBundle( |
| 2123 void* /*context*/, |
| 2124 const char* locale, |
| 2125 const char* type, |
| 2126 int32_t* pLength, |
| 2127 UErrorCode* status) |
| 2128 { |
| 2129 const UChar* rules = NULL; |
| 2130 UResourceBundle* bundle; |
| 2131 UResourceBundle* collations; |
| 2132 UResourceBundle* collation; |
| 2133 |
| 2134 *pLength = 0; |
| 2135 |
| 2136 bundle = ures_open(U_ICUDATA_COLL, locale, status); |
| 2137 if(U_SUCCESS(*status)){ |
| 2138 collations = ures_getByKey(bundle, "collations", NULL, status); |
| 2139 if(U_SUCCESS(*status)){ |
| 2140 collation = ures_getByKey(collations, type, NULL, status); |
| 2141 if(U_SUCCESS(*status)){ |
| 2142 rules = ures_getStringByKey(collation, "Sequence", pLength, stat
us); |
| 2143 if(U_FAILURE(*status)){ |
| 2144 *pLength = 0; |
| 2145 rules = NULL; |
| 2146 } |
| 2147 ures_close(collation); |
| 2148 } |
| 2149 ures_close(collations); |
| 2150 } |
| 2151 } |
| 2152 |
| 2153 ures_close(bundle); |
| 2154 |
| 2155 return rules; |
| 2156 } |
| 2157 |
| 2158 void ucol_tok_initTokenList( |
| 2159 UColTokenParser *src, |
| 2160 const UChar *rules, |
| 2161 uint32_t rulesLength, |
| 2162 const UCollator *UCA, |
| 2163 GetCollationRulesFunction importFunc, |
| 2164 void* context, |
| 2165 UErrorCode *status) { |
| 2166 U_NAMESPACE_USE |
| 2167 |
| 2168 uint32_t nSize = 0; |
| 2169 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); |
| 2170 |
| 2171 bool needToDeallocRules = false; |
| 2172 |
| 2173 if(U_FAILURE(*status)) { |
| 2174 return; |
| 2175 } |
| 2176 |
| 2177 // set everything to zero, so that we can clean up gracefully |
| 2178 uprv_memset(src, 0, sizeof(UColTokenParser)); |
| 2179 |
| 2180 // first we need to find options that don't like to be normalized, |
| 2181 // like copy and remove... |
| 2182 //const UChar *openBrace = rules; |
| 2183 int32_t optionNumber = -1; |
| 2184 const UChar *setStart = NULL; |
| 2185 uint32_t i = 0; |
| 2186 while(i < rulesLength) { |
| 2187 if(rules[i] == 0x005B) { // '[': start of an option |
| 2188 /* Gets the following: |
| 2189 optionNumber: The index of the option. |
| 2190 setStart: The pointer at which the option arguments start. |
| 2191 */ |
| 2192 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength
, &setStart); |
| 2193 |
| 2194 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tail
oring */ |
| 2195 // [optimize] |
| 2196 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule
s+rulesLength, status); |
| 2197 if(U_SUCCESS(*status)) { |
| 2198 if(src->copySet == NULL) { |
| 2199 src->copySet = newSet; |
| 2200 } else { |
| 2201 uset_addAll(src->copySet, newSet); |
| 2202 uset_close(newSet); |
| 2203 } |
| 2204 } else { |
| 2205 return; |
| 2206 } |
| 2207 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { |
| 2208 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule
s+rulesLength, status); |
| 2209 if(U_SUCCESS(*status)) { |
| 2210 if(src->removeSet == NULL) { |
| 2211 src->removeSet = newSet; |
| 2212 } else { |
| 2213 uset_addAll(src->removeSet, newSet); |
| 2214 uset_close(newSet); |
| 2215 } |
| 2216 } else { |
| 2217 return; |
| 2218 } |
| 2219 } else if(optionNumber == OPTION_IMPORT){ |
| 2220 // [import <collation-name>] |
| 2221 |
| 2222 // Find the address of the closing ]. |
| 2223 UChar* import_end = u_strchr(setStart, 0x005D); |
| 2224 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); |
| 2225 // Ignore trailing whitespace. |
| 2226 while(uprv_isRuleWhiteSpace(*(import_end-1))) { |
| 2227 --import_end; |
| 2228 } |
| 2229 |
| 2230 int32_t optionLength = (int32_t)(import_end - setStart); |
| 2231 char option[50]; |
| 2232 if(optionLength >= (int32_t)sizeof(option)) { |
| 2233 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2234 return; |
| 2235 } |
| 2236 u_UCharsToChars(setStart, option, optionLength); |
| 2237 option[optionLength] = 0; |
| 2238 |
| 2239 *status = U_ZERO_ERROR; |
| 2240 char locale[50]; |
| 2241 int32_t templ; |
| 2242 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &te
mpl, status); |
| 2243 if(U_FAILURE(*status)) { |
| 2244 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2245 return; |
| 2246 } |
| 2247 |
| 2248 char type[50]; |
| 2249 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)siz
eof(type), status) <= 0 || |
| 2250 U_FAILURE(*status) |
| 2251 ) { |
| 2252 *status = U_ZERO_ERROR; |
| 2253 uprv_strcpy(type, "standard"); |
| 2254 } |
| 2255 |
| 2256 // TODO: Use public functions when available, see ticket #8134. |
| 2257 char *keywords = (char *)locale_getKeywordsStart(locale); |
| 2258 if(keywords != NULL) { |
| 2259 *keywords = 0; |
| 2260 } |
| 2261 |
| 2262 int32_t importRulesLength = 0; |
| 2263 const UChar* importRules = importFunc(context, locale, type, &im
portRulesLength, status); |
| 2264 |
| 2265 #ifdef DEBUG_FOR_COLL_RULES |
| 2266 std::string s; |
| 2267 UnicodeString(importRules).toUTF8String(s); |
| 2268 std::cout << "Import rules = " << s << std::endl; |
| 2269 #endif |
| 2270 |
| 2271 // Add the length of the imported rules to length of the origina
l rules, |
| 2272 // and subtract the length of the import option. |
| 2273 uint32_t newRulesLength = rulesLength + importRulesLength - (opt
ionEndOffset - i); |
| 2274 |
| 2275 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UCha
r)); |
| 2276 |
| 2277 #ifdef DEBUG_FOR_COLL_RULES |
| 2278 std::string s1; |
| 2279 UnicodeString(rules).toUTF8String(s1); |
| 2280 std::cout << "Original rules = " << s1 << std::endl; |
| 2281 #endif |
| 2282 |
| 2283 |
| 2284 // Copy the section of the original rules leading up to the impo
rt |
| 2285 uprv_memcpy(newRules, rules, i*sizeof(UChar)); |
| 2286 // Copy the imported rules |
| 2287 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UC
har)); |
| 2288 // Copy the rest of the original rules (minus the import option
itself) |
| 2289 uprv_memcpy(newRules+i+importRulesLength, |
| 2290 rules+optionEndOffset, |
| 2291 (rulesLength-optionEndOffset)*sizeof(UChar)); |
| 2292 |
| 2293 #ifdef DEBUG_FOR_COLL_RULES |
| 2294 std::string s2; |
| 2295 UnicodeString(newRules).toUTF8String(s2); |
| 2296 std::cout << "Resulting rules = " << s2 << std::endl; |
| 2297 #endif |
| 2298 |
| 2299 if(needToDeallocRules){ |
| 2300 // if needToDeallocRules is set, then we allocated rules, so
it's safe to cast and free |
| 2301 uprv_free((void*)rules); |
| 2302 } |
| 2303 needToDeallocRules = true; |
| 2304 rules = newRules; |
| 2305 rulesLength = newRulesLength; |
| 2306 |
| 2307 estimatedSize += importRulesLength*2; |
| 2308 |
| 2309 // First character of the new rules needs to be processed |
| 2310 i--; |
| 2311 } |
| 2312 } |
| 2313 //openBrace++; |
| 2314 i++; |
| 2315 } |
| 2316 |
| 2317 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); |
| 2318 /* test for NULL */ |
| 2319 if (src->source == NULL) { |
| 2320 *status = U_MEMORY_ALLOCATION_ERROR; |
| 2321 return; |
| 2322 } |
| 2323 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); |
| 2324 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estim
atedSize, status); |
| 2325 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { |
| 2326 *status = U_ZERO_ERROR; |
| 2327 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_R
ULE_SPACE_SIZE)*sizeof(UChar)); |
| 2328 /* test for NULL */ |
| 2329 if (src->source == NULL) { |
| 2330 *status = U_MEMORY_ALLOCATION_ERROR; |
| 2331 return; |
| 2332 } |
| 2333 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, n
Size+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
| 2334 } |
| 2335 if(needToDeallocRules){ |
| 2336 // if needToDeallocRules is set, then we allocated rules, so it's safe t
o cast and free |
| 2337 uprv_free((void*)rules); |
| 2338 } |
| 2339 |
| 2340 |
| 2341 src->current = src->source; |
| 2342 src->end = src->source+nSize; |
| 2343 src->sourceCurrent = src->source; |
| 2344 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule str
ing so that option scanning works correctly |
| 2345 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SP
ACE_SIZE; |
| 2346 src->varTop = NULL; |
| 2347 src->UCA = UCA; |
| 2348 src->invUCA = ucol_initInverseUCA(status); |
| 2349 src->parsedToken.charsLen = 0; |
| 2350 src->parsedToken.charsOffset = 0; |
| 2351 src->parsedToken.extensionLen = 0; |
| 2352 src->parsedToken.extensionOffset = 0; |
| 2353 src->parsedToken.prefixLen = 0; |
| 2354 src->parsedToken.prefixOffset = 0; |
| 2355 src->parsedToken.flags = 0; |
| 2356 src->parsedToken.strength = UCOL_TOK_UNSET; |
| 2357 src->buildCCTabFlag = FALSE; |
| 2358 src->isStarred = FALSE; |
| 2359 src->inRange = FALSE; |
| 2360 src->lastRangeCp = 0; |
| 2361 src->previousCp = 0; |
| 2362 |
| 2363 if(U_FAILURE(*status)) { |
| 2364 return; |
| 2365 } |
| 2366 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, stat
us); |
| 2367 if(U_FAILURE(*status)) { |
| 2368 return; |
| 2369 } |
| 2370 uhash_setValueDeleter(src->tailored, uhash_freeBlock); |
| 2371 |
| 2372 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
| 2373 /* test for NULL */ |
| 2374 if (src->opts == NULL) { |
| 2375 *status = U_MEMORY_ALLOCATION_ERROR; |
| 2376 return; |
| 2377 } |
| 2378 |
| 2379 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); |
| 2380 |
| 2381 src->lh = 0; |
| 2382 src->listCapacity = 1024; |
| 2383 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokL
istHeader)); |
| 2384 //Test for NULL |
| 2385 if (src->lh == NULL) { |
| 2386 *status = U_MEMORY_ALLOCATION_ERROR; |
| 2387 return; |
| 2388 } |
| 2389 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); |
| 2390 src->resultLen = 0; |
| 2391 |
| 2392 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); |
| 2393 |
| 2394 // UCOL_RESET_TOP_VALUE |
| 2395 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IM
PLICIT); |
| 2396 // UCOL_FIRST_PRIMARY_IGNORABLE |
| 2397 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); |
| 2398 // UCOL_LAST_PRIMARY_IGNORABLE |
| 2399 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); |
| 2400 // UCOL_FIRST_SECONDARY_IGNORABLE |
| 2401 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); |
| 2402 // UCOL_LAST_SECONDARY_IGNORABLE |
| 2403 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); |
| 2404 // UCOL_FIRST_TERTIARY_IGNORABLE |
| 2405 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); |
| 2406 // UCOL_LAST_TERTIARY_IGNORABLE |
| 2407 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); |
| 2408 // UCOL_FIRST_VARIABLE |
| 2409 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); |
| 2410 // UCOL_LAST_VARIABLE |
| 2411 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); |
| 2412 // UCOL_FIRST_NON_VARIABLE |
| 2413 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); |
| 2414 // UCOL_LAST_NON_VARIABLE |
| 2415 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_I
MPLICIT); |
| 2416 // UCOL_FIRST_IMPLICIT |
| 2417 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); |
| 2418 // UCOL_LAST_IMPLICIT |
| 2419 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAIL
ING); |
| 2420 // UCOL_FIRST_TRAILING |
| 2421 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); |
| 2422 // UCOL_LAST_TRAILING |
| 2423 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); |
| 2424 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); |
| 2425 } |
| 2426 |
| 2427 |
| 2428 void ucol_tok_closeTokenList(UColTokenParser *src) { |
| 2429 if(src->copySet != NULL) { |
| 2430 uset_close(src->copySet); |
| 2431 } |
| 2432 if(src->removeSet != NULL) { |
| 2433 uset_close(src->removeSet); |
| 2434 } |
| 2435 if(src->tailored != NULL) { |
| 2436 uhash_close(src->tailored); |
| 2437 } |
| 2438 if(src->lh != NULL) { |
| 2439 uprv_free(src->lh); |
| 2440 } |
| 2441 if(src->source != NULL) { |
| 2442 uprv_free(src->source); |
| 2443 } |
| 2444 if(src->opts != NULL) { |
| 2445 uprv_free(src->opts); |
| 2446 } |
| 2447 if (src->reorderCodes != NULL) { |
| 2448 uprv_free(src->reorderCodes); |
| 2449 } |
| 2450 } |
| 2451 |
| 2452 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |