OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * | |
4 * Copyright (C) 2001-2012, International Business Machines | |
5 * Corporation and others. All Rights Reserved. | |
6 * | |
7 ******************************************************************************* | |
8 * file name: ucol_tok.cpp | |
9 * encoding: US-ASCII | |
10 * tab size: 8 (not used) | |
11 * indentation:4 | |
12 * | |
13 * created 02/22/2001 | |
14 * created by: Vladimir Weinstein | |
15 * | |
16 * This module reads a tailoring rule string and produces a list of | |
17 * tokens that will be turned into collation elements | |
18 * | |
19 */ | |
20 | |
21 #include "unicode/utypes.h" | |
22 | |
23 #if !UCONFIG_NO_COLLATION | |
24 | |
25 #include "unicode/uscript.h" | |
26 #include "unicode/ustring.h" | |
27 #include "unicode/uchar.h" | |
28 #include "unicode/uniset.h" | |
29 | |
30 #include "cmemory.h" | |
31 #include "cstring.h" | |
32 #include "patternprops.h" | |
33 #include "ucol_bld.h" | |
34 #include "ucol_tok.h" | |
35 #include "ulocimp.h" | |
36 #include "uresimp.h" | |
37 | |
38 // Define this only for debugging. | |
39 // #define DEBUG_FOR_COLL_RULES 1 | |
40 | |
41 #ifdef DEBUG_FOR_COLL_RULES | |
42 #include <iostream> | |
43 #endif | |
44 | |
45 U_NAMESPACE_USE | |
46 | |
47 U_CDECL_BEGIN | |
48 static int32_t U_CALLCONV | |
49 uhash_hashTokens(const UHashTok k) | |
50 { | |
51 int32_t hash = 0; | |
52 //uint32_t key = (uint32_t)k.integer; | |
53 UColToken *key = (UColToken *)k.pointer; | |
54 if (key != 0) { | |
55 int32_t len = (key->source & 0xFF000000)>>24; | |
56 int32_t inc = ((len - 32) / 32) + 1; | |
57 | |
58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); | |
59 const UChar *limit = p + len; | |
60 | |
61 while (p<limit) { | |
62 hash = (hash * 37) + *p; | |
63 p += inc; | |
64 } | |
65 } | |
66 return hash; | |
67 } | |
68 | |
69 static UBool U_CALLCONV | |
70 uhash_compareTokens(const UHashTok key1, const UHashTok key2) | |
71 { | |
72 //uint32_t p1 = (uint32_t) key1.integer; | |
73 //uint32_t p2 = (uint32_t) key2.integer; | |
74 UColToken *p1 = (UColToken *)key1.pointer; | |
75 UColToken *p2 = (UColToken *)key2.pointer; | |
76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); | |
77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); | |
78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24); | |
79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24); | |
80 const UChar *end = s1+s1L-1; | |
81 | |
82 if (p1 == p2) { | |
83 return TRUE; | |
84 } | |
85 if (p1->source == 0 || p2->source == 0) { | |
86 return FALSE; | |
87 } | |
88 if(s1L != s2L) { | |
89 return FALSE; | |
90 } | |
91 if(p1->source == p2->source) { | |
92 return TRUE; | |
93 } | |
94 while((s1 < end) && *s1 == *s2) { | |
95 ++s1; | |
96 ++s2; | |
97 } | |
98 if(*s1 == *s2) { | |
99 return TRUE; | |
100 } else { | |
101 return FALSE; | |
102 } | |
103 } | |
104 U_CDECL_END | |
105 | |
/*
 * Debug messages used to pinpoint where a format error occurred.
 * A better way would be to include context-sensitive information in the
 * syntaxError() function.
 *
 * To turn this debugging on, either uncomment the following line or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
 */
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */ | |
114 | |
/* Expands to a brace-enclosed printf of the current source line when
 * DEBUG_FOR_FORMAT_ERROR is defined, and to nothing otherwise. */
#if defined(DEBUG_FOR_FORMAT_ERROR)
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#else
#define DBG_FORMAT_ERROR
#endif
120 | |
121 | |
/*
 * Controls debug messages so that the output can be compared before and after
 * a big change. Prints every code point that comes out of the collation parser,
 * together with its strength, into a file. When a big change in format
 * happens, the files produced before and after the change should be identical.
 *
 * To turn this debugging on, either uncomment the following line or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
 */
131 // #define DEBUG_FOR_CODE_POINTS 1 | |
132 | |
#if defined(DEBUG_FOR_CODE_POINTS)
/* Output stream for the code-point debug dump; starts out unset. */
FILE *dfcp_fp = NULL;
#endif
136 | |
137 | |
/* One entry of the indirect-positioning table: a start CE and a limit CE,
 * each with its continuation CE. */
typedef struct {
    uint32_t startCE;      /* CE at which the indirect position starts */
    uint32_t startContCE;  /* continuation of startCE */
    uint32_t limitCE;      /* limit CE; set to 0 when no limit is supplied */
    uint32_t limitContCE;  /* continuation of limitCE */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic
 * values. It only works for resets, and indirect names cannot be tailored.
 * An indirect name can define either an anchor point or a range. An anchor
 * point behaves exactly as a code point in a reset would, except that it
 * cannot be tailored. A range (currently only the [top] range is known)
 * explicitly sets the upper bound for generated CEs, thus allowing better
 * control over how many CEs can be squeezed into the range without a
 * performance penalty. In that respect, [top] is used for tailoring locales
 * that use CJK characters. Other indirect values are currently a pure
 * convenience: they can be used to ensure that the CEs are always positioned
 * in the same place relative to a point with known properties (e.g. first
 * primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
160 /* | |
161 static indirectBoundaries ucolIndirectBoundaries[11] = { | |
162 { UCOL_RESET_TOP_VALUE, 0, | |
163 UCOL_NEXT_TOP_VALUE, 0 }, | |
164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0, | |
165 0, 0 }, | |
166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, | |
167 0, 0 }, | |
168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0, | |
169 0, 0 }, | |
170 { UCOL_LAST_SECONDARY_IGNORABLE, 0, | |
171 0, 0 }, | |
172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0, | |
173 0, 0 }, | |
174 { UCOL_LAST_TERTIARY_IGNORABLE, 0, | |
175 0, 0 }, | |
176 { UCOL_FIRST_VARIABLE, 0, | |
177 0, 0 }, | |
178 { UCOL_LAST_VARIABLE, 0, | |
179 0, 0 }, | |
180 { UCOL_FIRST_NON_VARIABLE, 0, | |
181 0, 0 }, | |
182 { UCOL_LAST_NON_VARIABLE, 0, | |
183 0, 0 }, | |
184 }; | |
185 */ | |
186 | |
187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *en
d) { | |
188 | |
189 // Set values for the top - TODO: once we have values for all the indirects,
we are going | |
190 // to initalize here. | |
191 ucolIndirectBoundaries[indexR].startCE = start[0]; | |
192 ucolIndirectBoundaries[indexR].startContCE = start[1]; | |
193 if(end) { | |
194 ucolIndirectBoundaries[indexR].limitCE = end[0]; | |
195 ucolIndirectBoundaries[indexR].limitContCE = end[1]; | |
196 } else { | |
197 ucolIndirectBoundaries[indexR].limitCE = 0; | |
198 ucolIndirectBoundaries[indexR].limitContCE = 0; | |
199 } | |
200 } | |
201 | |
202 | |
203 static inline | |
204 void syntaxError(const UChar* rules, | |
205 int32_t pos, | |
206 int32_t rulesLen, | |
207 UParseError* parseError) | |
208 { | |
209 parseError->offset = pos; | |
210 parseError->line = 0 ; /* we are not using line numbers */ | |
211 | |
212 // for pre-context | |
213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN
-1)); | |
214 int32_t stop = pos; | |
215 | |
216 u_memcpy(parseError->preContext,rules+start,stop-start); | |
217 //null terminate the buffer | |
218 parseError->preContext[stop-start] = 0; | |
219 | |
220 //for post-context | |
221 start = pos+1; | |
222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1
)) : | |
223 rulesLen; | |
224 | |
225 if(start < stop) { | |
226 u_memcpy(parseError->postContext,rules+start,stop-start); | |
227 //null terminate the buffer | |
228 parseError->postContext[stop-start]= 0; | |
229 } else { | |
230 parseError->postContext[0] = 0; | |
231 } | |
232 } | |
233 | |
234 static | |
235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U
ColAttributeValue value) { | |
236 switch(attrib) { | |
237 case UCOL_HIRAGANA_QUATERNARY_MODE: | |
238 opts->hiraganaQ = value; | |
239 break; | |
240 case UCOL_FRENCH_COLLATION: | |
241 opts->frenchCollation = value; | |
242 break; | |
243 case UCOL_ALTERNATE_HANDLING: | |
244 opts->alternateHandling = value; | |
245 break; | |
246 case UCOL_CASE_FIRST: | |
247 opts->caseFirst = value; | |
248 break; | |
249 case UCOL_CASE_LEVEL: | |
250 opts->caseLevel = value; | |
251 break; | |
252 case UCOL_NORMALIZATION_MODE: | |
253 opts->normalizationMode = value; | |
254 break; | |
255 case UCOL_STRENGTH: | |
256 opts->strength = value; | |
257 break; | |
258 case UCOL_NUMERIC_COLLATION: | |
259 opts->numericCollation = value; | |
260 break; | |
261 case UCOL_ATTRIBUTE_COUNT: | |
262 default: | |
263 break; | |
264 } | |
265 } | |
266 | |
267 #define UTOK_OPTION_COUNT 22 | |
268 | |
269 static UBool didInit = FALSE; | |
270 /* we can be strict, or we can be lenient */ | |
271 /* I'd surely be lenient with the option arguments */ | |
272 /* maybe even with options */ | |
273 U_STRING_DECL(suboption_00, "non-ignorable", 13); | |
274 U_STRING_DECL(suboption_01, "shifted", 7); | |
275 | |
276 U_STRING_DECL(suboption_02, "lower", 5); | |
277 U_STRING_DECL(suboption_03, "upper", 5); | |
278 U_STRING_DECL(suboption_04, "off", 3); | |
279 U_STRING_DECL(suboption_05, "on", 2); | |
280 U_STRING_DECL(suboption_06, "1", 1); | |
281 U_STRING_DECL(suboption_07, "2", 1); | |
282 U_STRING_DECL(suboption_08, "3", 1); | |
283 U_STRING_DECL(suboption_09, "4", 1); | |
284 U_STRING_DECL(suboption_10, "I", 1); | |
285 | |
286 U_STRING_DECL(suboption_11, "primary", 7); | |
287 U_STRING_DECL(suboption_12, "secondary", 9); | |
288 U_STRING_DECL(suboption_13, "tertiary", 8); | |
289 U_STRING_DECL(suboption_14, "variable", 8); | |
290 U_STRING_DECL(suboption_15, "regular", 7); | |
291 U_STRING_DECL(suboption_16, "implicit", 8); | |
292 U_STRING_DECL(suboption_17, "trailing", 8); | |
293 | |
294 | |
295 U_STRING_DECL(option_00, "undefined", 9); | |
296 U_STRING_DECL(option_01, "rearrange", 9); | |
297 U_STRING_DECL(option_02, "alternate", 9); | |
298 U_STRING_DECL(option_03, "backwards", 9); | |
299 U_STRING_DECL(option_04, "variable top", 12); | |
300 U_STRING_DECL(option_05, "top", 3); | |
301 U_STRING_DECL(option_06, "normalization", 13); | |
302 U_STRING_DECL(option_07, "caseLevel", 9); | |
303 U_STRING_DECL(option_08, "caseFirst", 9); | |
304 U_STRING_DECL(option_09, "scriptOrder", 11); | |
305 U_STRING_DECL(option_10, "charsetname", 11); | |
306 U_STRING_DECL(option_11, "charset", 7); | |
307 U_STRING_DECL(option_12, "before", 6); | |
308 U_STRING_DECL(option_13, "hiraganaQ", 9); | |
309 U_STRING_DECL(option_14, "strength", 8); | |
310 U_STRING_DECL(option_15, "first", 5); | |
311 U_STRING_DECL(option_16, "last", 4); | |
312 U_STRING_DECL(option_17, "optimize", 8); | |
313 U_STRING_DECL(option_18, "suppressContractions", 20); | |
314 U_STRING_DECL(option_19, "numericOrdering", 15); | |
315 U_STRING_DECL(option_20, "import", 6); | |
316 U_STRING_DECL(option_21, "reorder", 7); | |
317 | |
318 /* | |
319 [last variable] last variable value | |
320 [last primary ignorable] largest CE for primary ignorable | |
321 [last secondary ignorable] largest CE for secondary ignorable | |
322 [last tertiary ignorable] largest CE for tertiary ignorable | |
323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8
) | |
324 */ | |
325 | |
326 | |
327 static const ucolTokSuboption alternateSub[2] = { | |
328 {suboption_00, 13, UCOL_NON_IGNORABLE}, | |
329 {suboption_01, 7, UCOL_SHIFTED} | |
330 }; | |
331 | |
332 static const ucolTokSuboption caseFirstSub[3] = { | |
333 {suboption_02, 5, UCOL_LOWER_FIRST}, | |
334 {suboption_03, 5, UCOL_UPPER_FIRST}, | |
335 {suboption_04, 3, UCOL_OFF}, | |
336 }; | |
337 | |
338 static const ucolTokSuboption onOffSub[2] = { | |
339 {suboption_04, 3, UCOL_OFF}, | |
340 {suboption_05, 2, UCOL_ON} | |
341 }; | |
342 | |
343 static const ucolTokSuboption frenchSub[1] = { | |
344 {suboption_07, 1, UCOL_ON} | |
345 }; | |
346 | |
347 static const ucolTokSuboption beforeSub[3] = { | |
348 {suboption_06, 1, UCOL_PRIMARY}, | |
349 {suboption_07, 1, UCOL_SECONDARY}, | |
350 {suboption_08, 1, UCOL_TERTIARY} | |
351 }; | |
352 | |
353 static const ucolTokSuboption strengthSub[5] = { | |
354 {suboption_06, 1, UCOL_PRIMARY}, | |
355 {suboption_07, 1, UCOL_SECONDARY}, | |
356 {suboption_08, 1, UCOL_TERTIARY}, | |
357 {suboption_09, 1, UCOL_QUATERNARY}, | |
358 {suboption_10, 1, UCOL_IDENTICAL}, | |
359 }; | |
360 | |
361 static const ucolTokSuboption firstLastSub[7] = { | |
362 {suboption_11, 7, UCOL_PRIMARY}, | |
363 {suboption_12, 9, UCOL_PRIMARY}, | |
364 {suboption_13, 8, UCOL_PRIMARY}, | |
365 {suboption_14, 8, UCOL_PRIMARY}, | |
366 {suboption_15, 7, UCOL_PRIMARY}, | |
367 {suboption_16, 8, UCOL_PRIMARY}, | |
368 {suboption_17, 8, UCOL_PRIMARY}, | |
369 }; | |
370 | |
/* Symbolic indices into rulesOptions; the enumerator order follows the order
 * of the table entries. */
enum OptionNumber {
    /* options that map directly onto a collation attribute */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

    /* options that need dedicated handling */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
};
396 | |
397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { | |
398 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alterna
te" */ | |
399 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"
*/ | |
400 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ | |
401 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ | |
402 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalizati
on" */ | |
403 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraga
naQ" */ | |
404 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ | |
405 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrde
ring"*/ | |
406 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ | |
407 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ | |
408 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ | |
409 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ | |
410 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ | |
411 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ | |
412 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ | |
413 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractio
ns" */ | |
414 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ | |
415 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ | |
416 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ | |
417 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" *
/ | |
418 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ | |
419 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ | |
420 }; | |
421 | |
422 static | |
423 int32_t u_strncmpNoCase(const UChar *s1, | |
424 const UChar *s2, | |
425 int32_t n) | |
426 { | |
427 if(n > 0) { | |
428 int32_t rc; | |
429 for(;;) { | |
430 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); | |
431 if(rc != 0 || *s1 == 0 || --n == 0) { | |
432 return rc; | |
433 } | |
434 ++s1; | |
435 ++s2; | |
436 } | |
437 } | |
438 return 0; | |
439 } | |
440 | |
441 static | |
442 void ucol_uprv_tok_initData() { | |
443 if(!didInit) { | |
444 U_STRING_INIT(suboption_00, "non-ignorable", 13); | |
445 U_STRING_INIT(suboption_01, "shifted", 7); | |
446 | |
447 U_STRING_INIT(suboption_02, "lower", 5); | |
448 U_STRING_INIT(suboption_03, "upper", 5); | |
449 U_STRING_INIT(suboption_04, "off", 3); | |
450 U_STRING_INIT(suboption_05, "on", 2); | |
451 | |
452 U_STRING_INIT(suboption_06, "1", 1); | |
453 U_STRING_INIT(suboption_07, "2", 1); | |
454 U_STRING_INIT(suboption_08, "3", 1); | |
455 U_STRING_INIT(suboption_09, "4", 1); | |
456 U_STRING_INIT(suboption_10, "I", 1); | |
457 | |
458 U_STRING_INIT(suboption_11, "primary", 7); | |
459 U_STRING_INIT(suboption_12, "secondary", 9); | |
460 U_STRING_INIT(suboption_13, "tertiary", 8); | |
461 U_STRING_INIT(suboption_14, "variable", 8); | |
462 U_STRING_INIT(suboption_15, "regular", 7); | |
463 U_STRING_INIT(suboption_16, "implicit", 8); | |
464 U_STRING_INIT(suboption_17, "trailing", 8); | |
465 | |
466 | |
467 U_STRING_INIT(option_00, "undefined", 9); | |
468 U_STRING_INIT(option_01, "rearrange", 9); | |
469 U_STRING_INIT(option_02, "alternate", 9); | |
470 U_STRING_INIT(option_03, "backwards", 9); | |
471 U_STRING_INIT(option_04, "variable top", 12); | |
472 U_STRING_INIT(option_05, "top", 3); | |
473 U_STRING_INIT(option_06, "normalization", 13); | |
474 U_STRING_INIT(option_07, "caseLevel", 9); | |
475 U_STRING_INIT(option_08, "caseFirst", 9); | |
476 U_STRING_INIT(option_09, "scriptOrder", 11); | |
477 U_STRING_INIT(option_10, "charsetname", 11); | |
478 U_STRING_INIT(option_11, "charset", 7); | |
479 U_STRING_INIT(option_12, "before", 6); | |
480 U_STRING_INIT(option_13, "hiraganaQ", 9); | |
481 U_STRING_INIT(option_14, "strength", 8); | |
482 U_STRING_INIT(option_15, "first", 5); | |
483 U_STRING_INIT(option_16, "last", 4); | |
484 U_STRING_INIT(option_17, "optimize", 8); | |
485 U_STRING_INIT(option_18, "suppressContractions", 20); | |
486 U_STRING_INIT(option_19, "numericOrdering", 15); | |
487 U_STRING_INIT(option_20, "import ", 6); | |
488 U_STRING_INIT(option_21, "reorder", 7); | |
489 didInit = TRUE; | |
490 } | |
491 } | |
492 | |
493 | |
494 // This function reads basic options to set in the runtime collator | |
495 // used by data driven tests. Should not support build time options | |
496 U_CAPI const UChar * U_EXPORT2 | |
497 ucol_tok_getNextArgument(const UChar *start, const UChar *end, | |
498 UColAttribute *attrib, UColAttributeValue *value, | |
499 UErrorCode *status) | |
500 { | |
501 uint32_t i = 0; | |
502 int32_t j=0; | |
503 UBool foundOption = FALSE; | |
504 const UChar *optionArg = NULL; | |
505 | |
506 ucol_uprv_tok_initData(); | |
507 | |
508 while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace
*/ | |
509 start++; | |
510 } | |
511 if(start >= end) { | |
512 return NULL; | |
513 } | |
514 /* skip opening '[' */ | |
515 if(*start == 0x005b) { | |
516 start++; | |
517 } else { | |
518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' | |
519 return NULL; | |
520 } | |
521 | |
522 while(i < UTOK_OPTION_COUNT) { | |
523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op
tionLen) == 0) { | |
524 foundOption = TRUE; | |
525 if(end - start > rulesOptions[i].optionLen) { | |
526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the o
ptions, skip space */ | |
527 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespac
e */ | |
528 optionArg++; | |
529 } | |
530 } | |
531 break; | |
532 } | |
533 i++; | |
534 } | |
535 | |
536 if(!foundOption) { | |
537 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
538 return NULL; | |
539 } | |
540 | |
541 if(optionArg) { | |
542 for(j = 0; j<rulesOptions[i].subSize; j++) { | |
543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru
lesOptions[i].subopts[j].subLen) == 0) { | |
544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr
, rulesOptions[i].subopts[j].attrVal); | |
545 *attrib = rulesOptions[i].attr; | |
546 *value = rulesOptions[i].subopts[j].attrVal; | |
547 optionArg += rulesOptions[i].subopts[j].subLen; | |
548 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespac
e */ | |
549 optionArg++; | |
550 } | |
551 if(*optionArg == 0x005d) { | |
552 optionArg++; | |
553 return optionArg; | |
554 } else { | |
555 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
556 return NULL; | |
557 } | |
558 } | |
559 } | |
560 } | |
561 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
562 return NULL; | |
563 } | |
564 | |
565 static | |
566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, U
ErrorCode *status) { | |
567 while(*start != 0x005b) { /* advance while we find the first '[' */ | |
568 start++; | |
569 } | |
570 // now we need to get a balanced set of '[]'. The problem is that a set can
have | |
571 // many, and *end point to the first closing '[' | |
572 int32_t noOpenBraces = 1; | |
573 int32_t current = 1; // skip the opening brace | |
574 while(start+current < end && noOpenBraces != 0) { | |
575 if(start[current] == 0x005b) { | |
576 noOpenBraces++; | |
577 } else if(start[current] == 0x005D) { // closing brace | |
578 noOpenBraces--; | |
579 } | |
580 current++; | |
581 } | |
582 | |
583 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { | |
584 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
585 return NULL; | |
586 } | |
587 return uset_openPattern(start, current, status); | |
588 } | |
589 | |
590 /** | |
591 * Reads an option and matches the option name with the predefined options. (Cas
e-insensitive.) | |
592 * @param start Pointer to the start UChar. | |
593 * @param end Pointer to the last valid pointer beyond which the option will not
extend. | |
594 * @param optionArg Address of the pointer at which the options start (after the
option name) | |
595 * @return The index of the option, or -1 if the option is not valid. | |
596 */ | |
597 static | |
598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UCh
ar **optionArg) { | |
599 int32_t i = 0; | |
600 ucol_uprv_tok_initData(); | |
601 | |
602 while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ | |
603 start++; | |
604 } | |
605 while(i < UTOK_OPTION_COUNT) { | |
606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op
tionLen) == 0) { | |
607 if(end - start > rulesOptions[i].optionLen) { | |
608 *optionArg = start+rulesOptions[i].optionLen; /* End of option n
ame; start of the options */ | |
609 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespa
ce */ | |
610 (*optionArg)++; | |
611 } | |
612 } | |
613 break; | |
614 } | |
615 i++; | |
616 } | |
617 if(i == UTOK_OPTION_COUNT) { | |
618 i = -1; // didn't find an option | |
619 } | |
620 return i; | |
621 } | |
622 | |
623 | |
624 static | |
625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { | |
626 int32_t codeCount = 0; | |
627 int32_t codeIndex = 0; | |
628 char conversion[64]; | |
629 int32_t tokenLength = 0; | |
630 const UChar* space; | |
631 | |
632 const UChar* current = src->current; | |
633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); | |
634 | |
635 // eat leading whitespace | |
636 while(current < end && u_isWhitespace(*current)) { | |
637 current++; | |
638 } | |
639 | |
640 while(current < end) { | |
641 space = u_memchr(current, 0x0020, end - current); | |
642 space = space == 0 ? end : space; | |
643 tokenLength = space - current; | |
644 if (tokenLength < 4) { | |
645 *status = U_INVALID_FORMAT_ERROR; | |
646 return; | |
647 } | |
648 codeCount++; | |
649 current += tokenLength; | |
650 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ | |
651 ++current; | |
652 } | |
653 } | |
654 | |
655 if (codeCount == 0) { | |
656 *status = U_INVALID_FORMAT_ERROR; | |
657 } | |
658 | |
659 src->reorderCodesLength = codeCount; | |
660 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); | |
661 current = src->current; | |
662 | |
663 // eat leading whitespace | |
664 while(current < end && u_isWhitespace(*current)) { | |
665 current++; | |
666 } | |
667 | |
668 while(current < end) { | |
669 space = u_memchr(current, 0x0020, end - current); | |
670 space = space == 0 ? end : space; | |
671 tokenLength = space - current; | |
672 if (tokenLength < 4) { | |
673 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
674 return; | |
675 } else { | |
676 u_UCharsToChars(current, conversion, tokenLength); | |
677 conversion[tokenLength] = '\0'; | |
678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); | |
679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { | |
680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRI
PT, conversion); | |
681 } | |
682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { | |
683 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
684 } | |
685 } | |
686 codeIndex++; | |
687 current += tokenLength; | |
688 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ | |
689 ++current; | |
690 } | |
691 } | |
692 } | |
693 | |
694 // reads and conforms to various options in rules | |
695 // end is the position of the first closing ']' | |
696 // However, some of the options take an UnicodeSet definition | |
697 // which needs to duplicate the closing ']' | |
698 // for example: '[copy [\uAC00-\uD7FF]]' | |
699 // These options will move end to the second ']' and the | |
700 // caller will set the current to it. | |
701 static | |
702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status)
{ | |
703 const UChar* start = src->current; | |
704 int32_t i = 0; | |
705 int32_t j=0; | |
706 const UChar *optionArg = NULL; | |
707 | |
708 uint8_t result = 0; | |
709 | |
710 start++; /*skip opening '['*/ | |
711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg); | |
712 if(optionArg) { | |
713 src->current = optionArg; | |
714 } | |
715 | |
716 if(i < 0) { | |
717 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
718 } else { | |
719 int32_t noOpenBraces = 1; | |
720 switch(i) { | |
721 case OPTION_ALTERNATE_HANDLING: | |
722 case OPTION_FRENCH_COLLATION: | |
723 case OPTION_CASE_LEVEL: | |
724 case OPTION_CASE_FIRST: | |
725 case OPTION_NORMALIZATION_MODE: | |
726 case OPTION_HIRAGANA_QUATERNARY: | |
727 case OPTION_STRENGTH: | |
728 case OPTION_NUMERIC_COLLATION: | |
729 if(optionArg) { | |
730 for(j = 0; j<rulesOptions[i].subSize; j++) { | |
731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName
, rulesOptions[i].subopts[j].subLen) == 0) { | |
732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].at
tr, rulesOptions[i].subopts[j].attrVal); | |
733 result = UCOL_TOK_SUCCESS; | |
734 } | |
735 } | |
736 } | |
737 if(result == 0) { | |
738 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
739 } | |
740 break; | |
741 case OPTION_VARIABLE_TOP: | |
742 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; | |
743 break; | |
744 case OPTION_REARRANGE: | |
745 result = UCOL_TOK_SUCCESS; | |
746 break; | |
747 case OPTION_BEFORE: | |
748 if(optionArg) { | |
749 for(j = 0; j<rulesOptions[i].subSize; j++) { | |
750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName
, rulesOptions[i].subopts[j].subLen) == 0) { | |
751 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attr
Val + 1); | |
752 } | |
753 } | |
754 } | |
755 if(result == 0) { | |
756 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
757 } | |
758 break; | |
759 case OPTION_TOP: /* we are going to have an array with structures of limit C
Es */ | |
760 /* index to this array will be src->parsedToken.indirectIndex*/ | |
761 src->parsedToken.indirectIndex = 0; | |
762 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; | |
763 break; | |
764 case OPTION_FIRST: | |
765 case OPTION_LAST: /* first, last */ | |
766 for(j = 0; j<rulesOptions[i].subSize; j++) { | |
767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru
lesOptions[i].subopts[j].subLen) == 0) { | |
768 // the calculation below assumes that OPTION_FIRST and OPTION_LA
ST are at i and i+1 and that the first | |
769 // element of indirect boundaries is reserved for top. | |
770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2
); | |
771 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; | |
772 } | |
773 } | |
774 if(result == 0) { | |
775 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
776 } | |
777 break; | |
778 case OPTION_OPTIMIZE: | |
779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before no
rmalization | |
780 // we need to move end here | |
781 src->current++; // skip opening brace | |
782 while(src->current < src->end && noOpenBraces != 0) { | |
783 if(*src->current == 0x005b) { | |
784 noOpenBraces++; | |
785 } else if(*src->current == 0x005D) { // closing brace | |
786 noOpenBraces--; | |
787 } | |
788 src->current++; | |
789 } | |
790 result = UCOL_TOK_SUCCESS; | |
791 break; | |
792 case OPTION_SCRIPTREORDER: | |
793 ucol_tok_parseScriptReorder(src, status); | |
794 break; | |
795 default: | |
796 *status = U_UNSUPPORTED_ERROR; | |
797 break; | |
798 } | |
799 } | |
800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->curren
t)); | |
801 return result; | |
802 } | |
803 | |
804 | |
805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff,
int32_t len, UErrorCode *status) { | |
806 if (stuff == NULL || len <= 0) { | |
807 return; | |
808 } | |
809 UnicodeString tempStuff(FALSE, stuff, len); | |
810 if(src->extraCurrent+len >= src->extraEnd) { | |
811 /* reallocate */ | |
812 if (stuff >= src->source && stuff <= src->end) { | |
813 // Copy the "stuff" contents into tempStuff's own buffer. | |
814 // UnicodeString is copy-on-write. | |
815 if (len > 0) { | |
816 tempStuff.setCharAt(0, tempStuff[0]); | |
817 } else { | |
818 tempStuff.remove(); | |
819 } | |
820 } | |
821 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->s
ource)*2*sizeof(UChar)); | |
822 if(newSrc != NULL) { | |
823 src->current = newSrc + (src->current - src->source); | |
824 src->extraCurrent = newSrc + (src->extraCurrent - src->source); | |
825 src->end = newSrc + (src->end - src->source); | |
826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2; | |
827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); | |
828 src->source = newSrc; | |
829 } else { | |
830 *status = U_MEMORY_ALLOCATION_ERROR; | |
831 return; | |
832 } | |
833 } | |
834 if(len == 1) { | |
835 *src->extraCurrent++ = tempStuff[0]; | |
836 } else { | |
837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); | |
838 src->extraCurrent += len; | |
839 } | |
840 } | |
841 | |
842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { | |
843 /* | |
844 top = TRUE; | |
845 */ | |
846 UChar buff[5]; | |
847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
848 buff[0] = 0xFFFE; | |
849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta
rtCE >> 16); | |
850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta
rtCE & 0xFFFF); | |
851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0)
{ | |
852 src->parsedToken.charsLen = 3; | |
853 ucol_tok_addToExtraCurrent(src, buff, 3, status); | |
854 } else { | |
855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex]
.startContCE >> 16); | |
856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex]
.startContCE & 0xFFFF); | |
857 src->parsedToken.charsLen = 5; | |
858 ucol_tok_addToExtraCurrent(src, buff, 5, status); | |
859 } | |
860 return TRUE; | |
861 } | |
862 | |
863 static UBool isCharNewLine(UChar c){ | |
864 switch(c){ | |
865 case 0x000A: /* LF */ | |
866 case 0x000D: /* CR */ | |
867 case 0x000C: /* FF */ | |
868 case 0x0085: /* NEL */ | |
869 case 0x2028: /* LS */ | |
870 case 0x2029: /* PS */ | |
871 return TRUE; | |
872 default: | |
873 return FALSE; | |
874 } | |
875 } | |
876 | |
877 /* | |
878 * This function is called several times when a range is processed. Each time,
the next code point | |
879 * is processed. | |
880 * The following variables must be set before calling this function: | |
881 * src->currentRangeCp: The current code point to process. | |
882 * src->lastRangeCp: The last code point in the range. | |
883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. | |
884 */ | |
885 static const UChar* | |
886 ucol_tok_processNextCodePointInRange(UColTokenParser *src, | |
887 UErrorCode *status) | |
888 { | |
889 // Append current code point to source | |
890 UChar buff[U16_MAX_LENGTH]; | |
891 uint32_t i = 0; | |
892 | |
893 uint32_t nChars = U16_LENGTH(src->currentRangeCp); | |
894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); | |
895 src->parsedToken.charsLen = nChars; | |
896 | |
897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); | |
898 ucol_tok_addToExtraCurrent(src, buff, nChars, status); | |
899 | |
900 ++src->currentRangeCp; | |
901 if (src->currentRangeCp > src->lastRangeCp) { | |
902 src->inRange = FALSE; | |
903 | |
904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { | |
905 src->isStarred = FALSE; | |
906 } | |
907 } else { | |
908 src->previousCp = src->currentRangeCp; | |
909 } | |
910 return src->current; | |
911 } | |
912 | |
913 /* | |
914 * This function is called several times when a starred list is processed. Each
time, the next code point | |
915 * in the list is processed. | |
916 * The following variables must be set before calling this function: | |
917 * src->currentStarredCharIndex: Index (in src->source) of the first char of
the current code point. | |
918 * src->lastStarredCharIndex: Index to the last character in the list. | |
919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. | |
920 */ | |
921 static const UChar* | |
922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src) | |
923 { | |
924 // Extract the characters corresponding to the next code point. | |
925 UChar32 cp; | |
926 src->parsedToken.charsOffset = src->currentStarredCharIndex; | |
927 int32_t prev = src->currentStarredCharIndex; | |
928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src-
>source), cp); | |
929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev; | |
930 | |
931 // When we are done parsing the starred string, turn the flag off so that | |
932 // the normal processing is restored. | |
933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { | |
934 src->isStarred = FALSE; | |
935 } | |
936 src->previousCp = cp; | |
937 return src->current; | |
938 } | |
939 | |
940 /* | |
941 * Partially parses the next token, keeps the indices in src->parsedToken, and u
pdates the counters. | |
942 * | |
943 * This routine parses and separates almost all tokens. The following are the sy
ntax characters recognized. | |
944 * # : Comment character | |
945 * & : Reset operator | |
946 * = : Equality | |
947 * < : Primary collation | |
948 * << : Secondary collation | |
949 * <<< : Tertiary collation | |
950 * ; : Secondary collation | |
951 * , : Tertiary collation | |
952 * / : Expansions | |
953 * | : Prefix | |
954 * - : Range | |
955 | |
956 * ! : Java Thai modifier, ignored | |
957 * @ : French only | |
958 | |
959 * [] : Options | |
960 * '' : Quotes | |
961 * | |
962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate
a list. For example, &a<*bcdexyz | |
963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &
a*b-ex-z is equivalent to the above. | |
964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is
parsed as three tokens - "&a", | |
965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, wheth
er in a range and the previous | |
966 * character returned as cached so that the calling program can do further spli
tting. | |
967 */ | |
968 static const UChar* | |
969 ucol_tok_parseNextTokenInternal(UColTokenParser *src, | |
970 UBool startOfRules, | |
971 UParseError *parseError, | |
972 UErrorCode *status) | |
973 { | |
974 UBool variableTop = FALSE; | |
975 UBool top = FALSE; | |
976 UBool inChars = TRUE; | |
977 UBool inQuote = FALSE; | |
978 UBool wasInQuote = FALSE; | |
979 uint8_t before = 0; | |
980 UBool isEscaped = FALSE; | |
981 | |
982 // TODO: replace these variables with src->parsedToken counterparts | |
983 // no need to use them anymore since we have src->parsedToken. | |
984 // Ideally, token parser would be a nice class... Once, when I have | |
985 // more time (around 2020 probably). | |
986 uint32_t newExtensionLen = 0; | |
987 uint32_t extensionOffset = 0; | |
988 uint32_t newStrength = UCOL_TOK_UNSET; | |
989 UChar buff[10]; | |
990 | |
991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; | |
992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; | |
993 src->parsedToken.indirectIndex = 0; | |
994 | |
995 while (src->current < src->end) { | |
996 UChar ch = *(src->current); | |
997 | |
998 if (inQuote) { | |
999 if (ch == 0x0027/*'\''*/) { | |
1000 inQuote = FALSE; | |
1001 } else { | |
1002 if ((src->parsedToken.charsLen == 0) || inChars) { | |
1003 if(src->parsedToken.charsLen == 0) { | |
1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurr
ent - src->source); | |
1005 } | |
1006 src->parsedToken.charsLen++; | |
1007 } else { | |
1008 if(newExtensionLen == 0) { | |
1009 extensionOffset = (uint32_t)(src->extraCurrent - src->so
urce); | |
1010 } | |
1011 newExtensionLen++; | |
1012 } | |
1013 } | |
1014 }else if(isEscaped){ | |
1015 isEscaped =FALSE; | |
1016 if (newStrength == UCOL_TOK_UNSET) { | |
1017 *status = U_INVALID_FORMAT_ERROR; | |
1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int
32_t)(src->end-src->source),parseError); | |
1019 DBG_FORMAT_ERROR | |
1020 return NULL; | |
1021 // enabling rules to start with non-tokens a < b | |
1022 // newStrength = UCOL_TOK_RESET; | |
1023 } | |
1024 if(ch != 0x0000 && src->current != src->end) { | |
1025 if (inChars) { | |
1026 if(src->parsedToken.charsLen == 0) { | |
1027 src->parsedToken.charsOffset = (uint32_t)(src->current -
src->source); | |
1028 } | |
1029 src->parsedToken.charsLen++; | |
1030 } else { | |
1031 if(newExtensionLen == 0) { | |
1032 extensionOffset = (uint32_t)(src->current - src->source)
; | |
1033 } | |
1034 newExtensionLen++; | |
1035 } | |
1036 } | |
1037 }else { | |
1038 if(!PatternProps::isWhiteSpace(ch)) { | |
1039 /* Sets the strength for this entry */ | |
1040 switch (ch) { | |
1041 case 0x003D/*'='*/ : | |
1042 if (newStrength != UCOL_TOK_UNSET) { | |
1043 goto EndOfLoop; | |
1044 } | |
1045 | |
1046 /* if we start with strength, we'll reset to top */ | |
1047 if(startOfRules == TRUE) { | |
1048 src->parsedToken.indirectIndex = 5; | |
1049 top = ucol_tok_doSetTop(src, status); | |
1050 newStrength = UCOL_TOK_RESET; | |
1051 goto EndOfLoop; | |
1052 } | |
1053 newStrength = UCOL_IDENTICAL; | |
1054 if(*(src->current+1) == 0x002A) {/*'*'*/ | |
1055 src->current++; | |
1056 src->isStarred = TRUE; | |
1057 } | |
1058 break; | |
1059 | |
1060 case 0x002C/*','*/: | |
1061 if (newStrength != UCOL_TOK_UNSET) { | |
1062 goto EndOfLoop; | |
1063 } | |
1064 | |
1065 /* if we start with strength, we'll reset to top */ | |
1066 if(startOfRules == TRUE) { | |
1067 src->parsedToken.indirectIndex = 5; | |
1068 top = ucol_tok_doSetTop(src, status); | |
1069 newStrength = UCOL_TOK_RESET; | |
1070 goto EndOfLoop; | |
1071 } | |
1072 newStrength = UCOL_TERTIARY; | |
1073 break; | |
1074 | |
1075 case 0x003B/*';'*/: | |
1076 if (newStrength != UCOL_TOK_UNSET) { | |
1077 goto EndOfLoop; | |
1078 } | |
1079 | |
1080 /* if we start with strength, we'll reset to top */ | |
1081 if(startOfRules == TRUE) { | |
1082 src->parsedToken.indirectIndex = 5; | |
1083 top = ucol_tok_doSetTop(src, status); | |
1084 newStrength = UCOL_TOK_RESET; | |
1085 goto EndOfLoop; | |
1086 } | |
1087 newStrength = UCOL_SECONDARY; | |
1088 break; | |
1089 | |
1090 case 0x003C/*'<'*/: | |
1091 if (newStrength != UCOL_TOK_UNSET) { | |
1092 goto EndOfLoop; | |
1093 } | |
1094 | |
1095 /* if we start with strength, we'll reset to top */ | |
1096 if(startOfRules == TRUE) { | |
1097 src->parsedToken.indirectIndex = 5; | |
1098 top = ucol_tok_doSetTop(src, status); | |
1099 newStrength = UCOL_TOK_RESET; | |
1100 goto EndOfLoop; | |
1101 } | |
1102 /* before this, do a scan to verify whether this is */ | |
1103 /* another strength */ | |
1104 if(*(src->current+1) == 0x003C) { | |
1105 src->current++; | |
1106 if(*(src->current+1) == 0x003C) { | |
1107 src->current++; /* three in a row! */ | |
1108 newStrength = UCOL_TERTIARY; | |
1109 } else { /* two in a row */ | |
1110 newStrength = UCOL_SECONDARY; | |
1111 } | |
1112 } else { /* just one */ | |
1113 newStrength = UCOL_PRIMARY; | |
1114 } | |
1115 if(*(src->current+1) == 0x002A) {/*'*'*/ | |
1116 src->current++; | |
1117 src->isStarred = TRUE; | |
1118 } | |
1119 break; | |
1120 | |
1121 case 0x0026/*'&'*/: | |
1122 if (newStrength != UCOL_TOK_UNSET) { | |
1123 /**/ | |
1124 goto EndOfLoop; | |
1125 } | |
1126 | |
1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ | |
1128 break; | |
1129 | |
1130 case 0x005b/*'['*/: | |
1131 /* options - read an option, analyze it */ | |
1132 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { | |
1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, sta
tus); | |
1134 if(U_SUCCESS(*status)) { | |
1135 if(result & UCOL_TOK_TOP) { | |
1136 if(newStrength == UCOL_TOK_RESET) { | |
1137 top = ucol_tok_doSetTop(src, status); | |
1138 if(before) { // This is a combination of bef
ore and indirection like '&[before 2][first regular]<b' | |
1139 src->parsedToken.charsLen+=2; | |
1140 buff[0] = 0x002d; | |
1141 buff[1] = before; | |
1142 ucol_tok_addToExtraCurrent(src, buff, 2,
status); | |
1143 } | |
1144 | |
1145 src->current++; | |
1146 goto EndOfLoop; | |
1147 } else { | |
1148 *status = U_INVALID_FORMAT_ERROR; | |
1149 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); | |
1150 DBG_FORMAT_ERROR | |
1151 } | |
1152 } else if(result & UCOL_TOK_VARIABLE_TOP) { | |
1153 if(newStrength != UCOL_TOK_RESET && newStrength
!= UCOL_TOK_UNSET) { | |
1154 variableTop = TRUE; | |
1155 src->parsedToken.charsOffset = (uint32_t)(sr
c->extraCurrent - src->source); | |
1156 src->parsedToken.charsLen = 1; | |
1157 buff[0] = 0xFFFF; | |
1158 ucol_tok_addToExtraCurrent(src, buff, 1, sta
tus); | |
1159 src->current++; | |
1160 goto EndOfLoop; | |
1161 } else { | |
1162 *status = U_INVALID_FORMAT_ERROR; | |
1163 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); | |
1164 DBG_FORMAT_ERROR | |
1165 } | |
1166 } else if (result & UCOL_TOK_BEFORE){ | |
1167 if(newStrength == UCOL_TOK_RESET) { | |
1168 before = result & UCOL_TOK_BEFORE; | |
1169 } else { | |
1170 *status = U_INVALID_FORMAT_ERROR; | |
1171 syntaxError(src->source,(int32_t)(src->curre
nt-src->source),(int32_t)(src->end-src->source),parseError); | |
1172 DBG_FORMAT_ERROR | |
1173 } | |
1174 } | |
1175 } else { | |
1176 *status = U_INVALID_FORMAT_ERROR; | |
1177 syntaxError(src->source,(int32_t)(src->current-src->
source),(int32_t)(src->end-src->source),parseError); | |
1178 DBG_FORMAT_ERROR | |
1179 return NULL; | |
1180 } | |
1181 } | |
1182 break; | |
1183 case 0x0021/*! skip java thai modifier reordering*/: | |
1184 break; | |
1185 case 0x002F/*'/'*/: | |
1186 wasInQuote = FALSE; /* if we were copying source characters,
we want to stop now */ | |
1187 inChars = FALSE; /* we're now processing expansion */ | |
1188 break; | |
1189 case 0x005C /* back slash for escaped chars */: | |
1190 isEscaped = TRUE; | |
1191 break; | |
1192 /* found a quote, we're gonna start copying */ | |
1193 case 0x0027/*'\''*/: | |
1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal unt
il we have a strength */ | |
1195 *status = U_INVALID_FORMAT_ERROR; | |
1196 syntaxError(src->source,(int32_t)(src->current-src->source
),(int32_t)(src->end-src->source),parseError); | |
1197 DBG_FORMAT_ERROR | |
1198 return NULL; | |
1199 // enabling rules to start with a non-token character a <
b | |
1200 // newStrength = UCOL_TOK_RESET; | |
1201 } | |
1202 | |
1203 inQuote = TRUE; | |
1204 | |
1205 if(inChars) { /* we're doing characters */ | |
1206 if(wasInQuote == FALSE) { | |
1207 src->parsedToken.charsOffset = (uint32_t)(src->extra
Current - src->source); | |
1208 } | |
1209 if (src->parsedToken.charsLen != 0) { | |
1210 ucol_tok_addToExtraCurrent(src, src->current - src->
parsedToken.charsLen, src->parsedToken.charsLen, status); | |
1211 } | |
1212 src->parsedToken.charsLen++; | |
1213 } else { /* we're doing an expansion */ | |
1214 if(wasInQuote == FALSE) { | |
1215 extensionOffset = (uint32_t)(src->extraCurrent - src
->source); | |
1216 } | |
1217 if (newExtensionLen != 0) { | |
1218 ucol_tok_addToExtraCurrent(src, src->current - newEx
tensionLen, newExtensionLen, status); | |
1219 } | |
1220 newExtensionLen++; | |
1221 } | |
1222 | |
1223 wasInQuote = TRUE; | |
1224 | |
1225 ch = *(++(src->current)); | |
1226 if(ch == 0x0027) { /* copy the double quote */ | |
1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status); | |
1228 inQuote = FALSE; | |
1229 } | |
1230 break; | |
1231 | |
1232 /* '@' is french only if the strength is not currently set *
/ | |
1233 /* if it is, it's just a regular character in collation rule
s */ | |
1234 case 0x0040/*'@'*/: | |
1235 if (newStrength == UCOL_TOK_UNSET) { | |
1236 src->opts->frenchCollation = UCOL_ON; | |
1237 break; | |
1238 } | |
1239 | |
1240 case 0x007C /*|*/: /* this means we have actually been reading p
refix part */ | |
1241 // we want to store read characters to the prefix part and c
ontinue reading | |
1242 // the characters (proper way would be to restart reading th
e chars, but in | |
1243 // that case we would have to complicate the token hasher, w
hich I do not | |
1244 // intend to play with. Instead, we will do prefixes when pr
efixes are due | |
1245 // (before adding the elements). | |
1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset
; | |
1247 src->parsedToken.prefixLen = src->parsedToken.charsLen; | |
1248 | |
1249 if(inChars) { /* we're doing characters */ | |
1250 if(wasInQuote == FALSE) { | |
1251 src->parsedToken.charsOffset = (uint32_t)(src->extra
Current - src->source); | |
1252 } | |
1253 if (src->parsedToken.charsLen != 0) { | |
1254 ucol_tok_addToExtraCurrent(src, src->current - src->
parsedToken.charsLen, src->parsedToken.charsLen, status); | |
1255 } | |
1256 src->parsedToken.charsLen++; | |
1257 } | |
1258 | |
1259 wasInQuote = TRUE; | |
1260 | |
1261 do { | |
1262 ch = *(++(src->current)); | |
1263 // skip whitespace between '|' and the character | |
1264 } while (PatternProps::isWhiteSpace(ch)); | |
1265 break; | |
1266 | |
1267 //charsOffset = 0; | |
1268 //newCharsLen = 0; | |
1269 //break; // We want to store the whole prefix/character sequ
ence. If we break | |
1270 // the '|' is going to get lost. | |
1271 | |
1272 case 0x002D /*-*/: /* A range. */ | |
1273 if (newStrength != UCOL_TOK_UNSET) { | |
1274 // While processing the pending token, the isStarred field | |
1275 // is reset, so it needs to be saved for the next | |
1276 // invocation. | |
1277 src->savedIsStarred = src->isStarred; | |
1278 goto EndOfLoop; | |
1279 } | |
1280 src->isStarred = src->savedIsStarred; | |
1281 | |
1282 // Ranges are valid only in starred tokens. | |
1283 if (!src->isStarred) { | |
1284 *status = U_INVALID_FORMAT_ERROR; | |
1285 syntaxError(src->source,(int32_t)(src->current-src->source)
,(int32_t)(src->end-src->source),parseError); | |
1286 DBG_FORMAT_ERROR | |
1287 return NULL; | |
1288 } | |
1289 newStrength = src->parsedToken.strength; | |
1290 src->inRange = TRUE; | |
1291 break; | |
1292 | |
1293 case 0x0023 /*#*/: /* this is a comment, skip everything through
the end of line */ | |
1294 do { | |
1295 ch = *(++(src->current)); | |
1296 } while (!isCharNewLine(ch)); | |
1297 | |
1298 break; | |
1299 default: | |
1300 if (newStrength == UCOL_TOK_UNSET) { | |
1301 *status = U_INVALID_FORMAT_ERROR; | |
1302 syntaxError(src->source,(int32_t)(src->current-src->source
),(int32_t)(src->end-src->source),parseError); | |
1303 DBG_FORMAT_ERROR | |
1304 return NULL; | |
1305 } | |
1306 | |
1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { | |
1308 *status = U_INVALID_FORMAT_ERROR; | |
1309 syntaxError(src->source,(int32_t)(src->current-src->sour
ce),(int32_t)(src->end-src->source),parseError); | |
1310 DBG_FORMAT_ERROR | |
1311 return NULL; | |
1312 } | |
1313 | |
1314 if(ch == 0x0000 && src->current+1 == src->end) { | |
1315 break; | |
1316 } | |
1317 | |
1318 if (inChars) { | |
1319 if(src->parsedToken.charsLen == 0) { | |
1320 src->parsedToken.charsOffset = (uint32_t)(src->curre
nt - src->source); | |
1321 } | |
1322 src->parsedToken.charsLen++; | |
1323 } else { | |
1324 if(newExtensionLen == 0) { | |
1325 extensionOffset = (uint32_t)(src->current - src->sou
rce); | |
1326 } | |
1327 newExtensionLen++; | |
1328 } | |
1329 | |
1330 break; | |
1331 } | |
1332 } | |
1333 } | |
1334 | |
1335 if(wasInQuote) { | |
1336 if(ch != 0x27) { | |
1337 if(inQuote || !PatternProps::isWhiteSpace(ch)) { | |
1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status); | |
1339 } | |
1340 } | |
1341 } | |
1342 | |
1343 src->current++; | |
1344 } | |
1345 | |
1346 EndOfLoop: | |
1347 wasInQuote = FALSE; | |
1348 if (newStrength == UCOL_TOK_UNSET) { | |
1349 return NULL; | |
1350 } | |
1351 | |
1352 if (src->parsedToken.charsLen == 0 && top == FALSE) { | |
1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(sr
c->end-src->source),parseError); | |
1354 *status = U_INVALID_FORMAT_ERROR; | |
1355 DBG_FORMAT_ERROR | |
1356 return NULL; | |
1357 } | |
1358 | |
1359 src->parsedToken.strength = newStrength; | |
1360 src->parsedToken.extensionOffset = extensionOffset; | |
1361 src->parsedToken.extensionLen = newExtensionLen; | |
1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL
_TOK_TOP * (top?1:0)) | before; | |
1363 | |
1364 return src->current; | |
1365 } | |
1366 | |
1367 /* | |
1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the
counters. | |
1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators
are supported. | |
1370 * | |
1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function doe
s the following: | |
1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. Thi
s function separates | |
1373 * it to separate tokens and returns one by one. In order to do that, the n
ecessary states are | |
1374 * cached as member variables of the token parser. | |
1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes ch
aracters up to the | |
1376 * starting character as a single list token (which is separated into indivi
dual characters here) | |
1377 * and as another list token starting with the last character in the range.
Before expanding it | |
1378 * as a list of tokens, this function expands the range by filling the inter
mediate characters and | |
1379 * returns them one by one as separate tokens. | |
1380 * Necessary checks are done for invalid combinations. | |
1381 */ | |
1382 U_CAPI const UChar* U_EXPORT2 | |
1383 ucol_tok_parseNextToken(UColTokenParser *src, | |
1384 UBool startOfRules, | |
1385 UParseError *parseError, | |
1386 UErrorCode *status) | |
1387 { | |
1388 const UChar *nextToken; | |
1389 | |
1390 if (src->inRange) { | |
1391 // We are not done processing a range. Continue it. | |
1392 return ucol_tok_processNextCodePointInRange(src, status); | |
1393 } else if (src->isStarred) { | |
1394 // We are not done processing a starred token. Continue it. | |
1395 return ucol_tok_processNextTokenInStarredList(src); | |
1396 } | |
1397 | |
1398 // Get the next token. | |
1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, sta
tus); | |
1400 | |
1401 if (nextToken == NULL) { | |
1402 return NULL; | |
1403 } | |
1404 | |
1405 if (src->inRange) { | |
1406 // A new range has started. | |
1407 // Check whether it is a chain of ranges with more than one hyphen. | |
1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { | |
1409 *status = U_INVALID_FORMAT_ERROR; | |
1410 syntaxError(src->source,src->parsedToken.charsOffset-1, | |
1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, pars
eError); | |
1412 DBG_FORMAT_ERROR | |
1413 return NULL; | |
1414 } | |
1415 | |
1416 // The current token indicates the second code point of the range. | |
1417 // Process just that, and then proceed with the star. | |
1418 src->currentStarredCharIndex = src->parsedToken.charsOffset; | |
1419 U16_NEXT(src->source, src->currentStarredCharIndex, | |
1420 (uint32_t)(src->end - src->source), src->lastRangeCp); | |
1421 if (src->lastRangeCp <= src->previousCp) { | |
1422 *status = U_INVALID_FORMAT_ERROR; | |
1423 syntaxError(src->source,src->parsedToken.charsOffset-1, | |
1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parse
Error); | |
1425 DBG_FORMAT_ERROR | |
1426 return NULL; | |
1427 } | |
1428 | |
1429 // Set current range code point to process the range loop | |
1430 src->currentRangeCp = src->previousCp + 1; | |
1431 | |
1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.
charsLen - 1; | |
1433 | |
1434 return ucol_tok_processNextCodePointInRange(src, status); | |
1435 } else if (src->isStarred) { | |
1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharInd
ex_ so that | |
1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive,
need to be | |
1438 // separated into several tokens and returned. | |
1439 src->currentStarredCharIndex = src->parsedToken.charsOffset; | |
1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken
.charsLen - 1; | |
1441 | |
1442 return ucol_tok_processNextTokenInStarredList(src); | |
1443 } else { | |
1444 // Set previous codepoint | |
1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end -
src->source), src->previousCp); | |
1446 } | |
1447 return nextToken; | |
1448 } | |
1449 | |
1450 | |
1451 /* | |
1452 Processing Description | |
1453 1 Build a ListList. Each list has a header, which contains two lists (positive | |
1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and | |
1455 reset may be null. | |
1456 2 As you process, you keep a LAST pointer that points to the last token you | |
1457 handled. | |
1458 | |
1459 */ | |
1460 | |
1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand,
uint32_t *expandNext, | |
1462 UParseError *parseError, UErrorCode *statu
s) | |
1463 { | |
1464 if(src->resultLen == src->listCapacity) { | |
1465 // Unfortunately, this won't work, as we store addresses of lhs in token | |
1466 src->listCapacity *= 2; | |
1467 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*s
izeof(UColTokListHeader)); | |
1468 if(src->lh == NULL) { | |
1469 *status = U_MEMORY_ALLOCATION_ERROR; | |
1470 return NULL; | |
1471 } | |
1472 } | |
1473 /* do the reset thing */ | |
1474 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1475 /* test for NULL */ | |
1476 if (sourceToken == NULL) { | |
1477 *status = U_MEMORY_ALLOCATION_ERROR; | |
1478 return NULL; | |
1479 } | |
1480 sourceToken->rulesToParseHdl = &(src->source); | |
1481 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.cha
rsOffset; | |
1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedTo
ken.extensionOffset; | |
1483 | |
1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); | |
1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffs
et); | |
1486 | |
1487 // keep the flags around so that we know about before | |
1488 sourceToken->flags = src->parsedToken.flags; | |
1489 | |
1490 if(src->parsedToken.prefixOffset != 0) { | |
1491 // this is a syntax error | |
1492 *status = U_INVALID_FORMAT_ERROR; | |
1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.
charsOffset+src->parsedToken.charsLen,parseError); | |
1494 DBG_FORMAT_ERROR | |
1495 uprv_free(sourceToken); | |
1496 return 0; | |
1497 } else { | |
1498 sourceToken->prefix = 0; | |
1499 } | |
1500 | |
1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should als
o handle reverse */ | |
1502 sourceToken->strength = UCOL_TOK_RESET; | |
1503 sourceToken->next = NULL; | |
1504 sourceToken->previous = NULL; | |
1505 sourceToken->noOfCEs = 0; | |
1506 sourceToken->noOfExpCEs = 0; | |
1507 sourceToken->listHeader = &src->lh[src->resultLen]; | |
1508 | |
1509 src->lh[src->resultLen].first = NULL; | |
1510 src->lh[src->resultLen].last = NULL; | |
1511 src->lh[src->resultLen].first = NULL; | |
1512 src->lh[src->resultLen].last = NULL; | |
1513 | |
1514 src->lh[src->resultLen].reset = sourceToken; | |
1515 | |
1516 /* | |
1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... | |
1518 First convert all expansions into normal form. Examples: | |
1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * | |
1520 d * ... into &x * c/y * d * ... | |
1521 Note: reset values can never have expansions, although they can cause the | |
1522 very next item to have one. They may be contractions, if they are found | |
1523 earlier in the list. | |
1524 */ | |
1525 *expandNext = 0; | |
1526 if(expand != NULL) { | |
1527 /* check to see if there is an expansion */ | |
1528 if(src->parsedToken.charsLen > 1) { | |
1529 uint32_t resetCharsOffset; | |
1530 resetCharsOffset = (uint32_t)(expand - src->source); | |
1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOff
set ) << 24) | src->parsedToken.charsOffset; | |
1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOf
fset - resetCharsOffset)<<24) | (resetCharsOffset); | |
1533 } | |
1534 } | |
1535 | |
1536 src->resultLen++; | |
1537 | |
1538 uhash_put(src->tailored, sourceToken, sourceToken, status); | |
1539 | |
1540 return sourceToken; | |
1541 } | |
1542 | |
1543 static | |
1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken,
uint8_t strength, UParseError *parseError, UErrorCode *status) { | |
1545 if(U_FAILURE(*status)) { | |
1546 return NULL; | |
1547 } | |
1548 /* this is a virgin before - we need to fish the anchor from the UCA */ | |
1549 collIterate s; | |
1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; | |
1551 uint32_t CE, SecondCE; | |
1552 // uint32_t invPos; | |
1553 if(sourceToken != NULL) { | |
1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFF
FFF), 1, &s, status); | |
1555 } else { | |
1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset
/**charsOffset*/, 1, &s, status); | |
1557 } | |
1558 if(U_FAILURE(*status)) { | |
1559 return NULL; | |
1560 } | |
1561 | |
1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; | |
1563 baseContCE = ucol_getNextCE(src->UCA, &s, status); | |
1564 if(baseContCE == UCOL_NO_MORE_CES) { | |
1565 baseContCE = 0; | |
1566 } | |
1567 | |
1568 | |
1569 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); | |
1570 uint32_t ch = 0; | |
1571 uint32_t expandNext = 0; | |
1572 UColToken key; | |
1573 | |
1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseC
E & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ | |
1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRI
MARYMASK) >> 16); | |
1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary); | |
1577 ch = uprv_uca_getCodePointFromRaw(raw-1); | |
1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); | |
1579 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; | |
1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MA
RKER; | |
1581 | |
1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc
e); | |
1583 *src->extraCurrent++ = 0xFFFE; | |
1584 *src->extraCurrent++ = (UChar)ch; | |
1585 src->parsedToken.charsLen++; | |
1586 | |
1587 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->pa
rsedToken.charsOffset/**charsOffset*/; | |
1588 key.rulesToParseHdl = &(src->source); | |
1589 | |
1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); | |
1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1592 | |
1593 if(sourceToken == NULL) { | |
1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1595 if(isContinuation(SecondCE)) { | |
1596 src->lh[src->resultLen].baseContCE = SecondCE; | |
1597 } else { | |
1598 src->lh[src->resultLen].baseContCE = 0; | |
1599 } | |
1600 src->lh[src->resultLen].nextCE = 0; | |
1601 src->lh[src->resultLen].nextContCE = 0; | |
1602 src->lh[src->resultLen].previousCE = 0; | |
1603 src->lh[src->resultLen].previousContCE = 0; | |
1604 | |
1605 src->lh[src->resultLen].indirect = FALSE; | |
1606 | |
1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, s
tatus); | |
1608 } | |
1609 | |
1610 } else { | |
1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondC
E, strength); | |
1612 | |
1613 // we got the previous CE. Now we need to see if the difference between | |
1614 // the two CEs is really of the requested strength. | |
1615 // if it's a bigger difference (we asked for secondary and got primary),
we | |
1616 // need to modify the CE. | |
1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < stre
ngth) { | |
1618 // adjust the strength | |
1619 // now we are in the situation where our baseCE should actually be m
odified in | |
1620 // order to get the CE in the right position. | |
1621 if(strength == UCOL_SECONDARY) { | |
1622 CE = baseCE - 0x0200; | |
1623 } else { // strength == UCOL_TERTIARY | |
1624 CE = baseCE - 0x02; | |
1625 } | |
1626 if(baseContCE) { | |
1627 if(strength == UCOL_SECONDARY) { | |
1628 SecondCE = baseContCE - 0x0200; | |
1629 } else { // strength == UCOL_TERTIARY | |
1630 SecondCE = baseContCE - 0x02; | |
1631 } | |
1632 } | |
1633 } | |
1634 | |
1635 #if 0 | |
1636 // the code below relies on getting a code point from the inverse table,
in order to be | |
1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't
work: | |
1638 // 1. There are many code points that have the same CE | |
1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2
] are broken. | |
1640 // Also, in case when there is no equivalent strength before an element,
we have to actually | |
1641 // construct one. For example, &[before 2]a << x won't result in x << a,
because the element | |
1642 // before a is a primary difference. | |
1643 | |
1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->t
able); | |
1645 | |
1646 | |
1647 ch = CETable[3*invPos+2]; | |
1648 | |
1649 if((ch & UCOL_INV_SIZEMASK) != 0) { | |
1650 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->c
onts); | |
1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK); | |
1652 ch = conts[offset]; | |
1653 } | |
1654 | |
1655 *src->extraCurrent++ = (UChar)ch; | |
1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc
e - 1); | |
1657 src->parsedToken.charsLen = 1; | |
1658 | |
1659 // We got an UCA before. However, this might have been tailored. | |
1660 // example: | |
1661 // &\u30ca = \u306a | |
1662 // &[before 3]\u306a<<<\u306a|\u309d | |
1663 | |
1664 | |
1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset; | |
1666 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->pa
rsedToken.charsOffset/**charsOffset*/; | |
1667 key.rulesToParseHdl = &(src->source); | |
1668 | |
1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); | |
1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1671 #endif | |
1672 | |
1673 // here is how it should be. The situation such as &[before 1]a < x, sho
uld be | |
1674 // resolved exactly as if we wrote &a > x. | |
1675 // therefore, I don't really care if the UCA value before a has been cha
nged. | |
1676 // However, I do care if the strength between my element and the previou
s element | |
1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2],
then i'll | |
1678 // have to construct the base CE. | |
1679 | |
1680 | |
1681 | |
1682 // if we found a tailored thing, we have to use the UCA value and constr
uct | |
1683 // a new reset token with constructed name | |
1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { | |
1685 // character to which we want to anchor is already tailored. | |
1686 // We need to construct a new token which will be the anchor | |
1687 // point | |
1688 //*(src->extraCurrent-1) = 0xFFFE; | |
1689 //*src->extraCurrent++ = (UChar)ch; | |
1690 // grab before | |
1691 src->parsedToken.charsOffset -= 10; | |
1692 src->parsedToken.charsLen += 10; | |
1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
1694 if(isContinuation(SecondCE)) { | |
1695 src->lh[src->resultLen].baseContCE = SecondCE; | |
1696 } else { | |
1697 src->lh[src->resultLen].baseContCE = 0; | |
1698 } | |
1699 src->lh[src->resultLen].nextCE = 0; | |
1700 src->lh[src->resultLen].nextContCE = 0; | |
1701 src->lh[src->resultLen].previousCE = 0; | |
1702 src->lh[src->resultLen].previousContCE = 0; | |
1703 | |
1704 src->lh[src->resultLen].indirect = FALSE; | |
1705 | |
1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, statu
s); | |
1707 //} | |
1708 } | |
1709 | |
1710 return sourceToken; | |
1711 | |
1712 } | |
1713 | |
1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
r, UErrorCode *status) { | |
1715 UColToken *lastToken = NULL; | |
1716 const UChar *parseEnd = NULL; | |
1717 uint32_t expandNext = 0; | |
1718 UBool variableTop = FALSE; | |
1719 UBool top = FALSE; | |
1720 uint16_t specs = 0; | |
1721 UColTokListHeader *ListList = NULL; | |
1722 | |
1723 src->parsedToken.strength = UCOL_TOK_UNSET; | |
1724 | |
1725 ListList = src->lh; | |
1726 | |
1727 if(U_FAILURE(*status)) { | |
1728 return 0; | |
1729 } | |
1730 #ifdef DEBUG_FOR_CODE_POINTS | |
1731 char filename[35]; | |
1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); | |
1733 dfcp_fp = fopen(filename, "a"); | |
1734 fprintf(stdout, "Output is in the file %s.\n", filename); | |
1735 #endif | |
1736 | |
1737 #ifdef DEBUG_FOR_COLL_RULES | |
1738 std::string s3; | |
1739 UnicodeString(src->source).toUTF8String(s3); | |
1740 std::cout << "src->source = " << s3 << std::endl; | |
1741 #endif | |
1742 | |
1743 while(src->current < src->end || src->isStarred) { | |
1744 src->parsedToken.prefixOffset = 0; | |
1745 | |
1746 parseEnd = ucol_tok_parseNextToken(src, | |
1747 (UBool)(lastToken == NULL), | |
1748 parseError, | |
1749 status); | |
1750 | |
1751 specs = src->parsedToken.flags; | |
1752 | |
1753 | |
1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); | |
1755 top = ((specs & UCOL_TOK_TOP) != 0); | |
1756 | |
1757 if(U_SUCCESS(*status) && parseEnd != NULL) { | |
1758 UColToken *sourceToken = NULL; | |
1759 //uint32_t key = 0; | |
1760 uint32_t lastStrength = UCOL_TOK_UNSET; | |
1761 | |
1762 if(lastToken != NULL ) { | |
1763 lastStrength = lastToken->strength; | |
1764 } | |
1765 | |
1766 #ifdef DEBUG_FOR_CODE_POINTS | |
1767 UChar32 cp; | |
1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src
->extraEnd - src->source), cp); | |
1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsed
Token.strength); | |
1770 #endif | |
1771 //key = newCharsLen << 24 | charsOffset; | |
1772 UColToken key; | |
1773 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.char
sOffset; | |
1774 key.rulesToParseHdl = &(src->source); | |
1775 | |
1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceT
oken */ | |
1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key); | |
1778 | |
1779 if(src->parsedToken.strength != UCOL_TOK_RESET) { | |
1780 if(lastToken == NULL) { /* this means that rules haven't started
properly */ | |
1781 *status = U_INVALID_FORMAT_ERROR; | |
1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),pa
rseError); | |
1783 DBG_FORMAT_ERROR | |
1784 return 0; | |
1785 } | |
1786 /* 6 Otherwise (when relation != reset) */ | |
1787 if(sourceToken == NULL) { | |
1788 /* If sourceToken is null, create new one, */ | |
1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); | |
1790 /* test for NULL */ | |
1791 if (sourceToken == NULL) { | |
1792 *status = U_MEMORY_ALLOCATION_ERROR; | |
1793 return 0; | |
1794 } | |
1795 sourceToken->rulesToParseHdl = &(src->source); | |
1796 sourceToken->source = src->parsedToken.charsLen << 24 | src-
>parsedToken.charsOffset; | |
1797 | |
1798 sourceToken->debugSource = *(src->source + src->parsedToken.
charsOffset); | |
1799 | |
1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src
->parsedToken.prefixOffset; | |
1801 sourceToken->debugPrefix = *(src->source + src->parsedToken.
prefixOffset); | |
1802 | |
1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO:
this should also handle reverse */ | |
1804 sourceToken->next = NULL; | |
1805 sourceToken->previous = NULL; | |
1806 sourceToken->noOfCEs = 0; | |
1807 sourceToken->noOfExpCEs = 0; | |
1808 // keep the flags around so that we know about before | |
1809 sourceToken->flags = src->parsedToken.flags; | |
1810 uhash_put(src->tailored, sourceToken, sourceToken, status); | |
1811 if(U_FAILURE(*status)) { | |
1812 return 0; | |
1813 } | |
1814 } else { | |
1815 /* we could have fished out a reset here */ | |
1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != s
ourceToken) { | |
1817 /* otherwise remove sourceToken from where it was. */ | |
1818 if(sourceToken->next != NULL) { | |
1819 if(sourceToken->next->strength > sourceToken->streng
th) { | |
1820 sourceToken->next->strength = sourceToken->stren
gth; | |
1821 } | |
1822 sourceToken->next->previous = sourceToken->previous; | |
1823 } else { | |
1824 sourceToken->listHeader->last = sourceToken->previou
s; | |
1825 } | |
1826 | |
1827 if(sourceToken->previous != NULL) { | |
1828 sourceToken->previous->next = sourceToken->next; | |
1829 } else { | |
1830 sourceToken->listHeader->first = sourceToken->next; | |
1831 } | |
1832 sourceToken->next = NULL; | |
1833 sourceToken->previous = NULL; | |
1834 } | |
1835 } | |
1836 | |
1837 sourceToken->strength = src->parsedToken.strength; | |
1838 sourceToken->listHeader = lastToken->listHeader; | |
1839 | |
1840 /* | |
1841 1. Find the strongest strength in each list, and set strongestP
and strongestN | |
1842 accordingly in the headers. | |
1843 */ | |
1844 if(lastStrength == UCOL_TOK_RESET | |
1845 || sourceToken->listHeader->first == 0) { | |
1846 /* If LAST is a reset | |
1847 insert sourceToken in the list. */ | |
1848 if(sourceToken->listHeader->first == 0) { | |
1849 sourceToken->listHeader->first = sourceToken; | |
1850 sourceToken->listHeader->last = sourceToken; | |
1851 } else { /* we need to find a place for us */ | |
1852 /* and we'll get in front of the same strength */ | |
1853 if(sourceToken->listHeader->first->strength <= sourc
eToken->strength) { | |
1854 sourceToken->next = sourceToken->listHeader->fir
st; | |
1855 sourceToken->next->previous = sourceToken; | |
1856 sourceToken->listHeader->first = sourceToken; | |
1857 sourceToken->previous = NULL; | |
1858 } else { | |
1859 lastToken = sourceToken->listHeader->first; | |
1860 while(lastToken->next != NULL && lastToken->next
->strength > sourceToken->strength) { | |
1861 lastToken = lastToken->next; | |
1862 } | |
1863 if(lastToken->next != NULL) { | |
1864 lastToken->next->previous = sourceToken; | |
1865 } else { | |
1866 sourceToken->listHeader->last = sourceToken; | |
1867 } | |
1868 sourceToken->previous = lastToken; | |
1869 sourceToken->next = lastToken->next; | |
1870 lastToken->next = sourceToken; | |
1871 } | |
1872 } | |
1873 } else { | |
1874 /* Otherwise (when LAST is not a reset) | |
1875 if polarity (LAST) == polarity(relation), insert sourceT
oken after LAST, | |
1876 otherwise insert before. | |
1877 when inserting after or before, search to the next posit
ion with the same | |
1878 strength in that direction. (This is called postpone ins
ertion). */ | |
1879 if(sourceToken != lastToken) { | |
1880 if(lastToken->polarity == sourceToken->polarity) { | |
1881 while(lastToken->next != NULL && lastToken->next
->strength > sourceToken->strength) { | |
1882 lastToken = lastToken->next; | |
1883 } | |
1884 sourceToken->previous = lastToken; | |
1885 if(lastToken->next != NULL) { | |
1886 lastToken->next->previous = sourceToken; | |
1887 } else { | |
1888 sourceToken->listHeader->last = sourceToken; | |
1889 } | |
1890 | |
1891 sourceToken->next = lastToken->next; | |
1892 lastToken->next = sourceToken; | |
1893 } else { | |
1894 while(lastToken->previous != NULL && lastToken->
previous->strength > sourceToken->strength) { | |
1895 lastToken = lastToken->previous; | |
1896 } | |
1897 sourceToken->next = lastToken; | |
1898 if(lastToken->previous != NULL) { | |
1899 lastToken->previous->next = sourceToken; | |
1900 } else { | |
1901 sourceToken->listHeader->first = sourceToken
; | |
1902 } | |
1903 sourceToken->previous = lastToken->previous; | |
1904 lastToken->previous = sourceToken; | |
1905 } | |
1906 } else { /* repeated one thing twice in rules, stay with
the stronger strength */ | |
1907 if(lastStrength < sourceToken->strength) { | |
1908 sourceToken->strength = lastStrength; | |
1909 } | |
1910 } | |
1911 } | |
1912 | |
1913 /* if the token was a variable top, we're gonna put it in */ | |
1914 if(variableTop == TRUE && src->varTop == NULL) { | |
1915 variableTop = FALSE; | |
1916 src->varTop = sourceToken; | |
1917 } | |
1918 | |
1919 // Treat the expansions. | |
1920 // There are two types of expansions: explicit (x / y) and r
eset based propagating expansions | |
1921 // (&abc * d * e <=> &ab * d / c * e / c) | |
1922 // if both of them are in effect for a token, they are combi
ned. | |
1923 | |
1924 sourceToken->expansion = src->parsedToken.extensionLen << 24
| src->parsedToken.extensionOffset; | |
1925 | |
1926 if(expandNext != 0) { | |
1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary s
trength kills off the implicit expansion */ | |
1928 expandNext = 0; | |
1929 } else if(sourceToken->expansion == 0) { /* if there is
no expansion, implicit is just added to the token */ | |
1930 sourceToken->expansion = expandNext; | |
1931 } else { /* there is both explicit and implicit expansio
n. We need to make a combination */ | |
1932 uprv_memcpy(src->extraCurrent, src->source + (expand
Next & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); | |
1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), sr
c->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*size
of(UChar)); | |
1934 sourceToken->expansion = (uint32_t)(((expandNext >>
24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->s
ource)); | |
1935 src->extraCurrent += (expandNext >> 24) + src->parse
dToken.extensionLen; | |
1936 } | |
1937 } | |
1938 | |
1939 // This is just for debugging purposes | |
1940 if(sourceToken->expansion != 0) { | |
1941 sourceToken->debugExpansion = *(src->source + src->parse
dToken.extensionOffset); | |
1942 } else { | |
1943 sourceToken->debugExpansion = 0; | |
1944 } | |
1945 // if the previous token was a reset before, the strength of
this | |
1946 // token must match the strength of before. Otherwise we hav
e an | |
1947 // undefined situation. | |
1948 // In other words, we currently have a cludge which we use t
o | |
1949 // represent &a >> x. This is written as &[before 2]a << x. | |
1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { | |
1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BE
FORE) - 1; | |
1952 if(beforeStrength != sourceToken->strength) { | |
1953 *status = U_INVALID_FORMAT_ERROR; | |
1954 syntaxError(src->source,0,(int32_t)(src->end-src->so
urce),parseError); | |
1955 DBG_FORMAT_ERROR | |
1956 return 0; | |
1957 } | |
1958 } | |
1959 } else { | |
1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { | |
1961 /* if the previous token was also a reset, */ | |
1962 /*this means that we have two consecutive resets */ | |
1963 /* and we want to remove the previous one if empty*/ | |
1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first ==
NULL) { | |
1965 src->resultLen--; | |
1966 } | |
1967 } | |
1968 | |
1969 if(sourceToken == NULL) { /* this is a reset, but it might still
be somewhere in the tailoring, in shorter form */ | |
1970 uint32_t searchCharsLen = src->parsedToken.charsLen; | |
1971 while(searchCharsLen > 1 && sourceToken == NULL) { | |
1972 searchCharsLen--; | |
1973 //key = searchCharsLen << 24 | charsOffset; | |
1974 UColToken key; | |
1975 key.source = searchCharsLen << 24 | src->parsedToken.cha
rsOffset; | |
1976 key.rulesToParseHdl = &(src->source); | |
1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key
); | |
1978 } | |
1979 if(sourceToken != NULL) { | |
1980 expandNext = (src->parsedToken.charsLen - searchCharsLen
) << 24 | (src->parsedToken.charsOffset + searchCharsLen); | |
1981 } | |
1982 } | |
1983 | |
1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ | |
1985 if(top == FALSE) { /* there is no indirection */ | |
1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
1987 if(sourceToken != NULL && sourceToken->strength != UCOL_
TOK_RESET) { | |
1988 /* this is a before that is already ordered in the U
CA - so we need to get the previous with good strength */ | |
1989 while(sourceToken->strength > strength && sourceToke
n->previous != NULL) { | |
1990 sourceToken = sourceToken->previous; | |
1991 } | |
1992 /* here, either we hit the strength or NULL */ | |
1993 if(sourceToken->strength == strength) { | |
1994 if(sourceToken->previous != NULL) { | |
1995 sourceToken = sourceToken->previous; | |
1996 } else { /* start of list */ | |
1997 sourceToken = sourceToken->listHeader->reset
; | |
1998 } | |
1999 } else { /* we hit NULL */ | |
2000 /* we should be doing the else part */ | |
2001 sourceToken = sourceToken->listHeader->reset; | |
2002 sourceToken = getVirginBefore(src, sourceToken,
strength, parseError, status); | |
2003 } | |
2004 } else { | |
2005 sourceToken = getVirginBefore(src, sourceToken, stre
ngth, parseError, status); | |
2006 } | |
2007 } else { /* this is both before and indirection */ | |
2008 top = FALSE; | |
2009 ListList[src->resultLen].previousCE = 0; | |
2010 ListList[src->resultLen].previousContCE = 0; | |
2011 ListList[src->resultLen].indirect = TRUE; | |
2012 /* we need to do slightly more work. we need to get the
baseCE using the */ | |
2013 /* inverse UCA & getPrevious. The next bound is not set,
and will be decided */ | |
2014 /* in ucol_bld */ | |
2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; | |
2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToke
n.indirectIndex].startCE; | |
2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsed
Token.indirectIndex].startContCE;//&0xFFFFFF3F; | |
2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
2019 | |
2020 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->
UCA->image + src->UCA->image->UCAConsts); | |
2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICI
T_MIN<<24) && | |
2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICI
T_MAX<<24) ) { /* implicits - */ | |
2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((b
aseContCE & UCOL_PRIMARYMASK) >> 16); | |
2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary); | |
2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw
-1); | |
2026 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; | |
2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) |
UCOL_CONTINUATION_MARKER; | |
2028 } else { | |
2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseCo
ntCE, &CE, &SecondCE, strength);*/ | |
2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &Se
condCE, strength); | |
2031 } | |
2032 | |
2033 ListList[src->resultLen].baseCE = CE; | |
2034 ListList[src->resultLen].baseContCE = SecondCE; | |
2035 ListList[src->resultLen].nextCE = 0; | |
2036 ListList[src->resultLen].nextContCE = 0; | |
2037 | |
2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p
arseError, status); | |
2039 } | |
2040 } | |
2041 | |
2042 | |
2043 /* 5 If the relation is a reset: | |
2044 If sourceToken is null | |
2045 Create new list, create new sourceToken, make the baseCE from so
urce, put | |
2046 the sourceToken in ListHeader of the new list */ | |
2047 if(sourceToken == NULL) { | |
2048 /* | |
2049 3 Consider each item: relation, source, and expansion: e.g.
...< x / y ... | |
2050 First convert all expansions into normal form. Examples: | |
2051 If "xy" doesn't occur earlier in the list or in the UCA, con
vert &xy * c * | |
2052 d * ... into &x * c/y * d * ... | |
2053 Note: reset values can never have expansions, although they
can cause the | |
2054 very next item to have one. They may be contractions, if the
y are found | |
2055 earlier in the list. | |
2056 */ | |
2057 if(top == FALSE) { | |
2058 collIterate s; | |
2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; | |
2060 | |
2061 uprv_init_collIterate(src->UCA, src->source+src->parsedT
oken.charsOffset, src->parsedToken.charsLen, &s, status); | |
2062 | |
2063 CE = ucol_getNextCE(src->UCA, &s, status); | |
2064 const UChar *expand = s.pos; | |
2065 SecondCE = ucol_getNextCE(src->UCA, &s, status); | |
2066 | |
2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; | |
2068 if(isContinuation(SecondCE)) { | |
2069 ListList[src->resultLen].baseContCE = SecondCE; | |
2070 } else { | |
2071 ListList[src->resultLen].baseContCE = 0; | |
2072 } | |
2073 ListList[src->resultLen].nextCE = 0; | |
2074 ListList[src->resultLen].nextContCE = 0; | |
2075 ListList[src->resultLen].previousCE = 0; | |
2076 ListList[src->resultLen].previousContCE = 0; | |
2077 ListList[src->resultLen].indirect = FALSE; | |
2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNe
xt, parseError, status); | |
2079 } else { /* top == TRUE */ | |
2080 /* just use the supplied values */ | |
2081 top = FALSE; | |
2082 ListList[src->resultLen].previousCE = 0; | |
2083 ListList[src->resultLen].previousContCE = 0; | |
2084 ListList[src->resultLen].indirect = TRUE; | |
2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries
[src->parsedToken.indirectIndex].startCE; | |
2086 ListList[src->resultLen].baseContCE = ucolIndirectBounda
ries[src->parsedToken.indirectIndex].startContCE; | |
2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries
[src->parsedToken.indirectIndex].limitCE; | |
2088 ListList[src->resultLen].nextContCE = ucolIndirectBounda
ries[src->parsedToken.indirectIndex].limitContCE; | |
2089 | |
2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p
arseError, status); | |
2091 | |
2092 } | |
2093 } else { /* reset to something already in rules */ | |
2094 top = FALSE; | |
2095 } | |
2096 } | |
2097 /* 7 After all this, set LAST to point to sourceToken, and goto ste
p 3. */ | |
2098 lastToken = sourceToken; | |
2099 } else { | |
2100 if(U_FAILURE(*status)) { | |
2101 return 0; | |
2102 } | |
2103 } | |
2104 } | |
2105 #ifdef DEBUG_FOR_CODE_POINTS | |
2106 fclose(dfcp_fp); | |
2107 #endif | |
2108 | |
2109 | |
2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { | |
2111 src->resultLen--; | |
2112 } | |
2113 return src->resultLen; | |
2114 } | |
2115 | |
2116 const UChar* ucol_tok_getRulesFromBundle( | |
2117 void* /*context*/, | |
2118 const char* locale, | |
2119 const char* type, | |
2120 int32_t* pLength, | |
2121 UErrorCode* status) | |
2122 { | |
2123 const UChar* rules = NULL; | |
2124 UResourceBundle* bundle; | |
2125 UResourceBundle* collations; | |
2126 UResourceBundle* collation; | |
2127 | |
2128 *pLength = 0; | |
2129 | |
2130 bundle = ures_open(U_ICUDATA_COLL, locale, status); | |
2131 if(U_SUCCESS(*status)){ | |
2132 collations = ures_getByKey(bundle, "collations", NULL, status); | |
2133 if(U_SUCCESS(*status)){ | |
2134 collation = ures_getByKey(collations, type, NULL, status); | |
2135 if(U_SUCCESS(*status)){ | |
2136 rules = ures_getStringByKey(collation, "Sequence", pLength, stat
us); | |
2137 if(U_FAILURE(*status)){ | |
2138 *pLength = 0; | |
2139 rules = NULL; | |
2140 } | |
2141 ures_close(collation); | |
2142 } | |
2143 ures_close(collations); | |
2144 } | |
2145 } | |
2146 | |
2147 ures_close(bundle); | |
2148 | |
2149 return rules; | |
2150 } | |
2151 | |
2152 void ucol_tok_initTokenList( | |
2153 UColTokenParser *src, | |
2154 const UChar *rules, | |
2155 uint32_t rulesLength, | |
2156 const UCollator *UCA, | |
2157 GetCollationRulesFunction importFunc, | |
2158 void* context, | |
2159 UErrorCode *status) { | |
2160 U_NAMESPACE_USE | |
2161 | |
2162 uint32_t nSize = 0; | |
2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); | |
2164 | |
2165 bool needToDeallocRules = false; | |
2166 | |
2167 if(U_FAILURE(*status)) { | |
2168 return; | |
2169 } | |
2170 | |
2171 // set everything to zero, so that we can clean up gracefully | |
2172 uprv_memset(src, 0, sizeof(UColTokenParser)); | |
2173 | |
2174 // first we need to find options that don't like to be normalized, | |
2175 // like copy and remove... | |
2176 //const UChar *openBrace = rules; | |
2177 int32_t optionNumber = -1; | |
2178 const UChar *setStart = NULL; | |
2179 uint32_t i = 0; | |
2180 while(i < rulesLength) { | |
2181 if(rules[i] == 0x005B) { // '[': start of an option | |
2182 /* Gets the following: | |
2183 optionNumber: The index of the option. | |
2184 setStart: The pointer at which the option arguments start. | |
2185 */ | |
2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength
, &setStart); | |
2187 | |
2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tail
oring */ | |
2189 // [optimize] | |
2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule
s+rulesLength, status); | |
2191 if(U_SUCCESS(*status)) { | |
2192 if(src->copySet == NULL) { | |
2193 src->copySet = newSet; | |
2194 } else { | |
2195 uset_addAll(src->copySet, newSet); | |
2196 uset_close(newSet); | |
2197 } | |
2198 } else { | |
2199 return; | |
2200 } | |
2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { | |
2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule
s+rulesLength, status); | |
2203 if(U_SUCCESS(*status)) { | |
2204 if(src->removeSet == NULL) { | |
2205 src->removeSet = newSet; | |
2206 } else { | |
2207 uset_addAll(src->removeSet, newSet); | |
2208 uset_close(newSet); | |
2209 } | |
2210 } else { | |
2211 return; | |
2212 } | |
2213 } else if(optionNumber == OPTION_IMPORT){ | |
2214 // [import <collation-name>] | |
2215 | |
2216 // Find the address of the closing ]. | |
2217 UChar* import_end = u_strchr(setStart, 0x005D); | |
2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); | |
2219 // Ignore trailing whitespace. | |
2220 while(PatternProps::isWhiteSpace(*(import_end-1))) { | |
2221 --import_end; | |
2222 } | |
2223 | |
2224 int32_t optionLength = (int32_t)(import_end - setStart); | |
2225 char option[50]; | |
2226 if(optionLength >= (int32_t)sizeof(option)) { | |
2227 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
2228 return; | |
2229 } | |
2230 u_UCharsToChars(setStart, option, optionLength); | |
2231 option[optionLength] = 0; | |
2232 | |
2233 *status = U_ZERO_ERROR; | |
2234 char locale[50]; | |
2235 int32_t templ; | |
2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &te
mpl, status); | |
2237 if(U_FAILURE(*status)) { | |
2238 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
2239 return; | |
2240 } | |
2241 | |
2242 char type[50]; | |
2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)siz
eof(type), status) <= 0 || | |
2244 U_FAILURE(*status) | |
2245 ) { | |
2246 *status = U_ZERO_ERROR; | |
2247 uprv_strcpy(type, "standard"); | |
2248 } | |
2249 | |
2250 // TODO: Use public functions when available, see ticket #8134. | |
2251 char *keywords = (char *)locale_getKeywordsStart(locale); | |
2252 if(keywords != NULL) { | |
2253 *keywords = 0; | |
2254 } | |
2255 | |
2256 int32_t importRulesLength = 0; | |
2257 const UChar* importRules = importFunc(context, locale, type, &im
portRulesLength, status); | |
2258 | |
2259 #ifdef DEBUG_FOR_COLL_RULES | |
2260 std::string s; | |
2261 UnicodeString(importRules).toUTF8String(s); | |
2262 std::cout << "Import rules = " << s << std::endl; | |
2263 #endif | |
2264 | |
2265 // Add the length of the imported rules to length of the origina
l rules, | |
2266 // and subtract the length of the import option. | |
2267 uint32_t newRulesLength = rulesLength + importRulesLength - (opt
ionEndOffset - i); | |
2268 | |
2269 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UCha
r)); | |
2270 | |
2271 #ifdef DEBUG_FOR_COLL_RULES | |
2272 std::string s1; | |
2273 UnicodeString(rules).toUTF8String(s1); | |
2274 std::cout << "Original rules = " << s1 << std::endl; | |
2275 #endif | |
2276 | |
2277 | |
2278 // Copy the section of the original rules leading up to the impo
rt | |
2279 uprv_memcpy(newRules, rules, i*sizeof(UChar)); | |
2280 // Copy the imported rules | |
2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UC
har)); | |
2282 // Copy the rest of the original rules (minus the import option
itself) | |
2283 uprv_memcpy(newRules+i+importRulesLength, | |
2284 rules+optionEndOffset, | |
2285 (rulesLength-optionEndOffset)*sizeof(UChar)); | |
2286 | |
2287 #ifdef DEBUG_FOR_COLL_RULES | |
2288 std::string s2; | |
2289 UnicodeString(newRules).toUTF8String(s2); | |
2290 std::cout << "Resulting rules = " << s2 << std::endl; | |
2291 #endif | |
2292 | |
2293 if(needToDeallocRules){ | |
2294 // if needToDeallocRules is set, then we allocated rules, so
it's safe to cast and free | |
2295 uprv_free((void*)rules); | |
2296 } | |
2297 needToDeallocRules = true; | |
2298 rules = newRules; | |
2299 rulesLength = newRulesLength; | |
2300 | |
2301 estimatedSize += importRulesLength*2; | |
2302 | |
2303 // First character of the new rules needs to be processed | |
2304 i--; | |
2305 } | |
2306 } | |
2307 //openBrace++; | |
2308 i++; | |
2309 } | |
2310 | |
2311 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); | |
2312 /* test for NULL */ | |
2313 if (src->source == NULL) { | |
2314 *status = U_MEMORY_ALLOCATION_ERROR; | |
2315 return; | |
2316 } | |
2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); | |
2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estim
atedSize, status); | |
2319 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { | |
2320 *status = U_ZERO_ERROR; | |
2321 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_R
ULE_SPACE_SIZE)*sizeof(UChar)); | |
2322 /* test for NULL */ | |
2323 if (src->source == NULL) { | |
2324 *status = U_MEMORY_ALLOCATION_ERROR; | |
2325 return; | |
2326 } | |
2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, n
Size+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); | |
2328 } | |
2329 if(needToDeallocRules){ | |
2330 // if needToDeallocRules is set, then we allocated rules, so it's safe t
o cast and free | |
2331 uprv_free((void*)rules); | |
2332 } | |
2333 | |
2334 | |
2335 src->current = src->source; | |
2336 src->end = src->source+nSize; | |
2337 src->sourceCurrent = src->source; | |
2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule str
ing so that option scanning works correctly | |
2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SP
ACE_SIZE; | |
2340 src->varTop = NULL; | |
2341 src->UCA = UCA; | |
2342 src->invUCA = ucol_initInverseUCA(status); | |
2343 src->parsedToken.charsLen = 0; | |
2344 src->parsedToken.charsOffset = 0; | |
2345 src->parsedToken.extensionLen = 0; | |
2346 src->parsedToken.extensionOffset = 0; | |
2347 src->parsedToken.prefixLen = 0; | |
2348 src->parsedToken.prefixOffset = 0; | |
2349 src->parsedToken.flags = 0; | |
2350 src->parsedToken.strength = UCOL_TOK_UNSET; | |
2351 src->buildCCTabFlag = FALSE; | |
2352 src->isStarred = FALSE; | |
2353 src->inRange = FALSE; | |
2354 src->lastRangeCp = 0; | |
2355 src->previousCp = 0; | |
2356 | |
2357 if(U_FAILURE(*status)) { | |
2358 return; | |
2359 } | |
2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, stat
us); | |
2361 if(U_FAILURE(*status)) { | |
2362 return; | |
2363 } | |
2364 uhash_setValueDeleter(src->tailored, uprv_free); | |
2365 | |
2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); | |
2367 /* test for NULL */ | |
2368 if (src->opts == NULL) { | |
2369 *status = U_MEMORY_ALLOCATION_ERROR; | |
2370 return; | |
2371 } | |
2372 | |
2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); | |
2374 | |
2375 src->lh = 0; | |
2376 src->listCapacity = 1024; | |
2377 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokL
istHeader)); | |
2378 //Test for NULL | |
2379 if (src->lh == NULL) { | |
2380 *status = U_MEMORY_ALLOCATION_ERROR; | |
2381 return; | |
2382 } | |
2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); | |
2384 src->resultLen = 0; | |
2385 | |
2386 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); | |
2387 | |
2388 // UCOL_RESET_TOP_VALUE | |
2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IM
PLICIT); | |
2390 // UCOL_FIRST_PRIMARY_IGNORABLE | |
2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); | |
2392 // UCOL_LAST_PRIMARY_IGNORABLE | |
2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); | |
2394 // UCOL_FIRST_SECONDARY_IGNORABLE | |
2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); | |
2396 // UCOL_LAST_SECONDARY_IGNORABLE | |
2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); | |
2398 // UCOL_FIRST_TERTIARY_IGNORABLE | |
2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); | |
2400 // UCOL_LAST_TERTIARY_IGNORABLE | |
2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); | |
2402 // UCOL_FIRST_VARIABLE | |
2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); | |
2404 // UCOL_LAST_VARIABLE | |
2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); | |
2406 // UCOL_FIRST_NON_VARIABLE | |
2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); | |
2408 // UCOL_LAST_NON_VARIABLE | |
2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_I
MPLICIT); | |
2410 // UCOL_FIRST_IMPLICIT | |
2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); | |
2412 // UCOL_LAST_IMPLICIT | |
2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAIL
ING); | |
2414 // UCOL_FIRST_TRAILING | |
2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); | |
2416 // UCOL_LAST_TRAILING | |
2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); | |
2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); | |
2419 } | |
2420 | |
2421 | |
2422 void ucol_tok_closeTokenList(UColTokenParser *src) { | |
2423 if(src->copySet != NULL) { | |
2424 uset_close(src->copySet); | |
2425 } | |
2426 if(src->removeSet != NULL) { | |
2427 uset_close(src->removeSet); | |
2428 } | |
2429 if(src->tailored != NULL) { | |
2430 uhash_close(src->tailored); | |
2431 } | |
2432 if(src->lh != NULL) { | |
2433 uprv_free(src->lh); | |
2434 } | |
2435 if(src->source != NULL) { | |
2436 uprv_free(src->source); | |
2437 } | |
2438 if(src->opts != NULL) { | |
2439 uprv_free(src->opts); | |
2440 } | |
2441 if (src->reorderCodes != NULL) { | |
2442 uprv_free(src->reorderCodes); | |
2443 } | |
2444 } | |
2445 | |
2446 #endif /* #if !UCONFIG_NO_COLLATION */ | |
OLD | NEW |