OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * | |
4 * Copyright (C) 2001-2013, International Business Machines | |
5 * Corporation and others. All Rights Reserved. | |
6 * | |
7 ******************************************************************************* | |
8 * file name: ucol_bld.cpp | |
9 * encoding: US-ASCII | |
10 * tab size: 8 (not used) | |
11 * indentation:4 | |
12 * | |
13 * created 02/22/2001 | |
14 * created by: Vladimir Weinstein | |
15 * | |
16 * This module builds a collator based on the rule set. | |
17 * | |
18 */ | |
19 | |
20 #include "unicode/utypes.h" | |
21 | |
22 #if !UCONFIG_NO_COLLATION | |
23 | |
24 #include "unicode/ucoleitr.h" | |
25 #include "unicode/udata.h" | |
26 #include "unicode/uchar.h" | |
27 #include "unicode/uniset.h" | |
28 #include "unicode/uscript.h" | |
29 #include "unicode/ustring.h" | |
30 #include "unicode/utf16.h" | |
31 #include "normalizer2impl.h" | |
32 #include "uassert.h" | |
33 #include "ucol_bld.h" | |
34 #include "ucol_elm.h" | |
35 #include "ucol_cnt.h" | |
36 #include "ucln_in.h" | |
37 #include "umutex.h" | |
38 #include "cmemory.h" | |
39 #include "cstring.h" | |
40 | |
/*
 * Number of elements in a true array, as an int32_t.
 * Only valid for actual arrays: a pointer (including an array function
 * parameter) would yield a meaningless value.
 * The whole expansion is parenthesized so the macro behaves as a single
 * primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42 | |
43 static const InverseUCATableHeader* _staticInvUCA = NULL; | |
44 static UDataMemory* invUCA_DATA_MEM = NULL; | |
45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; | |
46 | |
47 U_CDECL_BEGIN | |
48 static UBool U_CALLCONV | |
49 isAcceptableInvUCA(void * /*context*/, | |
50 const char * /*type*/, const char * /*name*/, | |
51 const UDataInfo *pInfo) | |
52 { | |
53 /* context, type & name are intentionally not used */ | |
54 if( pInfo->size>=20 && | |
55 pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
56 pInfo->charsetFamily==U_CHARSET_FAMILY && | |
57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ | |
58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && | |
59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && | |
60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && | |
61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && | |
62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& | |
63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && | |
64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && | |
65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && | |
66 ) | |
67 { | |
68 // TODO: Check that the invuca data version (pInfo->dataVersion) | |
69 // matches the ucadata version. | |
70 return TRUE; | |
71 } else { | |
72 return FALSE; | |
73 } | |
74 } | |
75 U_CDECL_END | |
76 | |
77 /* | |
78 * Takes two CEs (lead and continuation) and | |
79 * compares them as CEs should be compared: | |
80 * primary vs. primary, secondary vs. secondary | |
81 * tertiary vs. tertiary | |
82 */ | |
83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0,
uint32_t target1) { | |
84 uint32_t s1 = source0, s2, t1 = target0, t2; | |
85 if(isContinuation(source1)) { | |
86 s2 = source1; | |
87 } else { | |
88 s2 = 0; | |
89 } | |
90 if(isContinuation(target1)) { | |
91 t2 = target1; | |
92 } else { | |
93 t2 = 0; | |
94 } | |
95 | |
96 uint32_t s = 0, t = 0; | |
97 if(s1 == t1 && s2 == t2) { | |
98 return 0; | |
99 } | |
100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); | |
101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); | |
102 if(s < t) { | |
103 return -1; | |
104 } else if(s > t) { | |
105 return 1; | |
106 } else { | |
107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; | |
108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; | |
109 if(s < t) { | |
110 return -1; | |
111 } else if(s > t) { | |
112 return 1; | |
113 } else { | |
114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); | |
115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); | |
116 if(s < t) { | |
117 return -1; | |
118 } else { | |
119 return 1; | |
120 } | |
121 } | |
122 } | |
123 } | |
124 | |
125 static | |
126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t Second
CE) { | |
127 uint32_t bottom = 0, top = src->invUCA->tableSize; | |
128 uint32_t i = 0; | |
129 uint32_t first = 0, second = 0; | |
130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
131 int32_t res = 0; | |
132 | |
133 while(bottom < top-1) { | |
134 i = (top+bottom)/2; | |
135 first = *(CETable+3*i); | |
136 second = *(CETable+3*i+1); | |
137 res = compareCEs(first, second, CE, SecondCE); | |
138 if(res > 0) { | |
139 top = i; | |
140 } else if(res < 0) { | |
141 bottom = i; | |
142 } else { | |
143 break; | |
144 } | |
145 } | |
146 | |
147 /* weiv: */ | |
148 /* in searching for elements, I have removed the failure */ | |
149 /* The reason for this is that the builder does not rely */ | |
150 /* on search mechanism telling it that it didn't find an */ | |
151 /* element. However, indirect positioning relies on being */ | |
152 /* able to find the elements around any CE, even if it is */ | |
153 /* not defined in the UCA. */ | |
154 return i; | |
155 /* | |
156 if((first == CE && second == SecondCE)) { | |
157 return i; | |
158 } else { | |
159 return -1; | |
160 } | |
161 */ | |
162 } | |
163 | |
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { | |
165 0xFFFF0000, | |
166 0xFFFFFF00, | |
167 0xFFFFFFFF | |
168 }; | |
169 | |
170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, | |
171 uint32_t CE, uint32_t contCE, | |
172 uint32_t *nextCE, uint32_t *nextCont
CE, | |
173 uint32_t strength) | |
174 { | |
175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
176 int32_t iCE; | |
177 | |
178 iCE = ucol_inv_findCE(src, CE, contCE); | |
179 | |
180 if(iCE<0) { | |
181 *nextCE = UCOL_NOT_FOUND; | |
182 return -1; | |
183 } | |
184 | |
185 CE &= strengthMask[strength]; | |
186 contCE &= strengthMask[strength]; | |
187 | |
188 *nextCE = CE; | |
189 *nextContCE = contCE; | |
190 | |
191 while((*nextCE & strengthMask[strength]) == CE | |
192 && (*nextContCE & strengthMask[strength]) == contCE) | |
193 { | |
194 *nextCE = (*(CETable+3*(++iCE))); | |
195 *nextContCE = (*(CETable+3*(iCE)+1)); | |
196 } | |
197 | |
198 return iCE; | |
199 } | |
200 | |
201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, | |
202 uint32_t CE, uint32_t contCE, | |
203 uint32_t *prevCE, uint32_t *prevCont
CE, | |
204 uint32_t strength) | |
205 { | |
206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
207 int32_t iCE; | |
208 | |
209 iCE = ucol_inv_findCE(src, CE, contCE); | |
210 | |
211 if(iCE<0) { | |
212 *prevCE = UCOL_NOT_FOUND; | |
213 return -1; | |
214 } | |
215 | |
216 CE &= strengthMask[strength]; | |
217 contCE &= strengthMask[strength]; | |
218 | |
219 *prevCE = CE; | |
220 *prevContCE = contCE; | |
221 | |
222 while((*prevCE & strengthMask[strength]) == CE | |
223 && (*prevContCE & strengthMask[strength])== contCE | |
224 && iCE > 0) /* this condition should prevent falling off the edge of the
world */ | |
225 { | |
226 /* here, we end up in a singularity - zero */ | |
227 *prevCE = (*(CETable+3*(--iCE))); | |
228 *prevContCE = (*(CETable+3*(iCE)+1)); | |
229 } | |
230 | |
231 return iCE; | |
232 } | |
233 | |
234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t co
ntCE, | |
235 uint32_t prevCE, uint32_t
prevContCE) | |
236 { | |
237 if(prevCE == CE && prevContCE == contCE) { | |
238 return UCOL_IDENTICAL; | |
239 } | |
240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]
) | |
241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[U
COL_PRIMARY])) | |
242 { | |
243 return UCOL_PRIMARY; | |
244 } | |
245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECOND
ARY]) | |
246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask
[UCOL_SECONDARY])) | |
247 { | |
248 return UCOL_SECONDARY; | |
249 } | |
250 return UCOL_TERTIARY; | |
251 } | |
252 | |
253 | |
254 /*static | |
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh,
uint32_t strength) { | |
256 | |
257 uint32_t CE = lh->baseCE; | |
258 uint32_t SecondCE = lh->baseContCE; | |
259 | |
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
261 uint32_t previousCE, previousContCE; | |
262 int32_t iCE; | |
263 | |
264 iCE = ucol_inv_findCE(src, CE, SecondCE); | |
265 | |
266 if(iCE<0) { | |
267 return -1; | |
268 } | |
269 | |
270 CE &= strengthMask[strength]; | |
271 SecondCE &= strengthMask[strength]; | |
272 | |
273 previousCE = CE; | |
274 previousContCE = SecondCE; | |
275 | |
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & str
engthMask[strength])== SecondCE) { | |
277 previousCE = (*(CETable+3*(--iCE))); | |
278 previousContCE = (*(CETable+3*(iCE)+1)); | |
279 } | |
280 lh->previousCE = previousCE; | |
281 lh->previousContCE = previousContCE; | |
282 | |
283 return iCE; | |
284 }*/ | |
285 | |
286 static | |
287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uin
t32_t strength) { | |
288 uint32_t CE = lh->baseCE; | |
289 uint32_t SecondCE = lh->baseContCE; | |
290 | |
291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
292 uint32_t nextCE, nextContCE; | |
293 int32_t iCE; | |
294 | |
295 iCE = ucol_inv_findCE(src, CE, SecondCE); | |
296 | |
297 if(iCE<0) { | |
298 return -1; | |
299 } | |
300 | |
301 CE &= strengthMask[strength]; | |
302 SecondCE &= strengthMask[strength]; | |
303 | |
304 nextCE = CE; | |
305 nextContCE = SecondCE; | |
306 | |
307 while((nextCE & strengthMask[strength]) == CE | |
308 && (nextContCE & strengthMask[strength]) == SecondCE) | |
309 { | |
310 nextCE = (*(CETable+3*(++iCE))); | |
311 nextContCE = (*(CETable+3*(iCE)+1)); | |
312 } | |
313 | |
314 lh->nextCE = nextCE; | |
315 lh->nextContCE = nextContCE; | |
316 | |
317 return iCE; | |
318 } | |
319 | |
320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh
, UErrorCode *status) { | |
321 /* reset all the gaps */ | |
322 int32_t i = 0; | |
323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); | |
324 uint32_t st = 0; | |
325 uint32_t t1, t2; | |
326 int32_t pos; | |
327 | |
328 UColToken *tok = lh->first; | |
329 uint32_t tokStrength = tok->strength; | |
330 | |
331 for(i = 0; i<3; i++) { | |
332 lh->gapsHi[3*i] = 0; | |
333 lh->gapsHi[3*i+1] = 0; | |
334 lh->gapsHi[3*i+2] = 0; | |
335 lh->gapsLo[3*i] = 0; | |
336 lh->gapsLo[3*i+1] = 0; | |
337 lh->gapsLo[3*i+2] = 0; | |
338 lh->numStr[i] = 0; | |
339 lh->fStrToken[i] = NULL; | |
340 lh->lStrToken[i] = NULL; | |
341 lh->pos[i] = -1; | |
342 } | |
343 | |
344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); | |
345 | |
346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh
->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicit
s - */ | |
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT
_MAX ) { /* implicits - */ | |
348 lh->pos[0] = 0; | |
349 t1 = lh->baseCE; | |
350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; | |
351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; | |
353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; | |
354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK)
>> 16); | |
355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(prim
aryCE)+1); | |
356 | |
357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; | |
358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER
; | |
359 | |
360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; | |
362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; | |
363 } else if(lh->indirect == TRUE && lh->nextCE != 0) { | |
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { | |
365 lh->pos[0] = 0; | |
366 t1 = lh->baseCE; | |
367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; | |
368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; | |
370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; | |
371 t1 = lh->nextCE; | |
372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; | |
373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; | |
374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; | |
375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; | |
376 } else { | |
377 for(;;) { | |
378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength
)) >= 0) { | |
380 lh->fStrToken[tokStrength] = tok; | |
381 } else { /* The CE must be implicit, since it's not in the table
*/ | |
382 /* Error */ | |
383 *status = U_INTERNAL_PROGRAM_ERROR; | |
384 } | |
385 } | |
386 | |
387 while(tok != NULL && tok->strength >= tokStrength) { | |
388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { | |
389 lh->lStrToken[tokStrength] = tok; | |
390 } | |
391 tok = tok->next; | |
392 } | |
393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { | |
394 /* check if previous interval is the same and merge the interval
s if it is so */ | |
395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { | |
396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; | |
397 lh->fStrToken[tokStrength+1] = NULL; | |
398 lh->lStrToken[tokStrength+1] = NULL; | |
399 lh->pos[tokStrength+1] = -1; | |
400 } | |
401 } | |
402 if(tok != NULL) { | |
403 tokStrength = tok->strength; | |
404 } else { | |
405 break; | |
406 } | |
407 } | |
408 for(st = 0; st < 3; st++) { | |
409 if((pos = lh->pos[st]) >= 0) { | |
410 t1 = *(CETable+3*(pos)); | |
411 t2 = *(CETable+3*(pos)+1); | |
412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM
ASK) >> 16; | |
413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO
L_SECONDARYMASK) << 8; | |
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TE
RTIARYORDER(t2)) << 16; | |
415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
416 //pos--; | |
417 //t1 = *(CETable+3*(pos)); | |
418 //t2 = *(CETable+3*(pos)+1); | |
419 t1 = lh->baseCE; | |
420 t2 = lh->baseContCE; | |
421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM
ASK) >> 16; | |
422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO
L_SECONDARYMASK) << 8; | |
423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; | |
424 } | |
425 } | |
426 } | |
427 } | |
428 | |
429 | |
/*
 * Stores in noOfBytes how many bytes of the left-justified 32-bit weight
 * `value` are in use: the position of the lowest non-zero byte, counted from
 * the most significant byte (0 for value == 0, 4 when the low byte is set).
 *
 * Written as do { } while(0) so that "ucol_countBytes(v, n);" is a single
 * statement everywhere, including an unbraced if/else. `value` is evaluated
 * exactly once, and the local names carry a suffix so they cannot shadow an
 * identifier used in the arguments.
 */
#define ucol_countBytes(value, noOfBytes)                       \
do {                                                            \
    const uint32_t ucol_countBytes_value_ = (value);            \
    uint32_t ucol_countBytes_mask_ = 0xFFFFFFFF;                \
    (noOfBytes) = 0;                                            \
    while(ucol_countBytes_mask_ != 0) {                         \
        if((ucol_countBytes_value_ & ucol_countBytes_mask_) != 0) { \
            (noOfBytes)++;                                      \
        }                                                       \
        ucol_countBytes_mask_ >>= 8;                            \
    }                                                           \
} while(0)
441 | |
442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { | |
443 if(U_SUCCESS(*status)) { | |
444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
445 } | |
446 return g->current; | |
447 } | |
448 | |
449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, ui
nt32_t strength, UErrorCode *status) { | |
450 /* TODO: rename to enum names */ | |
451 uint32_t high, low, count=1; | |
452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; | |
453 | |
454 if(strength == UCOL_SECONDARY) { | |
455 low = UCOL_COMMON_TOP2<<24; | |
456 high = 0xFFFFFFFF; | |
457 count = 0xFF - UCOL_COMMON_TOP2; | |
458 } else { | |
459 low = UCOL_BYTE_COMMON << 24; //0x05000000; | |
460 high = 0x40000000; | |
461 count = 0x40 - UCOL_BYTE_COMMON; | |
462 } | |
463 | |
464 if(tok->next != NULL && tok->next->strength == strength) { | |
465 count = tok->next->toInsert; | |
466 } | |
467 | |
468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
469 g->current = UCOL_BYTE_COMMON<<24; | |
470 | |
471 if(g->noOfRanges == 0) { | |
472 *status = U_INTERNAL_PROGRAM_ERROR; | |
473 } | |
474 return g->current; | |
475 } | |
476 | |
477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t
* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { | |
478 uint32_t strength = tok->strength; | |
479 uint32_t low = lows[fStrength*3+strength]; | |
480 uint32_t high = highs[fStrength*3+strength]; | |
481 uint32_t maxByte = 0; | |
482 if(strength == UCOL_TERTIARY) { | |
483 maxByte = 0x3F; | |
484 } else if(strength == UCOL_PRIMARY) { | |
485 maxByte = 0xFE; | |
486 } else { | |
487 maxByte = 0xFF; | |
488 } | |
489 | |
490 uint32_t count = tok->toInsert; | |
491 | |
492 if(low >= high && strength > UCOL_PRIMARY) { | |
493 int32_t s = strength; | |
494 for(;;) { | |
495 s--; | |
496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { | |
497 if(strength == UCOL_SECONDARY) { | |
498 if (low < UCOL_COMMON_TOP2<<24 ) { | |
499 // Override if low range is less than UCOL_COMMON_TOP2. | |
500 low = UCOL_COMMON_TOP2<<24; | |
501 } | |
502 high = 0xFFFFFFFF; | |
503 } else { | |
504 // Override if low range is less than UCOL_COMMON_BOT3. | |
505 if ( low < UCOL_COMMON_BOT3<<24 ) { | |
506 low = UCOL_COMMON_BOT3<<24; | |
507 } | |
508 high = 0x40000000; | |
509 } | |
510 break; | |
511 } | |
512 if(s<0) { | |
513 *status = U_INTERNAL_PROGRAM_ERROR; | |
514 return 0; | |
515 } | |
516 } | |
517 } | |
518 | |
519 if(low < 0x02000000) { | |
520 // We must not use CE weight byte 02, so we set it as the minimum lower
bound. | |
521 // See http://site.icu-project.org/design/collation/bytes | |
522 low = 0x02000000; | |
523 } | |
524 | |
525 if(strength == UCOL_SECONDARY) { /* similar as simple */ | |
526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<2
4)) { | |
527 low = UCOL_COMMON_TOP2<<24; | |
528 } | |
529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<
24)) { | |
530 high = UCOL_COMMON_TOP2<<24; | |
531 } | |
532 if(low < (UCOL_COMMON_BOT2<<24)) { | |
533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high,
count, maxByte, g->ranges); | |
534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
535 //g->current = UCOL_COMMON_BOT2<<24; | |
536 return g->current; | |
537 } | |
538 } | |
539 | |
540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); | |
541 if(g->noOfRanges == 0) { | |
542 *status = U_INTERNAL_PROGRAM_ERROR; | |
543 } | |
544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); | |
545 return g->current; | |
546 } | |
547 | |
548 static | |
549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *res
Buf, const uint32_t resLen, UErrorCode *status) { | |
550 uint32_t i = 0; | |
551 UChar c; | |
552 | |
553 if(U_FAILURE(*status)) { | |
554 return 0; | |
555 } | |
556 | |
557 if(sourceLen > resLen) { | |
558 *status = U_MEMORY_ALLOCATION_ERROR; | |
559 return 0; | |
560 } | |
561 | |
562 for(i = 0; i < sourceLen; i++) { | |
563 c = source[i]; | |
564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ | |
565 switch(c - 0x3000) { | |
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: ca
se 0x83: case 0x85: case 0x8E: | |
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: ca
se 0xE3: case 0xE5: case 0xEE: | |
568 c++; | |
569 break; | |
570 case 0xF5: | |
571 c = 0x30AB; | |
572 break; | |
573 case 0xF6: | |
574 c = 0x30B1; | |
575 break; | |
576 } | |
577 } | |
578 resBuf[i] = c; | |
579 } | |
580 return sourceLen; | |
581 } | |
582 | |
583 static | |
584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *res
Buf, const uint32_t resLen, UErrorCode *status) { | |
585 uint32_t i = 0; | |
586 UChar c; | |
587 | |
588 if(U_FAILURE(*status)) { | |
589 return 0; | |
590 } | |
591 | |
592 if(sourceLen > resLen) { | |
593 *status = U_MEMORY_ALLOCATION_ERROR; | |
594 return 0; | |
595 } | |
596 | |
597 for(i = 0; i < sourceLen; i++) { | |
598 c = source[i]; | |
599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ | |
600 switch(c - 0x3000) { | |
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: ca
se 0x84: case 0x86: case 0x8F: | |
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: ca
se 0xE4: case 0xE6: case 0xEF: | |
603 c--; | |
604 break; | |
605 case 0xAB: | |
606 c = 0x30F5; | |
607 break; | |
608 case 0xB1: | |
609 c = 0x30F6; | |
610 break; | |
611 } | |
612 } | |
613 resBuf[i] = c; | |
614 } | |
615 return sourceLen; | |
616 } | |
617 | |
618 U_NAMESPACE_BEGIN | |
619 | |
620 static | |
621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l
en, UErrorCode *status) { | |
622 uint32_t i = 0; | |
623 UChar n[128]; | |
624 uint32_t nLen = 0; | |
625 uint32_t uCount = 0, lCount = 0; | |
626 | |
627 collIterate s; | |
628 uint32_t order = 0; | |
629 | |
630 if(U_FAILURE(*status)) { | |
631 return UCOL_LOWER_CASE; | |
632 } | |
633 | |
634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); | |
635 if(U_SUCCESS(*status)) { | |
636 for(i = 0; i < nLen; i++) { | |
637 uprv_init_collIterate(UCA, &n[i], 1, &s, status); | |
638 order = ucol_getNextCE(UCA, &s, status); | |
639 if(isContinuation(order)) { | |
640 *status = U_INTERNAL_PROGRAM_ERROR; | |
641 return UCOL_LOWER_CASE; | |
642 } | |
643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { | |
644 uCount++; | |
645 } else { | |
646 if(u_islower(n[i])) { | |
647 lCount++; | |
648 } else if(U_SUCCESS(*status)) { | |
649 UChar sk[1], lk[1]; | |
650 u_toSmallKana(&n[i], 1, sk, 1, status); | |
651 u_toLargeKana(&n[i], 1, lk, 1, status); | |
652 if(sk[0] == n[i] && lk[0] != n[i]) { | |
653 lCount++; | |
654 } | |
655 } | |
656 } | |
657 } | |
658 } | |
659 | |
660 if(uCount != 0 && lCount != 0) { | |
661 return UCOL_MIXED_CASE; | |
662 } else if(uCount != 0) { | |
663 return UCOL_UPPER_CASE; | |
664 } else { | |
665 return UCOL_LOWER_CASE; | |
666 } | |
667 } | |
668 | |
669 | |
670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok,
UErrorCode *status) { | |
671 /* this one makes the table and stuff */ | |
672 uint32_t noOfBytes[3]; | |
673 uint32_t i; | |
674 | |
675 for(i = 0; i<3; i++) { | |
676 ucol_countBytes(CEparts[i], noOfBytes[i]); | |
677 } | |
678 | |
679 /* Here we have to pack CEs from parts */ | |
680 | |
681 uint32_t CEi = 0; | |
682 uint32_t value = 0; | |
683 | |
684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { | |
685 if(CEi > 0) { | |
686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
687 } else { | |
688 value = 0; | |
689 } | |
690 | |
691 if(2*CEi<noOfBytes[0]) { | |
692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; | |
693 } | |
694 if(CEi<noOfBytes[1]) { | |
695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; | |
696 } | |
697 if(CEi<noOfBytes[2]) { | |
698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); | |
699 } | |
700 tok->CEs[CEi] = value; | |
701 CEi++; | |
702 } | |
703 if(CEi == 0) { /* totally ignorable */ | |
704 tok->noOfCEs = 1; | |
705 tok->CEs[0] = 0; | |
706 } else { /* there is at least something */ | |
707 tok->noOfCEs = CEi; | |
708 } | |
709 | |
710 | |
711 // we want to set case bits here and now, not later. | |
712 // Case bits handling | |
713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables | |
714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field | |
715 int32_t cSize = (tok->source & 0xFF000000) >> 24; | |
716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; | |
717 | |
718 if(cSize > 1) { | |
719 // Do it manually | |
720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, statu
s); | |
721 } else { | |
722 // Copy it from the UCA | |
723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); | |
724 tok->CEs[0] |= (caseCE & 0xC0); | |
725 } | |
726 } | |
727 | |
728 #if UCOL_DEBUG==2 | |
729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource,
tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes
[1]), CEparts[2]>> (32-8*noOfBytes[2])); | |
730 for(i = 0; i<tok->noOfCEs; i++) { | |
731 fprintf(stderr, "%08X ", tok->CEs[i]); | |
732 } | |
733 fprintf(stderr, "\n"); | |
734 #endif | |
735 } | |
736 | |
737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
rCode *status) { | |
738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; | |
739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; | |
740 | |
741 UColToken *tok = lh->last; | |
742 uint32_t t[UCOL_STRENGTH_LIMIT]; | |
743 | |
744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); | |
745 | |
746 /* must initialize ranges to avoid memory check warnings */ | |
747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { | |
748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); | |
749 } | |
750 | |
751 tok->toInsert = 1; | |
752 t[tok->strength] = 1; | |
753 | |
754 while(tok->previous != NULL) { | |
755 if(tok->previous->strength < tok->strength) { /* going up */ | |
756 t[tok->strength] = 0; | |
757 t[tok->previous->strength]++; | |
758 } else if(tok->previous->strength > tok->strength) { /* going down */ | |
759 t[tok->previous->strength] = 1; | |
760 } else { | |
761 t[tok->strength]++; | |
762 } | |
763 tok=tok->previous; | |
764 tok->toInsert = t[tok->strength]; | |
765 } | |
766 | |
767 tok->toInsert = t[tok->strength]; | |
768 ucol_inv_getGapPositions(src, lh, status); | |
769 | |
770 #if UCOL_DEBUG | |
771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); | |
772 int32_t j = 2; | |
773 for(j = 2; j >= 0; j--) { | |
774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh-
>gapsLo[j*3+1], lh->gapsLo[j*3+2]); | |
775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh-
>gapsHi[j*3+1], lh->gapsHi[j*3+2]); | |
776 } | |
777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; | |
778 | |
779 do { | |
780 fprintf(stderr,"%i", tok->strength); | |
781 tok = tok->next; | |
782 } while(tok != NULL); | |
783 fprintf(stderr, "\n"); | |
784 | |
785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; | |
786 | |
787 do { | |
788 fprintf(stderr,"%i", tok->toInsert); | |
789 tok = tok->next; | |
790 } while(tok != NULL); | |
791 #endif | |
792 | |
793 tok = lh->first; | |
794 uint32_t fStrength = UCOL_IDENTICAL; | |
795 uint32_t initStrength = UCOL_IDENTICAL; | |
796 | |
797 | |
798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE &
UCOL_PRIMARYMASK) >> 16; | |
799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->bas
eContCE & UCOL_SECONDARYMASK) << 8; | |
800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERT
IARYORDER(lh->baseContCE)) << 16; | |
801 | |
802 while (tok != NULL && U_SUCCESS(*status)) { | |
803 fStrength = tok->strength; | |
804 if(fStrength < initStrength) { | |
805 initStrength = fStrength; | |
806 if(lh->pos[fStrength] == -1) { | |
807 while(lh->pos[fStrength] == -1 && fStrength > 0) { | |
808 fStrength--; | |
809 } | |
810 if(lh->pos[fStrength] == -1) { | |
811 *status = U_INTERNAL_PROGRAM_ERROR; | |
812 return; | |
813 } | |
814 } | |
815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ | |
816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; | |
818 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gap
sLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ | |
819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY
], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ | |
821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; | |
822 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrengt
h*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ | |
823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDA
RY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); | |
825 } else { /* primaries */ | |
826 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gaps
Lo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ | |
827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY],
lh->gapsLo, lh->gapsHi, tok, fStrength, status); | |
828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S
ECONDARY], tok, UCOL_SECONDARY, status); | |
829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); | |
830 } | |
831 } else { | |
832 if(tok->strength == UCOL_TERTIARY) { | |
833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIA
RY], status); | |
834 } else if(tok->strength == UCOL_SECONDARY) { | |
835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECON
DARY], status); | |
836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); | |
837 } else if(tok->strength == UCOL_PRIMARY) { | |
838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY
], status); | |
839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S
ECONDARY], tok, UCOL_SECONDARY, status); | |
840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); | |
841 } | |
842 } | |
843 ucol_doCE(src, CEparts, tok, status); | |
844 tok = tok->next; | |
845 } | |
846 } | |
847 | |
848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
istHeader *lh, UErrorCode *status) { | |
849 UCAElements el; | |
850 UColToken *tok = lh->first; | |
851 UColToken *expt = NULL; | |
852 uint32_t i = 0, j = 0; | |
853 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); | |
854 | |
855 while(tok != NULL && U_SUCCESS(*status)) { | |
856 /* first, check if there are any expansions */ | |
857 /* if there are expansions, we need to do a little bit more processing *
/ | |
858 /* since parts of expansion can be tailored, while others are not */ | |
859 if(tok->expansion != 0) { | |
860 uint32_t len = tok->expansion >> 24; | |
861 uint32_t currentSequenceLen = len; | |
862 uint32_t expOffset = tok->expansion & 0x00FFFFFF; | |
863 //uint32_t exp = currentSequenceLen | expOffset; | |
864 UColToken exp; | |
865 exp.source = currentSequenceLen | expOffset; | |
866 exp.rulesToParseHdl = &(src->source); | |
867 | |
868 while(len > 0) { | |
869 currentSequenceLen = len; | |
870 while(currentSequenceLen > 0) { | |
871 exp.source = (currentSequenceLen << 24) | expOffset; | |
872 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != N
ULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ | |
873 uint32_t noOfCEsToCopy = expt->noOfCEs; | |
874 for(j = 0; j<noOfCEsToCopy; j++) { | |
875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; | |
876 } | |
877 tok->noOfExpCEs += noOfCEsToCopy; | |
878 // Smart people never try to add codepoints and CEs. | |
879 // For some odd reason, it won't work. | |
880 expOffset += currentSequenceLen; //noOfCEsToCopy; | |
881 len -= currentSequenceLen; //noOfCEsToCopy; | |
882 break; | |
883 } else { | |
884 currentSequenceLen--; | |
885 } | |
886 } | |
887 if(currentSequenceLen == 0) { /* couldn't find any tailored subs
equence */ | |
888 /* will have to get one from UCA */ | |
889 /* first, get the UChars from the rules */ | |
890 /* then pick CEs out until there is no more and stuff them i
nto expansion */ | |
891 collIterate s; | |
892 uint32_t order = 0; | |
893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1,
&s, status); | |
894 | |
895 for(;;) { | |
896 order = ucol_getNextCE(src->UCA, &s, status); | |
897 if(order == UCOL_NO_MORE_CES) { | |
898 break; | |
899 } | |
900 tok->expCEs[tok->noOfExpCEs++] = order; | |
901 } | |
902 expOffset++; | |
903 len--; | |
904 } | |
905 } | |
906 } else { | |
907 tok->noOfExpCEs = 0; | |
908 } | |
909 | |
910 /* set the ucaelement with obtained values */ | |
911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; | |
912 /* copy CEs */ | |
913 for(i = 0; i<tok->noOfCEs; i++) { | |
914 el.CEs[i] = tok->CEs[i]; | |
915 } | |
916 for(i = 0; i<tok->noOfExpCEs; i++) { | |
917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; | |
918 } | |
919 | |
920 /* copy UChars */ | |
921 // We kept prefix and source kind of together, as it is a kind of a cont
raction. | |
922 // However, now we have to slice the prefix off the main thing - | |
923 el.prefix = el.prefixChars; | |
924 el.cPoints = el.uchars; | |
925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust
accordingly in the | |
926 // addPrefix function in ucol_elm. The reason is that we need to add
both composed AND | |
927 // decomposed elements to the unsaf table. | |
928 el.prefixSize = tok->prefix>>24; | |
929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.
prefixSize*sizeof(UChar)); | |
930 | |
931 el.cSize = (tok->source >> 24)-(tok->prefix>>24); | |
932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24)
+ src->source, el.cSize*sizeof(UChar)); | |
933 } else { | |
934 el.prefixSize = 0; | |
935 *el.prefix = 0; | |
936 | |
937 el.cSize = (tok->source >> 24); | |
938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.
cSize*sizeof(UChar)); | |
939 } | |
940 if(src->UCA != NULL) { | |
941 for(i = 0; i<el.cSize; i++) { | |
942 if(UCOL_ISJAMO(el.cPoints[i])) { | |
943 t->image->jamoSpecial = TRUE; | |
944 } | |
945 } | |
946 if (!src->buildCCTabFlag && el.cSize > 0) { | |
947 // Check the trailing canonical combining class (tccc) of the la
st character. | |
948 const UChar *s = el.cPoints + el.cSize; | |
949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); | |
950 if ((fcd & 0xff) != 0) { | |
951 src->buildCCTabFlag = TRUE; | |
952 } | |
953 } | |
954 } | |
955 | |
956 /* and then, add it */ | |
957 #if UCOL_DEBUG==2 | |
958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); | |
959 #endif | |
960 uprv_uca_addAnElement(t, &el, status); | |
961 | |
962 #if UCOL_DEBUG_DUPLICATES | |
963 if(*status != U_ZERO_ERROR) { | |
964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoint
s[0], tok->debugSource); | |
965 *status = U_ZERO_ERROR; | |
966 } | |
967 #endif | |
968 | |
969 tok = tok->next; | |
970 } | |
971 } | |
972 | |
973 U_CDECL_BEGIN | |
974 static UBool U_CALLCONV | |
975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit,
uint32_t value) { | |
976 UErrorCode status = U_ZERO_ERROR; | |
977 tempUCATable *t = (tempUCATable *)context; | |
978 if(value == 0) { | |
979 while(start < limit) { | |
980 uint32_t CE = utrie_get32(t->mapping, start, NULL); | |
981 if(CE == UCOL_NOT_FOUND) { | |
982 UCAElements el; | |
983 el.isThai = FALSE; | |
984 el.prefixSize = 0; | |
985 el.prefixChars[0] = 0; | |
986 el.prefix = el.prefixChars; | |
987 el.cPoints = el.uchars; | |
988 | |
989 el.cSize = 0; | |
990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start); | |
991 | |
992 el.noOfCEs = 1; | |
993 el.CEs[0] = 0; | |
994 uprv_uca_addAnElement(t, &el, &status); | |
995 | |
996 } | |
997 start++; | |
998 } | |
999 } | |
1000 if(U_FAILURE(status)) { | |
1001 return FALSE; | |
1002 } else { | |
1003 return TRUE; | |
1004 } | |
1005 } | |
1006 U_CDECL_END | |
1007 | |
1008 static void | |
1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, | |
1010 UChar32 start, UChar32 end, | |
1011 UErrorCode *status) | |
1012 { | |
1013 //UChar decomp[256]; | |
1014 uint32_t CE = UCOL_NOT_FOUND; | |
1015 UChar32 u = 0; | |
1016 UCAElements el; | |
1017 el.isThai = FALSE; | |
1018 el.prefixSize = 0; | |
1019 el.prefixChars[0] = 0; | |
1020 collIterate colIt; | |
1021 | |
1022 if(U_SUCCESS(*status)) { | |
1023 for(u = start; u<=end; u++) { | |
1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND | |
1025 /* this test is for contractions that are missing the starting e
lement. */ | |
1026 || ((isCntTableElement(CE)) && | |
1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_F
OUND)) | |
1028 ) | |
1029 { | |
1030 el.cSize = 0; | |
1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); | |
1032 //decomp[0] = (UChar)u; | |
1033 //el.uchars[0] = (UChar)u; | |
1034 el.cPoints = el.uchars; | |
1035 //el.cSize = 1; | |
1036 el.noOfCEs = 0; | |
1037 el.prefix = el.prefixChars; | |
1038 el.prefixSize = 0; | |
1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); | |
1040 // We actually want to check whether this element is a special | |
1041 // If it is an implicit element (hangul, CJK - we want to copy t
he | |
1042 // special, not the resolved CEs) - for hangul, copying resolved | |
1043 // would just make things the same (there is an expansion and it | |
1044 // takes approximately the same amount of time to resolve as | |
1045 // falling back to the UCA). | |
1046 /* | |
1047 UTRIE_GET32(src->UCA->mapping, u, CE); | |
1048 tag = getCETag(CE); | |
1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG | |
1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG | |
1051 || tag == LEAD_SURROGATE_TAG) { | |
1052 el.CEs[el.noOfCEs++] = CE; | |
1053 } else { | |
1054 */ | |
1055 // It turns out that it does not make sense to keep implicits | |
1056 // unresolved. The cost of resolving them is big enough so that | |
1057 // it doesn't make any difference whether we have to go to the U
CA | |
1058 // or not. | |
1059 { | |
1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt,
status); | |
1061 while(CE != UCOL_NO_MORE_CES) { | |
1062 CE = ucol_getNextCE(src->UCA, &colIt, status); | |
1063 if(CE != UCOL_NO_MORE_CES) { | |
1064 el.CEs[el.noOfCEs++] = CE; | |
1065 } | |
1066 } | |
1067 } | |
1068 uprv_uca_addAnElement(t, &el, status); | |
1069 } | |
1070 } | |
1071 } | |
1072 } | |
1073 | |
1074 U_NAMESPACE_END | |
1075 | |
1076 U_CFUNC UCATableHeader * | |
1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { | |
1078 U_NAMESPACE_USE | |
1079 | |
1080 uint32_t i = 0; | |
1081 if(U_FAILURE(*status)) { | |
1082 return NULL; | |
1083 } | |
1084 /* | |
1085 2. Eliminate the negative lists by doing the following for each non-null ne
gative list: | |
1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, | |
1087 create new ListHeader X | |
1088 o reverse the list, add to the end of X's positive list. Reset the strengt
h of the | |
1089 first item you add, based on the stronger strength levels of the two lists. | |
1090 */ | |
1091 /* | |
1092 3. For each ListHeader with a non-null positive list: | |
1093 */ | |
1094 /* | |
1095 o Find all character strings with CEs between the baseCE and the | |
1096 next/previous CE, at the strength of the first token. Add these to the | |
1097 tailoring. | |
1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the | |
1099 tailoring has & x < z... | |
1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... | |
1101 */ | |
1102 /* It is possible that this part should be done even while constructing list
*/ | |
1103 /* The problem is that it is unknown what is going to be the strongest weigh
t */ | |
1104 /* So we might as well do it here */ | |
1105 | |
1106 /* | |
1107 o Allocate CEs for each token in the list, based on the total number N of
the | |
1108 largest level difference, and the gap G between baseCE and nextCE at that | |
1109 level. The relation * between the last item and nextCE is the same as the | |
1110 strongest strength. | |
1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) | |
1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap. | |
1113 Then fit b and c into the secondary gap between a and d, then fit q | |
1114 into the tertiary gap between b and c. | |
1115 | |
1116 o Example: baseCE << b <<< q << c * nextCE(X,2) | |
1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap. | |
1118 Then fit q into the tertiary gap between b and c. | |
1119 o When incrementing primary values, we will not cross high byte | |
1120 boundaries except where there is only a single-byte primary. That is to | |
1121 ensure that the script reordering will continue to work. | |
1122 */ | |
1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)
); | |
1124 /* test for NULL */ | |
1125 if (image == NULL) { | |
1126 *status = U_MEMORY_ALLOCATION_ERROR; | |
1127 return NULL; | |
1128 } | |
1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); | |
1130 | |
1131 for(i = 0; i<src->resultLen; i++) { | |
1132 /* now we need to generate the CEs */ | |
1133 /* We stuff the initial value in the buffers, and increase the appropria
te buffer */ | |
1134 /* According to strength
*/ | |
1135 if(U_SUCCESS(*status)) { | |
1136 if(src->lh[i].first) { // if there are any elements | |
1137 // due to the way parser works, subsequent tailorings | |
1138 // may remove all the elements from a sequence, therefore | |
1139 // leaving an empty tailoring sequence. | |
1140 ucol_initBuffers(src, &src->lh[i], status); | |
1141 } | |
1142 } | |
1143 if(U_FAILURE(*status)) { | |
1144 uprv_free(image); | |
1145 return NULL; | |
1146 } | |
1147 } | |
1148 | |
1149 if(src->varTop != NULL) { /* stuff the variable top value */ | |
1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; | |
1151 /* remove it from the list */ | |
1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ | |
1153 src->varTop->listHeader->first = src->varTop->next; | |
1154 } | |
1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ | |
1156 src->varTop->listHeader->last = src->varTop->previous; | |
1157 } | |
1158 if(src->varTop->next != NULL) { | |
1159 src->varTop->next->previous = src->varTop->previous; | |
1160 } | |
1161 if(src->varTop->previous != NULL) { | |
1162 src->varTop->previous->next = src->varTop->next; | |
1163 } | |
1164 } | |
1165 | |
1166 | |
1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOU
ND_TAG, NOT_FOUND_TAG, status); | |
1168 if(U_FAILURE(*status)) { | |
1169 uprv_free(image); | |
1170 return NULL; | |
1171 } | |
1172 | |
1173 | |
1174 /* After this, we have assigned CE values to all regular CEs */ | |
1175 /* now we will go through list once more and resolve expansions, */ | |
1176 /* make UCAElements structs and add them to table */ | |
1177 for(i = 0; i<src->resultLen; i++) { | |
1178 /* now we need to generate the CEs */ | |
1179 /* We stuff the initial value in the buffers, and increase the appropria
te buffer */ | |
1180 /* According to strength
*/ | |
1181 if(U_SUCCESS(*status)) { | |
1182 ucol_createElements(src, t, &src->lh[i], status); | |
1183 } | |
1184 } | |
1185 | |
1186 UCAElements el; | |
1187 el.isThai = FALSE; | |
1188 el.prefixSize = 0; | |
1189 el.prefixChars[0] = 0; | |
1190 | |
1191 /* add latin-1 stuff */ | |
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); | |
1193 | |
1194 /* add stuff for copying */ | |
1195 if(src->copySet != NULL) { | |
1196 int32_t i = 0; | |
1197 UnicodeSet *set = (UnicodeSet *)src->copySet; | |
1198 for(i = 0; i < set->getRangeCount(); i++) { | |
1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->g
etRangeEnd(i), status); | |
1200 } | |
1201 } | |
1202 | |
1203 if(U_SUCCESS(*status)) { | |
1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ | |
1205 | |
1206 uint32_t tailoredCE = UCOL_NOT_FOUND; | |
1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->c
ontractionUCACombos); | |
1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosW
idth; | |
1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status)
; | |
1210 // Check for null pointer | |
1211 if (ucaEl == NULL) { | |
1212 *status = U_MEMORY_ALLOCATION_ERROR; | |
1213 return NULL; | |
1214 } | |
1215 while(*conts != 0) { | |
1216 // A continuation is NUL-terminated and NUL-padded | |
1217 // except if it has the maximum length. | |
1218 int32_t contractionLength = maxUCAContractionLength; | |
1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) { | |
1220 --contractionLength; | |
1221 } | |
1222 UChar32 first; | |
1223 int32_t firstLength = 0; | |
1224 U16_NEXT(conts, firstLength, contractionLength, first); | |
1225 tailoredCE = utrie_get32(t->mapping, first, NULL); | |
1226 if(tailoredCE != UCOL_NOT_FOUND) { | |
1227 UBool needToAdd = TRUE; | |
1228 if(isCntTableElement(tailoredCE)) { | |
1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts
+firstLength, status) == TRUE) { | |
1230 needToAdd = FALSE; | |
1231 } | |
1232 } | |
1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { | |
1234 UCAElements elm; | |
1235 elm.cPoints = el.uchars; | |
1236 elm.noOfCEs = 0; | |
1237 elm.uchars[0] = *conts; | |
1238 elm.uchars[1] = 0; | |
1239 elm.cSize = 1; | |
1240 elm.prefixChars[0] = *(conts+2); | |
1241 elm.isThai = FALSE; | |
1242 elm.prefix = elm.prefixChars; | |
1243 elm.prefixSize = 1; | |
1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLoo
kup, &elm); | |
1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { | |
1246 needToAdd = TRUE; | |
1247 } | |
1248 } | |
1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first
)) { | |
1250 needToAdd = FALSE; | |
1251 } | |
1252 | |
1253 if(needToAdd == TRUE) { // we need to add if this contraction is
not tailored. | |
1254 if (*(conts+1) != 0) { // contractions | |
1255 el.prefix = el.prefixChars; | |
1256 el.prefixSize = 0; | |
1257 el.cPoints = el.uchars; | |
1258 el.noOfCEs = 0; | |
1259 u_memcpy(el.uchars, conts, contractionLength); | |
1260 el.cSize = contractionLength; | |
1261 ucol_setText(ucaEl, el.uchars, el.cSize, status); | |
1262 } | |
1263 else { // pre-context character | |
1264 UChar str[4] = { 0 }; | |
1265 int32_t len=0; | |
1266 int32_t preKeyLen=0; | |
1267 | |
1268 el.cPoints = el.uchars; | |
1269 el.noOfCEs = 0; | |
1270 el.uchars[0] = *conts; | |
1271 el.uchars[1] = 0; | |
1272 el.cSize = 1; | |
1273 el.prefixChars[0] = *(conts+2); | |
1274 el.prefix = el.prefixChars; | |
1275 el.prefixSize = 1; | |
1276 if (el.prefixChars[0]!=0) { | |
1277 // get CE of prefix character first | |
1278 str[0]=el.prefixChars[0]; | |
1279 str[1]=0; | |
1280 ucol_setText(ucaEl, str, 1, status); | |
1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaE
l, status)) | |
1282 != UCOL_NULLORDER) { | |
1283 preKeyLen++; // count number of keys for prefix
character | |
1284 } | |
1285 str[len++] = el.prefixChars[0]; | |
1286 } | |
1287 | |
1288 str[len++] = el.uchars[0]; | |
1289 str[len]=0; | |
1290 ucol_setText(ucaEl, str, len, status); | |
1291 // Skip the keys for prefix character, then copy the res
t to el. | |
1292 while ((preKeyLen-->0) && | |
1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, s
tatus)) != UCOL_NULLORDER) { | |
1294 continue; | |
1295 } | |
1296 | |
1297 } | |
1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, statu
s)) != UCOL_NULLORDER) { | |
1299 el.noOfCEs++; | |
1300 } | |
1301 uprv_uca_addAnElement(t, &el, status); | |
1302 } | |
1303 | |
1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, fi
rst)) { | |
1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); | |
1306 } | |
1307 conts+=maxUCAContractionLength; | |
1308 } | |
1309 ucol_closeElements(ucaEl); | |
1310 } | |
1311 | |
1312 // Add completely ignorable elements | |
1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); | |
1314 | |
1315 // add tailoring characters related canonical closures | |
1316 uprv_uca_canonicalClosure(t, src, NULL, status); | |
1317 | |
1318 /* still need to produce compatibility closure */ | |
1319 | |
1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status); | |
1321 | |
1322 uprv_uca_closeTempTable(t); | |
1323 uprv_free(image); | |
1324 | |
1325 return myData; | |
1326 } | |
1327 | |
1328 U_CDECL_BEGIN | |
1329 static UBool U_CALLCONV | |
1330 ucol_bld_cleanup(void) | |
1331 { | |
1332 udata_close(invUCA_DATA_MEM); | |
1333 invUCA_DATA_MEM = NULL; | |
1334 _staticInvUCA = NULL; | |
1335 gStaticInvUCAInitOnce.reset(); | |
1336 return TRUE; | |
1337 } | |
1338 U_CDECL_END | |
1339 | |
1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) { | |
1341 U_ASSERT(invUCA_DATA_MEM == NULL); | |
1342 U_ASSERT(_staticInvUCA == NULL); | |
1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); | |
1344 InverseUCATableHeader *newInvUCA = NULL; | |
1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_
DATA_NAME, isAcceptableInvUCA, NULL, &status); | |
1346 | |
1347 if(U_FAILURE(status)) { | |
1348 if (result) { | |
1349 udata_close(result); | |
1350 } | |
1351 // This is not needed, as we are talking about | |
1352 // memory we got from UData | |
1353 //uprv_free(newInvUCA); | |
1354 return; | |
1355 } | |
1356 | |
1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data
file */ | |
1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); | |
1359 UCollator *UCA = ucol_initUCA(&status); | |
1360 // UCA versions of UCA and inverse UCA should match | |
1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVe
rsionInfo)) != 0) { | |
1362 status = U_INVALID_FORMAT_ERROR; | |
1363 udata_close(result); | |
1364 return; | |
1365 } | |
1366 | |
1367 invUCA_DATA_MEM = result; | |
1368 _staticInvUCA = newInvUCA; | |
1369 } | |
1370 } | |
1371 | |
1372 | |
1373 U_CAPI const InverseUCATableHeader * U_EXPORT2 | |
1374 ucol_initInverseUCA(UErrorCode *status) | |
1375 { | |
1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); | |
1377 return _staticInvUCA; | |
1378 } | |
1379 | |
/* Names of the non-script reordering codes. These _must_ be kept in the
 * order in which they are applied as defaults, and in sync with the
 * UColReorderCode enum: ucol_findReorderingEntry() turns an index into this
 * table into a code by adding UCOL_REORDER_CODE_FIRST.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT"
};
1390 | |
1391 static void toUpper(const char* src, char* dst, uint32_t length) { | |
1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { | |
1393 *dst = uprv_toupper(*src); | |
1394 } | |
1395 *dst = '\0'; | |
1396 } | |
1397 | |
1398 U_INTERNAL int32_t U_EXPORT2 | |
1399 ucol_findReorderingEntry(const char* name) { | |
1400 char buffer[32]; | |
1401 toUpper(name, buffer, 32); | |
1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { | |
1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { | |
1404 return entry + UCOL_REORDER_CODE_FIRST; | |
1405 } | |
1406 } | |
1407 return USCRIPT_INVALID_CODE; | |
1408 } | |
1409 | |
1410 #endif /* #if !UCONFIG_NO_COLLATION */ | |
OLD | NEW |