Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(666)

Side by Side Diff: icu46/source/i18n/ucol_bld.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/i18n/ucol_bld.h ('k') | icu46/source/i18n/ucol_cnt.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
31 #include "ucol_bld.h"
32 #include "ucol_elm.h"
33 #include "ucol_cnt.h"
34 #include "ucln_in.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "cstring.h"
38
39 U_NAMESPACE_BEGIN
40
41 static const InverseUCATableHeader* _staticInvUCA = NULL;
42 static UDataMemory* invUCA_DATA_MEM = NULL;
43
44 U_CDECL_BEGIN
45 static UBool U_CALLCONV
46 isAcceptableInvUCA(void * /*context*/,
47 const char * /*type*/, const char * /*name*/,
48 const UDataInfo *pInfo)
49 {
50 /* context, type & name are intentionally not used */
51 if( pInfo->size>=20 &&
52 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
53 pInfo->charsetFamily==U_CHARSET_FAMILY &&
54 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
55 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
56 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
57 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
58 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
59 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
60 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
61 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
62 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
63 )
64 {
65 UVersionInfo UCDVersion;
66 u_getUnicodeVersion(UCDVersion);
67 return (pInfo->dataVersion[0]==UCDVersion[0] &&
68 pInfo->dataVersion[1]==UCDVersion[1]);
69 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
70 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
71 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
72 } else {
73 return FALSE;
74 }
75 }
76 U_CDECL_END
77
78 /*
79 * Takes two CEs (lead and continuation) and
80 * compares them as CEs should be compared:
81 * primary vs. primary, secondary vs. secondary
82 * tertiary vs. tertiary
83 */
84 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
85 uint32_t s1 = source0, s2, t1 = target0, t2;
86 if(isContinuation(source1)) {
87 s2 = source1;
88 } else {
89 s2 = 0;
90 }
91 if(isContinuation(target1)) {
92 t2 = target1;
93 } else {
94 t2 = 0;
95 }
96
97 uint32_t s = 0, t = 0;
98 if(s1 == t1 && s2 == t2) {
99 return 0;
100 }
101 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
102 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
103 if(s < t) {
104 return -1;
105 } else if(s > t) {
106 return 1;
107 } else {
108 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
109 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
110 if(s < t) {
111 return -1;
112 } else if(s > t) {
113 return 1;
114 } else {
115 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
116 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
117 if(s < t) {
118 return -1;
119 } else {
120 return 1;
121 }
122 }
123 }
124 }
125
126 static
127 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t Second CE) {
128 uint32_t bottom = 0, top = src->invUCA->tableSize;
129 uint32_t i = 0;
130 uint32_t first = 0, second = 0;
131 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
132 int32_t res = 0;
133
134 while(bottom < top-1) {
135 i = (top+bottom)/2;
136 first = *(CETable+3*i);
137 second = *(CETable+3*i+1);
138 res = compareCEs(first, second, CE, SecondCE);
139 if(res > 0) {
140 top = i;
141 } else if(res < 0) {
142 bottom = i;
143 } else {
144 break;
145 }
146 }
147
148 /* weiv: */
149 /* in searching for elements, I have removed the failure */
150 /* The reason for this is that the builder does not rely */
151 /* on search mechanism telling it that it didn't find an */
152 /* element. However, indirect positioning relies on being */
153 /* able to find the elements around any CE, even if it is */
154 /* not defined in the UCA. */
155 return i;
156 /*
157 if((first == CE && second == SecondCE)) {
158 return i;
159 } else {
160 return -1;
161 }
162 */
163 }
164
165 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
166 0xFFFF0000,
167 0xFFFFFF00,
168 0xFFFFFFFF
169 };
170
171 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
172 uint32_t CE, uint32_t contCE,
173 uint32_t *nextCE, uint32_t *nextCont CE,
174 uint32_t strength)
175 {
176 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
177 int32_t iCE;
178
179 iCE = ucol_inv_findCE(src, CE, contCE);
180
181 if(iCE<0) {
182 *nextCE = UCOL_NOT_FOUND;
183 return -1;
184 }
185
186 CE &= strengthMask[strength];
187 contCE &= strengthMask[strength];
188
189 *nextCE = CE;
190 *nextContCE = contCE;
191
192 while((*nextCE & strengthMask[strength]) == CE
193 && (*nextContCE & strengthMask[strength]) == contCE)
194 {
195 *nextCE = (*(CETable+3*(++iCE)));
196 *nextContCE = (*(CETable+3*(iCE)+1));
197 }
198
199 return iCE;
200 }
201
202 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
203 uint32_t CE, uint32_t contCE,
204 uint32_t *prevCE, uint32_t *prevCont CE,
205 uint32_t strength)
206 {
207 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
208 int32_t iCE;
209
210 iCE = ucol_inv_findCE(src, CE, contCE);
211
212 if(iCE<0) {
213 *prevCE = UCOL_NOT_FOUND;
214 return -1;
215 }
216
217 CE &= strengthMask[strength];
218 contCE &= strengthMask[strength];
219
220 *prevCE = CE;
221 *prevContCE = contCE;
222
223 while((*prevCE & strengthMask[strength]) == CE
224 && (*prevContCE & strengthMask[strength])== contCE
225 && iCE > 0) /* this condition should prevent falling off the edge of the world */
226 {
227 /* here, we end up in a singularity - zero */
228 *prevCE = (*(CETable+3*(--iCE)));
229 *prevContCE = (*(CETable+3*(iCE)+1));
230 }
231
232 return iCE;
233 }
234
235 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t co ntCE,
236 uint32_t prevCE, uint32_t prevContCE)
237 {
238 if(prevCE == CE && prevContCE == contCE) {
239 return UCOL_IDENTICAL;
240 }
241 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY] )
242 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[U COL_PRIMARY]))
243 {
244 return UCOL_PRIMARY;
245 }
246 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECOND ARY])
247 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask [UCOL_SECONDARY]))
248 {
249 return UCOL_SECONDARY;
250 }
251 return UCOL_TERTIARY;
252 }
253
254
255 /*static
256 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
257
258 uint32_t CE = lh->baseCE;
259 uint32_t SecondCE = lh->baseContCE;
260
261 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
262 uint32_t previousCE, previousContCE;
263 int32_t iCE;
264
265 iCE = ucol_inv_findCE(src, CE, SecondCE);
266
267 if(iCE<0) {
268 return -1;
269 }
270
271 CE &= strengthMask[strength];
272 SecondCE &= strengthMask[strength];
273
274 previousCE = CE;
275 previousContCE = SecondCE;
276
277 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
278 previousCE = (*(CETable+3*(--iCE)));
279 previousContCE = (*(CETable+3*(iCE)+1));
280 }
281 lh->previousCE = previousCE;
282 lh->previousContCE = previousContCE;
283
284 return iCE;
285 }*/
286
287 static
288 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uin t32_t strength) {
289 uint32_t CE = lh->baseCE;
290 uint32_t SecondCE = lh->baseContCE;
291
292 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
293 uint32_t nextCE, nextContCE;
294 int32_t iCE;
295
296 iCE = ucol_inv_findCE(src, CE, SecondCE);
297
298 if(iCE<0) {
299 return -1;
300 }
301
302 CE &= strengthMask[strength];
303 SecondCE &= strengthMask[strength];
304
305 nextCE = CE;
306 nextContCE = SecondCE;
307
308 while((nextCE & strengthMask[strength]) == CE
309 && (nextContCE & strengthMask[strength]) == SecondCE)
310 {
311 nextCE = (*(CETable+3*(++iCE)));
312 nextContCE = (*(CETable+3*(iCE)+1));
313 }
314
315 lh->nextCE = nextCE;
316 lh->nextContCE = nextContCE;
317
318 return iCE;
319 }
320
321 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh , UErrorCode *status) {
322 /* reset all the gaps */
323 int32_t i = 0;
324 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
325 uint32_t st = 0;
326 uint32_t t1, t2;
327 int32_t pos;
328
329 UColToken *tok = lh->first;
330 uint32_t tokStrength = tok->strength;
331
332 for(i = 0; i<3; i++) {
333 lh->gapsHi[3*i] = 0;
334 lh->gapsHi[3*i+1] = 0;
335 lh->gapsHi[3*i+2] = 0;
336 lh->gapsLo[3*i] = 0;
337 lh->gapsLo[3*i+1] = 0;
338 lh->gapsLo[3*i+2] = 0;
339 lh->numStr[i] = 0;
340 lh->fStrToken[i] = NULL;
341 lh->lStrToken[i] = NULL;
342 lh->pos[i] = -1;
343 }
344
345 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);
346
347 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh ->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicit s - */
348 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT _MAX ) { /* implicits - */
349 lh->pos[0] = 0;
350 t1 = lh->baseCE;
351 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
352 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
353 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
354 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
355 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
356 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(prim aryCE)+1);
357
358 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
359 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER ;
360
361 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
362 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
363 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
364 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
365 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
366 lh->pos[0] = 0;
367 t1 = lh->baseCE;
368 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
369 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
370 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
371 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
372 t1 = lh->nextCE;
373 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
374 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
375 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
376 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
377 } else {
378 for(;;) {
379 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
380 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength )) >= 0) {
381 lh->fStrToken[tokStrength] = tok;
382 } else { /* The CE must be implicit, since it's not in the table */
383 /* Error */
384 *status = U_INTERNAL_PROGRAM_ERROR;
385 }
386 }
387
388 while(tok != NULL && tok->strength >= tokStrength) {
389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
390 lh->lStrToken[tokStrength] = tok;
391 }
392 tok = tok->next;
393 }
394 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
395 /* check if previous interval is the same and merge the interval s if it is so */
396 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
397 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
398 lh->fStrToken[tokStrength+1] = NULL;
399 lh->lStrToken[tokStrength+1] = NULL;
400 lh->pos[tokStrength+1] = -1;
401 }
402 }
403 if(tok != NULL) {
404 tokStrength = tok->strength;
405 } else {
406 break;
407 }
408 }
409 for(st = 0; st < 3; st++) {
410 if((pos = lh->pos[st]) >= 0) {
411 t1 = *(CETable+3*(pos));
412 t2 = *(CETable+3*(pos)+1);
413 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM ASK) >> 16;
414 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO L_SECONDARYMASK) << 8;
415 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TE RTIARYORDER(t2)) << 16;
416 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
417 //pos--;
418 //t1 = *(CETable+3*(pos));
419 //t2 = *(CETable+3*(pos)+1);
420 t1 = lh->baseCE;
421 t2 = lh->baseContCE;
422 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM ASK) >> 16;
423 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO L_SECONDARYMASK) << 8;
424 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
425 }
426 }
427 }
428 }
429
430
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of value; for a weight
 * stored left-aligned in a 32-bit word this is its length in bytes
 * (0 for value == 0, 4 when the lowest byte is non-zero).
 *
 * Written as do { } while(0) so that the expansion is a single statement,
 * and with a prefixed local name so that an argument called "mask" is not
 * captured by the macro.
 */
#define ucol_countBytes(value, noOfBytes)                           \
    do {                                                            \
        uint32_t ucol_countBytes_mask_ = 0xFFFFFFFF;                \
        (noOfBytes) = 0;                                            \
        while(ucol_countBytes_mask_ != 0) {                         \
            if(((value) & ucol_countBytes_mask_) != 0) {            \
                (noOfBytes)++;                                      \
            }                                                       \
            ucol_countBytes_mask_ >>= 8;                            \
        }                                                           \
    } while(0)
442
443 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
444 if(U_SUCCESS(*status)) {
445 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
446 }
447 return g->current;
448 }
449
450 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, ui nt32_t strength, UErrorCode *status) {
451 /* TODO: rename to enum names */
452 uint32_t high, low, count=1;
453 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
454
455 if(strength == UCOL_SECONDARY) {
456 low = UCOL_COMMON_TOP2<<24;
457 high = 0xFFFFFFFF;
458 count = 0xFF - UCOL_COMMON_TOP2;
459 } else {
460 low = UCOL_BYTE_COMMON << 24; //0x05000000;
461 high = 0x40000000;
462 count = 0x40 - UCOL_BYTE_COMMON;
463 }
464
465 if(tok->next != NULL && tok->next->strength == strength) {
466 count = tok->next->toInsert;
467 }
468
469 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
470 g->current = UCOL_BYTE_COMMON<<24;
471
472 if(g->noOfRanges == 0) {
473 *status = U_INTERNAL_PROGRAM_ERROR;
474 }
475 return g->current;
476 }
477
478 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t * highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
479 uint32_t strength = tok->strength;
480 uint32_t low = lows[fStrength*3+strength];
481 uint32_t high = highs[fStrength*3+strength];
482 uint32_t maxByte = 0;
483 if(strength == UCOL_TERTIARY) {
484 maxByte = 0x3F;
485 } else if(strength == UCOL_PRIMARY) {
486 maxByte = 0xFE;
487 } else {
488 maxByte = 0xFF;
489 }
490
491 uint32_t count = tok->toInsert;
492
493 if(low >= high && strength > UCOL_PRIMARY) {
494 int32_t s = strength;
495 for(;;) {
496 s--;
497 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
498 if(strength == UCOL_SECONDARY) {
499 if (low < UCOL_COMMON_TOP2<<24 ) {
500 // Override if low range is less than UCOL_COMMON_TOP2.
501 low = UCOL_COMMON_TOP2<<24;
502 }
503 high = 0xFFFFFFFF;
504 } else {
505 // Override if low range is less than UCOL_COMMON_BOT3.
506 if ( low < UCOL_COMMON_BOT3<<24 ) {
507 low = UCOL_COMMON_BOT3<<24;
508 }
509 high = 0x40000000;
510 }
511 break;
512 }
513 if(s<0) {
514 *status = U_INTERNAL_PROGRAM_ERROR;
515 return 0;
516 }
517 }
518 }
519
520 if(low < 0x02000000) {
521 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
522 // See http://site.icu-project.org/design/collation/bytes
523 low = 0x02000000;
524 }
525
526 if(strength == UCOL_SECONDARY) { /* similar as simple */
527 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<2 4)) {
528 low = UCOL_COMMON_TOP2<<24;
529 }
530 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<< 24)) {
531 high = UCOL_COMMON_TOP2<<24;
532 }
533 if(low < (UCOL_COMMON_BOT2<<24)) {
534 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
535 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
536 //g->current = UCOL_COMMON_BOT2<<24;
537 return g->current;
538 }
539 }
540
541 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
542 if(g->noOfRanges == 0) {
543 *status = U_INTERNAL_PROGRAM_ERROR;
544 }
545 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
546 return g->current;
547 }
548
549 static
550 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *res Buf, const uint32_t resLen, UErrorCode *status) {
551 uint32_t i = 0;
552 UChar c;
553
554 if(U_FAILURE(*status)) {
555 return 0;
556 }
557
558 if(sourceLen > resLen) {
559 *status = U_MEMORY_ALLOCATION_ERROR;
560 return 0;
561 }
562
563 for(i = 0; i < sourceLen; i++) {
564 c = source[i];
565 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
566 switch(c - 0x3000) {
567 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: ca se 0x83: case 0x85: case 0x8E:
568 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: ca se 0xE3: case 0xE5: case 0xEE:
569 c++;
570 break;
571 case 0xF5:
572 c = 0x30AB;
573 break;
574 case 0xF6:
575 c = 0x30B1;
576 break;
577 }
578 }
579 resBuf[i] = c;
580 }
581 return sourceLen;
582 }
583
584 static
585 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *res Buf, const uint32_t resLen, UErrorCode *status) {
586 uint32_t i = 0;
587 UChar c;
588
589 if(U_FAILURE(*status)) {
590 return 0;
591 }
592
593 if(sourceLen > resLen) {
594 *status = U_MEMORY_ALLOCATION_ERROR;
595 return 0;
596 }
597
598 for(i = 0; i < sourceLen; i++) {
599 c = source[i];
600 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
601 switch(c - 0x3000) {
602 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: ca se 0x84: case 0x86: case 0x8F:
603 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: ca se 0xE4: case 0xE6: case 0xEF:
604 c--;
605 break;
606 case 0xAB:
607 c = 0x30F5;
608 break;
609 case 0xB1:
610 c = 0x30F6;
611 break;
612 }
613 }
614 resBuf[i] = c;
615 }
616 return sourceLen;
617 }
618
619 static
620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l en, UErrorCode *status) {
621 uint32_t i = 0;
622 UChar n[128];
623 uint32_t nLen = 0;
624 uint32_t uCount = 0, lCount = 0;
625
626 collIterate s;
627 uint32_t order = 0;
628
629 if(U_FAILURE(*status)) {
630 return UCOL_LOWER_CASE;
631 }
632
633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
634 if(U_SUCCESS(*status)) {
635 for(i = 0; i < nLen; i++) {
636 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
637 order = ucol_getNextCE(UCA, &s, status);
638 if(isContinuation(order)) {
639 *status = U_INTERNAL_PROGRAM_ERROR;
640 return UCOL_LOWER_CASE;
641 }
642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
643 uCount++;
644 } else {
645 if(u_islower(n[i])) {
646 lCount++;
647 } else if(U_SUCCESS(*status)) {
648 UChar sk[1], lk[1];
649 u_toSmallKana(&n[i], 1, sk, 1, status);
650 u_toLargeKana(&n[i], 1, lk, 1, status);
651 if(sk[0] == n[i] && lk[0] != n[i]) {
652 lCount++;
653 }
654 }
655 }
656 }
657 }
658
659 if(uCount != 0 && lCount != 0) {
660 return UCOL_MIXED_CASE;
661 } else if(uCount != 0) {
662 return UCOL_UPPER_CASE;
663 } else {
664 return UCOL_LOWER_CASE;
665 }
666 }
667
668
669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
670 /* this one makes the table and stuff */
671 uint32_t noOfBytes[3];
672 uint32_t i;
673
674 for(i = 0; i<3; i++) {
675 ucol_countBytes(CEparts[i], noOfBytes[i]);
676 }
677
678 /* Here we have to pack CEs from parts */
679
680 uint32_t CEi = 0;
681 uint32_t value = 0;
682
683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
684 if(CEi > 0) {
685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
686 } else {
687 value = 0;
688 }
689
690 if(2*CEi<noOfBytes[0]) {
691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
692 }
693 if(CEi<noOfBytes[1]) {
694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
695 }
696 if(CEi<noOfBytes[2]) {
697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
698 }
699 tok->CEs[CEi] = value;
700 CEi++;
701 }
702 if(CEi == 0) { /* totally ignorable */
703 tok->noOfCEs = 1;
704 tok->CEs[0] = 0;
705 } else { /* there is at least something */
706 tok->noOfCEs = CEi;
707 }
708
709
710 // we want to set case bits here and now, not later.
711 // Case bits handling
712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
714 int32_t cSize = (tok->source & 0xFF000000) >> 24;
715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
716
717 if(cSize > 1) {
718 // Do it manually
719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, statu s);
720 } else {
721 // Copy it from the UCA
722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
723 tok->CEs[0] |= (caseCE & 0xC0);
724 }
725 }
726
727 #if UCOL_DEBUG==2
728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes [1]), CEparts[2]>> (32-8*noOfBytes[2]));
729 for(i = 0; i<tok->noOfCEs; i++) {
730 fprintf(stderr, "%08X ", tok->CEs[i]);
731 }
732 fprintf(stderr, "\n");
733 #endif
734 }
735
736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro rCode *status) {
737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
739
740 UColToken *tok = lh->last;
741 uint32_t t[UCOL_STRENGTH_LIMIT];
742
743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
744
745 tok->toInsert = 1;
746 t[tok->strength] = 1;
747
748 while(tok->previous != NULL) {
749 if(tok->previous->strength < tok->strength) { /* going up */
750 t[tok->strength] = 0;
751 t[tok->previous->strength]++;
752 } else if(tok->previous->strength > tok->strength) { /* going down */
753 t[tok->previous->strength] = 1;
754 } else {
755 t[tok->strength]++;
756 }
757 tok=tok->previous;
758 tok->toInsert = t[tok->strength];
759 }
760
761 tok->toInsert = t[tok->strength];
762 ucol_inv_getGapPositions(src, lh, status);
763
764 #if UCOL_DEBUG
765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
766 int32_t j = 2;
767 for(j = 2; j >= 0; j--) {
768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh- >gapsLo[j*3+1], lh->gapsLo[j*3+2]);
769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh- >gapsHi[j*3+1], lh->gapsHi[j*3+2]);
770 }
771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
772
773 do {
774 fprintf(stderr,"%i", tok->strength);
775 tok = tok->next;
776 } while(tok != NULL);
777 fprintf(stderr, "\n");
778
779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780
781 do {
782 fprintf(stderr,"%i", tok->toInsert);
783 tok = tok->next;
784 } while(tok != NULL);
785 #endif
786
787 tok = lh->first;
788 uint32_t fStrength = UCOL_IDENTICAL;
789 uint32_t initStrength = UCOL_IDENTICAL;
790
791
792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->bas eContCE & UCOL_SECONDARYMASK) << 8;
794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERT IARYORDER(lh->baseContCE)) << 16;
795
796 while (tok != NULL && U_SUCCESS(*status)) {
797 fStrength = tok->strength;
798 if(fStrength < initStrength) {
799 initStrength = fStrength;
800 if(lh->pos[fStrength] == -1) {
801 while(lh->pos[fStrength] == -1 && fStrength > 0) {
802 fStrength--;
803 }
804 if(lh->pos[fStrength] == -1) {
805 *status = U_INTERNAL_PROGRAM_ERROR;
806 return;
807 }
808 }
809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gap sLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY ], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrengt h*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDA RY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
819 } else { /* primaries */
820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gaps Lo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);
823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
824 }
825 } else {
826 if(tok->strength == UCOL_TERTIARY) {
827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIA RY], status);
828 } else if(tok->strength == UCOL_SECONDARY) {
829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECON DARY], status);
830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
831 } else if(tok->strength == UCOL_PRIMARY) {
832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY ], status);
833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);
834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
835 }
836 }
837 ucol_doCE(src, CEparts, tok, status);
838 tok = tok->next;
839 }
840 }
841
842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL istHeader *lh, UErrorCode *status) {
843 UCAElements el;
844 UColToken *tok = lh->first;
845 UColToken *expt = NULL;
846 uint32_t i = 0, j = 0;
847 UChar32 fcdHighStart;
848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
849
850 while(tok != NULL && U_SUCCESS(*status)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing * /
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok->expansion != 0) {
855 uint32_t len = tok->expansion >> 24;
856 uint32_t currentSequenceLen = len;
857 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
859 UColToken exp;
860 exp.source = currentSequenceLen | expOffset;
861 exp.rulesToParseHdl = &(src->source);
862
863 while(len > 0) {
864 currentSequenceLen = len;
865 while(currentSequenceLen > 0) {
866 exp.source = (currentSequenceLen << 24) | expOffset;
867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != N ULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy = expt->noOfCEs;
869 for(j = 0; j<noOfCEsToCopy; j++) {
870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
871 }
872 tok->noOfExpCEs += noOfCEsToCopy;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset += currentSequenceLen; //noOfCEsToCopy;
876 len -= currentSequenceLen; //noOfCEsToCopy;
877 break;
878 } else {
879 currentSequenceLen--;
880 }
881 }
882 if(currentSequenceLen == 0) { /* couldn't find any tailored subs equence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them i nto expansion */
886 collIterate s;
887 uint32_t order = 0;
888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
889
890 for(;;) {
891 order = ucol_getNextCE(src->UCA, &s, status);
892 if(order == UCOL_NO_MORE_CES) {
893 break;
894 }
895 tok->expCEs[tok->noOfExpCEs++] = order;
896 }
897 expOffset++;
898 len--;
899 }
900 }
901 } else {
902 tok->noOfExpCEs = 0;
903 }
904
905 /* set the ucaelement with obtained values */
906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
907 /* copy CEs */
908 for(i = 0; i<tok->noOfCEs; i++) {
909 el.CEs[i] = tok->CEs[i];
910 }
911 for(i = 0; i<tok->noOfExpCEs; i++) {
912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
913 }
914
915 /* copy UChars */
916 // We kept prefix and source kind of together, as it is a kind of a cont raction.
917 // However, now we have to slice the prefix off the main thing -
918 el.prefix = el.prefixChars;
919 el.cPoints = el.uchars;
920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsaf table.
923 el.prefixSize = tok->prefix>>24;
924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el. prefixSize*sizeof(UChar));
925
926 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
928 } else {
929 el.prefixSize = 0;
930 *el.prefix = 0;
931
932 el.cSize = (tok->source >> 24);
933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el. cSize*sizeof(UChar));
934 }
935 if(src->UCA != NULL) {
936 for(i = 0; i<el.cSize; i++) {
937 if(UCOL_ISJAMO(el.cPoints[i])) {
938 t->image->jamoSpecial = TRUE;
939 }
940 }
941 if (!src->buildCCTabFlag && el.cSize > 0) {
942 // Check the trailing canonical combining class (tccc) of the la st character.
943 const UChar *s = el.cPoints + el.cSize;
944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cP oints, s);
945 if ((fcd & 0xff) != 0) {
946 src->buildCCTabFlag = TRUE;
947 }
948 }
949 }
950
951 /* and then, add it */
952 #if UCOL_DEBUG==2
953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
954 #endif
955 uprv_uca_addAnElement(t, &el, status);
956
957 #if UCOL_DEBUG_DUPLICATES
958 if(*status != U_ZERO_ERROR) {
959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoint s[0], tok->debugSource);
960 *status = U_ZERO_ERROR;
961 }
962 #endif
963
964 tok = tok->next;
965 }
966 }
967
968 U_CDECL_BEGIN
969 static UBool U_CALLCONV
970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
971 UErrorCode status = U_ZERO_ERROR;
972 tempUCATable *t = (tempUCATable *)context;
973 if(value == 0) {
974 while(start < limit) {
975 uint32_t CE = utrie_get32(t->mapping, start, NULL);
976 if(CE == UCOL_NOT_FOUND) {
977 UCAElements el;
978 el.isThai = FALSE;
979 el.prefixSize = 0;
980 el.prefixChars[0] = 0;
981 el.prefix = el.prefixChars;
982 el.cPoints = el.uchars;
983
984 el.cSize = 0;
985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
986
987 el.noOfCEs = 1;
988 el.CEs[0] = 0;
989 uprv_uca_addAnElement(t, &el, &status);
990
991 }
992 start++;
993 }
994 }
995 if(U_FAILURE(status)) {
996 return FALSE;
997 } else {
998 return TRUE;
999 }
1000 }
1001 U_CDECL_END
1002
1003 static void
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005 UChar32 start, UChar32 end,
1006 UErrorCode *status)
1007 {
1008 //UChar decomp[256];
1009 uint32_t CE = UCOL_NOT_FOUND;
1010 UChar32 u = 0;
1011 UCAElements el;
1012 el.isThai = FALSE;
1013 el.prefixSize = 0;
1014 el.prefixChars[0] = 0;
1015 collIterate colIt;
1016
1017 if(U_SUCCESS(*status)) {
1018 for(u = start; u<=end; u++) {
1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting e lement. */
1021 || ((isCntTableElement(CE)) &&
1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_F OUND))
1023 )
1024 {
1025 el.cSize = 0;
1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el.cPoints = el.uchars;
1030 //el.cSize = 1;
1031 el.noOfCEs = 0;
1032 el.prefix = el.prefixChars;
1033 el.prefixSize = 0;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy t he
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1041 /*
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1043 tag = getCETag(CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1048 } else {
1049 */
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the U CA
1053 // or not.
1054 {
1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056 while(CE != UCOL_NO_MORE_CES) {
1057 CE = ucol_getNextCE(src->UCA, &colIt, status);
1058 if(CE != UCOL_NO_MORE_CES) {
1059 el.CEs[el.noOfCEs++] = CE;
1060 }
1061 }
1062 }
1063 uprv_uca_addAnElement(t, &el, status);
1064 }
1065 }
1066 }
1067 }
1068
1069 U_CFUNC UCATableHeader *
1070 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1071 U_NAMESPACE_USE
1072
1073 uint32_t i = 0;
1074 if(U_FAILURE(*status)) {
1075 return NULL;
1076 }
1077 /*
1078 2. Eliminate the negative lists by doing the following for each non-null ne gative list:
1079 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1080 create new ListHeader X
1081 o reverse the list, add to the end of X's positive list. Reset the strengt h of the
1082 first item you add, based on the stronger strength levels of the two lists.
1083 */
1084 /*
1085 3. For each ListHeader with a non-null positive list:
1086 */
1087 /*
1088 o Find all character strings with CEs between the baseCE and the
1089 next/previous CE, at the strength of the first token. Add these to the
1090 tailoring.
1091 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1092 tailoring has & x < z...
1093 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1094 */
1095 /* It is possible that this part should be done even while constructing list */
1096 /* The problem is that it is unknown what is going to be the strongest weigh t */
1097 /* So we might as well do it here */
1098
1099 /*
1100 o Allocate CEs for each token in the list, based on the total number N of the
1101 largest level difference, and the gap G between baseCE and nextCE at that
1102 level. The relation * between the last item and nextCE is the same as the
1103 strongest strength.
1104 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1105 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1106 Then fit b and c into the secondary gap between a and d, then fit q
1107 into the tertiary gap between b and c.
1108
1109 o Example: baseCE << b <<< q << c * nextCE(X,2)
1110 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1111 Then fit q into the tertiary gap between b and c.
1112 o When incrementing primary values, we will not cross high byte
1113 boundaries except where there is only a single-byte primary. That is to
1114 ensure that the script reordering will continue to work.
1115 */
1116 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader) );
1117 /* test for NULL */
1118 if (image == NULL) {
1119 *status = U_MEMORY_ALLOCATION_ERROR;
1120 return NULL;
1121 }
1122 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1123
1124 for(i = 0; i<src->resultLen; i++) {
1125 /* now we need to generate the CEs */
1126 /* We stuff the initial value in the buffers, and increase the appropria te buffer */
1127 /* According to strength */
1128 if(U_SUCCESS(*status)) {
1129 if(src->lh[i].first) { // if there are any elements
1130 // due to the way parser works, subsequent tailorings
1131 // may remove all the elements from a sequence, therefore
1132 // leaving an empty tailoring sequence.
1133 ucol_initBuffers(src, &src->lh[i], status);
1134 }
1135 }
1136 if(U_FAILURE(*status)) {
1137 uprv_free(image);
1138 return NULL;
1139 }
1140 }
1141
1142 if(src->varTop != NULL) { /* stuff the variable top value */
1143 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1144 /* remove it from the list */
1145 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1146 src->varTop->listHeader->first = src->varTop->next;
1147 }
1148 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1149 src->varTop->listHeader->last = src->varTop->previous;
1150 }
1151 if(src->varTop->next != NULL) {
1152 src->varTop->next->previous = src->varTop->previous;
1153 }
1154 if(src->varTop->previous != NULL) {
1155 src->varTop->previous->next = src->varTop->next;
1156 }
1157 }
1158
1159
1160 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOU ND_TAG, NOT_FOUND_TAG, status);
1161 if(U_FAILURE(*status)) {
1162 uprv_free(image);
1163 return NULL;
1164 }
1165
1166
1167 /* After this, we have assigned CE values to all regular CEs */
1168 /* now we will go through list once more and resolve expansions, */
1169 /* make UCAElements structs and add them to table */
1170 for(i = 0; i<src->resultLen; i++) {
1171 /* now we need to generate the CEs */
1172 /* We stuff the initial value in the buffers, and increase the appropria te buffer */
1173 /* According to strength */
1174 if(U_SUCCESS(*status)) {
1175 ucol_createElements(src, t, &src->lh[i], status);
1176 }
1177 }
1178
1179 UCAElements el;
1180 el.isThai = FALSE;
1181 el.prefixSize = 0;
1182 el.prefixChars[0] = 0;
1183
1184 /* add latin-1 stuff */
1185 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1186
1187 /* add stuff for copying */
1188 if(src->copySet != NULL) {
1189 int32_t i = 0;
1190 UnicodeSet *set = (UnicodeSet *)src->copySet;
1191 for(i = 0; i < set->getRangeCount(); i++) {
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->g etRangeEnd(i), status);
1193 }
1194 }
1195
1196 if(U_SUCCESS(*status)) {
1197 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1198
1199 uint32_t tailoredCE = UCOL_NOT_FOUND;
1200 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image- >UCAConsts+sizeof(UCAConstants));
1201 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->c ontractionUCACombos);
1202 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status) ;
1203 // Check for null pointer
1204 if (ucaEl == NULL) {
1205 *status = U_MEMORY_ALLOCATION_ERROR;
1206 return NULL;
1207 }
1208 while(*conts != 0) {
1209 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1210 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1211 if(tailoredCE != UCOL_NOT_FOUND) {
1212 UBool needToAdd = TRUE;
1213 if(isCntTableElement(tailoredCE)) {
1214 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts +1, status) == TRUE) {
1215 needToAdd = FALSE;
1216 }
1217 }
1218 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1219 UCAElements elm;
1220 elm.cPoints = el.uchars;
1221 elm.noOfCEs = 0;
1222 elm.uchars[0] = *conts;
1223 elm.uchars[1] = 0;
1224 elm.cSize = 1;
1225 elm.prefixChars[0] = *(conts+2);
1226 elm.isThai = FALSE;
1227 elm.prefix = elm.prefixChars;
1228 elm.prefixSize = 1;
1229 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLoo kup, &elm);
1230 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1231 needToAdd = TRUE;
1232 }
1233 }
1234 if(src->removeSet != NULL && uset_contains(src->removeSet, *cont s)) {
1235 needToAdd = FALSE;
1236 }
1237
1238 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1239 if (*(conts+1) != 0) { // contractions
1240 el.prefix = el.prefixChars;
1241 el.prefixSize = 0;
1242 el.cPoints = el.uchars;
1243 el.noOfCEs = 0;
1244 el.uchars[0] = *conts;
1245 el.uchars[1] = *(conts+1);
1246 if(*(conts+2)!=0) {
1247 el.uchars[2] = *(conts+2);
1248 el.cSize = 3;
1249 } else {
1250 el.cSize = 2;
1251 }
1252 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1253 }
1254 else { // pre-context character
1255 UChar str[4] = { 0 };
1256 int32_t len=0;
1257 int32_t preKeyLen=0;
1258
1259 el.cPoints = el.uchars;
1260 el.noOfCEs = 0;
1261 el.uchars[0] = *conts;
1262 el.uchars[1] = 0;
1263 el.cSize = 1;
1264 el.prefixChars[0] = *(conts+2);
1265 el.prefix = el.prefixChars;
1266 el.prefixSize = 1;
1267 if (el.prefixChars[0]!=0) {
1268 // get CE of prefix character first
1269 str[0]=el.prefixChars[0];
1270 str[1]=0;
1271 ucol_setText(ucaEl, str, 1, status);
1272 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaE l, status))
1273 != UCOL_NULLORDER) {
1274 preKeyLen++; // count number of keys for prefix character
1275 }
1276 str[len++] = el.prefixChars[0];
1277 }
1278
1279 str[len++] = el.uchars[0];
1280 str[len]=0;
1281 ucol_setText(ucaEl, str, len, status);
1282 // Skip the keys for prefix character, then copy the res t to el.
1283 while ((preKeyLen-->0) &&
1284 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, s tatus)) != UCOL_NULLORDER) {
1285 continue;
1286 }
1287
1288 }
1289 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, statu s)) != UCOL_NULLORDER) {
1290 el.noOfCEs++;
1291 }
1292 uprv_uca_addAnElement(t, &el, status);
1293 }
1294
1295 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *c onts)) {
1296 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1297 }
1298 conts+=3;
1299 }
1300 ucol_closeElements(ucaEl);
1301 }
1302
1303 // Add completely ignorable elements
1304 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1305
1306 // add tailoring characters related canonical closures
1307 uprv_uca_canonicalClosure(t, src, NULL, status);
1308
1309 /* still need to produce compatibility closure */
1310
1311 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1312
1313 uprv_uca_closeTempTable(t);
1314 uprv_free(image);
1315
1316 return myData;
1317 }
1318
1319 U_CDECL_BEGIN
1320 static UBool U_CALLCONV
1321 ucol_bld_cleanup(void)
1322 {
1323 udata_close(invUCA_DATA_MEM);
1324 invUCA_DATA_MEM = NULL;
1325 _staticInvUCA = NULL;
1326 return TRUE;
1327 }
1328 U_CDECL_END
1329
1330 U_CAPI const InverseUCATableHeader * U_EXPORT2
1331 ucol_initInverseUCA(UErrorCode *status)
1332 {
1333 if(U_FAILURE(*status)) return NULL;
1334
1335 UBool needsInit;
1336 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1337
1338 if(needsInit) {
1339 InverseUCATableHeader *newInvUCA = NULL;
1340 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, I NVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1341
1342 if(U_FAILURE(*status)) {
1343 if (result) {
1344 udata_close(result);
1345 }
1346 // This is not needed, as we are talking about
1347 // memory we got from UData
1348 //uprv_free(newInvUCA);
1349 }
1350
1351 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1352 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1353 UCollator *UCA = ucol_initUCA(status);
1354 // UCA versions of UCA and inverse UCA should match
1355 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof (UVersionInfo)) != 0) {
1356 *status = U_INVALID_FORMAT_ERROR;
1357 udata_close(result);
1358 return NULL;
1359 }
1360
1361 umtx_lock(NULL);
1362 if(_staticInvUCA == NULL) {
1363 invUCA_DATA_MEM = result;
1364 _staticInvUCA = newInvUCA;
1365 result = NULL;
1366 newInvUCA = NULL;
1367 }
1368 umtx_unlock(NULL);
1369
1370 if(newInvUCA != NULL) {
1371 udata_close(result);
1372 // This is not needed, as we are talking about
1373 // memory we got from UData
1374 //uprv_free(newInvUCA);
1375 }
1376 else {
1377 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1378 }
1379 }
1380 }
1381 return _staticInvUCA;
1382 }
1383
/* This is the data that is used for non-script reordering codes. These _must_ be kept
 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
 * The list is terminated by a NULL entry. Both the strings and the table
 * itself are read-only.
 */
static const char* const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT",
    NULL
};
1395
/*
 * Copies src into dst converted to upper case with toupper(), writing at
 * most length bytes including the terminating NUL; longer input is
 * truncated.
 *
 * src     NUL-terminated input; NULL is treated as an empty string
 * dst     destination buffer with room for at least length bytes
 * length  capacity of dst in bytes; when 0 nothing is written at all
 */
static void toUpper(const char* src, char* dst, uint32_t length) {
    if (length == 0) {
        /* "length - 1" would wrap around to a huge unsigned bound */
        return;
    }
    uint32_t i = 0;
    if (src != NULL) {
        for (; src[i] != '\0' && i + 1 < length; ++i) {
            /* toupper() is only defined for unsigned char values and EOF */
            dst[i] = (char)toupper((unsigned char)src[i]);
        }
    }
    dst[i] = '\0';
}
1402
1403 U_INTERNAL int32_t U_EXPORT2
1404 ucol_findReorderingEntry(const char* name) {
1405 char buffer[32];
1406 toUpper(name, buffer, 32);
1407 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1408 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1409 return entry + UCOL_REORDER_CODE_FIRST;
1410 }
1411 }
1412 return USCRIPT_INVALID_CODE;
1413 }
1414
1415 U_NAMESPACE_END
1416
1417 #endif /* #if !UCONFIG_NO_COLLATION */
OLDNEW
« no previous file with comments | « icu46/source/i18n/ucol_bld.h ('k') | icu46/source/i18n/ucol_cnt.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698