Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: source/i18n/ucol_bld.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/ucol_bld.h ('k') | source/i18n/ucol_cnt.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "uassert.h"
33 #include "ucol_bld.h"
34 #include "ucol_elm.h"
35 #include "ucol_cnt.h"
36 #include "ucln_in.h"
37 #include "umutex.h"
38 #include "cmemory.h"
39 #include "cstring.h"
40
/* Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so the macro behaves as a single
 * primary expression in any context (e.g. after sizeof or before a postfix operator). */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42
43 static const InverseUCATableHeader* _staticInvUCA = NULL;
44 static UDataMemory* invUCA_DATA_MEM = NULL;
45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
46
47 U_CDECL_BEGIN
48 static UBool U_CALLCONV
49 isAcceptableInvUCA(void * /*context*/,
50 const char * /*type*/, const char * /*name*/,
51 const UDataInfo *pInfo)
52 {
53 /* context, type & name are intentionally not used */
54 if( pInfo->size>=20 &&
55 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
56 pInfo->charsetFamily==U_CHARSET_FAMILY &&
57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
66 )
67 {
68 // TODO: Check that the invuca data version (pInfo->dataVersion)
69 // matches the ucadata version.
70 return TRUE;
71 } else {
72 return FALSE;
73 }
74 }
75 U_CDECL_END
76
77 /*
78 * Takes two CEs (lead and continuation) and
79 * compares them as CEs should be compared:
80 * primary vs. primary, secondary vs. secondary
81 * tertiary vs. tertiary
82 */
83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
84 uint32_t s1 = source0, s2, t1 = target0, t2;
85 if(isContinuation(source1)) {
86 s2 = source1;
87 } else {
88 s2 = 0;
89 }
90 if(isContinuation(target1)) {
91 t2 = target1;
92 } else {
93 t2 = 0;
94 }
95
96 uint32_t s = 0, t = 0;
97 if(s1 == t1 && s2 == t2) {
98 return 0;
99 }
100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
102 if(s < t) {
103 return -1;
104 } else if(s > t) {
105 return 1;
106 } else {
107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
109 if(s < t) {
110 return -1;
111 } else if(s > t) {
112 return 1;
113 } else {
114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
116 if(s < t) {
117 return -1;
118 } else {
119 return 1;
120 }
121 }
122 }
123 }
124
125 static
126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t Second CE) {
127 uint32_t bottom = 0, top = src->invUCA->tableSize;
128 uint32_t i = 0;
129 uint32_t first = 0, second = 0;
130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
131 int32_t res = 0;
132
133 while(bottom < top-1) {
134 i = (top+bottom)/2;
135 first = *(CETable+3*i);
136 second = *(CETable+3*i+1);
137 res = compareCEs(first, second, CE, SecondCE);
138 if(res > 0) {
139 top = i;
140 } else if(res < 0) {
141 bottom = i;
142 } else {
143 break;
144 }
145 }
146
147 /* weiv: */
148 /* in searching for elements, I have removed the failure */
149 /* The reason for this is that the builder does not rely */
150 /* on search mechanism telling it that it didn't find an */
151 /* element. However, indirect positioning relies on being */
152 /* able to find the elements around any CE, even if it is */
153 /* not defined in the UCA. */
154 return i;
155 /*
156 if((first == CE && second == SecondCE)) {
157 return i;
158 } else {
159 return -1;
160 }
161 */
162 }
163
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
165 0xFFFF0000,
166 0xFFFFFF00,
167 0xFFFFFFFF
168 };
169
170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
171 uint32_t CE, uint32_t contCE,
172 uint32_t *nextCE, uint32_t *nextCont CE,
173 uint32_t strength)
174 {
175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
176 int32_t iCE;
177
178 iCE = ucol_inv_findCE(src, CE, contCE);
179
180 if(iCE<0) {
181 *nextCE = UCOL_NOT_FOUND;
182 return -1;
183 }
184
185 CE &= strengthMask[strength];
186 contCE &= strengthMask[strength];
187
188 *nextCE = CE;
189 *nextContCE = contCE;
190
191 while((*nextCE & strengthMask[strength]) == CE
192 && (*nextContCE & strengthMask[strength]) == contCE)
193 {
194 *nextCE = (*(CETable+3*(++iCE)));
195 *nextContCE = (*(CETable+3*(iCE)+1));
196 }
197
198 return iCE;
199 }
200
201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
202 uint32_t CE, uint32_t contCE,
203 uint32_t *prevCE, uint32_t *prevCont CE,
204 uint32_t strength)
205 {
206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
207 int32_t iCE;
208
209 iCE = ucol_inv_findCE(src, CE, contCE);
210
211 if(iCE<0) {
212 *prevCE = UCOL_NOT_FOUND;
213 return -1;
214 }
215
216 CE &= strengthMask[strength];
217 contCE &= strengthMask[strength];
218
219 *prevCE = CE;
220 *prevContCE = contCE;
221
222 while((*prevCE & strengthMask[strength]) == CE
223 && (*prevContCE & strengthMask[strength])== contCE
224 && iCE > 0) /* this condition should prevent falling off the edge of the world */
225 {
226 /* here, we end up in a singularity - zero */
227 *prevCE = (*(CETable+3*(--iCE)));
228 *prevContCE = (*(CETable+3*(iCE)+1));
229 }
230
231 return iCE;
232 }
233
234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t co ntCE,
235 uint32_t prevCE, uint32_t prevContCE)
236 {
237 if(prevCE == CE && prevContCE == contCE) {
238 return UCOL_IDENTICAL;
239 }
240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY] )
241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[U COL_PRIMARY]))
242 {
243 return UCOL_PRIMARY;
244 }
245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECOND ARY])
246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask [UCOL_SECONDARY]))
247 {
248 return UCOL_SECONDARY;
249 }
250 return UCOL_TERTIARY;
251 }
252
253
254 /*static
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256
257 uint32_t CE = lh->baseCE;
258 uint32_t SecondCE = lh->baseContCE;
259
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261 uint32_t previousCE, previousContCE;
262 int32_t iCE;
263
264 iCE = ucol_inv_findCE(src, CE, SecondCE);
265
266 if(iCE<0) {
267 return -1;
268 }
269
270 CE &= strengthMask[strength];
271 SecondCE &= strengthMask[strength];
272
273 previousCE = CE;
274 previousContCE = SecondCE;
275
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & str engthMask[strength])== SecondCE) {
277 previousCE = (*(CETable+3*(--iCE)));
278 previousContCE = (*(CETable+3*(iCE)+1));
279 }
280 lh->previousCE = previousCE;
281 lh->previousContCE = previousContCE;
282
283 return iCE;
284 }*/
285
286 static
287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uin t32_t strength) {
288 uint32_t CE = lh->baseCE;
289 uint32_t SecondCE = lh->baseContCE;
290
291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
292 uint32_t nextCE, nextContCE;
293 int32_t iCE;
294
295 iCE = ucol_inv_findCE(src, CE, SecondCE);
296
297 if(iCE<0) {
298 return -1;
299 }
300
301 CE &= strengthMask[strength];
302 SecondCE &= strengthMask[strength];
303
304 nextCE = CE;
305 nextContCE = SecondCE;
306
307 while((nextCE & strengthMask[strength]) == CE
308 && (nextContCE & strengthMask[strength]) == SecondCE)
309 {
310 nextCE = (*(CETable+3*(++iCE)));
311 nextContCE = (*(CETable+3*(iCE)+1));
312 }
313
314 lh->nextCE = nextCE;
315 lh->nextContCE = nextContCE;
316
317 return iCE;
318 }
319
320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh , UErrorCode *status) {
321 /* reset all the gaps */
322 int32_t i = 0;
323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
324 uint32_t st = 0;
325 uint32_t t1, t2;
326 int32_t pos;
327
328 UColToken *tok = lh->first;
329 uint32_t tokStrength = tok->strength;
330
331 for(i = 0; i<3; i++) {
332 lh->gapsHi[3*i] = 0;
333 lh->gapsHi[3*i+1] = 0;
334 lh->gapsHi[3*i+2] = 0;
335 lh->gapsLo[3*i] = 0;
336 lh->gapsLo[3*i+1] = 0;
337 lh->gapsLo[3*i+2] = 0;
338 lh->numStr[i] = 0;
339 lh->fStrToken[i] = NULL;
340 lh->lStrToken[i] = NULL;
341 lh->pos[i] = -1;
342 }
343
344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);
345
346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh ->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicit s - */
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT _MAX ) { /* implicits - */
348 lh->pos[0] = 0;
349 t1 = lh->baseCE;
350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(prim aryCE)+1);
356
357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER ;
359
360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
363 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
365 lh->pos[0] = 0;
366 t1 = lh->baseCE;
367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
371 t1 = lh->nextCE;
372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA SK) << 8;
375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2) ) << 16;
376 } else {
377 for(;;) {
378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength )) >= 0) {
380 lh->fStrToken[tokStrength] = tok;
381 } else { /* The CE must be implicit, since it's not in the table */
382 /* Error */
383 *status = U_INTERNAL_PROGRAM_ERROR;
384 }
385 }
386
387 while(tok != NULL && tok->strength >= tokStrength) {
388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
389 lh->lStrToken[tokStrength] = tok;
390 }
391 tok = tok->next;
392 }
393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
394 /* check if previous interval is the same and merge the interval s if it is so */
395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
397 lh->fStrToken[tokStrength+1] = NULL;
398 lh->lStrToken[tokStrength+1] = NULL;
399 lh->pos[tokStrength+1] = -1;
400 }
401 }
402 if(tok != NULL) {
403 tokStrength = tok->strength;
404 } else {
405 break;
406 }
407 }
408 for(st = 0; st < 3; st++) {
409 if((pos = lh->pos[st]) >= 0) {
410 t1 = *(CETable+3*(pos));
411 t2 = *(CETable+3*(pos)+1);
412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM ASK) >> 16;
413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO L_SECONDARYMASK) << 8;
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TE RTIARYORDER(t2)) << 16;
415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
416 //pos--;
417 //t1 = *(CETable+3*(pos));
418 //t2 = *(CETable+3*(pos)+1);
419 t1 = lh->baseCE;
420 t2 = lh->baseContCE;
421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM ASK) >> 16;
422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO L_SECONDARYMASK) << 8;
423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
424 }
425 }
426 }
427 }
428
429
/*
 * Stores in noOfBytes how many bytes of the 32-bit value, counted from the
 * most significant byte, are needed to reach its lowest non-zero byte
 * (0 for a zero value, 4 when the least significant byte is non-zero).
 *
 * Wrapped in do { } while(0) so that it acts as a single statement (safe in
 * an unbraced if/else), and the temporary has a macro-specific name so that
 * an argument called "mask" is not shadowed. value is evaluated several
 * times; do not pass expressions with side effects.
 */
#define ucol_countBytes(value, noOfBytes) \
do { \
    uint32_t ucol_countBytes_mask_ = 0xFFFFFFFF; \
    (noOfBytes) = 0; \
    while(ucol_countBytes_mask_ != 0) { \
        if(((value) & ucol_countBytes_mask_) != 0) { \
            (noOfBytes)++; \
        } \
        ucol_countBytes_mask_ >>= 8; \
    } \
} while(0)
441
442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
443 if(U_SUCCESS(*status)) {
444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
445 }
446 return g->current;
447 }
448
449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, ui nt32_t strength, UErrorCode *status) {
450 /* TODO: rename to enum names */
451 uint32_t high, low, count=1;
452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
453
454 if(strength == UCOL_SECONDARY) {
455 low = UCOL_COMMON_TOP2<<24;
456 high = 0xFFFFFFFF;
457 count = 0xFF - UCOL_COMMON_TOP2;
458 } else {
459 low = UCOL_BYTE_COMMON << 24; //0x05000000;
460 high = 0x40000000;
461 count = 0x40 - UCOL_BYTE_COMMON;
462 }
463
464 if(tok->next != NULL && tok->next->strength == strength) {
465 count = tok->next->toInsert;
466 }
467
468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
469 g->current = UCOL_BYTE_COMMON<<24;
470
471 if(g->noOfRanges == 0) {
472 *status = U_INTERNAL_PROGRAM_ERROR;
473 }
474 return g->current;
475 }
476
477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t * highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
478 uint32_t strength = tok->strength;
479 uint32_t low = lows[fStrength*3+strength];
480 uint32_t high = highs[fStrength*3+strength];
481 uint32_t maxByte = 0;
482 if(strength == UCOL_TERTIARY) {
483 maxByte = 0x3F;
484 } else if(strength == UCOL_PRIMARY) {
485 maxByte = 0xFE;
486 } else {
487 maxByte = 0xFF;
488 }
489
490 uint32_t count = tok->toInsert;
491
492 if(low >= high && strength > UCOL_PRIMARY) {
493 int32_t s = strength;
494 for(;;) {
495 s--;
496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
497 if(strength == UCOL_SECONDARY) {
498 if (low < UCOL_COMMON_TOP2<<24 ) {
499 // Override if low range is less than UCOL_COMMON_TOP2.
500 low = UCOL_COMMON_TOP2<<24;
501 }
502 high = 0xFFFFFFFF;
503 } else {
504 // Override if low range is less than UCOL_COMMON_BOT3.
505 if ( low < UCOL_COMMON_BOT3<<24 ) {
506 low = UCOL_COMMON_BOT3<<24;
507 }
508 high = 0x40000000;
509 }
510 break;
511 }
512 if(s<0) {
513 *status = U_INTERNAL_PROGRAM_ERROR;
514 return 0;
515 }
516 }
517 }
518
519 if(low < 0x02000000) {
520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521 // See http://site.icu-project.org/design/collation/bytes
522 low = 0x02000000;
523 }
524
525 if(strength == UCOL_SECONDARY) { /* similar as simple */
526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<2 4)) {
527 low = UCOL_COMMON_TOP2<<24;
528 }
529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<< 24)) {
530 high = UCOL_COMMON_TOP2<<24;
531 }
532 if(low < (UCOL_COMMON_BOT2<<24)) {
533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
535 //g->current = UCOL_COMMON_BOT2<<24;
536 return g->current;
537 }
538 }
539
540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
541 if(g->noOfRanges == 0) {
542 *status = U_INTERNAL_PROGRAM_ERROR;
543 }
544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
545 return g->current;
546 }
547
548 static
549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *res Buf, const uint32_t resLen, UErrorCode *status) {
550 uint32_t i = 0;
551 UChar c;
552
553 if(U_FAILURE(*status)) {
554 return 0;
555 }
556
557 if(sourceLen > resLen) {
558 *status = U_MEMORY_ALLOCATION_ERROR;
559 return 0;
560 }
561
562 for(i = 0; i < sourceLen; i++) {
563 c = source[i];
564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
565 switch(c - 0x3000) {
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: ca se 0x83: case 0x85: case 0x8E:
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: ca se 0xE3: case 0xE5: case 0xEE:
568 c++;
569 break;
570 case 0xF5:
571 c = 0x30AB;
572 break;
573 case 0xF6:
574 c = 0x30B1;
575 break;
576 }
577 }
578 resBuf[i] = c;
579 }
580 return sourceLen;
581 }
582
583 static
584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *res Buf, const uint32_t resLen, UErrorCode *status) {
585 uint32_t i = 0;
586 UChar c;
587
588 if(U_FAILURE(*status)) {
589 return 0;
590 }
591
592 if(sourceLen > resLen) {
593 *status = U_MEMORY_ALLOCATION_ERROR;
594 return 0;
595 }
596
597 for(i = 0; i < sourceLen; i++) {
598 c = source[i];
599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
600 switch(c - 0x3000) {
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: ca se 0x84: case 0x86: case 0x8F:
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: ca se 0xE4: case 0xE6: case 0xEF:
603 c--;
604 break;
605 case 0xAB:
606 c = 0x30F5;
607 break;
608 case 0xB1:
609 c = 0x30F6;
610 break;
611 }
612 }
613 resBuf[i] = c;
614 }
615 return sourceLen;
616 }
617
618 U_NAMESPACE_BEGIN
619
620 static
621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l en, UErrorCode *status) {
622 uint32_t i = 0;
623 UChar n[128];
624 uint32_t nLen = 0;
625 uint32_t uCount = 0, lCount = 0;
626
627 collIterate s;
628 uint32_t order = 0;
629
630 if(U_FAILURE(*status)) {
631 return UCOL_LOWER_CASE;
632 }
633
634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
635 if(U_SUCCESS(*status)) {
636 for(i = 0; i < nLen; i++) {
637 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
638 order = ucol_getNextCE(UCA, &s, status);
639 if(isContinuation(order)) {
640 *status = U_INTERNAL_PROGRAM_ERROR;
641 return UCOL_LOWER_CASE;
642 }
643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
644 uCount++;
645 } else {
646 if(u_islower(n[i])) {
647 lCount++;
648 } else if(U_SUCCESS(*status)) {
649 UChar sk[1], lk[1];
650 u_toSmallKana(&n[i], 1, sk, 1, status);
651 u_toLargeKana(&n[i], 1, lk, 1, status);
652 if(sk[0] == n[i] && lk[0] != n[i]) {
653 lCount++;
654 }
655 }
656 }
657 }
658 }
659
660 if(uCount != 0 && lCount != 0) {
661 return UCOL_MIXED_CASE;
662 } else if(uCount != 0) {
663 return UCOL_UPPER_CASE;
664 } else {
665 return UCOL_LOWER_CASE;
666 }
667 }
668
669
670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
671 /* this one makes the table and stuff */
672 uint32_t noOfBytes[3];
673 uint32_t i;
674
675 for(i = 0; i<3; i++) {
676 ucol_countBytes(CEparts[i], noOfBytes[i]);
677 }
678
679 /* Here we have to pack CEs from parts */
680
681 uint32_t CEi = 0;
682 uint32_t value = 0;
683
684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
685 if(CEi > 0) {
686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
687 } else {
688 value = 0;
689 }
690
691 if(2*CEi<noOfBytes[0]) {
692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
693 }
694 if(CEi<noOfBytes[1]) {
695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
696 }
697 if(CEi<noOfBytes[2]) {
698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
699 }
700 tok->CEs[CEi] = value;
701 CEi++;
702 }
703 if(CEi == 0) { /* totally ignorable */
704 tok->noOfCEs = 1;
705 tok->CEs[0] = 0;
706 } else { /* there is at least something */
707 tok->noOfCEs = CEi;
708 }
709
710
711 // we want to set case bits here and now, not later.
712 // Case bits handling
713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
715 int32_t cSize = (tok->source & 0xFF000000) >> 24;
716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
717
718 if(cSize > 1) {
719 // Do it manually
720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, statu s);
721 } else {
722 // Copy it from the UCA
723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
724 tok->CEs[0] |= (caseCE & 0xC0);
725 }
726 }
727
728 #if UCOL_DEBUG==2
729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes [1]), CEparts[2]>> (32-8*noOfBytes[2]));
730 for(i = 0; i<tok->noOfCEs; i++) {
731 fprintf(stderr, "%08X ", tok->CEs[i]);
732 }
733 fprintf(stderr, "\n");
734 #endif
735 }
736
737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro rCode *status) {
738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
740
741 UColToken *tok = lh->last;
742 uint32_t t[UCOL_STRENGTH_LIMIT];
743
744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
745
746 /* must initialize ranges to avoid memory check warnings */
747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
749 }
750
751 tok->toInsert = 1;
752 t[tok->strength] = 1;
753
754 while(tok->previous != NULL) {
755 if(tok->previous->strength < tok->strength) { /* going up */
756 t[tok->strength] = 0;
757 t[tok->previous->strength]++;
758 } else if(tok->previous->strength > tok->strength) { /* going down */
759 t[tok->previous->strength] = 1;
760 } else {
761 t[tok->strength]++;
762 }
763 tok=tok->previous;
764 tok->toInsert = t[tok->strength];
765 }
766
767 tok->toInsert = t[tok->strength];
768 ucol_inv_getGapPositions(src, lh, status);
769
770 #if UCOL_DEBUG
771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
772 int32_t j = 2;
773 for(j = 2; j >= 0; j--) {
774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh- >gapsLo[j*3+1], lh->gapsLo[j*3+2]);
775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh- >gapsHi[j*3+1], lh->gapsHi[j*3+2]);
776 }
777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
778
779 do {
780 fprintf(stderr,"%i", tok->strength);
781 tok = tok->next;
782 } while(tok != NULL);
783 fprintf(stderr, "\n");
784
785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
786
787 do {
788 fprintf(stderr,"%i", tok->toInsert);
789 tok = tok->next;
790 } while(tok != NULL);
791 #endif
792
793 tok = lh->first;
794 uint32_t fStrength = UCOL_IDENTICAL;
795 uint32_t initStrength = UCOL_IDENTICAL;
796
797
798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->bas eContCE & UCOL_SECONDARYMASK) << 8;
800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERT IARYORDER(lh->baseContCE)) << 16;
801
802 while (tok != NULL && U_SUCCESS(*status)) {
803 fStrength = tok->strength;
804 if(fStrength < initStrength) {
805 initStrength = fStrength;
806 if(lh->pos[fStrength] == -1) {
807 while(lh->pos[fStrength] == -1 && fStrength > 0) {
808 fStrength--;
809 }
810 if(lh->pos[fStrength] == -1) {
811 *status = U_INTERNAL_PROGRAM_ERROR;
812 return;
813 }
814 }
815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
818 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gap sLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY ], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
822 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrengt h*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDA RY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
825 } else { /* primaries */
826 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gaps Lo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);
829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
830 }
831 } else {
832 if(tok->strength == UCOL_TERTIARY) {
833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIA RY], status);
834 } else if(tok->strength == UCOL_SECONDARY) {
835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECON DARY], status);
836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
837 } else if(tok->strength == UCOL_PRIMARY) {
838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY ], status);
839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);
840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);
841 }
842 }
843 ucol_doCE(src, CEparts, tok, status);
844 tok = tok->next;
845 }
846 }
847
848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL istHeader *lh, UErrorCode *status) {
849 UCAElements el;
850 UColToken *tok = lh->first;
851 UColToken *expt = NULL;
852 uint32_t i = 0, j = 0;
853 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
854
855 while(tok != NULL && U_SUCCESS(*status)) {
856 /* first, check if there are any expansions */
857 /* if there are expansions, we need to do a little bit more processing * /
858 /* since parts of expansion can be tailored, while others are not */
859 if(tok->expansion != 0) {
860 uint32_t len = tok->expansion >> 24;
861 uint32_t currentSequenceLen = len;
862 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
863 //uint32_t exp = currentSequenceLen | expOffset;
864 UColToken exp;
865 exp.source = currentSequenceLen | expOffset;
866 exp.rulesToParseHdl = &(src->source);
867
868 while(len > 0) {
869 currentSequenceLen = len;
870 while(currentSequenceLen > 0) {
871 exp.source = (currentSequenceLen << 24) | expOffset;
872 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != N ULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
873 uint32_t noOfCEsToCopy = expt->noOfCEs;
874 for(j = 0; j<noOfCEsToCopy; j++) {
875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
876 }
877 tok->noOfExpCEs += noOfCEsToCopy;
878 // Smart people never try to add codepoints and CEs.
879 // For some odd reason, it won't work.
880 expOffset += currentSequenceLen; //noOfCEsToCopy;
881 len -= currentSequenceLen; //noOfCEsToCopy;
882 break;
883 } else {
884 currentSequenceLen--;
885 }
886 }
887 if(currentSequenceLen == 0) { /* couldn't find any tailored subs equence */
888 /* will have to get one from UCA */
889 /* first, get the UChars from the rules */
890 /* then pick CEs out until there is no more and stuff them i nto expansion */
891 collIterate s;
892 uint32_t order = 0;
893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
894
895 for(;;) {
896 order = ucol_getNextCE(src->UCA, &s, status);
897 if(order == UCOL_NO_MORE_CES) {
898 break;
899 }
900 tok->expCEs[tok->noOfExpCEs++] = order;
901 }
902 expOffset++;
903 len--;
904 }
905 }
906 } else {
907 tok->noOfExpCEs = 0;
908 }
909
910 /* set the ucaelement with obtained values */
911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
912 /* copy CEs */
913 for(i = 0; i<tok->noOfCEs; i++) {
914 el.CEs[i] = tok->CEs[i];
915 }
916 for(i = 0; i<tok->noOfExpCEs; i++) {
917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
918 }
919
920 /* copy UChars */
921 // We kept prefix and source kind of together, as it is a kind of a cont raction.
922 // However, now we have to slice the prefix off the main thing -
923 el.prefix = el.prefixChars;
924 el.cPoints = el.uchars;
925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
926 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
927 // decomposed elements to the unsaf table.
928 el.prefixSize = tok->prefix>>24;
929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el. prefixSize*sizeof(UChar));
930
931 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
933 } else {
934 el.prefixSize = 0;
935 *el.prefix = 0;
936
937 el.cSize = (tok->source >> 24);
938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el. cSize*sizeof(UChar));
939 }
940 if(src->UCA != NULL) {
941 for(i = 0; i<el.cSize; i++) {
942 if(UCOL_ISJAMO(el.cPoints[i])) {
943 t->image->jamoSpecial = TRUE;
944 }
945 }
946 if (!src->buildCCTabFlag && el.cSize > 0) {
947 // Check the trailing canonical combining class (tccc) of the la st character.
948 const UChar *s = el.cPoints + el.cSize;
949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
950 if ((fcd & 0xff) != 0) {
951 src->buildCCTabFlag = TRUE;
952 }
953 }
954 }
955
956 /* and then, add it */
957 #if UCOL_DEBUG==2
958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
959 #endif
960 uprv_uca_addAnElement(t, &el, status);
961
962 #if UCOL_DEBUG_DUPLICATES
963 if(*status != U_ZERO_ERROR) {
964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoint s[0], tok->debugSource);
965 *status = U_ZERO_ERROR;
966 }
967 #endif
968
969 tok = tok->next;
970 }
971 }
972
973 U_CDECL_BEGIN
974 static UBool U_CALLCONV
975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
976 UErrorCode status = U_ZERO_ERROR;
977 tempUCATable *t = (tempUCATable *)context;
978 if(value == 0) {
979 while(start < limit) {
980 uint32_t CE = utrie_get32(t->mapping, start, NULL);
981 if(CE == UCOL_NOT_FOUND) {
982 UCAElements el;
983 el.isThai = FALSE;
984 el.prefixSize = 0;
985 el.prefixChars[0] = 0;
986 el.prefix = el.prefixChars;
987 el.cPoints = el.uchars;
988
989 el.cSize = 0;
990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
991
992 el.noOfCEs = 1;
993 el.CEs[0] = 0;
994 uprv_uca_addAnElement(t, &el, &status);
995
996 }
997 start++;
998 }
999 }
1000 if(U_FAILURE(status)) {
1001 return FALSE;
1002 } else {
1003 return TRUE;
1004 }
1005 }
1006 U_CDECL_END
1007
1008 static void
1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1010 UChar32 start, UChar32 end,
1011 UErrorCode *status)
1012 {
1013 //UChar decomp[256];
1014 uint32_t CE = UCOL_NOT_FOUND;
1015 UChar32 u = 0;
1016 UCAElements el;
1017 el.isThai = FALSE;
1018 el.prefixSize = 0;
1019 el.prefixChars[0] = 0;
1020 collIterate colIt;
1021
1022 if(U_SUCCESS(*status)) {
1023 for(u = start; u<=end; u++) {
1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1025 /* this test is for contractions that are missing the starting e lement. */
1026 || ((isCntTableElement(CE)) &&
1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_F OUND))
1028 )
1029 {
1030 el.cSize = 0;
1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1032 //decomp[0] = (UChar)u;
1033 //el.uchars[0] = (UChar)u;
1034 el.cPoints = el.uchars;
1035 //el.cSize = 1;
1036 el.noOfCEs = 0;
1037 el.prefix = el.prefixChars;
1038 el.prefixSize = 0;
1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1040 // We actually want to check whether this element is a special
1041 // If it is an implicit element (hangul, CJK - we want to copy t he
1042 // special, not the resolved CEs) - for hangul, copying resolved
1043 // would just make things the same (there is an expansion and it
1044 // takes approximately the same amount of time to resolve as
1045 // falling back to the UCA).
1046 /*
1047 UTRIE_GET32(src->UCA->mapping, u, CE);
1048 tag = getCETag(CE);
1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1051 || tag == LEAD_SURROGATE_TAG) {
1052 el.CEs[el.noOfCEs++] = CE;
1053 } else {
1054 */
1055 // It turns out that it does not make sense to keep implicits
1056 // unresolved. The cost of resolving them is big enough so that
1057 // it doesn't make any difference whether we have to go to the U CA
1058 // or not.
1059 {
1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1061 while(CE != UCOL_NO_MORE_CES) {
1062 CE = ucol_getNextCE(src->UCA, &colIt, status);
1063 if(CE != UCOL_NO_MORE_CES) {
1064 el.CEs[el.noOfCEs++] = CE;
1065 }
1066 }
1067 }
1068 uprv_uca_addAnElement(t, &el, status);
1069 }
1070 }
1071 }
1072 }
1073
1074 U_NAMESPACE_END
1075
1076 U_CFUNC UCATableHeader *
1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1078 U_NAMESPACE_USE
1079
1080 uint32_t i = 0;
1081 if(U_FAILURE(*status)) {
1082 return NULL;
1083 }
1084 /*
1085 2. Eliminate the negative lists by doing the following for each non-null ne gative list:
1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1087 create new ListHeader X
1088 o reverse the list, add to the end of X's positive list. Reset the strengt h of the
1089 first item you add, based on the stronger strength levels of the two lists.
1090 */
1091 /*
1092 3. For each ListHeader with a non-null positive list:
1093 */
1094 /*
1095 o Find all character strings with CEs between the baseCE and the
1096 next/previous CE, at the strength of the first token. Add these to the
1097 tailoring.
1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1099 tailoring has & x < z...
1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1101 */
1102 /* It is possible that this part should be done even while constructing list */
1103 /* The problem is that it is unknown what is going to be the strongest weigh t */
1104 /* So we might as well do it here */
1105
1106 /*
1107 o Allocate CEs for each token in the list, based on the total number N of the
1108 largest level difference, and the gap G between baseCE and nextCE at that
1109 level. The relation * between the last item and nextCE is the same as the
1110 strongest strength.
1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1113 Then fit b and c into the secondary gap between a and d, then fit q
1114 into the tertiary gap between b and c.
1115
1116 o Example: baseCE << b <<< q << c * nextCE(X,2)
1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1118 Then fit q into the tertiary gap between b and c.
1119 o When incrementing primary values, we will not cross high byte
1120 boundaries except where there is only a single-byte primary. That is to
1121 ensure that the script reordering will continue to work.
1122 */
1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader) );
1124 /* test for NULL */
1125 if (image == NULL) {
1126 *status = U_MEMORY_ALLOCATION_ERROR;
1127 return NULL;
1128 }
1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1130
1131 for(i = 0; i<src->resultLen; i++) {
1132 /* now we need to generate the CEs */
1133 /* We stuff the initial value in the buffers, and increase the appropria te buffer */
1134 /* According to strength */
1135 if(U_SUCCESS(*status)) {
1136 if(src->lh[i].first) { // if there are any elements
1137 // due to the way parser works, subsequent tailorings
1138 // may remove all the elements from a sequence, therefore
1139 // leaving an empty tailoring sequence.
1140 ucol_initBuffers(src, &src->lh[i], status);
1141 }
1142 }
1143 if(U_FAILURE(*status)) {
1144 uprv_free(image);
1145 return NULL;
1146 }
1147 }
1148
1149 if(src->varTop != NULL) { /* stuff the variable top value */
1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1151 /* remove it from the list */
1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1153 src->varTop->listHeader->first = src->varTop->next;
1154 }
1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1156 src->varTop->listHeader->last = src->varTop->previous;
1157 }
1158 if(src->varTop->next != NULL) {
1159 src->varTop->next->previous = src->varTop->previous;
1160 }
1161 if(src->varTop->previous != NULL) {
1162 src->varTop->previous->next = src->varTop->next;
1163 }
1164 }
1165
1166
1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOU ND_TAG, NOT_FOUND_TAG, status);
1168 if(U_FAILURE(*status)) {
1169 uprv_free(image);
1170 return NULL;
1171 }
1172
1173
1174 /* After this, we have assigned CE values to all regular CEs */
1175 /* now we will go through list once more and resolve expansions, */
1176 /* make UCAElements structs and add them to table */
1177 for(i = 0; i<src->resultLen; i++) {
1178 /* now we need to generate the CEs */
1179 /* We stuff the initial value in the buffers, and increase the appropria te buffer */
1180 /* According to strength */
1181 if(U_SUCCESS(*status)) {
1182 ucol_createElements(src, t, &src->lh[i], status);
1183 }
1184 }
1185
1186 UCAElements el;
1187 el.isThai = FALSE;
1188 el.prefixSize = 0;
1189 el.prefixChars[0] = 0;
1190
1191 /* add latin-1 stuff */
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1193
1194 /* add stuff for copying */
1195 if(src->copySet != NULL) {
1196 int32_t i = 0;
1197 UnicodeSet *set = (UnicodeSet *)src->copySet;
1198 for(i = 0; i < set->getRangeCount(); i++) {
1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->g etRangeEnd(i), status);
1200 }
1201 }
1202
1203 if(U_SUCCESS(*status)) {
1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1205
1206 uint32_t tailoredCE = UCOL_NOT_FOUND;
1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->c ontractionUCACombos);
1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosW idth;
1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status) ;
1210 // Check for null pointer
1211 if (ucaEl == NULL) {
1212 *status = U_MEMORY_ALLOCATION_ERROR;
1213 return NULL;
1214 }
1215 while(*conts != 0) {
1216 // A continuation is NUL-terminated and NUL-padded
1217 // except if it has the maximum length.
1218 int32_t contractionLength = maxUCAContractionLength;
1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1220 --contractionLength;
1221 }
1222 UChar32 first;
1223 int32_t firstLength = 0;
1224 U16_NEXT(conts, firstLength, contractionLength, first);
1225 tailoredCE = utrie_get32(t->mapping, first, NULL);
1226 if(tailoredCE != UCOL_NOT_FOUND) {
1227 UBool needToAdd = TRUE;
1228 if(isCntTableElement(tailoredCE)) {
1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts +firstLength, status) == TRUE) {
1230 needToAdd = FALSE;
1231 }
1232 }
1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1234 UCAElements elm;
1235 elm.cPoints = el.uchars;
1236 elm.noOfCEs = 0;
1237 elm.uchars[0] = *conts;
1238 elm.uchars[1] = 0;
1239 elm.cSize = 1;
1240 elm.prefixChars[0] = *(conts+2);
1241 elm.isThai = FALSE;
1242 elm.prefix = elm.prefixChars;
1243 elm.prefixSize = 1;
1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLoo kup, &elm);
1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1246 needToAdd = TRUE;
1247 }
1248 }
1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first )) {
1250 needToAdd = FALSE;
1251 }
1252
1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1254 if (*(conts+1) != 0) { // contractions
1255 el.prefix = el.prefixChars;
1256 el.prefixSize = 0;
1257 el.cPoints = el.uchars;
1258 el.noOfCEs = 0;
1259 u_memcpy(el.uchars, conts, contractionLength);
1260 el.cSize = contractionLength;
1261 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1262 }
1263 else { // pre-context character
1264 UChar str[4] = { 0 };
1265 int32_t len=0;
1266 int32_t preKeyLen=0;
1267
1268 el.cPoints = el.uchars;
1269 el.noOfCEs = 0;
1270 el.uchars[0] = *conts;
1271 el.uchars[1] = 0;
1272 el.cSize = 1;
1273 el.prefixChars[0] = *(conts+2);
1274 el.prefix = el.prefixChars;
1275 el.prefixSize = 1;
1276 if (el.prefixChars[0]!=0) {
1277 // get CE of prefix character first
1278 str[0]=el.prefixChars[0];
1279 str[1]=0;
1280 ucol_setText(ucaEl, str, 1, status);
1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaE l, status))
1282 != UCOL_NULLORDER) {
1283 preKeyLen++; // count number of keys for prefix character
1284 }
1285 str[len++] = el.prefixChars[0];
1286 }
1287
1288 str[len++] = el.uchars[0];
1289 str[len]=0;
1290 ucol_setText(ucaEl, str, len, status);
1291 // Skip the keys for prefix character, then copy the res t to el.
1292 while ((preKeyLen-->0) &&
1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, s tatus)) != UCOL_NULLORDER) {
1294 continue;
1295 }
1296
1297 }
1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, statu s)) != UCOL_NULLORDER) {
1299 el.noOfCEs++;
1300 }
1301 uprv_uca_addAnElement(t, &el, status);
1302 }
1303
1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, fi rst)) {
1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1306 }
1307 conts+=maxUCAContractionLength;
1308 }
1309 ucol_closeElements(ucaEl);
1310 }
1311
1312 // Add completely ignorable elements
1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1314
1315 // add tailoring characters related canonical closures
1316 uprv_uca_canonicalClosure(t, src, NULL, status);
1317
1318 /* still need to produce compatibility closure */
1319
1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1321
1322 uprv_uca_closeTempTable(t);
1323 uprv_free(image);
1324
1325 return myData;
1326 }
1327
1328 U_CDECL_BEGIN
1329 static UBool U_CALLCONV
1330 ucol_bld_cleanup(void)
1331 {
1332 udata_close(invUCA_DATA_MEM);
1333 invUCA_DATA_MEM = NULL;
1334 _staticInvUCA = NULL;
1335 gStaticInvUCAInitOnce.reset();
1336 return TRUE;
1337 }
1338 U_CDECL_END
1339
1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
1341 U_ASSERT(invUCA_DATA_MEM == NULL);
1342 U_ASSERT(_staticInvUCA == NULL);
1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1344 InverseUCATableHeader *newInvUCA = NULL;
1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_ DATA_NAME, isAcceptableInvUCA, NULL, &status);
1346
1347 if(U_FAILURE(status)) {
1348 if (result) {
1349 udata_close(result);
1350 }
1351 // This is not needed, as we are talking about
1352 // memory we got from UData
1353 //uprv_free(newInvUCA);
1354 return;
1355 }
1356
1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1359 UCollator *UCA = ucol_initUCA(&status);
1360 // UCA versions of UCA and inverse UCA should match
1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVe rsionInfo)) != 0) {
1362 status = U_INVALID_FORMAT_ERROR;
1363 udata_close(result);
1364 return;
1365 }
1366
1367 invUCA_DATA_MEM = result;
1368 _staticInvUCA = newInvUCA;
1369 }
1370 }
1371
1372
1373 U_CAPI const InverseUCATableHeader * U_EXPORT2
1374 ucol_initInverseUCA(UErrorCode *status)
1375 {
1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
1377 return _staticInvUCA;
1378 }
1379
/*
 * Names of the non-script reordering codes.
 *
 * These _must_ be kept in the order in which they are to be applied as
 * defaults, and in sync with the UColReorderCode enum:
 * ucol_findReorderingEntry() maps the entry at index i to
 * UCOL_REORDER_CODE_FIRST + i.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",     /* index 0 */
    "PUNCT",     /* index 1 */
    "SYMBOL",    /* index 2 */
    "CURRENCY",  /* index 3 */
    "DIGIT"      /* index 4 */
};
1390
1391 static void toUpper(const char* src, char* dst, uint32_t length) {
1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1393 *dst = uprv_toupper(*src);
1394 }
1395 *dst = '\0';
1396 }
1397
1398 U_INTERNAL int32_t U_EXPORT2
1399 ucol_findReorderingEntry(const char* name) {
1400 char buffer[32];
1401 toUpper(name, buffer, 32);
1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1404 return entry + UCOL_REORDER_CODE_FIRST;
1405 }
1406 }
1407 return USCRIPT_INVALID_CODE;
1408 }
1409
1410 #endif /* #if !UCONFIG_NO_COLLATION */
OLDNEW
« no previous file with comments | « source/i18n/ucol_bld.h ('k') | source/i18n/ucol_cnt.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698