OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2001-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: ucol_bld.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created 02/22/2001 |
| 14 * created by: Vladimir Weinstein |
| 15 * |
| 16 * This module builds a collator based on the rule set. |
| 17 * |
| 18 */ |
| 19 |
| 20 #include "unicode/utypes.h" |
| 21 |
| 22 #if !UCONFIG_NO_COLLATION |
| 23 |
| 24 #include "unicode/ucoleitr.h" |
| 25 #include "unicode/udata.h" |
| 26 #include "unicode/uchar.h" |
| 27 #include "unicode/uniset.h" |
| 28 #include "unicode/uscript.h" |
| 29 #include "unicode/ustring.h" |
| 30 #include "normalizer2impl.h" |
| 31 #include "ucol_bld.h" |
| 32 #include "ucol_elm.h" |
| 33 #include "ucol_cnt.h" |
| 34 #include "ucln_in.h" |
| 35 #include "umutex.h" |
| 36 #include "cmemory.h" |
| 37 #include "cstring.h" |
| 38 |
| 39 U_NAMESPACE_BEGIN |
| 40 |
/* File-scope pointers, both initialised to NULL: an inverse UCA table header and a UDataMemory handle. */
/* NOTE(review): presumably a cache for the loaded inverse UCA data; the code that assigns them is not visible in this chunk - confirm. */
| 41 static const InverseUCATableHeader* _staticInvUCA = NULL; |
| 42 static UDataMemory* invUCA_DATA_MEM = NULL; |
| 43 |
| 44 U_CDECL_BEGIN |
| 45 static UBool U_CALLCONV |
| 46 isAcceptableInvUCA(void * /*context*/, |
| 47 const char * /*type*/, const char * /*name*/, |
| 48 const UDataInfo *pInfo) |
| 49 { |
| 50 /* context, type & name are intentionally not used */ |
| 51 if( pInfo->size>=20 && |
| 52 pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
| 53 pInfo->charsetFamily==U_CHARSET_FAMILY && |
| 54 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ |
| 55 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && |
| 56 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && |
| 57 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && |
| 58 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && |
| 59 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& |
| 60 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && |
| 61 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && |
| 62 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && |
| 63 ) |
| 64 { |
| 65 UVersionInfo UCDVersion; |
| 66 u_getUnicodeVersion(UCDVersion); |
| 67 return (pInfo->dataVersion[0]==UCDVersion[0] && |
| 68 pInfo->dataVersion[1]==UCDVersion[1]); |
| 69 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && |
| 70 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && |
| 71 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { |
| 72 } else { |
| 73 return FALSE; |
| 74 } |
| 75 } |
| 76 U_CDECL_END |
| 77 |
| 78 /* |
| 79 * Takes two CEs (lead and continuation) and |
| 80 * compares them as CEs should be compared: |
| 81 * primary vs. primary, secondary vs. secondary |
| 82 * tertiary vs. tertiary |
| 83 */ |
| 84 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0,
uint32_t target1) { |
| 85 uint32_t s1 = source0, s2, t1 = target0, t2; |
| 86 if(isContinuation(source1)) { |
| 87 s2 = source1; |
| 88 } else { |
| 89 s2 = 0; |
| 90 } |
| 91 if(isContinuation(target1)) { |
| 92 t2 = target1; |
| 93 } else { |
| 94 t2 = 0; |
| 95 } |
| 96 |
| 97 uint32_t s = 0, t = 0; |
| 98 if(s1 == t1 && s2 == t2) { |
| 99 return 0; |
| 100 } |
| 101 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); |
| 102 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); |
| 103 if(s < t) { |
| 104 return -1; |
| 105 } else if(s > t) { |
| 106 return 1; |
| 107 } else { |
| 108 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; |
| 109 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; |
| 110 if(s < t) { |
| 111 return -1; |
| 112 } else if(s > t) { |
| 113 return 1; |
| 114 } else { |
| 115 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); |
| 116 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); |
| 117 if(s < t) { |
| 118 return -1; |
| 119 } else { |
| 120 return 1; |
| 121 } |
| 122 } |
| 123 } |
| 124 } |
| 125 |
| 126 static |
| 127 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t Second
CE) { |
| 128 uint32_t bottom = 0, top = src->invUCA->tableSize; |
| 129 uint32_t i = 0; |
| 130 uint32_t first = 0, second = 0; |
| 131 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 132 int32_t res = 0; |
| 133 |
| 134 while(bottom < top-1) { |
| 135 i = (top+bottom)/2; |
| 136 first = *(CETable+3*i); |
| 137 second = *(CETable+3*i+1); |
| 138 res = compareCEs(first, second, CE, SecondCE); |
| 139 if(res > 0) { |
| 140 top = i; |
| 141 } else if(res < 0) { |
| 142 bottom = i; |
| 143 } else { |
| 144 break; |
| 145 } |
| 146 } |
| 147 |
| 148 /* weiv: */ |
| 149 /* in searching for elements, I have removed the failure */ |
| 150 /* The reason for this is that the builder does not rely */ |
| 151 /* on search mechanism telling it that it didn't find an */ |
| 152 /* element. However, indirect positioning relies on being */ |
| 153 /* able to find the elements around any CE, even if it is */ |
| 154 /* not defined in the UCA. */ |
| 155 return i; |
| 156 /* |
| 157 if((first == CE && second == SecondCE)) { |
| 158 return i; |
| 159 } else { |
| 160 return -1; |
| 161 } |
| 162 */ |
| 163 } |
| 164 |
/* Bit masks applied to CE words, indexed by strength value. */
/* The three entries keep the top 16 bits, the top 24 bits and all 32 bits respectively; */
/* the visible code indexes the table with UCOL_PRIMARY, UCOL_SECONDARY and caller-supplied strengths. */
| 165 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { |
| 166 0xFFFF0000, |
| 167 0xFFFFFF00, |
| 168 0xFFFFFFFF |
| 169 }; |
| 170 |
| 171 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, |
| 172 uint32_t CE, uint32_t contCE, |
| 173 uint32_t *nextCE, uint32_t *nextCont
CE, |
| 174 uint32_t strength) |
| 175 { |
| 176 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 177 int32_t iCE; |
| 178 |
| 179 iCE = ucol_inv_findCE(src, CE, contCE); |
| 180 |
| 181 if(iCE<0) { |
| 182 *nextCE = UCOL_NOT_FOUND; |
| 183 return -1; |
| 184 } |
| 185 |
| 186 CE &= strengthMask[strength]; |
| 187 contCE &= strengthMask[strength]; |
| 188 |
| 189 *nextCE = CE; |
| 190 *nextContCE = contCE; |
| 191 |
| 192 while((*nextCE & strengthMask[strength]) == CE |
| 193 && (*nextContCE & strengthMask[strength]) == contCE) |
| 194 { |
| 195 *nextCE = (*(CETable+3*(++iCE))); |
| 196 *nextContCE = (*(CETable+3*(iCE)+1)); |
| 197 } |
| 198 |
| 199 return iCE; |
| 200 } |
| 201 |
| 202 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, |
| 203 uint32_t CE, uint32_t contCE, |
| 204 uint32_t *prevCE, uint32_t *prevCont
CE, |
| 205 uint32_t strength) |
| 206 { |
| 207 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 208 int32_t iCE; |
| 209 |
| 210 iCE = ucol_inv_findCE(src, CE, contCE); |
| 211 |
| 212 if(iCE<0) { |
| 213 *prevCE = UCOL_NOT_FOUND; |
| 214 return -1; |
| 215 } |
| 216 |
| 217 CE &= strengthMask[strength]; |
| 218 contCE &= strengthMask[strength]; |
| 219 |
| 220 *prevCE = CE; |
| 221 *prevContCE = contCE; |
| 222 |
| 223 while((*prevCE & strengthMask[strength]) == CE |
| 224 && (*prevContCE & strengthMask[strength])== contCE |
| 225 && iCE > 0) /* this condition should prevent falling off the edge of the
world */ |
| 226 { |
| 227 /* here, we end up in a singularity - zero */ |
| 228 *prevCE = (*(CETable+3*(--iCE))); |
| 229 *prevContCE = (*(CETable+3*(iCE)+1)); |
| 230 } |
| 231 |
| 232 return iCE; |
| 233 } |
| 234 |
| 235 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t co
ntCE, |
| 236 uint32_t prevCE, uint32_t
prevContCE) |
| 237 { |
| 238 if(prevCE == CE && prevContCE == contCE) { |
| 239 return UCOL_IDENTICAL; |
| 240 } |
| 241 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]
) |
| 242 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[U
COL_PRIMARY])) |
| 243 { |
| 244 return UCOL_PRIMARY; |
| 245 } |
| 246 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECOND
ARY]) |
| 247 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask
[UCOL_SECONDARY])) |
| 248 { |
| 249 return UCOL_SECONDARY; |
| 250 } |
| 251 return UCOL_TERTIARY; |
| 252 } |
| 253 |
| 254 |
| 255 /*static |
| 256 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh,
uint32_t strength) { |
| 257 |
| 258 uint32_t CE = lh->baseCE; |
| 259 uint32_t SecondCE = lh->baseContCE; |
| 260 |
| 261 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 262 uint32_t previousCE, previousContCE; |
| 263 int32_t iCE; |
| 264 |
| 265 iCE = ucol_inv_findCE(src, CE, SecondCE); |
| 266 |
| 267 if(iCE<0) { |
| 268 return -1; |
| 269 } |
| 270 |
| 271 CE &= strengthMask[strength]; |
| 272 SecondCE &= strengthMask[strength]; |
| 273 |
| 274 previousCE = CE; |
| 275 previousContCE = SecondCE; |
| 276 |
| 277 while((previousCE & strengthMask[strength]) == CE && (previousContCE & str
engthMask[strength])== SecondCE) { |
| 278 previousCE = (*(CETable+3*(--iCE))); |
| 279 previousContCE = (*(CETable+3*(iCE)+1)); |
| 280 } |
| 281 lh->previousCE = previousCE; |
| 282 lh->previousContCE = previousContCE; |
| 283 |
| 284 return iCE; |
| 285 }*/ |
| 286 |
| 287 static |
| 288 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uin
t32_t strength) { |
| 289 uint32_t CE = lh->baseCE; |
| 290 uint32_t SecondCE = lh->baseContCE; |
| 291 |
| 292 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 293 uint32_t nextCE, nextContCE; |
| 294 int32_t iCE; |
| 295 |
| 296 iCE = ucol_inv_findCE(src, CE, SecondCE); |
| 297 |
| 298 if(iCE<0) { |
| 299 return -1; |
| 300 } |
| 301 |
| 302 CE &= strengthMask[strength]; |
| 303 SecondCE &= strengthMask[strength]; |
| 304 |
| 305 nextCE = CE; |
| 306 nextContCE = SecondCE; |
| 307 |
| 308 while((nextCE & strengthMask[strength]) == CE |
| 309 && (nextContCE & strengthMask[strength]) == SecondCE) |
| 310 { |
| 311 nextCE = (*(CETable+3*(++iCE))); |
| 312 nextContCE = (*(CETable+3*(iCE)+1)); |
| 313 } |
| 314 |
| 315 lh->nextCE = nextCE; |
| 316 lh->nextContCE = nextContCE; |
| 317 |
| 318 return iCE; |
| 319 } |
| 320 |
/* Fills in the gap boundaries (lh->gapsLo / lh->gapsHi) and the per-strength bookkeeping */
/* (lh->pos, lh->fStrToken, lh->lStrToken, lh->numStr) for one token list. */
/* Three cases are handled: a base CE whose top byte lies in the implicit primary range, */
/* an indirect reset with a non-zero lh->nextCE, and the general case that looks up the */
/* following entry in the inverse UCA table for each strength that occurs in the list. */
/* Sets *status to U_INTERNAL_PROGRAM_ERROR if ucol_inv_getNext() reports a negative position. */
| 321 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh
, UErrorCode *status) { |
| 322 /* reset all the gaps */ |
| 323 int32_t i = 0; |
| 324 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 325 uint32_t st = 0; |
| 326 uint32_t t1, t2; |
| 327 int32_t pos; |
| 328 |
| 329 UColToken *tok = lh->first; |
| 330 uint32_t tokStrength = tok->strength; |
| 331 |
/* Each strength owns three consecutive slots in gapsHi/gapsLo, stored as primary, secondary, tertiary parts. */
| 332 for(i = 0; i<3; i++) { |
| 333 lh->gapsHi[3*i] = 0; |
| 334 lh->gapsHi[3*i+1] = 0; |
| 335 lh->gapsHi[3*i+2] = 0; |
| 336 lh->gapsLo[3*i] = 0; |
| 337 lh->gapsLo[3*i+1] = 0; |
| 338 lh->gapsLo[3*i+2] = 0; |
| 339 lh->numStr[i] = 0; |
| 340 lh->fStrToken[i] = NULL; |
| 341 lh->lStrToken[i] = NULL; |
| 342 lh->pos[i] = -1; |
| 343 } |
| 344 |
/* The UCA constants block is located at offset UCAConsts inside the UCA image. */
| 345 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UC
A->image->UCAConsts); |
| 346 |
/* Case 1: the lead byte of the base CE is within [UCA_PRIMARY_IMPLICIT_MIN, UCA_PRIMARY_IMPLICIT_MAX]. */
/* The low bound is the base CE itself; the high bound is built from the implicit primary */
/* that follows it (raw value + 1), combined with 0x0505 in the low 16 bits of t1. */
| 347 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh
->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicit
s - */ |
| 348 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT
_MAX ) { /* implicits - */ |
| 349 lh->pos[0] = 0; |
| 350 t1 = lh->baseCE; |
| 351 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; |
| 352 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
| 353 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; |
| 354 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; |
| 355 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK)
>> 16); |
| 356 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(prim
aryCE)+1); |
| 357 |
| 358 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
| 359 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER
; |
| 360 |
| 361 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
| 362 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; |
| 363 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; |
/* Case 2: indirect reset with a known next CE - the gap runs from the base CE to lh->nextCE. */
| 364 } else if(lh->indirect == TRUE && lh->nextCE != 0) { |
| 365 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { |
| 366 lh->pos[0] = 0; |
| 367 t1 = lh->baseCE; |
| 368 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; |
| 369 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
| 370 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; |
| 371 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; |
| 372 t1 = lh->nextCE; |
| 373 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; |
| 374 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
| 375 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMA
SK) << 8; |
| 376 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)
) << 16; |
/* Case 3: walk the token list. For every run that starts at strength tokStrength, record the */
/* inverse-table position of the following entry and the first and last token of the run. */
| 377 } else { |
| 378 for(;;) { |
| 379 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
| 380 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength
)) >= 0) { |
| 381 lh->fStrToken[tokStrength] = tok; |
| 382 } else { /* The CE must be implicit, since it's not in the table
*/ |
| 383 /* Error */ |
| 384 *status = U_INTERNAL_PROGRAM_ERROR; |
| 385 } |
| 386 } |
| 387 |
/* Skip over all tokens whose strength value is not smaller than the current one. */
| 388 while(tok != NULL && tok->strength >= tokStrength) { |
| 389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
| 390 lh->lStrToken[tokStrength] = tok; |
| 391 } |
| 392 tok = tok->next; |
| 393 } |
| 394 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { |
| 395 /* check if previous interval is the same and merge the interval
s if it is so */ |
| 396 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { |
| 397 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; |
| 398 lh->fStrToken[tokStrength+1] = NULL; |
| 399 lh->lStrToken[tokStrength+1] = NULL; |
| 400 lh->pos[tokStrength+1] = -1; |
| 401 } |
| 402 } |
| 403 if(tok != NULL) { |
| 404 tokStrength = tok->strength; |
| 405 } else { |
| 406 break; |
| 407 } |
| 408 } |
/* For every strength with a recorded position: the high bound comes from the table entry at */
/* that position, the low bound from the base CE. Tertiary parts keep only the low 6 bits (0x3f). */
| 409 for(st = 0; st < 3; st++) { |
| 410 if((pos = lh->pos[st]) >= 0) { |
| 411 t1 = *(CETable+3*(pos)); |
| 412 t2 = *(CETable+3*(pos)+1); |
| 413 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM
ASK) >> 16; |
| 414 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO
L_SECONDARYMASK) << 8; |
| 415 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TE
RTIARYORDER(t2)) << 16; |
| 416 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
| 417 //pos--; |
| 418 //t1 = *(CETable+3*(pos)); |
| 419 //t2 = *(CETable+3*(pos)+1); |
| 420 t1 = lh->baseCE; |
| 421 t2 = lh->baseContCE; |
| 422 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYM
ASK) >> 16; |
| 423 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCO
L_SECONDARYMASK) << 8; |
| 424 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
| 425 } |
| 426 } |
| 427 } |
| 428 } |
| 429 |
| 430 |
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of value. For a weight
 * kept left-justified in a 32-bit word this is its length in bytes, e.g.
 * 0x05060000 gives 2 and 0 gives 0.
 *
 * value is evaluated several times and noOfBytes must be a modifiable
 * lvalue. The body is wrapped in do { } while(0) so that the macro behaves
 * like a single statement (safe in an unbraced if/else), and the internal
 * mask has a long name so that it cannot capture a caller variable called
 * "mask".
 */
#define ucol_countBytes(value, noOfBytes)               \
do {                                                    \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;          \
    (noOfBytes) = 0;                                    \
    while(ucolCountBytesMask_ != 0) {                   \
        if(((value) & ucolCountBytesMask_) != 0) {      \
            (noOfBytes)++;                              \
        }                                               \
        ucolCountBytesMask_ >>= 8;                      \
    }                                                   \
} while(0)
| 442 |
| 443 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { |
| 444 if(U_SUCCESS(*status)) { |
| 445 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
| 446 } |
| 447 return g->current; |
| 448 } |
| 449 |
| 450 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, ui
nt32_t strength, UErrorCode *status) { |
| 451 /* TODO: rename to enum names */ |
| 452 uint32_t high, low, count=1; |
| 453 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; |
| 454 |
| 455 if(strength == UCOL_SECONDARY) { |
| 456 low = UCOL_COMMON_TOP2<<24; |
| 457 high = 0xFFFFFFFF; |
| 458 count = 0xFF - UCOL_COMMON_TOP2; |
| 459 } else { |
| 460 low = UCOL_BYTE_COMMON << 24; //0x05000000; |
| 461 high = 0x40000000; |
| 462 count = 0x40 - UCOL_BYTE_COMMON; |
| 463 } |
| 464 |
| 465 if(tok->next != NULL && tok->next->strength == strength) { |
| 466 count = tok->next->toInsert; |
| 467 } |
| 468 |
| 469 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
| 470 g->current = UCOL_BYTE_COMMON<<24; |
| 471 |
| 472 if(g->noOfRanges == 0) { |
| 473 *status = U_INTERNAL_PROGRAM_ERROR; |
| 474 } |
| 475 return g->current; |
| 476 } |
| 477 |
| 478 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t
* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { |
| 479 uint32_t strength = tok->strength; |
| 480 uint32_t low = lows[fStrength*3+strength]; |
| 481 uint32_t high = highs[fStrength*3+strength]; |
| 482 uint32_t maxByte = 0; |
| 483 if(strength == UCOL_TERTIARY) { |
| 484 maxByte = 0x3F; |
| 485 } else if(strength == UCOL_PRIMARY) { |
| 486 maxByte = 0xFE; |
| 487 } else { |
| 488 maxByte = 0xFF; |
| 489 } |
| 490 |
| 491 uint32_t count = tok->toInsert; |
| 492 |
| 493 if(low >= high && strength > UCOL_PRIMARY) { |
| 494 int32_t s = strength; |
| 495 for(;;) { |
| 496 s--; |
| 497 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { |
| 498 if(strength == UCOL_SECONDARY) { |
| 499 if (low < UCOL_COMMON_TOP2<<24 ) { |
| 500 // Override if low range is less than UCOL_COMMON_TOP2. |
| 501 low = UCOL_COMMON_TOP2<<24; |
| 502 } |
| 503 high = 0xFFFFFFFF; |
| 504 } else { |
| 505 // Override if low range is less than UCOL_COMMON_BOT3. |
| 506 if ( low < UCOL_COMMON_BOT3<<24 ) { |
| 507 low = UCOL_COMMON_BOT3<<24; |
| 508 } |
| 509 high = 0x40000000; |
| 510 } |
| 511 break; |
| 512 } |
| 513 if(s<0) { |
| 514 *status = U_INTERNAL_PROGRAM_ERROR; |
| 515 return 0; |
| 516 } |
| 517 } |
| 518 } |
| 519 |
| 520 if(low < 0x02000000) { |
| 521 // We must not use CE weight byte 02, so we set it as the minimum lower
bound. |
| 522 // See http://site.icu-project.org/design/collation/bytes |
| 523 low = 0x02000000; |
| 524 } |
| 525 |
| 526 if(strength == UCOL_SECONDARY) { /* similar as simple */ |
| 527 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<2
4)) { |
| 528 low = UCOL_COMMON_TOP2<<24; |
| 529 } |
| 530 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<
24)) { |
| 531 high = UCOL_COMMON_TOP2<<24; |
| 532 } |
| 533 if(low < (UCOL_COMMON_BOT2<<24)) { |
| 534 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high,
count, maxByte, g->ranges); |
| 535 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
| 536 //g->current = UCOL_COMMON_BOT2<<24; |
| 537 return g->current; |
| 538 } |
| 539 } |
| 540 |
| 541 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
| 542 if(g->noOfRanges == 0) { |
| 543 *status = U_INTERNAL_PROGRAM_ERROR; |
| 544 } |
| 545 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
| 546 return g->current; |
| 547 } |
| 548 |
| 549 static |
| 550 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *res
Buf, const uint32_t resLen, UErrorCode *status) { |
| 551 uint32_t i = 0; |
| 552 UChar c; |
| 553 |
| 554 if(U_FAILURE(*status)) { |
| 555 return 0; |
| 556 } |
| 557 |
| 558 if(sourceLen > resLen) { |
| 559 *status = U_MEMORY_ALLOCATION_ERROR; |
| 560 return 0; |
| 561 } |
| 562 |
| 563 for(i = 0; i < sourceLen; i++) { |
| 564 c = source[i]; |
| 565 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
| 566 switch(c - 0x3000) { |
| 567 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: ca
se 0x83: case 0x85: case 0x8E: |
| 568 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: ca
se 0xE3: case 0xE5: case 0xEE: |
| 569 c++; |
| 570 break; |
| 571 case 0xF5: |
| 572 c = 0x30AB; |
| 573 break; |
| 574 case 0xF6: |
| 575 c = 0x30B1; |
| 576 break; |
| 577 } |
| 578 } |
| 579 resBuf[i] = c; |
| 580 } |
| 581 return sourceLen; |
| 582 } |
| 583 |
| 584 static |
| 585 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *res
Buf, const uint32_t resLen, UErrorCode *status) { |
| 586 uint32_t i = 0; |
| 587 UChar c; |
| 588 |
| 589 if(U_FAILURE(*status)) { |
| 590 return 0; |
| 591 } |
| 592 |
| 593 if(sourceLen > resLen) { |
| 594 *status = U_MEMORY_ALLOCATION_ERROR; |
| 595 return 0; |
| 596 } |
| 597 |
| 598 for(i = 0; i < sourceLen; i++) { |
| 599 c = source[i]; |
| 600 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
| 601 switch(c - 0x3000) { |
| 602 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: ca
se 0x84: case 0x86: case 0x8F: |
| 603 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: ca
se 0xE4: case 0xE6: case 0xEF: |
| 604 c--; |
| 605 break; |
| 606 case 0xAB: |
| 607 c = 0x30F5; |
| 608 break; |
| 609 case 0xB1: |
| 610 c = 0x30F6; |
| 611 break; |
| 612 } |
| 613 } |
| 614 resBuf[i] = c; |
| 615 } |
| 616 return sourceLen; |
| 617 } |
| 618 |
| 619 static |
| 620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l
en, UErrorCode *status) { |
| 621 uint32_t i = 0; |
| 622 UChar n[128]; |
| 623 uint32_t nLen = 0; |
| 624 uint32_t uCount = 0, lCount = 0; |
| 625 |
| 626 collIterate s; |
| 627 uint32_t order = 0; |
| 628 |
| 629 if(U_FAILURE(*status)) { |
| 630 return UCOL_LOWER_CASE; |
| 631 } |
| 632 |
| 633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); |
| 634 if(U_SUCCESS(*status)) { |
| 635 for(i = 0; i < nLen; i++) { |
| 636 uprv_init_collIterate(UCA, &n[i], 1, &s, status); |
| 637 order = ucol_getNextCE(UCA, &s, status); |
| 638 if(isContinuation(order)) { |
| 639 *status = U_INTERNAL_PROGRAM_ERROR; |
| 640 return UCOL_LOWER_CASE; |
| 641 } |
| 642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { |
| 643 uCount++; |
| 644 } else { |
| 645 if(u_islower(n[i])) { |
| 646 lCount++; |
| 647 } else if(U_SUCCESS(*status)) { |
| 648 UChar sk[1], lk[1]; |
| 649 u_toSmallKana(&n[i], 1, sk, 1, status); |
| 650 u_toLargeKana(&n[i], 1, lk, 1, status); |
| 651 if(sk[0] == n[i] && lk[0] != n[i]) { |
| 652 lCount++; |
| 653 } |
| 654 } |
| 655 } |
| 656 } |
| 657 } |
| 658 |
| 659 if(uCount != 0 && lCount != 0) { |
| 660 return UCOL_MIXED_CASE; |
| 661 } else if(uCount != 0) { |
| 662 return UCOL_UPPER_CASE; |
| 663 } else { |
| 664 return UCOL_LOWER_CASE; |
| 665 } |
| 666 } |
| 667 |
| 668 |
| 669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok,
UErrorCode *status) { |
| 670 /* this one makes the table and stuff */ |
| 671 uint32_t noOfBytes[3]; |
| 672 uint32_t i; |
| 673 |
| 674 for(i = 0; i<3; i++) { |
| 675 ucol_countBytes(CEparts[i], noOfBytes[i]); |
| 676 } |
| 677 |
| 678 /* Here we have to pack CEs from parts */ |
| 679 |
| 680 uint32_t CEi = 0; |
| 681 uint32_t value = 0; |
| 682 |
| 683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { |
| 684 if(CEi > 0) { |
| 685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ |
| 686 } else { |
| 687 value = 0; |
| 688 } |
| 689 |
| 690 if(2*CEi<noOfBytes[0]) { |
| 691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; |
| 692 } |
| 693 if(CEi<noOfBytes[1]) { |
| 694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; |
| 695 } |
| 696 if(CEi<noOfBytes[2]) { |
| 697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); |
| 698 } |
| 699 tok->CEs[CEi] = value; |
| 700 CEi++; |
| 701 } |
| 702 if(CEi == 0) { /* totally ignorable */ |
| 703 tok->noOfCEs = 1; |
| 704 tok->CEs[0] = 0; |
| 705 } else { /* there is at least something */ |
| 706 tok->noOfCEs = CEi; |
| 707 } |
| 708 |
| 709 |
| 710 // we want to set case bits here and now, not later. |
| 711 // Case bits handling |
| 712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables |
| 713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field |
| 714 int32_t cSize = (tok->source & 0xFF000000) >> 24; |
| 715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; |
| 716 |
| 717 if(cSize > 1) { |
| 718 // Do it manually |
| 719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, statu
s); |
| 720 } else { |
| 721 // Copy it from the UCA |
| 722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); |
| 723 tok->CEs[0] |= (caseCE & 0xC0); |
| 724 } |
| 725 } |
| 726 |
| 727 #if UCOL_DEBUG==2 |
| 728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource,
tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes
[1]), CEparts[2]>> (32-8*noOfBytes[2])); |
| 729 for(i = 0; i<tok->noOfCEs; i++) { |
| 730 fprintf(stderr, "%08X ", tok->CEs[i]); |
| 731 } |
| 732 fprintf(stderr, "\n"); |
| 733 #endif |
| 734 } |
| 735 |
/* Assigns CEs to every token of one token list. */
/* Steps: (1) walk the list backwards from lh->last to compute each token's toInsert count, */
/* (2) compute the gap boundaries with ucol_inv_getGapPositions(), (3) walk the list forwards, */
/* producing the primary/secondary/tertiary weight parts for each token with the CE generators */
/* and handing them to ucol_doCE(). The forward walk stops as soon as *status reports a failure. */
| 736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
rCode *status) { |
| 737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; |
| 738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; |
| 739 |
| 740 UColToken *tok = lh->last; |
| 741 uint32_t t[UCOL_STRENGTH_LIMIT]; |
| 742 |
| 743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); |
| 744 |
| 745 tok->toInsert = 1; |
| 746 t[tok->strength] = 1; |
| 747 |
/* t[] holds one running counter per strength; each token's toInsert is the counter for its own strength. */
| 748 while(tok->previous != NULL) { |
| 749 if(tok->previous->strength < tok->strength) { /* going up */ |
| 750 t[tok->strength] = 0; |
| 751 t[tok->previous->strength]++; |
| 752 } else if(tok->previous->strength > tok->strength) { /* going down */ |
| 753 t[tok->previous->strength] = 1; |
| 754 } else { |
| 755 t[tok->strength]++; |
| 756 } |
| 757 tok=tok->previous; |
| 758 tok->toInsert = t[tok->strength]; |
| 759 } |
| 760 |
| 761 tok->toInsert = t[tok->strength]; |
| 762 ucol_inv_getGapPositions(src, lh, status); |
| 763 |
/* Debug-only dump of the base CE, the gap bounds and the strength / toInsert value of every token. */
| 764 #if UCOL_DEBUG |
| 765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); |
| 766 int32_t j = 2; |
| 767 for(j = 2; j >= 0; j--) { |
| 768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh-
>gapsLo[j*3+1], lh->gapsLo[j*3+2]); |
| 769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh-
>gapsHi[j*3+1], lh->gapsHi[j*3+2]); |
| 770 } |
| 771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
| 772 |
| 773 do { |
| 774 fprintf(stderr,"%i", tok->strength); |
| 775 tok = tok->next; |
| 776 } while(tok != NULL); |
| 777 fprintf(stderr, "\n"); |
| 778 |
| 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
| 780 |
| 781 do { |
| 782 fprintf(stderr,"%i", tok->toInsert); |
| 783 tok = tok->next; |
| 784 } while(tok != NULL); |
| 785 #endif |
| 786 |
| 787 tok = lh->first; |
| 788 uint32_t fStrength = UCOL_IDENTICAL; |
| 789 uint32_t initStrength = UCOL_IDENTICAL; |
| 790 |
| 791 |
/* Seed the weight parts from the base CE and its continuation. */
| 792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE &
UCOL_PRIMARYMASK) >> 16; |
| 793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->bas
eContCE & UCOL_SECONDARYMASK) << 8; |
| 794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERT
IARYORDER(lh->baseContCE)) << 16; |
| 795 |
| 796 while (tok != NULL && U_SUCCESS(*status)) { |
| 797 fStrength = tok->strength; |
/* First token of a strength stronger than any seen so far: start generators from the gap bounds. */
/* If no position was recorded for this strength, fall back to the nearest stronger strength that has one. */
| 798 if(fStrength < initStrength) { |
| 799 initStrength = fStrength; |
| 800 if(lh->pos[fStrength] == -1) { |
| 801 while(lh->pos[fStrength] == -1 && fStrength > 0) { |
| 802 fStrength--; |
| 803 } |
| 804 if(lh->pos[fStrength] == -1) { |
| 805 *status = U_INTERNAL_PROGRAM_ERROR; |
| 806 return; |
| 807 } |
| 808 } |
| 809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ |
| 810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
| 811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; |
| 812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gap
sLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ |
| 813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY
], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
| 814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ |
| 815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
| 816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrengt
h*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ |
| 817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDA
RY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
| 818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); |
| 819 } else { /* primaries */ |
| 820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gaps
Lo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ |
| 821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY],
lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
| 822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S
ECONDARY], tok, UCOL_SECONDARY, status); |
| 823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); |
| 824 } |
/* Otherwise take the next weight from the generator of the token's own strength and restart the weaker levels. */
| 825 } else { |
| 826 if(tok->strength == UCOL_TERTIARY) { |
| 827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIA
RY], status); |
| 828 } else if(tok->strength == UCOL_SECONDARY) { |
| 829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECON
DARY], status); |
| 830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); |
| 831 } else if(tok->strength == UCOL_PRIMARY) { |
| 832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY
], status); |
| 833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S
ECONDARY], tok, UCOL_SECONDARY, status); |
| 834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE
RTIARY], tok, UCOL_TERTIARY, status); |
| 835 } |
| 836 } |
/* Pack the current weight parts into the token's CEs. */
| 837 ucol_doCE(src, CEparts, tok, status); |
| 838 tok = tok->next; |
| 839 } |
| 840 } |
| 841 |
| 842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
istHeader *lh, UErrorCode *status) { |
| 843 UCAElements el; |
| 844 UColToken *tok = lh->first; |
| 845 UColToken *expt = NULL; |
| 846 uint32_t i = 0, j = 0; |
| 847 UChar32 fcdHighStart; |
| 848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); |
| 849 |
| 850 while(tok != NULL && U_SUCCESS(*status)) { |
| 851 /* first, check if there are any expansions */ |
| 852 /* if there are expansions, we need to do a little bit more processing *
/ |
| 853 /* since parts of expansion can be tailored, while others are not */ |
| 854 if(tok->expansion != 0) { |
| 855 uint32_t len = tok->expansion >> 24; |
| 856 uint32_t currentSequenceLen = len; |
| 857 uint32_t expOffset = tok->expansion & 0x00FFFFFF; |
| 858 //uint32_t exp = currentSequenceLen | expOffset; |
| 859 UColToken exp; |
| 860 exp.source = currentSequenceLen | expOffset; |
| 861 exp.rulesToParseHdl = &(src->source); |
| 862 |
| 863 while(len > 0) { |
| 864 currentSequenceLen = len; |
| 865 while(currentSequenceLen > 0) { |
| 866 exp.source = (currentSequenceLen << 24) | expOffset; |
| 867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != N
ULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ |
| 868 uint32_t noOfCEsToCopy = expt->noOfCEs; |
| 869 for(j = 0; j<noOfCEsToCopy; j++) { |
| 870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; |
| 871 } |
| 872 tok->noOfExpCEs += noOfCEsToCopy; |
| 873 // Smart people never try to add codepoints and CEs. |
| 874 // For some odd reason, it won't work. |
| 875 expOffset += currentSequenceLen; //noOfCEsToCopy; |
| 876 len -= currentSequenceLen; //noOfCEsToCopy; |
| 877 break; |
| 878 } else { |
| 879 currentSequenceLen--; |
| 880 } |
| 881 } |
| 882 if(currentSequenceLen == 0) { /* couldn't find any tailored subs
equence */ |
| 883 /* will have to get one from UCA */ |
| 884 /* first, get the UChars from the rules */ |
| 885 /* then pick CEs out until there is no more and stuff them i
nto expansion */ |
| 886 collIterate s; |
| 887 uint32_t order = 0; |
| 888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1,
&s, status); |
| 889 |
| 890 for(;;) { |
| 891 order = ucol_getNextCE(src->UCA, &s, status); |
| 892 if(order == UCOL_NO_MORE_CES) { |
| 893 break; |
| 894 } |
| 895 tok->expCEs[tok->noOfExpCEs++] = order; |
| 896 } |
| 897 expOffset++; |
| 898 len--; |
| 899 } |
| 900 } |
| 901 } else { |
| 902 tok->noOfExpCEs = 0; |
| 903 } |
| 904 |
| 905 /* set the ucaelement with obtained values */ |
| 906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; |
| 907 /* copy CEs */ |
| 908 for(i = 0; i<tok->noOfCEs; i++) { |
| 909 el.CEs[i] = tok->CEs[i]; |
| 910 } |
| 911 for(i = 0; i<tok->noOfExpCEs; i++) { |
| 912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; |
| 913 } |
| 914 |
| 915 /* copy UChars */ |
| 916 // We kept prefix and source kind of together, as it is a kind of a cont
raction. |
| 917 // However, now we have to slice the prefix off the main thing - |
| 918 el.prefix = el.prefixChars; |
| 919 el.cPoints = el.uchars; |
| 920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust
accordingly in the |
| 921 // addPrefix function in ucol_elm. The reason is that we need to add
both composed AND |
| 922 // decomposed elements to the unsaf table. |
| 923 el.prefixSize = tok->prefix>>24; |
| 924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.
prefixSize*sizeof(UChar)); |
| 925 |
| 926 el.cSize = (tok->source >> 24)-(tok->prefix>>24); |
| 927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24)
+ src->source, el.cSize*sizeof(UChar)); |
| 928 } else { |
| 929 el.prefixSize = 0; |
| 930 *el.prefix = 0; |
| 931 |
| 932 el.cSize = (tok->source >> 24); |
| 933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.
cSize*sizeof(UChar)); |
| 934 } |
| 935 if(src->UCA != NULL) { |
| 936 for(i = 0; i<el.cSize; i++) { |
| 937 if(UCOL_ISJAMO(el.cPoints[i])) { |
| 938 t->image->jamoSpecial = TRUE; |
| 939 } |
| 940 } |
| 941 if (!src->buildCCTabFlag && el.cSize > 0) { |
| 942 // Check the trailing canonical combining class (tccc) of the la
st character. |
| 943 const UChar *s = el.cPoints + el.cSize; |
| 944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cP
oints, s); |
| 945 if ((fcd & 0xff) != 0) { |
| 946 src->buildCCTabFlag = TRUE; |
| 947 } |
| 948 } |
| 949 } |
| 950 |
| 951 /* and then, add it */ |
| 952 #if UCOL_DEBUG==2 |
| 953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); |
| 954 #endif |
| 955 uprv_uca_addAnElement(t, &el, status); |
| 956 |
| 957 #if UCOL_DEBUG_DUPLICATES |
| 958 if(*status != U_ZERO_ERROR) { |
| 959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoint
s[0], tok->debugSource); |
| 960 *status = U_ZERO_ERROR; |
| 961 } |
| 962 #endif |
| 963 |
| 964 tok = tok->next; |
| 965 } |
| 966 } |
| 967 |
| 968 U_CDECL_BEGIN |
| 969 static UBool U_CALLCONV |
| 970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit,
uint32_t value) { |
| 971 UErrorCode status = U_ZERO_ERROR; |
| 972 tempUCATable *t = (tempUCATable *)context; |
| 973 if(value == 0) { |
| 974 while(start < limit) { |
| 975 uint32_t CE = utrie_get32(t->mapping, start, NULL); |
| 976 if(CE == UCOL_NOT_FOUND) { |
| 977 UCAElements el; |
| 978 el.isThai = FALSE; |
| 979 el.prefixSize = 0; |
| 980 el.prefixChars[0] = 0; |
| 981 el.prefix = el.prefixChars; |
| 982 el.cPoints = el.uchars; |
| 983 |
| 984 el.cSize = 0; |
| 985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start); |
| 986 |
| 987 el.noOfCEs = 1; |
| 988 el.CEs[0] = 0; |
| 989 uprv_uca_addAnElement(t, &el, &status); |
| 990 |
| 991 } |
| 992 start++; |
| 993 } |
| 994 } |
| 995 if(U_FAILURE(status)) { |
| 996 return FALSE; |
| 997 } else { |
| 998 return TRUE; |
| 999 } |
| 1000 } |
| 1001 U_CDECL_END |
| 1002 |
| 1003 static void |
| 1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, |
| 1005 UChar32 start, UChar32 end, |
| 1006 UErrorCode *status) |
| 1007 { |
| 1008 //UChar decomp[256]; |
| 1009 uint32_t CE = UCOL_NOT_FOUND; |
| 1010 UChar32 u = 0; |
| 1011 UCAElements el; |
| 1012 el.isThai = FALSE; |
| 1013 el.prefixSize = 0; |
| 1014 el.prefixChars[0] = 0; |
| 1015 collIterate colIt; |
| 1016 |
| 1017 if(U_SUCCESS(*status)) { |
| 1018 for(u = start; u<=end; u++) { |
| 1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND |
| 1020 /* this test is for contractions that are missing the starting e
lement. */ |
| 1021 || ((isCntTableElement(CE)) && |
| 1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_F
OUND)) |
| 1023 ) |
| 1024 { |
| 1025 el.cSize = 0; |
| 1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); |
| 1027 //decomp[0] = (UChar)u; |
| 1028 //el.uchars[0] = (UChar)u; |
| 1029 el.cPoints = el.uchars; |
| 1030 //el.cSize = 1; |
| 1031 el.noOfCEs = 0; |
| 1032 el.prefix = el.prefixChars; |
| 1033 el.prefixSize = 0; |
| 1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); |
| 1035 // We actually want to check whether this element is a special |
| 1036 // If it is an implicit element (hangul, CJK - we want to copy t
he |
| 1037 // special, not the resolved CEs) - for hangul, copying resolved |
| 1038 // would just make things the same (there is an expansion and it |
| 1039 // takes approximately the same amount of time to resolve as |
| 1040 // falling back to the UCA). |
| 1041 /* |
| 1042 UTRIE_GET32(src->UCA->mapping, u, CE); |
| 1043 tag = getCETag(CE); |
| 1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG |
| 1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG |
| 1046 || tag == LEAD_SURROGATE_TAG) { |
| 1047 el.CEs[el.noOfCEs++] = CE; |
| 1048 } else { |
| 1049 */ |
| 1050 // It turns out that it does not make sense to keep implicits |
| 1051 // unresolved. The cost of resolving them is big enough so that |
| 1052 // it doesn't make any difference whether we have to go to the U
CA |
| 1053 // or not. |
| 1054 { |
| 1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt,
status); |
| 1056 while(CE != UCOL_NO_MORE_CES) { |
| 1057 CE = ucol_getNextCE(src->UCA, &colIt, status); |
| 1058 if(CE != UCOL_NO_MORE_CES) { |
| 1059 el.CEs[el.noOfCEs++] = CE; |
| 1060 } |
| 1061 } |
| 1062 } |
| 1063 uprv_uca_addAnElement(t, &el, status); |
| 1064 } |
| 1065 } |
| 1066 } |
| 1067 } |
| 1068 |
| 1069 U_CFUNC UCATableHeader * |
| 1070 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { |
| 1071 U_NAMESPACE_USE |
| 1072 |
| 1073 uint32_t i = 0; |
| 1074 if(U_FAILURE(*status)) { |
| 1075 return NULL; |
| 1076 } |
| 1077 /* |
| 1078 2. Eliminate the negative lists by doing the following for each non-null ne
gative list: |
| 1079 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, |
| 1080 create new ListHeader X |
| 1081 o reverse the list, add to the end of X's positive list. Reset the strengt
h of the |
| 1082 first item you add, based on the stronger strength levels of the two lists. |
| 1083 */ |
| 1084 /* |
| 1085 3. For each ListHeader with a non-null positive list: |
| 1086 */ |
| 1087 /* |
| 1088 o Find all character strings with CEs between the baseCE and the |
| 1089 next/previous CE, at the strength of the first token. Add these to the |
| 1090 tailoring. |
| 1091 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the |
| 1092 tailoring has & x < z... |
| 1093 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... |
| 1094 */ |
| 1095 /* It is possible that this part should be done even while constructing list
*/ |
| 1096 /* The problem is that it is unknown what is going to be the strongest weigh
t */ |
| 1097 /* So we might as well do it here */ |
| 1098 |
| 1099 /* |
| 1100 o Allocate CEs for each token in the list, based on the total number N of
the |
| 1101 largest level difference, and the gap G between baseCE and nextCE at that |
| 1102 level. The relation * between the last item and nextCE is the same as the |
| 1103 strongest strength. |
| 1104 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) |
| 1105 ? There are 3 primary items: a, d, e. Fit them into the primary gap. |
| 1106 Then fit b and c into the secondary gap between a and d, then fit q |
| 1107 into the tertiary gap between b and c. |
| 1108 |
| 1109 o Example: baseCE << b <<< q << c * nextCE(X,2) |
| 1110 ? There are 2 secondary items: b, c. Fit them into the secondary gap. |
| 1111 Then fit q into the tertiary gap between b and c. |
| 1112 o When incrementing primary values, we will not cross high byte |
| 1113 boundaries except where there is only a single-byte primary. That is to |
| 1114 ensure that the script reordering will continue to work. |
| 1115 */ |
| 1116 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)
); |
| 1117 /* test for NULL */ |
| 1118 if (image == NULL) { |
| 1119 *status = U_MEMORY_ALLOCATION_ERROR; |
| 1120 return NULL; |
| 1121 } |
| 1122 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); |
| 1123 |
| 1124 for(i = 0; i<src->resultLen; i++) { |
| 1125 /* now we need to generate the CEs */ |
| 1126 /* We stuff the initial value in the buffers, and increase the appropria
te buffer */ |
| 1127 /* According to strength
*/ |
| 1128 if(U_SUCCESS(*status)) { |
| 1129 if(src->lh[i].first) { // if there are any elements |
| 1130 // due to the way parser works, subsequent tailorings |
| 1131 // may remove all the elements from a sequence, therefore |
| 1132 // leaving an empty tailoring sequence. |
| 1133 ucol_initBuffers(src, &src->lh[i], status); |
| 1134 } |
| 1135 } |
| 1136 if(U_FAILURE(*status)) { |
| 1137 uprv_free(image); |
| 1138 return NULL; |
| 1139 } |
| 1140 } |
| 1141 |
| 1142 if(src->varTop != NULL) { /* stuff the variable top value */ |
| 1143 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; |
| 1144 /* remove it from the list */ |
| 1145 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ |
| 1146 src->varTop->listHeader->first = src->varTop->next; |
| 1147 } |
| 1148 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ |
| 1149 src->varTop->listHeader->last = src->varTop->previous; |
| 1150 } |
| 1151 if(src->varTop->next != NULL) { |
| 1152 src->varTop->next->previous = src->varTop->previous; |
| 1153 } |
| 1154 if(src->varTop->previous != NULL) { |
| 1155 src->varTop->previous->next = src->varTop->next; |
| 1156 } |
| 1157 } |
| 1158 |
| 1159 |
| 1160 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOU
ND_TAG, NOT_FOUND_TAG, status); |
| 1161 if(U_FAILURE(*status)) { |
| 1162 uprv_free(image); |
| 1163 return NULL; |
| 1164 } |
| 1165 |
| 1166 |
| 1167 /* After this, we have assigned CE values to all regular CEs */ |
| 1168 /* now we will go through list once more and resolve expansions, */ |
| 1169 /* make UCAElements structs and add them to table */ |
| 1170 for(i = 0; i<src->resultLen; i++) { |
| 1171 /* now we need to generate the CEs */ |
| 1172 /* We stuff the initial value in the buffers, and increase the appropria
te buffer */ |
| 1173 /* According to strength
*/ |
| 1174 if(U_SUCCESS(*status)) { |
| 1175 ucol_createElements(src, t, &src->lh[i], status); |
| 1176 } |
| 1177 } |
| 1178 |
| 1179 UCAElements el; |
| 1180 el.isThai = FALSE; |
| 1181 el.prefixSize = 0; |
| 1182 el.prefixChars[0] = 0; |
| 1183 |
| 1184 /* add latin-1 stuff */ |
| 1185 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); |
| 1186 |
| 1187 /* add stuff for copying */ |
| 1188 if(src->copySet != NULL) { |
| 1189 int32_t i = 0; |
| 1190 UnicodeSet *set = (UnicodeSet *)src->copySet; |
| 1191 for(i = 0; i < set->getRangeCount(); i++) { |
| 1192 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->g
etRangeEnd(i), status); |
| 1193 } |
| 1194 } |
| 1195 |
| 1196 if(U_SUCCESS(*status)) { |
| 1197 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ |
| 1198 |
| 1199 uint32_t tailoredCE = UCOL_NOT_FOUND; |
| 1200 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image-
>UCAConsts+sizeof(UCAConstants)); |
| 1201 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->c
ontractionUCACombos); |
| 1202 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status)
; |
| 1203 // Check for null pointer |
| 1204 if (ucaEl == NULL) { |
| 1205 *status = U_MEMORY_ALLOCATION_ERROR; |
| 1206 return NULL; |
| 1207 } |
| 1208 while(*conts != 0) { |
| 1209 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/ |
| 1210 tailoredCE = utrie_get32(t->mapping, *conts, NULL); |
| 1211 if(tailoredCE != UCOL_NOT_FOUND) { |
| 1212 UBool needToAdd = TRUE; |
| 1213 if(isCntTableElement(tailoredCE)) { |
| 1214 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts
+1, status) == TRUE) { |
| 1215 needToAdd = FALSE; |
| 1216 } |
| 1217 } |
| 1218 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { |
| 1219 UCAElements elm; |
| 1220 elm.cPoints = el.uchars; |
| 1221 elm.noOfCEs = 0; |
| 1222 elm.uchars[0] = *conts; |
| 1223 elm.uchars[1] = 0; |
| 1224 elm.cSize = 1; |
| 1225 elm.prefixChars[0] = *(conts+2); |
| 1226 elm.isThai = FALSE; |
| 1227 elm.prefix = elm.prefixChars; |
| 1228 elm.prefixSize = 1; |
| 1229 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLoo
kup, &elm); |
| 1230 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { |
| 1231 needToAdd = TRUE; |
| 1232 } |
| 1233 } |
| 1234 if(src->removeSet != NULL && uset_contains(src->removeSet, *cont
s)) { |
| 1235 needToAdd = FALSE; |
| 1236 } |
| 1237 |
| 1238 if(needToAdd == TRUE) { // we need to add if this contraction is
not tailored. |
| 1239 if (*(conts+1) != 0) { // contractions |
| 1240 el.prefix = el.prefixChars; |
| 1241 el.prefixSize = 0; |
| 1242 el.cPoints = el.uchars; |
| 1243 el.noOfCEs = 0; |
| 1244 el.uchars[0] = *conts; |
| 1245 el.uchars[1] = *(conts+1); |
| 1246 if(*(conts+2)!=0) { |
| 1247 el.uchars[2] = *(conts+2); |
| 1248 el.cSize = 3; |
| 1249 } else { |
| 1250 el.cSize = 2; |
| 1251 } |
| 1252 ucol_setText(ucaEl, el.uchars, el.cSize, status); |
| 1253 } |
| 1254 else { // pre-context character |
| 1255 UChar str[4] = { 0 }; |
| 1256 int32_t len=0; |
| 1257 int32_t preKeyLen=0; |
| 1258 |
| 1259 el.cPoints = el.uchars; |
| 1260 el.noOfCEs = 0; |
| 1261 el.uchars[0] = *conts; |
| 1262 el.uchars[1] = 0; |
| 1263 el.cSize = 1; |
| 1264 el.prefixChars[0] = *(conts+2); |
| 1265 el.prefix = el.prefixChars; |
| 1266 el.prefixSize = 1; |
| 1267 if (el.prefixChars[0]!=0) { |
| 1268 // get CE of prefix character first |
| 1269 str[0]=el.prefixChars[0]; |
| 1270 str[1]=0; |
| 1271 ucol_setText(ucaEl, str, 1, status); |
| 1272 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaE
l, status)) |
| 1273 != UCOL_NULLORDER) { |
| 1274 preKeyLen++; // count number of keys for prefix
character |
| 1275 } |
| 1276 str[len++] = el.prefixChars[0]; |
| 1277 } |
| 1278 |
| 1279 str[len++] = el.uchars[0]; |
| 1280 str[len]=0; |
| 1281 ucol_setText(ucaEl, str, len, status); |
| 1282 // Skip the keys for prefix character, then copy the res
t to el. |
| 1283 while ((preKeyLen-->0) && |
| 1284 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, s
tatus)) != UCOL_NULLORDER) { |
| 1285 continue; |
| 1286 } |
| 1287 |
| 1288 } |
| 1289 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, statu
s)) != UCOL_NULLORDER) { |
| 1290 el.noOfCEs++; |
| 1291 } |
| 1292 uprv_uca_addAnElement(t, &el, status); |
| 1293 } |
| 1294 |
| 1295 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *c
onts)) { |
| 1296 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status); |
| 1297 } |
| 1298 conts+=3; |
| 1299 } |
| 1300 ucol_closeElements(ucaEl); |
| 1301 } |
| 1302 |
| 1303 // Add completely ignorable elements |
| 1304 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); |
| 1305 |
| 1306 // add tailoring characters related canonical closures |
| 1307 uprv_uca_canonicalClosure(t, src, NULL, status); |
| 1308 |
| 1309 /* still need to produce compatibility closure */ |
| 1310 |
| 1311 UCATableHeader *myData = uprv_uca_assembleTable(t, status); |
| 1312 |
| 1313 uprv_uca_closeTempTable(t); |
| 1314 uprv_free(image); |
| 1315 |
| 1316 return myData; |
| 1317 } |
| 1318 |
| 1319 U_CDECL_BEGIN |
| 1320 static UBool U_CALLCONV |
| 1321 ucol_bld_cleanup(void) |
| 1322 { |
| 1323 udata_close(invUCA_DATA_MEM); |
| 1324 invUCA_DATA_MEM = NULL; |
| 1325 _staticInvUCA = NULL; |
| 1326 return TRUE; |
| 1327 } |
| 1328 U_CDECL_END |
| 1329 |
| 1330 U_CAPI const InverseUCATableHeader * U_EXPORT2 |
| 1331 ucol_initInverseUCA(UErrorCode *status) |
| 1332 { |
| 1333 if(U_FAILURE(*status)) return NULL; |
| 1334 |
| 1335 UBool needsInit; |
| 1336 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); |
| 1337 |
| 1338 if(needsInit) { |
| 1339 InverseUCATableHeader *newInvUCA = NULL; |
| 1340 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, I
NVC_DATA_NAME, isAcceptableInvUCA, NULL, status); |
| 1341 |
| 1342 if(U_FAILURE(*status)) { |
| 1343 if (result) { |
| 1344 udata_close(result); |
| 1345 } |
| 1346 // This is not needed, as we are talking about |
| 1347 // memory we got from UData |
| 1348 //uprv_free(newInvUCA); |
| 1349 } |
| 1350 |
| 1351 if(result != NULL) { /* It looks like sometimes we can fail to find the
data file */ |
| 1352 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); |
| 1353 UCollator *UCA = ucol_initUCA(status); |
| 1354 // UCA versions of UCA and inverse UCA should match |
| 1355 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof
(UVersionInfo)) != 0) { |
| 1356 *status = U_INVALID_FORMAT_ERROR; |
| 1357 udata_close(result); |
| 1358 return NULL; |
| 1359 } |
| 1360 |
| 1361 umtx_lock(NULL); |
| 1362 if(_staticInvUCA == NULL) { |
| 1363 invUCA_DATA_MEM = result; |
| 1364 _staticInvUCA = newInvUCA; |
| 1365 result = NULL; |
| 1366 newInvUCA = NULL; |
| 1367 } |
| 1368 umtx_unlock(NULL); |
| 1369 |
| 1370 if(newInvUCA != NULL) { |
| 1371 udata_close(result); |
| 1372 // This is not needed, as we are talking about |
| 1373 // memory we got from UData |
| 1374 //uprv_free(newInvUCA); |
| 1375 } |
| 1376 else { |
| 1377 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); |
| 1378 } |
| 1379 } |
| 1380 } |
| 1381 return _staticInvUCA; |
| 1382 } |
| 1383 |
/* Names of the non-script reordering codes. These _must_ be kept in the order
 * in which they are to be applied as defaults, and in sync with the
 * UColReorderCode enum. The list is terminated by a NULL entry.
 */
static const char* const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT",
    NULL
};
| 1395 |
/**
 * Copies src to dst, converting each character with toupper().
 *
 * At most length - 1 characters are copied and dst is always terminated,
 * so the result is truncated when src does not fit.
 *
 * @param src    NUL-terminated input string
 * @param dst    output buffer with room for at least length chars
 * @param length capacity of dst in chars, including the terminator;
 *               when 0, dst is not written at all
 */
static void toUpper(const char* src, char* dst, uint32_t length) {
    if (length == 0) {
        /* No room even for the terminator; "length - 1" would wrap around. */
        return;
    }
    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
        /* toupper() is only defined for unsigned char values and EOF. */
        *dst = (char)toupper((unsigned char)*src);
    }
    *dst = '\0';
}
| 1402 |
| 1403 U_INTERNAL int32_t U_EXPORT2 |
| 1404 ucol_findReorderingEntry(const char* name) { |
| 1405 char buffer[32]; |
| 1406 toUpper(name, buffer, 32); |
| 1407 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) { |
| 1408 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { |
| 1409 return entry + UCOL_REORDER_CODE_FIRST; |
| 1410 } |
| 1411 } |
| 1412 return USCRIPT_INVALID_CODE; |
| 1413 } |
| 1414 |
| 1415 U_NAMESPACE_END |
| 1416 |
| 1417 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |