OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 1996-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * file name: ucol.cpp |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * Modification history |
| 12 * Date Name Comments |
| 13 * 1996-1999 various members of ICU team maintained C API for collation framewo
rk |
| 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE |
| 15 * 03/01/2001 synwee Added maxexpansion functionality. |
| 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl
iant |
| 17 */ |
| 18 |
| 19 #include "unicode/utypes.h" |
| 20 |
| 21 #if !UCONFIG_NO_COLLATION |
| 22 |
| 23 #include "unicode/coleitr.h" |
| 24 #include "unicode/unorm.h" |
| 25 #include "unicode/udata.h" |
| 26 #include "unicode/ustring.h" |
| 27 |
| 28 #include "ucol_imp.h" |
| 29 #include "bocsu.h" |
| 30 |
| 31 #include "normalizer2impl.h" |
| 32 #include "unorm_it.h" |
| 33 #include "umutex.h" |
| 34 #include "cmemory.h" |
| 35 #include "ucln_in.h" |
| 36 #include "cstring.h" |
| 37 #include "utracimp.h" |
| 38 #include "putilimp.h" |
| 39 #include "uassert.h" |
| 40 |
| 41 #ifdef UCOL_DEBUG |
| 42 #include <stdio.h> |
| 43 #endif |
| 44 |
| 45 U_NAMESPACE_USE |
| 46 |
| 47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
| 48 |
| 49 #define LAST_BYTE_MASK_ 0xFF |
| 50 #define SECOND_LAST_BYTE_SHIFT_ 8 |
| 51 |
| 52 #define ZERO_CC_LIMIT_ 0xC0 |
| 53 |
| 54 // this is static pointer to the normalizer fcdTrieIndex |
| 55 // it is always the same between calls to u_cleanup |
| 56 // and therefore writing to it is not synchronized. |
| 57 // It is cleaned in ucol_cleanup |
| 58 static const uint16_t *fcdTrieIndex=NULL; |
| 59 // Code points at fcdHighStart and above have a zero FCD value. |
| 60 static UChar32 fcdHighStart = 0; |
| 61 |
| 62 // These are values from UCA required for |
| 63 // implicit generation and suppressing sort key compression |
| 64 // they should regularly be in the UCA, but if one |
| 65 // is running without UCA, it could be a problem |
| 66 static const int32_t maxRegularPrimary = 0x7A; |
| 67 static const int32_t minImplicitPrimary = 0xE0; |
| 68 static const int32_t maxImplicitPrimary = 0xE4; |
| 69 |
| 70 U_CDECL_BEGIN |
| 71 static UBool U_CALLCONV |
| 72 ucol_cleanup(void) |
| 73 { |
| 74 fcdTrieIndex = NULL; |
| 75 return TRUE; |
| 76 } |
| 77 |
| 78 static int32_t U_CALLCONV |
| 79 _getFoldingOffset(uint32_t data) { |
| 80 return (int32_t)(data&0xFFFFFF); |
| 81 } |
| 82 |
| 83 U_CDECL_END |
| 84 |
| 85 // init FCD data |
| 86 static inline |
| 87 UBool initializeFCD(UErrorCode *status) { |
| 88 if (fcdTrieIndex != NULL) { |
| 89 return TRUE; |
| 90 } else { |
| 91 // The result is constant, until the library is reloaded. |
| 92 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); |
| 93 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
| 94 return U_SUCCESS(*status); |
| 95 } |
| 96 } |
| 97 |
| 98 static |
| 99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
ng, |
| 100 int32_t sourceLen, collIterate *s, |
| 101 UErrorCode *status) |
| 102 { |
| 103 (s)->string = (s)->pos = sourceString; |
| 104 (s)->origFlags = 0; |
| 105 (s)->flags = 0; |
| 106 if (sourceLen >= 0) { |
| 107 s->flags |= UCOL_ITER_HASLEN; |
| 108 (s)->endp = (UChar *)sourceString+sourceLen; |
| 109 } |
| 110 else { |
| 111 /* change to enable easier checking for end of string for fcdpositon */ |
| 112 (s)->endp = NULL; |
| 113 } |
| 114 (s)->extendCEs = NULL; |
| 115 (s)->extendCEsSize = 0; |
| 116 (s)->CEpos = (s)->toReturn = (s)->CEs; |
| 117 (s)->offsetBuffer = NULL; |
| 118 (s)->offsetBufferSize = 0; |
| 119 (s)->offsetReturn = (s)->offsetStore = NULL; |
| 120 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; |
| 121 (s)->coll = (collator); |
| 122 (s)->nfd = Normalizer2Factory::getNFDInstance(*status); |
| 123 (s)->fcdPosition = 0; |
| 124 if(collator->normalizationMode == UCOL_ON) { |
| 125 (s)->flags |= UCOL_ITER_NORM; |
| 126 } |
| 127 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY)
{ |
| 128 (s)->flags |= UCOL_HIRAGANA_Q; |
| 129 } |
| 130 (s)->iterator = NULL; |
| 131 //(s)->iteratorIndex = 0; |
| 132 } |
| 133 |
| 134 U_CAPI void U_EXPORT2 |
| 135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, |
| 136 int32_t sourceLen, collIterate *s, |
| 137 UErrorCode *status) { |
| 138 /* Out-of-line version for use from other files. */ |
| 139 IInit_collIterate(collator, sourceString, sourceLen, s, status); |
| 140 } |
| 141 |
| 142 U_CAPI collIterate * U_EXPORT2 |
| 143 uprv_new_collIterate(UErrorCode *status) { |
| 144 if(U_FAILURE(*status)) { |
| 145 return NULL; |
| 146 } |
| 147 collIterate *s = new collIterate; |
| 148 if(s == NULL) { |
| 149 *status = U_MEMORY_ALLOCATION_ERROR; |
| 150 return NULL; |
| 151 } |
| 152 return s; |
| 153 } |
| 154 |
| 155 U_CAPI void U_EXPORT2 |
| 156 uprv_delete_collIterate(collIterate *s) { |
| 157 delete s; |
| 158 } |
| 159 |
| 160 U_CAPI UBool U_EXPORT2 |
| 161 uprv_collIterateAtEnd(collIterate *s) { |
| 162 return s == NULL || s->pos == s->endp; |
| 163 } |
| 164 |
| 165 /** |
| 166 * Backup the state of the collIterate struct data |
| 167 * @param data collIterate to backup |
| 168 * @param backup storage |
| 169 */ |
| 170 static |
| 171 inline void backupState(const collIterate *data, collIterateState *backup) |
| 172 { |
| 173 backup->fcdPosition = data->fcdPosition; |
| 174 backup->flags = data->flags; |
| 175 backup->origFlags = data->origFlags; |
| 176 backup->pos = data->pos; |
| 177 backup->bufferaddress = data->writableBuffer.getBuffer(); |
| 178 backup->buffersize = data->writableBuffer.length(); |
| 179 backup->iteratorMove = 0; |
| 180 backup->iteratorIndex = 0; |
| 181 if(data->iterator != NULL) { |
| 182 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER
_CURRENT); |
| 183 backup->iteratorIndex = data->iterator->getState(data->iterator); |
| 184 // no we try to fixup if we're using a normalizing iterator and we get U
ITER_NO_STATE |
| 185 if(backup->iteratorIndex == UITER_NO_STATE) { |
| 186 while((backup->iteratorIndex = data->iterator->getState(data->iterat
or)) == UITER_NO_STATE) { |
| 187 backup->iteratorMove++; |
| 188 data->iterator->move(data->iterator, -1, UITER_CURRENT); |
| 189 } |
| 190 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); |
| 191 } |
| 192 } |
| 193 } |
| 194 |
| 195 /** |
| 196 * Loads the state into the collIterate struct data |
| 197 * @param data collIterate to backup |
| 198 * @param backup storage |
| 199 * @param forwards boolean to indicate if forwards iteration is used, |
| 200 * false indicates backwards iteration |
| 201 */ |
| 202 static |
| 203 inline void loadState(collIterate *data, const collIterateState *backup, |
| 204 UBool forwards) |
| 205 { |
| 206 UErrorCode status = U_ZERO_ERROR; |
| 207 data->flags = backup->flags; |
| 208 data->origFlags = backup->origFlags; |
| 209 if(data->iterator != NULL) { |
| 210 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO
); |
| 211 data->iterator->setState(data->iterator, backup->iteratorIndex, &status)
; |
| 212 if(backup->iteratorMove != 0) { |
| 213 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); |
| 214 } |
| 215 } |
| 216 data->pos = backup->pos; |
| 217 |
| 218 if ((data->flags & UCOL_ITER_INNORMBUF) && |
| 219 data->writableBuffer.getBuffer() != backup->bufferaddress) { |
| 220 /* |
| 221 this is when a new buffer has been reallocated and we'll have to |
| 222 calculate the new position. |
| 223 note the new buffer has to contain the contents of the old buffer. |
| 224 */ |
| 225 if (forwards) { |
| 226 data->pos = data->writableBuffer.getTerminatedBuffer() + |
| 227 (data->pos - backup->bufferaddress); |
| 228 } |
| 229 else { |
| 230 /* backwards direction */ |
| 231 int32_t temp = backup->buffersize - |
| 232 (int32_t)(data->pos - backup->bufferaddress); |
| 233 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writ
ableBuffer.length() - temp); |
| 234 } |
| 235 } |
| 236 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 237 /* |
| 238 this is alittle tricky. |
| 239 if we are initially not in the normalization buffer, even if we |
| 240 normalize in the later stage, the data in the buffer will be |
| 241 ignored, since we skip back up to the data string. |
| 242 however if we are already in the normalization buffer, any |
| 243 further normalization will pull data into the normalization |
| 244 buffer and modify the fcdPosition. |
| 245 since we are keeping the data in the buffer for use, the |
| 246 fcdPosition can not be reverted back. |
| 247 arrgghh.... |
| 248 */ |
| 249 data->fcdPosition = backup->fcdPosition; |
| 250 } |
| 251 } |
| 252 |
| 253 static UBool |
| 254 reallocCEs(collIterate *data, int32_t newCapacity) { |
| 255 uint32_t *oldCEs = data->extendCEs; |
| 256 if(oldCEs == NULL) { |
| 257 oldCEs = data->CEs; |
| 258 } |
| 259 int32_t length = data->CEpos - oldCEs; |
| 260 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); |
| 261 if(newCEs == NULL) { |
| 262 return FALSE; |
| 263 } |
| 264 uprv_memcpy(newCEs, oldCEs, length * 4); |
| 265 uprv_free(data->extendCEs); |
| 266 data->extendCEs = newCEs; |
| 267 data->extendCEsSize = newCapacity; |
| 268 data->CEpos = newCEs + length; |
| 269 return TRUE; |
| 270 } |
| 271 |
| 272 static UBool |
| 273 increaseCEsCapacity(collIterate *data) { |
| 274 int32_t oldCapacity; |
| 275 if(data->extendCEs != NULL) { |
| 276 oldCapacity = data->extendCEsSize; |
| 277 } else { |
| 278 oldCapacity = LENGTHOF(data->CEs); |
| 279 } |
| 280 return reallocCEs(data, 2 * oldCapacity); |
| 281 } |
| 282 |
| 283 static UBool |
| 284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { |
| 285 int32_t oldCapacity; |
| 286 if(data->extendCEs != NULL) { |
| 287 oldCapacity = data->extendCEsSize; |
| 288 } else { |
| 289 oldCapacity = LENGTHOF(data->CEs); |
| 290 } |
| 291 if(minCapacity <= oldCapacity) { |
| 292 return TRUE; |
| 293 } |
| 294 oldCapacity *= 2; |
| 295 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacit
y); |
| 296 } |
| 297 |
| 298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { |
| 299 if(U_FAILURE(errorCode)) { |
| 300 return; |
| 301 } |
| 302 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuf
fer); |
| 303 if(length >= offsetBufferSize) { |
| 304 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; |
| 305 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity
* 4)); |
| 306 if(newBuffer == NULL) { |
| 307 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 308 return; |
| 309 } |
| 310 if(length > 0) { |
| 311 uprv_memcpy(newBuffer, offsetBuffer, length * 4); |
| 312 } |
| 313 uprv_free(offsetBuffer); |
| 314 offsetBuffer = newBuffer; |
| 315 offsetStore = offsetBuffer + length; |
| 316 offsetBufferSize = newCapacity; |
| 317 } |
| 318 *offsetStore++ = offset; |
| 319 } |
| 320 |
| 321 /* |
| 322 * collIter_eos() |
| 323 * Checks for a collIterate being positioned at the end of |
| 324 * its source string. |
| 325 * |
| 326 */ |
| 327 static |
| 328 inline UBool collIter_eos(collIterate *s) { |
| 329 if(s->flags & UCOL_USE_ITERATOR) { |
| 330 return !(s->iterator->hasNext(s->iterator)); |
| 331 } |
| 332 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { |
| 333 // Null terminated string, but not at null, so not at end. |
| 334 // Whether in main or normalization buffer doesn't matter. |
| 335 return FALSE; |
| 336 } |
| 337 |
| 338 // String with length. Can't be in normalization buffer, which is always |
| 339 // null termintated. |
| 340 if (s->flags & UCOL_ITER_HASLEN) { |
| 341 return (s->pos == s->endp); |
| 342 } |
| 343 |
| 344 // We are at a null termination, could be either normalization buffer or mai
n string. |
| 345 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 346 // At null at end of main string. |
| 347 return TRUE; |
| 348 } |
| 349 |
| 350 // At null at end of normalization buffer. Need to check whether there ther
e are |
| 351 // any characters left in the main buffer. |
| 352 if(s->origFlags & UCOL_USE_ITERATOR) { |
| 353 return !(s->iterator->hasNext(s->iterator)); |
| 354 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { |
| 355 // Null terminated main string. fcdPosition is the 'return' position in
to main buf. |
| 356 return (*s->fcdPosition == 0); |
| 357 } |
| 358 else { |
| 359 // Main string with an end pointer. |
| 360 return s->fcdPosition == s->endp; |
| 361 } |
| 362 } |
| 363 |
| 364 /* |
| 365 * collIter_bos() |
| 366 * Checks for a collIterate being positioned at the start of |
| 367 * its source string. |
| 368 * |
| 369 */ |
| 370 static |
| 371 inline UBool collIter_bos(collIterate *source) { |
| 372 // if we're going backwards, we need to know whether there is more in the |
| 373 // iterator, even if we are in the side buffer |
| 374 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ |
| 375 return !source->iterator->hasPrevious(source->iterator); |
| 376 } |
| 377 if (source->pos <= source->string || |
| 378 ((source->flags & UCOL_ITER_INNORMBUF) && |
| 379 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { |
| 380 return TRUE; |
| 381 } |
| 382 return FALSE; |
| 383 } |
| 384 |
| 385 /*static |
| 386 inline UBool collIter_SimpleBos(collIterate *source) { |
| 387 // if we're going backwards, we need to know whether there is more in the |
| 388 // iterator, even if we are in the side buffer |
| 389 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ |
| 390 return !source->iterator->hasPrevious(source->iterator); |
| 391 } |
| 392 if (source->pos == source->string) { |
| 393 return TRUE; |
| 394 } |
| 395 return FALSE; |
| 396 }*/ |
| 397 //return (data->pos == data->string) || |
| 398 |
| 399 |
| 400 /****************************************************************************/ |
| 401 /* Following are the open/close functions */ |
| 402 /* */ |
| 403 /****************************************************************************/ |
| 404 |
| 405 static UCollator* |
| 406 ucol_initFromBinary(const uint8_t *bin, int32_t length, |
| 407 const UCollator *base, |
| 408 UCollator *fillIn, |
| 409 UErrorCode *status) |
| 410 { |
| 411 UCollator *result = fillIn; |
| 412 if(U_FAILURE(*status)) { |
| 413 return NULL; |
| 414 } |
| 415 /* |
| 416 if(base == NULL) { |
| 417 // we don't support null base yet |
| 418 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 419 return NULL; |
| 420 } |
| 421 */ |
| 422 // We need these and we could be running without UCA |
| 423 uprv_uca_initImplicitConstants(status); |
| 424 UCATableHeader *colData = (UCATableHeader *)bin; |
| 425 // do we want version check here? We're trying to figure out whether collato
rs are compatible |
| 426 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeo
f(UVersionInfo)) != 0 || |
| 427 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersio
nInfo)) != 0)) || |
| 428 colData->version[0] != UCOL_BUILDER_VERSION) |
| 429 { |
| 430 *status = U_COLLATOR_VERSION_MISMATCH; |
| 431 return NULL; |
| 432 } |
| 433 else { |
| 434 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(s
izeof(UColOptionSet)))) { |
| 435 result = ucol_initCollator((const UCATableHeader *)bin, result, base
, status); |
| 436 if(U_FAILURE(*status)){ |
| 437 return NULL; |
| 438 } |
| 439 result->hasRealData = TRUE; |
| 440 } |
| 441 else { |
| 442 if(base) { |
| 443 result = ucol_initCollator(base->image, result, base, status); |
| 444 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const
UCATableHeader *)bin)->options), status); |
| 445 if(U_FAILURE(*status)){ |
| 446 return NULL; |
| 447 } |
| 448 result->hasRealData = FALSE; |
| 449 } |
| 450 else { |
| 451 *status = U_USELESS_COLLATOR_ERROR; |
| 452 return NULL; |
| 453 } |
| 454 } |
| 455 result->freeImageOnClose = FALSE; |
| 456 } |
| 457 result->actualLocale = NULL; |
| 458 result->validLocale = NULL; |
| 459 result->requestedLocale = NULL; |
| 460 result->rules = NULL; |
| 461 result->rulesLength = 0; |
| 462 result->freeRulesOnClose = FALSE; |
| 463 result->ucaRules = NULL; |
| 464 return result; |
| 465 } |
| 466 |
| 467 U_CAPI UCollator* U_EXPORT2 |
| 468 ucol_openBinary(const uint8_t *bin, int32_t length, |
| 469 const UCollator *base, |
| 470 UErrorCode *status) |
| 471 { |
| 472 return ucol_initFromBinary(bin, length, base, NULL, status); |
| 473 } |
| 474 |
| 475 U_CAPI int32_t U_EXPORT2 |
| 476 ucol_cloneBinary(const UCollator *coll, |
| 477 uint8_t *buffer, int32_t capacity, |
| 478 UErrorCode *status) |
| 479 { |
| 480 int32_t length = 0; |
| 481 if(U_FAILURE(*status)) { |
| 482 return length; |
| 483 } |
| 484 if(capacity < 0) { |
| 485 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 486 return length; |
| 487 } |
| 488 if(coll->hasRealData == TRUE) { |
| 489 length = coll->image->size; |
| 490 if(length <= capacity) { |
| 491 uprv_memcpy(buffer, coll->image, length); |
| 492 } else { |
| 493 *status = U_BUFFER_OVERFLOW_ERROR; |
| 494 } |
| 495 } else { |
| 496 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(
UColOptionSet))); |
| 497 if(length <= capacity) { |
| 498 /* build the UCATableHeader with minimal entries */ |
| 499 /* do not copy the header from the UCA file because its values are w
rong! */ |
| 500 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
| 501 |
| 502 /* reset everything */ |
| 503 uprv_memset(buffer, 0, length); |
| 504 |
| 505 /* set the tailoring-specific values */ |
| 506 UCATableHeader *myData = (UCATableHeader *)buffer; |
| 507 myData->size = length; |
| 508 |
| 509 /* offset for the options, the only part of the data that is present
after the header */ |
| 510 myData->options = sizeof(UCATableHeader); |
| 511 |
| 512 /* need to always set the expansion value for an upper bound of the
options */ |
| 513 myData->expansion = myData->options + sizeof(UColOptionSet); |
| 514 |
| 515 myData->magic = UCOL_HEADER_MAGIC; |
| 516 myData->isBigEndian = U_IS_BIG_ENDIAN; |
| 517 myData->charSetFamily = U_CHARSET_FAMILY; |
| 518 |
| 519 /* copy UCA's version; genrb will override all but the builder versi
on with tailoring data */ |
| 520 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn
fo)); |
| 521 |
| 522 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer
sionInfo)); |
| 523 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer
sionInfo)); |
| 524 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo
f(UVersionInfo)); |
| 525 myData->jamoSpecial = coll->image->jamoSpecial; |
| 526 |
| 527 /* copy the collator options */ |
| 528 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options
, sizeof(UColOptionSet)); |
| 529 } else { |
| 530 *status = U_BUFFER_OVERFLOW_ERROR; |
| 531 } |
| 532 } |
| 533 return length; |
| 534 } |
| 535 |
| 536 U_CAPI UCollator* U_EXPORT2 |
| 537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize,
UErrorCode *status) |
| 538 { |
| 539 UCollator * localCollator; |
| 540 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); |
| 541 char *stackBufferChars = (char *)stackBuffer; |
| 542 int32_t imageSize = 0; |
| 543 int32_t rulesSize = 0; |
| 544 int32_t rulesPadding = 0; |
| 545 uint8_t *image; |
| 546 UChar *rules; |
| 547 UBool colAllocated = FALSE; |
| 548 UBool imageAllocated = FALSE; |
| 549 |
| 550 if (status == NULL || U_FAILURE(*status)){ |
| 551 return 0; |
| 552 } |
| 553 if ((stackBuffer && !pBufferSize) || !coll){ |
| 554 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 555 return 0; |
| 556 } |
| 557 if (coll->rules && coll->freeRulesOnClose) { |
| 558 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); |
| 559 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); |
| 560 bufferSizeNeeded += rulesSize + rulesPadding; |
| 561 } |
| 562 |
| 563 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set neede
d size into *pBufferSize */ |
| 564 *pBufferSize = bufferSizeNeeded; |
| 565 return 0; |
| 566 } |
| 567 |
| 568 /* Pointers on 64-bit platforms need to be aligned |
| 569 * on a 64-bit boundry in memory. |
| 570 */ |
| 571 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { |
| 572 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); |
| 573 if (*pBufferSize > offsetUp) { |
| 574 *pBufferSize -= offsetUp; |
| 575 stackBufferChars += offsetUp; |
| 576 } |
| 577 else { |
| 578 /* prevent using the stack buffer but keep the size > 0 so that we d
o not just preflight */ |
| 579 *pBufferSize = 1; |
| 580 } |
| 581 } |
| 582 stackBuffer = (void *)stackBufferChars; |
| 583 |
| 584 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { |
| 585 /* allocate one here...*/ |
| 586 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); |
| 587 // Null pointer check. |
| 588 if (stackBufferChars == NULL) { |
| 589 *status = U_MEMORY_ALLOCATION_ERROR; |
| 590 return NULL; |
| 591 } |
| 592 colAllocated = TRUE; |
| 593 if (U_SUCCESS(*status)) { |
| 594 *status = U_SAFECLONE_ALLOCATED_WARNING; |
| 595 } |
| 596 } |
| 597 localCollator = (UCollator *)stackBufferChars; |
| 598 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); |
| 599 { |
| 600 UErrorCode tempStatus = U_ZERO_ERROR; |
| 601 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); |
| 602 } |
| 603 if (coll->freeImageOnClose) { |
| 604 image = (uint8_t *)uprv_malloc(imageSize); |
| 605 // Null pointer check |
| 606 if (image == NULL) { |
| 607 *status = U_MEMORY_ALLOCATION_ERROR; |
| 608 return NULL; |
| 609 } |
| 610 ucol_cloneBinary(coll, image, imageSize, status); |
| 611 imageAllocated = TRUE; |
| 612 } |
| 613 else { |
| 614 image = (uint8_t *)coll->image; |
| 615 } |
| 616 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollat
or, status); |
| 617 if (U_FAILURE(*status)) { |
| 618 return NULL; |
| 619 } |
| 620 |
| 621 if (coll->rules) { |
| 622 if (coll->freeRulesOnClose) { |
| 623 localCollator->rules = u_strcpy(rules, coll->rules); |
| 624 //bufferEnd += rulesSize; |
| 625 } |
| 626 else { |
| 627 localCollator->rules = coll->rules; |
| 628 } |
| 629 localCollator->freeRulesOnClose = FALSE; |
| 630 localCollator->rulesLength = coll->rulesLength; |
| 631 } |
| 632 |
| 633 int32_t i; |
| 634 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { |
| 635 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(col
l, (UColAttribute)i, status), status); |
| 636 } |
| 637 // zero copies of pointers |
| 638 localCollator->actualLocale = NULL; |
| 639 localCollator->validLocale = NULL; |
| 640 localCollator->requestedLocale = NULL; |
| 641 localCollator->ucaRules = coll->ucaRules; // There should only be one copy h
ere. |
| 642 localCollator->freeOnClose = colAllocated; |
| 643 localCollator->freeImageOnClose = imageAllocated; |
| 644 return localCollator; |
| 645 } |
| 646 |
| 647 U_CAPI void U_EXPORT2 |
| 648 ucol_close(UCollator *coll) |
| 649 { |
| 650 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
| 651 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
| 652 if(coll != NULL) { |
| 653 // these are always owned by each UCollator struct, |
| 654 // so we always free them |
| 655 if(coll->validLocale != NULL) { |
| 656 uprv_free(coll->validLocale); |
| 657 } |
| 658 if(coll->actualLocale != NULL) { |
| 659 uprv_free(coll->actualLocale); |
| 660 } |
| 661 if(coll->requestedLocale != NULL) { |
| 662 uprv_free(coll->requestedLocale); |
| 663 } |
| 664 if(coll->latinOneCEs != NULL) { |
| 665 uprv_free(coll->latinOneCEs); |
| 666 } |
| 667 if(coll->options != NULL && coll->freeOptionsOnClose) { |
| 668 uprv_free(coll->options); |
| 669 } |
| 670 if(coll->rules != NULL && coll->freeRulesOnClose) { |
| 671 uprv_free((UChar *)coll->rules); |
| 672 } |
| 673 if(coll->image != NULL && coll->freeImageOnClose) { |
| 674 uprv_free((UCATableHeader *)coll->image); |
| 675 } |
| 676 if(coll->leadBytePermutationTable != NULL) { |
| 677 uprv_free(coll->leadBytePermutationTable); |
| 678 } |
| 679 if(coll->reorderCodes != NULL) { |
| 680 uprv_free(coll->reorderCodes); |
| 681 } |
| 682 |
| 683 /* Here, it would be advisable to close: */ |
| 684 /* - UData for UCA (unless we stuff it in the root resb */ |
| 685 /* Again, do we need additional housekeeping... HMMM! */ |
| 686 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); |
| 687 if(coll->freeOnClose){ |
| 688 /* for safeClone, if freeOnClose is FALSE, |
| 689 don't free the other instance data */ |
| 690 uprv_free(coll); |
| 691 } |
| 692 } |
| 693 UTRACE_EXIT(); |
| 694 } |
| 695 |
| 696 /* This one is currently used by genrb & tests. After constructing from rules (t
ailoring),*/ |
| 697 /* you should be able to get the binary chunk to write out... Doesn't look very
full now */ |
| 698 U_CFUNC uint8_t* U_EXPORT2 |
| 699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) |
| 700 { |
| 701 uint8_t *result = NULL; |
| 702 if(U_FAILURE(*status)) { |
| 703 return NULL; |
| 704 } |
| 705 if(coll->hasRealData == TRUE) { |
| 706 *length = coll->image->size; |
| 707 result = (uint8_t *)uprv_malloc(*length); |
| 708 /* test for NULL */ |
| 709 if (result == NULL) { |
| 710 *status = U_MEMORY_ALLOCATION_ERROR; |
| 711 return NULL; |
| 712 } |
| 713 uprv_memcpy(result, coll->image, *length); |
| 714 } else { |
| 715 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof
(UColOptionSet))); |
| 716 result = (uint8_t *)uprv_malloc(*length); |
| 717 /* test for NULL */ |
| 718 if (result == NULL) { |
| 719 *status = U_MEMORY_ALLOCATION_ERROR; |
| 720 return NULL; |
| 721 } |
| 722 |
| 723 /* build the UCATableHeader with minimal entries */ |
| 724 /* do not copy the header from the UCA file because its values are wrong
! */ |
| 725 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
| 726 |
| 727 /* reset everything */ |
| 728 uprv_memset(result, 0, *length); |
| 729 |
| 730 /* set the tailoring-specific values */ |
| 731 UCATableHeader *myData = (UCATableHeader *)result; |
| 732 myData->size = *length; |
| 733 |
| 734 /* offset for the options, the only part of the data that is present aft
er the header */ |
| 735 myData->options = sizeof(UCATableHeader); |
| 736 |
| 737 /* need to always set the expansion value for an upper bound of the opti
ons */ |
| 738 myData->expansion = myData->options + sizeof(UColOptionSet); |
| 739 |
| 740 myData->magic = UCOL_HEADER_MAGIC; |
| 741 myData->isBigEndian = U_IS_BIG_ENDIAN; |
| 742 myData->charSetFamily = U_CHARSET_FAMILY; |
| 743 |
| 744 /* copy UCA's version; genrb will override all but the builder version w
ith tailoring data */ |
| 745 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo))
; |
| 746 |
| 747 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersion
Info)); |
| 748 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersion
Info)); |
| 749 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UV
ersionInfo)); |
| 750 myData->jamoSpecial = coll->image->jamoSpecial; |
| 751 |
| 752 /* copy the collator options */ |
| 753 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, si
zeof(UColOptionSet)); |
| 754 } |
| 755 return result; |
| 756 } |
| 757 |
| 758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo
de *status) { |
| 759 if(U_FAILURE(*status)) { |
| 760 return; |
| 761 } |
| 762 result->caseFirst = (UColAttributeValue)opts->caseFirst; |
| 763 result->caseLevel = (UColAttributeValue)opts->caseLevel; |
| 764 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; |
| 765 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; |
| 766 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { |
| 767 return; |
| 768 } |
| 769 result->strength = (UColAttributeValue)opts->strength; |
| 770 result->variableTopValue = opts->variableTopValue; |
| 771 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; |
| 772 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; |
| 773 result->numericCollation = (UColAttributeValue)opts->numericCollation; |
| 774 result->caseFirstisDefault = TRUE; |
| 775 result->caseLevelisDefault = TRUE; |
| 776 result->frenchCollationisDefault = TRUE; |
| 777 result->normalizationModeisDefault = TRUE; |
| 778 result->strengthisDefault = TRUE; |
| 779 result->variableTopValueisDefault = TRUE; |
| 780 result->alternateHandlingisDefault = TRUE; |
| 781 result->hiraganaQisDefault = TRUE; |
| 782 result->numericCollationisDefault = TRUE; |
| 783 |
| 784 ucol_updateInternalState(result, status); |
| 785 |
| 786 result->options = opts; |
| 787 } |
| 788 |
| 789 |
| 790 /** |
| 791 * Approximate determination if a character is at a contraction end. |
| 792 * Guaranteed to be TRUE if a character is at the end of a contraction, |
| 793 * otherwise it is not deterministic. |
| 794 * @param c character to be determined |
| 795 * @param coll collator |
| 796 */ |
| 797 static |
| 798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { |
| 799 if (c < coll->minContrEndCP) { |
| 800 return FALSE; |
| 801 } |
| 802 |
| 803 int32_t hash = c; |
| 804 uint8_t htbyte; |
| 805 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { |
| 806 if (U16_IS_TRAIL(c)) { |
| 807 return TRUE; |
| 808 } |
| 809 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; |
| 810 } |
| 811 htbyte = coll->contrEndCP[hash>>3]; |
| 812 return (((htbyte >> (hash & 7)) & 1) == 1); |
| 813 } |
| 814 |
| 815 |
| 816 |
| 817 /* |
| 818 * i_getCombiningClass() |
| 819 * A fast, at least partly inline version of u_getCombiningClass() |
| 820 * This is a candidate for further optimization. Used heavily |
| 821 * in contraction processing. |
| 822 */ |
| 823 static |
| 824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { |
| 825 uint8_t sCC = 0; |
| 826 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { |
| 827 sCC = u_getCombiningClass(c); |
| 828 } |
| 829 return sCC; |
| 830 } |
| 831 |
| 832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
st UCollator *UCA, UErrorCode *status) { |
| 833 UChar c; |
| 834 UCollator *result = fillIn; |
| 835 if(U_FAILURE(*status) || image == NULL) { |
| 836 return NULL; |
| 837 } |
| 838 |
| 839 if(result == NULL) { |
| 840 result = (UCollator *)uprv_malloc(sizeof(UCollator)); |
| 841 if(result == NULL) { |
| 842 *status = U_MEMORY_ALLOCATION_ERROR; |
| 843 return result; |
| 844 } |
| 845 result->freeOnClose = TRUE; |
| 846 } else { |
| 847 result->freeOnClose = FALSE; |
| 848 } |
| 849 |
| 850 result->image = image; |
| 851 result->mapping.getFoldingOffset = _getFoldingOffset; |
| 852 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosit
ion; |
| 853 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE -
result->image->mappingPosition, status); |
| 854 if(U_FAILURE(*status)) { |
| 855 if(result->freeOnClose == TRUE) { |
| 856 uprv_free(result); |
| 857 result = NULL; |
| 858 } |
| 859 return result; |
| 860 } |
| 861 |
| 862 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); |
| 863 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->
contractionCEs); |
| 864 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->c
ontractionIndex); |
| 865 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expan
sion); |
| 866 result->rules = NULL; |
| 867 result->rulesLength = 0; |
| 868 result->freeRulesOnClose = FALSE; |
| 869 result->reorderCodes = NULL; |
| 870 result->reorderCodesLength = 0; |
| 871 result->leadBytePermutationTable = NULL; |
| 872 |
| 873 /* get the version info from UCATableHeader and populate the Collator struct
*/ |
| 874 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ |
| 875 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v
ersion*/ |
| 876 result->dataVersion[2] = 0; |
| 877 result->dataVersion[3] = 0; |
| 878 |
| 879 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; |
| 880 result->minUnsafeCP = 0; |
| 881 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. |
| 882 if (ucol_unsafeCP(c, result)) break; |
| 883 } |
| 884 result->minUnsafeCP = c; |
| 885 |
| 886 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; |
| 887 result->minContrEndCP = 0; |
| 888 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. |
| 889 if (ucol_contractionEndCP(c, result)) break; |
| 890 } |
| 891 result->minContrEndCP = c; |
| 892 |
| 893 /* max expansion tables */ |
| 894 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + |
| 895 result->image->endExpansionCE); |
| 896 result->lastEndExpansionCE = result->endExpansionCE + |
| 897 result->image->endExpansionCECount - 1; |
| 898 result->expansionCESize = (uint8_t*)result->image + |
| 899 result->image->expansionCESize; |
| 900 |
| 901 |
| 902 //result->errorCode = *status; |
| 903 |
| 904 result->latinOneCEs = NULL; |
| 905 |
| 906 result->latinOneRegenTable = FALSE; |
| 907 result->latinOneFailed = FALSE; |
| 908 result->UCA = UCA; |
| 909 |
| 910 /* Normally these will be set correctly later. This is the default if you us
e UCA or the default. */ |
| 911 result->ucaRules = NULL; |
| 912 result->actualLocale = NULL; |
| 913 result->validLocale = NULL; |
| 914 result->requestedLocale = NULL; |
| 915 result->hasRealData = FALSE; // real data lives in .dat file... |
| 916 result->freeImageOnClose = FALSE; |
| 917 |
| 918 /* set attributes */ |
| 919 ucol_setOptionsFromHeader( |
| 920 result, |
| 921 (UColOptionSet*)((uint8_t*)result->image+result->image->options), |
| 922 status); |
| 923 result->freeOptionsOnClose = FALSE; |
| 924 |
| 925 return result; |
| 926 } |
| 927 |
| 928 /* new Mark's code */ |
| 929 |
| 930 /** |
| 931 * For generation of Implicit CEs |
| 932 * @author Davis |
| 933 * |
| 934 * Cleaned up so that changes can be made more easily. |
| 935 * Old values: |
| 936 # First Implicit: E26A792D |
| 937 # Last Implicit: E3DC70C0 |
| 938 # First CJK: E0030300 |
| 939 # Last CJK: E0A9DD00 |
| 940 # First CJK_A: E0A9DF00 |
| 941 # Last CJK_A: E0DE3100 |
| 942 */ |
| 943 /* Following is a port of Mark's code for new treatment of implicits. |
| 944 * It is positioned here, since ucol_initUCA need to initialize the |
| 945 * variables below according to the data in the fractional UCA. |
| 946 */ |
| 947 |
| 948 /** |
| 949 * Function used to: |
| 950 * a) collapse the 2 different Han ranges from UCA into one (in the right order)
, and |
| 951 * b) bump any non-CJK characters by 110000 (NON_CJK_OFFSET). |
| 952 * The relevant blocks are: |
| 953 * A: 4E00..9FFF; CJK Unified Ideographs |
| 954 * F900..FAFF; CJK Compatibility Ideographs |
| 955 * B: 3400..4DBF; CJK Unified Ideographs Extension A |
| 956 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) |
| 957 * As long as |
| 958 * no new B characters are allocated between 4E00 and FAFF, and |
| 959 * no new A characters are outside of this range, |
| 960 * (very high probability) this simple code will work. |
| 961 * The reordered blocks are: |
| 962 * Block1 is CJK |
| 963 * Block2 is CJK_COMPAT_USED |
| 964 * Block3 is CJK_A |
| 965 * (all contiguous) |
| 966 * Any other CJK gets its normal code point |
| 967 * Any non-CJK gets +110000 (NON_CJK_OFFSET) |
| 968 * When we reorder Block1, we make sure that it is at the very start, |
| 969 * so that it will use a 3-byte form. |
| 970 * Warning: we only pick up the compatibility characters that are |
| 971 * NOT decomposed, so that block is smaller! |
| 972 */ |
| 973 |
| 974 // CONSTANTS |
| 975 static const UChar32 |
| 976 NON_CJK_OFFSET = 0x110000, |
| 977 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
| 978 |
/**
 * Parameters of the implicit-weight byte layout.
 * All start at zero and are filled in by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
| 996 |
| 997 static const UChar32 |
| 998 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; |
| 999 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; |
| 1000 CJK_BASE = 0x4E00, |
| 1001 CJK_LIMIT = 0x9FCB+1, |
| 1002 // Unified CJK ideographs in the compatibility ideographs block. |
| 1003 CJK_COMPAT_USED_BASE = 0xFA0E, |
| 1004 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, |
| 1005 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; |
| 1006 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; |
| 1007 CJK_A_BASE = 0x3400, |
| 1008 CJK_A_LIMIT = 0x4DB5+1, |
| 1009 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; |
| 1010 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; |
| 1011 CJK_B_BASE = 0x20000, |
| 1012 CJK_B_LIMIT = 0x2A6D6+1, |
| 1013 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; |
| 1014 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; |
| 1015 CJK_C_BASE = 0x2A700, |
| 1016 CJK_C_LIMIT = 0x2B734+1, |
| 1017 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; |
| 1018 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; |
| 1019 CJK_D_BASE = 0x2B740, |
| 1020 CJK_D_LIMIT = 0x2B81D+1; |
| 1021 // when adding to this list, look for all occurrences (in project) |
| 1022 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing
!!!! |
| 1023 |
| 1024 static UChar32 swapCJK(UChar32 i) { |
| 1025 if (i < CJK_A_BASE) { |
| 1026 // non-CJK |
| 1027 } else if (i < CJK_A_LIMIT) { |
| 1028 // Extension A has lower code points than the original Unihan+compat |
| 1029 // but sorts higher. |
| 1030 return i - CJK_A_BASE |
| 1031 + (CJK_LIMIT - CJK_BASE) |
| 1032 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| 1033 } else if (i < CJK_BASE) { |
| 1034 // non-CJK |
| 1035 } else if (i < CJK_LIMIT) { |
| 1036 return i - CJK_BASE; |
| 1037 } else if (i < CJK_COMPAT_USED_BASE) { |
| 1038 // non-CJK |
| 1039 } else if (i < CJK_COMPAT_USED_LIMIT) { |
| 1040 return i - CJK_COMPAT_USED_BASE |
| 1041 + (CJK_LIMIT - CJK_BASE); |
| 1042 } else if (i < CJK_B_BASE) { |
| 1043 // non-CJK |
| 1044 } else if (i < CJK_B_LIMIT) { |
| 1045 return i; // non-BMP-CJK |
| 1046 } else if (i < CJK_C_BASE) { |
| 1047 // non-CJK |
| 1048 } else if (i < CJK_C_LIMIT) { |
| 1049 return i; // non-BMP-CJK |
| 1050 } else if (i < CJK_D_BASE) { |
| 1051 // non-CJK |
| 1052 } else if (i < CJK_D_LIMIT) { |
| 1053 return i; // non-BMP-CJK |
| 1054 } |
| 1055 return i + NON_CJK_OFFSET; // non-CJK |
| 1056 } |
| 1057 |
| 1058 U_CAPI UChar32 U_EXPORT2 |
| 1059 uprv_uca_getRawFromCodePoint(UChar32 i) { |
| 1060 return swapCJK(i)+1; |
| 1061 } |
| 1062 |
| 1063 U_CAPI UChar32 U_EXPORT2 |
| 1064 uprv_uca_getCodePointFromRaw(UChar32 i) { |
| 1065 i--; |
| 1066 UChar32 result = 0; |
| 1067 if(i >= NON_CJK_OFFSET) { |
| 1068 result = i - NON_CJK_OFFSET; |
| 1069 } else if(i >= CJK_B_BASE) { |
| 1070 result = i; |
| 1071 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted |
| 1072 if(i < CJK_LIMIT - CJK_BASE) { |
| 1073 result = i + CJK_BASE; |
| 1074 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP
AT_USED_BASE)) { |
| 1075 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); |
| 1076 } else { |
| 1077 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_
LIMIT - CJK_COMPAT_USED_BASE); |
| 1078 } |
| 1079 } else { |
| 1080 result = -1; |
| 1081 } |
| 1082 return result; |
| 1083 } |
| 1084 |
| 1085 // GET IMPLICIT PRIMARY WEIGHTS |
| 1086 // Return value is left justified primary key |
| 1087 U_CAPI uint32_t U_EXPORT2 |
| 1088 uprv_uca_getImplicitFromRaw(UChar32 cp) { |
| 1089 /* |
| 1090 if (cp < 0 || cp > UCOL_MAX_INPUT) { |
| 1091 throw new IllegalArgumentException("Code point out of range " + Utility.
hex(cp)); |
| 1092 } |
| 1093 */ |
| 1094 int32_t last0 = cp - min4Boundary; |
| 1095 if (last0 < 0) { |
| 1096 int32_t last1 = cp / final3Count; |
| 1097 last0 = cp % final3Count; |
| 1098 |
| 1099 int32_t last2 = last1 / medialCount; |
| 1100 last1 %= medialCount; |
| 1101 |
| 1102 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at
start |
| 1103 last1 = minTrail + last1; // offset |
| 1104 last2 = min3Primary + last2; // offset |
| 1105 /* |
| 1106 if (last2 >= min4Primary) { |
| 1107 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last2)); |
| 1108 } |
| 1109 */ |
| 1110 return (last2 << 24) + (last1 << 16) + (last0 << 8); |
| 1111 } else { |
| 1112 int32_t last1 = last0 / final4Count; |
| 1113 last0 %= final4Count; |
| 1114 |
| 1115 int32_t last2 = last1 / medialCount; |
| 1116 last1 %= medialCount; |
| 1117 |
| 1118 int32_t last3 = last2 / medialCount; |
| 1119 last2 %= medialCount; |
| 1120 |
| 1121 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at
start |
| 1122 last1 = minTrail + last1; // offset |
| 1123 last2 = minTrail + last2; // offset |
| 1124 last3 = min4Primary + last3; // offset |
| 1125 /* |
| 1126 if (last3 > max4Primary) { |
| 1127 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last3)); |
| 1128 } |
| 1129 */ |
| 1130 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
| 1131 } |
| 1132 } |
| 1133 |
| 1134 static uint32_t U_EXPORT2 |
| 1135 uprv_uca_getImplicitPrimary(UChar32 cp) { |
| 1136 //fprintf(stdout, "Incoming: %04x\n", cp); |
| 1137 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); |
| 1138 |
| 1139 cp = swapCJK(cp); |
| 1140 cp++; |
| 1141 // we now have a range of numbers from 0 to 21FFFF. |
| 1142 |
| 1143 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); |
| 1144 //fprintf(stdout, "CJK swapped: %04x\n", cp); |
| 1145 |
| 1146 return uprv_uca_getImplicitFromRaw(cp); |
| 1147 } |
| 1148 |
| 1149 /** |
| 1150 * Converts implicit CE into raw integer ("code point") |
| 1151 * @param implicit |
| 1152 * @return -1 if illegal format |
| 1153 */ |
| 1154 U_CAPI UChar32 U_EXPORT2 |
| 1155 uprv_uca_getRawFromImplicit(uint32_t implicit) { |
| 1156 UChar32 result; |
| 1157 UChar32 b3 = implicit & 0xFF; |
| 1158 UChar32 b2 = (implicit >> 8) & 0xFF; |
| 1159 UChar32 b1 = (implicit >> 16) & 0xFF; |
| 1160 UChar32 b0 = (implicit >> 24) & 0xFF; |
| 1161 |
| 1162 // simple parameter checks |
| 1163 if (b0 < min3Primary || b0 > max4Primary |
| 1164 || b1 < minTrail || b1 > maxTrail) |
| 1165 return -1; |
| 1166 // normal offsets |
| 1167 b1 -= minTrail; |
| 1168 |
| 1169 // take care of the final values, and compose |
| 1170 if (b0 < min4Primary) { |
| 1171 if (b2 < minTrail || b2 > max3Trail || b3 != 0) |
| 1172 return -1; |
| 1173 b2 -= minTrail; |
| 1174 UChar32 remainder = b2 % final3Multiplier; |
| 1175 if (remainder != 0) |
| 1176 return -1; |
| 1177 b0 -= min3Primary; |
| 1178 b2 /= final3Multiplier; |
| 1179 result = ((b0 * medialCount) + b1) * final3Count + b2; |
| 1180 } else { |
| 1181 if (b2 < minTrail || b2 > maxTrail |
| 1182 || b3 < minTrail || b3 > max4Trail) |
| 1183 return -1; |
| 1184 b2 -= minTrail; |
| 1185 b3 -= minTrail; |
| 1186 UChar32 remainder = b3 % final4Multiplier; |
| 1187 if (remainder != 0) |
| 1188 return -1; |
| 1189 b3 /= final4Multiplier; |
| 1190 b0 -= min4Primary; |
| 1191 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count +
b3 + min4Boundary; |
| 1192 } |
| 1193 // final check |
| 1194 if (result < 0 || result > UCOL_MAX_INPUT) |
| 1195 return -1; |
| 1196 return result; |
| 1197 } |
| 1198 |
| 1199 |
/* Quotient of a by b rounded up, computed as (a-1)/b + 1; intended for a >= 1, b >= 1. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
| 1203 |
| 1204 /* this function is either called from initUCA or from genUCA before |
| 1205 * doing canonical closure for the UCA. |
| 1206 */ |
| 1207 |
| 1208 /** |
| 1209 * Set up to generate implicits. |
| 1210 * Maintenance Note: this function may end up being called more than once, due |
| 1211 * to threading races during initialization. Make sure that |
| 1212 * none of the Constants is ever transiently assigned an |
| 1213 * incorrect value. |
| 1214 * @param minPrimary |
| 1215 * @param maxPrimary |
| 1216 * @param minTrail final byte |
| 1217 * @param maxTrail final byte |
| 1218 * @param gap3 the gap we leave for tailoring for 3-byte forms |
| 1219 * @param gap4 the gap we leave for tailoring for 4-byte forms |
| 1220 */ |
| 1221 static void initImplicitConstants(int minPrimary, int maxPrimary, |
| 1222 int minTrailIn, int maxTrailIn, |
| 1223 int gap3, int primaries3count, |
| 1224 UErrorCode *status) { |
| 1225 // some simple parameter checks |
| 1226 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) |
| 1227 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) |
| 1228 || (primaries3count < 1)) |
| 1229 { |
| 1230 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1231 return; |
| 1232 }; |
| 1233 |
| 1234 minTrail = minTrailIn; |
| 1235 maxTrail = maxTrailIn; |
| 1236 |
| 1237 min3Primary = minPrimary; |
| 1238 max4Primary = maxPrimary; |
| 1239 // compute constants for use later. |
| 1240 // number of values we can use in trailing bytes |
| 1241 // leave room for empty values between AND above, e.g. if gap = 2 |
| 1242 // range 3..7 => +3 -4 -5 -6 -7: so 1 value |
| 1243 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values |
| 1244 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values |
| 1245 final3Multiplier = gap3 + 1; |
| 1246 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; |
| 1247 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; |
| 1248 |
| 1249 // medials can use full range |
| 1250 medialCount = (maxTrail - minTrail + 1); |
| 1251 // find out how many values fit in each form |
| 1252 int32_t threeByteCount = medialCount * final3Count; |
| 1253 // now determine where the 3/4 boundary is. |
| 1254 // we use 3 bytes below the boundary, and 4 above |
| 1255 int32_t primariesAvailable = maxPrimary - minPrimary + 1; |
| 1256 int32_t primaries4count = primariesAvailable - primaries3count; |
| 1257 |
| 1258 |
| 1259 int32_t min3ByteCoverage = primaries3count * threeByteCount; |
| 1260 min4Primary = minPrimary + primaries3count; |
| 1261 min4Boundary = min3ByteCoverage; |
| 1262 // Now expand out the multiplier for the 4 bytes, and redo. |
| 1263 |
| 1264 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; |
| 1265 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count
); |
| 1266 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo
unt * medialCount); |
| 1267 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; |
| 1268 if (gap4 < 1) { |
| 1269 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1270 return; |
| 1271 } |
| 1272 final4Multiplier = gap4 + 1; |
| 1273 final4Count = neededPerFinalByte; |
| 1274 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; |
| 1275 } |
| 1276 |
| 1277 /** |
| 1278 * Supply parameters for generating implicit CEs |
| 1279 */ |
| 1280 U_CAPI void U_EXPORT2 |
| 1281 uprv_uca_initImplicitConstants(UErrorCode *status) { |
| 1282 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms
. |
| 1283 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); |
| 1284 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1,
1, status); |
| 1285 } |
| 1286 |
| 1287 |
| 1288 /* collIterNormalize Incremental Normalization happens here.
*/ |
| 1289 /* pick up the range of chars identifed by FCD,
*/ |
| 1290 /* normalize it into the collIterate's writable buffer,
*/ |
| 1291 /* switch the collIterate's state to use the writable b
uffer. */ |
| 1292 /*
*/ |
| 1293 static |
| 1294 void collIterNormalize(collIterate *collationSource) |
| 1295 { |
| 1296 UErrorCode status = U_ZERO_ERROR; |
| 1297 const UChar *srcP = collationSource->pos - 1; /* Start of chars to nor
malize */ |
| 1298 const UChar *endP = collationSource->fcdPosition; /* End of region to norma
lize+1 */ |
| 1299 |
| 1300 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP -
srcP)), |
| 1301 collationSource->writableBuffer, |
| 1302 status); |
| 1303 if (U_FAILURE(status)) { |
| 1304 #ifdef UCOL_DEBUG |
| 1305 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro
rName(status)); |
| 1306 #endif |
| 1307 return; |
| 1308 } |
| 1309 |
| 1310 collationSource->pos = collationSource->writableBuffer.getTerminatedB
uffer(); |
| 1311 collationSource->origFlags = collationSource->flags; |
| 1312 collationSource->flags |= UCOL_ITER_INNORMBUF; |
| 1313 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE
_ITERATOR); |
| 1314 } |
| 1315 |
| 1316 |
| 1317 // This function takes the iterator and extracts normalized stuff up to the next
boundary |
| 1318 // It is similar in the end results to the collIterNormalize, but for the cases
when we |
| 1319 // use an iterator |
| 1320 /*static |
| 1321 inline void normalizeIterator(collIterate *collationSource) { |
| 1322 UErrorCode status = U_ZERO_ERROR; |
| 1323 UBool wasNormalized = FALSE; |
| 1324 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite
rator, UITER_CURRENT); |
| 1325 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter
ator); |
| 1326 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa
bleBuffer, |
| 1327 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); |
| 1328 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->w
ritableBufSize) { |
| 1329 // reallocate and terminate |
| 1330 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, |
| 1331 &collationSource->writableBuffer, |
| 1332 (int32_t *)&collationSource->writableBufSize, nor
mLen + 1, |
| 1333 0) |
| 1334 ) { |
| 1335 #ifdef UCOL_DEBUG |
| 1336 fprintf(stderr, "normalizeIterator(), out of memory\n"); |
| 1337 #endif |
| 1338 return; |
| 1339 } |
| 1340 status = U_ZERO_ERROR; |
| 1341 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE
R_ZERO); |
| 1342 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s
tatus); |
| 1343 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf
fer, |
| 1344 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); |
| 1345 } |
| 1346 // Terminate the buffer - we already checked that it is big enough |
| 1347 collationSource->writableBuffer[normLen] = 0; |
| 1348 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
| 1349 collationSource->flags |= UCOL_ITER_ALLOCATED; |
| 1350 } |
| 1351 collationSource->pos = collationSource->writableBuffer; |
| 1352 collationSource->origFlags = collationSource->flags; |
| 1353 collationSource->flags |= UCOL_ITER_INNORMBUF; |
| 1354 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_I
TERATOR); |
| 1355 }*/ |
| 1356 |
| 1357 |
| 1358 /* Incremental FCD check and normalize
*/ |
| 1359 /* Called from getNextCE when normalization state is suspect.
*/ |
| 1360 /* When entering, the state is known to be this:
*/ |
| 1361 /* o We are working in the main buffer of the collIterate, not the side
*/ |
| 1362 /* writable buffer. When in the side buffer, normalization mode is alw
ays off, */ |
| 1363 /* so we won't get here.
*/ |
| 1364 /* o The leading combining class from the current character is 0 or
*/ |
| 1365 /* the trailing combining class of the previous char was zero.
*/ |
| 1366 /* True because the previous call to this function will have always exi
ted */ |
| 1367 /* that way, and we get called for every char where cc might be non-zer
o. */ |
| 1368 static |
| 1369 inline UBool collIterFCD(collIterate *collationSource) { |
| 1370 const UChar *srcP, *endP; |
| 1371 uint8_t leadingCC; |
| 1372 uint8_t prevTrailingCC = 0; |
| 1373 uint16_t fcd; |
| 1374 UBool needNormalize = FALSE; |
| 1375 |
| 1376 srcP = collationSource->pos-1; |
| 1377 |
| 1378 if (collationSource->flags & UCOL_ITER_HASLEN) { |
| 1379 endP = collationSource->endp; |
| 1380 } else { |
| 1381 endP = NULL; |
| 1382 } |
| 1383 |
| 1384 // Get the trailing combining class of the current character. If it's zero, |
| 1385 // we are OK. |
| 1386 /* trie access */ |
| 1387 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); |
| 1388 if (fcd != 0) { |
| 1389 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| 1390 |
| 1391 if (prevTrailingCC != 0) { |
| 1392 // The current char has a non-zero trailing CC. Scan forward until
we find |
| 1393 // a char with a leading cc of zero. |
| 1394 while (endP == NULL || srcP != endP) |
| 1395 { |
| 1396 const UChar *savedSrcP = srcP; |
| 1397 |
| 1398 /* trie access */ |
| 1399 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); |
| 1400 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| 1401 if (leadingCC == 0) { |
| 1402 srcP = savedSrcP; // Hit char that is not part of combi
ning sequence. |
| 1403 // back up over it. (Could be surr
ogate pair!) |
| 1404 break; |
| 1405 } |
| 1406 |
| 1407 if (leadingCC < prevTrailingCC) { |
| 1408 needNormalize = TRUE; |
| 1409 } |
| 1410 |
| 1411 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| 1412 } |
| 1413 } |
| 1414 } |
| 1415 |
| 1416 collationSource->fcdPosition = (UChar *)srcP; |
| 1417 |
| 1418 return needNormalize; |
| 1419 } |
| 1420 |
| 1421 /****************************************************************************/ |
| 1422 /* Following are the CE retrieval functions */ |
| 1423 /* */ |
| 1424 /****************************************************************************/ |
| 1425 |
| 1426 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); |
| 1427 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); |
| 1428 |
| 1429 /* there should be a macro version of this function in the header file */ |
| 1430 /* This is the first function that tries to fetch a collation element */ |
| 1431 /* If it's not succesfull or it encounters a more difficult situation */ |
| 1432 /* some more sofisticated and slower functions are invoked */ |
| 1433 static |
| 1434 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
rce, UErrorCode *status) { |
| 1435 uint32_t order = 0; |
| 1436 if (collationSource->CEpos > collationSource->toReturn) { /* Are there
any CEs from previous expansions? */ |
| 1437 order = *(collationSource->toReturn++); /* if so
, return them */ |
| 1438 if(collationSource->CEpos == collationSource->toReturn) { |
| 1439 collationSource->CEpos = collationSource->toReturn = collationSource
->extendCEs ? collationSource->extendCEs : collationSource->CEs; |
| 1440 } |
| 1441 return order; |
| 1442 } |
| 1443 |
| 1444 UChar ch = 0; |
| 1445 collationSource->offsetReturn = NULL; |
| 1446 |
| 1447 for (;;) /* Loop handles case when incremental nor
malize switches */ |
| 1448 { /* to or from the side buffer / origina
l string, and we */ |
| 1449 /* need to start again to get the next character. */ |
| 1450 |
| 1451 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF |
UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) |
| 1452 { |
| 1453 // The source string is null terminated and we're not working from t
he side buffer, |
| 1454 // and we're not normalizing. This is the fast path. |
| 1455 // (We can be in the side buffer for Thai pre-vowel reordering eve
n when not normalizing.) |
| 1456 ch = *collationSource->pos++; |
| 1457 if (ch != 0) { |
| 1458 break; |
| 1459 } |
| 1460 else { |
| 1461 return UCOL_NO_MORE_CES; |
| 1462 } |
| 1463 } |
| 1464 |
| 1465 if (collationSource->flags & UCOL_ITER_HASLEN) { |
| 1466 // Normal path for strings when length is specified. |
| 1467 // (We can't be in side buffer because it is always null terminate
d.) |
| 1468 if (collationSource->pos >= collationSource->endp) { |
| 1469 // Ran off of the end of the main source string. We're done. |
| 1470 return UCOL_NO_MORE_CES; |
| 1471 } |
| 1472 ch = *collationSource->pos++; |
| 1473 } |
| 1474 else if(collationSource->flags & UCOL_USE_ITERATOR) { |
| 1475 UChar32 iterCh = collationSource->iterator->next(collationSource->it
erator); |
| 1476 if(iterCh == U_SENTINEL) { |
| 1477 return UCOL_NO_MORE_CES; |
| 1478 } |
| 1479 ch = (UChar)iterCh; |
| 1480 } |
| 1481 else |
| 1482 { |
| 1483 // Null terminated string. |
| 1484 ch = *collationSource->pos++; |
| 1485 if (ch == 0) { |
| 1486 // Ran off end of buffer. |
| 1487 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 1488 // Ran off end of main string. backing up one character. |
| 1489 collationSource->pos--; |
| 1490 return UCOL_NO_MORE_CES; |
| 1491 } |
| 1492 else |
| 1493 { |
| 1494 // Hit null in the normalize side buffer. |
| 1495 // Usually this means the end of the normalized data, |
| 1496 // except for one odd case: a null followed by combining cha
rs, |
| 1497 // which is the case if we are at the start of the buffer. |
| 1498 if (collationSource->pos == collationSource->writableBuffer.
getBuffer()+1) { |
| 1499 break; |
| 1500 } |
| 1501 |
| 1502 // Null marked end of side buffer. |
| 1503 // Revert to the main string and |
| 1504 // loop back to top to try again to get a character. |
| 1505 collationSource->pos = collationSource->fcdPosition; |
| 1506 collationSource->flags = collationSource->origFlags; |
| 1507 continue; |
| 1508 } |
| 1509 } |
| 1510 } |
| 1511 |
| 1512 if(collationSource->flags&UCOL_HIRAGANA_Q) { |
| 1513 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the
flag |
| 1514 * based on whether the previous codepoint was Hiragana or Katakana. |
| 1515 */ |
| 1516 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || |
| 1517 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x30
99 && ch <= 0x309C))) { |
| 1518 collationSource->flags |= UCOL_WAS_HIRAGANA; |
| 1519 } else { |
| 1520 collationSource->flags &= ~UCOL_WAS_HIRAGANA; |
| 1521 } |
| 1522 } |
| 1523 |
| 1524 // We've got a character. See if there's any fcd and/or normalization s
tuff to do. |
| 1525 // Note that UCOL_ITER_NORM flag is always zero when we are in the si
de buffer. |
| 1526 if ((collationSource->flags & UCOL_ITER_NORM) == 0) { |
| 1527 break; |
| 1528 } |
| 1529 |
| 1530 if (collationSource->fcdPosition >= collationSource->pos) { |
| 1531 // An earlier FCD check has already covered the current character. |
| 1532 // We can go ahead and process this char. |
| 1533 break; |
| 1534 } |
| 1535 |
| 1536 if (ch < ZERO_CC_LIMIT_ ) { |
| 1537 // Fast fcd safe path. Trailing combining class == 0. This char is
OK. |
| 1538 break; |
| 1539 } |
| 1540 |
| 1541 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| 1542 // We need to peek at the next character in order to tell if we are
FCD |
| 1543 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->
pos >= collationSource->endp) { |
| 1544 // We are at the last char of source string. |
| 1545 // It is always OK for FCD check. |
| 1546 break; |
| 1547 } |
| 1548 |
| 1549 // Not at last char of source string (or we'll check against termina
ting null). Do the FCD fast test |
| 1550 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| 1551 break; |
| 1552 } |
| 1553 } |
| 1554 |
| 1555 |
| 1556 // Need a more complete FCD check and possible normalization. |
| 1557 if (collIterFCD(collationSource)) { |
| 1558 collIterNormalize(collationSource); |
| 1559 } |
| 1560 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 1561 // No normalization was needed. Go ahead and process the char we a
lready had. |
| 1562 break; |
| 1563 } |
| 1564 |
| 1565 // Some normalization happened. Next loop iteration will pick up a char |
| 1566 // from the normalization buffer. |
| 1567 |
| 1568 } // end for (;;) |
| 1569 |
| 1570 |
| 1571 if (ch <= 0xFF) { |
| 1572 /* For latin-1 characters we never need to fall back to the UCA table
*/ |
| 1573 /* because all of the UCA data is replicated in the latinOneMapping a
rray */ |
| 1574 order = coll->latinOneMapping[ch]; |
| 1575 if (order > UCOL_NOT_FOUND) { |
| 1576 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, stat
us); |
| 1577 } |
| 1578 } |
| 1579 else |
| 1580 { |
| 1581 // Always use UCA for Han, Hangul |
| 1582 // (Han extension A is before main Han block) |
| 1583 // **** Han compatibility chars ?? **** |
| 1584 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && |
| 1585 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { |
| 1586 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { |
| 1587 // between the two target ranges; do normal lookup |
| 1588 // **** this range is YI, Modifier tone letters, **** |
| 1589 // **** Latin-D, Syloti Nagari, Phagas-pa. **** |
| 1590 // **** Latin-D might be tailored, so we need to **** |
| 1591 // **** do the normal lookup for these guys. **** |
| 1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| 1593 } else { |
| 1594 // in one of the target ranges; use UCA |
| 1595 order = UCOL_NOT_FOUND; |
| 1596 } |
| 1597 } else { |
| 1598 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| 1599 } |
| 1600 |
| 1601 if(order > UCOL_NOT_FOUND) { /* if
a CE is special */ |
| 1602 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, stat
us); /* and try to get the special CE */ |
| 1603 } |
| 1604 |
| 1605 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good
CE in the tailoring */ |
| 1606 /* if we got here, the codepoint MUST be over 0xFF - so we look dire
ctly in the trie */ |
| 1607 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| 1608 |
| 1609 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ |
| 1610 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSou
rce, status); |
| 1611 } |
| 1612 } |
| 1613 } |
| 1614 if(order == UCOL_NOT_FOUND) { |
| 1615 order = getImplicit(ch, collationSource); |
| 1616 } |
| 1617 return order; /* return the CE */ |
| 1618 } |
| 1619 |
| 1620 /* ucol_getNextCE, out-of-line version for use from other files. */ |
| 1621 U_CAPI uint32_t U_EXPORT2 |
| 1622 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *
status) { |
| 1623 return ucol_IGetNextCE(coll, collationSource, status); |
| 1624 } |
| 1625 |
| 1626 |
| 1627 /** |
| 1628 * Incremental previous normalization happens here. Pick up the range of chars |
| 1629 * identifed by FCD, normalize it into the collIterate's writable buffer, |
| 1630 * switch the collIterate's state to use the writable buffer. |
| 1631 * @param data collation iterator data |
| 1632 */ |
| 1633 static |
| 1634 void collPrevIterNormalize(collIterate *data) |
| 1635 { |
| 1636 UErrorCode status = U_ZERO_ERROR; |
| 1637 const UChar *pEnd = data->pos; /* End normalize + 1 */ |
| 1638 const UChar *pStart; |
| 1639 |
| 1640 /* Start normalize */ |
| 1641 if (data->fcdPosition == NULL) { |
| 1642 pStart = data->string; |
| 1643 } |
| 1644 else { |
| 1645 pStart = data->fcdPosition + 1; |
| 1646 } |
| 1647 |
| 1648 int32_t normLen = |
| 1649 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta
rt) + 1)), |
| 1650 data->writableBuffer, |
| 1651 status). |
| 1652 length(); |
| 1653 if(U_FAILURE(status)) { |
| 1654 return; |
| 1655 } |
| 1656 /* |
| 1657 this puts the null termination infront of the normalized string instead |
| 1658 of the end |
| 1659 */ |
| 1660 data->writableBuffer.insert(0, (UChar)0); |
| 1661 |
| 1662 /* |
| 1663 * The usual case at this point is that we've got a base |
| 1664 * character followed by marks that were normalized. If |
| 1665 * fcdPosition is NULL, that means that we backed up to |
| 1666 * the beginning of the string and there's no base character. |
| 1667 * |
| 1668 * Forward processing will usually normalize when it sees |
| 1669 * the first mark, so that mark will get it's natural offset |
| 1670 * and the rest will get the offset of the character following |
| 1671 * the marks. The base character will also get its natural offset. |
| 1672 * |
| 1673 * We write the offset of the base character, if there is one, |
| 1674 * followed by the offset of the first mark and then the offsets |
| 1675 * of the rest of the marks. |
| 1676 */ |
| 1677 int32_t firstMarkOffset = 0; |
| 1678 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); |
| 1679 int32_t trailCount = normLen - 1; |
| 1680 |
| 1681 if (data->fcdPosition != NULL) { |
| 1682 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); |
| 1683 UChar baseChar = *data->fcdPosition; |
| 1684 |
| 1685 firstMarkOffset = baseOffset + 1; |
| 1686 |
| 1687 /* |
| 1688 * If the base character is the start of a contraction, forward processi
ng |
| 1689 * will normalize the marks while checking for the contraction, which me
ans |
| 1690 * that the offset of the first mark will the same as the other marks. |
| 1691 * |
| 1692 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** |
| 1693 */ |
| 1694 if (baseChar >= 0x100) { |
| 1695 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas
eChar); |
| 1696 |
| 1697 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { |
| 1698 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas
eChar); |
| 1699 } |
| 1700 |
| 1701 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION
_TAG) { |
| 1702 firstMarkOffset = trailOffset; |
| 1703 } |
| 1704 } |
| 1705 |
| 1706 data->appendOffset(baseOffset, status); |
| 1707 } |
| 1708 |
| 1709 data->appendOffset(firstMarkOffset, status); |
| 1710 |
| 1711 for (int32_t i = 0; i < trailCount; i += 1) { |
| 1712 data->appendOffset(trailOffset, status); |
| 1713 } |
| 1714 |
| 1715 data->offsetRepeatValue = trailOffset; |
| 1716 |
| 1717 data->offsetReturn = data->offsetStore - 1; |
| 1718 if (data->offsetReturn == data->offsetBuffer) { |
| 1719 data->offsetStore = data->offsetBuffer; |
| 1720 } |
| 1721 |
| 1722 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; |
| 1723 data->origFlags = data->flags; |
| 1724 data->flags |= UCOL_ITER_INNORMBUF; |
| 1725 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| 1726 } |
| 1727 |
| 1728 |
| 1729 /** |
| 1730 * Incremental FCD check for previous iteration and normalize. Called from |
| 1731 * getPrevCE when normalization state is suspect. |
| 1732 * When entering, the state is known to be this: |
| 1733 * o We are working in the main buffer of the collIterate, not the side |
| 1734 * writable buffer. When in the side buffer, normalization mode is always |
| 1735 * off, so we won't get here. |
| 1736 * o The leading combining class from the current character is 0 or the |
| 1737 * trailing combining class of the previous char was zero. |
| 1738 * True because the previous call to this function will have always exited |
| 1739 * that way, and we get called for every char where cc might be non-zero. |
| 1740 * @param data collation iterate struct |
| 1741 * @return normalization status, TRUE for normalization to be done, FALSE |
| 1742 * otherwise |
| 1743 */ |
| 1744 static |
| 1745 inline UBool collPrevIterFCD(collIterate *data) |
| 1746 { |
| 1747 const UChar *src, *start; |
| 1748 uint8_t leadingCC; |
| 1749 uint8_t trailingCC = 0; |
| 1750 uint16_t fcd; |
| 1751 UBool result = FALSE; |
| 1752 |
| 1753 start = data->string; |
| 1754 src = data->pos + 1; |
| 1755 |
| 1756 /* Get the trailing combining class of the current character. */ |
| 1757 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); |
| 1758 |
| 1759 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| 1760 |
| 1761 if (leadingCC != 0) { |
| 1762 /* |
| 1763 The current char has a non-zero leading combining class. |
| 1764 Scan backward until we find a char with a trailing cc of zero. |
| 1765 */ |
| 1766 for (;;) |
| 1767 { |
| 1768 if (start == src) { |
| 1769 data->fcdPosition = NULL; |
| 1770 return result; |
| 1771 } |
| 1772 |
| 1773 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); |
| 1774 |
| 1775 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| 1776 |
| 1777 if (trailingCC == 0) { |
| 1778 break; |
| 1779 } |
| 1780 |
| 1781 if (leadingCC < trailingCC) { |
| 1782 result = TRUE; |
| 1783 } |
| 1784 |
| 1785 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| 1786 } |
| 1787 } |
| 1788 |
| 1789 data->fcdPosition = (UChar *)src; |
| 1790 |
| 1791 return result; |
| 1792 } |
| 1793 |
| 1794 /** gets a code unit from the string at a given offset |
| 1795 * Handles both normal and iterative cases. |
| 1796 * No error checking - caller beware! |
| 1797 */ |
| 1798 static inline |
| 1799 UChar peekCodeUnit(collIterate *source, int32_t offset) { |
| 1800 if(source->pos != NULL) { |
| 1801 return *(source->pos + offset); |
| 1802 } else if(source->iterator != NULL) { |
| 1803 UChar32 c; |
| 1804 if(offset != 0) { |
| 1805 source->iterator->move(source->iterator, offset, UITER_CURRENT); |
| 1806 c = source->iterator->next(source->iterator); |
| 1807 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); |
| 1808 } else { |
| 1809 c = source->iterator->current(source->iterator); |
| 1810 } |
| 1811 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we
should never see c<0. |
| 1812 } else { |
| 1813 return 0xfffd; |
| 1814 } |
| 1815 } |
| 1816 |
| 1817 // Code point version. Treats the offset as a _code point_ delta. |
| 1818 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for
med UTF-16. |
| 1819 // We cannot use U16_FWD_1 and similar because we do not know the start and limi
t of the buffer. |
| 1820 static inline |
| 1821 UChar32 peekCodePoint(collIterate *source, int32_t offset) { |
| 1822 UChar32 c; |
| 1823 if(source->pos != NULL) { |
| 1824 const UChar *p = source->pos; |
| 1825 if(offset >= 0) { |
| 1826 // Skip forward over (offset-1) code points. |
| 1827 while(--offset >= 0) { |
| 1828 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { |
| 1829 ++p; |
| 1830 } |
| 1831 } |
| 1832 // Read the code point there. |
| 1833 c = *p++; |
| 1834 UChar trail; |
| 1835 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { |
| 1836 c = U16_GET_SUPPLEMENTARY(c, trail); |
| 1837 } |
| 1838 } else /* offset<0 */ { |
| 1839 // Skip backward over (offset-1) code points. |
| 1840 while(++offset < 0) { |
| 1841 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { |
| 1842 --p; |
| 1843 } |
| 1844 } |
| 1845 // Read the code point before that. |
| 1846 c = *--p; |
| 1847 UChar lead; |
| 1848 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { |
| 1849 c = U16_GET_SUPPLEMENTARY(lead, c); |
| 1850 } |
| 1851 } |
| 1852 } else if(source->iterator != NULL) { |
| 1853 if(offset >= 0) { |
| 1854 // Skip forward over (offset-1) code points. |
| 1855 int32_t fwd = offset; |
| 1856 while(fwd-- > 0) { |
| 1857 uiter_next32(source->iterator); |
| 1858 } |
| 1859 // Read the code point there. |
| 1860 c = uiter_current32(source->iterator); |
| 1861 // Return to the starting point, skipping backward over (offset-1) c
ode points. |
| 1862 while(offset-- > 0) { |
| 1863 uiter_previous32(source->iterator); |
| 1864 } |
| 1865 } else /* offset<0 */ { |
| 1866 // Read backward, reading offset code points, remember only the last
-read one. |
| 1867 int32_t back = offset; |
| 1868 do { |
| 1869 c = uiter_previous32(source->iterator); |
| 1870 } while(++back < 0); |
| 1871 // Return to the starting position, skipping forward over offset cod
e points. |
| 1872 do { |
| 1873 uiter_next32(source->iterator); |
| 1874 } while(++offset < 0); |
| 1875 } |
| 1876 } else { |
| 1877 c = U_SENTINEL; |
| 1878 } |
| 1879 return c; |
| 1880 } |
| 1881 |
| 1882 /** |
| 1883 * Determines if we are at the start of the data string in the backwards |
| 1884 * collation iterator |
| 1885 * @param data collation iterator |
| 1886 * @return TRUE if we are at the start |
| 1887 */ |
| 1888 static |
| 1889 inline UBool isAtStartPrevIterate(collIterate *data) { |
| 1890 if(data->pos == NULL && data->iterator != NULL) { |
| 1891 return !data->iterator->hasPrevious(data->iterator); |
| 1892 } |
| 1893 //return (collIter_bos(data)) || |
| 1894 return (data->pos == data->string) || |
| 1895 ((data->flags & UCOL_ITER_INNORMBUF) && |
| 1896 *(data->pos - 1) == 0 && data->fcdPosition == NULL); |
| 1897 } |
| 1898 |
| 1899 static |
| 1900 inline void goBackOne(collIterate *data) { |
| 1901 # if 0 |
| 1902 // somehow, it looks like we need to keep iterator synced up |
| 1903 // at all times, as above. |
| 1904 if(data->pos) { |
| 1905 data->pos--; |
| 1906 } |
| 1907 if(data->iterator) { |
| 1908 data->iterator->previous(data->iterator); |
| 1909 } |
| 1910 #endif |
| 1911 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { |
| 1912 data->iterator->previous(data->iterator); |
| 1913 } |
| 1914 if(data->pos) { |
| 1915 data->pos --; |
| 1916 } |
| 1917 } |
| 1918 |
| 1919 /** |
| 1920 * Inline function that gets a simple CE. |
| 1921 * So what it does is that it will first check the expansion buffer. If the |
| 1922 * expansion buffer is not empty, ie the end pointer to the expansion buffer |
| 1923 * is different from the string pointer, we return the collation element at the |
| 1924 * return pointer and decrement it. |
| 1925 * For more complicated CEs it resorts to getComplicatedCE. |
| 1926 * @param coll collator data |
| 1927 * @param data collation iterator struct |
| 1928 * @param status error status |
| 1929 */ |
| 1930 static |
| 1931 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, |
| 1932 UErrorCode *status) |
| 1933 { |
| 1934 uint32_t result = (uint32_t)UCOL_NULLORDER; |
| 1935 |
| 1936 if (data->offsetReturn != NULL) { |
| 1937 if (data->offsetRepeatCount > 0) { |
| 1938 data->offsetRepeatCount -= 1; |
| 1939 } else { |
| 1940 if (data->offsetReturn == data->offsetBuffer) { |
| 1941 data->offsetReturn = NULL; |
| 1942 data->offsetStore = data->offsetBuffer; |
| 1943 } else { |
| 1944 data->offsetReturn -= 1; |
| 1945 } |
| 1946 } |
| 1947 } |
| 1948 |
| 1949 if ((data->extendCEs && data->toReturn > data->extendCEs) || |
| 1950 (!data->extendCEs && data->toReturn > data->CEs)) |
| 1951 { |
| 1952 data->toReturn -= 1; |
| 1953 result = *(data->toReturn); |
| 1954 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { |
| 1955 data->CEpos = data->toReturn; |
| 1956 } |
| 1957 } |
| 1958 else { |
| 1959 UChar ch = 0; |
| 1960 |
| 1961 /* |
| 1962 Loop handles case when incremental normalize switches to or from the |
| 1963 side buffer / original string, and we need to start again to get the |
| 1964 next character. |
| 1965 */ |
| 1966 for (;;) { |
| 1967 if (data->flags & UCOL_ITER_HASLEN) { |
| 1968 /* |
| 1969 Normal path for strings when length is specified. |
| 1970 Not in side buffer because it is always null terminated. |
| 1971 */ |
| 1972 if (data->pos <= data->string) { |
| 1973 /* End of the main source string */ |
| 1974 return UCOL_NO_MORE_CES; |
| 1975 } |
| 1976 data->pos --; |
| 1977 ch = *data->pos; |
| 1978 } |
| 1979 // we are using an iterator to go back. Pray for us! |
| 1980 else if (data->flags & UCOL_USE_ITERATOR) { |
| 1981 UChar32 iterCh = data->iterator->previous(data->iterator); |
| 1982 if(iterCh == U_SENTINEL) { |
| 1983 return UCOL_NO_MORE_CES; |
| 1984 } else { |
| 1985 ch = (UChar)iterCh; |
| 1986 } |
| 1987 } |
| 1988 else { |
| 1989 data->pos --; |
| 1990 ch = *data->pos; |
| 1991 /* we are in the side buffer. */ |
| 1992 if (ch == 0) { |
| 1993 /* |
| 1994 At the start of the normalize side buffer. |
| 1995 Go back to string. |
| 1996 Because pointer points to the last accessed character, |
| 1997 hence we have to increment it by one here. |
| 1998 */ |
| 1999 data->flags = data->origFlags; |
| 2000 data->offsetRepeatValue = 0; |
| 2001 |
| 2002 if (data->fcdPosition == NULL) { |
| 2003 data->pos = data->string; |
| 2004 return UCOL_NO_MORE_CES; |
| 2005 } |
| 2006 else { |
| 2007 data->pos = data->fcdPosition + 1; |
| 2008 } |
| 2009 |
| 2010 continue; |
| 2011 } |
| 2012 } |
| 2013 |
| 2014 if(data->flags&UCOL_HIRAGANA_Q) { |
| 2015 if(ch>=0x3040 && ch<=0x309f) { |
| 2016 data->flags |= UCOL_WAS_HIRAGANA; |
| 2017 } else { |
| 2018 data->flags &= ~UCOL_WAS_HIRAGANA; |
| 2019 } |
| 2020 } |
| 2021 |
| 2022 /* |
| 2023 * got a character to determine if there's fcd and/or normalization |
| 2024 * stuff to do. |
| 2025 * if the current character is not fcd. |
| 2026 * if current character is at the start of the string |
| 2027 * Trailing combining class == 0. |
| 2028 * Note if pos is in the writablebuffer, norm is always 0 |
| 2029 */ |
| 2030 if (ch < ZERO_CC_LIMIT_ || |
| 2031 // this should propel us out of the loop in the iterator case |
| 2032 (data->flags & UCOL_ITER_NORM) == 0 || |
| 2033 (data->fcdPosition != NULL && data->fcdPosition <= data->pos) |
| 2034 || data->string == data->pos) { |
| 2035 break; |
| 2036 } |
| 2037 |
| 2038 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| 2039 /* if next character is FCD */ |
| 2040 if (data->pos == data->string) { |
| 2041 /* First char of string is always OK for FCD check */ |
| 2042 break; |
| 2043 } |
| 2044 |
| 2045 /* Not first char of string, do the FCD fast test */ |
| 2046 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| 2047 break; |
| 2048 } |
| 2049 } |
| 2050 |
| 2051 /* Need a more complete FCD check and possible normalization. */ |
| 2052 if (collPrevIterFCD(data)) { |
| 2053 collPrevIterNormalize(data); |
| 2054 } |
| 2055 |
| 2056 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 2057 /* No normalization. Go ahead and process the char. */ |
| 2058 break; |
| 2059 } |
| 2060 |
| 2061 /* |
| 2062 Some normalization happened. |
| 2063 Next loop picks up a char from the normalization buffer. |
| 2064 */ |
| 2065 } |
| 2066 |
| 2067 /* attempt to handle contractions, after removal of the backwards |
| 2068 contraction |
| 2069 */ |
| 2070 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { |
| 2071 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data,
status); |
| 2072 } else { |
| 2073 if (ch <= 0xFF) { |
| 2074 result = coll->latinOneMapping[ch]; |
| 2075 } |
| 2076 else { |
| 2077 // Always use UCA for [3400..9FFF], [AC00..D7AF] |
| 2078 // **** [FA0E..FA2F] ?? **** |
| 2079 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && |
| 2080 (ch >= 0x3400 && ch <= 0xD7AF)) { |
| 2081 if (ch > 0x9FFF && ch < 0xAC00) { |
| 2082 // between the two target ranges; do normal lookup |
| 2083 // **** this range is YI, Modifier tone letters, **** |
| 2084 // **** Latin-D, Syloti Nagari, Phagas-pa. **** |
| 2085 // **** Latin-D might be tailored, so we need to **** |
| 2086 // **** do the normal lookup for these guys. **** |
| 2087 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| 2088 } else { |
| 2089 result = UCOL_NOT_FOUND; |
| 2090 } |
| 2091 } else { |
| 2092 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| 2093 } |
| 2094 } |
| 2095 if (result > UCOL_NOT_FOUND) { |
| 2096 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, statu
s); |
| 2097 } |
| 2098 if (result == UCOL_NOT_FOUND) { // Not found in master list |
| 2099 if (!isAtStartPrevIterate(data) && |
| 2100 ucol_contractionEndCP(ch, data->coll)) |
| 2101 { |
| 2102 result = UCOL_CONTRACTION; |
| 2103 } else { |
| 2104 if(coll->UCA) { |
| 2105 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| 2106 } |
| 2107 } |
| 2108 |
| 2109 if (result > UCOL_NOT_FOUND) { |
| 2110 if(coll->UCA) { |
| 2111 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result
, data, status); |
| 2112 } |
| 2113 } |
| 2114 } |
| 2115 } |
| 2116 |
| 2117 if(result == UCOL_NOT_FOUND) { |
| 2118 result = getPrevImplicit(ch, data); |
| 2119 } |
| 2120 } |
| 2121 |
| 2122 return result; |
| 2123 } |
| 2124 |
| 2125 |
| 2126 /* ucol_getPrevCE, out-of-line version for use from other files. */ |
| 2127 U_CFUNC uint32_t U_EXPORT2 |
| 2128 ucol_getPrevCE(const UCollator *coll, collIterate *data, |
| 2129 UErrorCode *status) { |
| 2130 return ucol_IGetPrevCE(coll, data, status); |
| 2131 } |
| 2132 |
| 2133 |
| 2134 /* this should be connected to special Jamo handling */ |
| 2135 U_CFUNC uint32_t U_EXPORT2 |
| 2136 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { |
| 2137 collIterate colIt; |
| 2138 IInit_collIterate(coll, &u, 1, &colIt, status); |
| 2139 if(U_FAILURE(*status)) { |
| 2140 return 0; |
| 2141 } |
| 2142 return ucol_IGetNextCE(coll, &colIt, status); |
| 2143 } |
| 2144 |
| 2145 /** |
| 2146 * Inserts the argument character into the end of the buffer pushing back the |
| 2147 * null terminator. |
| 2148 * @param data collIterate struct data |
| 2149 * @param ch character to be appended |
| 2150 * @return the position of the new addition |
| 2151 */ |
| 2152 static |
| 2153 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) |
| 2154 { |
| 2155 int32_t oldLength = data->writableBuffer.length(); |
| 2156 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; |
| 2157 } |
| 2158 |
| 2159 /** |
| 2160 * Inserts the argument string into the end of the buffer pushing back the |
| 2161 * null terminator. |
| 2162 * @param data collIterate struct data |
| 2163 * @param string to be appended |
| 2164 * @param length of the string to be appended |
| 2165 * @return the position of the new addition |
| 2166 */ |
| 2167 static |
| 2168 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_
t length) |
| 2169 { |
| 2170 int32_t oldLength = data->writableBuffer.length(); |
| 2171 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL
ength; |
| 2172 } |
| 2173 |
| 2174 /** |
| 2175 * Special normalization function for contraction in the forwards iterator. |
| 2176 * This normalization sequence will place the current character at source->pos |
| 2177 * and its following normalized sequence into the buffer. |
| 2178 * The fcd position, pos will be changed. |
| 2179 * pos will now point to positions in the buffer. |
| 2180 * Flags will be changed accordingly. |
| 2181 * @param data collation iterator data |
| 2182 */ |
| 2183 static |
| 2184 inline void normalizeNextContraction(collIterate *data) |
| 2185 { |
| 2186 int32_t strsize; |
| 2187 UErrorCode status = U_ZERO_ERROR; |
| 2188 /* because the pointer points to the next character */ |
| 2189 const UChar *pStart = data->pos - 1; |
| 2190 const UChar *pEnd; |
| 2191 |
| 2192 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| 2193 data->writableBuffer.setTo(*(pStart - 1)); |
| 2194 strsize = 1; |
| 2195 } |
| 2196 else { |
| 2197 strsize = data->writableBuffer.length(); |
| 2198 } |
| 2199 |
| 2200 pEnd = data->fcdPosition; |
| 2201 |
| 2202 data->writableBuffer.append( |
| 2203 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), status)); |
| 2204 if(U_FAILURE(status)) { |
| 2205 return; |
| 2206 } |
| 2207 |
| 2208 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; |
| 2209 data->origFlags = data->flags; |
| 2210 data->flags |= UCOL_ITER_INNORMBUF; |
| 2211 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| 2212 } |
| 2213 |
| 2214 /** |
| 2215 * Contraction character management function that returns the next character |
| 2216 * for the forwards iterator. |
| 2217 * Does nothing if the next character is in buffer and not the first character |
| 2218 * in it. |
| 2219 * Else it checks next character in data string to see if it is normalizable. |
| 2220 * If it is not, the character is simply copied into the buffer, else |
| 2221 * the whole normalized substring is copied into the buffer, including the |
| 2222 * current character. |
| 2223 * @param data collation element iterator data |
| 2224 * @return next character |
| 2225 */ |
| 2226 static |
| 2227 inline UChar getNextNormalizedChar(collIterate *data) |
| 2228 { |
| 2229 UChar nextch; |
| 2230 UChar ch; |
| 2231 // Here we need to add the iterator code. One problem is the way |
| 2232 // end of string is handled. If we just return next char, it could |
| 2233 // be the sentinel. Most of the cases already check for this, but we |
| 2234 // need to be sure. |
| 2235 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { |
| 2236 /* if no normalization and not in buffer. */ |
| 2237 if(data->flags & UCOL_USE_ITERATOR) { |
| 2238 return (UChar)data->iterator->next(data->iterator); |
| 2239 } else { |
| 2240 return *(data->pos ++); |
| 2241 } |
| 2242 } |
| 2243 |
| 2244 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { |
| 2245 //normalizeIterator(data); |
| 2246 //} |
| 2247 |
| 2248 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
| 2249 if ((innormbuf && *data->pos != 0) || |
| 2250 (data->fcdPosition != NULL && !innormbuf && |
| 2251 data->pos < data->fcdPosition)) { |
| 2252 /* |
| 2253 if next character is in normalized buffer, no further normalization |
| 2254 is required |
| 2255 */ |
| 2256 return *(data->pos ++); |
| 2257 } |
| 2258 |
| 2259 if (data->flags & UCOL_ITER_HASLEN) { |
| 2260 /* in data string */ |
| 2261 if (data->pos + 1 == data->endp) { |
| 2262 return *(data->pos ++); |
| 2263 } |
| 2264 } |
| 2265 else { |
| 2266 if (innormbuf) { |
| 2267 // inside the normalization buffer, but at the end |
| 2268 // (since we encountered zero). This means, in the |
| 2269 // case we're using char iterator, that we need to |
| 2270 // do another round of normalization. |
| 2271 //if(data->origFlags & UCOL_USE_ITERATOR) { |
| 2272 // we need to restore original flags, |
| 2273 // otherwise, we'll lose them |
| 2274 //data->flags = data->origFlags; |
| 2275 //normalizeIterator(data); |
| 2276 //return *(data->pos++); |
| 2277 //} else { |
| 2278 /* |
| 2279 in writable buffer, at this point fcdPosition can not be |
| 2280 pointing to the end of the data string. see contracting tag. |
| 2281 */ |
| 2282 if(data->fcdPosition) { |
| 2283 if (*(data->fcdPosition + 1) == 0 || |
| 2284 data->fcdPosition + 1 == data->endp) { |
| 2285 /* at the end of the string, dump it into the normalizer */ |
| 2286 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; |
| 2287 // Check if data->pos received a null pointer |
| 2288 if (data->pos == NULL) { |
| 2289 return (UChar)-1; // Return to indicate error. |
| 2290 } |
| 2291 return *(data->fcdPosition ++); |
| 2292 } |
| 2293 data->pos = data->fcdPosition; |
| 2294 } else if(data->origFlags & UCOL_USE_ITERATOR) { |
| 2295 // if we are here, we're using a normalizing iterator. |
| 2296 // we should just continue further. |
| 2297 data->flags = data->origFlags; |
| 2298 data->pos = NULL; |
| 2299 return (UChar)data->iterator->next(data->iterator); |
| 2300 } |
| 2301 //} |
| 2302 } |
| 2303 else { |
| 2304 if (*(data->pos + 1) == 0) { |
| 2305 return *(data->pos ++); |
| 2306 } |
| 2307 } |
| 2308 } |
| 2309 |
| 2310 ch = *data->pos ++; |
| 2311 nextch = *data->pos; |
| 2312 |
| 2313 /* |
| 2314 * if the current character is not fcd. |
| 2315 * Trailing combining class == 0. |
| 2316 */ |
| 2317 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && |
| 2318 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || |
| 2319 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { |
| 2320 /* |
| 2321 Need a more complete FCD check and possible normalization. |
| 2322 normalize substring will be appended to buffer |
| 2323 */ |
| 2324 if (collIterFCD(data)) { |
| 2325 normalizeNextContraction(data); |
| 2326 return *(data->pos ++); |
| 2327 } |
| 2328 else if (innormbuf) { |
| 2329 /* fcdposition shifted even when there's no normalization, if we |
| 2330 don't input the rest into this, we'll get the wrong position when |
| 2331 we reach the end of the writableBuffer */ |
| 2332 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); |
| 2333 data->pos = insertBufferEnd(data, data->pos - 1, length); |
| 2334 // Check if data->pos received a null pointer |
| 2335 if (data->pos == NULL) { |
| 2336 return (UChar)-1; // Return to indicate error. |
| 2337 } |
| 2338 return *(data->pos ++); |
| 2339 } |
| 2340 } |
| 2341 |
| 2342 if (innormbuf) { |
| 2343 /* |
| 2344 no normalization is to be done hence only one character will be |
| 2345 appended to the buffer. |
| 2346 */ |
| 2347 data->pos = insertBufferEnd(data, ch) + 1; |
| 2348 // Check if data->pos received a null pointer |
| 2349 if (data->pos == NULL) { |
| 2350 return (UChar)-1; // Return to indicate error. |
| 2351 } |
| 2352 } |
| 2353 |
| 2354 /* points back to the pos in string */ |
| 2355 return ch; |
| 2356 } |
| 2357 |
| 2358 |
| 2359 |
| 2360 /** |
| 2361 * Function to copy the buffer into writableBuffer and sets the fcd position to |
| 2362 * the correct position |
| 2363 * @param source data string source |
| 2364 * @param buffer character buffer |
| 2365 */ |
| 2366 static |
| 2367 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b
uffer) |
| 2368 { |
| 2369 /* okay confusing part here. to ensure that the skipped characters are |
| 2370 considered later, we need to place it in the appropriate position in the |
| 2371 normalization buffer and reassign the pos pointer. simple case if pos |
| 2372 reside in string, simply copy to normalization buffer and |
| 2373 fcdposition = pos, pos = start of normalization buffer. if pos in |
| 2374 normalization buffer, we'll insert the copy infront of pos and point pos |
| 2375 to the start of the normalization buffer. why am i doing these copies? |
| 2376 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial
CE does |
| 2377 not require any changes, which be really painful. */ |
| 2378 if (source->flags & UCOL_ITER_INNORMBUF) { |
| 2379 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer()
; |
| 2380 source->writableBuffer.replace(0, replaceLength, buffer); |
| 2381 } |
| 2382 else { |
| 2383 source->fcdPosition = source->pos; |
| 2384 source->origFlags = source->flags; |
| 2385 source->flags |= UCOL_ITER_INNORMBUF; |
| 2386 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_IT
ERATOR); |
| 2387 source->writableBuffer = buffer; |
| 2388 } |
| 2389 |
| 2390 source->pos = source->writableBuffer.getTerminatedBuffer(); |
| 2391 } |
| 2392 |
| 2393 /** |
| 2394 * Function to get the discontiguos collation element within the source. |
| 2395 * Note this function will set the position to the appropriate places. |
| 2396 * @param coll current collator used |
| 2397 * @param source data string source |
| 2398 * @param constart index to the start character in the contraction table |
| 2399 * @return discontiguos collation element offset |
| 2400 */ |
| 2401 static |
| 2402 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, |
| 2403 const UChar *constart) |
| 2404 { |
| 2405 /* source->pos currently points to the second combining character after |
| 2406 the start character */ |
| 2407 const UChar *temppos = source->pos; |
| 2408 UnicodeString buffer; |
| 2409 const UChar *tempconstart = constart; |
| 2410 uint8_t tempflags = source->flags; |
| 2411 UBool multicontraction = FALSE; |
| 2412 collIterateState discState; |
| 2413 |
| 2414 backupState(source, &discState); |
| 2415 |
| 2416 buffer.setTo(peekCodePoint(source, -1)); |
| 2417 for (;;) { |
| 2418 UChar *UCharOffset; |
| 2419 UChar schar, |
| 2420 tchar; |
| 2421 uint32_t result; |
| 2422 |
| 2423 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) |
| 2424 || (peekCodeUnit(source, 0) == 0 && |
| 2425 //|| (*source->pos == 0 && |
| 2426 ((source->flags & UCOL_ITER_INNORMBUF) == 0 || |
| 2427 source->fcdPosition == NULL || |
| 2428 source->fcdPosition == source->endp || |
| 2429 *(source->fcdPosition) == 0 || |
| 2430 u_getCombiningClass(*(source->fcdPosition)) == 0)) || |
| 2431 /* end of string in null terminated string or stopped by a |
| 2432 null character, note fcd does not always point to a base |
| 2433 character after the discontiguos change */ |
| 2434 u_getCombiningClass(peekCodePoint(source, 0)) == 0) { |
| 2435 //u_getCombiningClass(*(source->pos)) == 0) { |
| 2436 //constart = (UChar *)coll->image + getContractOffset(CE); |
| 2437 if (multicontraction) { |
| 2438 source->pos = temppos - 1; |
| 2439 setDiscontiguosAttribute(source, buffer); |
| 2440 return *(coll->contractionCEs + |
| 2441 (tempconstart - coll->contractionIndex)); |
| 2442 } |
| 2443 constart = tempconstart; |
| 2444 break; |
| 2445 } |
| 2446 |
| 2447 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/ |
| 2448 schar = getNextNormalizedChar(source); |
| 2449 |
| 2450 while (schar > (tchar = *UCharOffset)) { |
| 2451 UCharOffset++; |
| 2452 } |
| 2453 |
| 2454 if (schar != tchar) { |
| 2455 /* not the correct codepoint. we stuff the current codepoint into |
| 2456 the discontiguos buffer and try the next character */ |
| 2457 buffer.append(schar); |
| 2458 continue; |
| 2459 } |
| 2460 else { |
| 2461 if (u_getCombiningClass(schar) == |
| 2462 u_getCombiningClass(peekCodePoint(source, -2))) { |
| 2463 buffer.append(schar); |
| 2464 continue; |
| 2465 } |
| 2466 result = *(coll->contractionCEs + |
| 2467 (UCharOffset - coll->contractionIndex)); |
| 2468 } |
| 2469 |
| 2470 if (result == UCOL_NOT_FOUND) { |
| 2471 break; |
| 2472 } else if (isContraction(result)) { |
| 2473 /* this is a multi-contraction*/ |
| 2474 tempconstart = (UChar *)coll->image + getContractOffset(result); |
| 2475 if (*(coll->contractionCEs + (constart - coll->contractionIndex)) |
| 2476 != UCOL_NOT_FOUND) { |
| 2477 multicontraction = TRUE; |
| 2478 temppos = source->pos + 1; |
| 2479 } |
| 2480 } else { |
| 2481 setDiscontiguosAttribute(source, buffer); |
| 2482 return result; |
| 2483 } |
| 2484 } |
| 2485 |
| 2486 /* no problems simply reverting just like that, |
| 2487 if we are in string before getting into this function, points back to |
| 2488 string hence no problem. |
| 2489 if we are in normalization buffer before getting into this function, |
| 2490 since we'll never use another normalization within this function, we |
| 2491 know that fcdposition points to a base character. the normalization buffer |
| 2492 never change, hence this revert works. */ |
| 2493 loadState(source, &discState, TRUE); |
| 2494 goBackOne(source); |
| 2495 |
| 2496 //source->pos = temppos - 1; |
| 2497 source->flags = tempflags; |
| 2498 return *(coll->contractionCEs + (constart - coll->contractionIndex)); |
| 2499 } |
| 2500 |
| 2501 /* now uses Mark's getImplicitPrimary code */ |
| 2502 static |
| 2503 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { |
| 2504 uint32_t r = uprv_uca_getImplicitPrimary(cp); |
| 2505 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; |
| 2506 collationSource->offsetRepeatCount += 1; |
| 2507 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' |
| 2508 } |
| 2509 |
| 2510 /** |
| 2511 * Inserts the argument character into the front of the buffer replacing the |
| 2512 * front null terminator. |
| 2513 * @param data collation element iterator data |
| 2514 * @param ch character to be appended |
| 2515 */ |
| 2516 static |
| 2517 inline void insertBufferFront(collIterate *data, UChar ch) |
| 2518 { |
| 2519 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer
minatedBuffer() + 2; |
| 2520 } |
| 2521 |
| 2522 /** |
| 2523 * Special normalization function for contraction in the previous iterator. |
| 2524 * This normalization sequence will place the current character at source->pos |
| 2525 * and its following normalized sequence into the buffer. |
| 2526 * The fcd position, pos will be changed. |
| 2527 * pos will now point to positions in the buffer. |
| 2528 * Flags will be changed accordingly. |
| 2529 * @param data collation iterator data |
| 2530 */ |
| 2531 static |
| 2532 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) |
| 2533 { |
| 2534 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ |
| 2535 const UChar *pStart; |
| 2536 |
| 2537 UnicodeString endOfBuffer; |
| 2538 if (data->flags & UCOL_ITER_HASLEN) { |
| 2539 /* |
| 2540 normalization buffer not used yet, we'll pull down the next |
| 2541 character into the end of the buffer |
| 2542 */ |
| 2543 endOfBuffer.setTo(*pEnd); |
| 2544 } |
| 2545 else { |
| 2546 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL |
| 2547 } |
| 2548 |
| 2549 if (data->fcdPosition == NULL) { |
| 2550 pStart = data->string; |
| 2551 } |
| 2552 else { |
| 2553 pStart = data->fcdPosition + 1; |
| 2554 } |
| 2555 int32_t normLen = |
| 2556 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), |
| 2557 data->writableBuffer, |
| 2558 *status). |
| 2559 length(); |
| 2560 if(U_FAILURE(*status)) { |
| 2561 return; |
| 2562 } |
| 2563 /* |
| 2564 this puts the null termination infront of the normalized string instead |
| 2565 of the end |
| 2566 */ |
| 2567 data->pos = |
| 2568 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat
edBuffer() + |
| 2569 1 + normLen; |
| 2570 data->origFlags = data->flags; |
| 2571 data->flags |= UCOL_ITER_INNORMBUF; |
| 2572 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| 2573 } |
| 2574 |
| 2575 /** |
| 2576 * Contraction character management function that returns the previous character |
| 2577 * for the backwards iterator. |
| 2578 * Does nothing if the previous character is in buffer and not the first |
| 2579 * character in it. |
| 2580 * Else it checks previous character in data string to see if it is |
| 2581 * normalizable. |
| 2582 * If it is not, the character is simply copied into the buffer, else |
| 2583 * the whole normalized substring is copied into the buffer, including the |
| 2584 * current character. |
| 2585 * @param data collation element iterator data |
| 2586 * @return previous character |
| 2587 */ |
| 2588 static |
| 2589 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) |
| 2590 { |
| 2591 UChar prevch; |
| 2592 UChar ch; |
| 2593 const UChar *start; |
| 2594 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
| 2595 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || |
| 2596 (innormbuf && *(data->pos - 1) != 0)) { |
| 2597 /* |
| 2598 if no normalization. |
| 2599 if previous character is in normalized buffer, no further normalization |
| 2600 is required |
| 2601 */ |
| 2602 if(data->flags & UCOL_USE_ITERATOR) { |
| 2603 data->iterator->move(data->iterator, -1, UITER_CURRENT); |
| 2604 return (UChar)data->iterator->next(data->iterator); |
| 2605 } else { |
| 2606 return *(data->pos - 1); |
| 2607 } |
| 2608 } |
| 2609 |
| 2610 start = data->pos; |
| 2611 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { |
| 2612 /* in data string */ |
| 2613 if ((start - 1) == data->string) { |
| 2614 return *(start - 1); |
| 2615 } |
| 2616 start --; |
| 2617 ch = *start; |
| 2618 prevch = *(start - 1); |
| 2619 } |
| 2620 else { |
| 2621 /* |
| 2622 in writable buffer, at this point fcdPosition can not be NULL. |
| 2623 see contracting tag. |
| 2624 */ |
| 2625 if (data->fcdPosition == data->string) { |
| 2626 /* at the start of the string, just dump it into the normalizer */ |
| 2627 insertBufferFront(data, *(data->fcdPosition)); |
| 2628 data->fcdPosition = NULL; |
| 2629 return *(data->pos - 1); |
| 2630 } |
| 2631 start = data->fcdPosition; |
| 2632 ch = *start; |
| 2633 prevch = *(start - 1); |
| 2634 } |
| 2635 /* |
| 2636 * if the current character is not fcd. |
| 2637 * Trailing combining class == 0. |
| 2638 */ |
| 2639 if (data->fcdPosition > start && |
| 2640 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) |
| 2641 { |
| 2642 /* |
| 2643 Need a more complete FCD check and possible normalization. |
| 2644 normalize substring will be appended to buffer |
| 2645 */ |
| 2646 const UChar *backuppos = data->pos; |
| 2647 data->pos = start; |
| 2648 if (collPrevIterFCD(data)) { |
| 2649 normalizePrevContraction(data, status); |
| 2650 return *(data->pos - 1); |
| 2651 } |
| 2652 data->pos = backuppos; |
| 2653 data->fcdPosition ++; |
| 2654 } |
| 2655 |
| 2656 if (innormbuf) { |
| 2657 /* |
| 2658 no normalization is to be done hence only one character will be |
| 2659 appended to the buffer. |
| 2660 */ |
| 2661 insertBufferFront(data, ch); |
| 2662 data->fcdPosition --; |
| 2663 } |
| 2664 |
| 2665 return ch; |
| 2666 } |
| 2667 |
| 2668 /* This function handles the special CEs like contractions, expansions, surrogat
es, Thai */ |
| 2669 /* It is called by getNextCE */ |
| 2670 |
| 2671 /* The following should be even */ |
| 2672 #define UCOL_MAX_DIGITS_FOR_NUMBER 254 |
| 2673 |
| 2674 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
lIterate *source, UErrorCode *status) { |
| 2675 collIterateState entryState; |
| 2676 backupState(source, &entryState); |
| 2677 UChar32 cp = ch; |
| 2678 |
| 2679 for (;;) { |
| 2680 // This loop will repeat only in the case of contractions, and only when
a contraction |
| 2681 // is found and the first CE resulting from that contraction is itself
a special |
| 2682 // (an expansion, for example.) All other special CE types are fully
handled the |
| 2683 // first time through, and the loop exits. |
| 2684 |
| 2685 const uint32_t *CEOffset = NULL; |
| 2686 switch(getCETag(CE)) { |
| 2687 case NOT_FOUND_TAG: |
| 2688 /* This one is not found, and we'll let somebody else bother about i
t... no more games */ |
| 2689 return CE; |
| 2690 case SPEC_PROC_TAG: |
| 2691 { |
| 2692 // Special processing is getting a CE that is preceded by a cert
ain prefix |
| 2693 // Currently this is only needed for optimizing Japanese length
and iteration marks. |
| 2694 // When we encouter a special processing tag, we go backwards an
d try to see if |
| 2695 // we have a match. |
| 2696 // Contraction tables are used - so the whole process is not unl
ike contraction. |
| 2697 // prefix data is stored backwards in the table. |
| 2698 const UChar *UCharOffset; |
| 2699 UChar schar, tchar; |
| 2700 collIterateState prefixState; |
| 2701 backupState(source, &prefixState); |
| 2702 loadState(source, &entryState, TRUE); |
| 2703 goBackOne(source); // We want to look at the point where we ente
red - actually one |
| 2704 // before that... |
| 2705 |
| 2706 for(;;) { |
| 2707 // This loop will run once per source string character, for
as long as we |
| 2708 // are matching a potential contraction sequence |
| 2709 |
| 2710 // First we position ourselves at the begining of contractio
n sequence |
| 2711 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); |
| 2712 if (collIter_bos(source)) { |
| 2713 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); |
| 2714 break; |
| 2715 } |
| 2716 schar = getPrevNormalizedChar(source, status); |
| 2717 goBackOne(source); |
| 2718 |
| 2719 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ |
| 2720 UCharOffset++; |
| 2721 } |
| 2722 |
| 2723 if (schar == tchar) { |
| 2724 // Found the source string char in the table. |
| 2725 // Pick up the corresponding CE from the table. |
| 2726 CE = *(coll->contractionCEs + |
| 2727 (UCharOffset - coll->contractionIndex)); |
| 2728 } |
| 2729 else |
| 2730 { |
| 2731 // Source string char was not in the table. |
| 2732 // We have not found the prefix. |
| 2733 CE = *(coll->contractionCEs + |
| 2734 (ContractionStart - coll->contractionIndex)); |
| 2735 } |
| 2736 |
| 2737 if(!isPrefix(CE)) { |
| 2738 // The source string char was in the contraction table,
and the corresponding |
| 2739 // CE is not a prefix CE. We found the prefix, break |
| 2740 // out of loop, this CE will end up being returned. T
his is the normal |
| 2741 // way out of prefix handling when the source actually
contained |
| 2742 // the prefix. |
| 2743 break; |
| 2744 } |
| 2745 } |
| 2746 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri
lly continue |
| 2747 loadState(source, &prefixState, TRUE); |
| 2748 if(source->origFlags & UCOL_USE_ITERATOR) { |
| 2749 source->flags = source->origFlags; |
| 2750 } |
| 2751 } else { // prefix search was a failure, we have to backup all t
he way to the start |
| 2752 loadState(source, &entryState, TRUE); |
| 2753 } |
| 2754 break; |
| 2755 } |
| 2756 case CONTRACTION_TAG: |
| 2757 { |
| 2758 /* This should handle contractions */ |
| 2759 collIterateState state; |
| 2760 backupState(source, &state); |
| 2761 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->imag
e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; |
| 2762 const UChar *UCharOffset; |
| 2763 UChar schar, tchar; |
| 2764 |
| 2765 for (;;) { |
| 2766 /* This loop will run once per source string character, for
as long as we */ |
| 2767 /* are matching a potential contraction sequence
*/ |
| 2768 |
| 2769 /* First we position ourselves at the begining of contractio
n sequence */ |
| 2770 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); |
| 2771 |
| 2772 if (collIter_eos(source)) { |
| 2773 // Ran off the end of the source string. |
| 2774 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); |
| 2775 // So we'll pick whatever we have at the point... |
| 2776 if (CE == UCOL_NOT_FOUND) { |
| 2777 // back up the source over all the chars we scanned
going into this contraction. |
| 2778 CE = firstCE; |
| 2779 loadState(source, &state, TRUE); |
| 2780 if(source->origFlags & UCOL_USE_ITERATOR) { |
| 2781 source->flags = source->origFlags; |
| 2782 } |
| 2783 } |
| 2784 break; |
| 2785 } |
| 2786 |
| 2787 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the di
scontiguos stuff */ /* skip the backward offset, see above */ |
| 2788 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); |
| 2789 |
| 2790 schar = getNextNormalizedChar(source); |
| 2791 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ |
| 2792 UCharOffset++; |
| 2793 } |
| 2794 |
| 2795 if (schar == tchar) { |
| 2796 // Found the source string char in the contraction table
. |
| 2797 // Pick up the corresponding CE from the table. |
| 2798 CE = *(coll->contractionCEs + |
| 2799 (UCharOffset - coll->contractionIndex)); |
| 2800 } |
| 2801 else |
| 2802 { |
| 2803 // Source string char was not in contraction table. |
| 2804 // Unless we have a discontiguous contraction, we have
finished |
| 2805 // with this contraction. |
| 2806 // in order to do the proper detection, we |
| 2807 // need to see if we're dealing with a supplementary |
| 2808 /* We test whether the next two char are surrogate pairs
. |
| 2809 * This test is done if the iterator is not NULL. |
| 2810 * If there is no surrogate pair, the iterator |
| 2811 * goes back one if needed. */ |
| 2812 UChar32 miss = schar; |
| 2813 if (source->iterator) { |
| 2814 UChar32 surrNextChar; /* the next char in the iterat
ion to test */ |
| 2815 int32_t prevPos; /* holds the previous position befo
re move forward of the source iterator */ |
| 2816 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s
ource->iterator)) { |
| 2817 prevPos = source->iterator->index; |
| 2818 surrNextChar = getNextNormalizedChar(source); |
| 2819 if (U16_IS_TRAIL(surrNextChar)) { |
| 2820 miss = U16_GET_SUPPLEMENTARY(schar, surrNext
Char); |
| 2821 } else if (prevPos < source->iterator->index){ |
| 2822 goBackOne(source); |
| 2823 } |
| 2824 } |
| 2825 } else if (U16_IS_LEAD(schar)) { |
| 2826 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalize
dChar(source)); |
| 2827 } |
| 2828 |
| 2829 uint8_t sCC; |
| 2830 if (miss < 0x300 || |
| 2831 maxCC == 0 || |
| 2832 (sCC = i_getCombiningClass(miss, coll)) == 0 || |
| 2833 sCC>maxCC || |
| 2834 (allSame != 0 && sCC == maxCC) || |
| 2835 collIter_eos(source)) |
| 2836 { |
| 2837 // Contraction can not be discontiguous. |
| 2838 goBackOne(source); // back up the source string by
one, |
| 2839 // because the character we just looked at was |
| 2840 // not part of the contraction. */ |
| 2841 if(U_IS_SUPPLEMENTARY(miss)) { |
| 2842 goBackOne(source); |
| 2843 } |
| 2844 CE = *(coll->contractionCEs + |
| 2845 (ContractionStart - coll->contractionIndex)); |
| 2846 } else { |
| 2847 // |
| 2848 // Contraction is possibly discontiguous. |
| 2849 // Scan more of source string looking for a match |
| 2850 // |
| 2851 UChar tempchar; |
| 2852 /* find the next character if schar is not a base ch
aracter |
| 2853 and we are not yet at the end of the string */ |
| 2854 tempchar = getNextNormalizedChar(source); |
| 2855 // probably need another supplementary thingie here |
| 2856 goBackOne(source); |
| 2857 if (i_getCombiningClass(tempchar, coll) == 0) { |
| 2858 goBackOne(source); |
| 2859 if(U_IS_SUPPLEMENTARY(miss)) { |
| 2860 goBackOne(source); |
| 2861 } |
| 2862 /* Spit out the last char of the string, wasn't
tasty enough */ |
| 2863 CE = *(coll->contractionCEs + |
| 2864 (ContractionStart - coll->contractionIndex))
; |
| 2865 } else { |
| 2866 CE = getDiscontiguous(coll, source, ContractionS
tart); |
| 2867 } |
| 2868 } |
| 2869 } // else after if(schar == tchar) |
| 2870 |
| 2871 if(CE == UCOL_NOT_FOUND) { |
| 2872 /* The Source string did not match the contraction that
we were checking. */ |
| 2873 /* Back up the source position to undo the effects of h
aving partially */ |
| 2874 /* scanned through what ultimately proved to not be a
contraction. */ |
| 2875 loadState(source, &state, TRUE); |
| 2876 CE = firstCE; |
| 2877 break; |
| 2878 } |
| 2879 |
| 2880 if(!isContraction(CE)) { |
| 2881 // The source string char was in the contraction table,
and the corresponding |
| 2882 // CE is not a contraction CE. We completed the contr
action, break |
| 2883 // out of loop, this CE will end up being returned. T
his is the normal |
| 2884 // way out of contraction handling when the source act
ually contained |
| 2885 // the contraction. |
| 2886 break; |
| 2887 } |
| 2888 |
| 2889 |
| 2890 // The source string char was in the contraction table, and
the corresponding |
| 2891 // CE is IS a contraction CE. We will continue looping t
o check the source |
| 2892 // string for the remaining chars in the contraction. |
| 2893 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart
- coll->contractionIndex)); |
| 2894 if(tempCE != UCOL_NOT_FOUND) { |
| 2895 // We have scanned a a section of source string for whic
h there is a |
| 2896 // CE from the contraction table. Remember the CE and
scan position, so |
| 2897 // that we can return to this point if further scanning
fails to |
| 2898 // match a longer contraction sequence. |
| 2899 firstCE = tempCE; |
| 2900 |
| 2901 goBackOne(source); |
| 2902 backupState(source, &state); |
| 2903 getNextNormalizedChar(source); |
| 2904 |
| 2905 // Another way to do this is: |
| 2906 //collIterateState tempState; |
| 2907 //backupState(source, &tempState); |
| 2908 //goBackOne(source); |
| 2909 //backupState(source, &state); |
| 2910 //loadState(source, &tempState, TRUE); |
| 2911 |
| 2912 // The problem is that for incomplete contractions we ha
ve to remember the previous |
| 2913 // position. Before, the only thing I needed to do was s
tate.pos--; |
| 2914 // After iterator introduction and especially after intr
oduction of normalizing |
| 2915 // iterators, it became much more difficult to decrease
the saved state. |
| 2916 // I'm not yet sure which of the two methods above is fa
ster. |
| 2917 } |
| 2918 } // for(;;) |
| 2919 break; |
| 2920 } // case CONTRACTION_TAG: |
| 2921 case LONG_PRIMARY_TAG: |
| 2922 { |
| 2923 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; |
| 2924 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYT
E_COMMON; |
| 2925 source->offsetRepeatCount += 1; |
| 2926 return CE; |
| 2927 } |
| 2928 case EXPANSION_TAG: |
| 2929 { |
| 2930 /* This should handle expansion. */ |
| 2931 /* NOTE: we can encounter both continuations and expansions in a
n expansion! */ |
| 2932 /* I have to decide where continuations are going to be dealt wi
th */ |
| 2933 uint32_t size; |
| 2934 uint32_t i; /* general counter */ |
| 2935 |
| 2936 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* fi
nd the offset to expansion table */ |
| 2937 size = getExpansionCount(CE); |
| 2938 CE = *CEOffset++; |
| 2939 //source->offsetRepeatCount = -1; |
| 2940 |
| 2941 if(size != 0) { /* if there are less than 16 elements in expansi
on, we don't terminate */ |
| 2942 for(i = 1; i<size; i++) { |
| 2943 *(source->CEpos++) = *CEOffset++; |
| 2944 source->offsetRepeatCount += 1; |
| 2945 } |
| 2946 } else { /* else, we do */ |
| 2947 while(*CEOffset != 0) { |
| 2948 *(source->CEpos++) = *CEOffset++; |
| 2949 source->offsetRepeatCount += 1; |
| 2950 } |
| 2951 } |
| 2952 |
| 2953 return CE; |
| 2954 } |
| 2955 case DIGIT_TAG: |
| 2956 { |
| 2957 /* |
| 2958 We do a check to see if we want to collate digits as numbers; if
so we generate |
| 2959 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. |
| 2960 */ |
| 2961 //uint32_t size; |
| 2962 uint32_t i; /* general counter */ |
| 2963 |
| 2964 if (source->coll->numericCollation == UCOL_ON){ |
| 2965 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; |
| 2966 UChar32 char32 = 0; |
| 2967 int32_t digVal = 0; |
| 2968 |
| 2969 uint32_t digIndx = 0; |
| 2970 uint32_t endIndex = 0; |
| 2971 uint32_t trailingZeroIndex = 0; |
| 2972 |
| 2973 uint8_t collateVal = 0; |
| 2974 |
| 2975 UBool nonZeroValReached = FALSE; |
| 2976 |
| 2977 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j
ust need a temporary place to store my generated CEs. |
| 2978 /* |
| 2979 We parse the source string until we hit a char that's N
OT a digit. |
| 2980 Use this u_charDigitValue. This might be slow because we
have to |
| 2981 handle surrogates... |
| 2982 */ |
| 2983 /* |
| 2984 if (U16_IS_LEAD(ch)){ |
| 2985 if (!collIter_eos(source)) { |
| 2986 backupState(source, &digitState); |
| 2987 UChar trail = getNextNormalizedChar(source); |
| 2988 if(U16_IS_TRAIL(trail)) { |
| 2989 char32 = U16_GET_SUPPLEMENTARY(ch, trail); |
| 2990 } else { |
| 2991 loadState(source, &digitState, TRUE); |
| 2992 char32 = ch; |
| 2993 } |
| 2994 } else { |
| 2995 char32 = ch; |
| 2996 } |
| 2997 } else { |
| 2998 char32 = ch; |
| 2999 } |
| 3000 digVal = u_charDigitValue(char32); |
| 3001 */ |
| 3002 digVal = u_charDigitValue(cp); // if we have arrived here, w
e have |
| 3003 // already processed possible supplementaries that trigered
the digit tag - |
| 3004 // all supplementaries are marked in the UCA. |
| 3005 /* |
| 3006 We pad a zero in front of the first element anyways. Th
is takes |
| 3007 care of the (probably) most common case where people are
sorting things followed |
| 3008 by a single digit |
| 3009 */ |
| 3010 digIndx++; |
| 3011 for(;;){ |
| 3012 // Make sure we have enough space. No longer needed; |
| 3013 // at this point digIndx now has a max value of UCOL_MAX
_DIGITS_FOR_NUMBER |
| 3014 // (it has been pre-incremented) so we just ensure that
numTempBuf is big enough |
| 3015 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). |
| 3016 |
| 3017 // Skipping over leading zeroes. |
| 3018 if (digVal != 0) { |
| 3019 nonZeroValReached = TRUE; |
| 3020 } |
| 3021 if (nonZeroValReached) { |
| 3022 /* |
| 3023 We parse the digit string into base 100 numbers (thi
s fits into a byte). |
| 3024 We only add to the buffer in twos, thus if we are pa
rsing an odd character, |
| 3025 that serves as the 'tens' digit while the if we are
parsing an even one, that |
| 3026 is the 'ones' digit. We dumped the parsed base 100 v
alue (collateVal) into |
| 3027 a buffer. We multiply each collateVal by 2 (to give
us room) and add 5 (to avoid |
| 3028 overlapping magic CE byte values). The last byte we
subtract 1 to ensure it is less |
| 3029 than all the other bytes. |
| 3030 */ |
| 3031 |
| 3032 if (digIndx % 2 == 1){ |
| 3033 collateVal += (uint8_t)digVal; |
| 3034 |
| 3035 // We don't enter the low-order-digit case unles
s we've already seen |
| 3036 // the high order, or for the first digit, which
is always non-zero. |
| 3037 if (collateVal != 0) |
| 3038 trailingZeroIndex = 0; |
| 3039 |
| 3040 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
| 3041 collateVal = 0; |
| 3042 } |
| 3043 else{ |
| 3044 // We drop the collation value into the buffer s
o if we need to do |
| 3045 // a "front patch" we don't have to check to see
if we're hitting the |
| 3046 // last element. |
| 3047 collateVal = (uint8_t)(digVal * 10); |
| 3048 |
| 3049 // Check for trailing zeroes. |
| 3050 if (collateVal == 0) |
| 3051 { |
| 3052 if (!trailingZeroIndex) |
| 3053 trailingZeroIndex = (digIndx/2) + 2; |
| 3054 } |
| 3055 else |
| 3056 trailingZeroIndex = 0; |
| 3057 |
| 3058 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
| 3059 } |
| 3060 digIndx++; |
| 3061 } |
| 3062 |
| 3063 // Get next character. |
| 3064 if (!collIter_eos(source)){ |
| 3065 ch = getNextNormalizedChar(source); |
| 3066 if (U16_IS_LEAD(ch)){ |
| 3067 if (!collIter_eos(source)) { |
| 3068 backupState(source, &digitState); |
| 3069 UChar trail = getNextNormalizedChar(source); |
| 3070 if(U16_IS_TRAIL(trail)) { |
| 3071 char32 = U16_GET_SUPPLEMENTARY(ch, trail
); |
| 3072 } else { |
| 3073 loadState(source, &digitState, TRUE); |
| 3074 char32 = ch; |
| 3075 } |
| 3076 } |
| 3077 } else { |
| 3078 char32 = ch; |
| 3079 } |
| 3080 |
| 3081 if ((digVal = u_charDigitValue(char32)) == -1 || dig
Indx > UCOL_MAX_DIGITS_FOR_NUMBER){ |
| 3082 // Resetting position to point to the next unpro
cessed char. We |
| 3083 // overshot it when doing our test/set for numbe
rs. |
| 3084 if (char32 > 0xFFFF) { // For surrogates. |
| 3085 loadState(source, &digitState, TRUE); |
| 3086 //goBackOne(source); |
| 3087 } |
| 3088 goBackOne(source); |
| 3089 break; |
| 3090 } |
| 3091 } else { |
| 3092 break; |
| 3093 } |
| 3094 } |
| 3095 |
| 3096 if (nonZeroValReached == FALSE){ |
| 3097 digIndx = 2; |
| 3098 numTempBuf[2] = 6; |
| 3099 } |
| 3100 |
| 3101 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx
/2) + 2) ; |
| 3102 if (digIndx % 2 != 0){ |
| 3103 /* |
| 3104 We missed a value. Since digIndx isn't even, stuck too m
any values into the buffer (this is what |
| 3105 we get for padding the first byte with a zero). "Front-p
atch" now by pushing all nybbles forward. |
| 3106 Doing it this way ensures that at least 50% of the time
(statistically speaking) we'll only be doing a |
| 3107 single pass and optimizes for strings with single digits
. I'm just assuming that's the more common case. |
| 3108 */ |
| 3109 |
| 3110 for(i = 2; i < endIndex; i++){ |
| 3111 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10)
* 10) + |
| 3112 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; |
| 3113 } |
| 3114 --digIndx; |
| 3115 } |
| 3116 |
| 3117 // Subtract one off of the last byte. |
| 3118 numTempBuf[endIndex-1] -= 1; |
| 3119 |
| 3120 /* |
| 3121 We want to skip over the first two slots in the buffer. The
first slot |
| 3122 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the |
| 3123 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. |
| 3124 */ |
| 3125 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; |
| 3126 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); |
| 3127 |
| 3128 // Now transfer the collation key to our collIterate struct. |
| 3129 // The total size for our collation key is endIndx bumped up
to the next largest even value divided by two. |
| 3130 //size = ((endIndex+1) & ~1)/2; |
| 3131 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARY
ORDERSHIFT) | //Primary weight |
| 3132 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight |
| 3133 UCOL_BYTE_COMMON; // Tertiary weight. |
| 3134 i = 2; // Reset the index into the buffer. |
| 3135 while(i < endIndex) |
| 3136 { |
| 3137 uint32_t primWeight = numTempBuf[i++] << 8; |
| 3138 if ( i < endIndex) |
| 3139 primWeight |= numTempBuf[i++]; |
| 3140 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; |
| 3141 } |
| 3142 |
| 3143 } else { |
| 3144 // no numeric mode, we'll just switch to whatever we stashed
and continue |
| 3145 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /
* find the offset to expansion table */ |
| 3146 CE = *CEOffset++; |
| 3147 break; |
| 3148 } |
| 3149 return CE; |
| 3150 } |
| 3151 /* various implicits optimization */ |
| 3152 case IMPLICIT_TAG: /* everything that is not defined otherwise */ |
| 3153 /* UCA is filled with these. Tailorings are NOT_FOUND */ |
| 3154 return getImplicit(cp, source); |
| 3155 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ |
| 3156 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl
icit |
| 3157 return getImplicit(cp, source); |
| 3158 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ |
| 3159 { |
| 3160 static const uint32_t |
| 3161 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; |
| 3162 //const uint32_t LCount = 19; |
| 3163 static const uint32_t VCount = 21; |
| 3164 static const uint32_t TCount = 28; |
| 3165 //const uint32_t NCount = VCount * TCount; // 588 |
| 3166 //const uint32_t SCount = LCount * NCount; // 11172 |
| 3167 uint32_t L = ch - SBase; |
| 3168 |
| 3169 // divide into pieces |
| 3170 |
| 3171 uint32_t T = L % TCount; // we do it in this order since some co
mpilers can do % and / in one operation |
| 3172 L /= TCount; |
| 3173 uint32_t V = L % VCount; |
| 3174 L /= VCount; |
| 3175 |
| 3176 // offset them |
| 3177 |
| 3178 L += LBase; |
| 3179 V += VBase; |
| 3180 T += TBase; |
| 3181 |
| 3182 // return the first CE, but first put the rest into the expansio
n buffer |
| 3183 if (!source->coll->image->jamoSpecial) { // FAST PATH |
| 3184 |
| 3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); |
| 3186 if (T != TBase) { |
| 3187 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); |
| 3188 } |
| 3189 |
| 3190 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); |
| 3191 |
| 3192 } else { // Jamo is Special |
| 3193 // Since Hanguls pass the FCD check, it is |
| 3194 // guaranteed that we won't be in |
| 3195 // the normalization buffer if something like this happens |
| 3196 // However, if we are using a uchar iterator and normalizati
on |
| 3197 // is ON, the Hangul that lead us here is going to be in tha
t |
| 3198 // normalization buffer. Here we want to restore the uchar |
| 3199 // iterator state and pull out of the normalization buffer |
| 3200 if(source->iterator != NULL && source->flags & UCOL_ITER_INN
ORMBUF) { |
| 3201 source->flags = source->origFlags; // restore the iterat
or |
| 3202 source->pos = NULL; |
| 3203 } |
| 3204 // Move Jamos into normalization buffer |
| 3205 UChar *buffer = source->writableBuffer.getBuffer(4); |
| 3206 int32_t bufferLength; |
| 3207 buffer[0] = (UChar)L; |
| 3208 buffer[1] = (UChar)V; |
| 3209 if (T != TBase) { |
| 3210 buffer[2] = (UChar)T; |
| 3211 bufferLength = 3; |
| 3212 } else { |
| 3213 bufferLength = 2; |
| 3214 } |
| 3215 source->writableBuffer.releaseBuffer(bufferLength); |
| 3216 |
| 3217 source->fcdPosition = source->pos; // Indicate where
to continue in main input string |
| 3218 // after exhausting the writableBuffer |
| 3219 source->pos = source->writableBuffer.getTerminatedBuffer()
; |
| 3220 source->origFlags = source->flags; |
| 3221 source->flags |= UCOL_ITER_INNORMBUF; |
| 3222 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| 3223 |
| 3224 return(UCOL_IGNORABLE); |
| 3225 } |
| 3226 } |
| 3227 case SURROGATE_TAG: |
| 3228 /* we encountered a leading surrogate. We shall get the CE by using
the following code unit */ |
| 3229 /* two things can happen here: next code point can be a trailing sur
rogate - we will use it */ |
| 3230 /* to retrieve the CE, or it is not a trailing surrogate (or the str
ing is done). In that case */ |
| 3231 /* we treat it like an unassigned code point. */ |
| 3232 { |
| 3233 UChar trail; |
| 3234 collIterateState state; |
| 3235 backupState(source, &state); |
| 3236 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNorma
lizedChar(source))))) { |
| 3237 // we chould have stepped one char forward and it might have
turned that it |
| 3238 // was not a trail surrogate. In that case, we have to backu
p. |
| 3239 loadState(source, &state, TRUE); |
| 3240 return UCOL_NOT_FOUND; |
| 3241 } else { |
| 3242 /* TODO: CE contain the data from the previous CE + the mask
. It should at least be unmasked */ |
| 3243 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF
FF, trail); |
| 3244 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates
in this block, but not this one. |
| 3245 // We need to backup |
| 3246 loadState(source, &state, TRUE); |
| 3247 return CE; |
| 3248 } |
| 3249 // calculate the supplementary code point value, if surrogat
e was not tailored |
| 3250 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U
L)+0xdc00-0x10000)); |
| 3251 } |
| 3252 } |
| 3253 break; |
| 3254 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ |
| 3255 UChar nextChar; |
| 3256 if( source->flags & UCOL_USE_ITERATOR) { |
| 3257 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source
->iterator))) { |
| 3258 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); |
| 3259 source->iterator->next(source->iterator); |
| 3260 return getImplicit(cp, source); |
| 3261 } |
| 3262 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->po
s<source->endp)) && |
| 3263 U_IS_TRAIL((nextChar=*source->pos))) { |
| 3264 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); |
| 3265 source->pos++; |
| 3266 return getImplicit(cp, source); |
| 3267 } |
| 3268 return UCOL_NOT_FOUND; |
| 3269 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ |
| 3270 return UCOL_NOT_FOUND; /* broken surrogate sequence */ |
| 3271 case CHARSET_TAG: |
| 3272 /* not yet implemented */ |
| 3273 /* probably after 1.8 */ |
| 3274 return UCOL_NOT_FOUND; |
| 3275 default: |
| 3276 *status = U_INTERNAL_PROGRAM_ERROR; |
| 3277 CE=0; |
| 3278 break; |
| 3279 } |
| 3280 if (CE <= UCOL_NOT_FOUND) break; |
| 3281 } |
| 3282 return CE; |
| 3283 } |
| 3284 |
| 3285 |
| 3286 /* now uses Mark's getImplicitPrimary code */ |
| 3287 static |
| 3288 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { |
| 3289 uint32_t r = uprv_uca_getImplicitPrimary(cp); |
| 3290 |
| 3291 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; |
| 3292 collationSource->toReturn = collationSource->CEpos; |
| 3293 |
| 3294 // **** doesn't work if using iterator **** |
| 3295 if (collationSource->flags & UCOL_ITER_INNORMBUF) { |
| 3296 collationSource->offsetRepeatCount = 1; |
| 3297 } else { |
| 3298 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->
string); |
| 3299 |
| 3300 UErrorCode errorCode = U_ZERO_ERROR; |
| 3301 collationSource->appendOffset(firstOffset, errorCode); |
| 3302 collationSource->appendOffset(firstOffset + 1, errorCode); |
| 3303 |
| 3304 collationSource->offsetReturn = collationSource->offsetStore - 1; |
| 3305 *(collationSource->offsetBuffer) = firstOffset; |
| 3306 if (collationSource->offsetReturn == collationSource->offsetBuffer) { |
| 3307 collationSource->offsetStore = collationSource->offsetBuffer; |
| 3308 } |
| 3309 } |
| 3310 |
| 3311 return ((r & 0x0000FFFF)<<16) | 0x000000C0; |
| 3312 } |
| 3313 |
| 3314 /** |
| 3315 * This function handles the special CEs like contractions, expansions, |
| 3316 * surrogates, Thai. |
| 3317 * It is called by both getPrevCE |
| 3318 */ |
| 3319 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, |
| 3320 collIterate *source, |
| 3321 UErrorCode *status) |
| 3322 { |
| 3323 const uint32_t *CEOffset = NULL; |
| 3324 UChar *UCharOffset = NULL; |
| 3325 UChar schar; |
| 3326 const UChar *constart = NULL; |
| 3327 uint32_t size; |
| 3328 UChar buffer[UCOL_MAX_BUFFER]; |
| 3329 uint32_t *endCEBuffer; |
| 3330 UChar *strbuffer; |
| 3331 int32_t noChars = 0; |
| 3332 int32_t CECount = 0; |
| 3333 |
| 3334 for(;;) |
| 3335 { |
| 3336 /* the only ces that loops are thai and contractions */ |
| 3337 switch (getCETag(CE)) |
| 3338 { |
| 3339 case NOT_FOUND_TAG: /* this tag always returns */ |
| 3340 return CE; |
| 3341 |
| 3342 case SPEC_PROC_TAG: |
| 3343 { |
| 3344 // Special processing is getting a CE that is preceded by a cert
ain prefix |
| 3345 // Currently this is only needed for optimizing Japanese length
and iteration marks. |
| 3346 // When we encouter a special processing tag, we go backwards an
d try to see if |
| 3347 // we have a match. |
| 3348 // Contraction tables are used - so the whole process is not unl
ike contraction. |
| 3349 // prefix data is stored backwards in the table. |
| 3350 const UChar *UCharOffset; |
| 3351 UChar schar, tchar; |
| 3352 collIterateState prefixState; |
| 3353 backupState(source, &prefixState); |
| 3354 for(;;) { |
| 3355 // This loop will run once per source string character, for
as long as we |
| 3356 // are matching a potential contraction sequence |
| 3357 |
| 3358 // First we position ourselves at the begining of contractio
n sequence |
| 3359 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); |
| 3360 |
| 3361 if (collIter_bos(source)) { |
| 3362 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); |
| 3363 break; |
| 3364 } |
| 3365 schar = getPrevNormalizedChar(source, status); |
| 3366 goBackOne(source); |
| 3367 |
| 3368 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ |
| 3369 UCharOffset++; |
| 3370 } |
| 3371 |
| 3372 if (schar == tchar) { |
| 3373 // Found the source string char in the table. |
| 3374 // Pick up the corresponding CE from the table. |
| 3375 CE = *(coll->contractionCEs + |
| 3376 (UCharOffset - coll->contractionIndex)); |
| 3377 } |
| 3378 else |
| 3379 { |
| 3380 // if there is a completely ignorable code point in the
middle of |
| 3381 // a prefix, we need to act as if it's not there |
| 3382 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-
fdef are set to zero) |
| 3383 // lone surrogates cannot be set to zero as it would bre
ak other processing |
| 3384 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping
, schar); |
| 3385 // it's easy for BMP code points |
| 3386 if(isZeroCE == 0) { |
| 3387 continue; |
| 3388 } else if(U16_IS_SURROGATE(schar)) { |
| 3389 // for supplementary code points, we have to check t
he next one |
| 3390 // situations where we are going to ignore |
| 3391 // 1. beginning of the string: schar is a lone surro
gate |
| 3392 // 2. schar is a lone surrogate |
| 3393 // 3. schar is a trail surrogate in a valid surrogat
e sequence |
| 3394 // that is explicitly set to zero. |
| 3395 if (!collIter_bos(source)) { |
| 3396 UChar lead; |
| 3397 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(
lead = getPrevNormalizedChar(source, status))) { |
| 3398 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp
ing, lead); |
| 3399 if(isSpecial(isZeroCE) && getCETag(isZeroCE)
== SURROGATE_TAG) { |
| 3400 uint32_t finalCE = UTRIE_GET32_FROM_OFFS
ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); |
| 3401 if(finalCE == 0) { |
| 3402 // this is a real, assigned complete
ly ignorable code point |
| 3403 goBackOne(source); |
| 3404 continue; |
| 3405 } |
| 3406 } |
| 3407 } else { |
| 3408 // lone surrogate, treat like unassigned |
| 3409 return UCOL_NOT_FOUND; |
| 3410 } |
| 3411 } else { |
| 3412 // lone surrogate at the beggining, treat like u
nassigned |
| 3413 return UCOL_NOT_FOUND; |
| 3414 } |
| 3415 } |
| 3416 // Source string char was not in the table. |
| 3417 // We have not found the prefix. |
| 3418 CE = *(coll->contractionCEs + |
| 3419 (ContractionStart - coll->contractionIndex)); |
| 3420 } |
| 3421 |
| 3422 if(!isPrefix(CE)) { |
| 3423 // The source string char was in the contraction table,
and the corresponding |
| 3424 // CE is not a prefix CE. We found the prefix, break |
| 3425 // out of loop, this CE will end up being returned. T
his is the normal |
| 3426 // way out of prefix handling when the source actually
contained |
| 3427 // the prefix. |
| 3428 break; |
| 3429 } |
| 3430 } |
| 3431 loadState(source, &prefixState, TRUE); |
| 3432 break; |
| 3433 } |
| 3434 |
| 3435 case CONTRACTION_TAG: { |
| 3436 /* to ensure that the backwards and forwards iteration matches, we |
| 3437 take the current region of most possible match and pass it through |
| 3438 the forward iteration. this will ensure that the obstinate problem o
f |
| 3439 overlapping contractions will not occur. |
| 3440 */ |
| 3441 schar = peekCodeUnit(source, 0); |
| 3442 constart = (UChar *)coll->image + getContractOffset(CE); |
| 3443 if (isAtStartPrevIterate(source) |
| 3444 /* commented away contraction end checks after adding the checks |
| 3445 in getPrevCE */) { |
| 3446 /* start of string or this is not the end of any contraction
*/ |
| 3447 CE = *(coll->contractionCEs + |
| 3448 (constart - coll->contractionIndex)); |
| 3449 break; |
| 3450 } |
| 3451 strbuffer = buffer; |
| 3452 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); |
| 3453 *(UCharOffset --) = 0; |
| 3454 noChars = 0; |
| 3455 // have to swap thai characters |
| 3456 while (ucol_unsafeCP(schar, coll)) { |
| 3457 *(UCharOffset) = schar; |
| 3458 noChars++; |
| 3459 UCharOffset --; |
| 3460 schar = getPrevNormalizedChar(source, status); |
| 3461 goBackOne(source); |
| 3462 // TODO: when we exhaust the contraction buffer, |
| 3463 // it needs to get reallocated. The problem is |
| 3464 // that the size depends on the string which is |
| 3465 // not iterated over. However, since we're travelling |
| 3466 // backwards, we already had to set the iterator at |
| 3467 // the end - so we might as well know where we are? |
| 3468 if (UCharOffset + 1 == buffer) { |
| 3469 /* we have exhausted the buffer */ |
| 3470 int32_t newsize = 0; |
| 3471 if(source->pos) { // actually dealing with a position |
| 3472 newsize = (int32_t)(source->pos - source->string + 1); |
| 3473 } else { // iterator |
| 3474 newsize = 4 * UCOL_MAX_BUFFER; |
| 3475 } |
| 3476 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * |
| 3477 (newsize + UCOL_MAX_BUFFER)); |
| 3478 /* test for NULL */ |
| 3479 if (strbuffer == NULL) { |
| 3480 *status = U_MEMORY_ALLOCATION_ERROR; |
| 3481 return UCOL_NO_MORE_CES; |
| 3482 } |
| 3483 UCharOffset = strbuffer + newsize; |
| 3484 uprv_memcpy(UCharOffset, buffer, |
| 3485 UCOL_MAX_BUFFER * sizeof(UChar)); |
| 3486 UCharOffset --; |
| 3487 } |
| 3488 if ((source->pos && (source->pos == source->string || |
| 3489 ((source->flags & UCOL_ITER_INNORMBUF) && |
| 3490 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) |
| 3491 || (source->iterator && !source->iterator->hasPrevious(sourc
e->iterator))) { |
| 3492 break; |
| 3493 } |
| 3494 } |
| 3495 /* adds the initial base character to the string */ |
| 3496 *(UCharOffset) = schar; |
| 3497 noChars++; |
| 3498 |
| 3499 int32_t offsetBias; |
| 3500 |
| 3501 // **** doesn't work if using iterator **** |
| 3502 if (source->flags & UCOL_ITER_INNORMBUF) { |
| 3503 offsetBias = -1; |
| 3504 } else { |
| 3505 offsetBias = (int32_t)(source->pos - source->string); |
| 3506 } |
| 3507 |
| 3508 /* a new collIterate is used to simplify things, since using the cur
rent |
| 3509 collIterate will mean that the forward and backwards iteration will |
| 3510 share and change the same buffers. we don't want to get into that. *
/ |
| 3511 collIterate temp; |
| 3512 int32_t rawOffset; |
| 3513 |
| 3514 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); |
| 3515 if(U_FAILURE(*status)) { |
| 3516 return UCOL_NULLORDER; |
| 3517 } |
| 3518 temp.flags &= ~UCOL_ITER_NORM; |
| 3519 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; |
| 3520 |
| 3521 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z
ero? |
| 3522 CE = ucol_IGetNextCE(coll, &temp, status); |
| 3523 |
| 3524 if (source->extendCEs) { |
| 3525 endCEBuffer = source->extendCEs + source->extendCEsSize; |
| 3526 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u
int32_t)); |
| 3527 } else { |
| 3528 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; |
| 3529 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_
t)); |
| 3530 } |
| 3531 |
| 3532 while (CE != UCOL_NO_MORE_CES) { |
| 3533 *(source->CEpos ++) = CE; |
| 3534 |
| 3535 if (offsetBias >= 0) { |
| 3536 source->appendOffset(rawOffset + offsetBias, *status); |
| 3537 } |
| 3538 |
| 3539 CECount++; |
| 3540 if (source->CEpos == endCEBuffer) { |
| 3541 /* ran out of CE space, reallocate to new buffer. |
| 3542 If reallocation fails, reset pointers and bail out, |
| 3543 there's no guarantee of the right character position after |
| 3544 this bail*/ |
| 3545 if (!increaseCEsCapacity(source)) { |
| 3546 *status = U_MEMORY_ALLOCATION_ERROR; |
| 3547 break; |
| 3548 } |
| 3549 |
| 3550 endCEBuffer = source->extendCEs + source->extendCEsSize; |
| 3551 } |
| 3552 |
| 3553 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { |
| 3554 rawOffset = (int32_t)(temp.fcdPosition - temp.string); |
| 3555 } else { |
| 3556 rawOffset = (int32_t)(temp.pos - temp.string); |
| 3557 } |
| 3558 |
| 3559 CE = ucol_IGetNextCE(coll, &temp, status); |
| 3560 } |
| 3561 |
| 3562 if (strbuffer != buffer) { |
| 3563 uprv_free(strbuffer); |
| 3564 } |
| 3565 if (U_FAILURE(*status)) { |
| 3566 return (uint32_t)UCOL_NULLORDER; |
| 3567 } |
| 3568 |
| 3569 if (source->offsetRepeatValue != 0) { |
| 3570 if (CECount > noChars) { |
| 3571 source->offsetRepeatCount += temp.offsetRepeatCount; |
| 3572 } else { |
| 3573 // **** does this really skip the right offsets? **** |
| 3574 source->offsetReturn -= (noChars - CECount); |
| 3575 } |
| 3576 } |
| 3577 |
| 3578 if (offsetBias >= 0) { |
| 3579 source->offsetReturn = source->offsetStore - 1; |
| 3580 if (source->offsetReturn == source->offsetBuffer) { |
| 3581 source->offsetStore = source->offsetBuffer; |
| 3582 } |
| 3583 } |
| 3584 |
| 3585 source->toReturn = source->CEpos - 1; |
| 3586 if (source->toReturn == source->CEs) { |
| 3587 source->CEpos = source->CEs; |
| 3588 } |
| 3589 |
| 3590 return *(source->toReturn); |
| 3591 } |
| 3592 case LONG_PRIMARY_TAG: |
| 3593 { |
| 3594 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON; |
| 3595 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; |
| 3596 source->toReturn = source->CEpos - 1; |
| 3597 |
| 3598 if (source->flags & UCOL_ITER_INNORMBUF) { |
| 3599 source->offsetRepeatCount = 1; |
| 3600 } else { |
| 3601 int32_t firstOffset = (int32_t)(source->pos - source->string
); |
| 3602 |
| 3603 source->appendOffset(firstOffset, *status); |
| 3604 source->appendOffset(firstOffset + 1, *status); |
| 3605 |
| 3606 source->offsetReturn = source->offsetStore - 1; |
| 3607 *(source->offsetBuffer) = firstOffset; |
| 3608 if (source->offsetReturn == source->offsetBuffer) { |
| 3609 source->offsetStore = source->offsetBuffer; |
| 3610 } |
| 3611 } |
| 3612 |
| 3613 |
| 3614 return *(source->toReturn); |
| 3615 } |
| 3616 |
| 3617 case EXPANSION_TAG: /* this tag always returns */ |
| 3618 { |
| 3619 /* |
| 3620 This should handle expansion. |
| 3621 NOTE: we can encounter both continuations and expansions in an expan
sion! |
| 3622 I have to decide where continuations are going to be dealt with |
| 3623 */ |
| 3624 int32_t firstOffset = (int32_t)(source->pos - source->string); |
| 3625 |
| 3626 // **** doesn't work if using iterator **** |
| 3627 if (source->offsetReturn != NULL) { |
| 3628 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet
urn == source->offsetBuffer) { |
| 3629 source->offsetStore = source->offsetBuffer; |
| 3630 }else { |
| 3631 firstOffset = -1; |
| 3632 } |
| 3633 } |
| 3634 |
| 3635 /* find the offset to expansion table */ |
| 3636 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); |
| 3637 size = getExpansionCount(CE); |
| 3638 if (size != 0) { |
| 3639 /* |
| 3640 if there are less than 16 elements in expansion, we don't termin
ate |
| 3641 */ |
| 3642 uint32_t count; |
| 3643 |
| 3644 for (count = 0; count < size; count++) { |
| 3645 *(source->CEpos ++) = *CEOffset++; |
| 3646 |
| 3647 if (firstOffset >= 0) { |
| 3648 source->appendOffset(firstOffset + 1, *status); |
| 3649 } |
| 3650 } |
| 3651 } else { |
| 3652 /* else, we do */ |
| 3653 while (*CEOffset != 0) { |
| 3654 *(source->CEpos ++) = *CEOffset ++; |
| 3655 |
| 3656 if (firstOffset >= 0) { |
| 3657 source->appendOffset(firstOffset + 1, *status); |
| 3658 } |
| 3659 } |
| 3660 } |
| 3661 |
| 3662 if (firstOffset >= 0) { |
| 3663 source->offsetReturn = source->offsetStore - 1; |
| 3664 *(source->offsetBuffer) = firstOffset; |
| 3665 if (source->offsetReturn == source->offsetBuffer) { |
| 3666 source->offsetStore = source->offsetBuffer; |
| 3667 } |
| 3668 } else { |
| 3669 source->offsetRepeatCount += size - 1; |
| 3670 } |
| 3671 |
| 3672 source->toReturn = source->CEpos - 1; |
| 3673 // in case of one element expansion, we |
| 3674 // want to immediately return CEpos |
| 3675 if(source->toReturn == source->CEs) { |
| 3676 source->CEpos = source->CEs; |
| 3677 } |
| 3678 |
| 3679 return *(source->toReturn); |
| 3680 } |
| 3681 |
| 3682 case DIGIT_TAG: |
| 3683 { |
| 3684 /* |
| 3685 We do a check to see if we want to collate digits as numbers; if
so we generate |
| 3686 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. |
| 3687 */ |
| 3688 uint32_t i; /* general counter */ |
| 3689 |
| 3690 if (source->coll->numericCollation == UCOL_ON){ |
| 3691 uint32_t digIndx = 0; |
| 3692 uint32_t endIndex = 0; |
| 3693 uint32_t leadingZeroIndex = 0; |
| 3694 uint32_t trailingZeroCount = 0; |
| 3695 |
| 3696 uint8_t collateVal = 0; |
| 3697 |
| 3698 UBool nonZeroValReached = FALSE; |
| 3699 |
| 3700 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j
ust need a temporary place to store my generated CEs. |
| 3701 /* |
| 3702 We parse the source string until we hit a char that's NOT a
digit. |
| 3703 Use this u_charDigitValue. This might be slow because we hav
e to |
| 3704 handle surrogates... |
| 3705 */ |
| 3706 /* |
| 3707 We need to break up the digit string into collection element
s of UCOL_MAX_DIGITS_FOR_NUMBER or less, |
| 3708 with any chunks smaller than that being on the right end of
the digit string - i.e. the first collation |
| 3709 element we process when going backward. To determine how lon
g that chunk might be, we may need to make |
| 3710 two passes through the loop that collects digits - one to se
e how long the string is (and how much is |
| 3711 leading zeros) to determine the length of that right-hand ch
unk, and a second (if the whole string has |
| 3712 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits
) to actually process that collation |
| 3713 element chunk after resetting the state to the initialState
at the right side of the digit string. |
| 3714 */ |
| 3715 uint32_t ceLimit = 0; |
| 3716 UChar initial_ch = ch; |
| 3717 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; |
| 3718 backupState(source, &initialState); |
| 3719 |
| 3720 for(;;) { |
| 3721 collIterateState state = {0,0,0,0,0,0,0,0,0}; |
| 3722 UChar32 char32 = 0; |
| 3723 int32_t digVal = 0; |
| 3724 |
| 3725 if (U16_IS_TRAIL (ch)) { |
| 3726 if (!collIter_bos(source)){ |
| 3727 UChar lead = getPrevNormalizedChar(source, statu
s); |
| 3728 if(U16_IS_LEAD(lead)) { |
| 3729 char32 = U16_GET_SUPPLEMENTARY(lead,ch); |
| 3730 goBackOne(source); |
| 3731 } else { |
| 3732 char32 = ch; |
| 3733 } |
| 3734 } else { |
| 3735 char32 = ch; |
| 3736 } |
| 3737 } else { |
| 3738 char32 = ch; |
| 3739 } |
| 3740 digVal = u_charDigitValue(char32); |
| 3741 |
| 3742 for(;;) { |
| 3743 // Make sure we have enough space. No longer needed; |
| 3744 // at this point the largest value of digIndx when w
e need to save data in numTempBuf |
| 3745 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-
incremented) so we just ensure |
| 3746 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO
R_NUMBER/2 + 2). |
| 3747 |
| 3748 // Skip over trailing zeroes, and keep a count of th
em. |
| 3749 if (digVal != 0) |
| 3750 nonZeroValReached = TRUE; |
| 3751 |
| 3752 if (nonZeroValReached) { |
| 3753 /* |
| 3754 We parse the digit string into base 100 numbers
(this fits into a byte). |
| 3755 We only add to the buffer in twos, thus if we ar
e parsing an odd character, |
| 3756 that serves as the 'tens' digit while the if we
are parsing an even one, that |
| 3757 is the 'ones' digit. We dumped the parsed base 1
00 value (collateVal) into |
| 3758 a buffer. We multiply each collateVal by 2 (to g
ive us room) and add 5 (to avoid |
| 3759 overlapping magic CE byte values). The last byte
we subtract 1 to ensure it is less |
| 3760 than all the other bytes. |
| 3761 |
| 3762 Since we're doing in this reverse we want to put
the first digit encountered into the |
| 3763 ones place and the second digit encountered into
the tens place. |
| 3764 */ |
| 3765 |
| 3766 if ((digIndx + trailingZeroCount) % 2 == 1) { |
| 3767 // High-order digit case (tens place) |
| 3768 collateVal += (uint8_t)(digVal * 10); |
| 3769 |
| 3770 // We cannot set leadingZeroIndex unless it
has been set for the |
| 3771 // low-order digit. Therefore, all we can do
for the high-order |
| 3772 // digit is turn it off, never on. |
| 3773 // The only time we will have a high digit w
ithout a low is for |
| 3774 // the very first non-zero digit, so no zero
check is necessary. |
| 3775 if (collateVal != 0) |
| 3776 leadingZeroIndex = 0; |
| 3777 |
| 3778 // The first pass through, digIndx may excee
d the limit, but in that case |
| 3779 // we no longer care about numTempBuf conten
ts since they will be discarded |
| 3780 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER )
{ |
| 3781 numTempBuf[(digIndx/2) + 2] = collateVal
*2 + 6; |
| 3782 } |
| 3783 collateVal = 0; |
| 3784 } else { |
| 3785 // Low-order digit case (ones place) |
| 3786 collateVal = (uint8_t)digVal; |
| 3787 |
| 3788 // Check for leading zeroes. |
| 3789 if (collateVal == 0) { |
| 3790 if (!leadingZeroIndex) |
| 3791 leadingZeroIndex = (digIndx/2) + 2; |
| 3792 } else |
| 3793 leadingZeroIndex = 0; |
| 3794 |
| 3795 // No need to write to buffer; the case of a
last odd digit |
| 3796 // is handled below. |
| 3797 } |
| 3798 ++digIndx; |
| 3799 } else |
| 3800 ++trailingZeroCount; |
| 3801 |
| 3802 if (!collIter_bos(source)) { |
| 3803 ch = getPrevNormalizedChar(source, status); |
| 3804 //goBackOne(source); |
| 3805 if (U16_IS_TRAIL(ch)) { |
| 3806 backupState(source, &state); |
| 3807 if (!collIter_bos(source)) { |
| 3808 goBackOne(source); |
| 3809 UChar lead = getPrevNormalizedChar(sourc
e, status); |
| 3810 |
| 3811 if(U16_IS_LEAD(lead)) { |
| 3812 char32 = U16_GET_SUPPLEMENTARY(lead,
ch); |
| 3813 } else { |
| 3814 loadState(source, &state, FALSE); |
| 3815 char32 = ch; |
| 3816 } |
| 3817 } |
| 3818 } else |
| 3819 char32 = ch; |
| 3820 |
| 3821 if ((digVal = u_charDigitValue(char32)) == -1 ||
(ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { |
| 3822 if (char32 > 0xFFFF) {// For surrogates. |
| 3823 loadState(source, &state, FALSE); |
| 3824 } |
| 3825 // Don't need to "reverse" the goBackOne cal
l, |
| 3826 // as this points to the next position to pr
ocess.. |
| 3827 //if (char32 > 0xFFFF) // For surrogates. |
| 3828 //getNextNormalizedChar(source); |
| 3829 break; |
| 3830 } |
| 3831 |
| 3832 goBackOne(source); |
| 3833 }else |
| 3834 break; |
| 3835 } |
| 3836 |
| 3837 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N
UMBER) { |
| 3838 // our collation element is not too big, go ahead an
d finish with it |
| 3839 break; |
| 3840 } |
| 3841 // our digit string is too long for a collation element; |
| 3842 // set the limit for it, reset the state and begin again |
| 3843 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT
S_FOR_NUMBER; |
| 3844 if ( ceLimit == 0 ) { |
| 3845 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; |
| 3846 } |
| 3847 ch = initial_ch; |
| 3848 loadState(source, &initialState, FALSE); |
| 3849 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun
t = 0; |
| 3850 collateVal = 0; |
| 3851 nonZeroValReached = FALSE; |
| 3852 } |
| 3853 |
| 3854 if (! nonZeroValReached) { |
| 3855 digIndx = 2; |
| 3856 trailingZeroCount = 0; |
| 3857 numTempBuf[2] = 6; |
| 3858 } |
| 3859 |
| 3860 if ((digIndx + trailingZeroCount) % 2 != 0) { |
| 3861 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; |
| 3862 digIndx += 1; // The implicit leading zero |
| 3863 } |
| 3864 if (trailingZeroCount % 2 != 0) { |
| 3865 // We had to consume one trailing zero for the low digit |
| 3866 // of the least significant byte |
| 3867 digIndx += 1; // The trailing zero not in the expo
nent |
| 3868 trailingZeroCount -= 1; |
| 3869 } |
| 3870 |
| 3871 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2
) + 2) ; |
| 3872 |
| 3873 // Subtract one off of the last byte. Really the first byte
here, but it's reversed... |
| 3874 numTempBuf[2] -= 1; |
| 3875 |
| 3876 /* |
| 3877 We want to skip over the first two slots in the buffer. The
first slot |
| 3878 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the |
| 3879 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. |
| 3880 The exponent must be adjusted by the number of leading zeroe
s, and the number of |
| 3881 trailing zeroes. |
| 3882 */ |
| 3883 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; |
| 3884 uint32_t exponent = (digIndx+trailingZeroCount)/2; |
| 3885 if (leadingZeroIndex) |
| 3886 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); |
| 3887 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); |
| 3888 |
| 3889 // Now transfer the collation key to our collIterate struct. |
| 3890 // The total size for our collation key is half of endIndex,
rounded up. |
| 3891 int32_t size = (endIndex+1)/2; |
| 3892 if(!ensureCEsCapacity(source, size)) { |
| 3893 return UCOL_NULLORDER; |
| 3894 } |
| 3895 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1])
<< UCOL_PRIMARYORDERSHIFT) | //Primary weight |
| 3896 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight |
| 3897 UCOL_BYTE_COMMON; // Tertiary weight. |
| 3898 i = endIndex - 1; // Reset the index into the buffer. |
| 3899 while(i >= 2) { |
| 3900 uint32_t primWeight = numTempBuf[i--] << 8; |
| 3901 if ( i >= 2) |
| 3902 primWeight |= numTempBuf[i--]; |
| 3903 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; |
| 3904 } |
| 3905 |
| 3906 source->toReturn = source->CEpos -1; |
| 3907 return *(source->toReturn); |
| 3908 } else { |
| 3909 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); |
| 3910 CE = *(CEOffset++); |
| 3911 break; |
| 3912 } |
| 3913 } |
| 3914 |
| 3915 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ |
| 3916 { |
| 3917 static const uint32_t |
| 3918 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; |
| 3919 //const uint32_t LCount = 19; |
| 3920 static const uint32_t VCount = 21; |
| 3921 static const uint32_t TCount = 28; |
| 3922 //const uint32_t NCount = VCount * TCount; /* 588 */ |
| 3923 //const uint32_t SCount = LCount * NCount; /* 11172 */ |
| 3924 |
| 3925 uint32_t L = ch - SBase; |
| 3926 /* |
| 3927 divide into pieces. |
| 3928 we do it in this order since some compilers can do % and / in on
e |
| 3929 operation |
| 3930 */ |
| 3931 uint32_t T = L % TCount; |
| 3932 L /= TCount; |
| 3933 uint32_t V = L % VCount; |
| 3934 L /= VCount; |
| 3935 |
| 3936 /* offset them */ |
| 3937 L += LBase; |
| 3938 V += VBase; |
| 3939 T += TBase; |
| 3940 |
| 3941 int32_t firstOffset = (int32_t)(source->pos - source->string); |
| 3942 source->appendOffset(firstOffset, *status); |
| 3943 |
| 3944 /* |
| 3945 * return the first CE, but first put the rest into the expansio
n buffer |
| 3946 */ |
| 3947 if (!source->coll->image->jamoSpecial) { |
| 3948 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L
); |
| 3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); |
| 3950 source->appendOffset(firstOffset + 1, *status); |
| 3951 |
| 3952 if (T != TBase) { |
| 3953 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); |
| 3954 source->appendOffset(firstOffset + 1, *status); |
| 3955 } |
| 3956 |
| 3957 source->toReturn = source->CEpos - 1; |
| 3958 |
| 3959 source->offsetReturn = source->offsetStore - 1; |
| 3960 if (source->offsetReturn == source->offsetBuffer) { |
| 3961 source->offsetStore = source->offsetBuffer; |
| 3962 } |
| 3963 |
| 3964 return *(source->toReturn); |
| 3965 } else { |
| 3966 // Since Hanguls pass the FCD check, it is |
| 3967 // guaranteed that we won't be in |
| 3968 // the normalization buffer if something like this happens |
| 3969 // Move Jamos into normalization buffer |
| 3970 /* |
| 3971 Move the Jamos into the |
| 3972 normalization buffer |
| 3973 */ |
| 3974 UChar *tempbuffer = source->writableBuffer.getBuffer(5); |
| 3975 int32_t tempbufferLength; |
| 3976 tempbuffer[0] = 0; |
| 3977 tempbuffer[1] = (UChar)L; |
| 3978 tempbuffer[2] = (UChar)V; |
| 3979 if (T != TBase) { |
| 3980 tempbuffer[3] = (UChar)T; |
| 3981 tempbufferLength = 4; |
| 3982 } else { |
| 3983 tempbufferLength = 3; |
| 3984 } |
| 3985 source->writableBuffer.releaseBuffer(tempbufferLength); |
| 3986 |
| 3987 /* |
| 3988 Indicate where to continue in main input string after exhaus
ting |
| 3989 the writableBuffer |
| 3990 */ |
| 3991 if (source->pos == source->string) { |
| 3992 source->fcdPosition = NULL; |
| 3993 } else { |
| 3994 source->fcdPosition = source->pos-1; |
| 3995 } |
| 3996 |
| 3997 source->pos = source->writableBuffer.getTermin
atedBuffer() + tempbufferLength; |
| 3998 source->origFlags = source->flags; |
| 3999 source->flags |= UCOL_ITER_INNORMBUF; |
| 4000 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HAS
LEN); |
| 4001 |
| 4002 return(UCOL_IGNORABLE); |
| 4003 } |
| 4004 } |
| 4005 |
| 4006 case IMPLICIT_TAG: /* everything that is not defined otherwise */ |
| 4007 return getPrevImplicit(ch, source); |
| 4008 |
| 4009 // TODO: Remove CJK implicits as they are handled by the getImplicit
Primary function |
| 4010 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ |
| 4011 return getPrevImplicit(ch, source); |
| 4012 |
| 4013 case SURROGATE_TAG: /* This is a surrogate pair */ |
| 4014 /* essentially an engaged lead surrogate. */ |
| 4015 /* if you have encountered it here, it means that a */ |
| 4016 /* broken sequence was encountered and this is an error */ |
| 4017 return UCOL_NOT_FOUND; |
| 4018 |
| 4019 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ |
| 4020 return UCOL_NOT_FOUND; /* broken surrogate sequence */ |
| 4021 |
| 4022 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ |
| 4023 { |
| 4024 UChar32 cp = 0; |
| 4025 UChar prevChar; |
| 4026 const UChar *prev; |
| 4027 if (isAtStartPrevIterate(source)) { |
| 4028 /* we are at the start of the string, wrong place to be at *
/ |
| 4029 return UCOL_NOT_FOUND; |
| 4030 } |
| 4031 if (source->pos != source->writableBuffer.getBuffer()) { |
| 4032 prev = source->pos - 1; |
| 4033 } else { |
| 4034 prev = source->fcdPosition; |
| 4035 } |
| 4036 prevChar = *prev; |
| 4037 |
| 4038 /* Handles Han and Supplementary characters here.*/ |
| 4039 if (U16_IS_LEAD(prevChar)) { |
| 4040 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<
10UL)+0xdc00-0x10000)); |
| 4041 source->pos = prev; |
| 4042 } else { |
| 4043 return UCOL_NOT_FOUND; /* like unassigned */ |
| 4044 } |
| 4045 |
| 4046 return getPrevImplicit(cp, source); |
| 4047 } |
| 4048 |
| 4049 /* UCA is filled with these. Tailorings are NOT_FOUND */ |
| 4050 /* not yet implemented */ |
| 4051 case CHARSET_TAG: /* this tag always returns */ |
| 4052 /* probably after 1.8 */ |
| 4053 return UCOL_NOT_FOUND; |
| 4054 |
| 4055 default: /* this tag always returns */ |
| 4056 *status = U_INTERNAL_PROGRAM_ERROR; |
| 4057 CE=0; |
| 4058 break; |
| 4059 } |
| 4060 |
| 4061 if (CE <= UCOL_NOT_FOUND) { |
| 4062 break; |
| 4063 } |
| 4064 } |
| 4065 |
| 4066 return CE; |
| 4067 } |
| 4068 |
| 4069 /* This should really be a macro */ |
| 4070 /* However, it is used only when stack buffers are not sufficiently big, and the
n we're messed up performance wise */ |
| 4071 /* anyway */ |
| 4072 static |
| 4073 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *sec
ond, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { |
| 4074 #ifdef UCOL_DEBUG |
| 4075 fprintf(stderr, "."); |
| 4076 #endif |
| 4077 uint8_t *newStart = NULL; |
| 4078 uint32_t offset = (uint32_t)(*secondaries-secStart); |
| 4079 |
| 4080 if(secStart==second) { |
| 4081 newStart=(uint8_t*)uprv_malloc(newSize); |
| 4082 if(newStart==NULL) { |
| 4083 *status = U_MEMORY_ALLOCATION_ERROR; |
| 4084 return NULL; |
| 4085 } |
| 4086 uprv_memcpy(newStart, secStart, *secondaries-secStart); |
| 4087 } else { |
| 4088 newStart=(uint8_t*)uprv_realloc(secStart, newSize); |
| 4089 if(newStart==NULL) { |
| 4090 *status = U_MEMORY_ALLOCATION_ERROR; |
| 4091 /* Since we're reallocating, return original reference so we don't l
oose it. */ |
| 4092 return secStart; |
| 4093 } |
| 4094 } |
| 4095 *secondaries=newStart+offset; |
| 4096 *secSize=newSize; |
| 4097 return newStart; |
| 4098 } |
| 4099 |
| 4100 |
| 4101 /* This should really be a macro
*/ |
| 4102 /* This function is used to reverse parts of a buffer. We need this operation wh
en doing continuation */ |
| 4103 /* secondaries in French
*/ |
| 4104 /* |
| 4105 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { |
| 4106 uint8_t temp; |
| 4107 while(start<end) { |
| 4108 temp = *start; |
| 4109 *start++ = *end; |
| 4110 *end-- = temp; |
| 4111 } |
| 4112 } |
| 4113 */ |
| 4114 |
/*
 * Reverses, in place, the elements between two TYPE pointers (both inclusive).
 * start and end must be modifiable pointer lvalues: the macro advances start
 * and retreats end until they meet or cross, and leaves them there.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE swapped_; \
    for(; (start)<(end); ++(start), --(end)) { \
        swapped_ = *(start); \
        *(start) = *(end); \
        *(end) = swapped_; \
    } \
}
| 4123 |
| 4124 /****************************************************************************/ |
| 4125 /* Following are the sortkey generation functions */ |
| 4126 /* */ |
| 4127 /****************************************************************************/ |
| 4128 |
| 4129 /** |
| 4130 * Merge two sort keys. |
| 4131 * This is useful, for example, to combine sort keys from first and last names |
| 4132 * to sort such pairs. |
| 4133 * Merged sort keys consider on each collation level the first part first entire
ly, |
| 4134 * then the second one. |
| 4135 * It is possible to merge multiple sort keys by consecutively merging |
| 4136 * another one with the intermediate result. |
| 4137 * |
| 4138 * The length of the merge result is the sum of the lengths of the input sort ke
ys |
| 4139 * minus 1. |
| 4140 * |
| 4141 * @param src1 the first sort key |
| 4142 * @param src1Length the length of the first sort key, including the zero byte a
t the end; |
| 4143 * can be -1 if the function is to find the length |
| 4144 * @param src2 the second sort key |
| 4145 * @param src2Length the length of the second sort key, including the zero byte
at the end; |
| 4146 * can be -1 if the function is to find the length |
| 4147 * @param dest the buffer where the merged sort key is written, |
| 4148 * can be NULL if destCapacity==0 |
| 4149 * @param destCapacity the number of bytes in the dest buffer |
| 4150 * @return the length of the merged sort key, src1Length+src2Length-1; |
| 4151 * can be larger than destCapacity, or 0 if an error occurs (only for il
legal arguments), |
| 4152 * in which cases the contents of dest is undefined |
| 4153 * |
| 4154 * @draft |
| 4155 */ |
| 4156 U_CAPI int32_t U_EXPORT2 |
| 4157 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, |
| 4158 const uint8_t *src2, int32_t src2Length, |
| 4159 uint8_t *dest, int32_t destCapacity) { |
| 4160 int32_t destLength; |
| 4161 uint8_t b; |
| 4162 |
| 4163 /* check arguments */ |
| 4164 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[sr
c1Length-1]!=0) || |
| 4165 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[sr
c2Length-1]!=0) || |
| 4166 destCapacity<0 || (destCapacity>0 && dest==NULL) |
| 4167 ) { |
| 4168 /* error, attempt to write a zero byte and return 0 */ |
| 4169 if(dest!=NULL && destCapacity>0) { |
| 4170 *dest=0; |
| 4171 } |
| 4172 return 0; |
| 4173 } |
| 4174 |
| 4175 /* check lengths and capacity */ |
| 4176 if(src1Length<0) { |
| 4177 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; |
| 4178 } |
| 4179 if(src2Length<0) { |
| 4180 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; |
| 4181 } |
| 4182 |
| 4183 destLength=src1Length+src2Length-1; |
| 4184 if(destLength>destCapacity) { |
| 4185 /* the merged sort key does not fit into the destination */ |
| 4186 return destLength; |
| 4187 } |
| 4188 |
| 4189 /* merge the sort keys with the same number of levels */ |
| 4190 while(*src1!=0 && *src2!=0) { /* while both have another level */ |
| 4191 /* copy level from src1 not including 00 or 01 */ |
| 4192 while((b=*src1)>=2) { |
| 4193 ++src1; |
| 4194 *dest++=b; |
| 4195 } |
| 4196 |
| 4197 /* add a 02 merge separator */ |
| 4198 *dest++=2; |
| 4199 |
| 4200 /* copy level from src2 not including 00 or 01 */ |
| 4201 while((b=*src2)>=2) { |
| 4202 ++src2; |
| 4203 *dest++=b; |
| 4204 } |
| 4205 |
| 4206 /* if both sort keys have another level, then add a 01 level separator a
nd continue */ |
| 4207 if(*src1==1 && *src2==1) { |
| 4208 ++src1; |
| 4209 ++src2; |
| 4210 *dest++=1; |
| 4211 } |
| 4212 } |
| 4213 |
| 4214 /* |
| 4215 * here, at least one sort key is finished now, but the other one |
| 4216 * might have some contents left from containing more levels; |
| 4217 * that contents is just appended to the result |
| 4218 */ |
| 4219 if(*src1!=0) { |
| 4220 /* src1 is not finished, therefore *src2==0, and src1 is appended */ |
| 4221 src2=src1; |
| 4222 } |
| 4223 /* append src2, "the other, unfinished sort key" */ |
| 4224 uprv_strcpy((char *)dest, (const char *)src2); |
| 4225 |
| 4226 /* trust that neither sort key contained illegally embedded zero bytes */ |
| 4227 return destLength; |
| 4228 } |
| 4229 |
| 4230 /* sortkey API */ |
| 4231 U_CAPI int32_t U_EXPORT2 |
| 4232 ucol_getSortKey(const UCollator *coll, |
| 4233 const UChar *source, |
| 4234 int32_t sourceLength, |
| 4235 uint8_t *result, |
| 4236 int32_t resultLength) |
| 4237 { |
| 4238 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); |
| 4239 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
| 4240 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour
ce, |
| 4241 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt
h)); |
| 4242 } |
| 4243 |
| 4244 UErrorCode status = U_ZERO_ERROR; |
| 4245 int32_t keySize = 0; |
| 4246 |
| 4247 if(source != NULL) { |
| 4248 // source == NULL is actually an error situation, but we would need to |
| 4249 // have an error code to return it. Until we introduce a new |
| 4250 // API, it stays like this |
| 4251 |
| 4252 /* this uses the function pointer that is set in updateinternalstate */ |
| 4253 /* currently, there are two funcs: */ |
| 4254 /*ucol_calcSortKey(...);*/ |
| 4255 /*ucol_calcSortKeySimpleTertiary(...);*/ |
| 4256 |
| 4257 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLe
ngth, FALSE, &status); |
| 4258 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result &
& resultLength > 0) { |
| 4259 // That's not good. Something unusual happened. |
| 4260 // We don't know how much we initialized before we failed. |
| 4261 // NULL terminate for safety. |
| 4262 // We have no way say that we have generated a partial sort key. |
| 4263 //result[0] = 0; |
| 4264 //keySize = 0; |
| 4265 //} |
| 4266 } |
| 4267 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); |
| 4268 UTRACE_EXIT_STATUS(status); |
| 4269 return keySize; |
| 4270 } |
| 4271 |
| 4272 /* this function is called by the C++ API for sortkey generation */ |
| 4273 U_CFUNC int32_t |
| 4274 ucol_getSortKeyWithAllocation(const UCollator *coll, |
| 4275 const UChar *source, int32_t sourceLength, |
| 4276 uint8_t **pResult, |
| 4277 UErrorCode *pErrorCode) { |
| 4278 *pResult = 0; |
| 4279 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pError
Code); |
| 4280 } |
| 4281 |
| 4282 #define UCOL_FSEC_BUF_SIZE 256 |
| 4283 |
| 4284 // Is this primary weight compressible? |
| 4285 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). |
| 4286 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. |
| 4287 static inline UBool |
| 4288 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { |
| 4289 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegul
arPrimary; |
| 4290 } |
| 4291 |
| 4292 /* This function tries to get the size of a sortkey. It will be invoked if the s
ize of resulting buffer is 0 */ |
| 4293 /* or if we run out of space while making a sortkey and want to return ASAP
*/ |
| 4294 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
ntSize, UColAttributeValue strength, int32_t len) { |
| 4295 UErrorCode status = U_ZERO_ERROR; |
| 4296 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima
ge + coll->image->UCAConsts); |
| 4297 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); |
| 4298 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); |
| 4299 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); |
| 4300 UBool compareIdent = (strength == UCOL_IDENTICAL); |
| 4301 UBool doCase = (coll->caseLevel == UCOL_ON); |
| 4302 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); |
| 4303 //UBool qShifted = shifted && (compareQuad == 0); |
| 4304 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); |
| 4305 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0)
; |
| 4306 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; |
| 4307 uint8_t *fSecs = fSecsBuff; |
| 4308 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; |
| 4309 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; |
| 4310 |
| 4311 uint32_t variableTopValue = coll->variableTopValue; |
| 4312 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); |
| 4313 if(doHiragana) { |
| 4314 UCOL_COMMON_BOT4++; |
| 4315 /* allocate one more space for hiragana */ |
| 4316 } |
| 4317 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); |
| 4318 |
| 4319 uint32_t order = UCOL_NO_MORE_CES; |
| 4320 uint8_t primary1 = 0; |
| 4321 uint8_t primary2 = 0; |
| 4322 uint8_t secondary = 0; |
| 4323 uint8_t tertiary = 0; |
| 4324 int32_t caseShift = 0; |
| 4325 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ |
| 4326 |
| 4327 uint8_t caseSwitch = coll->caseSwitch; |
| 4328 uint8_t tertiaryMask = coll->tertiaryMask; |
| 4329 uint8_t tertiaryCommon = coll->tertiaryCommon; |
| 4330 |
| 4331 UBool wasShifted = FALSE; |
| 4332 UBool notIsContinuation = FALSE; |
| 4333 uint8_t leadPrimary = 0; |
| 4334 |
| 4335 |
| 4336 for(;;) { |
| 4337 order = ucol_IGetNextCE(coll, s, &status); |
| 4338 if(order == UCOL_NO_MORE_CES) { |
| 4339 break; |
| 4340 } |
| 4341 |
| 4342 if(order == 0) { |
| 4343 continue; |
| 4344 } |
| 4345 |
| 4346 notIsContinuation = !isContinuation(order); |
| 4347 |
| 4348 |
| 4349 if(notIsContinuation) { |
| 4350 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); |
| 4351 } else { |
| 4352 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); |
| 4353 } |
| 4354 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 4355 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 4356 primary1 = (uint8_t)(order >> 8); |
| 4357 |
| 4358 /* no need to permute since the actual code values don't matter |
| 4359 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { |
| 4360 primary1 = coll->leadBytePermutationTable[primary1]; |
| 4361 } |
| 4362 */ |
| 4363 |
| 4364 if((shifted && ((notIsContinuation && order <= variableTopValue && prima
ry1 > 0) |
| 4365 || (!notIsContinuation && wasShifted))) |
| 4366 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says tha
t primary ignorables */ |
| 4367 /* and other ignorables should be removed if following a shifted
code point */ |
| 4368 if(primary1 == 0) { /* if we were shifted and we got an ignorabl
e code point */ |
| 4369 /* we should just completely ignore it */ |
| 4370 continue; |
| 4371 } |
| 4372 if(compareQuad == 0) { |
| 4373 if(c4 > 0) { |
| 4374 currentSize += (c2/UCOL_BOT_COUNT4)+1; |
| 4375 c4 = 0; |
| 4376 } |
| 4377 currentSize++; |
| 4378 if(primary2 != 0) { |
| 4379 currentSize++; |
| 4380 } |
| 4381 } |
| 4382 wasShifted = TRUE; |
| 4383 } else { |
| 4384 wasShifted = FALSE; |
| 4385 /* Note: This code assumes that the table is well built i.e. not hav
ing 0 bytes where they are not supposed to be. */ |
| 4386 /* Usually, we'll have non-zero primary1 & primary2, except in cases
of a-z and friends, when primary2 will */ |
| 4387 /* calculate sortkey size */ |
| 4388 if(primary1 != UCOL_IGNORABLE) { |
| 4389 if(notIsContinuation) { |
| 4390 if(leadPrimary == primary1) { |
| 4391 currentSize++; |
| 4392 } else { |
| 4393 if(leadPrimary != 0) { |
| 4394 currentSize++; |
| 4395 } |
| 4396 if(primary2 == UCOL_IGNORABLE) { |
| 4397 /* one byter, not compressed */ |
| 4398 currentSize++; |
| 4399 leadPrimary = 0; |
| 4400 } else if(isCompressible(coll, primary1)) { |
| 4401 /* compress */ |
| 4402 leadPrimary = primary1; |
| 4403 currentSize+=2; |
| 4404 } else { |
| 4405 leadPrimary = 0; |
| 4406 currentSize+=2; |
| 4407 } |
| 4408 } |
| 4409 } else { /* we are in continuation, so we're gonna add primary t
o the key don't care about compression */ |
| 4410 currentSize++; |
| 4411 if(primary2 != UCOL_IGNORABLE) { |
| 4412 currentSize++; |
| 4413 } |
| 4414 } |
| 4415 } |
| 4416 |
| 4417 if(secondary > compareSec) { /* I think that != 0 test should be !=
IGNORABLE */ |
| 4418 if(!isFrenchSec){ |
| 4419 if (secondary == UCOL_COMMON2 && notIsContinuation) { |
| 4420 c2++; |
| 4421 } else { |
| 4422 if(c2 > 0) { |
| 4423 if (secondary > UCOL_COMMON2) { // not necessary for
4th level. |
| 4424 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; |
| 4425 } else { |
| 4426 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; |
| 4427 } |
| 4428 c2 = 0; |
| 4429 } |
| 4430 currentSize++; |
| 4431 } |
| 4432 } else { |
| 4433 fSecs[fSecsLen++] = secondary; |
| 4434 if(fSecsLen == fSecsMaxLen) { |
| 4435 uint8_t *fSecsTemp; |
| 4436 if(fSecs == fSecsBuff) { |
| 4437 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); |
| 4438 } else { |
| 4439 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLe
n); |
| 4440 } |
| 4441 if(fSecsTemp == NULL) { |
| 4442 status = U_MEMORY_ALLOCATION_ERROR; |
| 4443 return 0; |
| 4444 } |
| 4445 fSecs = fSecsTemp; |
| 4446 fSecsMaxLen *= 2; |
| 4447 } |
| 4448 if(notIsContinuation) { |
| 4449 if (frenchStartPtr != NULL) { |
| 4450 /* reverse secondaries from frenchStartPtr up to fre
nchEndPtr */ |
| 4451 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, fr
enchEndPtr); |
| 4452 frenchStartPtr = NULL; |
| 4453 } |
| 4454 } else { |
| 4455 if (frenchStartPtr == NULL) { |
| 4456 frenchStartPtr = fSecs+fSecsLen-2; |
| 4457 } |
| 4458 frenchEndPtr = fSecs+fSecsLen-1; |
| 4459 } |
| 4460 } |
| 4461 } |
| 4462 |
| 4463 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { |
| 4464 // do the case level if we need to do it. We don't want to calcu
late |
| 4465 // case level for primary ignorables if we have only primary str
ength and case level |
| 4466 // otherwise we would break well formedness of CEs |
| 4467 if (caseShift == 0) { |
| 4468 currentSize++; |
| 4469 caseShift = UCOL_CASE_SHIFT_START; |
| 4470 } |
| 4471 if((tertiary&0x3F) > 0 && notIsContinuation) { |
| 4472 caseShift--; |
| 4473 if((tertiary &0xC0) != 0) { |
| 4474 if (caseShift == 0) { |
| 4475 currentSize++; |
| 4476 caseShift = UCOL_CASE_SHIFT_START; |
| 4477 } |
| 4478 caseShift--; |
| 4479 } |
| 4480 } |
| 4481 } else { |
| 4482 if(notIsContinuation) { |
| 4483 tertiary ^= caseSwitch; |
| 4484 } |
| 4485 } |
| 4486 |
| 4487 tertiary &= tertiaryMask; |
| 4488 if(tertiary > compareTer) { /* I think that != 0 test should be != I
GNORABLE */ |
| 4489 if (tertiary == tertiaryCommon && notIsContinuation) { |
| 4490 c3++; |
| 4491 } else { |
| 4492 if(c3 > 0) { |
| 4493 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_
COMMON3_NORMAL) |
| 4494 || (tertiary <= tertiaryCommon && tertiaryCommon ==
UCOL_COMMON3_UPPERFIRST)) { |
| 4495 currentSize += (c3/(uint32_t)coll->tertiaryTopCo
unt)+1; |
| 4496 } else { |
| 4497 currentSize += (c3/(uint32_t)coll->tertiaryBottomCou
nt)+1; |
| 4498 } |
| 4499 c3 = 0; |
| 4500 } |
| 4501 currentSize++; |
| 4502 } |
| 4503 } |
| 4504 |
| 4505 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { |
| 4506 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we n
eed to note it |
| 4507 if(c4>0) { // Close this part |
| 4508 currentSize += (c4/UCOL_BOT_COUNT4)+1; |
| 4509 c4 = 0; |
| 4510 } |
| 4511 currentSize++; // Add the Hiragana |
| 4512 } else { // This wasn't Hiragana, so we can continue adding stuf
f |
| 4513 c4++; |
| 4514 } |
| 4515 } |
| 4516 } |
| 4517 } |
| 4518 |
| 4519 if(!isFrenchSec){ |
| 4520 if(c2 > 0) { |
| 4521 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BO
T_COUNT2 != 0)?1:0); |
| 4522 } |
| 4523 } else { |
| 4524 uint32_t i = 0; |
| 4525 if(frenchStartPtr != NULL) { |
| 4526 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); |
| 4527 } |
| 4528 for(i = 0; i<fSecsLen; i++) { |
| 4529 secondary = *(fSecs+fSecsLen-i-1); |
| 4530 /* This is compression code. */ |
| 4531 if (secondary == UCOL_COMMON2) { |
| 4532 ++c2; |
| 4533 } else { |
| 4534 if(c2 > 0) { |
| 4535 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev
el. |
| 4536 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint
32_t)UCOL_TOP_COUNT2 != 0)?1:0); |
| 4537 } else { |
| 4538 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint
32_t)UCOL_BOT_COUNT2 != 0)?1:0); |
| 4539 } |
| 4540 c2 = 0; |
| 4541 } |
| 4542 currentSize++; |
| 4543 } |
| 4544 } |
| 4545 if(c2 > 0) { |
| 4546 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BO
T_COUNT2 != 0)?1:0); |
| 4547 } |
| 4548 if(fSecs != fSecsBuff) { |
| 4549 uprv_free(fSecs); |
| 4550 } |
| 4551 } |
| 4552 |
| 4553 if(c3 > 0) { |
| 4554 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t
)coll->tertiaryBottomCount != 0)?1:0); |
| 4555 } |
| 4556 |
| 4557 if(c4 > 0 && compareQuad == 0) { |
| 4558 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_CO
UNT4 != 0)?1:0); |
| 4559 } |
| 4560 |
| 4561 if(compareIdent) { |
| 4562 currentSize += u_lengthOfIdenticalLevelRun(s->string, len); |
| 4563 } |
| 4564 return currentSize; |
| 4565 } |
| 4566 |
| 4567 static |
| 4568 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { |
| 4569 if (caseShift == 0) { |
| 4570 *(*cases)++ = UCOL_CASE_BYTE_START; |
| 4571 caseShift = UCOL_CASE_SHIFT_START; |
| 4572 } |
| 4573 } |
| 4574 |
// Appends value at primaries and advances primaries, but only while primaries
// is still below limit. size is incremented on every call, written or not, so
// it records how many values we wanted to add even if they did not all fit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        return;
    }
    *primaries = value;
    ++primaries;
}
| 4584 |
| 4585 // Packs the secondary buffer when processing French locale. Adds the terminator
. |
| 4586 static |
| 4587 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *second
aries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { |
| 4588 uint8_t secondary; |
| 4589 int32_t count2 = 0; |
| 4590 uint32_t i = 0, size = 0; |
| 4591 // we use i here since the key size already accounts for terminators, so we'
ll discard the increment |
| 4592 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); |
| 4593 /* If there are any unresolved continuation secondaries, reverse them here s
o that we can reverse the whole secondary thing */ |
| 4594 if(frenchStartPtr != NULL) { |
| 4595 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); |
| 4596 } |
| 4597 for(i = 0; i<*secsize; i++) { |
| 4598 secondary = *(secondaries-i-1); |
| 4599 /* This is compression code. */ |
| 4600 if (secondary == UCOL_COMMON2) { |
| 4601 ++count2; |
| 4602 } else { |
| 4603 if (count2 > 0) { |
| 4604 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. |
| 4605 while (count2 > UCOL_TOP_COUNT2) { |
| 4606 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCO
L_COMMON_TOP2 - UCOL_TOP_COUNT2)); |
| 4607 count2 -= (uint32_t)UCOL_TOP_COUNT2; |
| 4608 } |
| 4609 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_CO
MMON_TOP2 - (count2-1))); |
| 4610 } else { |
| 4611 while (count2 > UCOL_BOT_COUNT2) { |
| 4612 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCO
L_COMMON_BOT2 + UCOL_BOT_COUNT2)); |
| 4613 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 4614 } |
| 4615 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_CO
MMON_BOT2 + (count2-1))); |
| 4616 } |
| 4617 count2 = 0; |
| 4618 } |
| 4619 addWithIncrement(primaries, primEnd, size, secondary); |
| 4620 } |
| 4621 } |
| 4622 if (count2 > 0) { |
| 4623 while (count2 > UCOL_BOT_COUNT2) { |
| 4624 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT
2 + UCOL_BOT_COUNT2)); |
| 4625 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 4626 } |
| 4627 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 +
(count2-1))); |
| 4628 } |
| 4629 *secsize = size; |
| 4630 return primaries; |
| 4631 } |
| 4632 |
| 4633 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 |
| 4634 |
| 4635 /* This is the sortkey work horse function */ |
| 4636 U_CFUNC int32_t U_CALLCONV |
| 4637 ucol_calcSortKey(const UCollator *coll, |
| 4638 const UChar *source, |
| 4639 int32_t sourceLength, |
| 4640 uint8_t **result, |
| 4641 uint32_t resultLength, |
| 4642 UBool allocateSKBuffer, |
| 4643 UErrorCode *status) |
| 4644 { |
| 4645 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima
ge + coll->image->UCAConsts); |
| 4646 |
| 4647 uint32_t i = 0; /* general purpose counter */ |
| 4648 |
| 4649 /* Stack allocated buffers for buffers we use */ |
| 4650 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], te
rt[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BU
FFER]; |
| 4651 |
| 4652 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *ca
ses = caseB, *quads = quad; |
| 4653 |
| 4654 if(U_FAILURE(*status)) { |
| 4655 return 0; |
| 4656 } |
| 4657 |
| 4658 if(primaries == NULL && allocateSKBuffer == TRUE) { |
| 4659 primaries = *result = prim; |
| 4660 resultLength = UCOL_PRIMARY_MAX_BUFFER; |
| 4661 } |
| 4662 |
| 4663 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BU
FFER, |
| 4664 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; |
| 4665 |
| 4666 uint32_t sortKeySize = 1; /* it is always \0 terminated */ |
| 4667 |
| 4668 UnicodeString normSource; |
| 4669 |
| 4670 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); |
| 4671 |
| 4672 UColAttributeValue strength = coll->strength; |
| 4673 |
| 4674 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); |
| 4675 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); |
| 4676 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); |
| 4677 UBool compareIdent = (strength == UCOL_IDENTICAL); |
| 4678 UBool doCase = (coll->caseLevel == UCOL_ON); |
| 4679 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0)
; |
| 4680 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); |
| 4681 //UBool qShifted = shifted && (compareQuad == 0); |
| 4682 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); |
| 4683 |
| 4684 uint32_t variableTopValue = coll->variableTopValue; |
| 4685 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no |
| 4686 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. |
| 4687 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); |
| 4688 uint8_t UCOL_HIRAGANA_QUAD = 0; |
| 4689 if(doHiragana) { |
| 4690 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; |
| 4691 /* allocate one more space for hiragana, value for hiragana */ |
| 4692 } |
| 4693 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); |
| 4694 |
| 4695 /* support for special features like caselevel and funky secondaries */ |
| 4696 uint8_t *frenchStartPtr = NULL; |
| 4697 uint8_t *frenchEndPtr = NULL; |
| 4698 uint32_t caseShift = 0; |
| 4699 |
| 4700 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShi
fted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); |
| 4701 |
| 4702 /* If we need to normalize, we'll do it all at once at the beginning! */ |
| 4703 const Normalizer2 *norm2; |
| 4704 if(compareIdent) { |
| 4705 norm2 = Normalizer2Factory::getNFDInstance(*status); |
| 4706 } else if(coll->normalizationMode != UCOL_OFF) { |
| 4707 norm2 = Normalizer2Factory::getFCDInstance(*status); |
| 4708 } else { |
| 4709 norm2 = NULL; |
| 4710 } |
| 4711 if(norm2 != NULL) { |
| 4712 normSource.setTo(FALSE, source, len); |
| 4713 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); |
| 4714 if(qcYesLength != len) { |
| 4715 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); |
| 4716 normSource.truncate(qcYesLength); |
| 4717 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); |
| 4718 source = normSource.getBuffer(); |
| 4719 len = normSource.length(); |
| 4720 } |
| 4721 } |
| 4722 collIterate s; |
| 4723 IInit_collIterate(coll, source, len, &s, status); |
| 4724 if(U_FAILURE(*status)) { |
| 4725 return 0; |
| 4726 } |
| 4727 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. |
| 4728 |
| 4729 if(resultLength == 0 || primaries == NULL) { |
| 4730 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); |
| 4731 } |
| 4732 uint8_t *primarySafeEnd = primaries + resultLength - 1; |
| 4733 if(strength > UCOL_PRIMARY) { |
| 4734 primarySafeEnd--; |
| 4735 } |
| 4736 |
| 4737 uint32_t minBufferSize = UCOL_MAX_BUFFER; |
| 4738 |
| 4739 uint8_t *primStart = primaries; |
| 4740 uint8_t *secStart = secondaries; |
| 4741 uint8_t *terStart = tertiaries; |
| 4742 uint8_t *caseStart = cases; |
| 4743 uint8_t *quadStart = quads; |
| 4744 |
| 4745 uint32_t order = 0; |
| 4746 |
| 4747 uint8_t primary1 = 0; |
| 4748 uint8_t primary2 = 0; |
| 4749 uint8_t secondary = 0; |
| 4750 uint8_t tertiary = 0; |
| 4751 uint8_t caseSwitch = coll->caseSwitch; |
| 4752 uint8_t tertiaryMask = coll->tertiaryMask; |
| 4753 int8_t tertiaryAddition = coll->tertiaryAddition; |
| 4754 uint8_t tertiaryTop = coll->tertiaryTop; |
| 4755 uint8_t tertiaryBottom = coll->tertiaryBottom; |
| 4756 uint8_t tertiaryCommon = coll->tertiaryCommon; |
| 4757 uint8_t caseBits = 0; |
| 4758 |
| 4759 UBool finished = FALSE; |
| 4760 UBool wasShifted = FALSE; |
| 4761 UBool notIsContinuation = FALSE; |
| 4762 |
| 4763 uint32_t prevBuffSize = 0; |
| 4764 |
| 4765 uint32_t count2 = 0, count3 = 0, count4 = 0; |
| 4766 uint8_t leadPrimary = 0; |
| 4767 |
| 4768 for(;;) { |
| 4769 for(i=prevBuffSize; i<minBufferSize; ++i) { |
| 4770 |
| 4771 order = ucol_IGetNextCE(coll, &s, status); |
| 4772 if(order == UCOL_NO_MORE_CES) { |
| 4773 finished = TRUE; |
| 4774 break; |
| 4775 } |
| 4776 |
| 4777 if(order == 0) { |
| 4778 continue; |
| 4779 } |
| 4780 |
| 4781 notIsContinuation = !isContinuation(order); |
| 4782 |
| 4783 if(notIsContinuation) { |
| 4784 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); |
| 4785 } else { |
| 4786 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); |
| 4787 } |
| 4788 |
| 4789 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 4790 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 4791 primary1 = (uint8_t)(order >> 8); |
| 4792 |
| 4793 uint8_t originalPrimary1 = primary1; |
| 4794 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { |
| 4795 primary1 = coll->leadBytePermutationTable[primary1]; |
| 4796 } |
| 4797 |
| 4798 if((shifted && ((notIsContinuation && order <= variableTopValue && p
rimary1 > 0) |
| 4799 || (!notIsContinuation && wasShifted))) |
| 4800 || (wasShifted && primary1 == 0)) /* amendment to the UCA says t
hat primary ignorables */ |
| 4801 { |
| 4802 /* and other ignorables should be removed if following a shifted
code point */ |
| 4803 if(primary1 == 0) { /* if we were shifted and we got an ignorabl
e code point */ |
| 4804 /* we should just completely ignore it */ |
| 4805 continue; |
| 4806 } |
| 4807 if(compareQuad == 0) { |
| 4808 if(count4 > 0) { |
| 4809 while (count4 > UCOL_BOT_COUNT4) { |
| 4810 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COU
NT4); |
| 4811 count4 -= UCOL_BOT_COUNT4; |
| 4812 } |
| 4813 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); |
| 4814 count4 = 0; |
| 4815 } |
| 4816 /* We are dealing with a variable and we're treating them as
shifted */ |
| 4817 /* This is a shifted ignorable */ |
| 4818 if(primary1 != 0) { /* we need to check this since we could
be in continuation */ |
| 4819 *quads++ = primary1; |
| 4820 } |
| 4821 if(primary2 != 0) { |
| 4822 *quads++ = primary2; |
| 4823 } |
| 4824 } |
| 4825 wasShifted = TRUE; |
| 4826 } else { |
| 4827 wasShifted = FALSE; |
| 4828 /* Note: This code assumes that the table is well built i.e. not
having 0 bytes where they are not supposed to be. */ |
| 4829 /* Usually, we'll have non-zero primary1 & primary2, except in c
ases of a-z and friends, when primary2 will */ |
| 4830 /* regular and simple sortkey calc */ |
| 4831 if(primary1 != UCOL_IGNORABLE) { |
| 4832 if(notIsContinuation) { |
| 4833 if(leadPrimary == primary1) { |
| 4834 *primaries++ = primary2; |
| 4835 } else { |
| 4836 if(leadPrimary != 0) { |
| 4837 *primaries++ = (uint8_t)((primary1 > leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); |
| 4838 } |
| 4839 if(primary2 == UCOL_IGNORABLE) { |
| 4840 /* one byter, not compressed */ |
| 4841 *primaries++ = primary1; |
| 4842 leadPrimary = 0; |
| 4843 } else if(isCompressible(coll, originalPrimary1)) { |
| 4844 /* compress */ |
| 4845 *primaries++ = leadPrimary = primary1; |
| 4846 if(primaries <= primarySafeEnd) { |
| 4847 *primaries++ = primary2; |
| 4848 } |
| 4849 } else { |
| 4850 leadPrimary = 0; |
| 4851 *primaries++ = primary1; |
| 4852 if(primaries <= primarySafeEnd) { |
| 4853 *primaries++ = primary2; |
| 4854 } |
| 4855 } |
| 4856 } |
| 4857 } else { /* we are in continuation, so we're gonna add prima
ry to the key don't care about compression */ |
| 4858 *primaries++ = primary1; |
| 4859 if((primary2 != UCOL_IGNORABLE) && (primaries <= primary
SafeEnd)) { |
| 4860 *primaries++ = primary2; /* second part */ |
| 4861 } |
| 4862 } |
| 4863 } |
| 4864 |
| 4865 if(secondary > compareSec) { |
| 4866 if(!isFrenchSec) { |
| 4867 /* This is compression code. */ |
| 4868 if (secondary == UCOL_COMMON2 && notIsContinuation) { |
| 4869 ++count2; |
| 4870 } else { |
| 4871 if (count2 > 0) { |
| 4872 if (secondary > UCOL_COMMON2) { // not necessary
for 4th level. |
| 4873 while (count2 > UCOL_TOP_COUNT2) { |
| 4874 *secondaries++ = (uint8_t)(UCOL_COMMON_T
OP2 - UCOL_TOP_COUNT2); |
| 4875 count2 -= (uint32_t)UCOL_TOP_COUNT2; |
| 4876 } |
| 4877 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2-1)); |
| 4878 } else { |
| 4879 while (count2 > UCOL_BOT_COUNT2) { |
| 4880 *secondaries++ = (uint8_t)(UCOL_COMMON_B
OT2 + UCOL_BOT_COUNT2); |
| 4881 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 4882 } |
| 4883 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2-1)); |
| 4884 } |
| 4885 count2 = 0; |
| 4886 } |
| 4887 *secondaries++ = secondary; |
| 4888 } |
| 4889 } else { |
| 4890 *secondaries++ = secondary; |
| 4891 /* Do the special handling for French secondaries */ |
| 4892 /* We need to get continuation elements and do intermedi
ate restore */ |
| 4893 /* abc1c2c3de with french secondaries need to be edc1c2c
3ba NOT edc3c2c1ba */ |
| 4894 if(notIsContinuation) { |
| 4895 if (frenchStartPtr != NULL) { |
| 4896 /* reverse secondaries from frenchStartPtr up to
frenchEndPtr */ |
| 4897 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr); |
| 4898 frenchStartPtr = NULL; |
| 4899 } |
| 4900 } else { |
| 4901 if (frenchStartPtr == NULL) { |
| 4902 frenchStartPtr = secondaries - 2; |
| 4903 } |
| 4904 frenchEndPtr = secondaries-1; |
| 4905 } |
| 4906 } |
| 4907 } |
| 4908 |
| 4909 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { |
| 4910 // do the case level if we need to do it. We don't want to c
alculate |
| 4911 // case level for primary ignorables if we have only primary
strength and case level |
| 4912 // otherwise we would break well formedness of CEs |
| 4913 doCaseShift(&cases, caseShift); |
| 4914 if(notIsContinuation) { |
| 4915 caseBits = (uint8_t)(tertiary & 0xC0); |
| 4916 |
| 4917 if(tertiary != 0) { |
| 4918 if(coll->caseFirst == UCOL_UPPER_FIRST) { |
| 4919 if((caseBits & 0xC0) == 0) { |
| 4920 *(cases-1) |= 1 << (--caseShift); |
| 4921 } else { |
| 4922 *(cases-1) |= 0 << (--caseShift); |
| 4923 /* second bit */ |
| 4924 doCaseShift(&cases, caseShift); |
| 4925 *(cases-1) |= ((caseBits>>6)&1) << (--caseSh
ift); |
| 4926 } |
| 4927 } else { |
| 4928 if((caseBits & 0xC0) == 0) { |
| 4929 *(cases-1) |= 0 << (--caseShift); |
| 4930 } else { |
| 4931 *(cases-1) |= 1 << (--caseShift); |
| 4932 /* second bit */ |
| 4933 doCaseShift(&cases, caseShift); |
| 4934 *(cases-1) |= ((caseBits>>7)&1) << (--caseSh
ift); |
| 4935 } |
| 4936 } |
| 4937 } |
| 4938 |
| 4939 } |
| 4940 } else { |
| 4941 if(notIsContinuation) { |
| 4942 tertiary ^= caseSwitch; |
| 4943 } |
| 4944 } |
| 4945 |
| 4946 tertiary &= tertiaryMask; |
| 4947 if(tertiary > compareTer) { |
| 4948 /* This is compression code. */ |
| 4949 /* sequence size check is included in the if clause */ |
| 4950 if (tertiary == tertiaryCommon && notIsContinuation) { |
| 4951 ++count3; |
| 4952 } else { |
| 4953 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_C
OMMON3_NORMAL) { |
| 4954 tertiary += tertiaryAddition; |
| 4955 } else if(tertiary <= tertiaryCommon && tertiaryCommon =
= UCOL_COMMON3_UPPERFIRST) { |
| 4956 tertiary -= tertiaryAddition; |
| 4957 } |
| 4958 if (count3 > 0) { |
| 4959 if ((tertiary > tertiaryCommon)) { |
| 4960 while (count3 > coll->tertiaryTopCount) { |
| 4961 *tertiaries++ = (uint8_t)(tertiaryTop - coll
->tertiaryTopCount); |
| 4962 count3 -= (uint32_t)coll->tertiaryTopCount; |
| 4963 } |
| 4964 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-
1)); |
| 4965 } else { |
| 4966 while (count3 > coll->tertiaryBottomCount) { |
| 4967 *tertiaries++ = (uint8_t)(tertiaryBottom + c
oll->tertiaryBottomCount); |
| 4968 count3 -= (uint32_t)coll->tertiaryBottomCoun
t; |
| 4969 } |
| 4970 *tertiaries++ = (uint8_t)(tertiaryBottom + (coun
t3-1)); |
| 4971 } |
| 4972 count3 = 0; |
| 4973 } |
| 4974 *tertiaries++ = tertiary; |
| 4975 } |
| 4976 } |
| 4977 |
| 4978 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { |
| 4979 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and w
e need to note it |
| 4980 if(count4>0) { // Close this part |
| 4981 while (count4 > UCOL_BOT_COUNT4) { |
| 4982 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT
_COUNT4); |
| 4983 count4 -= UCOL_BOT_COUNT4; |
| 4984 } |
| 4985 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); |
| 4986 count4 = 0; |
| 4987 } |
| 4988 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana |
| 4989 } else { // This wasn't Hiragana, so we can continue adding
stuff |
| 4990 count4++; |
| 4991 } |
| 4992 } |
| 4993 } |
| 4994 |
| 4995 if(primaries > primarySafeEnd) { /* We have stepped over the primary
buffer */ |
| 4996 if(allocateSKBuffer == FALSE) { /* need to save our butts if we
cannot reallocate */ |
| 4997 IInit_collIterate(coll, (UChar *)source, len, &s, status); |
| 4998 if(U_FAILURE(*status)) { |
| 4999 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5000 finished = TRUE; |
| 5001 break; |
| 5002 } |
| 5003 s.flags &= ~UCOL_ITER_NORM; |
| 5004 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, str
ength, len); |
| 5005 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5006 finished = TRUE; |
| 5007 break; |
| 5008 } else { /* It's much nicer if we can actually reallocate */ |
| 5009 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+
(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadSt
art)); |
| 5010 primStart = reallocateBuffer(&primaries, *result, prim, &res
ultLength, 2*sks, status); |
| 5011 if(U_SUCCESS(*status)) { |
| 5012 *result = primStart; |
| 5013 primarySafeEnd = primStart + resultLength - 1; |
| 5014 if(strength > UCOL_PRIMARY) { |
| 5015 primarySafeEnd--; |
| 5016 } |
| 5017 } else { |
| 5018 /* We ran out of memory!? We can't recover. */ |
| 5019 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5020 finished = TRUE; |
| 5021 break; |
| 5022 } |
| 5023 } |
| 5024 } |
| 5025 } |
| 5026 if(finished) { |
| 5027 break; |
| 5028 } else { |
| 5029 prevBuffSize = minBufferSize; |
| 5030 |
| 5031 uint32_t frenchStartOffset = 0, frenchEndOffset = 0; |
| 5032 if (frenchStartPtr != NULL) { |
| 5033 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart); |
| 5034 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart); |
| 5035 } |
| 5036 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize
, 2*secSize, status); |
| 5037 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2
*terSize, status); |
| 5038 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*
caseSize, status); |
| 5039 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*q
uadSize, status); |
| 5040 if(U_FAILURE(*status)) { |
| 5041 /* We ran out of memory!? We can't recover. */ |
| 5042 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5043 break; |
| 5044 } |
| 5045 if (frenchStartPtr != NULL) { |
| 5046 frenchStartPtr = secStart + frenchStartOffset; |
| 5047 frenchEndPtr = secStart + frenchEndOffset; |
| 5048 } |
| 5049 minBufferSize *= 2; |
| 5050 } |
| 5051 } |
| 5052 |
| 5053 /* Here, we are generally done with processing */ |
| 5054 /* bailing out would not be too productive */ |
| 5055 |
| 5056 if(U_SUCCESS(*status)) { |
| 5057 sortKeySize += (uint32_t)(primaries - primStart); |
| 5058 /* we have done all the CE's, now let's put them together to form a key
*/ |
| 5059 if(compareSec == 0) { |
| 5060 if (count2 > 0) { |
| 5061 while (count2 > UCOL_BOT_COUNT2) { |
| 5062 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT
2); |
| 5063 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 5064 } |
| 5065 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); |
| 5066 } |
| 5067 uint32_t secsize = (uint32_t)(secondaries-secStart); |
| 5068 if(!isFrenchSec) { // Regular situation, we know the length of secon
daries |
| 5069 sortKeySize += secsize; |
| 5070 if(sortKeySize <= resultLength) { |
| 5071 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5072 uprv_memcpy(primaries, secStart, secsize); |
| 5073 primaries += secsize; |
| 5074 } else { |
| 5075 if(allocateSKBuffer == TRUE) { /* need to save our butts if
we cannot reallocate */ |
| 5076 primStart = reallocateBuffer(&primaries, *result, prim,
&resultLength, 2*sortKeySize, status); |
| 5077 if(U_SUCCESS(*status)) { |
| 5078 *result = primStart; |
| 5079 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5080 uprv_memcpy(primaries, secStart, secsize); |
| 5081 primaries += secsize; |
| 5082 } |
| 5083 else { |
| 5084 /* We ran out of memory!? We can't recover. */ |
| 5085 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5086 goto cleanup; |
| 5087 } |
| 5088 } else { |
| 5089 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5090 } |
| 5091 } |
| 5092 } else { // French secondary is on. We will need to pack French. pac
kFrench will add the level terminator |
| 5093 uint8_t *newPrim = packFrench(primaries, primStart+resultLength,
secondaries, &secsize, frenchStartPtr, frenchEndPtr); |
| 5094 sortKeySize += secsize; |
| 5095 if(sortKeySize <= resultLength) { // if we managed to pack fine |
| 5096 primaries = newPrim; // update the primary pointer |
| 5097 } else { // overflow, need to reallocate and redo |
| 5098 if(allocateSKBuffer == TRUE) { /* need to save our butts if
we cannot reallocate */ |
| 5099 primStart = reallocateBuffer(&primaries, *result, prim,
&resultLength, 2*sortKeySize, status); |
| 5100 if(U_SUCCESS(*status)) { |
| 5101 primaries = packFrench(primaries, primStart+resultLe
ngth, secondaries, &secsize, frenchStartPtr, frenchEndPtr); |
| 5102 } |
| 5103 else { |
| 5104 /* We ran out of memory!? We can't recover. */ |
| 5105 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5106 goto cleanup; |
| 5107 } |
| 5108 } else { |
| 5109 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5110 } |
| 5111 } |
| 5112 } |
| 5113 } |
| 5114 |
| 5115 if(doCase) { |
| 5116 uint32_t casesize = (uint32_t)(cases - caseStart); |
| 5117 sortKeySize += casesize; |
| 5118 if(sortKeySize <= resultLength) { |
| 5119 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5120 uprv_memcpy(primaries, caseStart, casesize); |
| 5121 primaries += casesize; |
| 5122 } else { |
| 5123 if(allocateSKBuffer == TRUE) { |
| 5124 primStart = reallocateBuffer(&primaries, *result, prim, &res
ultLength, 2*sortKeySize, status); |
| 5125 if(U_SUCCESS(*status)) { |
| 5126 *result = primStart; |
| 5127 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5128 uprv_memcpy(primaries, caseStart, casesize); |
| 5129 } |
| 5130 else { |
| 5131 /* We ran out of memory!? We can't recover. */ |
| 5132 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5133 goto cleanup; |
| 5134 } |
| 5135 } else { |
| 5136 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5137 } |
| 5138 } |
| 5139 } |
| 5140 |
| 5141 if(compareTer == 0) { |
| 5142 if (count3 > 0) { |
| 5143 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { |
| 5144 while (count3 >= coll->tertiaryTopCount) { |
| 5145 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTo
pCount); |
| 5146 count3 -= (uint32_t)coll->tertiaryTopCount; |
| 5147 } |
| 5148 *tertiaries++ = (uint8_t)(tertiaryTop - count3); |
| 5149 } else { |
| 5150 while (count3 > coll->tertiaryBottomCount) { |
| 5151 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiar
yBottomCount); |
| 5152 count3 -= (uint32_t)coll->tertiaryBottomCount; |
| 5153 } |
| 5154 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); |
| 5155 } |
| 5156 } |
| 5157 uint32_t tersize = (uint32_t)(tertiaries - terStart); |
| 5158 sortKeySize += tersize; |
| 5159 if(sortKeySize <= resultLength) { |
| 5160 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5161 uprv_memcpy(primaries, terStart, tersize); |
| 5162 primaries += tersize; |
| 5163 } else { |
| 5164 if(allocateSKBuffer == TRUE) { |
| 5165 primStart = reallocateBuffer(&primaries, *result, prim, &res
ultLength, 2*sortKeySize, status); |
| 5166 if(U_SUCCESS(*status)) { |
| 5167 *result = primStart; |
| 5168 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5169 uprv_memcpy(primaries, terStart, tersize); |
| 5170 } |
| 5171 else { |
| 5172 /* We ran out of memory!? We can't recover. */ |
| 5173 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5174 goto cleanup; |
| 5175 } |
| 5176 } else { |
| 5177 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5178 } |
| 5179 } |
| 5180 |
| 5181 if(compareQuad == 0/*qShifted == TRUE*/) { |
| 5182 if(count4 > 0) { |
| 5183 while (count4 > UCOL_BOT_COUNT4) { |
| 5184 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)
; |
| 5185 count4 -= UCOL_BOT_COUNT4; |
| 5186 } |
| 5187 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); |
| 5188 } |
| 5189 uint32_t quadsize = (uint32_t)(quads - quadStart); |
| 5190 sortKeySize += quadsize; |
| 5191 if(sortKeySize <= resultLength) { |
| 5192 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5193 uprv_memcpy(primaries, quadStart, quadsize); |
| 5194 primaries += quadsize; |
| 5195 } else { |
| 5196 if(allocateSKBuffer == TRUE) { |
| 5197 primStart = reallocateBuffer(&primaries, *result, prim,
&resultLength, 2*sortKeySize, status); |
| 5198 if(U_SUCCESS(*status)) { |
| 5199 *result = primStart; |
| 5200 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5201 uprv_memcpy(primaries, quadStart, quadsize); |
| 5202 } |
| 5203 else { |
| 5204 /* We ran out of memory!? We can't recover. */ |
| 5205 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5206 goto cleanup; |
| 5207 } |
| 5208 } else { |
| 5209 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5210 } |
| 5211 } |
| 5212 } |
| 5213 |
| 5214 if(compareIdent) { |
| 5215 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); |
| 5216 if(sortKeySize <= resultLength) { |
| 5217 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5218 primaries += u_writeIdenticalLevelRun(s.string, len, primari
es); |
| 5219 } else { |
| 5220 if(allocateSKBuffer == TRUE) { |
| 5221 primStart = reallocateBuffer(&primaries, *result, prim,
&resultLength, sortKeySize, status); |
| 5222 if(U_SUCCESS(*status)) { |
| 5223 *result = primStart; |
| 5224 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5225 u_writeIdenticalLevelRun(s.string, len, primaries); |
| 5226 } |
| 5227 else { |
| 5228 /* We ran out of memory!? We can't recover. */ |
| 5229 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5230 goto cleanup; |
| 5231 } |
| 5232 } else { |
| 5233 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5234 } |
| 5235 } |
| 5236 } |
| 5237 } |
| 5238 *(primaries++) = '\0'; |
| 5239 } |
| 5240 |
| 5241 if(allocateSKBuffer == TRUE) { |
| 5242 *result = (uint8_t*)uprv_malloc(sortKeySize); |
| 5243 /* test for NULL */ |
| 5244 if (*result == NULL) { |
| 5245 *status = U_MEMORY_ALLOCATION_ERROR; |
| 5246 goto cleanup; |
| 5247 } |
| 5248 uprv_memcpy(*result, primStart, sortKeySize); |
| 5249 if(primStart != prim) { |
| 5250 uprv_free(primStart); |
| 5251 } |
| 5252 } |
| 5253 |
| 5254 cleanup: |
| 5255 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *
status != U_BUFFER_OVERFLOW_ERROR) { |
| 5256 /* NULL terminate for safety */ |
| 5257 **result = 0; |
| 5258 } |
| 5259 if(terStart != tert) { |
| 5260 uprv_free(terStart); |
| 5261 uprv_free(secStart); |
| 5262 uprv_free(caseStart); |
| 5263 uprv_free(quadStart); |
| 5264 } |
| 5265 |
| 5266 /* To avoid memory leak, free the offset buffer if necessary. */ |
| 5267 ucol_freeOffsetBuffer(&s); |
| 5268 |
| 5269 return sortKeySize; |
| 5270 } |
| 5271 |
| 5272 |
| 5273 U_CFUNC int32_t U_CALLCONV |
| 5274 ucol_calcSortKeySimpleTertiary(const UCollator *coll, |
| 5275 const UChar *source, |
| 5276 int32_t sourceLength, |
| 5277 uint8_t **result, |
| 5278 uint32_t resultLength, |
| 5279 UBool allocateSKBuffer, |
| 5280 UErrorCode *status) |
| 5281 { |
| 5282 U_ALIGN_CODE(16); |
| 5283 |
| 5284 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima
ge + coll->image->UCAConsts); |
| 5285 uint32_t i = 0; /* general purpose counter */ |
| 5286 |
| 5287 /* Stack allocated buffers for buffers we use */ |
| 5288 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], te
rt[UCOL_TERTIARY_MAX_BUFFER]; |
| 5289 |
| 5290 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; |
| 5291 |
| 5292 if(U_FAILURE(*status)) { |
| 5293 return 0; |
| 5294 } |
| 5295 |
| 5296 if(primaries == NULL && allocateSKBuffer == TRUE) { |
| 5297 primaries = *result = prim; |
| 5298 resultLength = UCOL_PRIMARY_MAX_BUFFER; |
| 5299 } |
| 5300 |
| 5301 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BU
FFER; |
| 5302 |
| 5303 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for
secondary and tertiary */ |
| 5304 |
| 5305 UnicodeString normSource; |
| 5306 |
| 5307 int32_t len = sourceLength; |
| 5308 |
| 5309 /* If we need to normalize, we'll do it all at once at the beginning! */ |
| 5310 if(coll->normalizationMode != UCOL_OFF) { |
| 5311 normSource.setTo(len < 0, source, len); |
| 5312 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); |
| 5313 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); |
| 5314 if(qcYesLength != normSource.length()) { |
| 5315 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); |
| 5316 normSource.truncate(qcYesLength); |
| 5317 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); |
| 5318 source = normSource.getBuffer(); |
| 5319 len = normSource.length(); |
| 5320 } |
| 5321 } |
| 5322 collIterate s; |
| 5323 IInit_collIterate(coll, (UChar *)source, len, &s, status); |
| 5324 if(U_FAILURE(*status)) { |
| 5325 return 0; |
| 5326 } |
| 5327 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. |
| 5328 |
| 5329 if(resultLength == 0 || primaries == NULL) { |
| 5330 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); |
| 5331 } |
| 5332 |
| 5333 uint8_t *primarySafeEnd = primaries + resultLength - 2; |
| 5334 |
| 5335 uint32_t minBufferSize = UCOL_MAX_BUFFER; |
| 5336 |
| 5337 uint8_t *primStart = primaries; |
| 5338 uint8_t *secStart = secondaries; |
| 5339 uint8_t *terStart = tertiaries; |
| 5340 |
| 5341 uint32_t order = 0; |
| 5342 |
| 5343 uint8_t primary1 = 0; |
| 5344 uint8_t primary2 = 0; |
| 5345 uint8_t secondary = 0; |
| 5346 uint8_t tertiary = 0; |
| 5347 uint8_t caseSwitch = coll->caseSwitch; |
| 5348 uint8_t tertiaryMask = coll->tertiaryMask; |
| 5349 int8_t tertiaryAddition = coll->tertiaryAddition; |
| 5350 uint8_t tertiaryTop = coll->tertiaryTop; |
| 5351 uint8_t tertiaryBottom = coll->tertiaryBottom; |
| 5352 uint8_t tertiaryCommon = coll->tertiaryCommon; |
| 5353 |
| 5354 uint32_t prevBuffSize = 0; |
| 5355 |
| 5356 UBool finished = FALSE; |
| 5357 UBool notIsContinuation = FALSE; |
| 5358 |
| 5359 uint32_t count2 = 0, count3 = 0; |
| 5360 uint8_t leadPrimary = 0; |
| 5361 |
| 5362 for(;;) { |
| 5363 for(i=prevBuffSize; i<minBufferSize; ++i) { |
| 5364 |
| 5365 order = ucol_IGetNextCE(coll, &s, status); |
| 5366 |
| 5367 if(order == 0) { |
| 5368 continue; |
| 5369 } |
| 5370 |
| 5371 if(order == UCOL_NO_MORE_CES) { |
| 5372 finished = TRUE; |
| 5373 break; |
| 5374 } |
| 5375 |
| 5376 notIsContinuation = !isContinuation(order); |
| 5377 |
| 5378 if(notIsContinuation) { |
| 5379 tertiary = (uint8_t)((order & tertiaryMask)); |
| 5380 } else { |
| 5381 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); |
| 5382 } |
| 5383 |
| 5384 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 5385 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 5386 primary1 = (uint8_t)(order >> 8); |
| 5387 |
| 5388 uint8_t originalPrimary1 = primary1; |
| 5389 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { |
| 5390 primary1 = coll->leadBytePermutationTable[primary1]; |
| 5391 } |
| 5392 |
| 5393 /* Note: This code assumes that the table is well built i.e. not hav
ing 0 bytes where they are not supposed to be. */ |
| 5394 /* Usually, we'll have non-zero primary1 & primary2, except in cases
of a-z and friends, when primary2 will */ |
| 5395 /* be zero with non zero primary1. primary3 is different than 0 only
for long primaries - see above. */ |
| 5396 /* regular and simple sortkey calc */ |
| 5397 if(primary1 != UCOL_IGNORABLE) { |
| 5398 if(notIsContinuation) { |
| 5399 if(leadPrimary == primary1) { |
| 5400 *primaries++ = primary2; |
| 5401 } else { |
| 5402 if(leadPrimary != 0) { |
| 5403 *primaries++ = (uint8_t)((primary1 > leadPrimary) ?
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); |
| 5404 } |
| 5405 if(primary2 == UCOL_IGNORABLE) { |
| 5406 /* one byter, not compressed */ |
| 5407 *primaries++ = primary1; |
| 5408 leadPrimary = 0; |
| 5409 } else if(isCompressible(coll, originalPrimary1)) { |
| 5410 /* compress */ |
| 5411 *primaries++ = leadPrimary = primary1; |
| 5412 *primaries++ = primary2; |
| 5413 } else { |
| 5414 leadPrimary = 0; |
| 5415 *primaries++ = primary1; |
| 5416 *primaries++ = primary2; |
| 5417 } |
| 5418 } |
| 5419 } else { /* we are in continuation, so we're gonna add primary t
o the key don't care about compression */ |
| 5420 *primaries++ = primary1; |
| 5421 if(primary2 != UCOL_IGNORABLE) { |
| 5422 *primaries++ = primary2; /* second part */ |
| 5423 } |
| 5424 } |
| 5425 } |
| 5426 |
| 5427 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE
*/ |
| 5428 /* This is compression code. */ |
| 5429 if (secondary == UCOL_COMMON2 && notIsContinuation) { |
| 5430 ++count2; |
| 5431 } else { |
| 5432 if (count2 > 0) { |
| 5433 if (secondary > UCOL_COMMON2) { // not necessary for 4th
level. |
| 5434 while (count2 > UCOL_TOP_COUNT2) { |
| 5435 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UC
OL_TOP_COUNT2); |
| 5436 count2 -= (uint32_t)UCOL_TOP_COUNT2; |
| 5437 } |
| 5438 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count
2-1)); |
| 5439 } else { |
| 5440 while (count2 > UCOL_BOT_COUNT2) { |
| 5441 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UC
OL_BOT_COUNT2); |
| 5442 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 5443 } |
| 5444 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count
2-1)); |
| 5445 } |
| 5446 count2 = 0; |
| 5447 } |
| 5448 *secondaries++ = secondary; |
| 5449 } |
| 5450 } |
| 5451 |
| 5452 if(notIsContinuation) { |
| 5453 tertiary ^= caseSwitch; |
| 5454 } |
| 5455 |
| 5456 if(tertiary > 0) { |
| 5457 /* This is compression code. */ |
| 5458 /* sequence size check is included in the if clause */ |
| 5459 if (tertiary == tertiaryCommon && notIsContinuation) { |
| 5460 ++count3; |
| 5461 } else { |
| 5462 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMO
N3_NORMAL) { |
| 5463 tertiary += tertiaryAddition; |
| 5464 } else if (tertiary <= tertiaryCommon && tertiaryCommon == U
COL_COMMON3_UPPERFIRST) { |
| 5465 tertiary -= tertiaryAddition; |
| 5466 } |
| 5467 if (count3 > 0) { |
| 5468 if ((tertiary > tertiaryCommon)) { |
| 5469 while (count3 > coll->tertiaryTopCount) { |
| 5470 *tertiaries++ = (uint8_t)(tertiaryTop - coll->te
rtiaryTopCount); |
| 5471 count3 -= (uint32_t)coll->tertiaryTopCount; |
| 5472 } |
| 5473 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); |
| 5474 } else { |
| 5475 while (count3 > coll->tertiaryBottomCount) { |
| 5476 *tertiaries++ = (uint8_t)(tertiaryBottom + coll-
>tertiaryBottomCount); |
| 5477 count3 -= (uint32_t)coll->tertiaryBottomCount; |
| 5478 } |
| 5479 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1
)); |
| 5480 } |
| 5481 count3 = 0; |
| 5482 } |
| 5483 *tertiaries++ = tertiary; |
| 5484 } |
| 5485 } |
| 5486 |
| 5487 if(primaries > primarySafeEnd) { /* We have stepped over the primary
buffer */ |
| 5488 if(allocateSKBuffer == FALSE) { /* need to save our butts if we
cannot reallocate */ |
| 5489 IInit_collIterate(coll, (UChar *)source, len, &s, status); |
| 5490 if(U_FAILURE(*status)) { |
| 5491 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5492 finished = TRUE; |
| 5493 break; |
| 5494 } |
| 5495 s.flags &= ~UCOL_ITER_NORM; |
| 5496 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, col
l->strength, len); |
| 5497 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5498 finished = TRUE; |
| 5499 break; |
| 5500 } else { /* It's much nicer if we can actually reallocate */ |
| 5501 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+
(secondaries - secStart)+(tertiaries - terStart)); |
| 5502 primStart = reallocateBuffer(&primaries, *result, prim, &res
ultLength, 2*sks, status); |
| 5503 if(U_SUCCESS(*status)) { |
| 5504 *result = primStart; |
| 5505 primarySafeEnd = primStart + resultLength - 2; |
| 5506 } else { |
| 5507 /* We ran out of memory!? We can't recover. */ |
| 5508 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5509 finished = TRUE; |
| 5510 break; |
| 5511 } |
| 5512 } |
| 5513 } |
| 5514 } |
| 5515 if(finished) { |
| 5516 break; |
| 5517 } else { |
| 5518 prevBuffSize = minBufferSize; |
| 5519 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize
, 2*secSize, status); |
| 5520 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2
*terSize, status); |
| 5521 minBufferSize *= 2; |
| 5522 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can
at least give the sortkey size |
| 5523 /* We ran out of memory!? We can't recover. */ |
| 5524 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5525 break; |
| 5526 } |
| 5527 } |
| 5528 } |
| 5529 |
| 5530 if(U_SUCCESS(*status)) { |
| 5531 sortKeySize += (uint32_t)(primaries - primStart); |
| 5532 /* we have done all the CE's, now let's put them together to form a key
*/ |
| 5533 if (count2 > 0) { |
| 5534 while (count2 > UCOL_BOT_COUNT2) { |
| 5535 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
| 5536 count2 -= (uint32_t)UCOL_BOT_COUNT2; |
| 5537 } |
| 5538 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); |
| 5539 } |
| 5540 uint32_t secsize = (uint32_t)(secondaries-secStart); |
| 5541 sortKeySize += secsize; |
| 5542 if(sortKeySize <= resultLength) { |
| 5543 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5544 uprv_memcpy(primaries, secStart, secsize); |
| 5545 primaries += secsize; |
| 5546 } else { |
| 5547 if(allocateSKBuffer == TRUE) { |
| 5548 primStart = reallocateBuffer(&primaries, *result, prim, &resultL
ength, 2*sortKeySize, status); |
| 5549 if(U_SUCCESS(*status)) { |
| 5550 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5551 *result = primStart; |
| 5552 uprv_memcpy(primaries, secStart, secsize); |
| 5553 } |
| 5554 else { |
| 5555 /* We ran out of memory!? We can't recover. */ |
| 5556 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5557 goto cleanup; |
| 5558 } |
| 5559 } else { |
| 5560 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5561 } |
| 5562 } |
| 5563 |
| 5564 if (count3 > 0) { |
| 5565 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { |
| 5566 while (count3 >= coll->tertiaryTopCount) { |
| 5567 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCou
nt); |
| 5568 count3 -= (uint32_t)coll->tertiaryTopCount; |
| 5569 } |
| 5570 *tertiaries++ = (uint8_t)(tertiaryTop - count3); |
| 5571 } else { |
| 5572 while (count3 > coll->tertiaryBottomCount) { |
| 5573 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBot
tomCount); |
| 5574 count3 -= (uint32_t)coll->tertiaryBottomCount; |
| 5575 } |
| 5576 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); |
| 5577 } |
| 5578 } |
| 5579 uint32_t tersize = (uint32_t)(tertiaries - terStart); |
| 5580 sortKeySize += tersize; |
| 5581 if(sortKeySize <= resultLength) { |
| 5582 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5583 uprv_memcpy(primaries, terStart, tersize); |
| 5584 primaries += tersize; |
| 5585 } else { |
| 5586 if(allocateSKBuffer == TRUE) { |
| 5587 primStart = reallocateBuffer(&primaries, *result, prim, &resultL
ength, 2*sortKeySize, status); |
| 5588 if(U_SUCCESS(*status)) { |
| 5589 *result = primStart; |
| 5590 *(primaries++) = UCOL_LEVELTERMINATOR; |
| 5591 uprv_memcpy(primaries, terStart, tersize); |
| 5592 } |
| 5593 else { |
| 5594 /* We ran out of memory!? We can't recover. */ |
| 5595 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; |
| 5596 goto cleanup; |
| 5597 } |
| 5598 } else { |
| 5599 *status = U_BUFFER_OVERFLOW_ERROR; |
| 5600 } |
| 5601 } |
| 5602 |
| 5603 *(primaries++) = '\0'; |
| 5604 } |
| 5605 |
| 5606 if(allocateSKBuffer == TRUE) { |
| 5607 *result = (uint8_t*)uprv_malloc(sortKeySize); |
| 5608 /* test for NULL */ |
| 5609 if (*result == NULL) { |
| 5610 *status = U_MEMORY_ALLOCATION_ERROR; |
| 5611 goto cleanup; |
| 5612 } |
| 5613 uprv_memcpy(*result, primStart, sortKeySize); |
| 5614 if(primStart != prim) { |
| 5615 uprv_free(primStart); |
| 5616 } |
| 5617 } |
| 5618 |
| 5619 cleanup: |
| 5620 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *
status != U_BUFFER_OVERFLOW_ERROR) { |
| 5621 /* NULL terminate for safety */ |
| 5622 **result = 0; |
| 5623 } |
| 5624 if(terStart != tert) { |
| 5625 uprv_free(terStart); |
| 5626 uprv_free(secStart); |
| 5627 } |
| 5628 |
| 5629 /* To avoid memory leak, free the offset buffer if necessary. */ |
| 5630 ucol_freeOffsetBuffer(&s); |
| 5631 |
| 5632 return sortKeySize; |
| 5633 } |
| 5634 |
| 5635 static inline |
| 5636 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { |
| 5637 UBool notIsContinuation = !isContinuation(CE); |
| 5638 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); |
| 5639 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) |
| 5640 || (!notIsContinuation && *wasShifted))) |
| 5641 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that pri
mary ignorables */ |
| 5642 { |
| 5643 // The stuff below should probably be in the sortkey code... maybe not..
. |
| 5644 if(primary1 != 0) { /* if we were shifted and we got an ignorable code p
oint */ |
| 5645 /* we should just completely ignore it */ |
| 5646 *wasShifted = TRUE; |
| 5647 //continue; |
| 5648 } |
| 5649 //*wasShifted = TRUE; |
| 5650 return TRUE; |
| 5651 } else { |
| 5652 *wasShifted = FALSE; |
| 5653 return FALSE; |
| 5654 } |
| 5655 } |
| 5656 static inline |
| 5657 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *des
t) { |
| 5658 if(level < maxLevel) { |
| 5659 dest[i++] = UCOL_LEVELTERMINATOR; |
| 5660 } else { |
| 5661 dest[i++] = 0; |
| 5662 } |
| 5663 } |
| 5664 |
/**
 * Level identifiers used by partial sort key generation.
 * The values are listed explicitly and run consecutively from 0.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** one past the last level identifier */
    UCOL_PSK_LIMIT
};
| 5677 |
/**
 * Layout of the collation state kept in the state[1] field.
 * To read a field, shift the state right by its *_SHIFT value and then
 * AND the result with the matching *_MASK value.
 */
enum {
    /** level identificator. stores an enum value from above */
    UCOL_PSK_LEVEL_SHIFT = 0,
    /** three bits */
    UCOL_PSK_LEVEL_MASK = 0x7,

    /** number of bytes of primary or quaternary already written */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    /**
     * can be only 0 or 1, since we get up to two bytes from primary or quaternary
     * This field is also used to denote that the French secondary level is finished
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    /** can be 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /** how many French bytes have we already written */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    /**
     * up to 4 bytes.
     * When we do French we need to reverse secondary values. However, continuations
     * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
| 5703 |
// Macro calculating the number of expansion CEs available: the distance
// between the CEpos and toReturn members of its argument.
// The expansion is fully parenthesized so that it stays a single operand
// when used next to other operators (e.g. !x, x * n, x == 0).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
| 5706 |
| 5707 |
| 5708 /** main sortkey part procedure. On the first call, |
| 5709 * you should pass in a collator, an iterator, empty state |
| 5710 * state[0] == state[1] == 0, a buffer to hold results |
| 5711 * number of bytes you need and an error code pointer. |
| 5712 * Make sure your buffer is big enough to hold the wanted |
| 5713 * number of sortkey bytes. I don't check. |
| 5714 * The only meaningful status you can get back is |
| 5715 * U_BUFFER_OVERFLOW_ERROR, which basically means that you |
| 5716 * have been dealt a raw deal and that you probably won't |
| 5717 * be able to use partial sortkey generation for this |
| 5718 * particular combination of string and collator. This |
| 5719 * is highly unlikely, but you should still check the error code. |
| 5720 * Any other status means that you're not in a sane situation |
| 5721 * anymore. After the first call, preserve state values and |
| 5722 * use them on subsequent calls to obtain more bytes of a sortkey. |
| 5723 * Use until the number of bytes written is smaller than the requested |
| 5724 * number of bytes. Generated sortkey is not compatible with the |
| 5725 * one generated by ucol_getSortKey, as we don't do any compression. |
| 5726 * However, levels are still terminated by a 1 (one) and the sortkey |
| 5727 * is terminated by a 0 (zero). Identical level is the same as in the |
| 5728 * regular sortkey - internal bocu-1 implementation is used. |
| 5729 * For curious, although you cannot do much about this, here is |
| 5730 * the structure of state words. |
| 5731 * state[0] - iterator state. Depends on the iterator implementation, |
| 5732 * but allows the iterator to continue where it stopped in |
| 5733 * the last iteration. |
| 5734 * state[1] - collation processing state. Here is the distribution |
| 5735 * of the bits: |
| 5736 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary |
| 5737 * quaternary, quin (we don't use this one), identical and |
| 5738 * null (producing only zeroes - first one to terminate the |
| 5739 * sortkey and subsequent to fill the buffer). |
| 5740 * 3 - byte count. Number of bytes written on the primary level. |
| 5741 * 4 - was shifted. Whether the previous iteration finished in the |
| 5742 * shifted state. |
| 5743 * 5, 6 - French continuation bytes written. See the comment in the enum |
| 5744 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on |
| 5745 * the identical level. |
| 5746 * 9..31 - CEs consumed. Number of getCE or next32 operations performed |
| 5747 * since thes last successful update of the iterator state. |
| 5748 */ |
| 5749 U_CAPI int32_t U_EXPORT2 |
| 5750 ucol_nextSortKeyPart(const UCollator *coll, |
| 5751 UCharIterator *iter, |
| 5752 uint32_t state[2], |
| 5753 uint8_t *dest, int32_t count, |
| 5754 UErrorCode *status) |
| 5755 { |
| 5756 /* error checking */ |
| 5757 if(status==NULL || U_FAILURE(*status)) { |
| 5758 return 0; |
| 5759 } |
| 5760 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); |
| 5761 if( coll==NULL || iter==NULL || |
| 5762 state==NULL || |
| 5763 count<0 || (count>0 && dest==NULL) |
| 5764 ) { |
| 5765 *status=U_ILLEGAL_ARGUMENT_ERROR; |
| 5766 UTRACE_EXIT_STATUS(status); |
| 5767 return 0; |
| 5768 } |
| 5769 |
| 5770 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=
%d", |
| 5771 coll, iter, state[0], state[1], dest, count); |
| 5772 |
| 5773 if(count==0) { |
| 5774 /* nothing to do */ |
| 5775 UTRACE_EXIT_VALUE(0); |
| 5776 return 0; |
| 5777 } |
| 5778 /** Setting up situation according to the state we got from the previous ite
ration */ |
| 5779 // The state of the iterator from the previous invocation |
| 5780 uint32_t iterState = state[0]; |
| 5781 // Has the last iteration ended in the shifted state |
| 5782 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_
SHIFTED_MASK)?TRUE:FALSE; |
| 5783 // What is the current level of the sortkey? |
| 5784 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; |
| 5785 // Have we written only one byte from a two byte primary in the previous ite
ration? |
| 5786 // Also on secondary level - have we finished with the French secondary? |
| 5787 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D
ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; |
| 5788 // number of bytes in the continuation buffer for French |
| 5789 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE
D_FRENCH_MASK; |
| 5790 // Number of bytes already written from a bocsu sequence. Since |
| 5791 // the longes bocsu sequence is 4 long, this can be up to 3. |
| 5792 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK
_BOCSU_BYTES_MASK; |
| 5793 // Number of elements that need to be consumed in this iteration because |
| 5794 // the iterator returned UITER_NO_STATE at the end of the last iteration, |
| 5795 // so we had to save the last valid state. |
| 5796 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED
_CES_MASK; |
| 5797 |
| 5798 /** values that depend on the collator attributes */ |
| 5799 // strength of the collator. |
| 5800 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); |
| 5801 // maximal level of the partial sortkey. Need to take whether case level is
done |
| 5802 int32_t maxLevel = 0; |
| 5803 if(strength < UCOL_TERTIARY) { |
| 5804 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { |
| 5805 maxLevel = UCOL_PSK_CASE; |
| 5806 } else { |
| 5807 maxLevel = strength; |
| 5808 } |
| 5809 } else { |
| 5810 if(strength == UCOL_TERTIARY) { |
| 5811 maxLevel = UCOL_PSK_TERTIARY; |
| 5812 } else if(strength == UCOL_QUATERNARY) { |
| 5813 maxLevel = UCOL_PSK_QUATERNARY; |
| 5814 } else { // identical |
| 5815 maxLevel = UCOL_IDENTICAL; |
| 5816 } |
| 5817 } |
| 5818 // value for the quaternary level if Hiragana is encountered. Used for JIS X
4061 collation |
| 5819 uint8_t UCOL_HIRAGANA_QUAD = |
| 5820 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON
)?0xFE:0xFF; |
| 5821 // Boundary value that decides whether a CE is shifted or not |
| 5822 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV
alue<<16):0; |
| 5823 // Are we doing French collation? |
| 5824 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status)
== UCOL_ON); |
| 5825 |
| 5826 /** initializing the collation state */ |
| 5827 UBool notIsContinuation = FALSE; |
| 5828 uint32_t CE = UCOL_NO_MORE_CES; |
| 5829 |
| 5830 collIterate s; |
| 5831 IInit_collIterate(coll, NULL, -1, &s, status); |
| 5832 if(U_FAILURE(*status)) { |
| 5833 UTRACE_EXIT_STATUS(*status); |
| 5834 return 0; |
| 5835 } |
| 5836 s.iterator = iter; |
| 5837 s.flags |= UCOL_USE_ITERATOR; |
| 5838 // This variable tells us whether we have produced some other levels in this
iteration |
| 5839 // before we moved to the identical level. In that case, we need to switch t
he |
| 5840 // type of the iterator. |
| 5841 UBool doingIdenticalFromStart = FALSE; |
| 5842 // Normalizing iterator |
| 5843 // The division for the array length may truncate the array size to |
| 5844 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
| 5845 // for all platforms anyway. |
| 5846 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
| 5847 UNormIterator *normIter = NULL; |
| 5848 // If the normalization is turned on for the collator and we are below ident
ical level |
| 5849 // we will use a FCD normalizing iterator |
| 5850 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le
vel < UCOL_PSK_IDENTICAL) { |
| 5851 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); |
| 5852 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); |
| 5853 s.flags &= ~UCOL_ITER_NORM; |
| 5854 if(U_FAILURE(*status)) { |
| 5855 UTRACE_EXIT_STATUS(*status); |
| 5856 return 0; |
| 5857 } |
| 5858 } else if(level == UCOL_PSK_IDENTICAL) { |
| 5859 // for identical level, we need a NFD iterator. We need to instantiate i
t here, since we |
| 5860 // will be updating the state - and this cannot be done on an ordinary i
terator. |
| 5861 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); |
| 5862 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
| 5863 s.flags &= ~UCOL_ITER_NORM; |
| 5864 if(U_FAILURE(*status)) { |
| 5865 UTRACE_EXIT_STATUS(*status); |
| 5866 return 0; |
| 5867 } |
| 5868 doingIdenticalFromStart = TRUE; |
| 5869 } |
| 5870 |
| 5871 // This is the tentative new state of the iterator. The problem |
| 5872 // is that the iterator might return an undefined state, in |
| 5873 // which case we should save the last valid state and increase |
| 5874 // the iterator skip value. |
| 5875 uint32_t newState = 0; |
| 5876 |
| 5877 // First, we set the iterator to the last valid position |
| 5878 // from the last iteration. This was saved in state[0]. |
| 5879 if(iterState == 0) { |
| 5880 /* initial state */ |
| 5881 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone)
{ |
| 5882 s.iterator->move(s.iterator, 0, UITER_LIMIT); |
| 5883 } else { |
| 5884 s.iterator->move(s.iterator, 0, UITER_START); |
| 5885 } |
| 5886 } else { |
| 5887 /* reset to previous state */ |
| 5888 s.iterator->setState(s.iterator, iterState, status); |
| 5889 if(U_FAILURE(*status)) { |
| 5890 UTRACE_EXIT_STATUS(*status); |
| 5891 return 0; |
| 5892 } |
| 5893 } |
| 5894 |
| 5895 |
| 5896 |
| 5897 // This variable tells us whether we can attempt to update the state |
| 5898 // of iterator. Situations where we don't want to update iterator state |
| 5899 // are the existence of expansion CEs that are not yet processed, and |
| 5900 // finishing the case level without enough space in the buffer to insert |
| 5901 // a level terminator. |
| 5902 UBool canUpdateState = TRUE; |
| 5903 |
| 5904 // Consume all the CEs that were consumed at the end of the previous |
| 5905 // iteration without updating the iterator state. On identical level, |
| 5906 // consume the code points. |
| 5907 int32_t counter = cces; |
| 5908 if(level < UCOL_PSK_IDENTICAL) { |
| 5909 while(counter-->0) { |
| 5910 // If we're doing French and we are on the secondary level, |
| 5911 // we go backwards. |
| 5912 if(level == UCOL_PSK_SECONDARY && doingFrench) { |
| 5913 CE = ucol_IGetPrevCE(coll, &s, status); |
| 5914 } else { |
| 5915 CE = ucol_IGetNextCE(coll, &s, status); |
| 5916 } |
| 5917 if(CE==UCOL_NO_MORE_CES) { |
| 5918 /* should not happen */ |
| 5919 *status=U_INTERNAL_PROGRAM_ERROR; |
| 5920 UTRACE_EXIT_STATUS(*status); |
| 5921 return 0; |
| 5922 } |
| 5923 if(uprv_numAvailableExpCEs(s)) { |
| 5924 canUpdateState = FALSE; |
| 5925 } |
| 5926 } |
| 5927 } else { |
| 5928 while(counter-->0) { |
| 5929 uiter_next32(s.iterator); |
| 5930 } |
| 5931 } |
| 5932 |
| 5933 // French secondary needs to know whether the iterator state of zero came fr
om previous level OR |
| 5934 // from a new invocation... |
| 5935 UBool wasDoingPrimary = FALSE; |
| 5936 // destination buffer byte counter. When this guy |
| 5937 // gets to count, we're done with the iteration |
| 5938 int32_t i = 0; |
| 5939 // used to count the zero bytes written after we |
| 5940 // have finished with the sort key |
| 5941 int32_t j = 0; |
| 5942 |
| 5943 |
| 5944 // Hm.... I think we're ready to plunge in. Basic story is as following: |
| 5945 // we have a fall through case based on level. This is used for initial |
| 5946 // positioning on iteration start. Every level processor contains a |
| 5947 // for(;;) which will be broken when we exhaust all the CEs. Other |
| 5948 // way to exit is a goto saveState, which happens when we have filled |
| 5949 // out our buffer. |
| 5950 switch(level) { |
| 5951 case UCOL_PSK_PRIMARY: |
| 5952 wasDoingPrimary = TRUE; |
| 5953 for(;;) { |
| 5954 if(i==count) { |
| 5955 goto saveState; |
| 5956 } |
| 5957 // We should save the state only if we |
| 5958 // are sure that we are done with the |
| 5959 // previous iterator state |
| 5960 if(canUpdateState && byteCountOrFrenchDone == 0) { |
| 5961 newState = s.iterator->getState(s.iterator); |
| 5962 if(newState != UITER_NO_STATE) { |
| 5963 iterState = newState; |
| 5964 cces = 0; |
| 5965 } |
| 5966 } |
| 5967 CE = ucol_IGetNextCE(coll, &s, status); |
| 5968 cces++; |
| 5969 if(CE==UCOL_NO_MORE_CES) { |
| 5970 // Add the level separator |
| 5971 terminatePSKLevel(level, maxLevel, i, dest); |
| 5972 byteCountOrFrenchDone=0; |
| 5973 // Restart the iteration an move to the |
| 5974 // second level |
| 5975 s.iterator->move(s.iterator, 0, UITER_START); |
| 5976 cces = 0; |
| 5977 level = UCOL_PSK_SECONDARY; |
| 5978 break; |
| 5979 } |
| 5980 if(!isContinuation(CE)){ |
| 5981 if(coll->leadBytePermutationTable != NULL){ |
| 5982 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE &
0x00FFFFFF); |
| 5983 } |
| 5984 } |
| 5985 if(!isShiftedCE(CE, LVT, &wasShifted)) { |
| 5986 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ |
| 5987 if(CE != 0) { |
| 5988 if(byteCountOrFrenchDone == 0) { |
| 5989 // get the second byte of primary |
| 5990 dest[i++]=(uint8_t)(CE >> 8); |
| 5991 } else { |
| 5992 byteCountOrFrenchDone = 0; |
| 5993 } |
| 5994 if((CE &=0xff)!=0) { |
| 5995 if(i==count) { |
| 5996 /* overflow */ |
| 5997 byteCountOrFrenchDone = 1; |
| 5998 cces--; |
| 5999 goto saveState; |
| 6000 } |
| 6001 dest[i++]=(uint8_t)CE; |
| 6002 } |
| 6003 } |
| 6004 } |
| 6005 if(uprv_numAvailableExpCEs(s)) { |
| 6006 canUpdateState = FALSE; |
| 6007 } else { |
| 6008 canUpdateState = TRUE; |
| 6009 } |
| 6010 } |
| 6011 /* fall through to next level */ |
| 6012 case UCOL_PSK_SECONDARY: |
| 6013 if(strength >= UCOL_SECONDARY) { |
| 6014 if(!doingFrench) { |
| 6015 for(;;) { |
| 6016 if(i == count) { |
| 6017 goto saveState; |
| 6018 } |
| 6019 // We should save the state only if we |
| 6020 // are sure that we are done with the |
| 6021 // previous iterator state |
| 6022 if(canUpdateState) { |
| 6023 newState = s.iterator->getState(s.iterator); |
| 6024 if(newState != UITER_NO_STATE) { |
| 6025 iterState = newState; |
| 6026 cces = 0; |
| 6027 } |
| 6028 } |
| 6029 CE = ucol_IGetNextCE(coll, &s, status); |
| 6030 cces++; |
| 6031 if(CE==UCOL_NO_MORE_CES) { |
| 6032 // Add the level separator |
| 6033 terminatePSKLevel(level, maxLevel, i, dest); |
| 6034 byteCountOrFrenchDone = 0; |
| 6035 // Restart the iteration an move to the |
| 6036 // second level |
| 6037 s.iterator->move(s.iterator, 0, UITER_START); |
| 6038 cces = 0; |
| 6039 level = UCOL_PSK_CASE; |
| 6040 break; |
| 6041 } |
| 6042 if(!isShiftedCE(CE, LVT, &wasShifted)) { |
| 6043 CE >>= 8; /* get secondary */ |
| 6044 if(CE != 0) { |
| 6045 dest[i++]=(uint8_t)CE; |
| 6046 } |
| 6047 } |
| 6048 if(uprv_numAvailableExpCEs(s)) { |
| 6049 canUpdateState = FALSE; |
| 6050 } else { |
| 6051 canUpdateState = TRUE; |
| 6052 } |
| 6053 } |
| 6054 } else { // French secondary processing |
| 6055 uint8_t frenchBuff[UCOL_MAX_BUFFER]; |
| 6056 int32_t frenchIndex = 0; |
| 6057 // Here we are going backwards. |
| 6058 // If the iterator is at the beggining, it should be |
| 6059 // moved to end. |
| 6060 if(wasDoingPrimary) { |
| 6061 s.iterator->move(s.iterator, 0, UITER_LIMIT); |
| 6062 cces = 0; |
| 6063 } |
| 6064 for(;;) { |
| 6065 if(i == count) { |
| 6066 goto saveState; |
| 6067 } |
| 6068 if(canUpdateState) { |
| 6069 newState = s.iterator->getState(s.iterator); |
| 6070 if(newState != UITER_NO_STATE) { |
| 6071 iterState = newState; |
| 6072 cces = 0; |
| 6073 } |
| 6074 } |
| 6075 CE = ucol_IGetPrevCE(coll, &s, status); |
| 6076 cces++; |
| 6077 if(CE==UCOL_NO_MORE_CES) { |
| 6078 // Add the level separator |
| 6079 terminatePSKLevel(level, maxLevel, i, dest); |
| 6080 byteCountOrFrenchDone = 0; |
| 6081 // Restart the iteration an move to the next level |
| 6082 s.iterator->move(s.iterator, 0, UITER_START); |
| 6083 level = UCOL_PSK_CASE; |
| 6084 break; |
| 6085 } |
| 6086 if(isContinuation(CE)) { // if it's a continuation, we want
to save it and |
| 6087 // reverse when we get a first non-continuation CE. |
| 6088 CE >>= 8; |
| 6089 frenchBuff[frenchIndex++] = (uint8_t)CE; |
| 6090 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { |
| 6091 CE >>= 8; /* get secondary */ |
| 6092 if(!frenchIndex) { |
| 6093 if(CE != 0) { |
| 6094 dest[i++]=(uint8_t)CE; |
| 6095 } |
| 6096 } else { |
| 6097 frenchBuff[frenchIndex++] = (uint8_t)CE; |
| 6098 frenchIndex -= usedFrench; |
| 6099 usedFrench = 0; |
| 6100 while(i < count && frenchIndex) { |
| 6101 dest[i++] = frenchBuff[--frenchIndex]; |
| 6102 usedFrench++; |
| 6103 } |
| 6104 } |
| 6105 } |
| 6106 if(uprv_numAvailableExpCEs(s)) { |
| 6107 canUpdateState = FALSE; |
| 6108 } else { |
| 6109 canUpdateState = TRUE; |
| 6110 } |
| 6111 } |
| 6112 } |
| 6113 } else { |
| 6114 level = UCOL_PSK_CASE; |
| 6115 } |
| 6116 /* fall through to next level */ |
| 6117 case UCOL_PSK_CASE: |
| 6118 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { |
| 6119 uint32_t caseShift = UCOL_CASE_SHIFT_START; |
| 6120 uint8_t caseByte = UCOL_CASE_BYTE_START; |
| 6121 uint8_t caseBits = 0; |
| 6122 |
| 6123 for(;;) { |
| 6124 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); |
| 6125 if(i == count) { |
| 6126 goto saveState; |
| 6127 } |
| 6128 // We should save the state only if we |
| 6129 // are sure that we are done with the |
| 6130 // previous iterator state |
| 6131 if(canUpdateState) { |
| 6132 newState = s.iterator->getState(s.iterator); |
| 6133 if(newState != UITER_NO_STATE) { |
| 6134 iterState = newState; |
| 6135 cces = 0; |
| 6136 } |
| 6137 } |
| 6138 CE = ucol_IGetNextCE(coll, &s, status); |
| 6139 cces++; |
| 6140 if(CE==UCOL_NO_MORE_CES) { |
| 6141 // On the case level we might have an unfinished |
| 6142 // case byte. Add one if it's started. |
| 6143 if(caseShift != UCOL_CASE_SHIFT_START) { |
| 6144 dest[i++] = caseByte; |
| 6145 } |
| 6146 cces = 0; |
| 6147 // We have finished processing CEs on this level. |
| 6148 // However, we don't know if we have enough space |
| 6149 // to add a case level terminator. |
| 6150 if(i < count) { |
| 6151 // Add the level separator |
| 6152 terminatePSKLevel(level, maxLevel, i, dest); |
| 6153 // Restart the iteration and move to the |
| 6154 // next level |
| 6155 s.iterator->move(s.iterator, 0, UITER_START); |
| 6156 level = UCOL_PSK_TERTIARY; |
| 6157 } else { |
| 6158 canUpdateState = FALSE; |
| 6159 } |
| 6160 break; |
| 6161 } |
| 6162 |
| 6163 if(!isShiftedCE(CE, LVT, &wasShifted)) { |
| 6164 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || s
trength > UCOL_PRIMARY)) { |
| 6165 // do the case level if we need to do it. We don't want
to calculate |
| 6166 // case level for primary ignorables if we have only pri
mary strength and case level |
| 6167 // otherwise we would break well formedness of CEs |
| 6168 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); |
| 6169 caseBits = (uint8_t)(CE & 0xC0); |
| 6170 // this copies the case level logic from the |
| 6171 // sort key generation code |
| 6172 if(CE != 0) { |
| 6173 if (caseShift == 0) { |
| 6174 dest[i++] = caseByte; |
| 6175 caseShift = UCOL_CASE_SHIFT_START; |
| 6176 caseByte = UCOL_CASE_BYTE_START; |
| 6177 } |
| 6178 if(coll->caseFirst == UCOL_UPPER_FIRST) { |
| 6179 if((caseBits & 0xC0) == 0) { |
| 6180 caseByte |= 1 << (--caseShift); |
| 6181 } else { |
| 6182 caseByte |= 0 << (--caseShift); |
| 6183 /* second bit */ |
| 6184 if(caseShift == 0) { |
| 6185 dest[i++] = caseByte; |
| 6186 caseShift = UCOL_CASE_SHIFT_START; |
| 6187 caseByte = UCOL_CASE_BYTE_START; |
| 6188 } |
| 6189 caseByte |= ((caseBits>>6)&1) << (--caseShif
t); |
| 6190 } |
| 6191 } else { |
| 6192 if((caseBits & 0xC0) == 0) { |
| 6193 caseByte |= 0 << (--caseShift); |
| 6194 } else { |
| 6195 caseByte |= 1 << (--caseShift); |
| 6196 /* second bit */ |
| 6197 if(caseShift == 0) { |
| 6198 dest[i++] = caseByte; |
| 6199 caseShift = UCOL_CASE_SHIFT_START; |
| 6200 caseByte = UCOL_CASE_BYTE_START; |
| 6201 } |
| 6202 caseByte |= ((caseBits>>7)&1) << (--caseShif
t); |
| 6203 } |
| 6204 } |
| 6205 } |
| 6206 |
| 6207 } |
| 6208 } |
| 6209 // Not sure this is correct for the case level - revisit |
| 6210 if(uprv_numAvailableExpCEs(s)) { |
| 6211 canUpdateState = FALSE; |
| 6212 } else { |
| 6213 canUpdateState = TRUE; |
| 6214 } |
| 6215 } |
| 6216 } else { |
| 6217 level = UCOL_PSK_TERTIARY; |
| 6218 } |
| 6219 /* fall through to next level */ |
| 6220 case UCOL_PSK_TERTIARY: |
| 6221 if(strength >= UCOL_TERTIARY) { |
| 6222 for(;;) { |
| 6223 if(i == count) { |
| 6224 goto saveState; |
| 6225 } |
| 6226 // We should save the state only if we |
| 6227 // are sure that we are done with the |
| 6228 // previous iterator state |
| 6229 if(canUpdateState) { |
| 6230 newState = s.iterator->getState(s.iterator); |
| 6231 if(newState != UITER_NO_STATE) { |
| 6232 iterState = newState; |
| 6233 cces = 0; |
| 6234 } |
| 6235 } |
| 6236 CE = ucol_IGetNextCE(coll, &s, status); |
| 6237 cces++; |
| 6238 if(CE==UCOL_NO_MORE_CES) { |
| 6239 // Add the level separator |
| 6240 terminatePSKLevel(level, maxLevel, i, dest); |
| 6241 byteCountOrFrenchDone = 0; |
| 6242 // Restart the iteration an move to the |
| 6243 // second level |
| 6244 s.iterator->move(s.iterator, 0, UITER_START); |
| 6245 cces = 0; |
| 6246 level = UCOL_PSK_QUATERNARY; |
| 6247 break; |
| 6248 } |
| 6249 if(!isShiftedCE(CE, LVT, &wasShifted)) { |
| 6250 notIsContinuation = !isContinuation(CE); |
| 6251 |
| 6252 if(notIsContinuation) { |
| 6253 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); |
| 6254 CE ^= coll->caseSwitch; |
| 6255 CE &= coll->tertiaryMask; |
| 6256 } else { |
| 6257 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); |
| 6258 } |
| 6259 |
| 6260 if(CE != 0) { |
| 6261 dest[i++]=(uint8_t)CE; |
| 6262 } |
| 6263 } |
| 6264 if(uprv_numAvailableExpCEs(s)) { |
| 6265 canUpdateState = FALSE; |
| 6266 } else { |
| 6267 canUpdateState = TRUE; |
| 6268 } |
| 6269 } |
| 6270 } else { |
| 6271 // if we're not doing tertiary |
| 6272 // skip to the end |
| 6273 level = UCOL_PSK_NULL; |
| 6274 } |
| 6275 /* fall through to next level */ |
| 6276 case UCOL_PSK_QUATERNARY: |
| 6277 if(strength >= UCOL_QUATERNARY) { |
| 6278 for(;;) { |
| 6279 if(i == count) { |
| 6280 goto saveState; |
| 6281 } |
| 6282 // We should save the state only if we |
| 6283 // are sure that we are done with the |
| 6284 // previous iterator state |
| 6285 if(canUpdateState) { |
| 6286 newState = s.iterator->getState(s.iterator); |
| 6287 if(newState != UITER_NO_STATE) { |
| 6288 iterState = newState; |
| 6289 cces = 0; |
| 6290 } |
| 6291 } |
| 6292 CE = ucol_IGetNextCE(coll, &s, status); |
| 6293 cces++; |
| 6294 if(CE==UCOL_NO_MORE_CES) { |
| 6295 // Add the level separator |
| 6296 terminatePSKLevel(level, maxLevel, i, dest); |
| 6297 //dest[i++] = UCOL_LEVELTERMINATOR; |
| 6298 byteCountOrFrenchDone = 0; |
| 6299 // Restart the iteration an move to the |
| 6300 // second level |
| 6301 s.iterator->move(s.iterator, 0, UITER_START); |
| 6302 cces = 0; |
| 6303 level = UCOL_PSK_QUIN; |
| 6304 break; |
| 6305 } |
| 6306 if(CE==0) |
| 6307 continue; |
| 6308 if(isShiftedCE(CE, LVT, &wasShifted)) { |
| 6309 CE >>= 16; /* get primary */ |
| 6310 if(CE != 0) { |
| 6311 if(byteCountOrFrenchDone == 0) { |
| 6312 dest[i++]=(uint8_t)(CE >> 8); |
| 6313 } else { |
| 6314 byteCountOrFrenchDone = 0; |
| 6315 } |
| 6316 if((CE &=0xff)!=0) { |
| 6317 if(i==count) { |
| 6318 /* overflow */ |
| 6319 byteCountOrFrenchDone = 1; |
| 6320 goto saveState; |
| 6321 } |
| 6322 dest[i++]=(uint8_t)CE; |
| 6323 } |
| 6324 } |
| 6325 } else { |
| 6326 notIsContinuation = !isContinuation(CE); |
| 6327 if(notIsContinuation) { |
| 6328 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a
nd we need to note it |
| 6329 dest[i++] = UCOL_HIRAGANA_QUAD; |
| 6330 } else { |
| 6331 dest[i++] = 0xFF; |
| 6332 } |
| 6333 } |
| 6334 } |
| 6335 if(uprv_numAvailableExpCEs(s)) { |
| 6336 canUpdateState = FALSE; |
| 6337 } else { |
| 6338 canUpdateState = TRUE; |
| 6339 } |
| 6340 } |
| 6341 } else { |
| 6342 // if we're not doing quaternary |
| 6343 // skip to the end |
| 6344 level = UCOL_PSK_NULL; |
| 6345 } |
| 6346 /* fall through to next level */ |
| 6347 case UCOL_PSK_QUIN: |
| 6348 level = UCOL_PSK_IDENTICAL; |
| 6349 /* fall through to next level */ |
| 6350 case UCOL_PSK_IDENTICAL: |
| 6351 if(strength >= UCOL_IDENTICAL) { |
| 6352 UChar32 first, second; |
| 6353 int32_t bocsuBytesWritten = 0; |
| 6354 // We always need to do identical on |
| 6355 // the NFD form of the string. |
| 6356 if(normIter == NULL) { |
| 6357 // we arrived from the level below and |
| 6358 // normalization was not turned on. |
| 6359 // therefore, we need to make a fresh NFD iterator |
| 6360 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter),
status); |
| 6361 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
| 6362 } else if(!doingIdenticalFromStart) { |
| 6363 // there is an iterator, but we did some other levels. |
| 6364 // therefore, we have a FCD iterator - need to make |
| 6365 // a NFD one. |
| 6366 // normIter being at the beginning does not guarantee |
| 6367 // that the underlying iterator is at the beginning |
| 6368 iter->move(iter, 0, UITER_START); |
| 6369 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
| 6370 } |
| 6371 // At this point we have a NFD iterator that is positioned |
| 6372 // in the right place |
| 6373 if(U_FAILURE(*status)) { |
| 6374 UTRACE_EXIT_STATUS(*status); |
| 6375 return 0; |
| 6376 } |
| 6377 first = uiter_previous32(s.iterator); |
| 6378 // maybe we're at the start of the string |
| 6379 if(first == U_SENTINEL) { |
| 6380 first = 0; |
| 6381 } else { |
| 6382 uiter_next32(s.iterator); |
| 6383 } |
| 6384 |
| 6385 j = 0; |
| 6386 for(;;) { |
| 6387 if(i == count) { |
| 6388 if(j+1 < bocsuBytesWritten) { |
| 6389 bocsuBytesUsed = j+1; |
| 6390 } |
| 6391 goto saveState; |
| 6392 } |
| 6393 |
| 6394 // On identical level, we will always save |
| 6395 // the state if we reach this point, since |
| 6396 // we don't depend on getNextCE for content |
| 6397 // all the content is in our buffer and we |
| 6398 // already either stored the full buffer OR |
| 6399 // otherwise we won't arrive here. |
| 6400 newState = s.iterator->getState(s.iterator); |
| 6401 if(newState != UITER_NO_STATE) { |
| 6402 iterState = newState; |
| 6403 cces = 0; |
| 6404 } |
| 6405 |
| 6406 uint8_t buff[4]; |
| 6407 second = uiter_next32(s.iterator); |
| 6408 cces++; |
| 6409 |
| 6410 // end condition for identical level |
| 6411 if(second == U_SENTINEL) { |
| 6412 terminatePSKLevel(level, maxLevel, i, dest); |
| 6413 level = UCOL_PSK_NULL; |
| 6414 break; |
| 6415 } |
| 6416 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco
nd, buff); |
| 6417 first = second; |
| 6418 |
| 6419 j = 0; |
| 6420 if(bocsuBytesUsed != 0) { |
| 6421 while(bocsuBytesUsed-->0) { |
| 6422 j++; |
| 6423 } |
| 6424 } |
| 6425 |
| 6426 while(i < count && j < bocsuBytesWritten) { |
| 6427 dest[i++] = buff[j++]; |
| 6428 } |
| 6429 } |
| 6430 |
| 6431 } else { |
| 6432 level = UCOL_PSK_NULL; |
| 6433 } |
| 6434 /* fall through to next level */ |
| 6435 case UCOL_PSK_NULL: |
| 6436 j = i; |
| 6437 while(j<count) { |
| 6438 dest[j++]=0; |
| 6439 } |
| 6440 break; |
| 6441 default: |
| 6442 *status = U_INTERNAL_PROGRAM_ERROR; |
| 6443 UTRACE_EXIT_STATUS(*status); |
| 6444 return 0; |
| 6445 } |
| 6446 |
| 6447 saveState: |
| 6448 // Now we need to return stuff. First we want to see whether we have |
| 6449 // done everything for the current state of iterator. |
| 6450 if(byteCountOrFrenchDone |
| 6451 || canUpdateState == FALSE |
| 6452 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) |
| 6453 { |
| 6454 // Any of above mean that the previous transaction |
| 6455 // wasn't finished and that we should store the |
| 6456 // previous iterator state. |
| 6457 state[0] = iterState; |
| 6458 } else { |
| 6459 // The transaction is complete. We will continue in the next iteration. |
| 6460 state[0] = s.iterator->getState(s.iterator); |
| 6461 cces = 0; |
| 6462 } |
| 6463 // Store the number of bocsu bytes written. |
| 6464 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { |
| 6465 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 6466 } |
| 6467 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY
TES_SHIFT; |
| 6468 |
| 6469 // Next we put in the level of comparison |
| 6470 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); |
| 6471 |
| 6472 // If we are doing French, we need to store whether we have just finished th
e French level |
| 6473 if(level == UCOL_PSK_SECONDARY && doingFrench) { |
| 6474 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK)
<< UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); |
| 6475 } else { |
| 6476 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE
_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); |
| 6477 } |
| 6478 |
| 6479 // Was the latest CE shifted |
| 6480 if(wasShifted) { |
| 6481 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; |
| 6482 } |
| 6483 // Check for cces overflow |
| 6484 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { |
| 6485 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 6486 } |
| 6487 // Store cces |
| 6488 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH
IFT); |
| 6489 |
| 6490 // Check for French overflow |
| 6491 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { |
| 6492 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 6493 } |
| 6494 // Store number of bytes written in the French secondary continuation sequen
ce |
| 6495 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC
H_SHIFT); |
| 6496 |
| 6497 |
| 6498 // If we have used normalizing iterator, get rid of it |
| 6499 if(normIter != NULL) { |
| 6500 unorm_closeIter(normIter); |
| 6501 } |
| 6502 |
| 6503 /* To avoid memory leak, free the offset buffer if necessary. */ |
| 6504 ucol_freeOffsetBuffer(&s); |
| 6505 |
| 6506 // Return number of meaningful sortkey bytes. |
| 6507 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", |
| 6508 dest,i, state[0], state[1]); |
| 6509 UTRACE_EXIT_VALUE(i); |
| 6510 return i; |
| 6511 } |
| 6512 |
| 6513 /** |
| 6514 * Produce a bound for a given sortkey and a number of levels. |
| 6515 */ |
| 6516 U_CAPI int32_t U_EXPORT2 |
| 6517 ucol_getBound(const uint8_t *source, |
| 6518 int32_t sourceLength, |
| 6519 UColBoundMode boundType, |
| 6520 uint32_t noOfLevels, |
| 6521 uint8_t *result, |
| 6522 int32_t resultLength, |
| 6523 UErrorCode *status) |
| 6524 { |
| 6525 // consistency checks |
| 6526 if(status == NULL || U_FAILURE(*status)) { |
| 6527 return 0; |
| 6528 } |
| 6529 if(source == NULL) { |
| 6530 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6531 return 0; |
| 6532 } |
| 6533 |
| 6534 int32_t sourceIndex = 0; |
| 6535 // Scan the string until we skip enough of the key OR reach the end of the k
ey |
| 6536 do { |
| 6537 sourceIndex++; |
| 6538 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { |
| 6539 noOfLevels--; |
| 6540 } |
| 6541 } while (noOfLevels > 0 |
| 6542 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); |
| 6543 |
| 6544 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) |
| 6545 && noOfLevels > 0) { |
| 6546 *status = U_SORT_KEY_TOO_SHORT_WARNING; |
| 6547 } |
| 6548 |
| 6549 |
| 6550 // READ ME: this code assumes that the values for boundType |
| 6551 // enum will not changes. They are set so that the enum value |
| 6552 // corresponds to the number of extra bytes each bound type |
| 6553 // needs. |
| 6554 if(result != NULL && resultLength >= sourceIndex+boundType) { |
| 6555 uprv_memcpy(result, source, sourceIndex); |
| 6556 switch(boundType) { |
| 6557 // Lower bound just gets terminated. No extra bytes |
| 6558 case UCOL_BOUND_LOWER: // = 0 |
| 6559 break; |
| 6560 // Upper bound needs one extra byte |
| 6561 case UCOL_BOUND_UPPER: // = 1 |
| 6562 result[sourceIndex++] = 2; |
| 6563 break; |
| 6564 // Upper long bound needs two extra bytes |
| 6565 case UCOL_BOUND_UPPER_LONG: // = 2 |
| 6566 result[sourceIndex++] = 0xFF; |
| 6567 result[sourceIndex++] = 0xFF; |
| 6568 break; |
| 6569 default: |
| 6570 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6571 return 0; |
| 6572 } |
| 6573 result[sourceIndex++] = 0; |
| 6574 |
| 6575 return sourceIndex; |
| 6576 } else { |
| 6577 return sourceIndex+boundType+1; |
| 6578 } |
| 6579 } |
| 6580 |
| 6581 /****************************************************************************/ |
| 6582 /* Following are the functions that deal with the properties of a collator */ |
| 6583 /* there are new APIs and some compatibility APIs */ |
| 6584 /****************************************************************************/ |
| 6585 |
| 6586 static inline void |
| 6587 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, |
| 6588 int32_t *primShift, int32_t *secShift, int32_t *terShift) |
| 6589 { |
| 6590 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; |
| 6591 UBool reverseSecondary = FALSE; |
| 6592 UBool continuation = isContinuation(CE); |
| 6593 if(!continuation) { |
| 6594 tertiary = (uint8_t)((CE & coll->tertiaryMask)); |
| 6595 tertiary ^= coll->caseSwitch; |
| 6596 reverseSecondary = TRUE; |
| 6597 } else { |
| 6598 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); |
| 6599 tertiary &= UCOL_REMOVE_CASE; |
| 6600 reverseSecondary = FALSE; |
| 6601 } |
| 6602 |
| 6603 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 6604 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); |
| 6605 primary1 = (uint8_t)(CE >> 8); |
| 6606 |
| 6607 if(primary1 != 0) { |
| 6608 if (coll->leadBytePermutationTable != NULL && !continuation) { |
| 6609 primary1 = coll->leadBytePermutationTable[primary1]; |
| 6610 } |
| 6611 |
| 6612 coll->latinOneCEs[ch] |= (primary1 << *primShift); |
| 6613 *primShift -= 8; |
| 6614 } |
| 6615 if(primary2 != 0) { |
| 6616 if(*primShift < 0) { |
| 6617 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; |
| 6618 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
| 6619 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
| 6620 return; |
| 6621 } |
| 6622 coll->latinOneCEs[ch] |= (primary2 << *primShift); |
| 6623 *primShift -= 8; |
| 6624 } |
| 6625 if(secondary != 0) { |
| 6626 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se
condary |
| 6627 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo
r secondary |
| 6628 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); |
| 6629 } else { // normal case |
| 6630 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secSh
ift); |
| 6631 } |
| 6632 *secShift -= 8; |
| 6633 } |
| 6634 if(tertiary != 0) { |
| 6635 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift
); |
| 6636 *terShift -= 8; |
| 6637 } |
| 6638 } |
| 6639 |
| 6640 static inline UBool |
| 6641 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { |
| 6642 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); |
| 6643 if(newTable == NULL) { |
| 6644 *status = U_MEMORY_ALLOCATION_ERROR; |
| 6645 coll->latinOneFailed = TRUE; |
| 6646 return FALSE; |
| 6647 } |
| 6648 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable
Len)*sizeof(uint32_t); |
| 6649 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); |
| 6650 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); |
| 6651 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC
opy); |
| 6652 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz
eToCopy); |
| 6653 coll->latinOneTableLen = size; |
| 6654 uprv_free(coll->latinOneCEs); |
| 6655 coll->latinOneCEs = newTable; |
| 6656 return TRUE; |
| 6657 } |
| 6658 |
| 6659 static UBool |
| 6660 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { |
| 6661 UBool result = TRUE; |
| 6662 if(coll->latinOneCEs == NULL) { |
| 6663 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINO
NETABLELEN*3); |
| 6664 if(coll->latinOneCEs == NULL) { |
| 6665 *status = U_MEMORY_ALLOCATION_ERROR; |
| 6666 return FALSE; |
| 6667 } |
| 6668 coll->latinOneTableLen = UCOL_LATINONETABLELEN; |
| 6669 } |
| 6670 UChar ch = 0; |
| 6671 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); |
| 6672 // Check for null pointer |
| 6673 if (U_FAILURE(*status)) { |
| 6674 return FALSE; |
| 6675 } |
| 6676 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3)
; |
| 6677 |
| 6678 int32_t primShift = 24, secShift = 24, terShift = 24; |
| 6679 uint32_t CE = 0; |
| 6680 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; |
| 6681 |
| 6682 // TODO: make safe if you get more than you wanted... |
| 6683 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { |
| 6684 primShift = 24; secShift = 24; terShift = 24; |
| 6685 if(ch < 0x100) { |
| 6686 CE = coll->latinOneMapping[ch]; |
| 6687 } else { |
| 6688 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| 6689 if(CE == UCOL_NOT_FOUND && coll->UCA) { |
| 6690 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| 6691 } |
| 6692 } |
| 6693 if(CE < UCOL_NOT_FOUND) { |
| 6694 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift
); |
| 6695 } else { |
| 6696 switch (getCETag(CE)) { |
| 6697 case EXPANSION_TAG: |
| 6698 case DIGIT_TAG: |
| 6699 ucol_setText(it, &ch, 1, status); |
| 6700 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { |
| 6701 if(primShift < 0 || secShift < 0 || terShift < 0) { |
| 6702 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; |
| 6703 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL
_OUT_CE; |
| 6704 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA
IL_OUT_CE; |
| 6705 break; |
| 6706 } |
| 6707 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &
terShift); |
| 6708 } |
| 6709 break; |
| 6710 case CONTRACTION_TAG: |
| 6711 // here is the trick |
| 6712 // F2 is contraction. We do something very similar to contractio
ns |
| 6713 // but have two indices, one in the real contraction table and t
he |
| 6714 // other to where we stuffed things. This hopes that we don't ha
ve |
| 6715 // many contractions (this should work for latin-1 tables). |
| 6716 { |
| 6717 if((CE & 0x00FFF000) != 0) { |
| 6718 *status = U_UNSUPPORTED_ERROR; |
| 6719 goto cleanup_after_failure; |
| 6720 } |
| 6721 |
| 6722 const UChar *UCharOffset = (UChar *)coll->image+getContractO
ffset(CE); |
| 6723 |
| 6724 CE |= (contractionOffset & 0xFFF) << 12; // insert the offse
t in latin-1 table |
| 6725 |
| 6726 coll->latinOneCEs[ch] = CE; |
| 6727 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; |
| 6728 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; |
| 6729 |
| 6730 // We're going to jump into contraction table, pick the elem
ents |
| 6731 // and use them |
| 6732 do { |
| 6733 CE = *(coll->contractionCEs + |
| 6734 (UCharOffset - coll->contractionIndex)); |
| 6735 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG)
{ |
| 6736 uint32_t size; |
| 6737 uint32_t i; /* general counter */ |
| 6738 uint32_t *CEOffset = (uint32_t *)coll->image+getExpa
nsionOffset(CE); /* find the offset to expansion table */ |
| 6739 size = getExpansionCount(CE); |
| 6740 //CE = *CEOffset++; |
| 6741 if(size != 0) { /* if there are less than 16 element
s in expansion, we don't terminate */ |
| 6742 for(i = 0; i<size; i++) { |
| 6743 if(primShift < 0 || secShift < 0 || terShift
< 0) { |
| 6744 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; |
| 6745 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
| 6746 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
| 6747 break; |
| 6748 } |
| 6749 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); |
| 6750 } |
| 6751 } else { /* else, we do */ |
| 6752 while(*CEOffset != 0) { |
| 6753 if(primShift < 0 || secShift < 0 || terShift
< 0) { |
| 6754 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; |
| 6755 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
| 6756 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
| 6757 break; |
| 6758 } |
| 6759 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); |
| 6760 } |
| 6761 } |
| 6762 contractionOffset++; |
| 6763 } else if(CE < UCOL_NOT_FOUND) { |
| 6764 ucol_addLatinOneEntry(coll, (UChar)contractionOffset
++, CE, &primShift, &secShift, &terShift); |
| 6765 } else { |
| 6766 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B
AIL_OUT_CE; |
| 6767 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont
ractionOffset] = UCOL_BAIL_OUT_CE; |
| 6768 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co
ntractionOffset] = UCOL_BAIL_OUT_CE; |
| 6769 contractionOffset++; |
| 6770 } |
| 6771 UCharOffset++; |
| 6772 primShift = 24; secShift = 24; terShift = 24; |
| 6773 if(contractionOffset == coll->latinOneTableLen) { // we
need to reallocate |
| 6774 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT
ableLen, status)) { |
| 6775 goto cleanup_after_failure; |
| 6776 } |
| 6777 } |
| 6778 } while(*UCharOffset != 0xFFFF); |
| 6779 } |
| 6780 break;; |
| 6781 case SPEC_PROC_TAG: |
| 6782 { |
| 6783 // 0xB7 is a precontext character defined in UCA5.1, a speci
al |
| 6784 // handle is implemeted in order to save LatinOne table for |
| 6785 // most locales. |
| 6786 if (ch==0xb7) { |
| 6787 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif
t, &terShift); |
| 6788 } |
| 6789 else { |
| 6790 goto cleanup_after_failure; |
| 6791 } |
| 6792 } |
| 6793 break; |
| 6794 default: |
| 6795 goto cleanup_after_failure; |
| 6796 } |
| 6797 } |
| 6798 } |
| 6799 // compact table |
| 6800 if(contractionOffset < coll->latinOneTableLen) { |
| 6801 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { |
| 6802 goto cleanup_after_failure; |
| 6803 } |
| 6804 } |
| 6805 ucol_closeElements(it); |
| 6806 return result; |
| 6807 |
| 6808 cleanup_after_failure: |
| 6809 // status should already be set before arriving here. |
| 6810 coll->latinOneFailed = TRUE; |
| 6811 ucol_closeElements(it); |
| 6812 return FALSE; |
| 6813 } |
| 6814 |
| 6815 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { |
| 6816 if(U_SUCCESS(*status)) { |
| 6817 if(coll->caseFirst == UCOL_UPPER_FIRST) { |
| 6818 coll->caseSwitch = UCOL_CASE_SWITCH; |
| 6819 } else { |
| 6820 coll->caseSwitch = UCOL_NO_CASE_SWITCH; |
| 6821 } |
| 6822 |
| 6823 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { |
| 6824 coll->tertiaryMask = UCOL_REMOVE_CASE; |
| 6825 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; |
| 6826 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /*
Should be 0x80 */ |
| 6827 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; |
| 6828 coll->tertiaryBottom = UCOL_COMMON_BOT3; |
| 6829 } else { |
| 6830 coll->tertiaryMask = UCOL_KEEP_CASE; |
| 6831 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; |
| 6832 if(coll->caseFirst == UCOL_UPPER_FIRST) { |
| 6833 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; |
| 6834 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; |
| 6835 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; |
| 6836 } else { |
| 6837 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; |
| 6838 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; |
| 6839 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; |
| 6840 } |
| 6841 } |
| 6842 |
| 6843 /* Set the compression values */ |
| 6844 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1
); |
| 6845 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* w
e multilply double with int, but need only int */ |
| 6846 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC
ount); |
| 6847 |
| 6848 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY |
| 6849 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U
COL_NON_IGNORABLE) |
| 6850 { |
| 6851 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; |
| 6852 } else { |
| 6853 coll->sortKeyGen = ucol_calcSortKey; |
| 6854 } |
| 6855 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col
l->numericCollation == UCOL_OFF |
| 6856 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF
ailed) |
| 6857 { |
| 6858 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { |
| 6859 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build
ing latin1 table, we'll use it |
| 6860 //fprintf(stderr, "F"); |
| 6861 coll->latinOneUse = TRUE; |
| 6862 } else { |
| 6863 coll->latinOneUse = FALSE; |
| 6864 } |
| 6865 if(*status == U_UNSUPPORTED_ERROR) { |
| 6866 *status = U_ZERO_ERROR; |
| 6867 } |
| 6868 } else { // latin1Table exists and it doesn't need to be regenerated
, just use it |
| 6869 coll->latinOneUse = TRUE; |
| 6870 } |
| 6871 } else { |
| 6872 coll->latinOneUse = FALSE; |
| 6873 } |
| 6874 } |
| 6875 } |
| 6876 |
| 6877 U_CAPI uint32_t U_EXPORT2 |
| 6878 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
e *status) { |
| 6879 if(U_FAILURE(*status) || coll == NULL) { |
| 6880 return 0; |
| 6881 } |
| 6882 if(len == -1) { |
| 6883 len = u_strlen(varTop); |
| 6884 } |
| 6885 if(len == 0) { |
| 6886 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6887 return 0; |
| 6888 } |
| 6889 |
| 6890 collIterate s; |
| 6891 IInit_collIterate(coll, varTop, len, &s, status); |
| 6892 if(U_FAILURE(*status)) { |
| 6893 return 0; |
| 6894 } |
| 6895 |
| 6896 uint32_t CE = ucol_IGetNextCE(coll, &s, status); |
| 6897 |
| 6898 /* here we check if we have consumed all characters */ |
| 6899 /* you can put in either one character or a contraction */ |
| 6900 /* you shouldn't put more... */ |
| 6901 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { |
| 6902 *status = U_CE_NOT_FOUND_ERROR; |
| 6903 return 0; |
| 6904 } |
| 6905 |
| 6906 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); |
| 6907 |
| 6908 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { |
| 6909 *status = U_PRIMARY_TOO_LONG_ERROR; |
| 6910 return 0; |
| 6911 } |
| 6912 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { |
| 6913 coll->variableTopValueisDefault = FALSE; |
| 6914 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; |
| 6915 } |
| 6916 |
| 6917 /* To avoid memory leak, free the offset buffer if necessary. */ |
| 6918 ucol_freeOffsetBuffer(&s); |
| 6919 |
| 6920 return CE & UCOL_PRIMARYMASK; |
| 6921 } |
| 6922 |
| 6923 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode
*status) { |
| 6924 if(U_FAILURE(*status) || coll == NULL) { |
| 6925 return 0; |
| 6926 } |
| 6927 return coll->variableTopValue<<16; |
| 6928 } |
| 6929 |
| 6930 U_CAPI void U_EXPORT2 |
| 6931 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat
us) { |
| 6932 if(U_FAILURE(*status) || coll == NULL) { |
| 6933 return; |
| 6934 } |
| 6935 |
| 6936 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { |
| 6937 coll->variableTopValueisDefault = FALSE; |
| 6938 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; |
| 6939 } |
| 6940 } |
| 6941 /* Attribute setter API */ |
| 6942 U_CAPI void U_EXPORT2 |
| 6943 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UErrorCode *status) { |
| 6944 if(U_FAILURE(*status) || coll == NULL) { |
| 6945 return; |
| 6946 } |
| 6947 UColAttributeValue oldFrench = coll->frenchCollation; |
| 6948 UColAttributeValue oldCaseFirst = coll->caseFirst; |
| 6949 switch(attr) { |
| 6950 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ |
| 6951 if(value == UCOL_ON) { |
| 6952 coll->numericCollation = UCOL_ON; |
| 6953 coll->numericCollationisDefault = FALSE; |
| 6954 } else if (value == UCOL_OFF) { |
| 6955 coll->numericCollation = UCOL_OFF; |
| 6956 coll->numericCollationisDefault = FALSE; |
| 6957 } else if (value == UCOL_DEFAULT) { |
| 6958 coll->numericCollationisDefault = TRUE; |
| 6959 coll->numericCollation = (UColAttributeValue)coll->options->numericC
ollation; |
| 6960 } else { |
| 6961 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6962 } |
| 6963 break; |
| 6964 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan
a */ |
| 6965 if(value == UCOL_ON) { |
| 6966 coll->hiraganaQ = UCOL_ON; |
| 6967 coll->hiraganaQisDefault = FALSE; |
| 6968 } else if (value == UCOL_OFF) { |
| 6969 coll->hiraganaQ = UCOL_OFF; |
| 6970 coll->hiraganaQisDefault = FALSE; |
| 6971 } else if (value == UCOL_DEFAULT) { |
| 6972 coll->hiraganaQisDefault = TRUE; |
| 6973 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; |
| 6974 } else { |
| 6975 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6976 } |
| 6977 break; |
| 6978 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ |
| 6979 if(value == UCOL_ON) { |
| 6980 coll->frenchCollation = UCOL_ON; |
| 6981 coll->frenchCollationisDefault = FALSE; |
| 6982 } else if (value == UCOL_OFF) { |
| 6983 coll->frenchCollation = UCOL_OFF; |
| 6984 coll->frenchCollationisDefault = FALSE; |
| 6985 } else if (value == UCOL_DEFAULT) { |
| 6986 coll->frenchCollationisDefault = TRUE; |
| 6987 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol
lation; |
| 6988 } else { |
| 6989 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 6990 } |
| 6991 break; |
| 6992 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ |
| 6993 if(value == UCOL_SHIFTED) { |
| 6994 coll->alternateHandling = UCOL_SHIFTED; |
| 6995 coll->alternateHandlingisDefault = FALSE; |
| 6996 } else if (value == UCOL_NON_IGNORABLE) { |
| 6997 coll->alternateHandling = UCOL_NON_IGNORABLE; |
| 6998 coll->alternateHandlingisDefault = FALSE; |
| 6999 } else if (value == UCOL_DEFAULT) { |
| 7000 coll->alternateHandlingisDefault = TRUE; |
| 7001 coll->alternateHandling = (UColAttributeValue)coll->options->alterna
teHandling ; |
| 7002 } else { |
| 7003 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 7004 } |
| 7005 break; |
| 7006 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ |
| 7007 if(value == UCOL_LOWER_FIRST) { |
| 7008 coll->caseFirst = UCOL_LOWER_FIRST; |
| 7009 coll->caseFirstisDefault = FALSE; |
| 7010 } else if (value == UCOL_UPPER_FIRST) { |
| 7011 coll->caseFirst = UCOL_UPPER_FIRST; |
| 7012 coll->caseFirstisDefault = FALSE; |
| 7013 } else if (value == UCOL_OFF) { |
| 7014 coll->caseFirst = UCOL_OFF; |
| 7015 coll->caseFirstisDefault = FALSE; |
| 7016 } else if (value == UCOL_DEFAULT) { |
| 7017 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; |
| 7018 coll->caseFirstisDefault = TRUE; |
| 7019 } else { |
| 7020 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 7021 } |
| 7022 break; |
| 7023 case UCOL_CASE_LEVEL: /* do we have an extra case level */ |
| 7024 if(value == UCOL_ON) { |
| 7025 coll->caseLevel = UCOL_ON; |
| 7026 coll->caseLevelisDefault = FALSE; |
| 7027 } else if (value == UCOL_OFF) { |
| 7028 coll->caseLevel = UCOL_OFF; |
| 7029 coll->caseLevelisDefault = FALSE; |
| 7030 } else if (value == UCOL_DEFAULT) { |
| 7031 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; |
| 7032 coll->caseLevelisDefault = TRUE; |
| 7033 } else { |
| 7034 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 7035 } |
| 7036 break; |
| 7037 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ |
| 7038 if(value == UCOL_ON) { |
| 7039 coll->normalizationMode = UCOL_ON; |
| 7040 coll->normalizationModeisDefault = FALSE; |
| 7041 initializeFCD(status); |
| 7042 } else if (value == UCOL_OFF) { |
| 7043 coll->normalizationMode = UCOL_OFF; |
| 7044 coll->normalizationModeisDefault = FALSE; |
| 7045 } else if (value == UCOL_DEFAULT) { |
| 7046 coll->normalizationModeisDefault = TRUE; |
| 7047 coll->normalizationMode = (UColAttributeValue)coll->options->normali
zationMode; |
| 7048 if(coll->normalizationMode == UCOL_ON) { |
| 7049 initializeFCD(status); |
| 7050 } |
| 7051 } else { |
| 7052 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 7053 } |
| 7054 break; |
| 7055 case UCOL_STRENGTH: /* attribute for strength */ |
| 7056 if (value == UCOL_DEFAULT) { |
| 7057 coll->strengthisDefault = TRUE; |
| 7058 coll->strength = (UColAttributeValue)coll->options->strength; |
| 7059 } else if (value <= UCOL_IDENTICAL) { |
| 7060 coll->strengthisDefault = FALSE; |
| 7061 coll->strength = value; |
| 7062 } else { |
| 7063 *status = U_ILLEGAL_ARGUMENT_ERROR ; |
| 7064 } |
| 7065 break; |
| 7066 case UCOL_ATTRIBUTE_COUNT: |
| 7067 default: |
| 7068 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 7069 break; |
| 7070 } |
| 7071 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { |
| 7072 coll->latinOneRegenTable = TRUE; |
| 7073 } else { |
| 7074 coll->latinOneRegenTable = FALSE; |
| 7075 } |
| 7076 ucol_updateInternalState(coll, status); |
| 7077 } |
| 7078 |
| 7079 U_CAPI UColAttributeValue U_EXPORT2 |
| 7080 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
{ |
| 7081 if(U_FAILURE(*status) || coll == NULL) { |
| 7082 return UCOL_DEFAULT; |
| 7083 } |
| 7084 switch(attr) { |
| 7085 case UCOL_NUMERIC_COLLATION: |
| 7086 return coll->numericCollation; |
| 7087 case UCOL_HIRAGANA_QUATERNARY_MODE: |
| 7088 return coll->hiraganaQ; |
| 7089 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ |
| 7090 return coll->frenchCollation; |
| 7091 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ |
| 7092 return coll->alternateHandling; |
| 7093 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ |
| 7094 return coll->caseFirst; |
| 7095 case UCOL_CASE_LEVEL: /* do we have an extra case level */ |
| 7096 return coll->caseLevel; |
| 7097 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ |
| 7098 return coll->normalizationMode; |
| 7099 case UCOL_STRENGTH: /* attribute for strength */ |
| 7100 return coll->strength; |
| 7101 case UCOL_ATTRIBUTE_COUNT: |
| 7102 default: |
| 7103 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 7104 break; |
| 7105 } |
| 7106 return UCOL_DEFAULT; |
| 7107 } |
| 7108 |
| 7109 U_CAPI void U_EXPORT2 |
| 7110 ucol_setStrength( UCollator *coll, |
| 7111 UCollationStrength strength) |
| 7112 { |
| 7113 UErrorCode status = U_ZERO_ERROR; |
| 7114 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); |
| 7115 } |
| 7116 |
| 7117 U_CAPI UCollationStrength U_EXPORT2 |
| 7118 ucol_getStrength(const UCollator *coll) |
| 7119 { |
| 7120 UErrorCode status = U_ZERO_ERROR; |
| 7121 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); |
| 7122 } |
| 7123 |
| 7124 U_INTERNAL int32_t U_EXPORT2 |
| 7125 ucol_getReorderCodes(const UCollator *coll, |
| 7126 int32_t *dest, |
| 7127 int32_t destCapacity, |
| 7128 UErrorCode *pErrorCode) { |
| 7129 if (U_FAILURE(*pErrorCode)) { |
| 7130 return 0; |
| 7131 } |
| 7132 |
| 7133 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { |
| 7134 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 7135 return 0; |
| 7136 } |
| 7137 |
| 7138 if (coll->reorderCodesLength > destCapacity) { |
| 7139 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
| 7140 return coll->reorderCodesLength; |
| 7141 } |
| 7142 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { |
| 7143 dest[i] = coll->reorderCodes[i]; |
| 7144 } |
| 7145 return coll->reorderCodesLength; |
| 7146 } |
| 7147 |
| 7148 U_INTERNAL void U_EXPORT2 |
| 7149 ucol_setReorderCodes(UCollator *coll, |
| 7150 const int32_t *reorderCodes, |
| 7151 int32_t reorderCodesLength, |
| 7152 UErrorCode *pErrorCode) { |
| 7153 if (U_FAILURE(*pErrorCode)) { |
| 7154 return; |
| 7155 } |
| 7156 |
| 7157 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NUL
L)) { |
| 7158 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 7159 return; |
| 7160 } |
| 7161 |
| 7162 uprv_free(coll->reorderCodes); |
| 7163 coll->reorderCodes = NULL; |
| 7164 coll->reorderCodesLength = 0; |
| 7165 if (reorderCodesLength == 0) { |
| 7166 uprv_free(coll->leadBytePermutationTable); |
| 7167 coll->leadBytePermutationTable = NULL; |
| 7168 return; |
| 7169 } |
| 7170 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int3
2_t)); |
| 7171 if (coll->reorderCodes == NULL) { |
| 7172 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 7173 return; |
| 7174 } |
| 7175 for (int32_t i = 0; i < reorderCodesLength; i++) { |
| 7176 coll->reorderCodes[i] = reorderCodes[i]; |
| 7177 } |
| 7178 coll->reorderCodesLength = reorderCodesLength; |
| 7179 ucol_buildPermutationTable(coll, pErrorCode); |
| 7180 if (U_FAILURE(*pErrorCode)) { |
| 7181 uprv_free(coll->reorderCodes); |
| 7182 coll->reorderCodes = NULL; |
| 7183 coll->reorderCodesLength = 0; |
| 7184 } |
| 7185 } |
| 7186 |
| 7187 |
| 7188 /****************************************************************************/ |
| 7189 /* Following are misc functions */ |
| 7190 /* there are new APIs and some compatibility APIs */ |
| 7191 /****************************************************************************/ |
| 7192 |
| 7193 U_CAPI void U_EXPORT2 |
| 7194 ucol_getVersion(const UCollator* coll, |
| 7195 UVersionInfo versionInfo) |
| 7196 { |
| 7197 /* RunTime version */ |
| 7198 uint8_t rtVersion = UCOL_RUNTIME_VERSION; |
| 7199 /* Builder version*/ |
| 7200 uint8_t bdVersion = coll->image->version[0]; |
| 7201 |
| 7202 /* Charset Version. Need to get the version from cnv files |
| 7203 * makeconv should populate cnv files with version and |
| 7204 * an api has to be provided in ucnv.h to obtain this version |
| 7205 */ |
| 7206 uint8_t csVersion = 0; |
| 7207 |
| 7208 /* combine the version info */ |
| 7209 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersi
on)); |
| 7210 |
| 7211 /* Tailoring rules */ |
| 7212 versionInfo[0] = (uint8_t)(cmbVersion>>8); |
| 7213 versionInfo[1] = (uint8_t)cmbVersion; |
| 7214 versionInfo[2] = coll->image->version[1]; |
| 7215 if(coll->UCA) { |
| 7216 /* Include the minor number when getting the UCA version. (major & 1f) <
< 3 | (minor & 7) */ |
| 7217 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->
UCA->image->UCAVersion[1] & 0x07); |
| 7218 } else { |
| 7219 versionInfo[3] = 0; |
| 7220 } |
| 7221 } |
| 7222 |
| 7223 |
| 7224 /* This internal API checks whether a character is tailored or not */ |
| 7225 U_CAPI UBool U_EXPORT2 |
| 7226 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { |
| 7227 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { |
| 7228 return FALSE; |
| 7229 } |
| 7230 |
| 7231 uint32_t CE = UCOL_NOT_FOUND; |
| 7232 const UChar *ContractionStart = NULL; |
| 7233 if(u < 0x100) { /* latin-1 */ |
| 7234 CE = coll->latinOneMapping[u]; |
| 7235 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { |
| 7236 return FALSE; |
| 7237 } |
| 7238 } else { /* regular */ |
| 7239 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); |
| 7240 } |
| 7241 |
| 7242 if(isContraction(CE)) { |
| 7243 ContractionStart = (UChar *)coll->image+getContractOffset(CE); |
| 7244 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)
); |
| 7245 } |
| 7246 |
| 7247 return (UBool)(CE != UCOL_NOT_FOUND); |
| 7248 } |
| 7249 |
| 7250 |
| 7251 /****************************************************************************/ |
| 7252 /* Following are the string compare functions */ |
| 7253 /* */ |
| 7254 /****************************************************************************/ |
| 7255 |
| 7256 |
| 7257 /* ucol_checkIdent internal function. Does byte level string compare. */ |
| 7258 /* Used by strcoll if strength == identical and strings */ |
| 7259 /* are otherwise equal. */ |
| 7260 /* */ |
| 7261 /* Comparison must be done on NFD normalized strings. */ |
| 7262 /* FCD is not good enough. */ |
| 7263 |
| 7264 static |
| 7265 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo
l normalize, UErrorCode *status) |
| 7266 { |
| 7267 // When we arrive here, we can have normal strings or UCharIterators. Curren
tly they are both |
| 7268 // of same type, but that doesn't really mean that it will stay that way. |
| 7269 int32_t comparison; |
| 7270 |
| 7271 if (sColl->flags & UCOL_USE_ITERATOR) { |
| 7272 // The division for the array length may truncate the array size to |
| 7273 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too
high |
| 7274 // for all platforms anyway. |
| 7275 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
| 7276 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
| 7277 UNormIterator *sNIt = NULL, *tNIt = NULL; |
| 7278 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); |
| 7279 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); |
| 7280 sColl->iterator->move(sColl->iterator, 0, UITER_START); |
| 7281 tColl->iterator->move(tColl->iterator, 0, UITER_START); |
| 7282 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta
tus); |
| 7283 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta
tus); |
| 7284 comparison = u_strCompareIter(sIt, tIt, TRUE); |
| 7285 unorm_closeIter(sNIt); |
| 7286 unorm_closeIter(tNIt); |
| 7287 } else { |
| 7288 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl-
>endp - sColl->string) : -1; |
| 7289 const UChar *sBuf = sColl->string; |
| 7290 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl-
>endp - tColl->string) : -1; |
| 7291 const UChar *tBuf = tColl->string; |
| 7292 |
| 7293 if (normalize) { |
| 7294 *status = U_ZERO_ERROR; |
| 7295 // Note: We could use Normalizer::compare() or similar, but for shor
t strings |
| 7296 // which may not be in FCD it might be faster to just NFD them. |
| 7297 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha
n |
| 7298 // NFD'ing immediately might be faster for long strings, |
| 7299 // but string comparison is usually done on relatively short strings
. |
| 7300 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN
) == 0, sBuf, sLen), |
| 7301 sColl->writableBuffer, |
| 7302 *status); |
| 7303 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN
) == 0, tBuf, tLen), |
| 7304 tColl->writableBuffer, |
| 7305 *status); |
| 7306 if(U_FAILURE(*status)) { |
| 7307 return UCOL_LESS; |
| 7308 } |
| 7309 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ
ableBuffer); |
| 7310 } else { |
| 7311 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); |
| 7312 } |
| 7313 } |
| 7314 |
| 7315 if (comparison < 0) { |
| 7316 return UCOL_LESS; |
| 7317 } else if (comparison == 0) { |
| 7318 return UCOL_EQUAL; |
| 7319 } else /* comparison > 0 */ { |
| 7320 return UCOL_GREATER; |
| 7321 } |
| 7322 } |
| 7323 |
/* CEBuf - A struct and some inline functions to handle the saving */
/* of CEs in a buffer within ucol_strcoll                          */
/* The buffer starts out using the array embedded in the struct;   */
/* ucol_CEBuf_Expand() moves it to heap storage when it fills up.  */

/* Number of CEs that fit in the embedded array of a ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t *buf;   /* start of the current storage (localArray until expanded) */
    uint32_t *endp;  /* one past the last usable slot of the current storage */
    uint32_t *pos;   /* next slot to be written by UCOL_CEBUF_PUT() */
    uint32_t localArray[UCOL_CEBUF_SIZE]; /* initial in-struct storage */
} ucol_CEBuf;
| 7334 |
| 7335 |
| 7336 static |
| 7337 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { |
| 7338 (b)->buf = (b)->pos = (b)->localArray; |
| 7339 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; |
| 7340 } |
| 7341 |
| 7342 static |
| 7343 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { |
| 7344 uint32_t oldSize; |
| 7345 uint32_t newSize; |
| 7346 uint32_t *newBuf; |
| 7347 |
| 7348 ci->flags |= UCOL_ITER_ALLOCATED; |
| 7349 oldSize = (uint32_t)(b->pos - b->buf); |
| 7350 newSize = oldSize * 2; |
| 7351 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); |
| 7352 if(newBuf == NULL) { |
| 7353 *status = U_MEMORY_ALLOCATION_ERROR; |
| 7354 } |
| 7355 else { |
| 7356 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); |
| 7357 if (b->buf != b->localArray) { |
| 7358 uprv_free(b->buf); |
| 7359 } |
| 7360 b->buf = newBuf; |
| 7361 b->endp = b->buf + newSize; |
| 7362 b->pos = b->buf + oldSize; |
| 7363 } |
| 7364 } |
| 7365 |
| 7366 static |
| 7367 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCo
de *status) { |
| 7368 if (b->pos == b->endp) { |
| 7369 ucol_CEBuf_Expand(b, ci, status); |
| 7370 } |
| 7371 if (U_SUCCESS(*status)) { |
| 7372 *(b)->pos++ = ce; |
| 7373 } |
| 7374 } |
| 7375 |
| 7376 /* This is a trick string compare function that goes in and uses sortkeys to com
pare */ |
| 7377 /* It is used when compare gets in trouble and needs to bail out
*/ |
| 7378 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, |
| 7379 collIterate *tColl, |
| 7380 UErrorCode *status) |
| 7381 { |
| 7382 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; |
| 7383 uint8_t *sourceKeyP = sourceKey; |
| 7384 uint8_t *targetKeyP = targetKey; |
| 7385 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; |
| 7386 const UCollator *coll = sColl->coll; |
| 7387 const UChar *source = NULL; |
| 7388 const UChar *target = NULL; |
| 7389 int32_t result = UCOL_EQUAL; |
| 7390 UnicodeString sourceString, targetString; |
| 7391 int32_t sourceLength; |
| 7392 int32_t targetLength; |
| 7393 |
| 7394 if(sColl->flags & UCOL_USE_ITERATOR) { |
| 7395 sColl->iterator->move(sColl->iterator, 0, UITER_START); |
| 7396 tColl->iterator->move(tColl->iterator, 0, UITER_START); |
| 7397 UChar32 c; |
| 7398 while((c=sColl->iterator->next(sColl->iterator))>=0) { |
| 7399 sourceString.append((UChar)c); |
| 7400 } |
| 7401 while((c=tColl->iterator->next(tColl->iterator))>=0) { |
| 7402 targetString.append((UChar)c); |
| 7403 } |
| 7404 source = sourceString.getBuffer(); |
| 7405 sourceLength = sourceString.length(); |
| 7406 target = targetString.getBuffer(); |
| 7407 targetLength = targetString.length(); |
| 7408 } else { // no iterators |
| 7409 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo
ll->string):-1; |
| 7410 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo
ll->string):-1; |
| 7411 source = sColl->string; |
| 7412 target = tColl->string; |
| 7413 } |
| 7414 |
| 7415 |
| 7416 |
| 7417 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc
eKeyLen); |
| 7418 if(sourceKeyLen > UCOL_MAX_BUFFER) { |
| 7419 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); |
| 7420 if(sourceKeyP == NULL) { |
| 7421 *status = U_MEMORY_ALLOCATION_ERROR; |
| 7422 goto cleanup_and_do_compare; |
| 7423 } |
| 7424 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s
ourceKeyLen); |
| 7425 } |
| 7426 |
| 7427 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe
tKeyLen); |
| 7428 if(targetKeyLen > UCOL_MAX_BUFFER) { |
| 7429 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); |
| 7430 if(targetKeyP == NULL) { |
| 7431 *status = U_MEMORY_ALLOCATION_ERROR; |
| 7432 goto cleanup_and_do_compare; |
| 7433 } |
| 7434 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t
argetKeyLen); |
| 7435 } |
| 7436 |
| 7437 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); |
| 7438 |
| 7439 cleanup_and_do_compare: |
| 7440 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { |
| 7441 uprv_free(sourceKeyP); |
| 7442 } |
| 7443 |
| 7444 if(targetKeyP != NULL && targetKeyP != targetKey) { |
| 7445 uprv_free(targetKeyP); |
| 7446 } |
| 7447 |
| 7448 if(result<0) { |
| 7449 return UCOL_LESS; |
| 7450 } else if(result>0) { |
| 7451 return UCOL_GREATER; |
| 7452 } else { |
| 7453 return UCOL_EQUAL; |
| 7454 } |
| 7455 } |
| 7456 |
| 7457 |
| 7458 static UCollationResult |
| 7459 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) |
| 7460 { |
| 7461 U_ALIGN_CODE(16); |
| 7462 |
| 7463 const UCollator *coll = sColl->coll; |
| 7464 |
| 7465 |
| 7466 // setting up the collator parameters |
| 7467 UColAttributeValue strength = coll->strength; |
| 7468 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); |
| 7469 |
| 7470 UBool checkSecTer = initialCheckSecTer; |
| 7471 UBool checkTertiary = (strength >= UCOL_TERTIARY); |
| 7472 UBool checkQuad = (strength >= UCOL_QUATERNARY); |
| 7473 UBool checkIdent = (strength == UCOL_IDENTICAL); |
| 7474 UBool checkCase = (coll->caseLevel == UCOL_ON); |
| 7475 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; |
| 7476 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); |
| 7477 UBool qShifted = shifted && checkQuad; |
| 7478 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; |
| 7479 |
| 7480 if(doHiragana && shifted) { |
| 7481 return (ucol_compareUsingSortKeys(sColl, tColl, status)); |
| 7482 } |
| 7483 uint8_t caseSwitch = coll->caseSwitch; |
| 7484 uint8_t tertiaryMask = coll->tertiaryMask; |
| 7485 |
| 7486 // This is the lowest primary value that will not be ignored if shifted |
| 7487 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; |
| 7488 |
| 7489 UCollationResult result = UCOL_EQUAL; |
| 7490 UCollationResult hirResult = UCOL_EQUAL; |
| 7491 |
| 7492 // Preparing the CE buffers. They will be filled during the primary phase |
| 7493 ucol_CEBuf sCEs; |
| 7494 ucol_CEBuf tCEs; |
| 7495 UCOL_INIT_CEBUF(&sCEs); |
| 7496 UCOL_INIT_CEBUF(&tCEs); |
| 7497 |
| 7498 uint32_t secS = 0, secT = 0; |
| 7499 uint32_t sOrder=0, tOrder=0; |
| 7500 |
| 7501 // Non shifted primary processing is quite simple |
| 7502 if(!shifted) { |
| 7503 for(;;) { |
| 7504 |
| 7505 // We fetch CEs until we hit a non ignorable primary or end. |
| 7506 do { |
| 7507 // We get the next CE |
| 7508 sOrder = ucol_IGetNextCE(coll, sColl, status); |
| 7509 // Stuff it in the buffer |
| 7510 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7511 // And keep just the primary part. |
| 7512 sOrder &= UCOL_PRIMARYMASK; |
| 7513 } while(sOrder == 0); |
| 7514 |
| 7515 // see the comments on the above block |
| 7516 do { |
| 7517 tOrder = ucol_IGetNextCE(coll, tColl, status); |
| 7518 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7519 tOrder &= UCOL_PRIMARYMASK; |
| 7520 } while(tOrder == 0); |
| 7521 |
| 7522 // if both primaries are the same |
| 7523 if(sOrder == tOrder) { |
| 7524 // and there are no more CEs, we advance to the next level |
| 7525 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { |
| 7526 break; |
| 7527 } |
| 7528 if(doHiragana && hirResult == UCOL_EQUAL) { |
| 7529 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO
L_WAS_HIRAGANA)) { |
| 7530 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl
->flags & UCOL_WAS_HIRAGANA)) |
| 7531 ? UCOL_LESS:UCOL_GREATER; |
| 7532 } |
| 7533 } |
| 7534 } else { |
| 7535 // only need to check one for continuation |
| 7536 // if one is then the other must be or the preceding CE would be
a prefix of the other |
| 7537 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO
rder)) { |
| 7538 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24)
| (sOrder & 0x00FFFFFF); |
| 7539 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24)
| (tOrder & 0x00FFFFFF); |
| 7540 } |
| 7541 // if two primaries are different, we are done |
| 7542 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; |
| 7543 goto commonReturn; |
| 7544 } |
| 7545 } // no primary difference... do the rest from the buffers |
| 7546 } else { // shifted - do a slightly more complicated processing :) |
| 7547 for(;;) { |
| 7548 UBool sInShifted = FALSE; |
| 7549 UBool tInShifted = FALSE; |
| 7550 // This version of code can be refactored. However, it seems easier
to understand this way. |
| 7551 // Source loop. Sam as the target loop. |
| 7552 for(;;) { |
| 7553 sOrder = ucol_IGetNextCE(coll, sColl, status); |
| 7554 if(sOrder == UCOL_NO_MORE_CES) { |
| 7555 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7556 break; |
| 7557 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMA
SK) == 0)) { |
| 7558 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ |
| 7559 continue; |
| 7560 } else if(isContinuation(sOrder)) { |
| 7561 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ |
| 7562 if(sInShifted) { |
| 7563 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ |
| 7564 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7565 continue; |
| 7566 } else { |
| 7567 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7568 break; |
| 7569 } |
| 7570 } else { /* Just lower level values */ |
| 7571 if(sInShifted) { |
| 7572 continue; |
| 7573 } else { |
| 7574 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7575 continue; |
| 7576 } |
| 7577 } |
| 7578 } else { /* regular */ |
| 7579 if(coll->leadBytePermutationTable != NULL){ |
| 7580 sOrder = (coll->leadBytePermutationTable[sOrder>>24] <<
24) | (sOrder & 0x00FFFFFF); |
| 7581 } |
| 7582 if((sOrder & UCOL_PRIMARYMASK) > LVT) { |
| 7583 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7584 break; |
| 7585 } else { |
| 7586 if((sOrder & UCOL_PRIMARYMASK) > 0) { |
| 7587 sInShifted = TRUE; |
| 7588 sOrder &= UCOL_PRIMARYMASK; |
| 7589 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7590 continue; |
| 7591 } else { |
| 7592 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
| 7593 sInShifted = FALSE; |
| 7594 continue; |
| 7595 } |
| 7596 } |
| 7597 } |
| 7598 } |
| 7599 sOrder &= UCOL_PRIMARYMASK; |
| 7600 sInShifted = FALSE; |
| 7601 |
| 7602 for(;;) { |
| 7603 tOrder = ucol_IGetNextCE(coll, tColl, status); |
| 7604 if(tOrder == UCOL_NO_MORE_CES) { |
| 7605 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7606 break; |
| 7607 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMA
SK) == 0)) { |
| 7608 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ |
| 7609 continue; |
| 7610 } else if(isContinuation(tOrder)) { |
| 7611 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ |
| 7612 if(tInShifted) { |
| 7613 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ |
| 7614 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7615 continue; |
| 7616 } else { |
| 7617 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7618 break; |
| 7619 } |
| 7620 } else { /* Just lower level values */ |
| 7621 if(tInShifted) { |
| 7622 continue; |
| 7623 } else { |
| 7624 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7625 continue; |
| 7626 } |
| 7627 } |
| 7628 } else { /* regular */ |
| 7629 if(coll->leadBytePermutationTable != NULL){ |
| 7630 tOrder = (coll->leadBytePermutationTable[tOrder>>24] <<
24) | (tOrder & 0x00FFFFFF); |
| 7631 } |
| 7632 if((tOrder & UCOL_PRIMARYMASK) > LVT) { |
| 7633 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7634 break; |
| 7635 } else { |
| 7636 if((tOrder & UCOL_PRIMARYMASK) > 0) { |
| 7637 tInShifted = TRUE; |
| 7638 tOrder &= UCOL_PRIMARYMASK; |
| 7639 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7640 continue; |
| 7641 } else { |
| 7642 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
| 7643 tInShifted = FALSE; |
| 7644 continue; |
| 7645 } |
| 7646 } |
| 7647 } |
| 7648 } |
| 7649 tOrder &= UCOL_PRIMARYMASK; |
| 7650 tInShifted = FALSE; |
| 7651 |
| 7652 if(sOrder == tOrder) { |
| 7653 /* |
| 7654 if(doHiragana && hirResult == UCOL_EQUAL) { |
| 7655 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_
HIRAGANA)) { |
| 7656 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags &
UCOL_WAS_HIRAGANA)) |
| 7657 ? UCOL_LESS:UCOL_GREATER; |
| 7658 } |
| 7659 } |
| 7660 */ |
| 7661 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { |
| 7662 break; |
| 7663 } else { |
| 7664 sOrder = 0; |
| 7665 tOrder = 0; |
| 7666 continue; |
| 7667 } |
| 7668 } else { |
| 7669 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; |
| 7670 goto commonReturn; |
| 7671 } |
| 7672 } /* no primary difference... do the rest from the buffers */ |
| 7673 } |
| 7674 |
| 7675 /* now, we're gonna reexamine collected CEs */ |
| 7676 uint32_t *sCE; |
| 7677 uint32_t *tCE; |
| 7678 |
| 7679 /* This is the secondary level of comparison */ |
| 7680 if(checkSecTer) { |
| 7681 if(!isFrenchSec) { /* normal */ |
| 7682 sCE = sCEs.buf; |
| 7683 tCE = tCEs.buf; |
| 7684 for(;;) { |
| 7685 while (secS == 0) { |
| 7686 secS = *(sCE++) & UCOL_SECONDARYMASK; |
| 7687 } |
| 7688 |
| 7689 while(secT == 0) { |
| 7690 secT = *(tCE++) & UCOL_SECONDARYMASK; |
| 7691 } |
| 7692 |
| 7693 if(secS == secT) { |
| 7694 if(secS == UCOL_NO_MORE_CES_SECONDARY) { |
| 7695 break; |
| 7696 } else { |
| 7697 secS = 0; secT = 0; |
| 7698 continue; |
| 7699 } |
| 7700 } else { |
| 7701 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
| 7702 goto commonReturn; |
| 7703 } |
| 7704 } |
| 7705 } else { /* do the French */ |
| 7706 uint32_t *sCESave = NULL; |
| 7707 uint32_t *tCESave = NULL; |
| 7708 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi
zed */ |
| 7709 tCE = tCEs.pos-2; |
| 7710 for(;;) { |
| 7711 while (secS == 0 && sCE >= sCEs.buf) { |
| 7712 if(sCESave == NULL) { |
| 7713 secS = *(sCE--); |
| 7714 if(isContinuation(secS)) { |
| 7715 while(isContinuation(secS = *(sCE--))) |
| 7716 ; |
| 7717 /* after this, secS has the start of continuation, a
nd sCEs points before that */ |
| 7718 sCESave = sCE; /* we save it, so that we know where
to come back AND that we need to go forward */ |
| 7719 sCE+=2; /* need to point to the first continuation
CP */ |
| 7720 /* However, now you can just continue doing stuff */ |
| 7721 } |
| 7722 } else { |
| 7723 secS = *(sCE++); |
| 7724 if(!isContinuation(secS)) { /* This means we have finish
ed with this cont */ |
| 7725 sCE = sCESave; /* reset the pointer to be
fore continuation */ |
| 7726 sCESave = NULL; |
| 7727 secS = 0; /* Fetch a fresh CE before the continuati
on sequence. */ |
| 7728 continue; |
| 7729 } |
| 7730 } |
| 7731 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ |
| 7732 } |
| 7733 |
| 7734 while(secT == 0 && tCE >= tCEs.buf) { |
| 7735 if(tCESave == NULL) { |
| 7736 secT = *(tCE--); |
| 7737 if(isContinuation(secT)) { |
| 7738 while(isContinuation(secT = *(tCE--))) |
| 7739 ; |
| 7740 /* after this, secS has the start of continuation, a
nd sCEs points before that */ |
| 7741 tCESave = tCE; /* we save it, so that we know where
to come back AND that we need to go forward */ |
| 7742 tCE+=2; /* need to point to the first continuation
CP */ |
| 7743 /* However, now you can just continue doing stuff */ |
| 7744 } |
| 7745 } else { |
| 7746 secT = *(tCE++); |
| 7747 if(!isContinuation(secT)) { /* This means we have finish
ed with this cont */ |
| 7748 tCE = tCESave; /* reset the pointer to befo
re continuation */ |
| 7749 tCESave = NULL; |
| 7750 secT = 0; /* Fetch a fresh CE before the continuati
on sequence. */ |
| 7751 continue; |
| 7752 } |
| 7753 } |
| 7754 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ |
| 7755 } |
| 7756 |
| 7757 if(secS == secT) { |
| 7758 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf &&
tCE < tCEs.buf)) { |
| 7759 break; |
| 7760 } else { |
| 7761 secS = 0; secT = 0; |
| 7762 continue; |
| 7763 } |
| 7764 } else { |
| 7765 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
| 7766 goto commonReturn; |
| 7767 } |
| 7768 } |
| 7769 } |
| 7770 } |
| 7771 |
| 7772 /* doing the case bit */ |
| 7773 if(checkCase) { |
| 7774 sCE = sCEs.buf; |
| 7775 tCE = tCEs.buf; |
| 7776 for(;;) { |
| 7777 while((secS & UCOL_REMOVE_CASE) == 0) { |
| 7778 if(!isContinuation(*sCE++)) { |
| 7779 secS =*(sCE-1); |
| 7780 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { |
| 7781 // primary ignorables should not be considered on the ca
se level when the strength is primary |
| 7782 // otherwise, the CEs stop being well-formed |
| 7783 secS &= UCOL_TERT_CASE_MASK; |
| 7784 secS ^= caseSwitch; |
| 7785 } else { |
| 7786 secS = 0; |
| 7787 } |
| 7788 } else { |
| 7789 secS = 0; |
| 7790 } |
| 7791 } |
| 7792 |
| 7793 while((secT & UCOL_REMOVE_CASE) == 0) { |
| 7794 if(!isContinuation(*tCE++)) { |
| 7795 secT = *(tCE-1); |
| 7796 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { |
| 7797 // primary ignorables should not be considered on the ca
se level when the strength is primary |
| 7798 // otherwise, the CEs stop being well-formed |
| 7799 secT &= UCOL_TERT_CASE_MASK; |
| 7800 secT ^= caseSwitch; |
| 7801 } else { |
| 7802 secT = 0; |
| 7803 } |
| 7804 } else { |
| 7805 secT = 0; |
| 7806 } |
| 7807 } |
| 7808 |
| 7809 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { |
| 7810 result = UCOL_LESS; |
| 7811 goto commonReturn; |
| 7812 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK))
{ |
| 7813 result = UCOL_GREATER; |
| 7814 goto commonReturn; |
| 7815 } |
| 7816 |
| 7817 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT &
UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { |
| 7818 break; |
| 7819 } else { |
| 7820 secS = 0; |
| 7821 secT = 0; |
| 7822 } |
| 7823 } |
| 7824 } |
| 7825 |
| 7826 /* Tertiary level */ |
| 7827 if(checkTertiary) { |
| 7828 secS = 0; |
| 7829 secT = 0; |
| 7830 sCE = sCEs.buf; |
| 7831 tCE = tCEs.buf; |
| 7832 for(;;) { |
| 7833 while((secS & UCOL_REMOVE_CASE) == 0) { |
| 7834 secS = *(sCE++) & tertiaryMask; |
| 7835 if(!isContinuation(secS)) { |
| 7836 secS ^= caseSwitch; |
| 7837 } else { |
| 7838 secS &= UCOL_REMOVE_CASE; |
| 7839 } |
| 7840 } |
| 7841 |
| 7842 while((secT & UCOL_REMOVE_CASE) == 0) { |
| 7843 secT = *(tCE++) & tertiaryMask; |
| 7844 if(!isContinuation(secT)) { |
| 7845 secT ^= caseSwitch; |
| 7846 } else { |
| 7847 secT &= UCOL_REMOVE_CASE; |
| 7848 } |
| 7849 } |
| 7850 |
| 7851 if(secS == secT) { |
| 7852 if((secS & UCOL_REMOVE_CASE) == 1) { |
| 7853 break; |
| 7854 } else { |
| 7855 secS = 0; secT = 0; |
| 7856 continue; |
| 7857 } |
| 7858 } else { |
| 7859 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
| 7860 goto commonReturn; |
| 7861 } |
| 7862 } |
| 7863 } |
| 7864 |
| 7865 |
| 7866 if(qShifted /*checkQuad*/) { |
| 7867 UBool sInShifted = TRUE; |
| 7868 UBool tInShifted = TRUE; |
| 7869 secS = 0; |
| 7870 secT = 0; |
| 7871 sCE = sCEs.buf; |
| 7872 tCE = tCEs.buf; |
| 7873 for(;;) { |
| 7874 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(sec
S) && !sInShifted)) { |
| 7875 secS = *(sCE++); |
| 7876 if(isContinuation(secS)) { |
| 7877 if(!sInShifted) { |
| 7878 continue; |
| 7879 } |
| 7880 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non
continuation */ |
| 7881 secS = UCOL_PRIMARYMASK; |
| 7882 sInShifted = FALSE; |
| 7883 } else { |
| 7884 sInShifted = TRUE; |
| 7885 } |
| 7886 } |
| 7887 secS &= UCOL_PRIMARYMASK; |
| 7888 |
| 7889 |
| 7890 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(sec
T) && !tInShifted)) { |
| 7891 secT = *(tCE++); |
| 7892 if(isContinuation(secT)) { |
| 7893 if(!tInShifted) { |
| 7894 continue; |
| 7895 } |
| 7896 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { |
| 7897 secT = UCOL_PRIMARYMASK; |
| 7898 tInShifted = FALSE; |
| 7899 } else { |
| 7900 tInShifted = TRUE; |
| 7901 } |
| 7902 } |
| 7903 secT &= UCOL_PRIMARYMASK; |
| 7904 |
| 7905 if(secS == secT) { |
| 7906 if(secS == UCOL_NO_MORE_CES_PRIMARY) { |
| 7907 break; |
| 7908 } else { |
| 7909 secS = 0; secT = 0; |
| 7910 continue; |
| 7911 } |
| 7912 } else { |
| 7913 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
| 7914 goto commonReturn; |
| 7915 } |
| 7916 } |
| 7917 } else if(doHiragana && hirResult != UCOL_EQUAL) { |
| 7918 // If we're fine on quaternaries, we might be different |
| 7919 // on Hiragana. This, however, might fail us in shifted. |
| 7920 result = hirResult; |
| 7921 goto commonReturn; |
| 7922 } |
| 7923 |
| 7924 /* For IDENTICAL comparisons, we use a bitwise character comparison */ |
| 7925 /* as a tiebreaker if all else is equal. */ |
| 7926 /* Getting here should be quite rare - strings are not identical - */ |
| 7927 /* that is checked first, but compared == through all other checks. */ |
| 7928 if(checkIdent) |
| 7929 { |
| 7930 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC
OL_ON); |
| 7931 result = ucol_checkIdent(sColl, tColl, TRUE, status); |
| 7932 } |
| 7933 |
| 7934 commonReturn: |
| 7935 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { |
| 7936 if (sCEs.buf != sCEs.localArray ) { |
| 7937 uprv_free(sCEs.buf); |
| 7938 } |
| 7939 if (tCEs.buf != tCEs.localArray ) { |
| 7940 uprv_free(tCEs.buf); |
| 7941 } |
| 7942 } |
| 7943 |
| 7944 return result; |
| 7945 } |
| 7946 |
| 7947 static UCollationResult |
| 7948 ucol_strcollRegular(const UCollator *coll, |
| 7949 const UChar *source, int32_t sourceLength, |
| 7950 const UChar *target, int32_t targetLength, |
| 7951 UErrorCode *status) { |
| 7952 collIterate sColl, tColl; |
| 7953 // Preparing the context objects for iterating over strings |
| 7954 IInit_collIterate(coll, source, sourceLength, &sColl, status); |
| 7955 IInit_collIterate(coll, target, targetLength, &tColl, status); |
| 7956 if(U_FAILURE(*status)) { |
| 7957 return UCOL_LESS; |
| 7958 } |
| 7959 return ucol_strcollRegular(&sColl, &tColl, status); |
| 7960 } |
| 7961 |
| 7962 static inline uint32_t |
| 7963 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, |
| 7964 uint32_t CE, const UChar *s, int32_t *index, int32_t l
en) |
| 7965 { |
| 7966 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); |
| 7967 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; |
| 7968 int32_t offset = 1; |
| 7969 UChar schar = 0, tchar = 0; |
| 7970 |
| 7971 for(;;) { |
| 7972 if(len == -1) { |
| 7973 if(s[*index] == 0) { // end of string |
| 7974 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); |
| 7975 } else { |
| 7976 schar = s[*index]; |
| 7977 } |
| 7978 } else { |
| 7979 if(*index == len) { |
| 7980 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); |
| 7981 } else { |
| 7982 schar = s[*index]; |
| 7983 } |
| 7984 } |
| 7985 |
| 7986 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio
n codepoints should be ordered, we skip all that are smaller */ |
| 7987 offset++; |
| 7988 } |
| 7989 |
| 7990 if (schar == tchar) { |
| 7991 (*index)++; |
| 7992 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set+offset]); |
| 7993 } |
| 7994 else |
| 7995 { |
| 7996 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { |
| 7997 return UCOL_BAIL_OUT_CE; |
| 7998 } |
| 7999 // skip completely ignorables |
| 8000 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); |
| 8001 if(isZeroCE == 0) { // we have to ignore completely ignorables |
| 8002 (*index)++; |
| 8003 continue; |
| 8004 } |
| 8005 |
| 8006 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); |
| 8007 } |
| 8008 } |
| 8009 } |
| 8010 |
| 8011 |
| 8012 /** |
| 8013 * This is a fast strcoll, geared towards text in Latin-1. |
| 8014 * It supports contractions of size two, French secondaries |
| 8015 * and case switching. You can use it with strengths primary |
| 8016 * to tertiary. It does not support shifted and case level. |
| 8017 * It relies on the table built by setupLatin1Table. If it |
| 8018 * doesn't understand something, it will go to the regular |
| 8019 * strcoll. |
| 8020 */ |
| 8021 static UCollationResult |
| 8022 ucol_strcollUseLatin1( const UCollator *coll, |
| 8023 const UChar *source, |
| 8024 int32_t sLen, |
| 8025 const UChar *target, |
| 8026 int32_t tLen, |
| 8027 UErrorCode *status) |
| 8028 { |
| 8029 U_ALIGN_CODE(16); |
| 8030 int32_t strength = coll->strength; |
| 8031 |
| 8032 int32_t sIndex = 0, tIndex = 0; |
| 8033 UChar sChar = 0, tChar = 0; |
| 8034 uint32_t sOrder=0, tOrder=0; |
| 8035 |
| 8036 UBool endOfSource = FALSE; |
| 8037 |
| 8038 uint32_t *elements = coll->latinOneCEs; |
| 8039 |
| 8040 UBool haveContractions = FALSE; // if we have contractions in our string |
| 8041 // we cannot do French secondary |
| 8042 |
| 8043 // Do the primary level |
| 8044 for(;;) { |
| 8045 while(sOrder==0) { // this loop skips primary ignorables |
| 8046 // sOrder=getNextlatinOneCE(source); |
| 8047 if(sLen==-1) { // handling zero terminated strings |
| 8048 sChar=source[sIndex++]; |
| 8049 if(sChar==0) { |
| 8050 endOfSource = TRUE; |
| 8051 break; |
| 8052 } |
| 8053 } else { // handling strings with known length |
| 8054 if(sIndex==sLen) { |
| 8055 endOfSource = TRUE; |
| 8056 break; |
| 8057 } |
| 8058 sChar=source[sIndex++]; |
| 8059 } |
| 8060 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) |
| 8061 //fprintf(stderr, "R"); |
| 8062 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); |
| 8063 } |
| 8064 sOrder = elements[sChar]; |
| 8065 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special |
| 8066 // specials can basically be either contractions or bail-out sig
ns. If we get anything |
| 8067 // else, we'll bail out anywasy |
| 8068 if(getCETag(sOrder) == CONTRACTION_TAG) { |
| 8069 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr
der, source, &sIndex, sLen); |
| 8070 haveContractions = TRUE; // if there are contractions, we ca
nnot do French secondary |
| 8071 // However, if there are contractions in the table, but we a
lways use just one char, |
| 8072 // we might be able to do French. This should be checked out
. |
| 8073 } |
| 8074 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
| 8075 //fprintf(stderr, "S"); |
| 8076 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); |
| 8077 } |
| 8078 } |
| 8079 } |
| 8080 |
| 8081 while(tOrder==0) { // this loop skips primary ignorables |
| 8082 // tOrder=getNextlatinOneCE(target); |
| 8083 if(tLen==-1) { // handling zero terminated strings |
| 8084 tChar=target[tIndex++]; |
| 8085 if(tChar==0) { |
| 8086 if(endOfSource) { // this is different than source loop, |
| 8087 // as we already know that source loop is done here, |
| 8088 // so we can either finish the primary loop if both |
| 8089 // strings are done or anounce the result if only |
| 8090 // target is done. Same below. |
| 8091 goto endOfPrimLoop; |
| 8092 } else { |
| 8093 return UCOL_GREATER; |
| 8094 } |
| 8095 } |
| 8096 } else { // handling strings with known length |
| 8097 if(tIndex==tLen) { |
| 8098 if(endOfSource) { |
| 8099 goto endOfPrimLoop; |
| 8100 } else { |
| 8101 return UCOL_GREATER; |
| 8102 } |
| 8103 } |
| 8104 tChar=target[tIndex++]; |
| 8105 } |
| 8106 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) |
| 8107 //fprintf(stderr, "R"); |
| 8108 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); |
| 8109 } |
| 8110 tOrder = elements[tChar]; |
| 8111 if(tOrder >= UCOL_NOT_FOUND) { |
| 8112 // Handling specials, see the comments for source |
| 8113 if(getCETag(tOrder) == CONTRACTION_TAG) { |
| 8114 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr
der, target, &tIndex, tLen); |
| 8115 haveContractions = TRUE; |
| 8116 } |
| 8117 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
| 8118 //fprintf(stderr, "S"); |
| 8119 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); |
| 8120 } |
| 8121 } |
| 8122 } |
| 8123 if(endOfSource) { // source is finished, but target is not, say the resu
lt. |
| 8124 return UCOL_LESS; |
| 8125 } |
| 8126 |
| 8127 if(sOrder == tOrder) { // if we have same CEs, we continue the loop |
| 8128 sOrder = 0; tOrder = 0; |
| 8129 continue; |
| 8130 } else { |
| 8131 // compare current top bytes |
| 8132 if(((sOrder^tOrder)&0xFF000000)!=0) { |
| 8133 // top bytes differ, return difference |
| 8134 if(sOrder < tOrder) { |
| 8135 return UCOL_LESS; |
| 8136 } else if(sOrder > tOrder) { |
| 8137 return UCOL_GREATER; |
| 8138 } |
| 8139 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24)
; |
| 8140 // since we must return enum value |
| 8141 } |
| 8142 |
| 8143 // top bytes match, continue with following bytes |
| 8144 sOrder<<=8; |
| 8145 tOrder<<=8; |
| 8146 } |
| 8147 } |
| 8148 |
| 8149 endOfPrimLoop: |
| 8150 // after primary loop, we definitely know the sizes of strings, |
| 8151 // so we set it and use simpler loop for secondaries and tertiaries |
| 8152 sLen = sIndex; tLen = tIndex; |
| 8153 if(strength >= UCOL_SECONDARY) { |
| 8154 // adjust the table beggining |
| 8155 elements += coll->latinOneTableLen; |
| 8156 endOfSource = FALSE; |
| 8157 |
| 8158 if(coll->frenchCollation == UCOL_OFF) { // non French |
| 8159 // This loop is a simplified copy of primary loop |
| 8160 // at this point we know that whole strings are latin-1, so we don't |
| 8161 // check for that. We also know that we only have contractions as |
| 8162 // specials. |
| 8163 sIndex = 0; tIndex = 0; |
| 8164 for(;;) { |
| 8165 while(sOrder==0) { |
| 8166 if(sIndex==sLen) { |
| 8167 endOfSource = TRUE; |
| 8168 break; |
| 8169 } |
| 8170 sChar=source[sIndex++]; |
| 8171 sOrder = elements[sChar]; |
| 8172 if(sOrder > UCOL_NOT_FOUND) { |
| 8173 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, sOrder, source, &sIndex, sLen); |
| 8174 } |
| 8175 } |
| 8176 |
| 8177 while(tOrder==0) { |
| 8178 if(tIndex==tLen) { |
| 8179 if(endOfSource) { |
| 8180 goto endOfSecLoop; |
| 8181 } else { |
| 8182 return UCOL_GREATER; |
| 8183 } |
| 8184 } |
| 8185 tChar=target[tIndex++]; |
| 8186 tOrder = elements[tChar]; |
| 8187 if(tOrder > UCOL_NOT_FOUND) { |
| 8188 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, tOrder, target, &tIndex, tLen); |
| 8189 } |
| 8190 } |
| 8191 if(endOfSource) { |
| 8192 return UCOL_LESS; |
| 8193 } |
| 8194 |
| 8195 if(sOrder == tOrder) { |
| 8196 sOrder = 0; tOrder = 0; |
| 8197 continue; |
| 8198 } else { |
| 8199 // see primary loop for comments on this |
| 8200 if(((sOrder^tOrder)&0xFF000000)!=0) { |
| 8201 if(sOrder < tOrder) { |
| 8202 return UCOL_LESS; |
| 8203 } else if(sOrder > tOrder) { |
| 8204 return UCOL_GREATER; |
| 8205 } |
| 8206 } |
| 8207 sOrder<<=8; |
| 8208 tOrder<<=8; |
| 8209 } |
| 8210 } |
| 8211 } else { // French |
| 8212 if(haveContractions) { // if we have contractions, we have to bail o
ut |
| 8213 // since we don't really know how to handle them here |
| 8214 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); |
| 8215 } |
| 8216 // For French, we go backwards |
| 8217 sIndex = sLen; tIndex = tLen; |
| 8218 for(;;) { |
| 8219 while(sOrder==0) { |
| 8220 if(sIndex==0) { |
| 8221 endOfSource = TRUE; |
| 8222 break; |
| 8223 } |
| 8224 sChar=source[--sIndex]; |
| 8225 sOrder = elements[sChar]; |
| 8226 // don't even look for contractions |
| 8227 } |
| 8228 |
| 8229 while(tOrder==0) { |
| 8230 if(tIndex==0) { |
| 8231 if(endOfSource) { |
| 8232 goto endOfSecLoop; |
| 8233 } else { |
| 8234 return UCOL_GREATER; |
| 8235 } |
| 8236 } |
| 8237 tChar=target[--tIndex]; |
| 8238 tOrder = elements[tChar]; |
| 8239 // don't even look for contractions |
| 8240 } |
| 8241 if(endOfSource) { |
| 8242 return UCOL_LESS; |
| 8243 } |
| 8244 |
| 8245 if(sOrder == tOrder) { |
| 8246 sOrder = 0; tOrder = 0; |
| 8247 continue; |
| 8248 } else { |
| 8249 // see the primary loop for comments |
| 8250 if(((sOrder^tOrder)&0xFF000000)!=0) { |
| 8251 if(sOrder < tOrder) { |
| 8252 return UCOL_LESS; |
| 8253 } else if(sOrder > tOrder) { |
| 8254 return UCOL_GREATER; |
| 8255 } |
| 8256 } |
| 8257 sOrder<<=8; |
| 8258 tOrder<<=8; |
| 8259 } |
| 8260 } |
| 8261 } |
| 8262 } |
| 8263 |
| 8264 endOfSecLoop: |
| 8265 if(strength >= UCOL_TERTIARY) { |
| 8266 // tertiary loop is the same as secondary (except no French) |
| 8267 elements += coll->latinOneTableLen; |
| 8268 sIndex = 0; tIndex = 0; |
| 8269 endOfSource = FALSE; |
| 8270 for(;;) { |
| 8271 while(sOrder==0) { |
| 8272 if(sIndex==sLen) { |
| 8273 endOfSource = TRUE; |
| 8274 break; |
| 8275 } |
| 8276 sChar=source[sIndex++]; |
| 8277 sOrder = elements[sChar]; |
| 8278 if(sOrder > UCOL_NOT_FOUND) { |
| 8279 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO
rder, source, &sIndex, sLen); |
| 8280 } |
| 8281 } |
| 8282 while(tOrder==0) { |
| 8283 if(tIndex==tLen) { |
| 8284 if(endOfSource) { |
| 8285 return UCOL_EQUAL; // if both strings are at the end, th
ey are equal |
| 8286 } else { |
| 8287 return UCOL_GREATER; |
| 8288 } |
| 8289 } |
| 8290 tChar=target[tIndex++]; |
| 8291 tOrder = elements[tChar]; |
| 8292 if(tOrder > UCOL_NOT_FOUND) { |
| 8293 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO
rder, target, &tIndex, tLen); |
| 8294 } |
| 8295 } |
| 8296 if(endOfSource) { |
| 8297 return UCOL_LESS; |
| 8298 } |
| 8299 if(sOrder == tOrder) { |
| 8300 sOrder = 0; tOrder = 0; |
| 8301 continue; |
| 8302 } else { |
| 8303 if(((sOrder^tOrder)&0xff000000)!=0) { |
| 8304 if(sOrder < tOrder) { |
| 8305 return UCOL_LESS; |
| 8306 } else if(sOrder > tOrder) { |
| 8307 return UCOL_GREATER; |
| 8308 } |
| 8309 } |
| 8310 sOrder<<=8; |
| 8311 tOrder<<=8; |
| 8312 } |
| 8313 } |
| 8314 } |
| 8315 return UCOL_EQUAL; |
| 8316 } |
| 8317 |
| 8318 |
| 8319 U_CAPI UCollationResult U_EXPORT2 |
| 8320 ucol_strcollIter( const UCollator *coll, |
| 8321 UCharIterator *sIter, |
| 8322 UCharIterator *tIter, |
| 8323 UErrorCode *status) |
| 8324 { |
| 8325 if(!status || U_FAILURE(*status)) { |
| 8326 return UCOL_EQUAL; |
| 8327 } |
| 8328 |
| 8329 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); |
| 8330 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt
er); |
| 8331 |
| 8332 if (sIter == tIter) { |
| 8333 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
| 8334 return UCOL_EQUAL; |
| 8335 } |
| 8336 if(sIter == NULL || tIter == NULL || coll == NULL) { |
| 8337 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 8338 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
| 8339 return UCOL_EQUAL; |
| 8340 } |
| 8341 |
| 8342 UCollationResult result = UCOL_EQUAL; |
| 8343 |
| 8344 // Preparing the context objects for iterating over strings |
| 8345 collIterate sColl, tColl; |
| 8346 IInit_collIterate(coll, NULL, -1, &sColl, status); |
| 8347 IInit_collIterate(coll, NULL, -1, &tColl, status); |
| 8348 if(U_FAILURE(*status)) { |
| 8349 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
| 8350 return UCOL_EQUAL; |
| 8351 } |
| 8352 // The division for the array length may truncate the array size to |
| 8353 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
| 8354 // for all platforms anyway. |
| 8355 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
| 8356 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
| 8357 UNormIterator *sNormIter = NULL, *tNormIter = NULL; |
| 8358 |
| 8359 sColl.iterator = sIter; |
| 8360 sColl.flags |= UCOL_USE_ITERATOR; |
| 8361 tColl.flags |= UCOL_USE_ITERATOR; |
| 8362 tColl.iterator = tIter; |
| 8363 |
| 8364 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { |
| 8365 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu
s); |
| 8366 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); |
| 8367 sColl.flags &= ~UCOL_ITER_NORM; |
| 8368 |
| 8369 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu
s); |
| 8370 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); |
| 8371 tColl.flags &= ~UCOL_ITER_NORM; |
| 8372 } |
| 8373 |
| 8374 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; |
| 8375 |
| 8376 while((sChar = sColl.iterator->next(sColl.iterator)) == |
| 8377 (tChar = tColl.iterator->next(tColl.iterator))) { |
| 8378 if(sChar == U_SENTINEL) { |
| 8379 result = UCOL_EQUAL; |
| 8380 goto end_compare; |
| 8381 } |
| 8382 } |
| 8383 |
| 8384 if(sChar == U_SENTINEL) { |
| 8385 tChar = tColl.iterator->previous(tColl.iterator); |
| 8386 } |
| 8387 |
| 8388 if(tChar == U_SENTINEL) { |
| 8389 sChar = sColl.iterator->previous(sColl.iterator); |
| 8390 } |
| 8391 |
| 8392 sChar = sColl.iterator->previous(sColl.iterator); |
| 8393 tChar = tColl.iterator->previous(tColl.iterator); |
| 8394 |
| 8395 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) |
| 8396 { |
| 8397 // We are stopped in the middle of a contraction. |
| 8398 // Scan backwards through the == part of the string looking for the star
t of the contraction. |
| 8399 // It doesn't matter which string we scan, since they are the same in
this region. |
| 8400 do |
| 8401 { |
| 8402 sChar = sColl.iterator->previous(sColl.iterator); |
| 8403 tChar = tColl.iterator->previous(tColl.iterator); |
| 8404 } |
| 8405 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); |
| 8406 } |
| 8407 |
| 8408 |
| 8409 if(U_SUCCESS(*status)) { |
| 8410 result = ucol_strcollRegular(&sColl, &tColl, status); |
| 8411 } |
| 8412 |
| 8413 end_compare: |
| 8414 if(sNormIter || tNormIter) { |
| 8415 unorm_closeIter(sNormIter); |
| 8416 unorm_closeIter(tNormIter); |
| 8417 } |
| 8418 |
| 8419 UTRACE_EXIT_VALUE_STATUS(result, *status) |
| 8420 return result; |
| 8421 } |
| 8422 |
| 8423 |
| 8424 /* */ |
| 8425 /* ucol_strcoll Main public API string comparison function */ |
| 8426 /* */ |
| 8427 U_CAPI UCollationResult U_EXPORT2 |
| 8428 ucol_strcoll( const UCollator *coll, |
| 8429 const UChar *source, |
| 8430 int32_t sourceLength, |
| 8431 const UChar *target, |
| 8432 int32_t targetLength) |
| 8433 { |
| 8434 U_ALIGN_CODE(16); |
| 8435 |
| 8436 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); |
| 8437 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
| 8438 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); |
| 8439 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt
h); |
| 8440 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt
h); |
| 8441 } |
| 8442 |
| 8443 if(source == NULL || target == NULL) { |
| 8444 // do not crash, but return. Should have |
| 8445 // status argument to return error. |
| 8446 UTRACE_EXIT_VALUE(UCOL_EQUAL); |
| 8447 return UCOL_EQUAL; |
| 8448 } |
| 8449 |
| 8450 /* Quick check if source and target are same strings. */ |
| 8451 /* They should either both be NULL terminated or the explicit length should
be set on both. */ |
| 8452 if (source==target && sourceLength==targetLength) { |
| 8453 UTRACE_EXIT_VALUE(UCOL_EQUAL); |
| 8454 return UCOL_EQUAL; |
| 8455 } |
| 8456 |
| 8457 /* Scan the strings. Find:
*/ |
| 8458 /* The length of any leading portion that is equal
*/ |
| 8459 /* Whether they are exactly equal. (in which case we just return)
*/ |
| 8460 const UChar *pSrc = source; |
| 8461 const UChar *pTarg = target; |
| 8462 int32_t equalLength; |
| 8463 |
| 8464 if (sourceLength == -1 && targetLength == -1) { |
| 8465 // Both strings are null terminated. |
| 8466 // Scan through any leading equal portion. |
| 8467 while (*pSrc == *pTarg && *pSrc != 0) { |
| 8468 pSrc++; |
| 8469 pTarg++; |
| 8470 } |
| 8471 if (*pSrc == 0 && *pTarg == 0) { |
| 8472 UTRACE_EXIT_VALUE(UCOL_EQUAL); |
| 8473 return UCOL_EQUAL; |
| 8474 } |
| 8475 equalLength = (int32_t)(pSrc - source); |
| 8476 } |
| 8477 else |
| 8478 { |
| 8479 // One or both strings has an explicit length. |
| 8480 const UChar *pSrcEnd = source + sourceLength; |
| 8481 const UChar *pTargEnd = target + targetLength; |
| 8482 |
| 8483 // Scan while the strings are bitwise ==, or until one is exhausted. |
| 8484 for (;;) { |
| 8485 if (pSrc == pSrcEnd || pTarg == pTargEnd) { |
| 8486 break; |
| 8487 } |
| 8488 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng
th == -1)) { |
| 8489 break; |
| 8490 } |
| 8491 if (*pSrc != *pTarg) { |
| 8492 break; |
| 8493 } |
| 8494 pSrc++; |
| 8495 pTarg++; |
| 8496 } |
| 8497 equalLength = (int32_t)(pSrc - source); |
| 8498 |
| 8499 // If we made it all the way through both strings, we are done. They ar
e == |
| 8500 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of
src string, however it was specified. */ |
| 8501 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also
at end of dest string */ |
| 8502 { |
| 8503 UTRACE_EXIT_VALUE(UCOL_EQUAL); |
| 8504 return UCOL_EQUAL; |
| 8505 } |
| 8506 } |
| 8507 if (equalLength > 0) { |
| 8508 /* There is an identical portion at the beginning of the two strings.
*/ |
| 8509 /* If the identical portion ends within a contraction or a comibining
*/ |
| 8510 /* character sequence, back up to the start of that sequence.
*/ |
| 8511 |
| 8512 // These values should already be set by the code above. |
| 8513 //pSrc = source + equalLength; /* point to the first differing c
hars */ |
| 8514 //pTarg = target + equalLength; |
| 8515 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || |
| 8516 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) |
| 8517 { |
| 8518 // We are stopped in the middle of a contraction. |
| 8519 // Scan backwards through the == part of the string looking for the
start of the contraction. |
| 8520 // It doesn't matter which string we scan, since they are the same
in this region. |
| 8521 do |
| 8522 { |
| 8523 equalLength--; |
| 8524 pSrc--; |
| 8525 } |
| 8526 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); |
| 8527 } |
| 8528 |
| 8529 source += equalLength; |
| 8530 target += equalLength; |
| 8531 if (sourceLength > 0) { |
| 8532 sourceLength -= equalLength; |
| 8533 } |
| 8534 if (targetLength > 0) { |
| 8535 targetLength -= equalLength; |
| 8536 } |
| 8537 } |
| 8538 |
| 8539 UErrorCode status = U_ZERO_ERROR; |
| 8540 UCollationResult returnVal; |
| 8541 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLeng
th > 0 && *target&0xff00)) { |
| 8542 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ
etLength, &status); |
| 8543 } else { |
| 8544 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta
rgetLength, &status); |
| 8545 } |
| 8546 UTRACE_EXIT_VALUE(returnVal); |
| 8547 return returnVal; |
| 8548 } |
| 8549 |
| 8550 /* convenience function for comparing strings */ |
| 8551 U_CAPI UBool U_EXPORT2 |
| 8552 ucol_greater( const UCollator *coll, |
| 8553 const UChar *source, |
| 8554 int32_t sourceLength, |
| 8555 const UChar *target, |
| 8556 int32_t targetLength) |
| 8557 { |
| 8558 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| 8559 == UCOL_GREATER); |
| 8560 } |
| 8561 |
| 8562 /* convenience function for comparing strings */ |
| 8563 U_CAPI UBool U_EXPORT2 |
| 8564 ucol_greaterOrEqual( const UCollator *coll, |
| 8565 const UChar *source, |
| 8566 int32_t sourceLength, |
| 8567 const UChar *target, |
| 8568 int32_t targetLength) |
| 8569 { |
| 8570 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| 8571 != UCOL_LESS); |
| 8572 } |
| 8573 |
| 8574 /* convenience function for comparing strings */ |
| 8575 U_CAPI UBool U_EXPORT2 |
| 8576 ucol_equal( const UCollator *coll, |
| 8577 const UChar *source, |
| 8578 int32_t sourceLength, |
| 8579 const UChar *target, |
| 8580 int32_t targetLength) |
| 8581 { |
| 8582 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| 8583 == UCOL_EQUAL); |
| 8584 } |
| 8585 |
| 8586 U_CAPI void U_EXPORT2 |
| 8587 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { |
| 8588 if(coll && coll->UCA) { |
| 8589 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); |
| 8590 } |
| 8591 } |
| 8592 |
| 8593 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |