| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 1996-2013, International Business Machines | 3 * Copyright (C) 1996-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 * file name: ucol.cpp | 6 * file name: ucol.cpp |
| 7 * encoding: US-ASCII | 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) | 8 * tab size: 8 (not used) |
| 9 * indentation:4 | 9 * indentation:4 |
| 10 * | 10 * |
| 11 * Modification history | 11 * Modification history |
| 12 * Date Name Comments | 12 * Date Name Comments |
| 13 * 1996-1999 various members of ICU team maintained C API for collation framewo
rk | 13 * 1996-1999 various members of ICU team maintained C API for collation framewo
rk |
| 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE | 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE |
| 15 * 03/01/2001 synwee Added maxexpansion functionality. | 15 * 03/01/2001 synwee Added maxexpansion functionality. |
| 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl
iant | 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl
iant |
| 17 * 2012-2014 markus Rewritten in C++ again. |
| 17 */ | 18 */ |
| 18 | 19 |
| 19 #include "unicode/utypes.h" | 20 #include "unicode/utypes.h" |
| 20 | 21 |
| 21 #if !UCONFIG_NO_COLLATION | 22 #if !UCONFIG_NO_COLLATION |
| 22 | 23 |
| 24 #include "unicode/coll.h" |
| 25 #include "unicode/tblcoll.h" |
| 23 #include "unicode/bytestream.h" | 26 #include "unicode/bytestream.h" |
| 24 #include "unicode/coleitr.h" | 27 #include "unicode/coleitr.h" |
| 25 #include "unicode/unorm.h" | 28 #include "unicode/ucoleitr.h" |
| 26 #include "unicode/udata.h" | |
| 27 #include "unicode/ustring.h" | 29 #include "unicode/ustring.h" |
| 28 #include "unicode/utf8.h" | |
| 29 | |
| 30 #include "ucol_imp.h" | |
| 31 #include "bocsu.h" | |
| 32 | |
| 33 #include "normalizer2impl.h" | |
| 34 #include "unorm_it.h" | |
| 35 #include "umutex.h" | |
| 36 #include "cmemory.h" | 30 #include "cmemory.h" |
| 37 #include "ucln_in.h" | 31 #include "collation.h" |
| 38 #include "cstring.h" | 32 #include "cstring.h" |
| 39 #include "utracimp.h" | |
| 40 #include "putilimp.h" | 33 #include "putilimp.h" |
| 41 #include "uassert.h" | 34 #include "uassert.h" |
| 42 #include "unicode/coll.h" | 35 #include "utracimp.h" |
| 43 | |
| 44 #ifdef UCOL_DEBUG | |
| 45 #include <stdio.h> | |
| 46 #endif | |
| 47 | 36 |
| 48 U_NAMESPACE_USE | 37 U_NAMESPACE_USE |
| 49 | 38 |
| 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
| 51 | |
| 52 #define LAST_BYTE_MASK_ 0xFF | |
| 53 #define SECOND_LAST_BYTE_SHIFT_ 8 | |
| 54 | |
| 55 #define ZERO_CC_LIMIT_ 0xC0 | |
| 56 | |
| 57 // These are static pointers to the NFC/NFD implementation instance. | |
| 58 // Each of them is always the same between calls to u_cleanup | |
| 59 // and therefore writing to it is not synchronized. | |
| 60 // They are cleaned in ucol_cleanup | |
| 61 static const Normalizer2 *g_nfd = NULL; | |
| 62 static const Normalizer2Impl *g_nfcImpl = NULL; | |
| 63 | |
| 64 // These are values from UCA required for | |
| 65 // implicit generation and supressing sort key compression | |
| 66 // they should regularly be in the UCA, but if one | |
| 67 // is running without UCA, it could be a problem | |
| 68 static const int32_t maxRegularPrimary = 0x7A; | |
| 69 static const int32_t minImplicitPrimary = 0xE0; | |
| 70 static const int32_t maxImplicitPrimary = 0xE4; | |
| 71 | |
| 72 U_CDECL_BEGIN | |
| 73 static UBool U_CALLCONV | |
| 74 ucol_cleanup(void) | |
| 75 { | |
| 76 g_nfd = NULL; | |
| 77 g_nfcImpl = NULL; | |
| 78 return TRUE; | |
| 79 } | |
| 80 | |
| 81 static int32_t U_CALLCONV | |
| 82 _getFoldingOffset(uint32_t data) { | |
| 83 return (int32_t)(data&0xFFFFFF); | |
| 84 } | |
| 85 | |
| 86 U_CDECL_END | |
| 87 | |
| 88 static inline | |
| 89 UBool initializeNFD(UErrorCode *status) { | |
| 90 if (g_nfd != NULL) { | |
| 91 return TRUE; | |
| 92 } else { | |
| 93 // The result is constant, until the library is reloaded. | |
| 94 g_nfd = Normalizer2Factory::getNFDInstance(*status); | |
| 95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); | |
| 96 return U_SUCCESS(*status); | |
| 97 } | |
| 98 } | |
| 99 | |
| 100 // init FCD data | |
| 101 static inline | |
| 102 UBool initializeFCD(UErrorCode *status) { | |
| 103 if (g_nfcImpl != NULL) { | |
| 104 return TRUE; | |
| 105 } else { | |
| 106 // The result is constant, until the library is reloaded. | |
| 107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); | |
| 108 // Note: Alternatively, we could also store this pointer in each collIte
rate struct, | |
| 109 // same as Normalizer2Factory::getImpl(collIterate->nfd). | |
| 110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); | |
| 111 return U_SUCCESS(*status); | |
| 112 } | |
| 113 } | |
| 114 | |
| 115 static | |
| 116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
ng, | |
| 117 int32_t sourceLen, collIterate *s, | |
| 118 UErrorCode *status) | |
| 119 { | |
| 120 (s)->string = (s)->pos = sourceString; | |
| 121 (s)->origFlags = 0; | |
| 122 (s)->flags = 0; | |
| 123 if (sourceLen >= 0) { | |
| 124 s->flags |= UCOL_ITER_HASLEN; | |
| 125 (s)->endp = (UChar *)sourceString+sourceLen; | |
| 126 } | |
| 127 else { | |
| 128 /* change to enable easier checking for end of string for fcdpositon */ | |
| 129 (s)->endp = NULL; | |
| 130 } | |
| 131 (s)->extendCEs = NULL; | |
| 132 (s)->extendCEsSize = 0; | |
| 133 (s)->CEpos = (s)->toReturn = (s)->CEs; | |
| 134 (s)->offsetBuffer = NULL; | |
| 135 (s)->offsetBufferSize = 0; | |
| 136 (s)->offsetReturn = (s)->offsetStore = NULL; | |
| 137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; | |
| 138 (s)->coll = (collator); | |
| 139 if (initializeNFD(status)) { | |
| 140 (s)->nfd = g_nfd; | |
| 141 } else { | |
| 142 return; | |
| 143 } | |
| 144 (s)->fcdPosition = 0; | |
| 145 if(collator->normalizationMode == UCOL_ON) { | |
| 146 (s)->flags |= UCOL_ITER_NORM; | |
| 147 } | |
| 148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY)
{ | |
| 149 (s)->flags |= UCOL_HIRAGANA_Q; | |
| 150 } | |
| 151 (s)->iterator = NULL; | |
| 152 //(s)->iteratorIndex = 0; | |
| 153 } | |
| 154 | |
| 155 U_CAPI void U_EXPORT2 | |
| 156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, | |
| 157 int32_t sourceLen, collIterate *s, | |
| 158 UErrorCode *status) { | |
| 159 /* Out-of-line version for use from other files. */ | |
| 160 IInit_collIterate(collator, sourceString, sourceLen, s, status); | |
| 161 } | |
| 162 | |
| 163 U_CAPI collIterate * U_EXPORT2 | |
| 164 uprv_new_collIterate(UErrorCode *status) { | |
| 165 if(U_FAILURE(*status)) { | |
| 166 return NULL; | |
| 167 } | |
| 168 collIterate *s = new collIterate; | |
| 169 if(s == NULL) { | |
| 170 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 171 return NULL; | |
| 172 } | |
| 173 return s; | |
| 174 } | |
| 175 | |
| 176 U_CAPI void U_EXPORT2 | |
| 177 uprv_delete_collIterate(collIterate *s) { | |
| 178 delete s; | |
| 179 } | |
| 180 | |
| 181 U_CAPI UBool U_EXPORT2 | |
| 182 uprv_collIterateAtEnd(collIterate *s) { | |
| 183 return s == NULL || s->pos == s->endp; | |
| 184 } | |
| 185 | |
| 186 /** | |
| 187 * Backup the state of the collIterate struct data | |
| 188 * @param data collIterate to backup | |
| 189 * @param backup storage | |
| 190 */ | |
| 191 static | |
| 192 inline void backupState(const collIterate *data, collIterateState *backup) | |
| 193 { | |
| 194 backup->fcdPosition = data->fcdPosition; | |
| 195 backup->flags = data->flags; | |
| 196 backup->origFlags = data->origFlags; | |
| 197 backup->pos = data->pos; | |
| 198 backup->bufferaddress = data->writableBuffer.getBuffer(); | |
| 199 backup->buffersize = data->writableBuffer.length(); | |
| 200 backup->iteratorMove = 0; | |
| 201 backup->iteratorIndex = 0; | |
| 202 if(data->iterator != NULL) { | |
| 203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER
_CURRENT); | |
| 204 backup->iteratorIndex = data->iterator->getState(data->iterator); | |
| 205 // no we try to fixup if we're using a normalizing iterator and we get U
ITER_NO_STATE | |
| 206 if(backup->iteratorIndex == UITER_NO_STATE) { | |
| 207 while((backup->iteratorIndex = data->iterator->getState(data->iterat
or)) == UITER_NO_STATE) { | |
| 208 backup->iteratorMove++; | |
| 209 data->iterator->move(data->iterator, -1, UITER_CURRENT); | |
| 210 } | |
| 211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 /** | |
| 217 * Loads the state into the collIterate struct data | |
| 218 * @param data collIterate to backup | |
| 219 * @param backup storage | |
| 220 * @param forwards boolean to indicate if forwards iteration is used, | |
| 221 * false indicates backwards iteration | |
| 222 */ | |
| 223 static | |
| 224 inline void loadState(collIterate *data, const collIterateState *backup, | |
| 225 UBool forwards) | |
| 226 { | |
| 227 UErrorCode status = U_ZERO_ERROR; | |
| 228 data->flags = backup->flags; | |
| 229 data->origFlags = backup->origFlags; | |
| 230 if(data->iterator != NULL) { | |
| 231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO
); | |
| 232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status)
; | |
| 233 if(backup->iteratorMove != 0) { | |
| 234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); | |
| 235 } | |
| 236 } | |
| 237 data->pos = backup->pos; | |
| 238 | |
| 239 if ((data->flags & UCOL_ITER_INNORMBUF) && | |
| 240 data->writableBuffer.getBuffer() != backup->bufferaddress) { | |
| 241 /* | |
| 242 this is when a new buffer has been reallocated and we'll have to | |
| 243 calculate the new position. | |
| 244 note the new buffer has to contain the contents of the old buffer. | |
| 245 */ | |
| 246 if (forwards) { | |
| 247 data->pos = data->writableBuffer.getTerminatedBuffer() + | |
| 248 (data->pos - backup->bufferaddress); | |
| 249 } | |
| 250 else { | |
| 251 /* backwards direction */ | |
| 252 int32_t temp = backup->buffersize - | |
| 253 (int32_t)(data->pos - backup->bufferaddress); | |
| 254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writ
ableBuffer.length() - temp); | |
| 255 } | |
| 256 } | |
| 257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 258 /* | |
| 259 this is alittle tricky. | |
| 260 if we are initially not in the normalization buffer, even if we | |
| 261 normalize in the later stage, the data in the buffer will be | |
| 262 ignored, since we skip back up to the data string. | |
| 263 however if we are already in the normalization buffer, any | |
| 264 further normalization will pull data into the normalization | |
| 265 buffer and modify the fcdPosition. | |
| 266 since we are keeping the data in the buffer for use, the | |
| 267 fcdPosition can not be reverted back. | |
| 268 arrgghh.... | |
| 269 */ | |
| 270 data->fcdPosition = backup->fcdPosition; | |
| 271 } | |
| 272 } | |
| 273 | |
| 274 static UBool | |
| 275 reallocCEs(collIterate *data, int32_t newCapacity) { | |
| 276 uint32_t *oldCEs = data->extendCEs; | |
| 277 if(oldCEs == NULL) { | |
| 278 oldCEs = data->CEs; | |
| 279 } | |
| 280 int32_t length = data->CEpos - oldCEs; | |
| 281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); | |
| 282 if(newCEs == NULL) { | |
| 283 return FALSE; | |
| 284 } | |
| 285 uprv_memcpy(newCEs, oldCEs, length * 4); | |
| 286 uprv_free(data->extendCEs); | |
| 287 data->extendCEs = newCEs; | |
| 288 data->extendCEsSize = newCapacity; | |
| 289 data->CEpos = newCEs + length; | |
| 290 return TRUE; | |
| 291 } | |
| 292 | |
| 293 static UBool | |
| 294 increaseCEsCapacity(collIterate *data) { | |
| 295 int32_t oldCapacity; | |
| 296 if(data->extendCEs != NULL) { | |
| 297 oldCapacity = data->extendCEsSize; | |
| 298 } else { | |
| 299 oldCapacity = LENGTHOF(data->CEs); | |
| 300 } | |
| 301 return reallocCEs(data, 2 * oldCapacity); | |
| 302 } | |
| 303 | |
| 304 static UBool | |
| 305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { | |
| 306 int32_t oldCapacity; | |
| 307 if(data->extendCEs != NULL) { | |
| 308 oldCapacity = data->extendCEsSize; | |
| 309 } else { | |
| 310 oldCapacity = LENGTHOF(data->CEs); | |
| 311 } | |
| 312 if(minCapacity <= oldCapacity) { | |
| 313 return TRUE; | |
| 314 } | |
| 315 oldCapacity *= 2; | |
| 316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacit
y); | |
| 317 } | |
| 318 | |
| 319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { | |
| 320 if(U_FAILURE(errorCode)) { | |
| 321 return; | |
| 322 } | |
| 323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuf
fer); | |
| 324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); | |
| 325 if(length >= offsetBufferSize) { | |
| 326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; | |
| 327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)
); | |
| 328 if(newBuffer == NULL) { | |
| 329 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
| 330 return; | |
| 331 } | |
| 332 if(length > 0) { | |
| 333 uprv_memcpy(newBuffer, offsetBuffer, length * 4); | |
| 334 } | |
| 335 uprv_free(offsetBuffer); | |
| 336 offsetBuffer = newBuffer; | |
| 337 offsetStore = offsetBuffer + length; | |
| 338 offsetBufferSize = newCapacity; | |
| 339 } | |
| 340 *offsetStore++ = offset; | |
| 341 } | |
| 342 | |
| 343 /* | |
| 344 * collIter_eos() | |
| 345 * Checks for a collIterate being positioned at the end of | |
| 346 * its source string. | |
| 347 * | |
| 348 */ | |
| 349 static | |
| 350 inline UBool collIter_eos(collIterate *s) { | |
| 351 if(s->flags & UCOL_USE_ITERATOR) { | |
| 352 return !(s->iterator->hasNext(s->iterator)); | |
| 353 } | |
| 354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { | |
| 355 // Null terminated string, but not at null, so not at end. | |
| 356 // Whether in main or normalization buffer doesn't matter. | |
| 357 return FALSE; | |
| 358 } | |
| 359 | |
| 360 // String with length. Can't be in normalization buffer, which is always | |
| 361 // null termintated. | |
| 362 if (s->flags & UCOL_ITER_HASLEN) { | |
| 363 return (s->pos == s->endp); | |
| 364 } | |
| 365 | |
| 366 // We are at a null termination, could be either normalization buffer or mai
n string. | |
| 367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 368 // At null at end of main string. | |
| 369 return TRUE; | |
| 370 } | |
| 371 | |
| 372 // At null at end of normalization buffer. Need to check whether there ther
e are | |
| 373 // any characters left in the main buffer. | |
| 374 if(s->origFlags & UCOL_USE_ITERATOR) { | |
| 375 return !(s->iterator->hasNext(s->iterator)); | |
| 376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { | |
| 377 // Null terminated main string. fcdPosition is the 'return' position in
to main buf. | |
| 378 return (*s->fcdPosition == 0); | |
| 379 } | |
| 380 else { | |
| 381 // Main string with an end pointer. | |
| 382 return s->fcdPosition == s->endp; | |
| 383 } | |
| 384 } | |
| 385 | |
| 386 /* | |
| 387 * collIter_bos() | |
| 388 * Checks for a collIterate being positioned at the start of | |
| 389 * its source string. | |
| 390 * | |
| 391 */ | |
| 392 static | |
| 393 inline UBool collIter_bos(collIterate *source) { | |
| 394 // if we're going backwards, we need to know whether there is more in the | |
| 395 // iterator, even if we are in the side buffer | |
| 396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ | |
| 397 return !source->iterator->hasPrevious(source->iterator); | |
| 398 } | |
| 399 if (source->pos <= source->string || | |
| 400 ((source->flags & UCOL_ITER_INNORMBUF) && | |
| 401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { | |
| 402 return TRUE; | |
| 403 } | |
| 404 return FALSE; | |
| 405 } | |
| 406 | |
| 407 /*static | |
| 408 inline UBool collIter_SimpleBos(collIterate *source) { | |
| 409 // if we're going backwards, we need to know whether there is more in the | |
| 410 // iterator, even if we are in the side buffer | |
| 411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ | |
| 412 return !source->iterator->hasPrevious(source->iterator); | |
| 413 } | |
| 414 if (source->pos == source->string) { | |
| 415 return TRUE; | |
| 416 } | |
| 417 return FALSE; | |
| 418 }*/ | |
| 419 //return (data->pos == data->string) || | |
| 420 | |
| 421 | |
| 422 /****************************************************************************/ | |
| 423 /* Following are the open/close functions */ | |
| 424 /* */ | |
| 425 /****************************************************************************/ | |
| 426 | |
| 427 static UCollator* | |
| 428 ucol_initFromBinary(const uint8_t *bin, int32_t length, | |
| 429 const UCollator *base, | |
| 430 UCollator *fillIn, | |
| 431 UErrorCode *status) | |
| 432 { | |
| 433 UCollator *result = fillIn; | |
| 434 if(U_FAILURE(*status)) { | |
| 435 return NULL; | |
| 436 } | |
| 437 /* | |
| 438 if(base == NULL) { | |
| 439 // we don't support null base yet | |
| 440 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 441 return NULL; | |
| 442 } | |
| 443 */ | |
| 444 // We need these and we could be running without UCA | |
| 445 uprv_uca_initImplicitConstants(status); | |
| 446 UCATableHeader *colData = (UCATableHeader *)bin; | |
| 447 // do we want version check here? We're trying to figure out whether collato
rs are compatible | |
| 448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeo
f(UVersionInfo)) != 0 || | |
| 449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersio
nInfo)) != 0)) || | |
| 450 colData->version[0] != UCOL_BUILDER_VERSION) | |
| 451 { | |
| 452 *status = U_COLLATOR_VERSION_MISMATCH; | |
| 453 return NULL; | |
| 454 } | |
| 455 else { | |
| 456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(s
izeof(UColOptionSet)))) { | |
| 457 result = ucol_initCollator((const UCATableHeader *)bin, result, base
, status); | |
| 458 if(U_FAILURE(*status)){ | |
| 459 return NULL; | |
| 460 } | |
| 461 result->hasRealData = TRUE; | |
| 462 } | |
| 463 else { | |
| 464 if(base) { | |
| 465 result = ucol_initCollator(base->image, result, base, status); | |
| 466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const
UCATableHeader *)bin)->options), status); | |
| 467 if(U_FAILURE(*status)){ | |
| 468 return NULL; | |
| 469 } | |
| 470 result->hasRealData = FALSE; | |
| 471 } | |
| 472 else { | |
| 473 *status = U_USELESS_COLLATOR_ERROR; | |
| 474 return NULL; | |
| 475 } | |
| 476 } | |
| 477 result->freeImageOnClose = FALSE; | |
| 478 } | |
| 479 result->actualLocale = NULL; | |
| 480 result->validLocale = NULL; | |
| 481 result->requestedLocale = NULL; | |
| 482 result->rules = NULL; | |
| 483 result->rulesLength = 0; | |
| 484 result->freeRulesOnClose = FALSE; | |
| 485 result->ucaRules = NULL; | |
| 486 return result; | |
| 487 } | |
| 488 | |
| 489 U_CAPI UCollator* U_EXPORT2 | 39 U_CAPI UCollator* U_EXPORT2 |
| 490 ucol_openBinary(const uint8_t *bin, int32_t length, | 40 ucol_openBinary(const uint8_t *bin, int32_t length, |
| 491 const UCollator *base, | 41 const UCollator *base, |
| 492 UErrorCode *status) | 42 UErrorCode *status) |
| 493 { | 43 { |
| 494 return ucol_initFromBinary(bin, length, base, NULL, status); | 44 if(U_FAILURE(*status)) { return NULL; } |
| 45 RuleBasedCollator *coll = new RuleBasedCollator( |
| 46 bin, length, |
| 47 RuleBasedCollator::rbcFromUCollator(base), |
| 48 *status); |
| 49 if(coll == NULL) { |
| 50 *status = U_MEMORY_ALLOCATION_ERROR; |
| 51 return NULL; |
| 52 } |
| 53 if(U_FAILURE(*status)) { |
| 54 delete coll; |
| 55 return NULL; |
| 56 } |
| 57 return coll->toUCollator(); |
| 495 } | 58 } |
| 496 | 59 |
| 497 U_CAPI int32_t U_EXPORT2 | 60 U_CAPI int32_t U_EXPORT2 |
| 498 ucol_cloneBinary(const UCollator *coll, | 61 ucol_cloneBinary(const UCollator *coll, |
| 499 uint8_t *buffer, int32_t capacity, | 62 uint8_t *buffer, int32_t capacity, |
| 500 UErrorCode *status) | 63 UErrorCode *status) |
| 501 { | 64 { |
| 502 int32_t length = 0; | |
| 503 if(U_FAILURE(*status)) { | 65 if(U_FAILURE(*status)) { |
| 504 return length; | 66 return 0; |
| 505 } | 67 } |
| 506 if(capacity < 0) { | 68 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 507 *status = U_ILLEGAL_ARGUMENT_ERROR; | 69 if(rbc == NULL && coll != NULL) { |
| 508 return length; | 70 *status = U_UNSUPPORTED_ERROR; |
| 71 return 0; |
| 509 } | 72 } |
| 510 if(coll->hasRealData == TRUE) { | 73 return rbc->cloneBinary(buffer, capacity, *status); |
| 511 length = coll->image->size; | |
| 512 if(length <= capacity) { | |
| 513 uprv_memcpy(buffer, coll->image, length); | |
| 514 } else { | |
| 515 *status = U_BUFFER_OVERFLOW_ERROR; | |
| 516 } | |
| 517 } else { | |
| 518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(
UColOptionSet))); | |
| 519 if(length <= capacity) { | |
| 520 /* build the UCATableHeader with minimal entries */ | |
| 521 /* do not copy the header from the UCA file because its values are w
rong! */ | |
| 522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ | |
| 523 | |
| 524 /* reset everything */ | |
| 525 uprv_memset(buffer, 0, length); | |
| 526 | |
| 527 /* set the tailoring-specific values */ | |
| 528 UCATableHeader *myData = (UCATableHeader *)buffer; | |
| 529 myData->size = length; | |
| 530 | |
| 531 /* offset for the options, the only part of the data that is present
after the header */ | |
| 532 myData->options = sizeof(UCATableHeader); | |
| 533 | |
| 534 /* need to always set the expansion value for an upper bound of the
options */ | |
| 535 myData->expansion = myData->options + sizeof(UColOptionSet); | |
| 536 | |
| 537 myData->magic = UCOL_HEADER_MAGIC; | |
| 538 myData->isBigEndian = U_IS_BIG_ENDIAN; | |
| 539 myData->charSetFamily = U_CHARSET_FAMILY; | |
| 540 | |
| 541 /* copy UCA's version; genrb will override all but the builder versi
on with tailoring data */ | |
| 542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn
fo)); | |
| 543 | |
| 544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer
sionInfo)); | |
| 545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer
sionInfo)); | |
| 546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo
f(UVersionInfo)); | |
| 547 myData->jamoSpecial = coll->image->jamoSpecial; | |
| 548 | |
| 549 /* copy the collator options */ | |
| 550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options
, sizeof(UColOptionSet)); | |
| 551 } else { | |
| 552 *status = U_BUFFER_OVERFLOW_ERROR; | |
| 553 } | |
| 554 } | |
| 555 return length; | |
| 556 } | 74 } |
| 557 | 75 |
| 558 U_CAPI UCollator* U_EXPORT2 | 76 U_CAPI UCollator* U_EXPORT2 |
| 559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS
ize, UErrorCode *status) | 77 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS
ize, UErrorCode *status) |
| 560 { | 78 { |
| 561 UCollator * localCollator; | |
| 562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); | |
| 563 int32_t imageSize = 0; | |
| 564 int32_t rulesSize = 0; | |
| 565 int32_t rulesPadding = 0; | |
| 566 int32_t defaultReorderCodesSize = 0; | |
| 567 int32_t reorderCodesSize = 0; | |
| 568 uint8_t *image; | |
| 569 UChar *rules; | |
| 570 int32_t* defaultReorderCodes; | |
| 571 int32_t* reorderCodes; | |
| 572 uint8_t* leadBytePermutationTable; | |
| 573 UBool imageAllocated = FALSE; | |
| 574 | |
| 575 if (status == NULL || U_FAILURE(*status)){ | 79 if (status == NULL || U_FAILURE(*status)){ |
| 576 return NULL; | 80 return NULL; |
| 577 } | 81 } |
| 578 if (coll == NULL) { | 82 if (coll == NULL) { |
| 579 *status = U_ILLEGAL_ARGUMENT_ERROR; | 83 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 580 return NULL; | 84 return NULL; |
| 581 } | 85 } |
| 582 | |
| 583 if (coll->rules && coll->freeRulesOnClose) { | |
| 584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); | |
| 585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); | |
| 586 bufferSizeNeeded += rulesSize + rulesPadding; | |
| 587 } | |
| 588 // no padding for alignment needed from here since the next two are 4 byte q
uantities | |
| 589 if (coll->defaultReorderCodes) { | |
| 590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32
_t); | |
| 591 bufferSizeNeeded += defaultReorderCodesSize; | |
| 592 } | |
| 593 if (coll->reorderCodes) { | |
| 594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); | |
| 595 bufferSizeNeeded += reorderCodesSize; | |
| 596 } | |
| 597 if (coll->leadBytePermutationTable) { | |
| 598 bufferSizeNeeded += 256 * sizeof(uint8_t); | |
| 599 } | |
| 600 | |
| 601 if (pBufferSize != NULL) { | 86 if (pBufferSize != NULL) { |
| 602 int32_t inputSize = *pBufferSize; | 87 int32_t inputSize = *pBufferSize; |
| 603 *pBufferSize = 1; | 88 *pBufferSize = 1; |
| 604 if (inputSize == 0) { | 89 if (inputSize == 0) { |
| 605 return NULL; // preflighting for deprecated functionality | 90 return NULL; // preflighting for deprecated functionality |
| 606 } | 91 } |
| 607 } | 92 } |
| 608 | 93 Collator *newColl = Collator::fromUCollator(coll)->clone(); |
| 609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); | 94 if (newColl == NULL) { |
| 610 // Null pointer check. | |
| 611 if (stackBufferChars == NULL) { | |
| 612 *status = U_MEMORY_ALLOCATION_ERROR; | 95 *status = U_MEMORY_ALLOCATION_ERROR; |
| 613 return NULL; | 96 } else { |
| 97 *status = U_SAFECLONE_ALLOCATED_WARNING; |
| 614 } | 98 } |
| 615 *status = U_SAFECLONE_ALLOCATED_WARNING; | 99 return newColl->toUCollator(); |
| 616 | |
| 617 localCollator = (UCollator *)stackBufferChars; | |
| 618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); | |
| 619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); | |
| 620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCode
sSize); | |
| 621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; | |
| 622 | |
| 623 { | |
| 624 UErrorCode tempStatus = U_ZERO_ERROR; | |
| 625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); | |
| 626 } | |
| 627 if (coll->freeImageOnClose) { | |
| 628 image = (uint8_t *)uprv_malloc(imageSize); | |
| 629 // Null pointer check | |
| 630 if (image == NULL) { | |
| 631 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 632 return NULL; | |
| 633 } | |
| 634 ucol_cloneBinary(coll, image, imageSize, status); | |
| 635 imageAllocated = TRUE; | |
| 636 } | |
| 637 else { | |
| 638 image = (uint8_t *)coll->image; | |
| 639 } | |
| 640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollat
or, status); | |
| 641 if (U_FAILURE(*status)) { | |
| 642 return NULL; | |
| 643 } | |
| 644 | |
| 645 if (coll->rules) { | |
| 646 if (coll->freeRulesOnClose) { | |
| 647 localCollator->rules = u_strcpy(rules, coll->rules); | |
| 648 //bufferEnd += rulesSize; | |
| 649 } | |
| 650 else { | |
| 651 localCollator->rules = coll->rules; | |
| 652 } | |
| 653 localCollator->freeRulesOnClose = FALSE; | |
| 654 localCollator->rulesLength = coll->rulesLength; | |
| 655 } | |
| 656 | |
| 657 // collator reordering | |
| 658 if (coll->defaultReorderCodes) { | |
| 659 localCollator->defaultReorderCodes = | |
| 660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCode
s, coll->defaultReorderCodesLength * sizeof(int32_t)); | |
| 661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLeng
th; | |
| 662 localCollator->freeDefaultReorderCodesOnClose = FALSE; | |
| 663 } | |
| 664 if (coll->reorderCodes) { | |
| 665 localCollator->reorderCodes = | |
| 666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorde
rCodesLength * sizeof(int32_t)); | |
| 667 localCollator->reorderCodesLength = coll->reorderCodesLength; | |
| 668 localCollator->freeReorderCodesOnClose = FALSE; | |
| 669 } | |
| 670 if (coll->leadBytePermutationTable) { | |
| 671 localCollator->leadBytePermutationTable = | |
| 672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermu
tationTable, 256); | |
| 673 localCollator->freeLeadBytePermutationTableOnClose = FALSE; | |
| 674 } | |
| 675 | |
| 676 int32_t i; | |
| 677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { | |
| 678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(col
l, (UColAttribute)i, status), status); | |
| 679 } | |
| 680 // zero copies of pointers | |
| 681 localCollator->actualLocale = NULL; | |
| 682 localCollator->validLocale = NULL; | |
| 683 localCollator->requestedLocale = NULL; | |
| 684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy h
ere. | |
| 685 localCollator->freeOnClose = TRUE; | |
| 686 localCollator->freeImageOnClose = imageAllocated; | |
| 687 return localCollator; | |
| 688 } | 100 } |
| 689 | 101 |
| 690 U_CAPI void U_EXPORT2 | 102 U_CAPI void U_EXPORT2 |
| 691 ucol_close(UCollator *coll) | 103 ucol_close(UCollator *coll) |
| 692 { | 104 { |
| 693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); | 105 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
| 694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); | 106 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
| 695 if(coll != NULL) { | 107 if(coll != NULL) { |
| 696 // these are always owned by each UCollator struct, | 108 delete Collator::fromUCollator(coll); |
| 697 // so we always free them | |
| 698 if(coll->validLocale != NULL) { | |
| 699 uprv_free(coll->validLocale); | |
| 700 } | |
| 701 if(coll->actualLocale != NULL) { | |
| 702 uprv_free(coll->actualLocale); | |
| 703 } | |
| 704 if(coll->requestedLocale != NULL) { | |
| 705 uprv_free(coll->requestedLocale); | |
| 706 } | |
| 707 if(coll->latinOneCEs != NULL) { | |
| 708 uprv_free(coll->latinOneCEs); | |
| 709 } | |
| 710 if(coll->options != NULL && coll->freeOptionsOnClose) { | |
| 711 uprv_free(coll->options); | |
| 712 } | |
| 713 if(coll->rules != NULL && coll->freeRulesOnClose) { | |
| 714 uprv_free((UChar *)coll->rules); | |
| 715 } | |
| 716 if(coll->image != NULL && coll->freeImageOnClose) { | |
| 717 uprv_free((UCATableHeader *)coll->image); | |
| 718 } | |
| 719 | |
| 720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutati
onTableOnClose == TRUE) { | |
| 721 uprv_free(coll->leadBytePermutationTable); | |
| 722 } | |
| 723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnC
lose == TRUE) { | |
| 724 uprv_free(coll->defaultReorderCodes); | |
| 725 } | |
| 726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE)
{ | |
| 727 uprv_free(coll->reorderCodes); | |
| 728 } | |
| 729 | |
| 730 if(coll->delegate != NULL) { | |
| 731 delete (Collator*)coll->delegate; | |
| 732 } | |
| 733 | |
| 734 /* Here, it would be advisable to close: */ | |
| 735 /* - UData for UCA (unless we stuff it in the root resb */ | |
| 736 /* Again, do we need additional housekeeping... HMMM! */ | |
| 737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); | |
| 738 if(coll->freeOnClose){ | |
| 739 /* for safeClone, if freeOnClose is FALSE, | |
| 740 don't free the other instance data */ | |
| 741 uprv_free(coll); | |
| 742 } | |
| 743 } | 109 } |
| 744 UTRACE_EXIT(); | 110 UTRACE_EXIT(); |
| 745 } | 111 } |
| 746 | 112 |
| 747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo
de *status) { | |
| 748 if(U_FAILURE(*status)) { | |
| 749 return; | |
| 750 } | |
| 751 result->caseFirst = (UColAttributeValue)opts->caseFirst; | |
| 752 result->caseLevel = (UColAttributeValue)opts->caseLevel; | |
| 753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; | |
| 754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; | |
| 755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { | |
| 756 return; | |
| 757 } | |
| 758 result->strength = (UColAttributeValue)opts->strength; | |
| 759 result->variableTopValue = opts->variableTopValue; | |
| 760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; | |
| 761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; | |
| 762 result->numericCollation = (UColAttributeValue)opts->numericCollation; | |
| 763 result->caseFirstisDefault = TRUE; | |
| 764 result->caseLevelisDefault = TRUE; | |
| 765 result->frenchCollationisDefault = TRUE; | |
| 766 result->normalizationModeisDefault = TRUE; | |
| 767 result->strengthisDefault = TRUE; | |
| 768 result->variableTopValueisDefault = TRUE; | |
| 769 result->alternateHandlingisDefault = TRUE; | |
| 770 result->hiraganaQisDefault = TRUE; | |
| 771 result->numericCollationisDefault = TRUE; | |
| 772 | |
| 773 ucol_updateInternalState(result, status); | |
| 774 | |
| 775 result->options = opts; | |
| 776 } | |
| 777 | |
| 778 | |
| 779 /** | |
| 780 * Approximate determination if a character is at a contraction end. | |
| 781 * Guaranteed to be TRUE if a character is at the end of a contraction, | |
| 782 * otherwise it is not deterministic. | |
| 783 * @param c character to be determined | |
| 784 * @param coll collator | |
| 785 */ | |
| 786 static | |
| 787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { | |
| 788 if (c < coll->minContrEndCP) { | |
| 789 return FALSE; | |
| 790 } | |
| 791 | |
| 792 int32_t hash = c; | |
| 793 uint8_t htbyte; | |
| 794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { | |
| 795 if (U16_IS_TRAIL(c)) { | |
| 796 return TRUE; | |
| 797 } | |
| 798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; | |
| 799 } | |
| 800 htbyte = coll->contrEndCP[hash>>3]; | |
| 801 return (((htbyte >> (hash & 7)) & 1) == 1); | |
| 802 } | |
| 803 | |
| 804 | |
| 805 | |
| 806 /* | |
| 807 * i_getCombiningClass() | |
| 808 * A fast, at least partly inline version of u_getCombiningClass() | |
| 809 * This is a candidate for further optimization. Used heavily | |
| 810 * in contraction processing. | |
| 811 */ | |
| 812 static | |
| 813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { | |
| 814 uint8_t sCC = 0; | |
| 815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { | |
| 816 sCC = u_getCombiningClass(c); | |
| 817 } | |
| 818 return sCC; | |
| 819 } | |
| 820 | |
| 821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
st UCollator *UCA, UErrorCode *status) { | |
| 822 UChar c; | |
| 823 UCollator *result = fillIn; | |
| 824 if(U_FAILURE(*status) || image == NULL) { | |
| 825 return NULL; | |
| 826 } | |
| 827 | |
| 828 if(result == NULL) { | |
| 829 result = (UCollator *)uprv_malloc(sizeof(UCollator)); | |
| 830 if(result == NULL) { | |
| 831 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 832 return result; | |
| 833 } | |
| 834 result->freeOnClose = TRUE; | |
| 835 } else { | |
| 836 result->freeOnClose = FALSE; | |
| 837 } | |
| 838 | |
| 839 result->delegate = NULL; | |
| 840 | |
| 841 result->image = image; | |
| 842 result->mapping.getFoldingOffset = _getFoldingOffset; | |
| 843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosit
ion; | |
| 844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE -
result->image->mappingPosition, status); | |
| 845 if(U_FAILURE(*status)) { | |
| 846 if(result->freeOnClose == TRUE) { | |
| 847 uprv_free(result); | |
| 848 result = NULL; | |
| 849 } | |
| 850 return result; | |
| 851 } | |
| 852 | |
| 853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); | |
| 854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->
contractionCEs); | |
| 855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->c
ontractionIndex); | |
| 856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expan
sion); | |
| 857 result->rules = NULL; | |
| 858 result->rulesLength = 0; | |
| 859 result->freeRulesOnClose = FALSE; | |
| 860 result->defaultReorderCodes = NULL; | |
| 861 result->defaultReorderCodesLength = 0; | |
| 862 result->freeDefaultReorderCodesOnClose = FALSE; | |
| 863 result->reorderCodes = NULL; | |
| 864 result->reorderCodesLength = 0; | |
| 865 result->freeReorderCodesOnClose = FALSE; | |
| 866 result->leadBytePermutationTable = NULL; | |
| 867 result->freeLeadBytePermutationTableOnClose = FALSE; | |
| 868 | |
| 869 /* get the version info from UCATableHeader and populate the Collator struct
*/ | |
| 870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ | |
| 871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v
ersion*/ | |
| 872 result->dataVersion[2] = 0; | |
| 873 result->dataVersion[3] = 0; | |
| 874 | |
| 875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; | |
| 876 result->minUnsafeCP = 0; | |
| 877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. | |
| 878 if (ucol_unsafeCP(c, result)) break; | |
| 879 } | |
| 880 result->minUnsafeCP = c; | |
| 881 | |
| 882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; | |
| 883 result->minContrEndCP = 0; | |
| 884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. | |
| 885 if (ucol_contractionEndCP(c, result)) break; | |
| 886 } | |
| 887 result->minContrEndCP = c; | |
| 888 | |
| 889 /* max expansion tables */ | |
| 890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + | |
| 891 result->image->endExpansionCE); | |
| 892 result->lastEndExpansionCE = result->endExpansionCE + | |
| 893 result->image->endExpansionCECount - 1; | |
| 894 result->expansionCESize = (uint8_t*)result->image + | |
| 895 result->image->expansionCESize; | |
| 896 | |
| 897 | |
| 898 //result->errorCode = *status; | |
| 899 | |
| 900 result->latinOneCEs = NULL; | |
| 901 | |
| 902 result->latinOneRegenTable = FALSE; | |
| 903 result->latinOneFailed = FALSE; | |
| 904 result->UCA = UCA; | |
| 905 | |
| 906 /* Normally these will be set correctly later. This is the default if you us
e UCA or the default. */ | |
| 907 result->ucaRules = NULL; | |
| 908 result->actualLocale = NULL; | |
| 909 result->validLocale = NULL; | |
| 910 result->requestedLocale = NULL; | |
| 911 result->hasRealData = FALSE; // real data lives in .dat file... | |
| 912 result->freeImageOnClose = FALSE; | |
| 913 | |
| 914 /* set attributes */ | |
| 915 ucol_setOptionsFromHeader( | |
| 916 result, | |
| 917 (UColOptionSet*)((uint8_t*)result->image+result->image->options), | |
| 918 status); | |
| 919 result->freeOptionsOnClose = FALSE; | |
| 920 | |
| 921 return result; | |
| 922 } | |
| 923 | |
| 924 /* new Mark's code */ | |
| 925 | |
| 926 /** | |
| 927 * For generation of Implicit CEs | |
| 928 * @author Davis | |
| 929 * | |
| 930 * Cleaned up so that changes can be made more easily. | |
| 931 * Old values: | |
| 932 # First Implicit: E26A792D | |
| 933 # Last Implicit: E3DC70C0 | |
| 934 # First CJK: E0030300 | |
| 935 # Last CJK: E0A9DD00 | |
| 936 # First CJK_A: E0A9DF00 | |
| 937 # Last CJK_A: E0DE3100 | |
| 938 */ | |
| 939 /* Following is a port of Mark's code for new treatment of implicits. | |
| 940 * It is positioned here, since ucol_initUCA needs to initialize the | |
| 941 * variables below according to the data in the fractional UCA. | |
| 942 */ | |
| 943 | |
| 944 /** | |
| 945 * Function used to: | |
| 946 * a) collapse the 2 different Han ranges from UCA into one (in the right order)
, and | |
| 947 * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET). | |
| 948 * The relevant blocks are: | |
| 949 * A: 4E00..9FFF; CJK Unified Ideographs | |
| 950 * F900..FAFF; CJK Compatibility Ideographs | |
| 951 * B: 3400..4DBF; CJK Unified Ideographs Extension A | |
| 952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) | |
| 953 * As long as | |
| 954 * no new B characters are allocated between 4E00 and FAFF, and | |
| 955 * no new A characters are outside of this range, | |
| 956 * (very high probability) this simple code will work. | |
| 957 * The reordered blocks are: | |
| 958 * Block1 is CJK | |
| 959 * Block2 is CJK_COMPAT_USED | |
| 960 * Block3 is CJK_A | |
| 961 * (all contiguous) | |
| 962 * Any other CJK gets its normal code point | |
| 963 * Any non-CJK gets +0x110000 (NON_CJK_OFFSET) | |
| 964 * When we reorder Block1, we make sure that it is at the very start, | |
| 965 * so that it will use a 3-byte form. | |
| 966 * Warning: we only pick up the compatibility characters that are | |
| 967 * NOT decomposed, so that block is smaller! | |
| 968 */ | |
| 969 | |
| 970 // CONSTANTS | |
| 971 static const UChar32 | |
| 972 NON_CJK_OFFSET = 0x110000, | |
| 973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 | |
| 974 | |
| 975 /** | |
| 976 * Precomputed by initImplicitConstants() | |
| 977 */ | |
| 978 static int32_t | |
| 979 final3Multiplier = 0, | |
| 980 final4Multiplier = 0, | |
| 981 final3Count = 0, | |
| 982 final4Count = 0, | |
| 983 medialCount = 0, | |
| 984 min3Primary = 0, | |
| 985 min4Primary = 0, | |
| 986 max4Primary = 0, | |
| 987 minTrail = 0, | |
| 988 maxTrail = 0, | |
| 989 max3Trail = 0, | |
| 990 max4Trail = 0, | |
| 991 min4Boundary = 0; | |
| 992 | |
| 993 static const UChar32 | |
| 994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; | |
| 995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) | |
| 996 CJK_BASE = 0x4E00, | |
| 997 CJK_LIMIT = 0x9FCC+1, | |
| 998 // Unified CJK ideographs in the compatibility ideographs block. | |
| 999 CJK_COMPAT_USED_BASE = 0xFA0E, | |
| 1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, | |
| 1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; | |
| 1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; | |
| 1003 CJK_A_BASE = 0x3400, | |
| 1004 CJK_A_LIMIT = 0x4DB5+1, | |
| 1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; | |
| 1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; | |
| 1007 CJK_B_BASE = 0x20000, | |
| 1008 CJK_B_LIMIT = 0x2A6D6+1, | |
| 1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; | |
| 1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; | |
| 1011 CJK_C_BASE = 0x2A700, | |
| 1012 CJK_C_LIMIT = 0x2B734+1, | |
| 1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; | |
| 1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; | |
| 1015 CJK_D_BASE = 0x2B740, | |
| 1016 CJK_D_LIMIT = 0x2B81D+1; | |
| 1017 // when adding to this list, look for all occurrences (in project) | |
| 1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing
!!!! | |
| 1019 | |
| 1020 static UChar32 swapCJK(UChar32 i) { | |
| 1021 if (i < CJK_A_BASE) { | |
| 1022 // non-CJK | |
| 1023 } else if (i < CJK_A_LIMIT) { | |
| 1024 // Extension A has lower code points than the original Unihan+compat | |
| 1025 // but sorts higher. | |
| 1026 return i - CJK_A_BASE | |
| 1027 + (CJK_LIMIT - CJK_BASE) | |
| 1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); | |
| 1029 } else if (i < CJK_BASE) { | |
| 1030 // non-CJK | |
| 1031 } else if (i < CJK_LIMIT) { | |
| 1032 return i - CJK_BASE; | |
| 1033 } else if (i < CJK_COMPAT_USED_BASE) { | |
| 1034 // non-CJK | |
| 1035 } else if (i < CJK_COMPAT_USED_LIMIT) { | |
| 1036 return i - CJK_COMPAT_USED_BASE | |
| 1037 + (CJK_LIMIT - CJK_BASE); | |
| 1038 } else if (i < CJK_B_BASE) { | |
| 1039 // non-CJK | |
| 1040 } else if (i < CJK_B_LIMIT) { | |
| 1041 return i; // non-BMP-CJK | |
| 1042 } else if (i < CJK_C_BASE) { | |
| 1043 // non-CJK | |
| 1044 } else if (i < CJK_C_LIMIT) { | |
| 1045 return i; // non-BMP-CJK | |
| 1046 } else if (i < CJK_D_BASE) { | |
| 1047 // non-CJK | |
| 1048 } else if (i < CJK_D_LIMIT) { | |
| 1049 return i; // non-BMP-CJK | |
| 1050 } | |
| 1051 return i + NON_CJK_OFFSET; // non-CJK | |
| 1052 } | |
| 1053 | |
| 1054 U_CAPI UChar32 U_EXPORT2 | |
| 1055 uprv_uca_getRawFromCodePoint(UChar32 i) { | |
| 1056 return swapCJK(i)+1; | |
| 1057 } | |
| 1058 | |
| 1059 U_CAPI UChar32 U_EXPORT2 | |
| 1060 uprv_uca_getCodePointFromRaw(UChar32 i) { | |
| 1061 i--; | |
| 1062 UChar32 result = 0; | |
| 1063 if(i >= NON_CJK_OFFSET) { | |
| 1064 result = i - NON_CJK_OFFSET; | |
| 1065 } else if(i >= CJK_B_BASE) { | |
| 1066 result = i; | |
| 1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted | |
| 1068 if(i < CJK_LIMIT - CJK_BASE) { | |
| 1069 result = i + CJK_BASE; | |
| 1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP
AT_USED_BASE)) { | |
| 1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); | |
| 1072 } else { | |
| 1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_
LIMIT - CJK_COMPAT_USED_BASE); | |
| 1074 } | |
| 1075 } else { | |
| 1076 result = -1; | |
| 1077 } | |
| 1078 return result; | |
| 1079 } | |
| 1080 | |
| 1081 // GET IMPLICIT PRIMARY WEIGHTS | |
| 1082 // Return value is left justified primary key | |
| 1083 U_CAPI uint32_t U_EXPORT2 | |
| 1084 uprv_uca_getImplicitFromRaw(UChar32 cp) { | |
| 1085 /* | |
| 1086 if (cp < 0 || cp > UCOL_MAX_INPUT) { | |
| 1087 throw new IllegalArgumentException("Code point out of range " + Utility.
hex(cp)); | |
| 1088 } | |
| 1089 */ | |
| 1090 int32_t last0 = cp - min4Boundary; | |
| 1091 if (last0 < 0) { | |
| 1092 int32_t last1 = cp / final3Count; | |
| 1093 last0 = cp % final3Count; | |
| 1094 | |
| 1095 int32_t last2 = last1 / medialCount; | |
| 1096 last1 %= medialCount; | |
| 1097 | |
| 1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at
start | |
| 1099 last1 = minTrail + last1; // offset | |
| 1100 last2 = min3Primary + last2; // offset | |
| 1101 /* | |
| 1102 if (last2 >= min4Primary) { | |
| 1103 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last2)); | |
| 1104 } | |
| 1105 */ | |
| 1106 return (last2 << 24) + (last1 << 16) + (last0 << 8); | |
| 1107 } else { | |
| 1108 int32_t last1 = last0 / final4Count; | |
| 1109 last0 %= final4Count; | |
| 1110 | |
| 1111 int32_t last2 = last1 / medialCount; | |
| 1112 last1 %= medialCount; | |
| 1113 | |
| 1114 int32_t last3 = last2 / medialCount; | |
| 1115 last2 %= medialCount; | |
| 1116 | |
| 1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at
start | |
| 1118 last1 = minTrail + last1; // offset | |
| 1119 last2 = minTrail + last2; // offset | |
| 1120 last3 = min4Primary + last3; // offset | |
| 1121 /* | |
| 1122 if (last3 > max4Primary) { | |
| 1123 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last3)); | |
| 1124 } | |
| 1125 */ | |
| 1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; | |
| 1127 } | |
| 1128 } | |
| 1129 | |
| 1130 static uint32_t U_EXPORT2 | |
| 1131 uprv_uca_getImplicitPrimary(UChar32 cp) { | |
| 1132 //fprintf(stdout, "Incoming: %04x\n", cp); | |
| 1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); | |
| 1134 | |
| 1135 cp = swapCJK(cp); | |
| 1136 cp++; | |
| 1137 // we now have a range of numbers from 0 to 21FFFF. | |
| 1138 | |
| 1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); | |
| 1140 //fprintf(stdout, "CJK swapped: %04x\n", cp); | |
| 1141 | |
| 1142 return uprv_uca_getImplicitFromRaw(cp); | |
| 1143 } | |
| 1144 | |
| 1145 /** | |
| 1146 * Converts implicit CE into raw integer ("code point") | |
| 1147 * @param implicit | |
| 1148 * @return -1 if illegal format | |
| 1149 */ | |
| 1150 U_CAPI UChar32 U_EXPORT2 | |
| 1151 uprv_uca_getRawFromImplicit(uint32_t implicit) { | |
| 1152 UChar32 result; | |
| 1153 UChar32 b3 = implicit & 0xFF; | |
| 1154 UChar32 b2 = (implicit >> 8) & 0xFF; | |
| 1155 UChar32 b1 = (implicit >> 16) & 0xFF; | |
| 1156 UChar32 b0 = (implicit >> 24) & 0xFF; | |
| 1157 | |
| 1158 // simple parameter checks | |
| 1159 if (b0 < min3Primary || b0 > max4Primary | |
| 1160 || b1 < minTrail || b1 > maxTrail) | |
| 1161 return -1; | |
| 1162 // normal offsets | |
| 1163 b1 -= minTrail; | |
| 1164 | |
| 1165 // take care of the final values, and compose | |
| 1166 if (b0 < min4Primary) { | |
| 1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0) | |
| 1168 return -1; | |
| 1169 b2 -= minTrail; | |
| 1170 UChar32 remainder = b2 % final3Multiplier; | |
| 1171 if (remainder != 0) | |
| 1172 return -1; | |
| 1173 b0 -= min3Primary; | |
| 1174 b2 /= final3Multiplier; | |
| 1175 result = ((b0 * medialCount) + b1) * final3Count + b2; | |
| 1176 } else { | |
| 1177 if (b2 < minTrail || b2 > maxTrail | |
| 1178 || b3 < minTrail || b3 > max4Trail) | |
| 1179 return -1; | |
| 1180 b2 -= minTrail; | |
| 1181 b3 -= minTrail; | |
| 1182 UChar32 remainder = b3 % final4Multiplier; | |
| 1183 if (remainder != 0) | |
| 1184 return -1; | |
| 1185 b3 /= final4Multiplier; | |
| 1186 b0 -= min4Primary; | |
| 1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count +
b3 + min4Boundary; | |
| 1188 } | |
| 1189 // final check | |
| 1190 if (result < 0 || result > UCOL_MAX_INPUT) | |
| 1191 return -1; | |
| 1192 return result; | |
| 1193 } | |
| 1194 | |
| 1195 | |
| 1196 static inline int32_t divideAndRoundUp(int a, int b) { | |
| 1197 return 1 + (a-1)/b; | |
| 1198 } | |
| 1199 | |
| 1200 /* this function is either called from initUCA or from genUCA before | |
| 1201 * doing canonical closure for the UCA. | |
| 1202 */ | |
| 1203 | |
| 1204 /** | |
| 1205 * Set up to generate implicits. | |
| 1206 * Maintenance Note: this function may end up being called more than once, due | |
| 1207 * to threading races during initialization. Make sure that | |
| 1208 * none of the Constants is ever transiently assigned an | |
| 1209 * incorrect value. | |
| 1210 * @param minPrimary | |
| 1211 * @param maxPrimary | |
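| 1212 * @param minTrailIn minimum value of a trailing (final) byte | |
| 1213 * @param maxTrailIn maximum value of a trailing (final) byte | |
| 1214 * @param gap3 the gap we leave for tailoring for 3-byte forms | |
| 1215 * @param primaries3count number of lead primary bytes used for 3-byte forms (the 4-byte gap is computed internally) | |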
| 1216 */ | |
| 1217 static void initImplicitConstants(int minPrimary, int maxPrimary, | |
| 1218 int minTrailIn, int maxTrailIn, | |
| 1219 int gap3, int primaries3count, | |
| 1220 UErrorCode *status) { | |
| 1221 // some simple parameter checks | |
| 1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) | |
| 1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) | |
| 1224 || (primaries3count < 1)) | |
| 1225 { | |
| 1226 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 1227 return; | |
| 1228 }; | |
| 1229 | |
| 1230 minTrail = minTrailIn; | |
| 1231 maxTrail = maxTrailIn; | |
| 1232 | |
| 1233 min3Primary = minPrimary; | |
| 1234 max4Primary = maxPrimary; | |
| 1235 // compute constants for use later. | |
| 1236 // number of values we can use in trailing bytes | |
| 1237 // leave room for empty values between AND above, e.g. if gap = 2 | |
| 1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value | |
| 1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values | |
| 1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values | |
| 1241 final3Multiplier = gap3 + 1; | |
| 1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; | |
| 1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; | |
| 1244 | |
| 1245 // medials can use full range | |
| 1246 medialCount = (maxTrail - minTrail + 1); | |
| 1247 // find out how many values fit in each form | |
| 1248 int32_t threeByteCount = medialCount * final3Count; | |
| 1249 // now determine where the 3/4 boundary is. | |
| 1250 // we use 3 bytes below the boundary, and 4 above | |
| 1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1; | |
| 1252 int32_t primaries4count = primariesAvailable - primaries3count; | |
| 1253 | |
| 1254 | |
| 1255 int32_t min3ByteCoverage = primaries3count * threeByteCount; | |
| 1256 min4Primary = minPrimary + primaries3count; | |
| 1257 min4Boundary = min3ByteCoverage; | |
| 1258 // Now expand out the multiplier for the 4 bytes, and redo. | |
| 1259 | |
| 1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; | |
| 1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count
); | |
| 1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo
unt * medialCount); | |
| 1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; | |
| 1264 if (gap4 < 1) { | |
| 1265 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 1266 return; | |
| 1267 } | |
| 1268 final4Multiplier = gap4 + 1; | |
| 1269 final4Count = neededPerFinalByte; | |
| 1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; | |
| 1271 } | |
| 1272 | |
| 1273 /** | |
| 1274 * Supply parameters for generating implicit CEs | |
| 1275 */ | |
| 1276 U_CAPI void U_EXPORT2 | |
| 1277 uprv_uca_initImplicitConstants(UErrorCode *status) { | |
| 1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms
. | |
| 1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); | |
| 1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1,
1, status); | |
| 1281 } | |
| 1282 | |
| 1283 | |
| 1284 /* collIterNormalize Incremental Normalization happens here.
*/ | |
| 1285 /* pick up the range of chars identified by FCD,
*/ | |
| 1286 /* normalize it into the collIterate's writable buffer,
*/ | |
| 1287 /* switch the collIterate's state to use the writable b
uffer. */ | |
| 1288 /*
*/ | |
| 1289 static | |
| 1290 void collIterNormalize(collIterate *collationSource) | |
| 1291 { | |
| 1292 UErrorCode status = U_ZERO_ERROR; | |
| 1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to nor
malize */ | |
| 1294 const UChar *endP = collationSource->fcdPosition; /* End of region to norma
lize+1 */ | |
| 1295 | |
| 1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP -
srcP)), | |
| 1297 collationSource->writableBuffer, | |
| 1298 status); | |
| 1299 if (U_FAILURE(status)) { | |
| 1300 #ifdef UCOL_DEBUG | |
| 1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro
rName(status)); | |
| 1302 #endif | |
| 1303 return; | |
| 1304 } | |
| 1305 | |
| 1306 collationSource->pos = collationSource->writableBuffer.getTerminatedB
uffer(); | |
| 1307 collationSource->origFlags = collationSource->flags; | |
| 1308 collationSource->flags |= UCOL_ITER_INNORMBUF; | |
| 1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE
_ITERATOR); | |
| 1310 } | |
| 1311 | |
| 1312 | |
| 1313 // This function takes the iterator and extracts normalized stuff up to the next
boundary | |
| 1314 // It is similar in the end results to the collIterNormalize, but for the cases
when we | |
| 1315 // use an iterator | |
| 1316 /*static | |
| 1317 inline void normalizeIterator(collIterate *collationSource) { | |
| 1318 UErrorCode status = U_ZERO_ERROR; | |
| 1319 UBool wasNormalized = FALSE; | |
| 1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite
rator, UITER_CURRENT); | |
| 1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter
ator); | |
| 1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa
bleBuffer, | |
| 1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); | |
| 1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->w
ritableBufSize) { | |
| 1325 // reallocate and terminate | |
| 1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, | |
| 1327 &collationSource->writableBuffer, | |
| 1328 (int32_t *)&collationSource->writableBufSize, nor
mLen + 1, | |
| 1329 0) | |
| 1330 ) { | |
| 1331 #ifdef UCOL_DEBUG | |
| 1332 fprintf(stderr, "normalizeIterator(), out of memory\n"); | |
| 1333 #endif | |
| 1334 return; | |
| 1335 } | |
| 1336 status = U_ZERO_ERROR; | |
| 1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE
R_ZERO); | |
| 1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s
tatus); | |
| 1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf
fer, | |
| 1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); | |
| 1341 } | |
| 1342 // Terminate the buffer - we already checked that it is big enough | |
| 1343 collationSource->writableBuffer[normLen] = 0; | |
| 1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { | |
| 1345 collationSource->flags |= UCOL_ITER_ALLOCATED; | |
| 1346 } | |
| 1347 collationSource->pos = collationSource->writableBuffer; | |
| 1348 collationSource->origFlags = collationSource->flags; | |
| 1349 collationSource->flags |= UCOL_ITER_INNORMBUF; | |
| 1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_I
TERATOR); | |
| 1351 }*/ | |
| 1352 | |
| 1353 | |
| 1354 /* Incremental FCD check and normalize
*/ | |
| 1355 /* Called from getNextCE when normalization state is suspect.
*/ | |
| 1356 /* When entering, the state is known to be this:
*/ | |
| 1357 /* o We are working in the main buffer of the collIterate, not the side
*/ | |
| 1358 /* writable buffer. When in the side buffer, normalization mode is alw
ays off, */ | |
| 1359 /* so we won't get here.
*/ | |
| 1360 /* o The leading combining class from the current character is 0 or
*/ | |
| 1361 /* the trailing combining class of the previous char was zero.
*/ | |
| 1362 /* True because the previous call to this function will have always exi
ted */ | |
| 1363 /* that way, and we get called for every char where cc might be non-zer
o. */ | |
| 1364 static | |
| 1365 inline UBool collIterFCD(collIterate *collationSource) { | |
| 1366 const UChar *srcP, *endP; | |
| 1367 uint8_t leadingCC; | |
| 1368 uint8_t prevTrailingCC = 0; | |
| 1369 uint16_t fcd; | |
| 1370 UBool needNormalize = FALSE; | |
| 1371 | |
| 1372 srcP = collationSource->pos-1; | |
| 1373 | |
| 1374 if (collationSource->flags & UCOL_ITER_HASLEN) { | |
| 1375 endP = collationSource->endp; | |
| 1376 } else { | |
| 1377 endP = NULL; | |
| 1378 } | |
| 1379 | |
| 1380 // Get the trailing combining class of the current character. If it's zero,
we are OK. | |
| 1381 fcd = g_nfcImpl->nextFCD16(srcP, endP); | |
| 1382 if (fcd != 0) { | |
| 1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
| 1384 | |
| 1385 if (prevTrailingCC != 0) { | |
| 1386 // The current char has a non-zero trailing CC. Scan forward until
we find | |
| 1387 // a char with a leading cc of zero. | |
| 1388 while (endP == NULL || srcP != endP) | |
| 1389 { | |
| 1390 const UChar *savedSrcP = srcP; | |
| 1391 | |
| 1392 fcd = g_nfcImpl->nextFCD16(srcP, endP); | |
| 1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
| 1394 if (leadingCC == 0) { | |
| 1395 srcP = savedSrcP; // Hit char that is not part of combi
ning sequence. | |
| 1396 // back up over it. (Could be surr
ogate pair!) | |
| 1397 break; | |
| 1398 } | |
| 1399 | |
| 1400 if (leadingCC < prevTrailingCC) { | |
| 1401 needNormalize = TRUE; | |
| 1402 } | |
| 1403 | |
| 1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
| 1405 } | |
| 1406 } | |
| 1407 } | |
| 1408 | |
| 1409 collationSource->fcdPosition = (UChar *)srcP; | |
| 1410 | |
| 1411 return needNormalize; | |
| 1412 } | |
| 1413 | |
| 1414 /****************************************************************************/ | |
| 1415 /* Following are the CE retrieval functions */ | |
| 1416 /* */ | |
| 1417 /****************************************************************************/ | |
| 1418 | |
| 1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); | |
| 1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); | |
| 1421 | |
| 1422 /* there should be a macro version of this function in the header file */ | |
| 1423 /* This is the first function that tries to fetch a collation element */ | |
| 1424 /* If it's not successful or it encounters a more difficult situation */ | |
| 1425 /* some more sophisticated and slower functions are invoked */ | |
| 1426 static | |
| 1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
rce, UErrorCode *status) { | |
| 1428 uint32_t order = 0; | |
| 1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there
any CEs from previous expansions? */ | |
| 1430 order = *(collationSource->toReturn++); /* if so
, return them */ | |
| 1431 if(collationSource->CEpos == collationSource->toReturn) { | |
| 1432 collationSource->CEpos = collationSource->toReturn = collationSource
->extendCEs ? collationSource->extendCEs : collationSource->CEs; | |
| 1433 } | |
| 1434 return order; | |
| 1435 } | |
| 1436 | |
| 1437 UChar ch = 0; | |
| 1438 collationSource->offsetReturn = NULL; | |
| 1439 | |
| 1440 do { | |
| 1441 for (;;) /* Loop handles case when incremental
normalize switches */ | |
| 1442 { /* to or from the side buffer / ori
ginal string, and we */ | |
| 1443 /* need to start again to get the next character. */ | |
| 1444 | |
| 1445 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBU
F | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) | |
| 1446 { | |
| 1447 // The source string is null terminated and we're not working fr
om the side buffer, | |
| 1448 // and we're not normalizing. This is the fast path. | |
| 1449 // (We can be in the side buffer for Thai pre-vowel reordering
even when not normalizing.) | |
| 1450 ch = *collationSource->pos++; | |
| 1451 if (ch != 0) { | |
| 1452 break; | |
| 1453 } | |
| 1454 else { | |
| 1455 return UCOL_NO_MORE_CES; | |
| 1456 } | |
| 1457 } | |
| 1458 | |
| 1459 if (collationSource->flags & UCOL_ITER_HASLEN) { | |
| 1460 // Normal path for strings when length is specified. | |
| 1461 // (We can't be in side buffer because it is always null termi
nated.) | |
| 1462 if (collationSource->pos >= collationSource->endp) { | |
| 1463 // Ran off of the end of the main source string. We're done
. | |
| 1464 return UCOL_NO_MORE_CES; | |
| 1465 } | |
| 1466 ch = *collationSource->pos++; | |
| 1467 } | |
| 1468 else if(collationSource->flags & UCOL_USE_ITERATOR) { | |
| 1469 UChar32 iterCh = collationSource->iterator->next(collationSource
->iterator); | |
| 1470 if(iterCh == U_SENTINEL) { | |
| 1471 return UCOL_NO_MORE_CES; | |
| 1472 } | |
| 1473 ch = (UChar)iterCh; | |
| 1474 } | |
| 1475 else | |
| 1476 { | |
| 1477 // Null terminated string. | |
| 1478 ch = *collationSource->pos++; | |
| 1479 if (ch == 0) { | |
| 1480 // Ran off end of buffer. | |
| 1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 1482 // Ran off end of main string. backing up one character. | |
| 1483 collationSource->pos--; | |
| 1484 return UCOL_NO_MORE_CES; | |
| 1485 } | |
| 1486 else | |
| 1487 { | |
| 1488 // Hit null in the normalize side buffer. | |
| 1489 // Usually this means the end of the normalized data, | |
| 1490 // except for one odd case: a null followed by combining
chars, | |
| 1491 // which is the case if we are at the start of the buf
fer. | |
| 1492 if (collationSource->pos == collationSource->writableBuf
fer.getBuffer()+1) { | |
| 1493 break; | |
| 1494 } | |
| 1495 | |
| 1496 // Null marked end of side buffer. | |
| 1497 // Revert to the main string and | |
| 1498 // loop back to top to try again to get a character. | |
| 1499 collationSource->pos = collationSource->fcdPosition; | |
| 1500 collationSource->flags = collationSource->origFlags; | |
| 1501 continue; | |
| 1502 } | |
| 1503 } | |
| 1504 } | |
| 1505 | |
| 1506 if(collationSource->flags&UCOL_HIRAGANA_Q) { | |
| 1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set
the flag | |
| 1508 * based on whether the previous codepoint was Hiragana or Katak
ana. | |
| 1509 */ | |
| 1510 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)
) || | |
| 1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >=
0x3099 && ch <= 0x309C))) { | |
| 1512 collationSource->flags |= UCOL_WAS_HIRAGANA; | |
| 1513 } else { | |
| 1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA; | |
| 1515 } | |
| 1516 } | |
| 1517 | |
| 1518 // We've got a character. See if there's any fcd and/or normalizati
on stuff to do. | |
| 1519 // Note that UCOL_ITER_NORM flag is always zero when we are in th
e side buffer. | |
| 1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) { | |
| 1521 break; | |
| 1522 } | |
| 1523 | |
| 1524 if (collationSource->fcdPosition >= collationSource->pos) { | |
| 1525 // An earlier FCD check has already covered the current characte
r. | |
| 1526 // We can go ahead and process this char. | |
| 1527 break; | |
| 1528 } | |
| 1529 | |
| 1530 if (ch < ZERO_CC_LIMIT_ ) { | |
| 1531 // Fast fcd safe path. Trailing combining class == 0. This cha
r is OK. | |
| 1532 break; | |
| 1533 } | |
| 1534 | |
| 1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 1536 // We need to peek at the next character in order to tell if we
are FCD | |
| 1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSour
ce->pos >= collationSource->endp) { | |
| 1538 // We are at the last char of source string. | |
| 1539 // It is always OK for FCD check. | |
| 1540 break; | |
| 1541 } | |
| 1542 | |
| 1543 // Not at last char of source string (or we'll check against ter
minating null). Do the FCD fast test | |
| 1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 1545 break; | |
| 1546 } | |
| 1547 } | |
| 1548 | |
| 1549 | |
| 1550 // Need a more complete FCD check and possible normalization. | |
| 1551 if (collIterFCD(collationSource)) { | |
| 1552 collIterNormalize(collationSource); | |
| 1553 } | |
| 1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 1555 // No normalization was needed. Go ahead and process the char
we already had. | |
| 1556 break; | |
| 1557 } | |
| 1558 | |
| 1559 // Some normalization happened. Next loop iteration will pick up a
char | |
| 1560 // from the normalization buffer. | |
| 1561 | |
| 1562 } // end for (;;) | |
| 1563 | |
| 1564 | |
| 1565 if (ch <= 0xFF) { | |
| 1566 /* For latin-1 characters we never need to fall back to the UCA tab
le */ | |
| 1567 /* because all of the UCA data is replicated in the latinOneMappi
ng array */ | |
| 1568 order = coll->latinOneMapping[ch]; | |
| 1569 if (order > UCOL_NOT_FOUND) { | |
| 1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); | |
| 1571 } | |
| 1572 } | |
| 1573 else | |
| 1574 { | |
| 1575 // Always use UCA for Han, Hangul | |
| 1576 // (Han extension A is before main Han block) | |
| 1577 // **** Han compatibility chars ?? **** | |
| 1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { | |
| 1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { | |
| 1581 // between the two target ranges; do normal lookup | |
| 1582 // **** this range is YI, Modifier tone letters, **** | |
| 1583 // **** Latin-D, Syloti Nagari, Phagas-pa. **** | |
| 1584 // **** Latin-D might be tailored, so we need to **** | |
| 1585 // **** do the normal lookup for these guys. **** | |
| 1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 1587 } else { | |
| 1588 // in one of the target ranges; use UCA | |
| 1589 order = UCOL_NOT_FOUND; | |
| 1590 } | |
| 1591 } else { | |
| 1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 1593 } | |
| 1594 | |
| 1595 if(order > UCOL_NOT_FOUND) { /
* if a CE is special */ | |
| 1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); /* and try to get the special CE */ | |
| 1597 } | |
| 1598 | |
| 1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a g
ood CE in the tailoring */ | |
| 1600 /* if we got here, the codepoint MUST be over 0xFF - so we look
directly in the trie */ | |
| 1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
| 1602 | |
| 1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE *
/ | |
| 1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collatio
nSource, status); | |
| 1605 } | |
| 1606 } | |
| 1607 } | |
| 1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_L
AST_HANGUL ); | |
| 1609 | |
| 1610 if(order == UCOL_NOT_FOUND) { | |
| 1611 order = getImplicit(ch, collationSource); | |
| 1612 } | |
| 1613 return order; /* return the CE */ | |
| 1614 } | |
| 1615 | |
| 1616 /* ucol_getNextCE, out-of-line version for use from other files. */ | |
| 1617 U_CAPI uint32_t U_EXPORT2 | |
| 1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *
status) { | |
| 1619 return ucol_IGetNextCE(coll, collationSource, status); | |
| 1620 } | |
| 1621 | |
| 1622 | |
| 1623 /** | |
| 1624 * Incremental previous normalization happens here. Pick up the range of chars | |
| 1625 * identifed by FCD, normalize it into the collIterate's writable buffer, | |
| 1626 * switch the collIterate's state to use the writable buffer. | |
| 1627 * @param data collation iterator data | |
| 1628 */ | |
| 1629 static | |
| 1630 void collPrevIterNormalize(collIterate *data) | |
| 1631 { | |
| 1632 UErrorCode status = U_ZERO_ERROR; | |
| 1633 const UChar *pEnd = data->pos; /* End normalize + 1 */ | |
| 1634 const UChar *pStart; | |
| 1635 | |
| 1636 /* Start normalize */ | |
| 1637 if (data->fcdPosition == NULL) { | |
| 1638 pStart = data->string; | |
| 1639 } | |
| 1640 else { | |
| 1641 pStart = data->fcdPosition + 1; | |
| 1642 } | |
| 1643 | |
| 1644 int32_t normLen = | |
| 1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta
rt) + 1)), | |
| 1646 data->writableBuffer, | |
| 1647 status). | |
| 1648 length(); | |
| 1649 if(U_FAILURE(status)) { | |
| 1650 return; | |
| 1651 } | |
| 1652 /* | |
| 1653 this puts the null termination infront of the normalized string instead | |
| 1654 of the end | |
| 1655 */ | |
| 1656 data->writableBuffer.insert(0, (UChar)0); | |
| 1657 | |
| 1658 /* | |
| 1659 * The usual case at this point is that we've got a base | |
| 1660 * character followed by marks that were normalized. If | |
| 1661 * fcdPosition is NULL, that means that we backed up to | |
| 1662 * the beginning of the string and there's no base character. | |
| 1663 * | |
| 1664 * Forward processing will usually normalize when it sees | |
| 1665 * the first mark, so that mark will get it's natural offset | |
| 1666 * and the rest will get the offset of the character following | |
| 1667 * the marks. The base character will also get its natural offset. | |
| 1668 * | |
| 1669 * We write the offset of the base character, if there is one, | |
| 1670 * followed by the offset of the first mark and then the offsets | |
| 1671 * of the rest of the marks. | |
| 1672 */ | |
| 1673 int32_t firstMarkOffset = 0; | |
| 1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); | |
| 1675 int32_t trailCount = normLen - 1; | |
| 1676 | |
| 1677 if (data->fcdPosition != NULL) { | |
| 1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); | |
| 1679 UChar baseChar = *data->fcdPosition; | |
| 1680 | |
| 1681 firstMarkOffset = baseOffset + 1; | |
| 1682 | |
| 1683 /* | |
| 1684 * If the base character is the start of a contraction, forward processi
ng | |
| 1685 * will normalize the marks while checking for the contraction, which me
ans | |
| 1686 * that the offset of the first mark will the same as the other marks. | |
| 1687 * | |
| 1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** | |
| 1689 */ | |
| 1690 if (baseChar >= 0x100) { | |
| 1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas
eChar); | |
| 1692 | |
| 1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { | |
| 1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas
eChar); | |
| 1695 } | |
| 1696 | |
| 1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION
_TAG) { | |
| 1698 firstMarkOffset = trailOffset; | |
| 1699 } | |
| 1700 } | |
| 1701 | |
| 1702 data->appendOffset(baseOffset, status); | |
| 1703 } | |
| 1704 | |
| 1705 data->appendOffset(firstMarkOffset, status); | |
| 1706 | |
| 1707 for (int32_t i = 0; i < trailCount; i += 1) { | |
| 1708 data->appendOffset(trailOffset, status); | |
| 1709 } | |
| 1710 | |
| 1711 data->offsetRepeatValue = trailOffset; | |
| 1712 | |
| 1713 data->offsetReturn = data->offsetStore - 1; | |
| 1714 if (data->offsetReturn == data->offsetBuffer) { | |
| 1715 data->offsetStore = data->offsetBuffer; | |
| 1716 } | |
| 1717 | |
| 1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; | |
| 1719 data->origFlags = data->flags; | |
| 1720 data->flags |= UCOL_ITER_INNORMBUF; | |
| 1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
| 1722 } | |
| 1723 | |
| 1724 | |
| 1725 /** | |
| 1726 * Incremental FCD check for previous iteration and normalize. Called from | |
| 1727 * getPrevCE when normalization state is suspect. | |
| 1728 * When entering, the state is known to be this: | |
| 1729 * o We are working in the main buffer of the collIterate, not the side | |
| 1730 * writable buffer. When in the side buffer, normalization mode is always | |
| 1731 * off, so we won't get here. | |
| 1732 * o The leading combining class from the current character is 0 or the | |
| 1733 * trailing combining class of the previous char was zero. | |
| 1734 * True because the previous call to this function will have always exited | |
| 1735 * that way, and we get called for every char where cc might be non-zero. | |
| 1736 * @param data collation iterate struct | |
| 1737 * @return normalization status, TRUE for normalization to be done, FALSE | |
| 1738 * otherwise | |
| 1739 */ | |
| 1740 static | |
| 1741 inline UBool collPrevIterFCD(collIterate *data) | |
| 1742 { | |
| 1743 const UChar *src, *start; | |
| 1744 uint8_t leadingCC; | |
| 1745 uint8_t trailingCC = 0; | |
| 1746 uint16_t fcd; | |
| 1747 UBool result = FALSE; | |
| 1748 | |
| 1749 start = data->string; | |
| 1750 src = data->pos + 1; | |
| 1751 | |
| 1752 /* Get the trailing combining class of the current character. */ | |
| 1753 fcd = g_nfcImpl->previousFCD16(start, src); | |
| 1754 | |
| 1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
| 1756 | |
| 1757 if (leadingCC != 0) { | |
| 1758 /* | |
| 1759 The current char has a non-zero leading combining class. | |
| 1760 Scan backward until we find a char with a trailing cc of zero. | |
| 1761 */ | |
| 1762 for (;;) | |
| 1763 { | |
| 1764 if (start == src) { | |
| 1765 data->fcdPosition = NULL; | |
| 1766 return result; | |
| 1767 } | |
| 1768 | |
| 1769 fcd = g_nfcImpl->previousFCD16(start, src); | |
| 1770 | |
| 1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
| 1772 | |
| 1773 if (trailingCC == 0) { | |
| 1774 break; | |
| 1775 } | |
| 1776 | |
| 1777 if (leadingCC < trailingCC) { | |
| 1778 result = TRUE; | |
| 1779 } | |
| 1780 | |
| 1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
| 1782 } | |
| 1783 } | |
| 1784 | |
| 1785 data->fcdPosition = (UChar *)src; | |
| 1786 | |
| 1787 return result; | |
| 1788 } | |
| 1789 | |
| 1790 /** gets a code unit from the string at a given offset | |
| 1791 * Handles both normal and iterative cases. | |
| 1792 * No error checking - caller beware! | |
| 1793 */ | |
| 1794 static inline | |
| 1795 UChar peekCodeUnit(collIterate *source, int32_t offset) { | |
| 1796 if(source->pos != NULL) { | |
| 1797 return *(source->pos + offset); | |
| 1798 } else if(source->iterator != NULL) { | |
| 1799 UChar32 c; | |
| 1800 if(offset != 0) { | |
| 1801 source->iterator->move(source->iterator, offset, UITER_CURRENT); | |
| 1802 c = source->iterator->next(source->iterator); | |
| 1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); | |
| 1804 } else { | |
| 1805 c = source->iterator->current(source->iterator); | |
| 1806 } | |
| 1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we
should never see c<0. | |
| 1808 } else { | |
| 1809 return 0xfffd; | |
| 1810 } | |
| 1811 } | |
| 1812 | |
| 1813 // Code point version. Treats the offset as a _code point_ delta. | |
| 1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for
med UTF-16. | |
| 1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limi
t of the buffer. | |
| 1816 static inline | |
| 1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) { | |
| 1818 UChar32 c; | |
| 1819 if(source->pos != NULL) { | |
| 1820 const UChar *p = source->pos; | |
| 1821 if(offset >= 0) { | |
| 1822 // Skip forward over (offset-1) code points. | |
| 1823 while(--offset >= 0) { | |
| 1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { | |
| 1825 ++p; | |
| 1826 } | |
| 1827 } | |
| 1828 // Read the code point there. | |
| 1829 c = *p++; | |
| 1830 UChar trail; | |
| 1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { | |
| 1832 c = U16_GET_SUPPLEMENTARY(c, trail); | |
| 1833 } | |
| 1834 } else /* offset<0 */ { | |
| 1835 // Skip backward over (offset-1) code points. | |
| 1836 while(++offset < 0) { | |
| 1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { | |
| 1838 --p; | |
| 1839 } | |
| 1840 } | |
| 1841 // Read the code point before that. | |
| 1842 c = *--p; | |
| 1843 UChar lead; | |
| 1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { | |
| 1845 c = U16_GET_SUPPLEMENTARY(lead, c); | |
| 1846 } | |
| 1847 } | |
| 1848 } else if(source->iterator != NULL) { | |
| 1849 if(offset >= 0) { | |
| 1850 // Skip forward over (offset-1) code points. | |
| 1851 int32_t fwd = offset; | |
| 1852 while(fwd-- > 0) { | |
| 1853 uiter_next32(source->iterator); | |
| 1854 } | |
| 1855 // Read the code point there. | |
| 1856 c = uiter_current32(source->iterator); | |
| 1857 // Return to the starting point, skipping backward over (offset-1) c
ode points. | |
| 1858 while(offset-- > 0) { | |
| 1859 uiter_previous32(source->iterator); | |
| 1860 } | |
| 1861 } else /* offset<0 */ { | |
| 1862 // Read backward, reading offset code points, remember only the last
-read one. | |
| 1863 int32_t back = offset; | |
| 1864 do { | |
| 1865 c = uiter_previous32(source->iterator); | |
| 1866 } while(++back < 0); | |
| 1867 // Return to the starting position, skipping forward over offset cod
e points. | |
| 1868 do { | |
| 1869 uiter_next32(source->iterator); | |
| 1870 } while(++offset < 0); | |
| 1871 } | |
| 1872 } else { | |
| 1873 c = U_SENTINEL; | |
| 1874 } | |
| 1875 return c; | |
| 1876 } | |
| 1877 | |
| 1878 /** | |
| 1879 * Determines if we are at the start of the data string in the backwards | |
| 1880 * collation iterator | |
| 1881 * @param data collation iterator | |
| 1882 * @return TRUE if we are at the start | |
| 1883 */ | |
| 1884 static | |
| 1885 inline UBool isAtStartPrevIterate(collIterate *data) { | |
| 1886 if(data->pos == NULL && data->iterator != NULL) { | |
| 1887 return !data->iterator->hasPrevious(data->iterator); | |
| 1888 } | |
| 1889 //return (collIter_bos(data)) || | |
| 1890 return (data->pos == data->string) || | |
| 1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && | |
| 1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL); | |
| 1893 } | |
| 1894 | |
| 1895 static | |
| 1896 inline void goBackOne(collIterate *data) { | |
| 1897 # if 0 | |
| 1898 // somehow, it looks like we need to keep iterator synced up | |
| 1899 // at all times, as above. | |
| 1900 if(data->pos) { | |
| 1901 data->pos--; | |
| 1902 } | |
| 1903 if(data->iterator) { | |
| 1904 data->iterator->previous(data->iterator); | |
| 1905 } | |
| 1906 #endif | |
| 1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { | |
| 1908 data->iterator->previous(data->iterator); | |
| 1909 } | |
| 1910 if(data->pos) { | |
| 1911 data->pos --; | |
| 1912 } | |
| 1913 } | |
| 1914 | |
| 1915 /** | |
| 1916 * Inline function that gets a simple CE. | |
| 1917 * So what it does is that it will first check the expansion buffer. If the | |
| 1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer | |
| 1919 * is different from the string pointer, we return the collation element at the | |
| 1920 * return pointer and decrement it. | |
| 1921 * For more complicated CEs it resorts to getComplicatedCE. | |
| 1922 * @param coll collator data | |
| 1923 * @param data collation iterator struct | |
| 1924 * @param status error status | |
| 1925 */ | |
| 1926 static | |
| 1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, | |
| 1928 UErrorCode *status) | |
| 1929 { | |
| 1930 uint32_t result = (uint32_t)UCOL_NULLORDER; | |
| 1931 | |
| 1932 if (data->offsetReturn != NULL) { | |
| 1933 if (data->offsetRepeatCount > 0) { | |
| 1934 data->offsetRepeatCount -= 1; | |
| 1935 } else { | |
| 1936 if (data->offsetReturn == data->offsetBuffer) { | |
| 1937 data->offsetReturn = NULL; | |
| 1938 data->offsetStore = data->offsetBuffer; | |
| 1939 } else { | |
| 1940 data->offsetReturn -= 1; | |
| 1941 } | |
| 1942 } | |
| 1943 } | |
| 1944 | |
| 1945 if ((data->extendCEs && data->toReturn > data->extendCEs) || | |
| 1946 (!data->extendCEs && data->toReturn > data->CEs)) | |
| 1947 { | |
| 1948 data->toReturn -= 1; | |
| 1949 result = *(data->toReturn); | |
| 1950 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { | |
| 1951 data->CEpos = data->toReturn; | |
| 1952 } | |
| 1953 } | |
| 1954 else { | |
| 1955 UChar ch = 0; | |
| 1956 | |
| 1957 do { | |
| 1958 /* | |
| 1959 Loop handles case when incremental normalize switches to or from the | |
| 1960 side buffer / original string, and we need to start again to get the | |
| 1961 next character. | |
| 1962 */ | |
| 1963 for (;;) { | |
| 1964 if (data->flags & UCOL_ITER_HASLEN) { | |
| 1965 /* | |
| 1966 Normal path for strings when length is specified. | |
| 1967 Not in side buffer because it is always null terminated. | |
| 1968 */ | |
| 1969 if (data->pos <= data->string) { | |
| 1970 /* End of the main source string */ | |
| 1971 return UCOL_NO_MORE_CES; | |
| 1972 } | |
| 1973 data->pos --; | |
| 1974 ch = *data->pos; | |
| 1975 } | |
| 1976 // we are using an iterator to go back. Pray for us! | |
| 1977 else if (data->flags & UCOL_USE_ITERATOR) { | |
| 1978 UChar32 iterCh = data->iterator->previous(data->iterator); | |
| 1979 if(iterCh == U_SENTINEL) { | |
| 1980 return UCOL_NO_MORE_CES; | |
| 1981 } else { | |
| 1982 ch = (UChar)iterCh; | |
| 1983 } | |
| 1984 } | |
| 1985 else { | |
| 1986 data->pos --; | |
| 1987 ch = *data->pos; | |
| 1988 /* we are in the side buffer. */ | |
| 1989 if (ch == 0) { | |
| 1990 /* | |
| 1991 At the start of the normalize side buffer. | |
| 1992 Go back to string. | |
| 1993 Because pointer points to the last accessed character, | |
| 1994 hence we have to increment it by one here. | |
| 1995 */ | |
| 1996 data->flags = data->origFlags; | |
| 1997 data->offsetRepeatValue = 0; | |
| 1998 | |
| 1999 if (data->fcdPosition == NULL) { | |
| 2000 data->pos = data->string; | |
| 2001 return UCOL_NO_MORE_CES; | |
| 2002 } | |
| 2003 else { | |
| 2004 data->pos = data->fcdPosition + 1; | |
| 2005 } | |
| 2006 | |
| 2007 continue; | |
| 2008 } | |
| 2009 } | |
| 2010 | |
| 2011 if(data->flags&UCOL_HIRAGANA_Q) { | |
| 2012 if(ch>=0x3040 && ch<=0x309f) { | |
| 2013 data->flags |= UCOL_WAS_HIRAGANA; | |
| 2014 } else { | |
| 2015 data->flags &= ~UCOL_WAS_HIRAGANA; | |
| 2016 } | |
| 2017 } | |
| 2018 | |
| 2019 /* | |
| 2020 * got a character to determine if there's fcd and/or normalizati
on | |
| 2021 * stuff to do. | |
| 2022 * if the current character is not fcd. | |
| 2023 * if current character is at the start of the string | |
| 2024 * Trailing combining class == 0. | |
| 2025 * Note if pos is in the writablebuffer, norm is always 0 | |
| 2026 */ | |
| 2027 if (ch < ZERO_CC_LIMIT_ || | |
| 2028 // this should propel us out of the loop in the iterator case | |
| 2029 (data->flags & UCOL_ITER_NORM) == 0 || | |
| 2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos
) | |
| 2031 || data->string == data->pos) { | |
| 2032 break; | |
| 2033 } | |
| 2034 | |
| 2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 2036 /* if next character is FCD */ | |
| 2037 if (data->pos == data->string) { | |
| 2038 /* First char of string is always OK for FCD check */ | |
| 2039 break; | |
| 2040 } | |
| 2041 | |
| 2042 /* Not first char of string, do the FCD fast test */ | |
| 2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 2044 break; | |
| 2045 } | |
| 2046 } | |
| 2047 | |
| 2048 /* Need a more complete FCD check and possible normalization. */ | |
| 2049 if (collPrevIterFCD(data)) { | |
| 2050 collPrevIterNormalize(data); | |
| 2051 } | |
| 2052 | |
| 2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 2054 /* No normalization. Go ahead and process the char. */ | |
| 2055 break; | |
| 2056 } | |
| 2057 | |
| 2058 /* | |
| 2059 Some normalization happened. | |
| 2060 Next loop picks up a char from the normalization buffer. | |
| 2061 */ | |
| 2062 } | |
| 2063 | |
| 2064 /* attempt to handle contractions, after removal of the backwards | |
| 2065 contraction | |
| 2066 */ | |
| 2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data))
{ | |
| 2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, d
ata, status); | |
| 2069 } else { | |
| 2070 if (ch <= 0xFF) { | |
| 2071 result = coll->latinOneMapping[ch]; | |
| 2072 } | |
| 2073 else { | |
| 2074 // Always use UCA for [3400..9FFF], [AC00..D7AF] | |
| 2075 // **** [FA0E..FA2F] ?? **** | |
| 2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 2077 (ch >= 0x3400 && ch <= 0xD7AF)) { | |
| 2078 if (ch > 0x9FFF && ch < 0xAC00) { | |
| 2079 // between the two target ranges; do normal lookup | |
| 2080 // **** this range is YI, Modifier tone letters, ***
* | |
| 2081 // **** Latin-D, Syloti Nagari, Phagas-pa. ***
* | |
| 2082 // **** Latin-D might be tailored, so we need to ***
* | |
| 2083 // **** do the normal lookup for these guys. ***
* | |
| 2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 2085 } else { | |
| 2086 result = UCOL_NOT_FOUND; | |
| 2087 } | |
| 2088 } else { | |
| 2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 2090 } | |
| 2091 } | |
| 2092 if (result > UCOL_NOT_FOUND) { | |
| 2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, s
tatus); | |
| 2094 } | |
| 2095 if (result == UCOL_NOT_FOUND) { // Not found in master list | |
| 2096 if (!isAtStartPrevIterate(data) && | |
| 2097 ucol_contractionEndCP(ch, data->coll)) | |
| 2098 { | |
| 2099 result = UCOL_CONTRACTION; | |
| 2100 } else { | |
| 2101 if(coll->UCA) { | |
| 2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping,
ch); | |
| 2103 } | |
| 2104 } | |
| 2105 | |
| 2106 if (result > UCOL_NOT_FOUND) { | |
| 2107 if(coll->UCA) { | |
| 2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, re
sult, data, status); | |
| 2109 } | |
| 2110 } | |
| 2111 } | |
| 2112 } | |
| 2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= U
COL_LAST_HANGUL ); | |
| 2114 | |
| 2115 if(result == UCOL_NOT_FOUND) { | |
| 2116 result = getPrevImplicit(ch, data); | |
| 2117 } | |
| 2118 } | |
| 2119 | |
| 2120 return result; | |
| 2121 } | |
| 2122 | |
| 2123 | |
| 2124 /* ucol_getPrevCE, out-of-line version for use from other files. */ | |
| 2125 U_CFUNC uint32_t U_EXPORT2 | |
| 2126 ucol_getPrevCE(const UCollator *coll, collIterate *data, | |
| 2127 UErrorCode *status) { | |
| 2128 return ucol_IGetPrevCE(coll, data, status); | |
| 2129 } | |
| 2130 | |
| 2131 | |
| 2132 /* this should be connected to special Jamo handling */ | |
| 2133 U_CFUNC uint32_t U_EXPORT2 | |
| 2134 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { | |
| 2135 collIterate colIt; | |
| 2136 IInit_collIterate(coll, &u, 1, &colIt, status); | |
| 2137 if(U_FAILURE(*status)) { | |
| 2138 return 0; | |
| 2139 } | |
| 2140 return ucol_IGetNextCE(coll, &colIt, status); | |
| 2141 } | |
| 2142 | |
| 2143 /** | |
| 2144 * Inserts the argument character into the end of the buffer pushing back the | |
| 2145 * null terminator. | |
| 2146 * @param data collIterate struct data | |
| 2147 * @param ch character to be appended | |
| 2148 * @return the position of the new addition | |
| 2149 */ | |
| 2150 static | |
| 2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) | |
| 2152 { | |
| 2153 int32_t oldLength = data->writableBuffer.length(); | |
| 2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; | |
| 2155 } | |
| 2156 | |
| 2157 /** | |
| 2158 * Inserts the argument string into the end of the buffer pushing back the | |
| 2159 * null terminator. | |
| 2160 * @param data collIterate struct data | |
| 2161 * @param string to be appended | |
| 2162 * @param length of the string to be appended | |
| 2163 * @return the position of the new addition | |
| 2164 */ | |
| 2165 static | |
| 2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_
t length) | |
| 2167 { | |
| 2168 int32_t oldLength = data->writableBuffer.length(); | |
| 2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL
ength; | |
| 2170 } | |
| 2171 | |
| 2172 /** | |
| 2173 * Special normalization function for contraction in the forwards iterator. | |
| 2174 * This normalization sequence will place the current character at source->pos | |
| 2175 * and its following normalized sequence into the buffer. | |
| 2176 * The fcd position, pos will be changed. | |
| 2177 * pos will now point to positions in the buffer. | |
| 2178 * Flags will be changed accordingly. | |
| 2179 * @param data collation iterator data | |
| 2180 */ | |
| 2181 static | |
| 2182 inline void normalizeNextContraction(collIterate *data) | |
| 2183 { | |
| 2184 int32_t strsize; | |
| 2185 UErrorCode status = U_ZERO_ERROR; | |
| 2186 /* because the pointer points to the next character */ | |
| 2187 const UChar *pStart = data->pos - 1; | |
| 2188 const UChar *pEnd; | |
| 2189 | |
| 2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 2191 data->writableBuffer.setTo(*(pStart - 1)); | |
| 2192 strsize = 1; | |
| 2193 } | |
| 2194 else { | |
| 2195 strsize = data->writableBuffer.length(); | |
| 2196 } | |
| 2197 | |
| 2198 pEnd = data->fcdPosition; | |
| 2199 | |
| 2200 data->writableBuffer.append( | |
| 2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), status)); | |
| 2202 if(U_FAILURE(status)) { | |
| 2203 return; | |
| 2204 } | |
| 2205 | |
| 2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; | |
| 2207 data->origFlags = data->flags; | |
| 2208 data->flags |= UCOL_ITER_INNORMBUF; | |
| 2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
| 2210 } | |
| 2211 | |
| 2212 /** | |
| 2213 * Contraction character management function that returns the next character | |
| 2214 * for the forwards iterator. | |
| 2215 * Does nothing if the next character is in buffer and not the first character | |
| 2216 * in it. | |
| 2217 * Else it checks next character in data string to see if it is normalizable. | |
| 2218 * If it is not, the character is simply copied into the buffer, else | |
| 2219 * the whole normalized substring is copied into the buffer, including the | |
| 2220 * current character. | |
| 2221 * @param data collation element iterator data | |
| 2222 * @return next character | |
| 2223 */ | |
| 2224 static | |
| 2225 inline UChar getNextNormalizedChar(collIterate *data) | |
| 2226 { | |
| 2227 UChar nextch; | |
| 2228 UChar ch; | |
| 2229 // Here we need to add the iterator code. One problem is the way | |
| 2230 // end of string is handled. If we just return next char, it could | |
| 2231 // be the sentinel. Most of the cases already check for this, but we | |
| 2232 // need to be sure. | |
| 2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { | |
| 2234 /* if no normalization and not in buffer. */ | |
| 2235 if(data->flags & UCOL_USE_ITERATOR) { | |
| 2236 return (UChar)data->iterator->next(data->iterator); | |
| 2237 } else { | |
| 2238 return *(data->pos ++); | |
| 2239 } | |
| 2240 } | |
| 2241 | |
| 2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { | |
| 2243 //normalizeIterator(data); | |
| 2244 //} | |
| 2245 | |
| 2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); | |
| 2247 if ((innormbuf && *data->pos != 0) || | |
| 2248 (data->fcdPosition != NULL && !innormbuf && | |
| 2249 data->pos < data->fcdPosition)) { | |
| 2250 /* | |
| 2251 if next character is in normalized buffer, no further normalization | |
| 2252 is required | |
| 2253 */ | |
| 2254 return *(data->pos ++); | |
| 2255 } | |
| 2256 | |
| 2257 if (data->flags & UCOL_ITER_HASLEN) { | |
| 2258 /* in data string */ | |
| 2259 if (data->pos + 1 == data->endp) { | |
| 2260 return *(data->pos ++); | |
| 2261 } | |
| 2262 if (data->pos >= data->endp) { | |
| 2263 return (UChar) -1; // return U+FFFF (non-char) to indicate an error | |
| 2264 } | |
| 2265 } | |
| 2266 else { | |
| 2267 if (innormbuf) { | |
| 2268 // inside the normalization buffer, but at the end | |
| 2269 // (since we encountered zero). This means, in the | |
| 2270 // case we're using char iterator, that we need to | |
| 2271 // do another round of normalization. | |
| 2272 //if(data->origFlags & UCOL_USE_ITERATOR) { | |
| 2273 // we need to restore original flags, | |
| 2274 // otherwise, we'll lose them | |
| 2275 //data->flags = data->origFlags; | |
| 2276 //normalizeIterator(data); | |
| 2277 //return *(data->pos++); | |
| 2278 //} else { | |
| 2279 /* | |
| 2280 in writable buffer, at this point fcdPosition can not be | |
| 2281 pointing to the end of the data string. see contracting tag. | |
| 2282 */ | |
| 2283 if(data->fcdPosition) { | |
| 2284 if (*(data->fcdPosition + 1) == 0 || | |
| 2285 data->fcdPosition + 1 == data->endp) { | |
| 2286 /* at the end of the string, dump it into the normalizer */ | |
| 2287 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; | |
| 2288 // Check if data->pos received a null pointer | |
| 2289 if (data->pos == NULL) { | |
| 2290 return (UChar)-1; // Return to indicate error. | |
| 2291 } | |
| 2292 return *(data->fcdPosition ++); | |
| 2293 } | |
| 2294 data->pos = data->fcdPosition; | |
| 2295 } else if(data->origFlags & UCOL_USE_ITERATOR) { | |
| 2296 // if we are here, we're using a normalizing iterator. | |
| 2297 // we should just continue further. | |
| 2298 data->flags = data->origFlags; | |
| 2299 data->pos = NULL; | |
| 2300 return (UChar)data->iterator->next(data->iterator); | |
| 2301 } | |
| 2302 //} | |
| 2303 } | |
| 2304 else { | |
| 2305 if (*(data->pos + 1) == 0) { | |
| 2306 return *(data->pos ++); | |
| 2307 } | |
| 2308 } | |
| 2309 } | |
| 2310 | |
| 2311 ch = *data->pos ++; | |
| 2312 nextch = *data->pos; | |
| 2313 | |
| 2314 /* | |
| 2315 * if the current character is not fcd. | |
| 2316 * Trailing combining class == 0. | |
| 2317 */ | |
| 2318 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && | |
| 2319 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || | |
| 2320 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { | |
| 2321 /* | |
| 2322 Need a more complete FCD check and possible normalization. | |
| 2323 normalize substring will be appended to buffer | |
| 2324 */ | |
| 2325 if (collIterFCD(data)) { | |
| 2326 normalizeNextContraction(data); | |
| 2327 return *(data->pos ++); | |
| 2328 } | |
| 2329 else if (innormbuf) { | |
| 2330 /* fcdposition shifted even when there's no normalization, if we | |
| 2331 don't input the rest into this, we'll get the wrong position when | |
| 2332 we reach the end of the writableBuffer */ | |
| 2333 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); | |
| 2334 data->pos = insertBufferEnd(data, data->pos - 1, length); | |
| 2335 // Check if data->pos received a null pointer | |
| 2336 if (data->pos == NULL) { | |
| 2337 return (UChar)-1; // Return to indicate error. | |
| 2338 } | |
| 2339 return *(data->pos ++); | |
| 2340 } | |
| 2341 } | |
| 2342 | |
| 2343 if (innormbuf) { | |
| 2344 /* | |
| 2345 no normalization is to be done hence only one character will be | |
| 2346 appended to the buffer. | |
| 2347 */ | |
| 2348 data->pos = insertBufferEnd(data, ch) + 1; | |
| 2349 // Check if data->pos received a null pointer | |
| 2350 if (data->pos == NULL) { | |
| 2351 return (UChar)-1; // Return to indicate error. | |
| 2352 } | |
| 2353 } | |
| 2354 | |
| 2355 /* points back to the pos in string */ | |
| 2356 return ch; | |
| 2357 } | |
| 2358 | |
| 2359 | |
| 2360 | |
| 2361 /** | |
| 2362 * Function to copy the buffer into writableBuffer and sets the fcd position to | |
| 2363 * the correct position | |
| 2364 * @param source data string source | |
| 2365 * @param buffer character buffer | |
| 2366 */ | |
| 2367 static | |
| 2368 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b
uffer) | |
| 2369 { | |
| 2370 /* okay confusing part here. to ensure that the skipped characters are | |
| 2371 considered later, we need to place it in the appropriate position in the | |
| 2372 normalization buffer and reassign the pos pointer. simple case if pos | |
| 2373 reside in string, simply copy to normalization buffer and | |
| 2374 fcdposition = pos, pos = start of normalization buffer. if pos in | |
| 2375 normalization buffer, we'll insert the copy infront of pos and point pos | |
| 2376 to the start of the normalization buffer. why am i doing these copies? | |
| 2377 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial
CE does | |
| 2378 not require any changes, which be really painful. */ | |
| 2379 if (source->flags & UCOL_ITER_INNORMBUF) { | |
| 2380 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer()
; | |
| 2381 source->writableBuffer.replace(0, replaceLength, buffer); | |
| 2382 } | |
| 2383 else { | |
| 2384 source->fcdPosition = source->pos; | |
| 2385 source->origFlags = source->flags; | |
| 2386 source->flags |= UCOL_ITER_INNORMBUF; | |
| 2387 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_IT
ERATOR); | |
| 2388 source->writableBuffer = buffer; | |
| 2389 } | |
| 2390 | |
| 2391 source->pos = source->writableBuffer.getTerminatedBuffer(); | |
| 2392 } | |
| 2393 | |
/**
 * Function to get the discontiguous collation element within the source.
 * Scans forward from source->pos, collecting every character that does not
 * take part in the contraction into a local buffer, and looks the remaining
 * characters up in the contraction table that starts at constart.
 *
 * Note this function will set the position to the appropriate places:
 * - on a match the skipped characters are handed to
 *   setDiscontiguosAttribute() so that they are read again afterwards;
 * - on failure the iterator state saved on entry is restored (followed by one
 *   goBackOne()) and the flags are reset to their value on entry.
 *
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguous collation element, read from coll->contractionCEs
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;   /* where to resume after a partially matched multi-contraction */
    UnicodeString buffer;                      /* characters skipped so far */
    const UChar   *tempconstart = constart;    /* contraction table entry currently being matched */
    uint8_t  tempflags    = source->flags;     /* flags on entry, restored on failure */
    UBool    multicontraction = FALSE;
    collIterateState discState;

    backupState(source, &discState);

    /* seed the skipped-character buffer; presumably this is the code point
       immediately before pos - confirm against peekCodePoint() */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning when the input is exhausted or the next character has
           combining class 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguous change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* an earlier, shorter match exists: resume right after it and
                   return the CE stored for the table reached so far */
                source->pos = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            /* fall out of the loop; the CE returned below is the one stored
               at the start of the table reached so far */
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* advance through the table while its entries are smaller than schar;
           NOTE(review): there is no explicit bound here, so this relies on
           the table data to stop the scan - confirm */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* schar is in the table, but it is treated as skipped as well when
               its combining class equals that of the code point returned by
               peekCodePoint(source, -2) */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            /* remember the position only if the table passed in as constart
               has a CE of its own to fall back on */
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* complete match: queue the skipped characters and return the CE */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
| 2501 | |
| 2502 /* now uses Mark's getImplicitPrimary code */ | |
| 2503 static | |
| 2504 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { | |
| 2505 uint32_t r = uprv_uca_getImplicitPrimary(cp); | |
| 2506 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; | |
| 2507 collationSource->offsetRepeatCount += 1; | |
| 2508 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' | |
| 2509 } | |
| 2510 | |
| 2511 /** | |
| 2512 * Inserts the argument character into the front of the buffer replacing the | |
| 2513 * front null terminator. | |
| 2514 * @param data collation element iterator data | |
| 2515 * @param ch character to be appended | |
| 2516 */ | |
| 2517 static | |
| 2518 inline void insertBufferFront(collIterate *data, UChar ch) | |
| 2519 { | |
| 2520 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer
minatedBuffer() + 2; | |
| 2521 } | |
| 2522 | |
| 2523 /** | |
| 2524 * Special normalization function for contraction in the previous iterator. | |
| 2525 * This normalization sequence will place the current character at source->pos | |
| 2526 * and its following normalized sequence into the buffer. | |
| 2527 * The fcd position, pos will be changed. | |
| 2528 * pos will now point to positions in the buffer. | |
| 2529 * Flags will be changed accordingly. | |
| 2530 * @param data collation iterator data | |
| 2531 */ | |
| 2532 static | |
| 2533 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) | |
| 2534 { | |
| 2535 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ | |
| 2536 const UChar *pStart; | |
| 2537 | |
| 2538 UnicodeString endOfBuffer; | |
| 2539 if (data->flags & UCOL_ITER_HASLEN) { | |
| 2540 /* | |
| 2541 normalization buffer not used yet, we'll pull down the next | |
| 2542 character into the end of the buffer | |
| 2543 */ | |
| 2544 endOfBuffer.setTo(*pEnd); | |
| 2545 } | |
| 2546 else { | |
| 2547 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL | |
| 2548 } | |
| 2549 | |
| 2550 if (data->fcdPosition == NULL) { | |
| 2551 pStart = data->string; | |
| 2552 } | |
| 2553 else { | |
| 2554 pStart = data->fcdPosition + 1; | |
| 2555 } | |
| 2556 int32_t normLen = | |
| 2557 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), | |
| 2558 data->writableBuffer, | |
| 2559 *status). | |
| 2560 length(); | |
| 2561 if(U_FAILURE(*status)) { | |
| 2562 return; | |
| 2563 } | |
| 2564 /* | |
| 2565 this puts the null termination infront of the normalized string instead | |
| 2566 of the end | |
| 2567 */ | |
| 2568 data->pos = | |
| 2569 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat
edBuffer() + | |
| 2570 1 + normLen; | |
| 2571 data->origFlags = data->flags; | |
| 2572 data->flags |= UCOL_ITER_INNORMBUF; | |
| 2573 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
| 2574 } | |
| 2575 | |
/**
 * Contraction character management function that returns the previous character
 * for the backwards iterator.
 * Does nothing if the previous character is in buffer and not the first
 * character in it.
 * Else it checks previous character in data string to see if it is
 * normalizable.
 * If it is not, the character is simply copied into the buffer, else
 * the whole normalized substring is copied into the buffer, including the
 * current character.
 * @param data collation element iterator data
 * @param status error code; only passed on to normalizePrevContraction()
 * @return previous character
 */
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    /* non-zero when pos currently points into the normalization buffer */
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* step back one and read forward again: returns the previous
               character and leaves the iterator where it was */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* the previous character is the first one of the string */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    /* only taken when fcdPosition lies beyond start and at least one of the
       two characters is at or above NFC_ZERO_CC_BLOCK_LIMIT_ */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* no normalization was done: put pos back and advance fcdPosition */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}
| 2668 | |
| 2669 /* This function handles the special CEs like contractions, expansions, surrogat
es, Thai */ | |
| 2670 /* It is called by getNextCE */ | |
| 2671 | |
| 2672 /* The following should be even */ | |
| 2673 #define UCOL_MAX_DIGITS_FOR_NUMBER 254 | |
| 2674 | |
| 2675 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
lIterate *source, UErrorCode *status) { | |
| 2676 collIterateState entryState; | |
| 2677 backupState(source, &entryState); | |
| 2678 UChar32 cp = ch; | |
| 2679 | |
| 2680 for (;;) { | |
| 2681 // This loop will repeat only in the case of contractions, and only when
a contraction | |
| 2682 // is found and the first CE resulting from that contraction is itself
a special | |
| 2683 // (an expansion, for example.) All other special CE types are fully
handled the | |
| 2684 // first time through, and the loop exits. | |
| 2685 | |
| 2686 const uint32_t *CEOffset = NULL; | |
| 2687 switch(getCETag(CE)) { | |
| 2688 case NOT_FOUND_TAG: | |
| 2689 /* This one is not found, and we'll let somebody else bother about i
t... no more games */ | |
| 2690 return CE; | |
| 2691 case SPEC_PROC_TAG: | |
| 2692 { | |
| 2693 // Special processing is getting a CE that is preceded by a cert
ain prefix | |
| 2694 // Currently this is only needed for optimizing Japanese length
and iteration marks. | |
| 2695 // When we encouter a special processing tag, we go backwards an
d try to see if | |
| 2696 // we have a match. | |
| 2697 // Contraction tables are used - so the whole process is not unl
ike contraction. | |
| 2698 // prefix data is stored backwards in the table. | |
| 2699 const UChar *UCharOffset; | |
| 2700 UChar schar, tchar; | |
| 2701 collIterateState prefixState; | |
| 2702 backupState(source, &prefixState); | |
| 2703 loadState(source, &entryState, TRUE); | |
| 2704 goBackOne(source); // We want to look at the point where we ente
red - actually one | |
| 2705 // before that... | |
| 2706 | |
| 2707 for(;;) { | |
| 2708 // This loop will run once per source string character, for
as long as we | |
| 2709 // are matching a potential contraction sequence | |
| 2710 | |
| 2711 // First we position ourselves at the begining of contractio
n sequence | |
| 2712 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
| 2713 if (collIter_bos(source)) { | |
| 2714 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
| 2715 break; | |
| 2716 } | |
| 2717 schar = getPrevNormalizedChar(source, status); | |
| 2718 goBackOne(source); | |
| 2719 | |
| 2720 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
| 2721 UCharOffset++; | |
| 2722 } | |
| 2723 | |
| 2724 if (schar == tchar) { | |
| 2725 // Found the source string char in the table. | |
| 2726 // Pick up the corresponding CE from the table. | |
| 2727 CE = *(coll->contractionCEs + | |
| 2728 (UCharOffset - coll->contractionIndex)); | |
| 2729 } | |
| 2730 else | |
| 2731 { | |
| 2732 // Source string char was not in the table. | |
| 2733 // We have not found the prefix. | |
| 2734 CE = *(coll->contractionCEs + | |
| 2735 (ContractionStart - coll->contractionIndex)); | |
| 2736 } | |
| 2737 | |
| 2738 if(!isPrefix(CE)) { | |
| 2739 // The source string char was in the contraction table,
and the corresponding | |
| 2740 // CE is not a prefix CE. We found the prefix, break | |
| 2741 // out of loop, this CE will end up being returned. T
his is the normal | |
| 2742 // way out of prefix handling when the source actually
contained | |
| 2743 // the prefix. | |
| 2744 break; | |
| 2745 } | |
| 2746 } | |
| 2747 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri
lly continue | |
| 2748 loadState(source, &prefixState, TRUE); | |
| 2749 if(source->origFlags & UCOL_USE_ITERATOR) { | |
| 2750 source->flags = source->origFlags; | |
| 2751 } | |
| 2752 } else { // prefix search was a failure, we have to backup all t
he way to the start | |
| 2753 loadState(source, &entryState, TRUE); | |
| 2754 } | |
| 2755 break; | |
| 2756 } | |
| 2757 case CONTRACTION_TAG: | |
| 2758 { | |
| 2759 /* This should handle contractions */ | |
| 2760 collIterateState state; | |
| 2761 backupState(source, &state); | |
| 2762 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->imag
e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; | |
| 2763 const UChar *UCharOffset; | |
| 2764 UChar schar, tchar; | |
| 2765 | |
| 2766 for (;;) { | |
| 2767 /* This loop will run once per source string character, for
as long as we */ | |
| 2768 /* are matching a potential contraction sequence
*/ | |
| 2769 | |
| 2770 /* First we position ourselves at the begining of contractio
n sequence */ | |
| 2771 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
| 2772 | |
| 2773 if (collIter_eos(source)) { | |
| 2774 // Ran off the end of the source string. | |
| 2775 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
| 2776 // So we'll pick whatever we have at the point... | |
| 2777 if (CE == UCOL_NOT_FOUND) { | |
| 2778 // back up the source over all the chars we scanned
going into this contraction. | |
| 2779 CE = firstCE; | |
| 2780 loadState(source, &state, TRUE); | |
| 2781 if(source->origFlags & UCOL_USE_ITERATOR) { | |
| 2782 source->flags = source->origFlags; | |
| 2783 } | |
| 2784 } | |
| 2785 break; | |
| 2786 } | |
| 2787 | |
| 2788 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the di
scontiguos stuff */ /* skip the backward offset, see above */ | |
| 2789 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); | |
| 2790 | |
| 2791 schar = getNextNormalizedChar(source); | |
| 2792 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
| 2793 UCharOffset++; | |
| 2794 } | |
| 2795 | |
| 2796 if (schar == tchar) { | |
| 2797 // Found the source string char in the contraction table
. | |
| 2798 // Pick up the corresponding CE from the table. | |
| 2799 CE = *(coll->contractionCEs + | |
| 2800 (UCharOffset - coll->contractionIndex)); | |
| 2801 } | |
| 2802 else | |
| 2803 { | |
| 2804 // Source string char was not in contraction table. | |
| 2805 // Unless we have a discontiguous contraction, we have
finished | |
| 2806 // with this contraction. | |
| 2807 // in order to do the proper detection, we | |
| 2808 // need to see if we're dealing with a supplementary | |
| 2809 /* We test whether the next two char are surrogate pairs
. | |
| 2810 * This test is done if the iterator is not NULL. | |
| 2811 * If there is no surrogate pair, the iterator | |
| 2812 * goes back one if needed. */ | |
| 2813 UChar32 miss = schar; | |
| 2814 if (source->iterator) { | |
| 2815 UChar32 surrNextChar; /* the next char in the iterat
ion to test */ | |
| 2816 int32_t prevPos; /* holds the previous position befo
re move forward of the source iterator */ | |
| 2817 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s
ource->iterator)) { | |
| 2818 prevPos = source->iterator->index; | |
| 2819 surrNextChar = getNextNormalizedChar(source); | |
| 2820 if (U16_IS_TRAIL(surrNextChar)) { | |
| 2821 miss = U16_GET_SUPPLEMENTARY(schar, surrNext
Char); | |
| 2822 } else if (prevPos < source->iterator->index){ | |
| 2823 goBackOne(source); | |
| 2824 } | |
| 2825 } | |
| 2826 } else if (U16_IS_LEAD(schar) && source->pos + 1 < sourc
e->endp) { | |
| 2827 const UChar* prevPos = source->pos; | |
| 2828 UChar nextChar = getNextNormalizedChar(source); | |
| 2829 if (U16_IS_TRAIL(nextChar)) { | |
| 2830 miss = U16_GET_SUPPLEMENTARY(schar, nextChar); | |
| 2831 } else if (prevPos < source->pos) { | |
| 2832 goBackOne(source); | |
| 2833 } | |
| 2834 } | |
| 2835 | |
| 2836 uint8_t sCC; | |
| 2837 if (miss < 0x300 || | |
| 2838 maxCC == 0 || | |
| 2839 (sCC = i_getCombiningClass(miss, coll)) == 0 || | |
| 2840 sCC>maxCC || | |
| 2841 (allSame != 0 && sCC == maxCC) || | |
| 2842 collIter_eos(source)) | |
| 2843 { | |
| 2844 // Contraction can not be discontiguous. | |
| 2845 goBackOne(source); // back up the source string by
one, | |
| 2846 // because the character we just looked at was | |
| 2847 // not part of the contraction. */ | |
| 2848 if(U_IS_SUPPLEMENTARY(miss)) { | |
| 2849 goBackOne(source); | |
| 2850 } | |
| 2851 CE = *(coll->contractionCEs + | |
| 2852 (ContractionStart - coll->contractionIndex)); | |
| 2853 } else { | |
| 2854 // | |
| 2855 // Contraction is possibly discontiguous. | |
| 2856 // Scan more of source string looking for a match | |
| 2857 // | |
| 2858 UChar tempchar; | |
| 2859 /* find the next character if schar is not a base ch
aracter | |
| 2860 and we are not yet at the end of the string */ | |
| 2861 tempchar = getNextNormalizedChar(source); | |
| 2862 // probably need another supplementary thingie here | |
| 2863 goBackOne(source); | |
| 2864 if (i_getCombiningClass(tempchar, coll) == 0) { | |
| 2865 goBackOne(source); | |
| 2866 if(U_IS_SUPPLEMENTARY(miss)) { | |
| 2867 goBackOne(source); | |
| 2868 } | |
| 2869 /* Spit out the last char of the string, wasn't
tasty enough */ | |
| 2870 CE = *(coll->contractionCEs + | |
| 2871 (ContractionStart - coll->contractionIndex))
; | |
| 2872 } else { | |
| 2873 CE = getDiscontiguous(coll, source, ContractionS
tart); | |
| 2874 } | |
| 2875 } | |
| 2876 } // else after if(schar == tchar) | |
| 2877 | |
| 2878 if(CE == UCOL_NOT_FOUND) { | |
| 2879 /* The Source string did not match the contraction that
we were checking. */ | |
| 2880 /* Back up the source position to undo the effects of h
aving partially */ | |
| 2881 /* scanned through what ultimately proved to not be a
contraction. */ | |
| 2882 loadState(source, &state, TRUE); | |
| 2883 CE = firstCE; | |
| 2884 break; | |
| 2885 } | |
| 2886 | |
| 2887 if(!isContraction(CE)) { | |
| 2888 // The source string char was in the contraction table,
and the corresponding | |
| 2889 // CE is not a contraction CE. We completed the contr
action, break | |
| 2890 // out of loop, this CE will end up being returned. T
his is the normal | |
| 2891 // way out of contraction handling when the source act
ually contained | |
| 2892 // the contraction. | |
| 2893 break; | |
| 2894 } | |
| 2895 | |
| 2896 | |
| 2897 // The source string char was in the contraction table, and
the corresponding | |
| 2898 // CE is IS a contraction CE. We will continue looping t
o check the source | |
| 2899 // string for the remaining chars in the contraction. | |
| 2900 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart
- coll->contractionIndex)); | |
| 2901 if(tempCE != UCOL_NOT_FOUND) { | |
| 2902 // We have scanned a a section of source string for whic
h there is a | |
| 2903 // CE from the contraction table. Remember the CE and
scan position, so | |
| 2904 // that we can return to this point if further scanning
fails to | |
| 2905 // match a longer contraction sequence. | |
| 2906 firstCE = tempCE; | |
| 2907 | |
| 2908 goBackOne(source); | |
| 2909 backupState(source, &state); | |
| 2910 getNextNormalizedChar(source); | |
| 2911 | |
| 2912 // Another way to do this is: | |
| 2913 //collIterateState tempState; | |
| 2914 //backupState(source, &tempState); | |
| 2915 //goBackOne(source); | |
| 2916 //backupState(source, &state); | |
| 2917 //loadState(source, &tempState, TRUE); | |
| 2918 | |
| 2919 // The problem is that for incomplete contractions we ha
ve to remember the previous | |
| 2920 // position. Before, the only thing I needed to do was s
tate.pos--; | |
| 2921 // After iterator introduction and especially after intr
oduction of normalizing | |
| 2922 // iterators, it became much more difficult to decrease
the saved state. | |
| 2923 // I'm not yet sure which of the two methods above is fa
ster. | |
| 2924 } | |
| 2925 } // for(;;) | |
| 2926 break; | |
| 2927 } // case CONTRACTION_TAG: | |
| 2928 case LONG_PRIMARY_TAG: | |
| 2929 { | |
| 2930 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
| 2931 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYT
E_COMMON; | |
| 2932 source->offsetRepeatCount += 1; | |
| 2933 return CE; | |
| 2934 } | |
| 2935 case EXPANSION_TAG: | |
| 2936 { | |
| 2937 /* This should handle expansion. */ | |
| 2938 /* NOTE: we can encounter both continuations and expansions in a
n expansion! */ | |
| 2939 /* I have to decide where continuations are going to be dealt wi
th */ | |
| 2940 uint32_t size; | |
| 2941 uint32_t i; /* general counter */ | |
| 2942 | |
| 2943 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* fi
nd the offset to expansion table */ | |
| 2944 size = getExpansionCount(CE); | |
| 2945 CE = *CEOffset++; | |
| 2946 //source->offsetRepeatCount = -1; | |
| 2947 | |
| 2948 if(size != 0) { /* if there are less than 16 elements in expansi
on, we don't terminate */ | |
| 2949 for(i = 1; i<size; i++) { | |
| 2950 *(source->CEpos++) = *CEOffset++; | |
| 2951 source->offsetRepeatCount += 1; | |
| 2952 } | |
| 2953 } else { /* else, we do */ | |
| 2954 while(*CEOffset != 0) { | |
| 2955 *(source->CEpos++) = *CEOffset++; | |
| 2956 source->offsetRepeatCount += 1; | |
| 2957 } | |
| 2958 } | |
| 2959 | |
| 2960 return CE; | |
| 2961 } | |
| 2962 case DIGIT_TAG: | |
| 2963 { | |
| 2964 /* | |
| 2965 We do a check to see if we want to collate digits as numbers; if
so we generate | |
| 2966 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. | |
| 2967 */ | |
| 2968 //uint32_t size; | |
| 2969 uint32_t i; /* general counter */ | |
| 2970 | |
| 2971 if (source->coll->numericCollation == UCOL_ON){ | |
| 2972 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; | |
| 2973 UChar32 char32 = 0; | |
| 2974 int32_t digVal = 0; | |
| 2975 | |
| 2976 uint32_t digIndx = 0; | |
| 2977 uint32_t endIndex = 0; | |
| 2978 uint32_t trailingZeroIndex = 0; | |
| 2979 | |
| 2980 uint8_t collateVal = 0; | |
| 2981 | |
| 2982 UBool nonZeroValReached = FALSE; | |
| 2983 | |
| 2984 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j
ust need a temporary place to store my generated CEs. | |
| 2985 /* | |
| 2986 We parse the source string until we hit a char that's N
OT a digit. | |
| 2987 Use this u_charDigitValue. This might be slow because we
have to | |
| 2988 handle surrogates... | |
| 2989 */ | |
| 2990 /* | |
| 2991 if (U16_IS_LEAD(ch)){ | |
| 2992 if (!collIter_eos(source)) { | |
| 2993 backupState(source, &digitState); | |
| 2994 UChar trail = getNextNormalizedChar(source); | |
| 2995 if(U16_IS_TRAIL(trail)) { | |
| 2996 char32 = U16_GET_SUPPLEMENTARY(ch, trail); | |
| 2997 } else { | |
| 2998 loadState(source, &digitState, TRUE); | |
| 2999 char32 = ch; | |
| 3000 } | |
| 3001 } else { | |
| 3002 char32 = ch; | |
| 3003 } | |
| 3004 } else { | |
| 3005 char32 = ch; | |
| 3006 } | |
| 3007 digVal = u_charDigitValue(char32); | |
| 3008 */ | |
| 3009 digVal = u_charDigitValue(cp); // if we have arrived here, w
e have | |
| 3010 // already processed possible supplementaries that trigered
the digit tag - | |
| 3011 // all supplementaries are marked in the UCA. | |
| 3012 /* | |
| 3013 We pad a zero in front of the first element anyways. Th
is takes | |
| 3014 care of the (probably) most common case where people are
sorting things followed | |
| 3015 by a single digit | |
| 3016 */ | |
| 3017 digIndx++; | |
| 3018 for(;;){ | |
| 3019 // Make sure we have enough space. No longer needed; | |
| 3020 // at this point digIndx now has a max value of UCOL_MAX
_DIGITS_FOR_NUMBER | |
| 3021 // (it has been pre-incremented) so we just ensure that
numTempBuf is big enough | |
| 3022 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). | |
| 3023 | |
| 3024 // Skipping over leading zeroes. | |
| 3025 if (digVal != 0) { | |
| 3026 nonZeroValReached = TRUE; | |
| 3027 } | |
| 3028 if (nonZeroValReached) { | |
| 3029 /* | |
| 3030 We parse the digit string into base 100 numbers (thi
s fits into a byte). | |
| 3031 We only add to the buffer in twos, thus if we are pa
rsing an odd character, | |
| 3032 that serves as the 'tens' digit while the if we are
parsing an even one, that | |
| 3033 is the 'ones' digit. We dumped the parsed base 100 v
alue (collateVal) into | |
| 3034 a buffer. We multiply each collateVal by 2 (to give
us room) and add 5 (to avoid | |
| 3035 overlapping magic CE byte values). The last byte we
subtract 1 to ensure it is less | |
| 3036 than all the other bytes. | |
| 3037 */ | |
| 3038 | |
| 3039 if (digIndx % 2 == 1){ | |
| 3040 collateVal += (uint8_t)digVal; | |
| 3041 | |
| 3042 // We don't enter the low-order-digit case unles
s we've already seen | |
| 3043 // the high order, or for the first digit, which
is always non-zero. | |
| 3044 if (collateVal != 0) | |
| 3045 trailingZeroIndex = 0; | |
| 3046 | |
| 3047 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
| 3048 collateVal = 0; | |
| 3049 } | |
| 3050 else{ | |
| 3051 // We drop the collation value into the buffer s
o if we need to do | |
| 3052 // a "front patch" we don't have to check to see
if we're hitting the | |
| 3053 // last element. | |
| 3054 collateVal = (uint8_t)(digVal * 10); | |
| 3055 | |
| 3056 // Check for trailing zeroes. | |
| 3057 if (collateVal == 0) | |
| 3058 { | |
| 3059 if (!trailingZeroIndex) | |
| 3060 trailingZeroIndex = (digIndx/2) + 2; | |
| 3061 } | |
| 3062 else | |
| 3063 trailingZeroIndex = 0; | |
| 3064 | |
| 3065 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
| 3066 } | |
| 3067 digIndx++; | |
| 3068 } | |
| 3069 | |
| 3070 // Get next character. | |
| 3071 if (!collIter_eos(source)){ | |
| 3072 ch = getNextNormalizedChar(source); | |
| 3073 if (U16_IS_LEAD(ch)){ | |
| 3074 if (!collIter_eos(source)) { | |
| 3075 backupState(source, &digitState); | |
| 3076 UChar trail = getNextNormalizedChar(source); | |
| 3077 if(U16_IS_TRAIL(trail)) { | |
| 3078 char32 = U16_GET_SUPPLEMENTARY(ch, trail
); | |
| 3079 } else { | |
| 3080 loadState(source, &digitState, TRUE); | |
| 3081 char32 = ch; | |
| 3082 } | |
| 3083 } | |
| 3084 } else { | |
| 3085 char32 = ch; | |
| 3086 } | |
| 3087 | |
| 3088 if ((digVal = u_charDigitValue(char32)) == -1 || dig
Indx > UCOL_MAX_DIGITS_FOR_NUMBER){ | |
| 3089 // Resetting position to point to the next unpro
cessed char. We | |
| 3090 // overshot it when doing our test/set for numbe
rs. | |
| 3091 if (char32 > 0xFFFF) { // For surrogates. | |
| 3092 loadState(source, &digitState, TRUE); | |
| 3093 //goBackOne(source); | |
| 3094 } | |
| 3095 goBackOne(source); | |
| 3096 break; | |
| 3097 } | |
| 3098 } else { | |
| 3099 break; | |
| 3100 } | |
| 3101 } | |
| 3102 | |
| 3103 if (nonZeroValReached == FALSE){ | |
| 3104 digIndx = 2; | |
| 3105 numTempBuf[2] = 6; | |
| 3106 } | |
| 3107 | |
| 3108 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx
/2) + 2) ; | |
| 3109 if (digIndx % 2 != 0){ | |
| 3110 /* | |
| 3111 We missed a value. Since digIndx isn't even, stuck too m
any values into the buffer (this is what | |
| 3112 we get for padding the first byte with a zero). "Front-p
atch" now by pushing all nybbles forward. | |
| 3113 Doing it this way ensures that at least 50% of the time
(statistically speaking) we'll only be doing a | |
| 3114 single pass and optimizes for strings with single digits
. I'm just assuming that's the more common case. | |
| 3115 */ | |
| 3116 | |
| 3117 for(i = 2; i < endIndex; i++){ | |
| 3118 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10)
* 10) + | |
| 3119 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; | |
| 3120 } | |
| 3121 --digIndx; | |
| 3122 } | |
| 3123 | |
| 3124 // Subtract one off of the last byte. | |
| 3125 numTempBuf[endIndex-1] -= 1; | |
| 3126 | |
| 3127 /* | |
| 3128 We want to skip over the first two slots in the buffer. The
first slot | |
| 3129 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the | |
| 3130 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
| 3131 */ | |
| 3132 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
| 3133 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); | |
| 3134 | |
| 3135 // Now transfer the collation key to our collIterate struct. | |
| 3136 // The total size for our collation key is endIndx bumped up
to the next largest even value divided by two. | |
| 3137 //size = ((endIndex+1) & ~1)/2; | |
| 3138 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARY
ORDERSHIFT) | //Primary weight | |
| 3139 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight | |
| 3140 UCOL_BYTE_COMMON; // Tertiary weight. | |
| 3141 i = 2; // Reset the index into the buffer. | |
| 3142 while(i < endIndex) | |
| 3143 { | |
| 3144 uint32_t primWeight = numTempBuf[i++] << 8; | |
| 3145 if ( i < endIndex) | |
| 3146 primWeight |= numTempBuf[i++]; | |
| 3147 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; | |
| 3148 } | |
| 3149 | |
| 3150 } else { | |
| 3151 // no numeric mode, we'll just switch to whatever we stashed
and continue | |
| 3152 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /
* find the offset to expansion table */ | |
| 3153 CE = *CEOffset++; | |
| 3154 break; | |
| 3155 } | |
| 3156 return CE; | |
| 3157 } | |
| 3158 /* various implicits optimization */ | |
| 3159 case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
| 3160 /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
| 3161 return getImplicit(cp, source); | |
| 3162 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ | |
| 3163 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl
icit | |
| 3164 return getImplicit(cp, source); | |
| 3165 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ | |
| 3166 { | |
| 3167 static const uint32_t | |
| 3168 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; | |
| 3169 //const uint32_t LCount = 19; | |
| 3170 static const uint32_t VCount = 21; | |
| 3171 static const uint32_t TCount = 28; | |
| 3172 //const uint32_t NCount = VCount * TCount; // 588 | |
| 3173 //const uint32_t SCount = LCount * NCount; // 11172 | |
| 3174 uint32_t L = ch - SBase; | |
| 3175 | |
| 3176 // divide into pieces | |
| 3177 | |
| 3178 uint32_t T = L % TCount; // we do it in this order since some co
mpilers can do % and / in one operation | |
| 3179 L /= TCount; | |
| 3180 uint32_t V = L % VCount; | |
| 3181 L /= VCount; | |
| 3182 | |
| 3183 // offset them | |
| 3184 | |
| 3185 L += LBase; | |
| 3186 V += VBase; | |
| 3187 T += TBase; | |
| 3188 | |
| 3189 // return the first CE, but first put the rest into the expansio
n buffer | |
| 3190 if (!source->coll->image->jamoSpecial) { // FAST PATH | |
| 3191 | |
| 3192 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); | |
| 3193 if (T != TBase) { | |
| 3194 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); | |
| 3195 } | |
| 3196 | |
| 3197 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); | |
| 3198 | |
| 3199 } else { // Jamo is Special | |
| 3200 // Since Hanguls pass the FCD check, it is | |
| 3201 // guaranteed that we won't be in | |
| 3202 // the normalization buffer if something like this happens | |
| 3203 | |
| 3204 // However, if we are using a uchar iterator and normalizati
on | |
| 3205 // is ON, the Hangul that lead us here is going to be in tha
t | |
| 3206 // normalization buffer. Here we want to restore the uchar | |
| 3207 // iterator state and pull out of the normalization buffer | |
| 3208 if(source->iterator != NULL && source->flags & UCOL_ITER_INN
ORMBUF) { | |
| 3209 source->flags = source->origFlags; // restore the iterat
or | |
| 3210 source->pos = NULL; | |
| 3211 } | |
| 3212 | |
| 3213 // Move Jamos into normalization buffer | |
| 3214 UChar *buffer = source->writableBuffer.getBuffer(4); | |
| 3215 int32_t bufferLength; | |
| 3216 buffer[0] = (UChar)L; | |
| 3217 buffer[1] = (UChar)V; | |
| 3218 if (T != TBase) { | |
| 3219 buffer[2] = (UChar)T; | |
| 3220 bufferLength = 3; | |
| 3221 } else { | |
| 3222 bufferLength = 2; | |
| 3223 } | |
| 3224 source->writableBuffer.releaseBuffer(bufferLength); | |
| 3225 | |
| 3226 // Indicate where to continue in main input string after exh
austing the writableBuffer | |
| 3227 source->fcdPosition = source->pos; | |
| 3228 | |
| 3229 source->pos = source->writableBuffer.getTerminatedBuffer()
; | |
| 3230 source->origFlags = source->flags; | |
| 3231 source->flags |= UCOL_ITER_INNORMBUF; | |
| 3232 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
| 3233 | |
| 3234 return(UCOL_IGNORABLE); | |
| 3235 } | |
| 3236 } | |
| 3237 case SURROGATE_TAG: | |
| 3238 /* we encountered a leading surrogate. We shall get the CE by using
the following code unit */ | |
| 3239 /* two things can happen here: next code point can be a trailing sur
rogate - we will use it */ | |
| 3240 /* to retrieve the CE, or it is not a trailing surrogate (or the str
ing is done). In that case */ | |
| 3241 /* we treat it like an unassigned code point. */ | |
| 3242 { | |
| 3243 UChar trail; | |
| 3244 collIterateState state; | |
| 3245 backupState(source, &state); | |
| 3246 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNorma
lizedChar(source))))) { | |
| 3247 // we chould have stepped one char forward and it might have
turned that it | |
| 3248 // was not a trail surrogate. In that case, we have to backu
p. | |
| 3249 loadState(source, &state, TRUE); | |
| 3250 return UCOL_NOT_FOUND; | |
| 3251 } else { | |
| 3252 /* TODO: CE contain the data from the previous CE + the mask
. It should at least be unmasked */ | |
| 3253 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF
FF, trail); | |
| 3254 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates
in this block, but not this one. | |
| 3255 // We need to backup | |
| 3256 loadState(source, &state, TRUE); | |
| 3257 return CE; | |
| 3258 } | |
| 3259 // calculate the supplementary code point value, if surrogat
e was not tailored | |
| 3260 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U
L)+0xdc00-0x10000)); | |
| 3261 } | |
| 3262 } | |
| 3263 break; | |
| 3264 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
| 3265 UChar nextChar; | |
| 3266 if( source->flags & UCOL_USE_ITERATOR) { | |
| 3267 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source
->iterator))) { | |
| 3268 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
| 3269 source->iterator->next(source->iterator); | |
| 3270 return getImplicit(cp, source); | |
| 3271 } | |
| 3272 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->po
s<source->endp)) && | |
| 3273 U_IS_TRAIL((nextChar=*source->pos))) { | |
| 3274 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
| 3275 source->pos++; | |
| 3276 return getImplicit(cp, source); | |
| 3277 } | |
| 3278 return UCOL_NOT_FOUND; | |
| 3279 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
| 3280 return UCOL_NOT_FOUND; /* broken surrogate sequence */ | |
| 3281 case CHARSET_TAG: | |
| 3282 /* not yet implemented */ | |
| 3283 /* probably after 1.8 */ | |
| 3284 return UCOL_NOT_FOUND; | |
| 3285 default: | |
| 3286 *status = U_INTERNAL_PROGRAM_ERROR; | |
| 3287 CE=0; | |
| 3288 break; | |
| 3289 } | |
| 3290 if (CE <= UCOL_NOT_FOUND) break; | |
| 3291 } | |
| 3292 return CE; | |
| 3293 } | |
| 3294 | |
| 3295 | |
| 3296 /* now uses Mark's getImplicitPrimary code */ | |
| 3297 static | |
| 3298 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { | |
| 3299 uint32_t r = uprv_uca_getImplicitPrimary(cp); | |
| 3300 | |
| 3301 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; | |
| 3302 collationSource->toReturn = collationSource->CEpos; | |
| 3303 | |
| 3304 // **** doesn't work if using iterator **** | |
| 3305 if (collationSource->flags & UCOL_ITER_INNORMBUF) { | |
| 3306 collationSource->offsetRepeatCount = 1; | |
| 3307 } else { | |
| 3308 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->
string); | |
| 3309 | |
| 3310 UErrorCode errorCode = U_ZERO_ERROR; | |
| 3311 collationSource->appendOffset(firstOffset, errorCode); | |
| 3312 collationSource->appendOffset(firstOffset + 1, errorCode); | |
| 3313 | |
| 3314 collationSource->offsetReturn = collationSource->offsetStore - 1; | |
| 3315 *(collationSource->offsetBuffer) = firstOffset; | |
| 3316 if (collationSource->offsetReturn == collationSource->offsetBuffer) { | |
| 3317 collationSource->offsetStore = collationSource->offsetBuffer; | |
| 3318 } | |
| 3319 } | |
| 3320 | |
| 3321 return ((r & 0x0000FFFF)<<16) | 0x000000C0; | |
| 3322 } | |
| 3323 | |
| 3324 /** | |
| 3325 * This function handles the special CEs like contractions, expansions, | |
| 3326 * surrogates, Thai. | |
| 3327 * It is called by getPrevCE. | |
| 3328 */ | |
| 3329 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, | |
| 3330 collIterate *source, | |
| 3331 UErrorCode *status) | |
| 3332 { | |
| 3333 const uint32_t *CEOffset = NULL; | |
| 3334 UChar *UCharOffset = NULL; | |
| 3335 UChar schar; | |
| 3336 const UChar *constart = NULL; | |
| 3337 uint32_t size; | |
| 3338 UChar buffer[UCOL_MAX_BUFFER]; | |
| 3339 uint32_t *endCEBuffer; | |
| 3340 UChar *strbuffer; | |
| 3341 int32_t noChars = 0; | |
| 3342 int32_t CECount = 0; | |
| 3343 | |
| 3344 for(;;) | |
| 3345 { | |
| 3346 /* the only ces that loops are thai and contractions */ | |
| 3347 switch (getCETag(CE)) | |
| 3348 { | |
| 3349 case NOT_FOUND_TAG: /* this tag always returns */ | |
| 3350 return CE; | |
| 3351 | |
| 3352 case SPEC_PROC_TAG: | |
| 3353 { | |
| 3354 // Special processing is getting a CE that is preceded by a cert
ain prefix | |
| 3355 // Currently this is only needed for optimizing Japanese length
and iteration marks. | |
| 3356 // When we encouter a special processing tag, we go backwards an
d try to see if | |
| 3357 // we have a match. | |
| 3358 // Contraction tables are used - so the whole process is not unl
ike contraction. | |
| 3359 // prefix data is stored backwards in the table. | |
| 3360 const UChar *UCharOffset; | |
| 3361 UChar schar, tchar; | |
| 3362 collIterateState prefixState; | |
| 3363 backupState(source, &prefixState); | |
| 3364 for(;;) { | |
| 3365 // This loop will run once per source string character, for
as long as we | |
| 3366 // are matching a potential contraction sequence | |
| 3367 | |
| 3368 // First we position ourselves at the begining of contractio
n sequence | |
| 3369 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
| 3370 | |
| 3371 if (collIter_bos(source)) { | |
| 3372 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
| 3373 break; | |
| 3374 } | |
| 3375 schar = getPrevNormalizedChar(source, status); | |
| 3376 goBackOne(source); | |
| 3377 | |
| 3378 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
| 3379 UCharOffset++; | |
| 3380 } | |
| 3381 | |
| 3382 if (schar == tchar) { | |
| 3383 // Found the source string char in the table. | |
| 3384 // Pick up the corresponding CE from the table. | |
| 3385 CE = *(coll->contractionCEs + | |
| 3386 (UCharOffset - coll->contractionIndex)); | |
| 3387 } | |
| 3388 else | |
| 3389 { | |
| 3390 // if there is a completely ignorable code point in the
middle of | |
| 3391 // a prefix, we need to act as if it's not there | |
| 3392 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-
fdef are set to zero) | |
| 3393 // lone surrogates cannot be set to zero as it would bre
ak other processing | |
| 3394 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping
, schar); | |
| 3395 // it's easy for BMP code points | |
| 3396 if(isZeroCE == 0) { | |
| 3397 continue; | |
| 3398 } else if(U16_IS_SURROGATE(schar)) { | |
| 3399 // for supplementary code points, we have to check t
he next one | |
| 3400 // situations where we are going to ignore | |
| 3401 // 1. beginning of the string: schar is a lone surro
gate | |
| 3402 // 2. schar is a lone surrogate | |
| 3403 // 3. schar is a trail surrogate in a valid surrogat
e sequence | |
| 3404 // that is explicitly set to zero. | |
| 3405 if (!collIter_bos(source)) { | |
| 3406 UChar lead; | |
| 3407 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(
lead = getPrevNormalizedChar(source, status))) { | |
| 3408 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp
ing, lead); | |
| 3409 if(isSpecial(isZeroCE) && getCETag(isZeroCE)
== SURROGATE_TAG) { | |
| 3410 uint32_t finalCE = UTRIE_GET32_FROM_OFFS
ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); | |
| 3411 if(finalCE == 0) { | |
| 3412 // this is a real, assigned complete
ly ignorable code point | |
| 3413 goBackOne(source); | |
| 3414 continue; | |
| 3415 } | |
| 3416 } | |
| 3417 } else { | |
| 3418 // lone surrogate, treat like unassigned | |
| 3419 return UCOL_NOT_FOUND; | |
| 3420 } | |
| 3421 } else { | |
| 3422 // lone surrogate at the beggining, treat like u
nassigned | |
| 3423 return UCOL_NOT_FOUND; | |
| 3424 } | |
| 3425 } | |
| 3426 // Source string char was not in the table. | |
| 3427 // We have not found the prefix. | |
| 3428 CE = *(coll->contractionCEs + | |
| 3429 (ContractionStart - coll->contractionIndex)); | |
| 3430 } | |
| 3431 | |
| 3432 if(!isPrefix(CE)) { | |
| 3433 // The source string char was in the contraction table,
and the corresponding | |
| 3434 // CE is not a prefix CE. We found the prefix, break | |
| 3435 // out of loop, this CE will end up being returned. T
his is the normal | |
| 3436 // way out of prefix handling when the source actually
contained | |
| 3437 // the prefix. | |
| 3438 break; | |
| 3439 } | |
| 3440 } | |
| 3441 loadState(source, &prefixState, TRUE); | |
| 3442 break; | |
| 3443 } | |
| 3444 | |
| 3445 case CONTRACTION_TAG: { | |
| 3446 /* to ensure that the backwards and forwards iteration matches, we | |
| 3447 take the current region of most possible match and pass it through | |
| 3448 the forward iteration. this will ensure that the obstinate problem o
f | |
| 3449 overlapping contractions will not occur. | |
| 3450 */ | |
| 3451 schar = peekCodeUnit(source, 0); | |
| 3452 constart = (UChar *)coll->image + getContractOffset(CE); | |
| 3453 if (isAtStartPrevIterate(source) | |
| 3454 /* commented away contraction end checks after adding the checks | |
| 3455 in getPrevCE */) { | |
| 3456 /* start of string or this is not the end of any contraction
*/ | |
| 3457 CE = *(coll->contractionCEs + | |
| 3458 (constart - coll->contractionIndex)); | |
| 3459 break; | |
| 3460 } | |
| 3461 strbuffer = buffer; | |
| 3462 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); | |
| 3463 *(UCharOffset --) = 0; | |
| 3464 noChars = 0; | |
| 3465 // have to swap thai characters | |
| 3466 while (ucol_unsafeCP(schar, coll)) { | |
| 3467 *(UCharOffset) = schar; | |
| 3468 noChars++; | |
| 3469 UCharOffset --; | |
| 3470 schar = getPrevNormalizedChar(source, status); | |
| 3471 goBackOne(source); | |
| 3472 // TODO: when we exhaust the contraction buffer, | |
| 3473 // it needs to get reallocated. The problem is | |
| 3474 // that the size depends on the string which is | |
| 3475 // not iterated over. However, since we're travelling | |
| 3476 // backwards, we already had to set the iterator at | |
| 3477 // the end - so we might as well know where we are? | |
| 3478 if (UCharOffset + 1 == buffer) { | |
| 3479 /* we have exhausted the buffer */ | |
| 3480 int32_t newsize = 0; | |
| 3481 if(source->pos) { // actually dealing with a position | |
| 3482 newsize = (int32_t)(source->pos - source->string + 1); | |
| 3483 } else { // iterator | |
| 3484 newsize = 4 * UCOL_MAX_BUFFER; | |
| 3485 } | |
| 3486 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * | |
| 3487 (newsize + UCOL_MAX_BUFFER)); | |
| 3488 /* test for NULL */ | |
| 3489 if (strbuffer == NULL) { | |
| 3490 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 3491 return UCOL_NO_MORE_CES; | |
| 3492 } | |
| 3493 UCharOffset = strbuffer + newsize; | |
| 3494 uprv_memcpy(UCharOffset, buffer, | |
| 3495 UCOL_MAX_BUFFER * sizeof(UChar)); | |
| 3496 UCharOffset --; | |
| 3497 } | |
| 3498 if ((source->pos && (source->pos == source->string || | |
| 3499 ((source->flags & UCOL_ITER_INNORMBUF) && | |
| 3500 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) | |
| 3501 || (source->iterator && !source->iterator->hasPrevious(sourc
e->iterator))) { | |
| 3502 break; | |
| 3503 } | |
| 3504 } | |
| 3505 /* adds the initial base character to the string */ | |
| 3506 *(UCharOffset) = schar; | |
| 3507 noChars++; | |
| 3508 | |
| 3509 int32_t offsetBias; | |
| 3510 | |
| 3511 // **** doesn't work if using iterator **** | |
| 3512 if (source->flags & UCOL_ITER_INNORMBUF) { | |
| 3513 offsetBias = -1; | |
| 3514 } else { | |
| 3515 offsetBias = (int32_t)(source->pos - source->string); | |
| 3516 } | |
| 3517 | |
| 3518 /* a new collIterate is used to simplify things, since using the cur
rent | |
| 3519 collIterate will mean that the forward and backwards iteration will | |
| 3520 share and change the same buffers. we don't want to get into that. *
/ | |
| 3521 collIterate temp; | |
| 3522 int32_t rawOffset; | |
| 3523 | |
| 3524 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); | |
| 3525 if(U_FAILURE(*status)) { | |
| 3526 return (uint32_t)UCOL_NULLORDER; | |
| 3527 } | |
| 3528 temp.flags &= ~UCOL_ITER_NORM; | |
| 3529 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; | |
| 3530 | |
| 3531 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z
ero? | |
| 3532 CE = ucol_IGetNextCE(coll, &temp, status); | |
| 3533 | |
| 3534 if (source->extendCEs) { | |
| 3535 endCEBuffer = source->extendCEs + source->extendCEsSize; | |
| 3536 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u
int32_t)); | |
| 3537 } else { | |
| 3538 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; | |
| 3539 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_
t)); | |
| 3540 } | |
| 3541 | |
| 3542 while (CE != UCOL_NO_MORE_CES) { | |
| 3543 *(source->CEpos ++) = CE; | |
| 3544 | |
| 3545 if (offsetBias >= 0) { | |
| 3546 source->appendOffset(rawOffset + offsetBias, *status); | |
| 3547 } | |
| 3548 | |
| 3549 CECount++; | |
| 3550 if (source->CEpos == endCEBuffer) { | |
| 3551 /* ran out of CE space, reallocate to new buffer. | |
| 3552 If reallocation fails, reset pointers and bail out, | |
| 3553 there's no guarantee of the right character position after | |
| 3554 this bail*/ | |
| 3555 if (!increaseCEsCapacity(source)) { | |
| 3556 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 3557 break; | |
| 3558 } | |
| 3559 | |
| 3560 endCEBuffer = source->extendCEs + source->extendCEsSize; | |
| 3561 } | |
| 3562 | |
| 3563 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { | |
| 3564 rawOffset = (int32_t)(temp.fcdPosition - temp.string); | |
| 3565 } else { | |
| 3566 rawOffset = (int32_t)(temp.pos - temp.string); | |
| 3567 } | |
| 3568 | |
| 3569 CE = ucol_IGetNextCE(coll, &temp, status); | |
| 3570 } | |
| 3571 | |
| 3572 if (strbuffer != buffer) { | |
| 3573 uprv_free(strbuffer); | |
| 3574 } | |
| 3575 if (U_FAILURE(*status)) { | |
| 3576 return (uint32_t)UCOL_NULLORDER; | |
| 3577 } | |
| 3578 | |
| 3579 if (source->offsetRepeatValue != 0) { | |
| 3580 if (CECount > noChars) { | |
| 3581 source->offsetRepeatCount += temp.offsetRepeatCount; | |
| 3582 } else { | |
| 3583 // **** does this really skip the right offsets? **** | |
| 3584 source->offsetReturn -= (noChars - CECount); | |
| 3585 } | |
| 3586 } | |
| 3587 | |
| 3588 if (offsetBias >= 0) { | |
| 3589 source->offsetReturn = source->offsetStore - 1; | |
| 3590 if (source->offsetReturn == source->offsetBuffer) { | |
| 3591 source->offsetStore = source->offsetBuffer; | |
| 3592 } | |
| 3593 } | |
| 3594 | |
| 3595 source->toReturn = source->CEpos - 1; | |
| 3596 if (source->toReturn == source->CEs) { | |
| 3597 source->CEpos = source->CEs; | |
| 3598 } | |
| 3599 | |
| 3600 return *(source->toReturn); | |
| 3601 } | |
| 3602 case LONG_PRIMARY_TAG: | |
| 3603 { | |
| 3604 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON; | |
| 3605 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
| 3606 source->toReturn = source->CEpos - 1; | |
| 3607 | |
| 3608 if (source->flags & UCOL_ITER_INNORMBUF) { | |
| 3609 source->offsetRepeatCount = 1; | |
| 3610 } else { | |
| 3611 int32_t firstOffset = (int32_t)(source->pos - source->string
); | |
| 3612 | |
| 3613 source->appendOffset(firstOffset, *status); | |
| 3614 source->appendOffset(firstOffset + 1, *status); | |
| 3615 | |
| 3616 source->offsetReturn = source->offsetStore - 1; | |
| 3617 *(source->offsetBuffer) = firstOffset; | |
| 3618 if (source->offsetReturn == source->offsetBuffer) { | |
| 3619 source->offsetStore = source->offsetBuffer; | |
| 3620 } | |
| 3621 } | |
| 3622 | |
| 3623 | |
| 3624 return *(source->toReturn); | |
| 3625 } | |
| 3626 | |
| 3627 case EXPANSION_TAG: /* this tag always returns */ | |
| 3628 { | |
| 3629 /* | |
| 3630 This should handle expansion. | |
| 3631 NOTE: we can encounter both continuations and expansions in an expan
sion! | |
| 3632 I have to decide where continuations are going to be dealt with | |
| 3633 */ | |
| 3634 int32_t firstOffset = (int32_t)(source->pos - source->string); | |
| 3635 | |
| 3636 // **** doesn't work if using iterator **** | |
| 3637 if (source->offsetReturn != NULL) { | |
| 3638 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet
urn == source->offsetBuffer) { | |
| 3639 source->offsetStore = source->offsetBuffer; | |
| 3640 }else { | |
| 3641 firstOffset = -1; | |
| 3642 } | |
| 3643 } | |
| 3644 | |
| 3645 /* find the offset to expansion table */ | |
| 3646 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
| 3647 size = getExpansionCount(CE); | |
| 3648 if (size != 0) { | |
| 3649 /* | |
| 3650 if there are less than 16 elements in expansion, we don't termin
ate | |
| 3651 */ | |
| 3652 uint32_t count; | |
| 3653 | |
| 3654 for (count = 0; count < size; count++) { | |
| 3655 *(source->CEpos ++) = *CEOffset++; | |
| 3656 | |
| 3657 if (firstOffset >= 0) { | |
| 3658 source->appendOffset(firstOffset + 1, *status); | |
| 3659 } | |
| 3660 } | |
| 3661 } else { | |
| 3662 /* else, we do */ | |
| 3663 while (*CEOffset != 0) { | |
| 3664 *(source->CEpos ++) = *CEOffset ++; | |
| 3665 | |
| 3666 if (firstOffset >= 0) { | |
| 3667 source->appendOffset(firstOffset + 1, *status); | |
| 3668 } | |
| 3669 } | |
| 3670 } | |
| 3671 | |
| 3672 if (firstOffset >= 0) { | |
| 3673 source->offsetReturn = source->offsetStore - 1; | |
| 3674 *(source->offsetBuffer) = firstOffset; | |
| 3675 if (source->offsetReturn == source->offsetBuffer) { | |
| 3676 source->offsetStore = source->offsetBuffer; | |
| 3677 } | |
| 3678 } else { | |
| 3679 source->offsetRepeatCount += size - 1; | |
| 3680 } | |
| 3681 | |
| 3682 source->toReturn = source->CEpos - 1; | |
| 3683 // in case of one element expansion, we | |
| 3684 // want to immediately return CEpos | |
| 3685 if(source->toReturn == source->CEs) { | |
| 3686 source->CEpos = source->CEs; | |
| 3687 } | |
| 3688 | |
| 3689 return *(source->toReturn); | |
| 3690 } | |
| 3691 | |
| 3692 case DIGIT_TAG: | |
| 3693 { | |
| 3694 /* | |
| 3695 We do a check to see if we want to collate digits as numbers; if
so we generate | |
| 3696 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. | |
| 3697 */ | |
| 3698 uint32_t i; /* general counter */ | |
| 3699 | |
| 3700 if (source->coll->numericCollation == UCOL_ON){ | |
| 3701 uint32_t digIndx = 0; | |
| 3702 uint32_t endIndex = 0; | |
| 3703 uint32_t leadingZeroIndex = 0; | |
| 3704 uint32_t trailingZeroCount = 0; | |
| 3705 | |
| 3706 uint8_t collateVal = 0; | |
| 3707 | |
| 3708 UBool nonZeroValReached = FALSE; | |
| 3709 | |
| 3710 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j
ust need a temporary place to store my generated CEs. | |
| 3711 /* | |
| 3712 We parse the source string until we hit a char that's NOT a
digit. | |
| 3713 Use this u_charDigitValue. This might be slow because we hav
e to | |
| 3714 handle surrogates... | |
| 3715 */ | |
| 3716 /* | |
| 3717 We need to break up the digit string into collection element
s of UCOL_MAX_DIGITS_FOR_NUMBER or less, | |
| 3718 with any chunks smaller than that being on the right end of
the digit string - i.e. the first collation | |
| 3719 element we process when going backward. To determine how lon
g that chunk might be, we may need to make | |
| 3720 two passes through the loop that collects digits - one to se
e how long the string is (and how much is | |
| 3721 leading zeros) to determine the length of that right-hand ch
unk, and a second (if the whole string has | |
| 3722 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits
) to actually process that collation | |
| 3723 element chunk after resetting the state to the initialState
at the right side of the digit string. | |
| 3724 */ | |
| 3725 uint32_t ceLimit = 0; | |
| 3726 UChar initial_ch = ch; | |
| 3727 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; | |
| 3728 backupState(source, &initialState); | |
| 3729 | |
| 3730 for(;;) { | |
| 3731 collIterateState state = {0,0,0,0,0,0,0,0,0}; | |
| 3732 UChar32 char32 = 0; | |
| 3733 int32_t digVal = 0; | |
| 3734 | |
| 3735 if (U16_IS_TRAIL (ch)) { | |
| 3736 if (!collIter_bos(source)){ | |
| 3737 UChar lead = getPrevNormalizedChar(source, statu
s); | |
| 3738 if(U16_IS_LEAD(lead)) { | |
| 3739 char32 = U16_GET_SUPPLEMENTARY(lead,ch); | |
| 3740 goBackOne(source); | |
| 3741 } else { | |
| 3742 char32 = ch; | |
| 3743 } | |
| 3744 } else { | |
| 3745 char32 = ch; | |
| 3746 } | |
| 3747 } else { | |
| 3748 char32 = ch; | |
| 3749 } | |
| 3750 digVal = u_charDigitValue(char32); | |
| 3751 | |
| 3752 for(;;) { | |
| 3753 // Make sure we have enough space. No longer needed; | |
| 3754 // at this point the largest value of digIndx when w
e need to save data in numTempBuf | |
| 3755 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-
incremented) so we just ensure | |
| 3756 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO
R_NUMBER/2 + 2). | |
| 3757 | |
| 3758 // Skip over trailing zeroes, and keep a count of th
em. | |
| 3759 if (digVal != 0) | |
| 3760 nonZeroValReached = TRUE; | |
| 3761 | |
| 3762 if (nonZeroValReached) { | |
| 3763 /* | |
| 3764 We parse the digit string into base 100 numbers
(this fits into a byte). | |
| 3765 We only add to the buffer in twos, thus if we ar
e parsing an odd character, | |
| 3766 that serves as the 'tens' digit while the if we
are parsing an even one, that | |
| 3767 is the 'ones' digit. We dumped the parsed base 1
00 value (collateVal) into | |
| 3768 a buffer. We multiply each collateVal by 2 (to g
ive us room) and add 5 (to avoid | |
| 3769 overlapping magic CE byte values). The last byte
we subtract 1 to ensure it is less | |
| 3770 than all the other bytes. | |
| 3771 | |
| 3772 Since we're doing in this reverse we want to put
the first digit encountered into the | |
| 3773 ones place and the second digit encountered into
the tens place. | |
| 3774 */ | |
| 3775 | |
| 3776 if ((digIndx + trailingZeroCount) % 2 == 1) { | |
| 3777 // High-order digit case (tens place) | |
| 3778 collateVal += (uint8_t)(digVal * 10); | |
| 3779 | |
| 3780 // We cannot set leadingZeroIndex unless it
has been set for the | |
| 3781 // low-order digit. Therefore, all we can do
for the high-order | |
| 3782 // digit is turn it off, never on. | |
| 3783 // The only time we will have a high digit w
ithout a low is for | |
| 3784 // the very first non-zero digit, so no zero
check is necessary. | |
| 3785 if (collateVal != 0) | |
| 3786 leadingZeroIndex = 0; | |
| 3787 | |
| 3788 // The first pass through, digIndx may excee
d the limit, but in that case | |
| 3789 // we no longer care about numTempBuf conten
ts since they will be discarded | |
| 3790 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER )
{ | |
| 3791 numTempBuf[(digIndx/2) + 2] = collateVal
*2 + 6; | |
| 3792 } | |
| 3793 collateVal = 0; | |
| 3794 } else { | |
| 3795 // Low-order digit case (ones place) | |
| 3796 collateVal = (uint8_t)digVal; | |
| 3797 | |
| 3798 // Check for leading zeroes. | |
| 3799 if (collateVal == 0) { | |
| 3800 if (!leadingZeroIndex) | |
| 3801 leadingZeroIndex = (digIndx/2) + 2; | |
| 3802 } else | |
| 3803 leadingZeroIndex = 0; | |
| 3804 | |
| 3805 // No need to write to buffer; the case of a
last odd digit | |
| 3806 // is handled below. | |
| 3807 } | |
| 3808 ++digIndx; | |
| 3809 } else | |
| 3810 ++trailingZeroCount; | |
| 3811 | |
| 3812 if (!collIter_bos(source)) { | |
| 3813 ch = getPrevNormalizedChar(source, status); | |
| 3814 //goBackOne(source); | |
| 3815 if (U16_IS_TRAIL(ch)) { | |
| 3816 backupState(source, &state); | |
| 3817 if (!collIter_bos(source)) { | |
| 3818 goBackOne(source); | |
| 3819 UChar lead = getPrevNormalizedChar(sourc
e, status); | |
| 3820 | |
| 3821 if(U16_IS_LEAD(lead)) { | |
| 3822 char32 = U16_GET_SUPPLEMENTARY(lead,
ch); | |
| 3823 } else { | |
| 3824 loadState(source, &state, FALSE); | |
| 3825 char32 = ch; | |
| 3826 } | |
| 3827 } | |
| 3828 } else | |
| 3829 char32 = ch; | |
| 3830 | |
| 3831 if ((digVal = u_charDigitValue(char32)) == -1 ||
(ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { | |
| 3832 if (char32 > 0xFFFF) {// For surrogates. | |
| 3833 loadState(source, &state, FALSE); | |
| 3834 } | |
| 3835 // Don't need to "reverse" the goBackOne cal
l, | |
| 3836 // as this points to the next position to pr
ocess.. | |
| 3837 //if (char32 > 0xFFFF) // For surrogates. | |
| 3838 //getNextNormalizedChar(source); | |
| 3839 break; | |
| 3840 } | |
| 3841 | |
| 3842 goBackOne(source); | |
| 3843 }else | |
| 3844 break; | |
| 3845 } | |
| 3846 | |
| 3847 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N
UMBER) { | |
| 3848 // our collation element is not too big, go ahead an
d finish with it | |
| 3849 break; | |
| 3850 } | |
| 3851 // our digit string is too long for a collation element; | |
| 3852 // set the limit for it, reset the state and begin again | |
| 3853 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT
S_FOR_NUMBER; | |
| 3854 if ( ceLimit == 0 ) { | |
| 3855 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; | |
| 3856 } | |
| 3857 ch = initial_ch; | |
| 3858 loadState(source, &initialState, FALSE); | |
| 3859 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun
t = 0; | |
| 3860 collateVal = 0; | |
| 3861 nonZeroValReached = FALSE; | |
| 3862 } | |
| 3863 | |
| 3864 if (! nonZeroValReached) { | |
| 3865 digIndx = 2; | |
| 3866 trailingZeroCount = 0; | |
| 3867 numTempBuf[2] = 6; | |
| 3868 } | |
| 3869 | |
| 3870 if ((digIndx + trailingZeroCount) % 2 != 0) { | |
| 3871 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; | |
| 3872 digIndx += 1; // The implicit leading zero | |
| 3873 } | |
| 3874 if (trailingZeroCount % 2 != 0) { | |
| 3875 // We had to consume one trailing zero for the low digit | |
| 3876 // of the least significant byte | |
| 3877 digIndx += 1; // The trailing zero not in the expo
nent | |
| 3878 trailingZeroCount -= 1; | |
| 3879 } | |
| 3880 | |
| 3881 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2
) + 2) ; | |
| 3882 | |
| 3883 // Subtract one off of the last byte. Really the first byte
here, but it's reversed... | |
| 3884 numTempBuf[2] -= 1; | |
| 3885 | |
| 3886 /* | |
| 3887 We want to skip over the first two slots in the buffer. The
first slot | |
| 3888 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the | |
| 3889 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
| 3890 The exponent must be adjusted by the number of leading zeroe
s, and the number of | |
| 3891 trailing zeroes. | |
| 3892 */ | |
| 3893 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
| 3894 uint32_t exponent = (digIndx+trailingZeroCount)/2; | |
| 3895 if (leadingZeroIndex) | |
| 3896 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); | |
| 3897 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); | |
| 3898 | |
| 3899 // Now transfer the collation key to our collIterate struct. | |
| 3900 // The total size for our collation key is half of endIndex,
rounded up. | |
| 3901 int32_t size = (endIndex+1)/2; | |
| 3902 if(!ensureCEsCapacity(source, size)) { | |
| 3903 return (uint32_t)UCOL_NULLORDER; | |
| 3904 } | |
| 3905 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1])
<< UCOL_PRIMARYORDERSHIFT) | //Primary weight | |
| 3906 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight | |
| 3907 UCOL_BYTE_COMMON; // Tertiary weight. | |
| 3908 i = endIndex - 1; // Reset the index into the buffer. | |
| 3909 while(i >= 2) { | |
| 3910 uint32_t primWeight = numTempBuf[i--] << 8; | |
| 3911 if ( i >= 2) | |
| 3912 primWeight |= numTempBuf[i--]; | |
| 3913 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; | |
| 3914 } | |
| 3915 | |
| 3916 source->toReturn = source->CEpos -1; | |
| 3917 return *(source->toReturn); | |
| 3918 } else { | |
| 3919 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
| 3920 CE = *(CEOffset++); | |
| 3921 break; | |
| 3922 } | |
| 3923 } | |
| 3924 | |
| 3925 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ | |
| 3926 { | |
| 3927 static const uint32_t | |
| 3928 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; | |
| 3929 //const uint32_t LCount = 19; | |
| 3930 static const uint32_t VCount = 21; | |
| 3931 static const uint32_t TCount = 28; | |
| 3932 //const uint32_t NCount = VCount * TCount; /* 588 */ | |
| 3933 //const uint32_t SCount = LCount * NCount; /* 11172 */ | |
| 3934 | |
| 3935 uint32_t L = ch - SBase; | |
| 3936 /* | |
| 3937 divide into pieces. | |
| 3938 we do it in this order since some compilers can do % and / in on
e | |
| 3939 operation | |
| 3940 */ | |
| 3941 uint32_t T = L % TCount; | |
| 3942 L /= TCount; | |
| 3943 uint32_t V = L % VCount; | |
| 3944 L /= VCount; | |
| 3945 | |
| 3946 /* offset them */ | |
| 3947 L += LBase; | |
| 3948 V += VBase; | |
| 3949 T += TBase; | |
| 3950 | |
| 3951 int32_t firstOffset = (int32_t)(source->pos - source->string); | |
| 3952 source->appendOffset(firstOffset, *status); | |
| 3953 | |
| 3954 /* | |
| 3955 * return the first CE, but first put the rest into the expansio
n buffer | |
| 3956 */ | |
| 3957 if (!source->coll->image->jamoSpecial) { | |
| 3958 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L
); | |
| 3959 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); | |
| 3960 source->appendOffset(firstOffset + 1, *status); | |
| 3961 | |
| 3962 if (T != TBase) { | |
| 3963 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); | |
| 3964 source->appendOffset(firstOffset + 1, *status); | |
| 3965 } | |
| 3966 | |
| 3967 source->toReturn = source->CEpos - 1; | |
| 3968 | |
| 3969 source->offsetReturn = source->offsetStore - 1; | |
| 3970 if (source->offsetReturn == source->offsetBuffer) { | |
| 3971 source->offsetStore = source->offsetBuffer; | |
| 3972 } | |
| 3973 | |
| 3974 return *(source->toReturn); | |
| 3975 } else { | |
| 3976 // Since Hanguls pass the FCD check, it is | |
| 3977 // guaranteed that we won't be in | |
| 3978 // the normalization buffer if something like this happens | |
| 3979 | |
| 3980 // Move Jamos into normalization buffer | |
| 3981 UChar *tempbuffer = source->writableBuffer.getBuffer(5); | |
| 3982 int32_t tempbufferLength, jamoOffset; | |
| 3983 tempbuffer[0] = 0; | |
| 3984 tempbuffer[1] = (UChar)L; | |
| 3985 tempbuffer[2] = (UChar)V; | |
| 3986 if (T != TBase) { | |
| 3987 tempbuffer[3] = (UChar)T; | |
| 3988 tempbufferLength = 4; | |
| 3989 } else { | |
| 3990 tempbufferLength = 3; | |
| 3991 } | |
| 3992 source->writableBuffer.releaseBuffer(tempbufferLength); | |
| 3993 | |
| 3994 // Indicate where to continue in main input string after exh
austing the writableBuffer | |
| 3995 if (source->pos == source->string) { | |
| 3996 jamoOffset = 0; | |
| 3997 source->fcdPosition = NULL; | |
| 3998 } else { | |
| 3999 jamoOffset = source->pos - source->string; | |
| 4000 source->fcdPosition = source->pos-1; | |
| 4001 } | |
| 4002 | |
| 4003 // Append offsets for the additional chars | |
| 4004 // (not the 0, and not the L whose offsets match the origina
l Hangul) | |
| 4005 int32_t jamoRemaining = tempbufferLength - 2; | |
| 4006 jamoOffset++; // appended offsets should match end of origin
al Hangul | |
| 4007 while (jamoRemaining-- > 0) { | |
| 4008 source->appendOffset(jamoOffset, *status); | |
| 4009 } | |
| 4010 | |
| 4011 source->offsetRepeatValue = jamoOffset; | |
| 4012 | |
| 4013 source->offsetReturn = source->offsetStore - 1; | |
| 4014 if (source->offsetReturn == source->offsetBuffer) { | |
| 4015 source->offsetStore = source->offsetBuffer; | |
| 4016 } | |
| 4017 | |
| 4018 source->pos = source->writableBuffer.getTermin
atedBuffer() + tempbufferLength; | |
| 4019 source->origFlags = source->flags; | |
| 4020 source->flags |= UCOL_ITER_INNORMBUF; | |
| 4021 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HAS
LEN); | |
| 4022 | |
| 4023 return(UCOL_IGNORABLE); | |
| 4024 } | |
| 4025 } | |
| 4026 | |
| 4027 case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
| 4028 return getPrevImplicit(ch, source); | |
| 4029 | |
| 4030 // TODO: Remove CJK implicits as they are handled by the getImplicit
Primary function | |
| 4031 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ | |
| 4032 return getPrevImplicit(ch, source); | |
| 4033 | |
| 4034 case SURROGATE_TAG: /* This is a surrogate pair */ | |
| 4035 /* essentially an engaged lead surrogate. */ | |
| 4036 /* if you have encountered it here, it means that a */ | |
| 4037 /* broken sequence was encountered and this is an error */ | |
| 4038 return UCOL_NOT_FOUND; | |
| 4039 | |
| 4040 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
| 4041 return UCOL_NOT_FOUND; /* broken surrogate sequence */ | |
| 4042 | |
| 4043 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
| 4044 { | |
| 4045 UChar32 cp = 0; | |
| 4046 UChar prevChar; | |
| 4047 const UChar *prev; | |
| 4048 if (isAtStartPrevIterate(source)) { | |
| 4049 /* we are at the start of the string, wrong place to be at *
/ | |
| 4050 return UCOL_NOT_FOUND; | |
| 4051 } | |
| 4052 if (source->pos != source->writableBuffer.getBuffer()) { | |
| 4053 prev = source->pos - 1; | |
| 4054 } else { | |
| 4055 prev = source->fcdPosition; | |
| 4056 } | |
| 4057 prevChar = *prev; | |
| 4058 | |
| 4059 /* Handles Han and Supplementary characters here.*/ | |
| 4060 if (U16_IS_LEAD(prevChar)) { | |
| 4061 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<
10UL)+0xdc00-0x10000)); | |
| 4062 source->pos = prev; | |
| 4063 } else { | |
| 4064 return UCOL_NOT_FOUND; /* like unassigned */ | |
| 4065 } | |
| 4066 | |
| 4067 return getPrevImplicit(cp, source); | |
| 4068 } | |
| 4069 | |
| 4070 /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
| 4071 /* not yet implemented */ | |
| 4072 case CHARSET_TAG: /* this tag always returns */ | |
| 4073 /* probably after 1.8 */ | |
| 4074 return UCOL_NOT_FOUND; | |
| 4075 | |
| 4076 default: /* this tag always returns */ | |
| 4077 *status = U_INTERNAL_PROGRAM_ERROR; | |
| 4078 CE=0; | |
| 4079 break; | |
| 4080 } | |
| 4081 | |
| 4082 if (CE <= UCOL_NOT_FOUND) { | |
| 4083 break; | |
| 4084 } | |
| 4085 } | |
| 4086 | |
| 4087 return CE; | |
| 4088 } | |
| 4089 | |
| 4090 /* This should really be a macro
*/ | |
| 4091 /* This function is used to reverse parts of a buffer. We need this operation wh
en doing continuation */ | |
| 4092 /* secondaries in French
*/ | |
| 4093 /* | |
| 4094 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { | |
| 4095 uint8_t temp; | |
| 4096 while(start<end) { | |
| 4097 temp = *start; | |
| 4098 *start++ = *end; | |
| 4099 *end-- = temp; | |
| 4100 } | |
| 4101 } | |
| 4102 */ | |
| 4103 | |
/*
 * Reverses, in place, the elements between two pointers (both inclusive).
 *   TYPE  - element type the pointers refer to
 *   start - modifiable pointer to the first element; advanced while swapping
 *   end   - modifiable pointer to the last element; moved back while swapping
 * Both pointer arguments are evaluated several times and are modified: when
 * the macro finishes they have met (or crossed) in the middle of the range.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE heldValue = *(start); \
        *(start)++ = *(end); \
        *(end)-- = heldValue; \
    } \
}
| 4112 | |
| 4113 /****************************************************************************/ | |
| 4114 /* Following are the sortkey generation functions */ | |
| 4115 /* */ | |
| 4116 /****************************************************************************/ | |
| 4117 | |
| 4118 U_CAPI int32_t U_EXPORT2 | 113 U_CAPI int32_t U_EXPORT2 |
| 4119 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, | 114 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, |
| 4120 const uint8_t *src2, int32_t src2Length, | 115 const uint8_t *src2, int32_t src2Length, |
| 4121 uint8_t *dest, int32_t destCapacity) { | 116 uint8_t *dest, int32_t destCapacity) { |
| 4122 /* check arguments */ | 117 /* check arguments */ |
| 4123 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr
c1Length-1]!=0) || | 118 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr
c1Length-1]!=0) || |
| 4124 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr
c2Length-1]!=0) || | 119 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr
c2Length-1]!=0) || |
| 4125 destCapacity<0 || (destCapacity>0 && dest==NULL) | 120 destCapacity<0 || (destCapacity>0 && dest==NULL) |
| 4126 ) { | 121 ) { |
| 4127 /* error, attempt to write a zero byte and return 0 */ | 122 /* error, attempt to write a zero byte and return 0 */ |
| (...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4183 /* src1 is not finished, therefore *src2==0, and src1 is appended */ | 178 /* src1 is not finished, therefore *src2==0, and src1 is appended */ |
| 4184 src2=src1; | 179 src2=src1; |
| 4185 } | 180 } |
| 4186 /* append src2, "the other, unfinished sort key" */ | 181 /* append src2, "the other, unfinished sort key" */ |
| 4187 while((*p++=*src2++)!=0) {} | 182 while((*p++=*src2++)!=0) {} |
| 4188 | 183 |
| 4189 /* the actual length might be less than destLength if either sort key contai
ned illegally embedded zero bytes */ | 184 /* the actual length might be less than destLength if either sort key contai
ned illegally embedded zero bytes */ |
| 4190 return (int32_t)(p-dest); | 185 return (int32_t)(p-dest); |
| 4191 } | 186 } |
| 4192 | 187 |
| 4193 U_NAMESPACE_BEGIN | |
| 4194 | |
| 4195 class SortKeyByteSink : public ByteSink { | |
| 4196 public: | |
| 4197 SortKeyByteSink(char *dest, int32_t destCapacity) | |
| 4198 : buffer_(dest), capacity_(destCapacity), | |
| 4199 appended_(0) { | |
| 4200 if (buffer_ == NULL) { | |
| 4201 capacity_ = 0; | |
| 4202 } else if(capacity_ < 0) { | |
| 4203 buffer_ = NULL; | |
| 4204 capacity_ = 0; | |
| 4205 } | |
| 4206 } | |
| 4207 virtual ~SortKeyByteSink(); | |
| 4208 | |
| 4209 virtual void Append(const char *bytes, int32_t n); | |
| 4210 void Append(uint32_t b) { | |
| 4211 if (appended_ < capacity_ || Resize(1, appended_)) { | |
| 4212 buffer_[appended_] = (char)b; | |
| 4213 } | |
| 4214 ++appended_; | |
| 4215 } | |
| 4216 void Append(uint32_t b1, uint32_t b2) { | |
| 4217 int32_t a2 = appended_ + 2; | |
| 4218 if (a2 <= capacity_ || Resize(2, appended_)) { | |
| 4219 buffer_[appended_] = (char)b1; | |
| 4220 buffer_[appended_ + 1] = (char)b2; | |
| 4221 } else if(appended_ < capacity_) { | |
| 4222 buffer_[appended_] = (char)b1; | |
| 4223 } | |
| 4224 appended_ = a2; | |
| 4225 } | |
| 4226 virtual char *GetAppendBuffer(int32_t min_capacity, | |
| 4227 int32_t desired_capacity_hint, | |
| 4228 char *scratch, int32_t scratch_capacity, | |
| 4229 int32_t *result_capacity); | |
| 4230 int32_t NumberOfBytesAppended() const { return appended_; } | |
| 4231 /** @return FALSE if memory allocation failed */ | |
| 4232 UBool IsOk() const { return buffer_ != NULL; } | |
| 4233 | |
| 4234 protected: | |
| 4235 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th) = 0; | |
| 4236 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; | |
| 4237 | |
| 4238 void SetNotOk() { | |
| 4239 buffer_ = NULL; | |
| 4240 capacity_ = 0; | |
| 4241 } | |
| 4242 | |
| 4243 char *buffer_; | |
| 4244 int32_t capacity_; | |
| 4245 int32_t appended_; | |
| 4246 | |
| 4247 private: | |
| 4248 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemente
d | |
| 4249 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator
not implemented | |
| 4250 }; | |
| 4251 | |
| 4252 SortKeyByteSink::~SortKeyByteSink() {} | |
| 4253 | |
| 4254 void | |
| 4255 SortKeyByteSink::Append(const char *bytes, int32_t n) { | |
| 4256 if (n <= 0 || bytes == NULL) { | |
| 4257 return; | |
| 4258 } | |
| 4259 int32_t length = appended_; | |
| 4260 appended_ += n; | |
| 4261 if ((buffer_ + length) == bytes) { | |
| 4262 return; // the caller used GetAppendBuffer() and wrote the bytes alread
y | |
| 4263 } | |
| 4264 int32_t available = capacity_ - length; | |
| 4265 if (n <= available) { | |
| 4266 uprv_memcpy(buffer_ + length, bytes, n); | |
| 4267 } else { | |
| 4268 AppendBeyondCapacity(bytes, n, length); | |
| 4269 } | |
| 4270 } | |
| 4271 | |
| 4272 char * | |
| 4273 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, | |
| 4274 int32_t desired_capacity_hint, | |
| 4275 char *scratch, | |
| 4276 int32_t scratch_capacity, | |
| 4277 int32_t *result_capacity) { | |
| 4278 if (min_capacity < 1 || scratch_capacity < min_capacity) { | |
| 4279 *result_capacity = 0; | |
| 4280 return NULL; | |
| 4281 } | |
| 4282 int32_t available = capacity_ - appended_; | |
| 4283 if (available >= min_capacity) { | |
| 4284 *result_capacity = available; | |
| 4285 return buffer_ + appended_; | |
| 4286 } else if (Resize(desired_capacity_hint, appended_)) { | |
| 4287 *result_capacity = capacity_ - appended_; | |
| 4288 return buffer_ + appended_; | |
| 4289 } else { | |
| 4290 *result_capacity = scratch_capacity; | |
| 4291 return scratch; | |
| 4292 } | |
| 4293 } | |
| 4294 | |
| 4295 class FixedSortKeyByteSink : public SortKeyByteSink { | |
| 4296 public: | |
| 4297 FixedSortKeyByteSink(char *dest, int32_t destCapacity) | |
| 4298 : SortKeyByteSink(dest, destCapacity) {} | |
| 4299 virtual ~FixedSortKeyByteSink(); | |
| 4300 | |
| 4301 private: | |
| 4302 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th); | |
| 4303 virtual UBool Resize(int32_t appendCapacity, int32_t length); | |
| 4304 }; | |
| 4305 | |
| 4306 FixedSortKeyByteSink::~FixedSortKeyByteSink() {} | |
| 4307 | |
| 4308 void | |
| 4309 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int
32_t length) { | |
| 4310 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ | |
| 4311 // Fill the buffer completely. | |
| 4312 int32_t available = capacity_ - length; | |
| 4313 if (available > 0) { | |
| 4314 uprv_memcpy(buffer_ + length, bytes, available); | |
| 4315 } | |
| 4316 } | |
| 4317 | |
| 4318 UBool | |
| 4319 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { | |
| 4320 return FALSE; | |
| 4321 } | |
| 4322 | |
| 4323 class CollationKeyByteSink : public SortKeyByteSink { | |
| 4324 public: | |
| 4325 CollationKeyByteSink(CollationKey &key) | |
| 4326 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getC
apacity()), | |
| 4327 key_(key) {} | |
| 4328 virtual ~CollationKeyByteSink(); | |
| 4329 | |
| 4330 private: | |
| 4331 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th); | |
| 4332 virtual UBool Resize(int32_t appendCapacity, int32_t length); | |
| 4333 | |
| 4334 CollationKey &key_; | |
| 4335 }; | |
| 4336 | |
| 4337 CollationKeyByteSink::~CollationKeyByteSink() {} | |
| 4338 | |
| 4339 void | |
| 4340 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t
length) { | |
| 4341 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ | |
| 4342 if (Resize(n, length)) { | |
| 4343 uprv_memcpy(buffer_ + length, bytes, n); | |
| 4344 } | |
| 4345 } | |
| 4346 | |
| 4347 UBool | |
| 4348 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { | |
| 4349 if (buffer_ == NULL) { | |
| 4350 return FALSE; // allocation failed before already | |
| 4351 } | |
| 4352 int32_t newCapacity = 2 * capacity_; | |
| 4353 int32_t altCapacity = length + 2 * appendCapacity; | |
| 4354 if (newCapacity < altCapacity) { | |
| 4355 newCapacity = altCapacity; | |
| 4356 } | |
| 4357 if (newCapacity < 200) { | |
| 4358 newCapacity = 200; | |
| 4359 } | |
| 4360 uint8_t *newBuffer = key_.reallocate(newCapacity, length); | |
| 4361 if (newBuffer == NULL) { | |
| 4362 SetNotOk(); | |
| 4363 return FALSE; | |
| 4364 } | |
| 4365 buffer_ = reinterpret_cast<char *>(newBuffer); | |
| 4366 capacity_ = newCapacity; | |
| 4367 return TRUE; | |
| 4368 } | |
| 4369 | |
| 4370 /** | |
| 4371 * uint8_t byte buffer, similar to CharString but simpler. | |
| 4372 */ | |
| 4373 class SortKeyLevel : public UMemory { | |
| 4374 public: | |
| 4375 SortKeyLevel() : len(0), ok(TRUE) {} | |
| 4376 ~SortKeyLevel() {} | |
| 4377 | |
| 4378 /** @return FALSE if memory allocation failed */ | |
| 4379 UBool isOk() const { return ok; } | |
| 4380 UBool isEmpty() const { return len == 0; } | |
| 4381 int32_t length() const { return len; } | |
| 4382 const uint8_t *data() const { return buffer.getAlias(); } | |
| 4383 uint8_t operator[](int32_t index) const { return buffer[index]; } | |
| 4384 | |
| 4385 void appendByte(uint32_t b); | |
| 4386 | |
| 4387 void appendTo(ByteSink &sink) const { | |
| 4388 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); | |
| 4389 } | |
| 4390 | |
| 4391 uint8_t &lastByte() { | |
| 4392 U_ASSERT(len > 0); | |
| 4393 return buffer[len - 1]; | |
| 4394 } | |
| 4395 | |
| 4396 uint8_t *getLastFewBytes(int32_t n) { | |
| 4397 if (ok && len >= n) { | |
| 4398 return buffer.getAlias() + len - n; | |
| 4399 } else { | |
| 4400 return NULL; | |
| 4401 } | |
| 4402 } | |
| 4403 | |
| 4404 private: | |
| 4405 MaybeStackArray<uint8_t, 40> buffer; | |
| 4406 int32_t len; | |
| 4407 UBool ok; | |
| 4408 | |
| 4409 UBool ensureCapacity(int32_t appendCapacity); | |
| 4410 | |
| 4411 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class | |
| 4412 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of thi
s class | |
| 4413 }; | |
| 4414 | |
| 4415 void SortKeyLevel::appendByte(uint32_t b) { | |
| 4416 if(len < buffer.getCapacity() || ensureCapacity(1)) { | |
| 4417 buffer[len++] = (uint8_t)b; | |
| 4418 } | |
| 4419 } | |
| 4420 | |
| 4421 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { | |
| 4422 if(!ok) { | |
| 4423 return FALSE; | |
| 4424 } | |
| 4425 int32_t newCapacity = 2 * buffer.getCapacity(); | |
| 4426 int32_t altCapacity = len + 2 * appendCapacity; | |
| 4427 if (newCapacity < altCapacity) { | |
| 4428 newCapacity = altCapacity; | |
| 4429 } | |
| 4430 if (newCapacity < 200) { | |
| 4431 newCapacity = 200; | |
| 4432 } | |
| 4433 if(buffer.resize(newCapacity, len)==NULL) { | |
| 4434 return ok = FALSE; | |
| 4435 } | |
| 4436 return TRUE; | |
| 4437 } | |
| 4438 | |
| 4439 U_NAMESPACE_END | |
| 4440 | |
| 4441 /* sortkey API */ | |
| 4442 U_CAPI int32_t U_EXPORT2 | 188 U_CAPI int32_t U_EXPORT2 |
| 4443 ucol_getSortKey(const UCollator *coll, | 189 ucol_getSortKey(const UCollator *coll, |
| 4444 const UChar *source, | 190 const UChar *source, |
| 4445 int32_t sourceLength, | 191 int32_t sourceLength, |
| 4446 uint8_t *result, | 192 uint8_t *result, |
| 4447 int32_t resultLength) | 193 int32_t resultLength) |
| 4448 { | 194 { |
| 4449 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); | 195 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); |
| 4450 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 196 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
| 4451 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour
ce, | 197 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour
ce, |
| 4452 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt
h)); | 198 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt
h)); |
| 4453 } | 199 } |
| 4454 | 200 |
| 4455 if(coll->delegate != NULL) { | 201 int32_t keySize = Collator::fromUCollator(coll)-> |
| 4456 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength,
result, resultLength); | 202 getSortKey(source, sourceLength, result, resultLength); |
| 4457 } | 203 |
| 4458 | |
| 4459 UErrorCode status = U_ZERO_ERROR; | |
| 4460 int32_t keySize = 0; | |
| 4461 | |
| 4462 if(source != NULL) { | |
| 4463 // source == NULL is actually an error situation, but we would need to | |
| 4464 // have an error code to return it. Until we introduce a new | |
| 4465 // API, it stays like this | |
| 4466 | |
| 4467 /* this uses the function pointer that is set in updateinternalstate */ | |
| 4468 /* currently, there are two funcs: */ | |
| 4469 /*ucol_calcSortKey(...);*/ | |
| 4470 /*ucol_calcSortKeySimpleTertiary(...);*/ | |
| 4471 | |
| 4472 uint8_t noDest[1] = { 0 }; | |
| 4473 if(result == NULL) { | |
| 4474 // Distinguish pure preflighting from an allocation error. | |
| 4475 result = noDest; | |
| 4476 resultLength = 0; | |
| 4477 } | |
| 4478 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength
); | |
| 4479 coll->sortKeyGen(coll, source, sourceLength, sink, &status); | |
| 4480 if(U_SUCCESS(status)) { | |
| 4481 keySize = sink.NumberOfBytesAppended(); | |
| 4482 } | |
| 4483 } | |
| 4484 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); | 204 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); |
| 4485 UTRACE_EXIT_STATUS(status); | 205 UTRACE_EXIT_VALUE(keySize); |
| 4486 return keySize; | 206 return keySize; |
| 4487 } | 207 } |
| 4488 | 208 |
| 4489 U_CFUNC int32_t | |
| 4490 ucol_getCollationKey(const UCollator *coll, | |
| 4491 const UChar *source, int32_t sourceLength, | |
| 4492 CollationKey &key, | |
| 4493 UErrorCode &errorCode) { | |
| 4494 CollationKeyByteSink sink(key); | |
| 4495 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); | |
| 4496 return sink.NumberOfBytesAppended(); | |
| 4497 } | |
| 4498 | |
| 4499 // Is this primary weight compressible? | |
| 4500 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). | |
| 4501 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. | |
| 4502 static inline UBool | |
| 4503 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { | |
| 4504 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegul
arPrimary; | |
| 4505 } | |
| 4506 | |
| 4507 static | |
| 4508 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { | |
| 4509 if (caseShift == 0) { | |
| 4510 cases.appendByte(UCOL_CASE_BYTE_START); | |
| 4511 caseShift = UCOL_CASE_SHIFT_START; | |
| 4512 } | |
| 4513 } | |
| 4514 | |
| 4515 // Packs the secondary buffer when processing French locale. | |
| 4516 static void | |
| 4517 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result)
{ | |
| 4518 secondaries += secsize; // We read the secondary-level bytes back to front. | |
| 4519 uint8_t secondary; | |
| 4520 int32_t count2 = 0; | |
| 4521 int32_t i = 0; | |
| 4522 // we use i here since the key size already accounts for terminators, so we'
ll discard the increment | |
| 4523 for(i = 0; i<secsize; i++) { | |
| 4524 secondary = *(secondaries-i-1); | |
| 4525 /* This is compression code. */ | |
| 4526 if (secondary == UCOL_COMMON2) { | |
| 4527 ++count2; | |
| 4528 } else { | |
| 4529 if (count2 > 0) { | |
| 4530 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
| 4531 while (count2 > UCOL_TOP_COUNT2) { | |
| 4532 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); | |
| 4533 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
| 4534 } | |
| 4535 result.Append(UCOL_COMMON_TOP2 - (count2-1)); | |
| 4536 } else { | |
| 4537 while (count2 > UCOL_BOT_COUNT2) { | |
| 4538 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
| 4539 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 4540 } | |
| 4541 result.Append(UCOL_COMMON_BOT2 + (count2-1)); | |
| 4542 } | |
| 4543 count2 = 0; | |
| 4544 } | |
| 4545 result.Append(secondary); | |
| 4546 } | |
| 4547 } | |
| 4548 if (count2 > 0) { | |
| 4549 while (count2 > UCOL_BOT_COUNT2) { | |
| 4550 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
| 4551 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 4552 } | |
| 4553 result.Append(UCOL_COMMON_BOT2 + (count2-1)); | |
| 4554 } | |
| 4555 } | |
| 4556 | |
| 4557 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 | |
| 4558 | |
| 4559 /* This is the sortkey work horse function */ | |
| 4560 U_CFUNC void U_CALLCONV | |
| 4561 ucol_calcSortKey(const UCollator *coll, | |
| 4562 const UChar *source, | |
| 4563 int32_t sourceLength, | |
| 4564 SortKeyByteSink &result, | |
| 4565 UErrorCode *status) | |
| 4566 { | |
| 4567 if(U_FAILURE(*status)) { | |
| 4568 return; | |
| 4569 } | |
| 4570 | |
| 4571 SortKeyByteSink &primaries = result; | |
| 4572 SortKeyLevel secondaries; | |
| 4573 SortKeyLevel tertiaries; | |
| 4574 SortKeyLevel cases; | |
| 4575 SortKeyLevel quads; | |
| 4576 | |
| 4577 UnicodeString normSource; | |
| 4578 | |
| 4579 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); | |
| 4580 | |
| 4581 UColAttributeValue strength = coll->strength; | |
| 4582 | |
| 4583 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); | |
| 4584 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); | |
| 4585 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); | |
| 4586 UBool compareIdent = (strength == UCOL_IDENTICAL); | |
| 4587 UBool doCase = (coll->caseLevel == UCOL_ON); | |
| 4588 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0)
; | |
| 4589 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
| 4590 //UBool qShifted = shifted && (compareQuad == 0); | |
| 4591 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); | |
| 4592 | |
| 4593 uint32_t variableTopValue = coll->variableTopValue; | |
| 4594 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no | |
| 4595 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. | |
| 4596 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); | |
| 4597 uint8_t UCOL_HIRAGANA_QUAD = 0; | |
| 4598 if(doHiragana) { | |
| 4599 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; | |
| 4600 /* allocate one more space for hiragana, value for hiragana */ | |
| 4601 } | |
| 4602 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); | |
| 4603 | |
| 4604 /* support for special features like caselevel and funky secondaries */ | |
| 4605 int32_t lastSecondaryLength = 0; | |
| 4606 uint32_t caseShift = 0; | |
| 4607 | |
| 4608 /* If we need to normalize, we'll do it all at once at the beginning! */ | |
| 4609 const Normalizer2 *norm2; | |
| 4610 if(compareIdent) { | |
| 4611 norm2 = Normalizer2Factory::getNFDInstance(*status); | |
| 4612 } else if(coll->normalizationMode != UCOL_OFF) { | |
| 4613 norm2 = Normalizer2Factory::getFCDInstance(*status); | |
| 4614 } else { | |
| 4615 norm2 = NULL; | |
| 4616 } | |
| 4617 if(norm2 != NULL) { | |
| 4618 normSource.setTo(FALSE, source, len); | |
| 4619 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); | |
| 4620 if(qcYesLength != len) { | |
| 4621 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); | |
| 4622 normSource.truncate(qcYesLength); | |
| 4623 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); | |
| 4624 source = normSource.getBuffer(); | |
| 4625 len = normSource.length(); | |
| 4626 } | |
| 4627 } | |
| 4628 collIterate s; | |
| 4629 IInit_collIterate(coll, source, len, &s, status); | |
| 4630 if(U_FAILURE(*status)) { | |
| 4631 return; | |
| 4632 } | |
| 4633 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. | |
| 4634 | |
| 4635 uint32_t order = 0; | |
| 4636 | |
| 4637 uint8_t primary1 = 0; | |
| 4638 uint8_t primary2 = 0; | |
| 4639 uint8_t secondary = 0; | |
| 4640 uint8_t tertiary = 0; | |
| 4641 uint8_t caseSwitch = coll->caseSwitch; | |
| 4642 uint8_t tertiaryMask = coll->tertiaryMask; | |
| 4643 int8_t tertiaryAddition = coll->tertiaryAddition; | |
| 4644 uint8_t tertiaryTop = coll->tertiaryTop; | |
| 4645 uint8_t tertiaryBottom = coll->tertiaryBottom; | |
| 4646 uint8_t tertiaryCommon = coll->tertiaryCommon; | |
| 4647 uint8_t caseBits = 0; | |
| 4648 | |
| 4649 UBool wasShifted = FALSE; | |
| 4650 UBool notIsContinuation = FALSE; | |
| 4651 | |
| 4652 uint32_t count2 = 0, count3 = 0, count4 = 0; | |
| 4653 uint8_t leadPrimary = 0; | |
| 4654 | |
| 4655 for(;;) { | |
| 4656 order = ucol_IGetNextCE(coll, &s, status); | |
| 4657 if(order == UCOL_NO_MORE_CES) { | |
| 4658 break; | |
| 4659 } | |
| 4660 | |
| 4661 if(order == 0) { | |
| 4662 continue; | |
| 4663 } | |
| 4664 | |
| 4665 notIsContinuation = !isContinuation(order); | |
| 4666 | |
| 4667 if(notIsContinuation) { | |
| 4668 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); | |
| 4669 } else { | |
| 4670 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
| 4671 } | |
| 4672 | |
| 4673 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 4674 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 4675 primary1 = (uint8_t)(order >> 8); | |
| 4676 | |
| 4677 uint8_t originalPrimary1 = primary1; | |
| 4678 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { | |
| 4679 primary1 = coll->leadBytePermutationTable[primary1]; | |
| 4680 } | |
| 4681 | |
| 4682 if((shifted && ((notIsContinuation && order <= variableTopValue && prima
ry1 > 0) | |
| 4683 || (!notIsContinuation && wasShifted))) | |
| 4684 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that
primary ignorables */ | |
| 4685 { | |
| 4686 /* and other ignorables should be removed if following a shifted cod
e point */ | |
| 4687 if(primary1 == 0) { /* if we were shifted and we got an ignorable co
de point */ | |
| 4688 /* we should just completely ignore it */ | |
| 4689 continue; | |
| 4690 } | |
| 4691 if(compareQuad == 0) { | |
| 4692 if(count4 > 0) { | |
| 4693 while (count4 > UCOL_BOT_COUNT4) { | |
| 4694 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
| 4695 count4 -= UCOL_BOT_COUNT4; | |
| 4696 } | |
| 4697 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
| 4698 count4 = 0; | |
| 4699 } | |
| 4700 /* We are dealing with a variable and we're treating them as shi
fted */ | |
| 4701 /* This is a shifted ignorable */ | |
| 4702 if(primary1 != 0) { /* we need to check this since we could be i
n continuation */ | |
| 4703 quads.appendByte(primary1); | |
| 4704 } | |
| 4705 if(primary2 != 0) { | |
| 4706 quads.appendByte(primary2); | |
| 4707 } | |
| 4708 } | |
| 4709 wasShifted = TRUE; | |
| 4710 } else { | |
| 4711 wasShifted = FALSE; | |
| 4712 /* Note: This code assumes that the table is well built i.e. not hav
ing 0 bytes where they are not supposed to be. */ | |
| 4713 /* Usually, we'll have non-zero primary1 & primary2, except in cases
of a-z and friends, when primary2 will */ | |
| 4714 /* regular and simple sortkey calc */ | |
| 4715 if(primary1 != UCOL_IGNORABLE) { | |
| 4716 if(notIsContinuation) { | |
| 4717 if(leadPrimary == primary1) { | |
| 4718 primaries.Append(primary2); | |
| 4719 } else { | |
| 4720 if(leadPrimary != 0) { | |
| 4721 primaries.Append((primary1 > leadPrimary) ? UCOL_BYT
E_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
| 4722 } | |
| 4723 if(primary2 == UCOL_IGNORABLE) { | |
| 4724 /* one byter, not compressed */ | |
| 4725 primaries.Append(primary1); | |
| 4726 leadPrimary = 0; | |
| 4727 } else if(isCompressible(coll, originalPrimary1)) { | |
| 4728 /* compress */ | |
| 4729 primaries.Append(leadPrimary = primary1, primary2); | |
| 4730 } else { | |
| 4731 leadPrimary = 0; | |
| 4732 primaries.Append(primary1, primary2); | |
| 4733 } | |
| 4734 } | |
| 4735 } else { /* we are in continuation, so we're gonna add primary t
o the key don't care about compression */ | |
| 4736 if(primary2 == UCOL_IGNORABLE) { | |
| 4737 primaries.Append(primary1); | |
| 4738 } else { | |
| 4739 primaries.Append(primary1, primary2); | |
| 4740 } | |
| 4741 } | |
| 4742 } | |
| 4743 | |
| 4744 if(secondary > compareSec) { | |
| 4745 if(!isFrenchSec) { | |
| 4746 /* This is compression code. */ | |
| 4747 if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
| 4748 ++count2; | |
| 4749 } else { | |
| 4750 if (count2 > 0) { | |
| 4751 if (secondary > UCOL_COMMON2) { // not necessary for
4th level. | |
| 4752 while (count2 > UCOL_TOP_COUNT2) { | |
| 4753 secondaries.appendByte(UCOL_COMMON_TOP2 - UC
OL_TOP_COUNT2); | |
| 4754 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
| 4755 } | |
| 4756 secondaries.appendByte(UCOL_COMMON_TOP2 - (count
2-1)); | |
| 4757 } else { | |
| 4758 while (count2 > UCOL_BOT_COUNT2) { | |
| 4759 secondaries.appendByte(UCOL_COMMON_BOT2 + UC
OL_BOT_COUNT2); | |
| 4760 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 4761 } | |
| 4762 secondaries.appendByte(UCOL_COMMON_BOT2 + (count
2-1)); | |
| 4763 } | |
| 4764 count2 = 0; | |
| 4765 } | |
| 4766 secondaries.appendByte(secondary); | |
| 4767 } | |
| 4768 } else { | |
| 4769 /* Do the special handling for French secondaries */ | |
| 4770 /* We need to get continuation elements and do intermediate
restore */ | |
| 4771 /* abc1c2c3de with french secondaries need to be edc1c2c3ba
NOT edc3c2c1ba */ | |
| 4772 if(notIsContinuation) { | |
| 4773 if (lastSecondaryLength > 1) { | |
| 4774 uint8_t *frenchStartPtr = secondaries.getLastFewByte
s(lastSecondaryLength); | |
| 4775 if (frenchStartPtr != NULL) { | |
| 4776 /* reverse secondaries from frenchStartPtr up to
frenchEndPtr */ | |
| 4777 uint8_t *frenchEndPtr = frenchStartPtr + lastSec
ondaryLength - 1; | |
| 4778 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr); | |
| 4779 } | |
| 4780 } | |
| 4781 lastSecondaryLength = 1; | |
| 4782 } else { | |
| 4783 ++lastSecondaryLength; | |
| 4784 } | |
| 4785 secondaries.appendByte(secondary); | |
| 4786 } | |
| 4787 } | |
| 4788 | |
| 4789 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { | |
| 4790 // do the case level if we need to do it. We don't want to calcu
late | |
| 4791 // case level for primary ignorables if we have only primary str
ength and case level | |
| 4792 // otherwise we would break well formedness of CEs | |
| 4793 doCaseShift(cases, caseShift); | |
| 4794 if(notIsContinuation) { | |
| 4795 caseBits = (uint8_t)(tertiary & 0xC0); | |
| 4796 | |
| 4797 if(tertiary != 0) { | |
| 4798 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
| 4799 if((caseBits & 0xC0) == 0) { | |
| 4800 cases.lastByte() |= 1 << (--caseShift); | |
| 4801 } else { | |
| 4802 cases.lastByte() |= 0 << (--caseShift); | |
| 4803 /* second bit */ | |
| 4804 doCaseShift(cases, caseShift); | |
| 4805 cases.lastByte() |= ((caseBits>>6)&1) << (--case
Shift); | |
| 4806 } | |
| 4807 } else { | |
| 4808 if((caseBits & 0xC0) == 0) { | |
| 4809 cases.lastByte() |= 0 << (--caseShift); | |
| 4810 } else { | |
| 4811 cases.lastByte() |= 1 << (--caseShift); | |
| 4812 /* second bit */ | |
| 4813 doCaseShift(cases, caseShift); | |
| 4814 cases.lastByte() |= ((caseBits>>7)&1) << (--case
Shift); | |
| 4815 } | |
| 4816 } | |
| 4817 } | |
| 4818 } | |
| 4819 } else { | |
| 4820 if(notIsContinuation) { | |
| 4821 tertiary ^= caseSwitch; | |
| 4822 } | |
| 4823 } | |
| 4824 | |
| 4825 tertiary &= tertiaryMask; | |
| 4826 if(tertiary > compareTer) { | |
| 4827 /* This is compression code. */ | |
| 4828 /* sequence size check is included in the if clause */ | |
| 4829 if (tertiary == tertiaryCommon && notIsContinuation) { | |
| 4830 ++count3; | |
| 4831 } else { | |
| 4832 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMO
N3_NORMAL) { | |
| 4833 tertiary += tertiaryAddition; | |
| 4834 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UC
OL_COMMON3_UPPERFIRST) { | |
| 4835 tertiary -= tertiaryAddition; | |
| 4836 } | |
| 4837 if (count3 > 0) { | |
| 4838 if ((tertiary > tertiaryCommon)) { | |
| 4839 while (count3 > coll->tertiaryTopCount) { | |
| 4840 tertiaries.appendByte(tertiaryTop - coll->tertia
ryTopCount); | |
| 4841 count3 -= (uint32_t)coll->tertiaryTopCount; | |
| 4842 } | |
| 4843 tertiaries.appendByte(tertiaryTop - (count3-1)); | |
| 4844 } else { | |
| 4845 while (count3 > coll->tertiaryBottomCount) { | |
| 4846 tertiaries.appendByte(tertiaryBottom + coll->ter
tiaryBottomCount); | |
| 4847 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
| 4848 } | |
| 4849 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
| 4850 } | |
| 4851 count3 = 0; | |
| 4852 } | |
| 4853 tertiaries.appendByte(tertiary); | |
| 4854 } | |
| 4855 } | |
| 4856 | |
| 4857 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { | |
| 4858 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we ne
ed to note it | |
| 4859 if(count4>0) { // Close this part | |
| 4860 while (count4 > UCOL_BOT_COUNT4) { | |
| 4861 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)
; | |
| 4862 count4 -= UCOL_BOT_COUNT4; | |
| 4863 } | |
| 4864 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
| 4865 count4 = 0; | |
| 4866 } | |
| 4867 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana | |
| 4868 } else { // This wasn't Hiragana, so we can continue adding stuf
f | |
| 4869 count4++; | |
| 4870 } | |
| 4871 } | |
| 4872 } | |
| 4873 } | |
| 4874 | |
| 4875 /* Here, we are generally done with processing */ | |
| 4876 /* bailing out would not be too productive */ | |
| 4877 | |
| 4878 UBool ok = TRUE; | |
| 4879 if(U_SUCCESS(*status)) { | |
| 4880 /* we have done all the CE's, now let's put them together to form a key
*/ | |
| 4881 if(compareSec == 0) { | |
| 4882 if (count2 > 0) { | |
| 4883 while (count2 > UCOL_BOT_COUNT2) { | |
| 4884 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
| 4885 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 4886 } | |
| 4887 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
| 4888 } | |
| 4889 result.Append(UCOL_LEVELTERMINATOR); | |
| 4890 if(!secondaries.isOk()) { | |
| 4891 ok = FALSE; | |
| 4892 } else if(!isFrenchSec) { | |
| 4893 secondaries.appendTo(result); | |
| 4894 } else { | |
| 4895 // If there are any unresolved continuation secondaries, | |
| 4896 // reverse them here so that we can reverse the whole secondary
thing. | |
| 4897 if (lastSecondaryLength > 1) { | |
| 4898 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSe
condaryLength); | |
| 4899 if (frenchStartPtr != NULL) { | |
| 4900 /* reverse secondaries from frenchStartPtr up to frenchE
ndPtr */ | |
| 4901 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLe
ngth - 1; | |
| 4902 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, french
EndPtr); | |
| 4903 } | |
| 4904 } | |
| 4905 packFrench(secondaries.data(), secondaries.length(), result); | |
| 4906 } | |
| 4907 } | |
| 4908 | |
| 4909 if(doCase) { | |
| 4910 ok &= cases.isOk(); | |
| 4911 result.Append(UCOL_LEVELTERMINATOR); | |
| 4912 cases.appendTo(result); | |
| 4913 } | |
| 4914 | |
| 4915 if(compareTer == 0) { | |
| 4916 if (count3 > 0) { | |
| 4917 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { | |
| 4918 while (count3 >= coll->tertiaryTopCount) { | |
| 4919 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCou
nt); | |
| 4920 count3 -= (uint32_t)coll->tertiaryTopCount; | |
| 4921 } | |
| 4922 tertiaries.appendByte(tertiaryTop - count3); | |
| 4923 } else { | |
| 4924 while (count3 > coll->tertiaryBottomCount) { | |
| 4925 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBot
tomCount); | |
| 4926 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
| 4927 } | |
| 4928 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
| 4929 } | |
| 4930 } | |
| 4931 ok &= tertiaries.isOk(); | |
| 4932 result.Append(UCOL_LEVELTERMINATOR); | |
| 4933 tertiaries.appendTo(result); | |
| 4934 | |
| 4935 if(compareQuad == 0/*qShifted == TRUE*/) { | |
| 4936 if(count4 > 0) { | |
| 4937 while (count4 > UCOL_BOT_COUNT4) { | |
| 4938 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
| 4939 count4 -= UCOL_BOT_COUNT4; | |
| 4940 } | |
| 4941 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
| 4942 } | |
| 4943 ok &= quads.isOk(); | |
| 4944 result.Append(UCOL_LEVELTERMINATOR); | |
| 4945 quads.appendTo(result); | |
| 4946 } | |
| 4947 | |
| 4948 if(compareIdent) { | |
| 4949 result.Append(UCOL_LEVELTERMINATOR); | |
| 4950 u_writeIdenticalLevelRun(s.string, len, result); | |
| 4951 } | |
| 4952 } | |
| 4953 result.Append(0); | |
| 4954 } | |
| 4955 | |
| 4956 /* To avoid memory leak, free the offset buffer if necessary. */ | |
| 4957 ucol_freeOffsetBuffer(&s); | |
| 4958 | |
| 4959 ok &= result.IsOk(); | |
| 4960 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } | |
| 4961 } | |
| 4962 | |
| 4963 | |
/*
 * Builds a three-level (primary / secondary / tertiary) sort key for `source`
 * and appends it to `result`.
 *
 * Primary bytes are written straight into `result` (aliased as `primaries`);
 * secondary and tertiary bytes are collected in their own SortKeyLevel buffers
 * and appended afterwards, each preceded by UCOL_LEVELTERMINATOR. The key is
 * closed with a single 0 byte.
 *
 * coll         - collator supplying normalizationMode, the tertiary* settings,
 *                caseSwitch and the optional leadBytePermutationTable
 * source       - input text
 * sourceLength - length of `source`; a negative value is passed to
 *                UnicodeString::setTo() as "NUL-terminated" (see len < 0 below)
 * result       - sink receiving the finished key
 * status       - in/out error code; the function returns immediately if it is
 *                already a failure. It is set to U_MEMORY_ALLOCATION_ERROR when
 *                any of the level buffers or `result` reports that it is not OK.
 */
| 4964 U_CFUNC void U_CALLCONV | |
| 4965 ucol_calcSortKeySimpleTertiary(const UCollator *coll, | |
| 4966 const UChar *source, | |
| 4967 int32_t sourceLength, | |
| 4968 SortKeyByteSink &result, | |
| 4969 UErrorCode *status) | |
| 4970 { | |
| 4971 U_ALIGN_CODE(16); | |
| 4972 | |
| 4973 if(U_FAILURE(*status)) { | |
| 4974 return; | |
| 4975 } | |
| 4976 | |
| 4977 SortKeyByteSink &primaries = result; | |
| 4978 SortKeyLevel secondaries; | |
| 4979 SortKeyLevel tertiaries; | |
| 4980 | |
| 4981 UnicodeString normSource; | |
| 4982 | |
| 4983 int32_t len = sourceLength; | |
| 4984 | |
| 4985 /* If we need to normalize, we'll do it all at once at the beginning! */ | |
| 4986 if(coll->normalizationMode != UCOL_OFF) { | |
| 4987 normSource.setTo(len < 0, source, len); | |
/* NOTE(review): norm2 is dereferenced on the next row without checking *status or
 * the pointer first - confirm that getFCDInstance() cannot return NULL here. */
| 4988 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); | |
| 4989 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); | |
| 4990 if(qcYesLength != normSource.length()) { | |
| 4991 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); | |
| 4992 normSource.truncate(qcYesLength); | |
| 4993 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); | |
| 4994 source = normSource.getBuffer(); | |
| 4995 len = normSource.length(); | |
| 4996 } | |
| 4997 } | |
| 4998 collIterate s; | |
| 4999 IInit_collIterate(coll, (UChar *)source, len, &s, status); | |
| 5000 if(U_FAILURE(*status)) { | |
| 5001 return; | |
| 5002 } | |
| 5003 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. | |
| 5004 | |
| 5005 uint32_t order = 0; | |
| 5006 | |
| 5007 uint8_t primary1 = 0; | |
| 5008 uint8_t primary2 = 0; | |
| 5009 uint8_t secondary = 0; | |
| 5010 uint8_t tertiary = 0; | |
| 5011 uint8_t caseSwitch = coll->caseSwitch; | |
| 5012 uint8_t tertiaryMask = coll->tertiaryMask; | |
| 5013 int8_t tertiaryAddition = coll->tertiaryAddition; | |
| 5014 uint8_t tertiaryTop = coll->tertiaryTop; | |
| 5015 uint8_t tertiaryBottom = coll->tertiaryBottom; | |
| 5016 uint8_t tertiaryCommon = coll->tertiaryCommon; | |
| 5017 | |
| 5018 UBool notIsContinuation = FALSE; | |
| 5019 | |
| 5020 uint32_t count2 = 0, count3 = 0; | |
| 5021 uint8_t leadPrimary = 0; | |
| 5022 | |
/* Main loop: fetch one CE per pass until UCOL_NO_MORE_CES; CEs equal to 0 are skipped.
 * count2 / count3 count pending runs of common secondary / tertiary weights that have
 * not been written yet; leadPrimary remembers the lead byte of the current compressed
 * primary run (0 when no run is open). */
| 5023 for(;;) { | |
| 5024 order = ucol_IGetNextCE(coll, &s, status); | |
| 5025 | |
| 5026 if(order == 0) { | |
| 5027 continue; | |
| 5028 } | |
| 5029 | |
| 5030 if(order == UCOL_NO_MORE_CES) { | |
| 5031 break; | |
| 5032 } | |
| 5033 | |
| 5034 notIsContinuation = !isContinuation(order); | |
| 5035 | |
| 5036 if(notIsContinuation) { | |
| 5037 tertiary = (uint8_t)((order & tertiaryMask)); | |
| 5038 } else { | |
| 5039 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
| 5040 } | |
| 5041 | |
/* Peel the remaining weights off the CE one byte at a time: `order` is shifted in
 * place, so after these three rows it no longer holds the original CE value. */
| 5042 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 5043 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 5044 primary1 = (uint8_t)(order >> 8); | |
| 5045 | |
| 5046 uint8_t originalPrimary1 = primary1; | |
| 5047 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { | |
| 5048 primary1 = coll->leadBytePermutationTable[primary1]; | |
| 5049 } | |
| 5050 | |
| 5051 /* Note: This code assumes that the table is well built i.e. not having
0 bytes where they are not supposed to be. */ | |
| 5052 /* Usually, we'll have non-zero primary1 & primary2, except in cases of
a-z and friends, when primary2 will */ | |
| 5053 /* be zero with non zero primary1. primary3 is different than 0 only for
long primaries - see above. */ | |
| 5054 /* regular and simple sortkey calc */ | |
| 5055 if(primary1 != UCOL_IGNORABLE) { | |
| 5056 if(notIsContinuation) { | |
| 5057 if(leadPrimary == primary1) { | |
| 5058 primaries.Append(primary2); | |
| 5059 } else { | |
| 5060 if(leadPrimary != 0) { | |
| 5061 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UN
SHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
| 5062 } | |
| 5063 if(primary2 == UCOL_IGNORABLE) { | |
| 5064 /* one byter, not compressed */ | |
| 5065 primaries.Append(primary1); | |
| 5066 leadPrimary = 0; | |
| 5067 } else if(isCompressible(coll, originalPrimary1)) { | |
| 5068 /* compress */ | |
| 5069 primaries.Append(leadPrimary = primary1, primary2); | |
| 5070 } else { | |
| 5071 leadPrimary = 0; | |
| 5072 primaries.Append(primary1, primary2); | |
| 5073 } | |
| 5074 } | |
| 5075 } else { /* we are in continuation, so we're gonna add primary to th
e key don't care about compression */ | |
| 5076 if(primary2 == UCOL_IGNORABLE) { | |
| 5077 primaries.Append(primary1); | |
| 5078 } else { | |
| 5079 primaries.Append(primary1, primary2); | |
| 5080 } | |
| 5081 } | |
| 5082 } | |
| 5083 | |
| 5084 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ | |
| 5085 /* This is compression code. */ | |
| 5086 if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
| 5087 ++count2; | |
| 5088 } else { | |
| 5089 if (count2 > 0) { | |
| 5090 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev
el. | |
| 5091 while (count2 > UCOL_TOP_COUNT2) { | |
| 5092 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_C
OUNT2); | |
| 5093 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
| 5094 } | |
| 5095 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); | |
| 5096 } else { | |
| 5097 while (count2 > UCOL_BOT_COUNT2) { | |
| 5098 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_C
OUNT2); | |
| 5099 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 5100 } | |
| 5101 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
| 5102 } | |
| 5103 count2 = 0; | |
| 5104 } | |
| 5105 secondaries.appendByte(secondary); | |
| 5106 } | |
| 5107 } | |
| 5108 | |
| 5109 if(notIsContinuation) { | |
| 5110 tertiary ^= caseSwitch; | |
| 5111 } | |
| 5112 | |
| 5113 if(tertiary > 0) { | |
| 5114 /* This is compression code. */ | |
| 5115 /* sequence size check is included in the if clause */ | |
| 5116 if (tertiary == tertiaryCommon && notIsContinuation) { | |
| 5117 ++count3; | |
| 5118 } else { | |
| 5119 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_N
ORMAL) { | |
| 5120 tertiary += tertiaryAddition; | |
| 5121 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_
COMMON3_UPPERFIRST) { | |
| 5122 tertiary -= tertiaryAddition; | |
| 5123 } | |
| 5124 if (count3 > 0) { | |
| 5125 if ((tertiary > tertiaryCommon)) { | |
| 5126 while (count3 > coll->tertiaryTopCount) { | |
| 5127 tertiaries.appendByte(tertiaryTop - coll->tertiaryTo
pCount); | |
| 5128 count3 -= (uint32_t)coll->tertiaryTopCount; | |
| 5129 } | |
| 5130 tertiaries.appendByte(tertiaryTop - (count3-1)); | |
| 5131 } else { | |
| 5132 while (count3 > coll->tertiaryBottomCount) { | |
| 5133 tertiaries.appendByte(tertiaryBottom + coll->tertiar
yBottomCount); | |
| 5134 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
| 5135 } | |
| 5136 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
| 5137 } | |
| 5138 count3 = 0; | |
| 5139 } | |
| 5140 tertiaries.appendByte(tertiary); | |
| 5141 } | |
| 5142 } | |
| 5143 } | |
| 5144 | |
| 5145 UBool ok = TRUE; | |
| 5146 if(U_SUCCESS(*status)) { | |
| 5147 /* we have done all the CE's, now let's put them together to form a key
*/ | |
| 5148 if (count2 > 0) { | |
| 5149 while (count2 > UCOL_BOT_COUNT2) { | |
| 5150 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
| 5151 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
| 5152 } | |
| 5153 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
| 5154 } | |
| 5155 ok &= secondaries.isOk(); | |
| 5156 result.Append(UCOL_LEVELTERMINATOR); | |
| 5157 secondaries.appendTo(result); | |
| 5158 | |
/* NOTE(review): this trailing tertiary flush is not written like the in-loop flush:
 * the "top" branch here loops on `>=` and emits `tertiaryTop - count3`, whereas the
 * loop above uses `>` and `tertiaryTop - (count3-1)`. Presumably intentional for the
 * end-of-key case - verify before changing either one. */
| 5159 if (count3 > 0) { | |
| 5160 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { | |
| 5161 while (count3 >= coll->tertiaryTopCount) { | |
| 5162 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); | |
| 5163 count3 -= (uint32_t)coll->tertiaryTopCount; | |
| 5164 } | |
| 5165 tertiaries.appendByte(tertiaryTop - count3); | |
| 5166 } else { | |
| 5167 while (count3 > coll->tertiaryBottomCount) { | |
| 5168 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomC
ount); | |
| 5169 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
| 5170 } | |
| 5171 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
| 5172 } | |
| 5173 } | |
| 5174 ok &= tertiaries.isOk(); | |
| 5175 result.Append(UCOL_LEVELTERMINATOR); | |
| 5176 tertiaries.appendTo(result); | |
| 5177 | |
| 5178 result.Append(0); | |
| 5179 } | |
| 5180 | |
| 5181 /* To avoid memory leak, free the offset buffer if necessary. */ | |
| 5182 ucol_freeOffsetBuffer(&s); | |
| 5183 | |
| 5184 ok &= result.IsOk(); | |
| 5185 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } | |
| 5186 } | |
| 5187 | |
| 5188 static inline | |
| 5189 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { | |
| 5190 UBool notIsContinuation = !isContinuation(CE); | |
| 5191 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); | |
| 5192 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) | |
| 5193 || (!notIsContinuation && *wasShifted))) | |
| 5194 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that pri
mary ignorables */ | |
| 5195 { | |
| 5196 // The stuff below should probably be in the sortkey code... maybe not..
. | |
| 5197 if(primary1 != 0) { /* if we were shifted and we got an ignorable code p
oint */ | |
| 5198 /* we should just completely ignore it */ | |
| 5199 *wasShifted = TRUE; | |
| 5200 //continue; | |
| 5201 } | |
| 5202 //*wasShifted = TRUE; | |
| 5203 return TRUE; | |
| 5204 } else { | |
| 5205 *wasShifted = FALSE; | |
| 5206 return FALSE; | |
| 5207 } | |
| 5208 } | |
| 5209 static inline | |
| 5210 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *des
t) { | |
| 5211 if(level < maxLevel) { | |
| 5212 dest[i++] = UCOL_LEVELTERMINATOR; | |
| 5213 } else { | |
| 5214 dest[i++] = 0; | |
| 5215 } | |
| 5216 } | |
| 5217 | |
/**
 * Level identifiers for partial sort key generation.
 * The values 0..7 fit into three bits.
 */
enum {
    /** primary level */
    UCOL_PSK_PRIMARY = 0,
    /** secondary level */
    UCOL_PSK_SECONDARY = 1,
    /** case level */
    UCOL_PSK_CASE = 2,
    /** tertiary level */
    UCOL_PSK_TERTIARY = 3,
    /** quaternary level */
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT
};
| 5230 | |
/**
 * Bit layout of the collation state word, stored in the state[1] field.
 * To read a field, shift the state right by its *_SHIFT value and AND the
 * result with the matching *_MASK value.
 */
enum {
    /* bits 0..2: level identifier, one of the UCOL_PSK_* level values (three bits) */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 7,

    /* bit 3: number of bytes of primary or quaternary already written.
     * Can be only 0 or 1, since we get up to two bytes from primary or
     * quaternary. This field is also used to denote that the French
     * secondary level is finished. */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,

    /* bit 4: was the last value shifted - 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 1,

    /* bits 5..6: how many French bytes have we already written (up to 4 bytes).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de, you need
     * to have edc1c2c3ba. */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 3,

    /* bits 7..8: bocsu byte counter for the identical level */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 3,

    /* bits 9 and up: consumed-CE counter; the mask itself is 19 bits wide */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
| 5256 | |
// Macro calculating the number of expansion CEs available in `s`:
// the difference (s).CEpos - (s).toReturn.
// The expansion is wrapped in parentheses so that the macro can be embedded
// in a larger expression (e.g. `n - uprv_numAvailableExpCEs(s)`) without
// operator precedence changing its meaning.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
| 5259 | |
| 5260 | |
| 5261 /** main sortkey part procedure. On the first call, | |
| 5262 * you should pass in a collator, an iterator, empty state | |
| 5263 * state[0] == state[1] == 0, a buffer to hold results | |
| 5264 * number of bytes you need and an error code pointer. | |
| 5265 * Make sure your buffer is big enough to hold the wanted | |
| 5266 * number of sortkey bytes. I don't check. | |
| 5267 * The only meaningful status you can get back is | |
| 5268 * U_BUFFER_OVERFLOW_ERROR, which basically means that you | |
| 5269 * have been dealt a raw deal and that you probably won't | |
| 5270 * be able to use partial sortkey generation for this | |
| 5271 * particular combination of string and collator. This | |
| 5272 * is highly unlikely, but you should still check the error code. | |
| 5273 * Any other status means that you're not in a sane situation | |
| 5274 * anymore. After the first call, preserve state values and | |
| 5275 * use them on subsequent calls to obtain more bytes of a sortkey. | |
| 5276 * Use until the number of bytes written is smaller than the requested | |
| 5277 * number of bytes. Generated sortkey is not compatible with the | |
| 5278 * one generated by ucol_getSortKey, as we don't do any compression. | |
| 5279 * However, levels are still terminated by a 1 (one) and the sortkey | |
| 5280 * is terminated by a 0 (zero). Identical level is the same as in the | |
| 5281 * regular sortkey - internal bocu-1 implementation is used. | |
| 5282 * For the curious, although you cannot do much about this, here is | |
| 5283 * the structure of state words. | |
| 5284 * state[0] - iterator state. Depends on the iterator implementation, | |
| 5285 * but allows the iterator to continue where it stopped in | |
| 5286 * the last iteration. | |
| 5287 * state[1] - collation processing state. Here is the distribution | |
| 5288 * of the bits: | |
| 5289 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary | |
| 5290 * quaternary, quin (we don't use this one), identical and | |
| 5291 * null (producing only zeroes - first one to terminate the | |
| 5292 * sortkey and subsequent to fill the buffer). | |
| 5293 * 3 - byte count. Number of bytes written on the primary level. | |
| 5294 * 4 - was shifted. Whether the previous iteration finished in the | |
| 5295 * shifted state. | |
| 5296 * 5, 6 - French continuation bytes written. See the comment in the enum | |
| 5297 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on | |
| 5298 * the identical level. | |
| 5299 * 9..31 - CEs consumed. Number of getCE or next32 operations performed | |
| 5300 * since the last successful update of the iterator state. | |
| 5301 */ | |
| 5302 U_CAPI int32_t U_EXPORT2 | 209 U_CAPI int32_t U_EXPORT2 |
| 5303 ucol_nextSortKeyPart(const UCollator *coll, | 210 ucol_nextSortKeyPart(const UCollator *coll, |
| 5304 UCharIterator *iter, | 211 UCharIterator *iter, |
| 5305 uint32_t state[2], | 212 uint32_t state[2], |
| 5306 uint8_t *dest, int32_t count, | 213 uint8_t *dest, int32_t count, |
| 5307 UErrorCode *status) | 214 UErrorCode *status) |
| 5308 { | 215 { |
| 5309 /* error checking */ | 216 /* error checking */ |
| 5310 if(status==NULL || U_FAILURE(*status)) { | 217 if(status==NULL || U_FAILURE(*status)) { |
| 5311 return 0; | 218 return 0; |
| 5312 } | 219 } |
| 5313 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); | 220 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); |
| 5314 if( coll==NULL || iter==NULL || | |
| 5315 state==NULL || | |
| 5316 count<0 || (count>0 && dest==NULL) | |
| 5317 ) { | |
| 5318 *status=U_ILLEGAL_ARGUMENT_ERROR; | |
| 5319 UTRACE_EXIT_STATUS(status); | |
| 5320 return 0; | |
| 5321 } | |
| 5322 | |
| 5323 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=
%d", | 221 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=
%d", |
| 5324 coll, iter, state[0], state[1], dest, count); | 222 coll, iter, state[0], state[1], dest, count); |
| 5325 | 223 |
| 5326 if(count==0) { | 224 int32_t i = Collator::fromUCollator(coll)-> |
| 5327 /* nothing to do */ | 225 internalNextSortKeyPart(iter, state, dest, count, *status); |
| 5328 UTRACE_EXIT_VALUE(0); | 226 |
| 5329 return 0; | |
| 5330 } | |
| 5331 /** Setting up situation according to the state we got from the previous ite
ration */ | |
| 5332 // The state of the iterator from the previous invocation | |
| 5333 uint32_t iterState = state[0]; | |
| 5334 // Has the last iteration ended in the shifted state | |
| 5335 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_
SHIFTED_MASK)?TRUE:FALSE; | |
| 5336 // What is the current level of the sortkey? | |
| 5337 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; | |
| 5338 // Have we written only one byte from a two byte primary in the previous ite
ration? | |
| 5339 // Also on secondary level - have we finished with the French secondary? | |
| 5340 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D
ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; | |
| 5341 // number of bytes in the continuation buffer for French | |
| 5342 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE
D_FRENCH_MASK; | |
| 5343 // Number of bytes already written from a bocsu sequence. Since | |
| 5344 // the longes bocsu sequence is 4 long, this can be up to 3. | |
| 5345 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK
_BOCSU_BYTES_MASK; | |
| 5346 // Number of elements that need to be consumed in this iteration because | |
| 5347 // the iterator returned UITER_NO_STATE at the end of the last iteration, | |
| 5348 // so we had to save the last valid state. | |
| 5349 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED
_CES_MASK; | |
| 5350 | |
| 5351 /** values that depend on the collator attributes */ | |
| 5352 // strength of the collator. | |
| 5353 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); | |
| 5354 // maximal level of the partial sortkey. Need to take whether case level is
done | |
| 5355 int32_t maxLevel = 0; | |
| 5356 if(strength < UCOL_TERTIARY) { | |
| 5357 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
| 5358 maxLevel = UCOL_PSK_CASE; | |
| 5359 } else { | |
| 5360 maxLevel = strength; | |
| 5361 } | |
| 5362 } else { | |
| 5363 if(strength == UCOL_TERTIARY) { | |
| 5364 maxLevel = UCOL_PSK_TERTIARY; | |
| 5365 } else if(strength == UCOL_QUATERNARY) { | |
| 5366 maxLevel = UCOL_PSK_QUATERNARY; | |
| 5367 } else { // identical | |
| 5368 maxLevel = UCOL_IDENTICAL; | |
| 5369 } | |
| 5370 } | |
| 5371 // value for the quaternary level if Hiragana is encountered. Used for JIS X
4061 collation | |
| 5372 uint8_t UCOL_HIRAGANA_QUAD = | |
| 5373 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON
)?0xFE:0xFF; | |
| 5374 // Boundary value that decides whether a CE is shifted or not | |
| 5375 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV
alue<<16):0; | |
| 5376 // Are we doing French collation? | |
| 5377 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status)
== UCOL_ON); | |
| 5378 | |
| 5379 /** initializing the collation state */ | |
| 5380 UBool notIsContinuation = FALSE; | |
| 5381 uint32_t CE = UCOL_NO_MORE_CES; | |
| 5382 | |
| 5383 collIterate s; | |
| 5384 IInit_collIterate(coll, NULL, -1, &s, status); | |
| 5385 if(U_FAILURE(*status)) { | |
| 5386 UTRACE_EXIT_STATUS(*status); | |
| 5387 return 0; | |
| 5388 } | |
| 5389 s.iterator = iter; | |
| 5390 s.flags |= UCOL_USE_ITERATOR; | |
| 5391 // This variable tells us whether we have produced some other levels in this
iteration | |
| 5392 // before we moved to the identical level. In that case, we need to switch t
he | |
| 5393 // type of the iterator. | |
| 5394 UBool doingIdenticalFromStart = FALSE; | |
| 5395 // Normalizing iterator | |
| 5396 // The division for the array length may truncate the array size to | |
| 5397 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
| 5398 // for all platforms anyway. | |
| 5399 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 5400 UNormIterator *normIter = NULL; | |
| 5401 // If the normalization is turned on for the collator and we are below ident
ical level | |
| 5402 // we will use a FCD normalizing iterator | |
| 5403 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le
vel < UCOL_PSK_IDENTICAL) { | |
| 5404 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
| 5405 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); | |
| 5406 s.flags &= ~UCOL_ITER_NORM; | |
| 5407 if(U_FAILURE(*status)) { | |
| 5408 UTRACE_EXIT_STATUS(*status); | |
| 5409 return 0; | |
| 5410 } | |
| 5411 } else if(level == UCOL_PSK_IDENTICAL) { | |
| 5412 // for identical level, we need a NFD iterator. We need to instantiate i
t here, since we | |
| 5413 // will be updating the state - and this cannot be done on an ordinary i
terator. | |
| 5414 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
| 5415 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
| 5416 s.flags &= ~UCOL_ITER_NORM; | |
| 5417 if(U_FAILURE(*status)) { | |
| 5418 UTRACE_EXIT_STATUS(*status); | |
| 5419 return 0; | |
| 5420 } | |
| 5421 doingIdenticalFromStart = TRUE; | |
| 5422 } | |
| 5423 | |
| 5424 // This is the tentative new state of the iterator. The problem | |
| 5425 // is that the iterator might return an undefined state, in | |
| 5426 // which case we should save the last valid state and increase | |
| 5427 // the iterator skip value. | |
| 5428 uint32_t newState = 0; | |
| 5429 | |
| 5430 // First, we set the iterator to the last valid position | |
| 5431 // from the last iteration. This was saved in state[0]. | |
| 5432 if(iterState == 0) { | |
| 5433 /* initial state */ | |
| 5434 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone)
{ | |
| 5435 s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
| 5436 } else { | |
| 5437 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5438 } | |
| 5439 } else { | |
| 5440 /* reset to previous state */ | |
| 5441 s.iterator->setState(s.iterator, iterState, status); | |
| 5442 if(U_FAILURE(*status)) { | |
| 5443 UTRACE_EXIT_STATUS(*status); | |
| 5444 return 0; | |
| 5445 } | |
| 5446 } | |
| 5447 | |
| 5448 | |
| 5449 | |
| 5450 // This variable tells us whether we can attempt to update the state | |
| 5451 // of iterator. Situations where we don't want to update iterator state | |
| 5452 // are the existence of expansion CEs that are not yet processed, and | |
| 5453 // finishing the case level without enough space in the buffer to insert | |
| 5454 // a level terminator. | |
| 5455 UBool canUpdateState = TRUE; | |
| 5456 | |
| 5457 // Consume all the CEs that were consumed at the end of the previous | |
| 5458 // iteration without updating the iterator state. On identical level, | |
| 5459 // consume the code points. | |
| 5460 int32_t counter = cces; | |
| 5461 if(level < UCOL_PSK_IDENTICAL) { | |
| 5462 while(counter-->0) { | |
| 5463 // If we're doing French and we are on the secondary level, | |
| 5464 // we go backwards. | |
| 5465 if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
| 5466 CE = ucol_IGetPrevCE(coll, &s, status); | |
| 5467 } else { | |
| 5468 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5469 } | |
| 5470 if(CE==UCOL_NO_MORE_CES) { | |
| 5471 /* should not happen */ | |
| 5472 *status=U_INTERNAL_PROGRAM_ERROR; | |
| 5473 UTRACE_EXIT_STATUS(*status); | |
| 5474 return 0; | |
| 5475 } | |
| 5476 if(uprv_numAvailableExpCEs(s)) { | |
| 5477 canUpdateState = FALSE; | |
| 5478 } | |
| 5479 } | |
| 5480 } else { | |
| 5481 while(counter-->0) { | |
| 5482 uiter_next32(s.iterator); | |
| 5483 } | |
| 5484 } | |
| 5485 | |
| 5486 // French secondary needs to know whether the iterator state of zero came fr
om previous level OR | |
| 5487 // from a new invocation... | |
| 5488 UBool wasDoingPrimary = FALSE; | |
| 5489 // destination buffer byte counter. When this guy | |
| 5490 // gets to count, we're done with the iteration | |
| 5491 int32_t i = 0; | |
| 5492 // used to count the zero bytes written after we | |
| 5493 // have finished with the sort key | |
| 5494 int32_t j = 0; | |
| 5495 | |
| 5496 | |
| 5497 // Hm.... I think we're ready to plunge in. The basic story is as follows: | |
| 5498 // we have a fall through case based on level. This is used for initial | |
| 5499 // positioning on iteration start. Every level processor contains a | |
| 5500 // for(;;) which will be broken when we exhaust all the CEs. The other | |
| 5501 // way to exit is a goto saveState, which happens when we have filled | |
| 5502 // out our buffer. | |
| 5503 switch(level) { | |
| 5504 case UCOL_PSK_PRIMARY: | |
| 5505 wasDoingPrimary = TRUE; | |
| 5506 for(;;) { | |
| 5507 if(i==count) { | |
| 5508 goto saveState; | |
| 5509 } | |
| 5510 // We should save the state only if we | |
| 5511 // are sure that we are done with the | |
| 5512 // previous iterator state | |
| 5513 if(canUpdateState && byteCountOrFrenchDone == 0) { | |
| 5514 newState = s.iterator->getState(s.iterator); | |
| 5515 if(newState != UITER_NO_STATE) { | |
| 5516 iterState = newState; | |
| 5517 cces = 0; | |
| 5518 } | |
| 5519 } | |
| 5520 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5521 cces++; | |
| 5522 if(CE==UCOL_NO_MORE_CES) { | |
| 5523 // Add the level separator | |
| 5524 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5525 byteCountOrFrenchDone=0; | |
| 5526 // Restart the iteration and move to the | |
| 5527 // second level | |
| 5528 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5529 cces = 0; | |
| 5530 level = UCOL_PSK_SECONDARY; | |
| 5531 break; | |
| 5532 } | |
| 5533 if(!isContinuation(CE)){ | |
| 5534 if(coll->leadBytePermutationTable != NULL){ | |
| 5535 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE &
0x00FFFFFF); | |
| 5536 } | |
| 5537 } | |
| 5538 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5539 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ | |
| 5540 if(CE != 0) { | |
| 5541 if(byteCountOrFrenchDone == 0) { | |
| 5542 // get the second byte of primary | |
| 5543 dest[i++]=(uint8_t)(CE >> 8); | |
| 5544 } else { | |
| 5545 byteCountOrFrenchDone = 0; | |
| 5546 } | |
| 5547 if((CE &=0xff)!=0) { | |
| 5548 if(i==count) { | |
| 5549 /* overflow */ | |
| 5550 byteCountOrFrenchDone = 1; | |
| 5551 cces--; | |
| 5552 goto saveState; | |
| 5553 } | |
| 5554 dest[i++]=(uint8_t)CE; | |
| 5555 } | |
| 5556 } | |
| 5557 } | |
| 5558 if(uprv_numAvailableExpCEs(s)) { | |
| 5559 canUpdateState = FALSE; | |
| 5560 } else { | |
| 5561 canUpdateState = TRUE; | |
| 5562 } | |
| 5563 } | |
| 5564 /* fall through to next level */ | |
| 5565 case UCOL_PSK_SECONDARY: | |
| 5566 if(strength >= UCOL_SECONDARY) { | |
| 5567 if(!doingFrench) { | |
| 5568 for(;;) { | |
| 5569 if(i == count) { | |
| 5570 goto saveState; | |
| 5571 } | |
| 5572 // We should save the state only if we | |
| 5573 // are sure that we are done with the | |
| 5574 // previous iterator state | |
| 5575 if(canUpdateState) { | |
| 5576 newState = s.iterator->getState(s.iterator); | |
| 5577 if(newState != UITER_NO_STATE) { | |
| 5578 iterState = newState; | |
| 5579 cces = 0; | |
| 5580 } | |
| 5581 } | |
| 5582 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5583 cces++; | |
| 5584 if(CE==UCOL_NO_MORE_CES) { | |
| 5585 // Add the level separator | |
| 5586 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5587 byteCountOrFrenchDone = 0; | |
| 5588 // Restart the iteration and move to the | |
| 5589 // next level | |
| 5590 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5591 cces = 0; | |
| 5592 level = UCOL_PSK_CASE; | |
| 5593 break; | |
| 5594 } | |
| 5595 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5596 CE >>= 8; /* get secondary */ | |
| 5597 if(CE != 0) { | |
| 5598 dest[i++]=(uint8_t)CE; | |
| 5599 } | |
| 5600 } | |
| 5601 if(uprv_numAvailableExpCEs(s)) { | |
| 5602 canUpdateState = FALSE; | |
| 5603 } else { | |
| 5604 canUpdateState = TRUE; | |
| 5605 } | |
| 5606 } | |
| 5607 } else { // French secondary processing | |
| 5608 uint8_t frenchBuff[UCOL_MAX_BUFFER]; | |
| 5609 int32_t frenchIndex = 0; | |
| 5610 // Here we are going backwards. | |
| 5611 // If the iterator is at the beginning, it should be | |
| 5612 // moved to the end. | |
| 5613 if(wasDoingPrimary) { | |
| 5614 s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
| 5615 cces = 0; | |
| 5616 } | |
| 5617 for(;;) { | |
| 5618 if(i == count) { | |
| 5619 goto saveState; | |
| 5620 } | |
| 5621 if(canUpdateState) { | |
| 5622 newState = s.iterator->getState(s.iterator); | |
| 5623 if(newState != UITER_NO_STATE) { | |
| 5624 iterState = newState; | |
| 5625 cces = 0; | |
| 5626 } | |
| 5627 } | |
| 5628 CE = ucol_IGetPrevCE(coll, &s, status); | |
| 5629 cces++; | |
| 5630 if(CE==UCOL_NO_MORE_CES) { | |
| 5631 // Add the level separator | |
| 5632 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5633 byteCountOrFrenchDone = 0; | |
| 5634 // Restart the iteration and move to the next level | |
| 5635 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5636 level = UCOL_PSK_CASE; | |
| 5637 break; | |
| 5638 } | |
| 5639 if(isContinuation(CE)) { // if it's a continuation, we want
to save it and | |
| 5640 // reverse when we get a first non-continuation CE. | |
| 5641 CE >>= 8; | |
| 5642 frenchBuff[frenchIndex++] = (uint8_t)CE; | |
| 5643 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5644 CE >>= 8; /* get secondary */ | |
| 5645 if(!frenchIndex) { | |
| 5646 if(CE != 0) { | |
| 5647 dest[i++]=(uint8_t)CE; | |
| 5648 } | |
| 5649 } else { | |
| 5650 frenchBuff[frenchIndex++] = (uint8_t)CE; | |
| 5651 frenchIndex -= usedFrench; | |
| 5652 usedFrench = 0; | |
| 5653 while(i < count && frenchIndex) { | |
| 5654 dest[i++] = frenchBuff[--frenchIndex]; | |
| 5655 usedFrench++; | |
| 5656 } | |
| 5657 } | |
| 5658 } | |
| 5659 if(uprv_numAvailableExpCEs(s)) { | |
| 5660 canUpdateState = FALSE; | |
| 5661 } else { | |
| 5662 canUpdateState = TRUE; | |
| 5663 } | |
| 5664 } | |
| 5665 } | |
| 5666 } else { | |
| 5667 level = UCOL_PSK_CASE; | |
| 5668 } | |
| 5669 /* fall through to next level */ | |
| 5670 case UCOL_PSK_CASE: | |
| 5671 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
| 5672 uint32_t caseShift = UCOL_CASE_SHIFT_START; | |
| 5673 uint8_t caseByte = UCOL_CASE_BYTE_START; | |
| 5674 uint8_t caseBits = 0; | |
| 5675 | |
| 5676 for(;;) { | |
| 5677 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); | |
| 5678 if(i == count) { | |
| 5679 goto saveState; | |
| 5680 } | |
| 5681 // We should save the state only if we | |
| 5682 // are sure that we are done with the | |
| 5683 // previous iterator state | |
| 5684 if(canUpdateState) { | |
| 5685 newState = s.iterator->getState(s.iterator); | |
| 5686 if(newState != UITER_NO_STATE) { | |
| 5687 iterState = newState; | |
| 5688 cces = 0; | |
| 5689 } | |
| 5690 } | |
| 5691 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5692 cces++; | |
| 5693 if(CE==UCOL_NO_MORE_CES) { | |
| 5694 // On the case level we might have an unfinished | |
| 5695 // case byte. Add one if it's started. | |
| 5696 if(caseShift != UCOL_CASE_SHIFT_START) { | |
| 5697 dest[i++] = caseByte; | |
| 5698 } | |
| 5699 cces = 0; | |
| 5700 // We have finished processing CEs on this level. | |
| 5701 // However, we don't know if we have enough space | |
| 5702 // to add a case level terminator. | |
| 5703 if(i < count) { | |
| 5704 // Add the level separator | |
| 5705 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5706 // Restart the iteration and move to the | |
| 5707 // next level | |
| 5708 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5709 level = UCOL_PSK_TERTIARY; | |
| 5710 } else { | |
| 5711 canUpdateState = FALSE; | |
| 5712 } | |
| 5713 break; | |
| 5714 } | |
| 5715 | |
| 5716 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5717 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || s
trength > UCOL_PRIMARY)) { | |
| 5718 // do the case level if we need to do it. We don't want
to calculate | |
| 5719 // case level for primary ignorables if we have only pri
mary strength and case level | |
| 5720 // otherwise we would break well formedness of CEs | |
| 5721 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
| 5722 caseBits = (uint8_t)(CE & 0xC0); | |
| 5723 // this copies the case level logic from the | |
| 5724 // sort key generation code | |
| 5725 if(CE != 0) { | |
| 5726 if (caseShift == 0) { | |
| 5727 dest[i++] = caseByte; | |
| 5728 caseShift = UCOL_CASE_SHIFT_START; | |
| 5729 caseByte = UCOL_CASE_BYTE_START; | |
| 5730 } | |
| 5731 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
| 5732 if((caseBits & 0xC0) == 0) { | |
| 5733 caseByte |= 1 << (--caseShift); | |
| 5734 } else { | |
| 5735 caseByte |= 0 << (--caseShift); | |
| 5736 /* second bit */ | |
| 5737 if(caseShift == 0) { | |
| 5738 dest[i++] = caseByte; | |
| 5739 caseShift = UCOL_CASE_SHIFT_START; | |
| 5740 caseByte = UCOL_CASE_BYTE_START; | |
| 5741 } | |
| 5742 caseByte |= ((caseBits>>6)&1) << (--caseShif
t); | |
| 5743 } | |
| 5744 } else { | |
| 5745 if((caseBits & 0xC0) == 0) { | |
| 5746 caseByte |= 0 << (--caseShift); | |
| 5747 } else { | |
| 5748 caseByte |= 1 << (--caseShift); | |
| 5749 /* second bit */ | |
| 5750 if(caseShift == 0) { | |
| 5751 dest[i++] = caseByte; | |
| 5752 caseShift = UCOL_CASE_SHIFT_START; | |
| 5753 caseByte = UCOL_CASE_BYTE_START; | |
| 5754 } | |
| 5755 caseByte |= ((caseBits>>7)&1) << (--caseShif
t); | |
| 5756 } | |
| 5757 } | |
| 5758 } | |
| 5759 | |
| 5760 } | |
| 5761 } | |
| 5762 // Not sure this is correct for the case level - revisit | |
| 5763 if(uprv_numAvailableExpCEs(s)) { | |
| 5764 canUpdateState = FALSE; | |
| 5765 } else { | |
| 5766 canUpdateState = TRUE; | |
| 5767 } | |
| 5768 } | |
| 5769 } else { | |
| 5770 level = UCOL_PSK_TERTIARY; | |
| 5771 } | |
| 5772 /* fall through to next level */ | |
| 5773 case UCOL_PSK_TERTIARY: | |
| 5774 if(strength >= UCOL_TERTIARY) { | |
| 5775 for(;;) { | |
| 5776 if(i == count) { | |
| 5777 goto saveState; | |
| 5778 } | |
| 5779 // We should save the state only if we | |
| 5780 // are sure that we are done with the | |
| 5781 // previous iterator state | |
| 5782 if(canUpdateState) { | |
| 5783 newState = s.iterator->getState(s.iterator); | |
| 5784 if(newState != UITER_NO_STATE) { | |
| 5785 iterState = newState; | |
| 5786 cces = 0; | |
| 5787 } | |
| 5788 } | |
| 5789 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5790 cces++; | |
| 5791 if(CE==UCOL_NO_MORE_CES) { | |
| 5792 // Add the level separator | |
| 5793 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5794 byteCountOrFrenchDone = 0; | |
| 5795 // Restart the iteration and move to the | |
| 5796 // next level | |
| 5797 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5798 cces = 0; | |
| 5799 level = UCOL_PSK_QUATERNARY; | |
| 5800 break; | |
| 5801 } | |
| 5802 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5803 notIsContinuation = !isContinuation(CE); | |
| 5804 | |
| 5805 if(notIsContinuation) { | |
| 5806 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
| 5807 CE ^= coll->caseSwitch; | |
| 5808 CE &= coll->tertiaryMask; | |
| 5809 } else { | |
| 5810 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
| 5811 } | |
| 5812 | |
| 5813 if(CE != 0) { | |
| 5814 dest[i++]=(uint8_t)CE; | |
| 5815 } | |
| 5816 } | |
| 5817 if(uprv_numAvailableExpCEs(s)) { | |
| 5818 canUpdateState = FALSE; | |
| 5819 } else { | |
| 5820 canUpdateState = TRUE; | |
| 5821 } | |
| 5822 } | |
| 5823 } else { | |
| 5824 // if we're not doing tertiary | |
| 5825 // skip to the end | |
| 5826 level = UCOL_PSK_NULL; | |
| 5827 } | |
| 5828 /* fall through to next level */ | |
| 5829 case UCOL_PSK_QUATERNARY: | |
| 5830 if(strength >= UCOL_QUATERNARY) { | |
| 5831 for(;;) { | |
| 5832 if(i == count) { | |
| 5833 goto saveState; | |
| 5834 } | |
| 5835 // We should save the state only if we | |
| 5836 // are sure that we are done with the | |
| 5837 // previous iterator state | |
| 5838 if(canUpdateState) { | |
| 5839 newState = s.iterator->getState(s.iterator); | |
| 5840 if(newState != UITER_NO_STATE) { | |
| 5841 iterState = newState; | |
| 5842 cces = 0; | |
| 5843 } | |
| 5844 } | |
| 5845 CE = ucol_IGetNextCE(coll, &s, status); | |
| 5846 cces++; | |
| 5847 if(CE==UCOL_NO_MORE_CES) { | |
| 5848 // Add the level separator | |
| 5849 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5850 //dest[i++] = UCOL_LEVELTERMINATOR; | |
| 5851 byteCountOrFrenchDone = 0; | |
| 5852 // Restart the iteration and move to the | |
| 5853 // next level | |
| 5854 s.iterator->move(s.iterator, 0, UITER_START); | |
| 5855 cces = 0; | |
| 5856 level = UCOL_PSK_QUIN; | |
| 5857 break; | |
| 5858 } | |
| 5859 if(CE==0) | |
| 5860 continue; | |
| 5861 if(isShiftedCE(CE, LVT, &wasShifted)) { | |
| 5862 CE >>= 16; /* get primary */ | |
| 5863 if(CE != 0) { | |
| 5864 if(byteCountOrFrenchDone == 0) { | |
| 5865 dest[i++]=(uint8_t)(CE >> 8); | |
| 5866 } else { | |
| 5867 byteCountOrFrenchDone = 0; | |
| 5868 } | |
| 5869 if((CE &=0xff)!=0) { | |
| 5870 if(i==count) { | |
| 5871 /* overflow */ | |
| 5872 byteCountOrFrenchDone = 1; | |
| 5873 goto saveState; | |
| 5874 } | |
| 5875 dest[i++]=(uint8_t)CE; | |
| 5876 } | |
| 5877 } | |
| 5878 } else { | |
| 5879 notIsContinuation = !isContinuation(CE); | |
| 5880 if(notIsContinuation) { | |
| 5881 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a
nd we need to note it | |
| 5882 dest[i++] = UCOL_HIRAGANA_QUAD; | |
| 5883 } else { | |
| 5884 dest[i++] = 0xFF; | |
| 5885 } | |
| 5886 } | |
| 5887 } | |
| 5888 if(uprv_numAvailableExpCEs(s)) { | |
| 5889 canUpdateState = FALSE; | |
| 5890 } else { | |
| 5891 canUpdateState = TRUE; | |
| 5892 } | |
| 5893 } | |
| 5894 } else { | |
| 5895 // if we're not doing quaternary | |
| 5896 // skip to the end | |
| 5897 level = UCOL_PSK_NULL; | |
| 5898 } | |
| 5899 /* fall through to next level */ | |
| 5900 case UCOL_PSK_QUIN: | |
| 5901 level = UCOL_PSK_IDENTICAL; | |
| 5902 /* fall through to next level */ | |
| 5903 case UCOL_PSK_IDENTICAL: | |
| 5904 if(strength >= UCOL_IDENTICAL) { | |
| 5905 UChar32 first, second; | |
| 5906 int32_t bocsuBytesWritten = 0; | |
| 5907 // We always need to do identical on | |
| 5908 // the NFD form of the string. | |
| 5909 if(normIter == NULL) { | |
| 5910 // we arrived from the level below and | |
| 5911 // normalization was not turned on. | |
| 5912 // therefore, we need to make a fresh NFD iterator | |
| 5913 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter),
status); | |
| 5914 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
| 5915 } else if(!doingIdenticalFromStart) { | |
| 5916 // there is an iterator, but we did some other levels. | |
| 5917 // therefore, we have a FCD iterator - need to make | |
| 5918 // a NFD one. | |
| 5919 // normIter being at the beginning does not guarantee | |
| 5920 // that the underlying iterator is at the beginning | |
| 5921 iter->move(iter, 0, UITER_START); | |
| 5922 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
| 5923 } | |
| 5924 // At this point we have a NFD iterator that is positioned | |
| 5925 // in the right place | |
| 5926 if(U_FAILURE(*status)) { | |
| 5927 UTRACE_EXIT_STATUS(*status); | |
| 5928 return 0; | |
| 5929 } | |
| 5930 first = uiter_previous32(s.iterator); | |
| 5931 // maybe we're at the start of the string | |
| 5932 if(first == U_SENTINEL) { | |
| 5933 first = 0; | |
| 5934 } else { | |
| 5935 uiter_next32(s.iterator); | |
| 5936 } | |
| 5937 | |
| 5938 j = 0; | |
| 5939 for(;;) { | |
| 5940 if(i == count) { | |
| 5941 if(j+1 < bocsuBytesWritten) { | |
| 5942 bocsuBytesUsed = j+1; | |
| 5943 } | |
| 5944 goto saveState; | |
| 5945 } | |
| 5946 | |
| 5947 // On identical level, we will always save | |
| 5948 // the state if we reach this point, since | |
| 5949 // we don't depend on getNextCE for content; | |
| 5950 // all the content is in our buffer and we | |
| 5951 // already either stored the full buffer OR | |
| 5952 // otherwise we won't arrive here. | |
| 5953 newState = s.iterator->getState(s.iterator); | |
| 5954 if(newState != UITER_NO_STATE) { | |
| 5955 iterState = newState; | |
| 5956 cces = 0; | |
| 5957 } | |
| 5958 | |
| 5959 uint8_t buff[4]; | |
| 5960 second = uiter_next32(s.iterator); | |
| 5961 cces++; | |
| 5962 | |
| 5963 // end condition for identical level | |
| 5964 if(second == U_SENTINEL) { | |
| 5965 terminatePSKLevel(level, maxLevel, i, dest); | |
| 5966 level = UCOL_PSK_NULL; | |
| 5967 break; | |
| 5968 } | |
| 5969 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco
nd, buff); | |
| 5970 first = second; | |
| 5971 | |
| 5972 j = 0; | |
| 5973 if(bocsuBytesUsed != 0) { | |
| 5974 while(bocsuBytesUsed-->0) { | |
| 5975 j++; | |
| 5976 } | |
| 5977 } | |
| 5978 | |
| 5979 while(i < count && j < bocsuBytesWritten) { | |
| 5980 dest[i++] = buff[j++]; | |
| 5981 } | |
| 5982 } | |
| 5983 | |
| 5984 } else { | |
| 5985 level = UCOL_PSK_NULL; | |
| 5986 } | |
| 5987 /* fall through to next level */ | |
| 5988 case UCOL_PSK_NULL: | |
| 5989 j = i; | |
| 5990 while(j<count) { | |
| 5991 dest[j++]=0; | |
| 5992 } | |
| 5993 break; | |
| 5994 default: | |
| 5995 *status = U_INTERNAL_PROGRAM_ERROR; | |
| 5996 UTRACE_EXIT_STATUS(*status); | |
| 5997 return 0; | |
| 5998 } | |
| 5999 | |
| 6000 saveState: | |
| 6001 // Now we need to return stuff. First we want to see whether we have | |
| 6002 // done everything for the current state of iterator. | |
| 6003 if(byteCountOrFrenchDone | |
| 6004 || canUpdateState == FALSE | |
| 6005 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) | |
| 6006 { | |
| 6007 // Any of above mean that the previous transaction | |
| 6008 // wasn't finished and that we should store the | |
| 6009 // previous iterator state. | |
| 6010 state[0] = iterState; | |
| 6011 } else { | |
| 6012 // The transaction is complete. We will continue in the next iteration. | |
| 6013 state[0] = s.iterator->getState(s.iterator); | |
| 6014 cces = 0; | |
| 6015 } | |
| 6016 // Store the number of bocsu bytes written. | |
| 6017 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { | |
| 6018 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
| 6019 } | |
| 6020 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY
TES_SHIFT; | |
| 6021 | |
| 6022 // Next we put in the level of comparison | |
| 6023 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); | |
| 6024 | |
| 6025 // If we are doing French, we need to store whether we have just finished th
e French level | |
| 6026 if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
| 6027 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_D
ONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
| 6028 } else { | |
| 6029 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE
_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
| 6030 } | |
| 6031 | |
| 6032 // Was the latest CE shifted | |
| 6033 if(wasShifted) { | |
| 6034 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; | |
| 6035 } | |
| 6036 // Check for cces overflow | |
| 6037 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { | |
| 6038 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
| 6039 } | |
| 6040 // Store cces | |
| 6041 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH
IFT); | |
| 6042 | |
| 6043 // Check for French overflow | |
| 6044 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { | |
| 6045 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
| 6046 } | |
| 6047 // Store number of bytes written in the French secondary continuation sequen
ce | |
| 6048 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC
H_SHIFT); | |
| 6049 | |
| 6050 | |
| 6051 // If we have used normalizing iterator, get rid of it | |
| 6052 if(normIter != NULL) { | |
| 6053 unorm_closeIter(normIter); | |
| 6054 } | |
| 6055 | |
| 6056 /* To avoid memory leak, free the offset buffer if necessary. */ | |
| 6057 ucol_freeOffsetBuffer(&s); | |
| 6058 | |
| 6059 // Return number of meaningful sortkey bytes. | 227 // Return number of meaningful sortkey bytes. |
| 6060 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", | 228 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", |
| 6061 dest,i, state[0], state[1]); | 229 dest,i, state[0], state[1]); |
| 6062 UTRACE_EXIT_VALUE(i); | 230 UTRACE_EXIT_VALUE_STATUS(i, *status); |
| 6063 return i; | 231 return i; |
| 6064 } | 232 } |
| 6065 | 233 |
| 6066 /** | 234 /** |
| 6067 * Produce a bound for a given sortkey and a number of levels. | 235 * Produce a bound for a given sortkey and a number of levels. |
| 6068 */ | 236 */ |
| 6069 U_CAPI int32_t U_EXPORT2 | 237 U_CAPI int32_t U_EXPORT2 |
| 6070 ucol_getBound(const uint8_t *source, | 238 ucol_getBound(const uint8_t *source, |
| 6071 int32_t sourceLength, | 239 int32_t sourceLength, |
| 6072 UColBoundMode boundType, | 240 UColBoundMode boundType, |
| 6073 uint32_t noOfLevels, | 241 uint32_t noOfLevels, |
| 6074 uint8_t *result, | 242 uint8_t *result, |
| 6075 int32_t resultLength, | 243 int32_t resultLength, |
| 6076 UErrorCode *status) | 244 UErrorCode *status) |
| 6077 { | 245 { |
| 6078 // consistency checks | 246 // consistency checks |
| 6079 if(status == NULL || U_FAILURE(*status)) { | 247 if(status == NULL || U_FAILURE(*status)) { |
| 6080 return 0; | 248 return 0; |
| 6081 } | 249 } |
| 6082 if(source == NULL) { | 250 if(source == NULL) { |
| 6083 *status = U_ILLEGAL_ARGUMENT_ERROR; | 251 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 6084 return 0; | 252 return 0; |
| 6085 } | 253 } |
| 6086 | 254 |
| 6087 int32_t sourceIndex = 0; | 255 int32_t sourceIndex = 0; |
| 6088 // Scan the string until we skip enough of the key OR reach the end of the k
ey | 256 // Scan the string until we skip enough of the key OR reach the end of the k
ey |
| 6089 do { | 257 do { |
| 6090 sourceIndex++; | 258 sourceIndex++; |
| 6091 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { | 259 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) { |
| 6092 noOfLevels--; | 260 noOfLevels--; |
| 6093 } | 261 } |
| 6094 } while (noOfLevels > 0 | 262 } while (noOfLevels > 0 |
| 6095 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); | 263 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); |
| 6096 | 264 |
| 6097 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) | 265 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) |
| 6098 && noOfLevels > 0) { | 266 && noOfLevels > 0) { |
| 6099 *status = U_SORT_KEY_TOO_SHORT_WARNING; | 267 *status = U_SORT_KEY_TOO_SHORT_WARNING; |
| 6100 } | 268 } |
| 6101 | 269 |
| (...skipping 22 matching lines...) Expand all Loading... |
| 6124 return 0; | 292 return 0; |
| 6125 } | 293 } |
| 6126 result[sourceIndex++] = 0; | 294 result[sourceIndex++] = 0; |
| 6127 | 295 |
| 6128 return sourceIndex; | 296 return sourceIndex; |
| 6129 } else { | 297 } else { |
| 6130 return sourceIndex+boundType+1; | 298 return sourceIndex+boundType+1; |
| 6131 } | 299 } |
| 6132 } | 300 } |
| 6133 | 301 |
| 6134 /****************************************************************************/ | 302 U_CAPI void U_EXPORT2 |
| 6135 /* Following are the functions that deal with the properties of a collator */ | 303 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCo
de) { |
| 6136 /* there are new APIs and some compatibility APIs */ | 304 if(U_FAILURE(*pErrorCode)) { return; } |
| 6137 /****************************************************************************/ | 305 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode); |
| 6138 | 306 } |
| 6139 static inline void | 307 |
| 6140 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, | 308 U_CAPI UColReorderCode U_EXPORT2 |
| 6141 int32_t *primShift, int32_t *secShift, int32_t *terShift) | 309 ucol_getMaxVariable(const UCollator *coll) { |
| 6142 { | 310 return Collator::fromUCollator(coll)->getMaxVariable(); |
| 6143 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; | |
| 6144 UBool reverseSecondary = FALSE; | |
| 6145 UBool continuation = isContinuation(CE); | |
| 6146 if(!continuation) { | |
| 6147 tertiary = (uint8_t)((CE & coll->tertiaryMask)); | |
| 6148 tertiary ^= coll->caseSwitch; | |
| 6149 reverseSecondary = TRUE; | |
| 6150 } else { | |
| 6151 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
| 6152 tertiary &= UCOL_REMOVE_CASE; | |
| 6153 reverseSecondary = FALSE; | |
| 6154 } | |
| 6155 | |
| 6156 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 6157 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
| 6158 primary1 = (uint8_t)(CE >> 8); | |
| 6159 | |
| 6160 if(primary1 != 0) { | |
| 6161 if (coll->leadBytePermutationTable != NULL && !continuation) { | |
| 6162 primary1 = coll->leadBytePermutationTable[primary1]; | |
| 6163 } | |
| 6164 | |
| 6165 coll->latinOneCEs[ch] |= (primary1 << *primShift); | |
| 6166 *primShift -= 8; | |
| 6167 } | |
| 6168 if(primary2 != 0) { | |
| 6169 if(*primShift < 0) { | |
| 6170 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
| 6171 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
| 6172 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
| 6173 return; | |
| 6174 } | |
| 6175 coll->latinOneCEs[ch] |= (primary2 << *primShift); | |
| 6176 *primShift -= 8; | |
| 6177 } | |
| 6178 if(secondary != 0) { | |
| 6179 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se
condary | |
| 6180 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo
r secondary | |
| 6181 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); | |
| 6182 } else { // normal case | |
| 6183 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secSh
ift); | |
| 6184 } | |
| 6185 *secShift -= 8; | |
| 6186 } | |
| 6187 if(tertiary != 0) { | |
| 6188 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift
); | |
| 6189 *terShift -= 8; | |
| 6190 } | |
| 6191 } | |
| 6192 | |
| 6193 static inline UBool | |
| 6194 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { | |
| 6195 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); | |
| 6196 if(newTable == NULL) { | |
| 6197 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 6198 coll->latinOneFailed = TRUE; | |
| 6199 return FALSE; | |
| 6200 } | |
| 6201 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable
Len)*sizeof(uint32_t); | |
| 6202 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); | |
| 6203 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); | |
| 6204 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC
opy); | |
| 6205 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz
eToCopy); | |
| 6206 coll->latinOneTableLen = size; | |
| 6207 uprv_free(coll->latinOneCEs); | |
| 6208 coll->latinOneCEs = newTable; | |
| 6209 return TRUE; | |
| 6210 } | |
| 6211 | |
| 6212 static UBool | |
| 6213 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { | |
| 6214 UBool result = TRUE; | |
| 6215 if(coll->latinOneCEs == NULL) { | |
| 6216 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINO
NETABLELEN*3); | |
| 6217 if(coll->latinOneCEs == NULL) { | |
| 6218 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 6219 return FALSE; | |
| 6220 } | |
| 6221 coll->latinOneTableLen = UCOL_LATINONETABLELEN; | |
| 6222 } | |
| 6223 UChar ch = 0; | |
| 6224 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); | |
| 6225 // Check for null pointer | |
| 6226 if (U_FAILURE(*status)) { | |
| 6227 ucol_closeElements(it); | |
| 6228 return FALSE; | |
| 6229 } | |
| 6230 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3)
; | |
| 6231 | |
| 6232 int32_t primShift = 24, secShift = 24, terShift = 24; | |
| 6233 uint32_t CE = 0; | |
| 6234 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; | |
| 6235 | |
| 6236 // TODO: make safe if you get more than you wanted... | |
| 6237 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { | |
| 6238 primShift = 24; secShift = 24; terShift = 24; | |
| 6239 if(ch < 0x100) { | |
| 6240 CE = coll->latinOneMapping[ch]; | |
| 6241 } else { | |
| 6242 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 6243 if(CE == UCOL_NOT_FOUND && coll->UCA) { | |
| 6244 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
| 6245 } | |
| 6246 } | |
| 6247 if(CE < UCOL_NOT_FOUND) { | |
| 6248 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift
); | |
| 6249 } else { | |
| 6250 switch (getCETag(CE)) { | |
| 6251 case EXPANSION_TAG: | |
| 6252 case DIGIT_TAG: | |
| 6253 ucol_setText(it, &ch, 1, status); | |
| 6254 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { | |
| 6255 if(primShift < 0 || secShift < 0 || terShift < 0) { | |
| 6256 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
| 6257 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL
_OUT_CE; | |
| 6258 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA
IL_OUT_CE; | |
| 6259 break; | |
| 6260 } | |
| 6261 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &
terShift); | |
| 6262 } | |
| 6263 break; | |
| 6264 case CONTRACTION_TAG: | |
| 6265 // here is the trick | |
| 6266 // F2 is contraction. We do something very similar to contractio
ns | |
| 6267 // but have two indices, one in the real contraction table and t
he | |
| 6268 // other to where we stuffed things. This hopes that we don't ha
ve | |
| 6269 // many contractions (this should work for latin-1 tables). | |
| 6270 { | |
| 6271 if((CE & 0x00FFF000) != 0) { | |
| 6272 *status = U_UNSUPPORTED_ERROR; | |
| 6273 goto cleanup_after_failure; | |
| 6274 } | |
| 6275 | |
| 6276 const UChar *UCharOffset = (UChar *)coll->image+getContractO
ffset(CE); | |
| 6277 | |
| 6278 CE |= (contractionOffset & 0xFFF) << 12; // insert the offse
t in latin-1 table | |
| 6279 | |
| 6280 coll->latinOneCEs[ch] = CE; | |
| 6281 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; | |
| 6282 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; | |
| 6283 | |
| 6284 // We're going to jump into contraction table, pick the elem
ents | |
| 6285 // and use them | |
| 6286 do { | |
| 6287 CE = *(coll->contractionCEs + | |
| 6288 (UCharOffset - coll->contractionIndex)); | |
| 6289 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG)
{ | |
| 6290 uint32_t size; | |
| 6291 uint32_t i; /* general counter */ | |
| 6292 uint32_t *CEOffset = (uint32_t *)coll->image+getExpa
nsionOffset(CE); /* find the offset to expansion table */ | |
| 6293 size = getExpansionCount(CE); | |
| 6294 //CE = *CEOffset++; | |
| 6295 if(size != 0) { /* if there are less than 16 element
s in expansion, we don't terminate */ | |
| 6296 for(i = 0; i<size; i++) { | |
| 6297 if(primShift < 0 || secShift < 0 || terShift
< 0) { | |
| 6298 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; | |
| 6299 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6300 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6301 break; | |
| 6302 } | |
| 6303 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
| 6304 } | |
| 6305 } else { /* else, we do */ | |
| 6306 while(*CEOffset != 0) { | |
| 6307 if(primShift < 0 || secShift < 0 || terShift
< 0) { | |
| 6308 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; | |
| 6309 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6310 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6311 break; | |
| 6312 } | |
| 6313 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
| 6314 } | |
| 6315 } | |
| 6316 contractionOffset++; | |
| 6317 } else if(CE < UCOL_NOT_FOUND) { | |
| 6318 ucol_addLatinOneEntry(coll, (UChar)contractionOffset
++, CE, &primShift, &secShift, &terShift); | |
| 6319 } else { | |
| 6320 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B
AIL_OUT_CE; | |
| 6321 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont
ractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6322 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co
ntractionOffset] = UCOL_BAIL_OUT_CE; | |
| 6323 contractionOffset++; | |
| 6324 } | |
| 6325 UCharOffset++; | |
| 6326 primShift = 24; secShift = 24; terShift = 24; | |
| 6327 if(contractionOffset == coll->latinOneTableLen) { // we
need to reallocate | |
| 6328 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT
ableLen, status)) { | |
| 6329 goto cleanup_after_failure; | |
| 6330 } | |
| 6331 } | |
| 6332 } while(*UCharOffset != 0xFFFF); | |
| 6333 } | |
| 6334 break;; | |
| 6335 case SPEC_PROC_TAG: | |
| 6336 { | |
| 6337 // 0xB7 is a precontext character defined in UCA5.1, a speci
al | |
| 6338 // handle is implemeted in order to save LatinOne table for | |
| 6339 // most locales. | |
| 6340 if (ch==0xb7) { | |
| 6341 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif
t, &terShift); | |
| 6342 } | |
| 6343 else { | |
| 6344 goto cleanup_after_failure; | |
| 6345 } | |
| 6346 } | |
| 6347 break; | |
| 6348 default: | |
| 6349 goto cleanup_after_failure; | |
| 6350 } | |
| 6351 } | |
| 6352 } | |
| 6353 // compact table | |
| 6354 if(contractionOffset < coll->latinOneTableLen) { | |
| 6355 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { | |
| 6356 goto cleanup_after_failure; | |
| 6357 } | |
| 6358 } | |
| 6359 ucol_closeElements(it); | |
| 6360 return result; | |
| 6361 | |
| 6362 cleanup_after_failure: | |
| 6363 // status should already be set before arriving here. | |
| 6364 coll->latinOneFailed = TRUE; | |
| 6365 ucol_closeElements(it); | |
| 6366 return FALSE; | |
| 6367 } | |
| 6368 | |
| 6369 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { | |
| 6370 if(U_SUCCESS(*status)) { | |
| 6371 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
| 6372 coll->caseSwitch = UCOL_CASE_SWITCH; | |
| 6373 } else { | |
| 6374 coll->caseSwitch = UCOL_NO_CASE_SWITCH; | |
| 6375 } | |
| 6376 | |
| 6377 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { | |
| 6378 coll->tertiaryMask = UCOL_REMOVE_CASE; | |
| 6379 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
| 6380 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /*
Should be 0x80 */ | |
| 6381 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; | |
| 6382 coll->tertiaryBottom = UCOL_COMMON_BOT3; | |
| 6383 } else { | |
| 6384 coll->tertiaryMask = UCOL_KEEP_CASE; | |
| 6385 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; | |
| 6386 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
| 6387 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; | |
| 6388 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; | |
| 6389 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; | |
| 6390 } else { | |
| 6391 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
| 6392 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; | |
| 6393 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; | |
| 6394 } | |
| 6395 } | |
| 6396 | |
| 6397 /* Set the compression values */ | |
| 6398 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBott
om - 1); | |
| 6399 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* w
e multilply double with int, but need only int */ | |
| 6400 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC
ount); | |
| 6401 | |
| 6402 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY | |
| 6403 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U
COL_NON_IGNORABLE) | |
| 6404 { | |
| 6405 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; | |
| 6406 } else { | |
| 6407 coll->sortKeyGen = ucol_calcSortKey; | |
| 6408 } | |
| 6409 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col
l->numericCollation == UCOL_OFF | |
| 6410 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF
ailed) | |
| 6411 { | |
| 6412 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { | |
| 6413 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build
ing latin1 table, we'll use it | |
| 6414 //fprintf(stderr, "F"); | |
| 6415 coll->latinOneUse = TRUE; | |
| 6416 } else { | |
| 6417 coll->latinOneUse = FALSE; | |
| 6418 } | |
| 6419 if(*status == U_UNSUPPORTED_ERROR) { | |
| 6420 *status = U_ZERO_ERROR; | |
| 6421 } | |
| 6422 } else { // latin1Table exists and it doesn't need to be regenerated
, just use it | |
| 6423 coll->latinOneUse = TRUE; | |
| 6424 } | |
| 6425 } else { | |
| 6426 coll->latinOneUse = FALSE; | |
| 6427 } | |
| 6428 } | |
| 6429 } | 311 } |
| 6430 | 312 |
| 6431 U_CAPI uint32_t U_EXPORT2 | 313 U_CAPI uint32_t U_EXPORT2 |
| 6432 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
e *status) { | 314 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
e *status) { |
| 6433 if(U_FAILURE(*status) || coll == NULL) { | 315 if(U_FAILURE(*status) || coll == NULL) { |
| 6434 return 0; | 316 return 0; |
| 6435 } | 317 } |
| 6436 if(len == -1) { | 318 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status); |
| 6437 len = u_strlen(varTop); | |
| 6438 } | |
| 6439 if(len == 0) { | |
| 6440 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6441 return 0; | |
| 6442 } | |
| 6443 | |
| 6444 if(coll->delegate!=NULL) { | |
| 6445 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); | |
| 6446 } | |
| 6447 | |
| 6448 | |
| 6449 collIterate s; | |
| 6450 IInit_collIterate(coll, varTop, len, &s, status); | |
| 6451 if(U_FAILURE(*status)) { | |
| 6452 return 0; | |
| 6453 } | |
| 6454 | |
| 6455 uint32_t CE = ucol_IGetNextCE(coll, &s, status); | |
| 6456 | |
| 6457 /* here we check if we have consumed all characters */ | |
| 6458 /* you can put in either one character or a contraction */ | |
| 6459 /* you shouldn't put more... */ | |
| 6460 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { | |
| 6461 *status = U_CE_NOT_FOUND_ERROR; | |
| 6462 return 0; | |
| 6463 } | |
| 6464 | |
| 6465 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); | |
| 6466 | |
| 6467 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { | |
| 6468 *status = U_PRIMARY_TOO_LONG_ERROR; | |
| 6469 return 0; | |
| 6470 } | |
| 6471 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { | |
| 6472 coll->variableTopValueisDefault = FALSE; | |
| 6473 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; | |
| 6474 } | |
| 6475 | |
| 6476 /* To avoid memory leak, free the offset buffer if necessary. */ | |
| 6477 ucol_freeOffsetBuffer(&s); | |
| 6478 | |
| 6479 return CE & UCOL_PRIMARYMASK; | |
| 6480 } | 319 } |
| 6481 | 320 |
| 6482 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode
*status) { | 321 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode
*status) { |
| 6483 if(U_FAILURE(*status) || coll == NULL) { | 322 if(U_FAILURE(*status) || coll == NULL) { |
| 6484 return 0; | 323 return 0; |
| 6485 } | 324 } |
| 6486 if(coll->delegate!=NULL) { | 325 return Collator::fromUCollator(coll)->getVariableTop(*status); |
| 6487 return ((const Collator*)coll->delegate)->getVariableTop(*status); | |
| 6488 } | |
| 6489 return coll->variableTopValue<<16; | |
| 6490 } | 326 } |
| 6491 | 327 |
| 6492 U_CAPI void U_EXPORT2 | 328 U_CAPI void U_EXPORT2 |
| 6493 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat
us) { | 329 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat
us) { |
| 6494 if(U_FAILURE(*status) || coll == NULL) { | 330 if(U_FAILURE(*status) || coll == NULL) { |
| 6495 return; | 331 return; |
| 6496 } | 332 } |
| 6497 | 333 Collator::fromUCollator(coll)->setVariableTop(varTop, *status); |
| 6498 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { | 334 } |
| 6499 coll->variableTopValueisDefault = FALSE; | 335 |
| 6500 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; | |
| 6501 } | |
| 6502 } | |
| 6503 /* Attribute setter API */ | |
| 6504 U_CAPI void U_EXPORT2 | 336 U_CAPI void U_EXPORT2 |
| 6505 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UErrorCode *status) { | 337 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UErrorCode *status) { |
| 6506 if(U_FAILURE(*status) || coll == NULL) { | 338 if(U_FAILURE(*status) || coll == NULL) { |
| 6507 return; | 339 return; |
| 6508 } | 340 } |
| 6509 | 341 |
| 6510 if(coll->delegate != NULL) { | 342 Collator::fromUCollator(coll)->setAttribute(attr, value, *status); |
| 6511 ((Collator*)coll->delegate)->setAttribute(attr,value,*status); | |
| 6512 return; | |
| 6513 } | |
| 6514 | |
| 6515 UColAttributeValue oldFrench = coll->frenchCollation; | |
| 6516 UColAttributeValue oldCaseFirst = coll->caseFirst; | |
| 6517 switch(attr) { | |
| 6518 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ | |
| 6519 if(value == UCOL_ON) { | |
| 6520 coll->numericCollation = UCOL_ON; | |
| 6521 coll->numericCollationisDefault = FALSE; | |
| 6522 } else if (value == UCOL_OFF) { | |
| 6523 coll->numericCollation = UCOL_OFF; | |
| 6524 coll->numericCollationisDefault = FALSE; | |
| 6525 } else if (value == UCOL_DEFAULT) { | |
| 6526 coll->numericCollationisDefault = TRUE; | |
| 6527 coll->numericCollation = (UColAttributeValue)coll->options->numericC
ollation; | |
| 6528 } else { | |
| 6529 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6530 } | |
| 6531 break; | |
| 6532 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan
a */ | |
| 6533 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { | |
| 6534 // This attribute is an implementation detail of the CLDR Japanese t
ailoring. | |
| 6535 // The implementation might change to use a different mechanism | |
| 6536 // to achieve the same Japanese sort order. | |
| 6537 // Since ICU 50, this attribute is not settable any more via API fun
ctions. | |
| 6538 } else { | |
| 6539 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6540 } | |
| 6541 break; | |
| 6542 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ | |
| 6543 if(value == UCOL_ON) { | |
| 6544 coll->frenchCollation = UCOL_ON; | |
| 6545 coll->frenchCollationisDefault = FALSE; | |
| 6546 } else if (value == UCOL_OFF) { | |
| 6547 coll->frenchCollation = UCOL_OFF; | |
| 6548 coll->frenchCollationisDefault = FALSE; | |
| 6549 } else if (value == UCOL_DEFAULT) { | |
| 6550 coll->frenchCollationisDefault = TRUE; | |
| 6551 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol
lation; | |
| 6552 } else { | |
| 6553 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6554 } | |
| 6555 break; | |
| 6556 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
| 6557 if(value == UCOL_SHIFTED) { | |
| 6558 coll->alternateHandling = UCOL_SHIFTED; | |
| 6559 coll->alternateHandlingisDefault = FALSE; | |
| 6560 } else if (value == UCOL_NON_IGNORABLE) { | |
| 6561 coll->alternateHandling = UCOL_NON_IGNORABLE; | |
| 6562 coll->alternateHandlingisDefault = FALSE; | |
| 6563 } else if (value == UCOL_DEFAULT) { | |
| 6564 coll->alternateHandlingisDefault = TRUE; | |
| 6565 coll->alternateHandling = (UColAttributeValue)coll->options->alterna
teHandling ; | |
| 6566 } else { | |
| 6567 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6568 } | |
| 6569 break; | |
| 6570 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
| 6571 if(value == UCOL_LOWER_FIRST) { | |
| 6572 coll->caseFirst = UCOL_LOWER_FIRST; | |
| 6573 coll->caseFirstisDefault = FALSE; | |
| 6574 } else if (value == UCOL_UPPER_FIRST) { | |
| 6575 coll->caseFirst = UCOL_UPPER_FIRST; | |
| 6576 coll->caseFirstisDefault = FALSE; | |
| 6577 } else if (value == UCOL_OFF) { | |
| 6578 coll->caseFirst = UCOL_OFF; | |
| 6579 coll->caseFirstisDefault = FALSE; | |
| 6580 } else if (value == UCOL_DEFAULT) { | |
| 6581 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; | |
| 6582 coll->caseFirstisDefault = TRUE; | |
| 6583 } else { | |
| 6584 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6585 } | |
| 6586 break; | |
| 6587 case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
| 6588 if(value == UCOL_ON) { | |
| 6589 coll->caseLevel = UCOL_ON; | |
| 6590 coll->caseLevelisDefault = FALSE; | |
| 6591 } else if (value == UCOL_OFF) { | |
| 6592 coll->caseLevel = UCOL_OFF; | |
| 6593 coll->caseLevelisDefault = FALSE; | |
| 6594 } else if (value == UCOL_DEFAULT) { | |
| 6595 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; | |
| 6596 coll->caseLevelisDefault = TRUE; | |
| 6597 } else { | |
| 6598 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6599 } | |
| 6600 break; | |
| 6601 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
| 6602 if(value == UCOL_ON) { | |
| 6603 coll->normalizationMode = UCOL_ON; | |
| 6604 coll->normalizationModeisDefault = FALSE; | |
| 6605 initializeFCD(status); | |
| 6606 } else if (value == UCOL_OFF) { | |
| 6607 coll->normalizationMode = UCOL_OFF; | |
| 6608 coll->normalizationModeisDefault = FALSE; | |
| 6609 } else if (value == UCOL_DEFAULT) { | |
| 6610 coll->normalizationModeisDefault = TRUE; | |
| 6611 coll->normalizationMode = (UColAttributeValue)coll->options->normali
zationMode; | |
| 6612 if(coll->normalizationMode == UCOL_ON) { | |
| 6613 initializeFCD(status); | |
| 6614 } | |
| 6615 } else { | |
| 6616 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6617 } | |
| 6618 break; | |
| 6619 case UCOL_STRENGTH: /* attribute for strength */ | |
| 6620 if (value == UCOL_DEFAULT) { | |
| 6621 coll->strengthisDefault = TRUE; | |
| 6622 coll->strength = (UColAttributeValue)coll->options->strength; | |
| 6623 } else if (value <= UCOL_IDENTICAL) { | |
| 6624 coll->strengthisDefault = FALSE; | |
| 6625 coll->strength = value; | |
| 6626 } else { | |
| 6627 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
| 6628 } | |
| 6629 break; | |
| 6630 case UCOL_ATTRIBUTE_COUNT: | |
| 6631 default: | |
| 6632 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6633 break; | |
| 6634 } | |
| 6635 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { | |
| 6636 coll->latinOneRegenTable = TRUE; | |
| 6637 } else { | |
| 6638 coll->latinOneRegenTable = FALSE; | |
| 6639 } | |
| 6640 ucol_updateInternalState(coll, status); | |
| 6641 } | 343 } |
| 6642 | 344 |
| 6643 U_CAPI UColAttributeValue U_EXPORT2 | 345 U_CAPI UColAttributeValue U_EXPORT2 |
| 6644 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
{ | 346 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
{ |
| 6645 if(U_FAILURE(*status) || coll == NULL) { | 347 if(U_FAILURE(*status) || coll == NULL) { |
| 6646 return UCOL_DEFAULT; | 348 return UCOL_DEFAULT; |
| 6647 } | 349 } |
| 6648 | 350 |
| 6649 if(coll->delegate != NULL) { | 351 return Collator::fromUCollator(coll)->getAttribute(attr, *status); |
| 6650 return ((Collator*)coll->delegate)->getAttribute(attr,*status); | |
| 6651 } | |
| 6652 | |
| 6653 switch(attr) { | |
| 6654 case UCOL_NUMERIC_COLLATION: | |
| 6655 return coll->numericCollation; | |
| 6656 case UCOL_HIRAGANA_QUATERNARY_MODE: | |
| 6657 return coll->hiraganaQ; | |
| 6658 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ | |
| 6659 return coll->frenchCollation; | |
| 6660 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
| 6661 return coll->alternateHandling; | |
| 6662 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
| 6663 return coll->caseFirst; | |
| 6664 case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
| 6665 return coll->caseLevel; | |
| 6666 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
| 6667 return coll->normalizationMode; | |
| 6668 case UCOL_STRENGTH: /* attribute for strength */ | |
| 6669 return coll->strength; | |
| 6670 case UCOL_ATTRIBUTE_COUNT: | |
| 6671 default: | |
| 6672 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6673 break; | |
| 6674 } | |
| 6675 return UCOL_DEFAULT; | |
| 6676 } | 352 } |
| 6677 | 353 |
| 6678 U_CAPI void U_EXPORT2 | 354 U_CAPI void U_EXPORT2 |
| 6679 ucol_setStrength( UCollator *coll, | 355 ucol_setStrength( UCollator *coll, |
| 6680 UCollationStrength strength) | 356 UCollationStrength strength) |
| 6681 { | 357 { |
| 6682 UErrorCode status = U_ZERO_ERROR; | 358 UErrorCode status = U_ZERO_ERROR; |
| 6683 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); | 359 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); |
| 6684 } | 360 } |
| 6685 | 361 |
| 6686 U_CAPI UCollationStrength U_EXPORT2 | 362 U_CAPI UCollationStrength U_EXPORT2 |
| 6687 ucol_getStrength(const UCollator *coll) | 363 ucol_getStrength(const UCollator *coll) |
| 6688 { | 364 { |
| 6689 UErrorCode status = U_ZERO_ERROR; | 365 UErrorCode status = U_ZERO_ERROR; |
| 6690 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); | 366 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); |
| 6691 } | 367 } |
| 6692 | 368 |
| 6693 U_CAPI int32_t U_EXPORT2 | 369 U_CAPI int32_t U_EXPORT2 |
| 6694 ucol_getReorderCodes(const UCollator *coll, | 370 ucol_getReorderCodes(const UCollator *coll, |
| 6695 int32_t *dest, | 371 int32_t *dest, |
| 6696 int32_t destCapacity, | 372 int32_t destCapacity, |
| 6697 UErrorCode *status) { | 373 UErrorCode *status) { |
| 6698 if (U_FAILURE(*status)) { | 374 if (U_FAILURE(*status)) { |
| 6699 return 0; | 375 return 0; |
| 6700 } | 376 } |
| 6701 | 377 |
| 6702 if(coll->delegate!=NULL) { | 378 return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *s
tatus); |
| 6703 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapaci
ty, *status); | |
| 6704 } | |
| 6705 | |
| 6706 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
| 6707 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6708 return 0; | |
| 6709 } | |
| 6710 | |
| 6711 #ifdef UCOL_DEBUG | |
| 6712 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); | |
| 6713 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLe
ngth); | |
| 6714 #endif | |
| 6715 | |
| 6716 if (coll->reorderCodesLength > destCapacity) { | |
| 6717 *status = U_BUFFER_OVERFLOW_ERROR; | |
| 6718 return coll->reorderCodesLength; | |
| 6719 } | |
| 6720 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { | |
| 6721 dest[i] = coll->reorderCodes[i]; | |
| 6722 } | |
| 6723 return coll->reorderCodesLength; | |
| 6724 } | 379 } |
| 6725 | 380 |
| 6726 U_CAPI void U_EXPORT2 | 381 U_CAPI void U_EXPORT2 |
| 6727 ucol_setReorderCodes(UCollator* coll, | 382 ucol_setReorderCodes(UCollator* coll, |
| 6728 const int32_t* reorderCodes, | 383 const int32_t* reorderCodes, |
| 6729 int32_t reorderCodesLength, | 384 int32_t reorderCodesLength, |
| 6730 UErrorCode *status) { | 385 UErrorCode *status) { |
| 6731 if (U_FAILURE(*status)) { | 386 if (U_FAILURE(*status)) { |
| 6732 return; | 387 return; |
| 6733 } | 388 } |
| 6734 | 389 |
| 6735 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NUL
L)) { | 390 Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLen
gth, *status); |
| 6736 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6737 return; | |
| 6738 } | |
| 6739 | |
| 6740 if(coll->delegate!=NULL) { | |
| 6741 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLen
gth, *status); | |
| 6742 return; | |
| 6743 } | |
| 6744 | |
| 6745 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { | |
| 6746 uprv_free(coll->reorderCodes); | |
| 6747 } | |
| 6748 coll->reorderCodes = NULL; | |
| 6749 coll->freeReorderCodesOnClose = FALSE; | |
| 6750 coll->reorderCodesLength = 0; | |
| 6751 if (reorderCodesLength == 0) { | |
| 6752 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutat
ionTableOnClose == TRUE) { | |
| 6753 uprv_free(coll->leadBytePermutationTable); | |
| 6754 } | |
| 6755 coll->leadBytePermutationTable = NULL; | |
| 6756 coll->freeLeadBytePermutationTableOnClose = FALSE; | |
| 6757 return; | |
| 6758 } | |
| 6759 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int3
2_t)); | |
| 6760 if (coll->reorderCodes == NULL) { | |
| 6761 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 6762 return; | |
| 6763 } | |
| 6764 coll->freeReorderCodesOnClose = TRUE; | |
| 6765 for (int32_t i = 0; i < reorderCodesLength; i++) { | |
| 6766 coll->reorderCodes[i] = reorderCodes[i]; | |
| 6767 } | |
| 6768 coll->reorderCodesLength = reorderCodesLength; | |
| 6769 ucol_buildPermutationTable(coll, status); | |
| 6770 } | 391 } |
| 6771 | 392 |
| 6772 U_CAPI int32_t U_EXPORT2 | 393 U_CAPI int32_t U_EXPORT2 |
| 6773 ucol_getEquivalentReorderCodes(int32_t reorderCode, | 394 ucol_getEquivalentReorderCodes(int32_t reorderCode, |
| 6774 int32_t* dest, | 395 int32_t* dest, |
| 6775 int32_t destCapacity, | 396 int32_t destCapacity, |
| 6776 UErrorCode *pErrorCode) { | 397 UErrorCode *pErrorCode) { |
| 6777 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; | 398 return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity,
*pErrorCode); |
| 6778 uint16_t leadBytes[256]; | 399 } |
| 6779 int leadBytesCount; | |
| 6780 int leadByteIndex; | |
| 6781 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; | |
| 6782 int reorderCodesForLeadByteCount; | |
| 6783 int reorderCodeIndex; | |
| 6784 | |
| 6785 int32_t equivalentCodesCount = 0; | |
| 6786 int setIndex; | |
| 6787 | |
| 6788 if (U_FAILURE(*pErrorCode)) { | |
| 6789 return 0; | |
| 6790 } | |
| 6791 | |
| 6792 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
| 6793 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
| 6794 return 0; | |
| 6795 } | |
| 6796 | |
| 6797 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); | |
| 6798 | |
| 6799 const UCollator* uca = ucol_initUCA(pErrorCode); | |
| 6800 if (U_FAILURE(*pErrorCode)) { | |
| 6801 » return 0; | |
| 6802 } | |
| 6803 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes
, 256); | |
| 6804 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { | |
| 6805 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( | |
| 6806 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE
_LIMIT); | |
| 6807 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCou
nt; reorderCodeIndex++) { | |
| 6808 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true
; | |
| 6809 } | |
| 6810 } | |
| 6811 | |
| 6812 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { | |
| 6813 if (equivalentCodesSet[setIndex] == true) { | |
| 6814 equivalentCodesCount++; | |
| 6815 } | |
| 6816 } | |
| 6817 | |
| 6818 if (destCapacity == 0) { | |
| 6819 return equivalentCodesCount; | |
| 6820 } | |
| 6821 | |
| 6822 equivalentCodesCount = 0; | |
| 6823 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { | |
| 6824 if (equivalentCodesSet[setIndex] == true) { | |
| 6825 dest[equivalentCodesCount++] = setIndex; | |
| 6826 if (equivalentCodesCount >= destCapacity) { | |
| 6827 break; | |
| 6828 } | |
| 6829 } | |
| 6830 } | |
| 6831 return equivalentCodesCount; | |
| 6832 } | |
| 6833 | |
| 6834 | |
| 6835 /****************************************************************************/ | |
| 6836 /* Following are misc functions */ | |
| 6837 /* there are new APIs and some compatibility APIs */ | |
| 6838 /****************************************************************************/ | |
| 6839 | 400 |
| 6840 U_CAPI void U_EXPORT2 | 401 U_CAPI void U_EXPORT2 |
| 6841 ucol_getVersion(const UCollator* coll, | 402 ucol_getVersion(const UCollator* coll, |
| 6842 UVersionInfo versionInfo) | 403 UVersionInfo versionInfo) |
| 6843 { | 404 { |
| 6844 if(coll->delegate!=NULL) { | 405 Collator::fromUCollator(coll)->getVersion(versionInfo); |
| 6845 ((const Collator*)coll->delegate)->getVersion(versionInfo); | |
| 6846 return; | |
| 6847 } | |
| 6848 /* RunTime version */ | |
| 6849 uint8_t rtVersion = UCOL_RUNTIME_VERSION; | |
| 6850 /* Builder version*/ | |
| 6851 uint8_t bdVersion = coll->image->version[0]; | |
| 6852 | |
| 6853 /* Charset Version. Need to get the version from cnv files | |
| 6854 * makeconv should populate cnv files with version and | |
| 6855 * an api has to be provided in ucnv.h to obtain this version | |
| 6856 */ | |
| 6857 uint8_t csVersion = 0; | |
| 6858 | |
| 6859 /* combine the version info */ | |
| 6860 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersi
on)); | |
| 6861 | |
| 6862 /* Tailoring rules */ | |
| 6863 versionInfo[0] = (uint8_t)(cmbVersion>>8); | |
| 6864 versionInfo[1] = (uint8_t)cmbVersion; | |
| 6865 versionInfo[2] = coll->image->version[1]; | |
| 6866 if(coll->UCA) { | |
| 6867 /* Include the minor number when getting the UCA version. (major & 1f) <
< 3 | (minor & 7) */ | |
| 6868 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->
UCA->image->UCAVersion[1] & 0x07); | |
| 6869 } else { | |
| 6870 versionInfo[3] = 0; | |
| 6871 } | |
| 6872 } | |
| 6873 | |
| 6874 | |
| 6875 /* This internal API checks whether a character is tailored or not */ | |
| 6876 U_CAPI UBool U_EXPORT2 | |
| 6877 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { | |
| 6878 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { | |
| 6879 return FALSE; | |
| 6880 } | |
| 6881 | |
| 6882 uint32_t CE = UCOL_NOT_FOUND; | |
| 6883 const UChar *ContractionStart = NULL; | |
| 6884 if(u < 0x100) { /* latin-1 */ | |
| 6885 CE = coll->latinOneMapping[u]; | |
| 6886 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { | |
| 6887 return FALSE; | |
| 6888 } | |
| 6889 } else { /* regular */ | |
| 6890 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); | |
| 6891 } | |
| 6892 | |
| 6893 if(isContraction(CE)) { | |
| 6894 ContractionStart = (UChar *)coll->image+getContractOffset(CE); | |
| 6895 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)
); | |
| 6896 } | |
| 6897 | |
| 6898 return (UBool)(CE != UCOL_NOT_FOUND); | |
| 6899 } | |
| 6900 | |
| 6901 | |
| 6902 /****************************************************************************/ | |
| 6903 /* Following are the string compare functions */ | |
| 6904 /* */ | |
| 6905 /****************************************************************************/ | |
| 6906 | |
| 6907 | |
| 6908 /* ucol_checkIdent internal function. Does byte level string compare. */ | |
| 6909 /* Used by strcoll if strength == identical and strings */ | |
| 6910 /* are otherwise equal. */ | |
| 6911 /* */ | |
| 6912 /* Comparison must be done on NFD normalized strings. */ | |
| 6913 /* FCD is not good enough. */ | |
| 6914 | |
| 6915 static | |
| 6916 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo
l normalize, UErrorCode *status) | |
| 6917 { | |
| 6918 // When we arrive here, we can have normal strings or UCharIterators. Curren
tly they are both | |
| 6919 // of same type, but that doesn't really mean that it will stay that way. | |
| 6920 int32_t comparison; | |
| 6921 | |
| 6922 if (sColl->flags & UCOL_USE_ITERATOR) { | |
| 6923 // The division for the array length may truncate the array size to | |
| 6924 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too
high | |
| 6925 // for all platforms anyway. | |
| 6926 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 6927 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 6928 UNormIterator *sNIt = NULL, *tNIt = NULL; | |
| 6929 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); | |
| 6930 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); | |
| 6931 sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
| 6932 tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
| 6933 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta
tus); | |
| 6934 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta
tus); | |
| 6935 comparison = u_strCompareIter(sIt, tIt, TRUE); | |
| 6936 unorm_closeIter(sNIt); | |
| 6937 unorm_closeIter(tNIt); | |
| 6938 } else { | |
| 6939 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl-
>endp - sColl->string) : -1; | |
| 6940 const UChar *sBuf = sColl->string; | |
| 6941 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl-
>endp - tColl->string) : -1; | |
| 6942 const UChar *tBuf = tColl->string; | |
| 6943 | |
| 6944 if (normalize) { | |
| 6945 *status = U_ZERO_ERROR; | |
| 6946 // Note: We could use Normalizer::compare() or similar, but for shor
t strings | |
| 6947 // which may not be in FCD it might be faster to just NFD them. | |
| 6948 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha
n | |
| 6949 // NFD'ing immediately might be faster for long strings, | |
| 6950 // but string comparison is usually done on relatively short strings
. | |
| 6951 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN
) == 0, sBuf, sLen), | |
| 6952 sColl->writableBuffer, | |
| 6953 *status); | |
| 6954 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN
) == 0, tBuf, tLen), | |
| 6955 tColl->writableBuffer, | |
| 6956 *status); | |
| 6957 if(U_FAILURE(*status)) { | |
| 6958 return UCOL_LESS; | |
| 6959 } | |
| 6960 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ
ableBuffer); | |
| 6961 } else { | |
| 6962 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); | |
| 6963 } | |
| 6964 } | |
| 6965 | |
| 6966 if (comparison < 0) { | |
| 6967 return UCOL_LESS; | |
| 6968 } else if (comparison == 0) { | |
| 6969 return UCOL_EQUAL; | |
| 6970 } else /* comparison > 0 */ { | |
| 6971 return UCOL_GREATER; | |
| 6972 } | |
| 6973 } | |
| 6974 | |
| 6975 /* CEBuf - A struct and some inline functions to handle the saving */ | |
| 6976 /* of CEs in a buffer within ucol_strcoll */ | |
| 6977 | |
| 6978 #define UCOL_CEBUF_SIZE 512 | |
| 6979 typedef struct ucol_CEBuf { | |
| 6980 uint32_t *buf; | |
| 6981 uint32_t *endp; | |
| 6982 uint32_t *pos; | |
| 6983 uint32_t localArray[UCOL_CEBUF_SIZE]; | |
| 6984 } ucol_CEBuf; | |
| 6985 | |
| 6986 | |
| 6987 static | |
| 6988 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { | |
| 6989 (b)->buf = (b)->pos = (b)->localArray; | |
| 6990 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; | |
| 6991 } | |
| 6992 | |
| 6993 static | |
| 6994 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { | |
| 6995 uint32_t oldSize; | |
| 6996 uint32_t newSize; | |
| 6997 uint32_t *newBuf; | |
| 6998 | |
| 6999 ci->flags |= UCOL_ITER_ALLOCATED; | |
| 7000 oldSize = (uint32_t)(b->pos - b->buf); | |
| 7001 newSize = oldSize * 2; | |
| 7002 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); | |
| 7003 if(newBuf == NULL) { | |
| 7004 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 7005 } | |
| 7006 else { | |
| 7007 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); | |
| 7008 if (b->buf != b->localArray) { | |
| 7009 uprv_free(b->buf); | |
| 7010 } | |
| 7011 b->buf = newBuf; | |
| 7012 b->endp = b->buf + newSize; | |
| 7013 b->pos = b->buf + oldSize; | |
| 7014 } | |
| 7015 } | |
| 7016 | |
| 7017 static | |
| 7018 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCo
de *status) { | |
| 7019 if (b->pos == b->endp) { | |
| 7020 ucol_CEBuf_Expand(b, ci, status); | |
| 7021 } | |
| 7022 if (U_SUCCESS(*status)) { | |
| 7023 *(b)->pos++ = ce; | |
| 7024 } | |
| 7025 } | |
| 7026 | |
| 7027 /* This is a trick string compare function that goes in and uses sortkeys to com
pare */ | |
| 7028 /* It is used when compare gets in trouble and needs to bail out
*/ | |
| 7029 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, | |
| 7030 collIterate *tColl, | |
| 7031 UErrorCode *status) | |
| 7032 { | |
| 7033 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; | |
| 7034 uint8_t *sourceKeyP = sourceKey; | |
| 7035 uint8_t *targetKeyP = targetKey; | |
| 7036 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; | |
| 7037 const UCollator *coll = sColl->coll; | |
| 7038 const UChar *source = NULL; | |
| 7039 const UChar *target = NULL; | |
| 7040 int32_t result = UCOL_EQUAL; | |
| 7041 UnicodeString sourceString, targetString; | |
| 7042 int32_t sourceLength; | |
| 7043 int32_t targetLength; | |
| 7044 | |
| 7045 if(sColl->flags & UCOL_USE_ITERATOR) { | |
| 7046 sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
| 7047 tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
| 7048 UChar32 c; | |
| 7049 while((c=sColl->iterator->next(sColl->iterator))>=0) { | |
| 7050 sourceString.append((UChar)c); | |
| 7051 } | |
| 7052 while((c=tColl->iterator->next(tColl->iterator))>=0) { | |
| 7053 targetString.append((UChar)c); | |
| 7054 } | |
| 7055 source = sourceString.getBuffer(); | |
| 7056 sourceLength = sourceString.length(); | |
| 7057 target = targetString.getBuffer(); | |
| 7058 targetLength = targetString.length(); | |
| 7059 } else { // no iterators | |
| 7060 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo
ll->string):-1; | |
| 7061 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo
ll->string):-1; | |
| 7062 source = sColl->string; | |
| 7063 target = tColl->string; | |
| 7064 } | |
| 7065 | |
| 7066 | |
| 7067 | |
| 7068 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc
eKeyLen); | |
| 7069 if(sourceKeyLen > UCOL_MAX_BUFFER) { | |
| 7070 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); | |
| 7071 if(sourceKeyP == NULL) { | |
| 7072 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 7073 goto cleanup_and_do_compare; | |
| 7074 } | |
| 7075 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s
ourceKeyLen); | |
| 7076 } | |
| 7077 | |
| 7078 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe
tKeyLen); | |
| 7079 if(targetKeyLen > UCOL_MAX_BUFFER) { | |
| 7080 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); | |
| 7081 if(targetKeyP == NULL) { | |
| 7082 *status = U_MEMORY_ALLOCATION_ERROR; | |
| 7083 goto cleanup_and_do_compare; | |
| 7084 } | |
| 7085 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t
argetKeyLen); | |
| 7086 } | |
| 7087 | |
| 7088 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); | |
| 7089 | |
| 7090 cleanup_and_do_compare: | |
| 7091 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { | |
| 7092 uprv_free(sourceKeyP); | |
| 7093 } | |
| 7094 | |
| 7095 if(targetKeyP != NULL && targetKeyP != targetKey) { | |
| 7096 uprv_free(targetKeyP); | |
| 7097 } | |
| 7098 | |
| 7099 if(result<0) { | |
| 7100 return UCOL_LESS; | |
| 7101 } else if(result>0) { | |
| 7102 return UCOL_GREATER; | |
| 7103 } else { | |
| 7104 return UCOL_EQUAL; | |
| 7105 } | |
| 7106 } | |
| 7107 | |
| 7108 | |
// Compares the text behind two collIterate contexts and returns UCOL_LESS,
// UCOL_EQUAL or UCOL_GREATER.
//
// Outline, as implemented below:
//  - Collator attributes (strength, caseLevel, frenchCollation,
//    alternateHandling, hiraganaQ) are read once into local flags.
//  - If both the Hiragana quaternary option and shifted handling are on, the
//    whole comparison is delegated to ucol_compareUsingSortKeys().
//  - Primary phase: CEs are fetched with ucol_IGetNextCE() and stored with
//    UCOL_CEBUF_PUT() into sCEs/tCEs while primaries are compared. A primary
//    difference ends the comparison immediately.
//  - The later levels (secondary - forward or French/backward -, case,
//    tertiary, shifted quaternary) re-read the stored CEs from the buffers.
//  - For UCOL_IDENTICAL strength, ucol_checkIdent() provides the final result.
//  - All exits after buffer setup go through commonReturn, which frees
//    heap-allocated CE buffers.
//
// NOTE(review): *status is not inspected inside the primary loops. Since
// UCOL_CEBUF_PUT() stores nothing once *status is a failure, the end-of-CEs
// marker may never reach the buffer after an allocation failure, while the
// later levels scan the buffers until they see that marker - confirm whether
// callers can actually hit this.
| 7109 static UCollationResult | |
| 7110 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) | |
| 7111 { | |
| 7112 U_ALIGN_CODE(16); | |
| 7113 | |
| 7114 const UCollator *coll = sColl->coll; | |
| 7115 | |
| 7116 | |
| 7117 // setting up the collator parameters | |
| 7118 UColAttributeValue strength = coll->strength; | |
| 7119 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); | |
| 7120 | |
| 7121 UBool checkSecTer = initialCheckSecTer; | |
| 7122 UBool checkTertiary = (strength >= UCOL_TERTIARY); | |
| 7123 UBool checkQuad = (strength >= UCOL_QUATERNARY); | |
| 7124 UBool checkIdent = (strength == UCOL_IDENTICAL); | |
| 7125 UBool checkCase = (coll->caseLevel == UCOL_ON); | |
| 7126 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; | |
| 7127 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
| 7128 UBool qShifted = shifted && checkQuad; | |
| 7129 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; | |
| 7130 | |
// Hiragana quaternary combined with shifted handling is not handled by the
// code below; that combination is compared through sort keys instead.
| 7131 if(doHiragana && shifted) { | |
| 7132 return (ucol_compareUsingSortKeys(sColl, tColl, status)); | |
| 7133 } | |
| 7134 uint8_t caseSwitch = coll->caseSwitch; | |
| 7135 uint8_t tertiaryMask = coll->tertiaryMask; | |
| 7136 | |
| 7137 // This is the lowest primary value that will not be ignored if shifted | |
| 7138 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; | |
| 7139 | |
| 7140 UCollationResult result = UCOL_EQUAL; | |
| 7141 UCollationResult hirResult = UCOL_EQUAL; | |
| 7142 | |
| 7143 // Preparing the CE buffers. They will be filled during the primary phase | |
| 7144 ucol_CEBuf sCEs; | |
| 7145 ucol_CEBuf tCEs; | |
| 7146 UCOL_INIT_CEBUF(&sCEs); | |
| 7147 UCOL_INIT_CEBUF(&tCEs); | |
| 7148 | |
| 7149 uint32_t secS = 0, secT = 0; | |
| 7150 uint32_t sOrder=0, tOrder=0; | |
| 7151 | |
| 7152 // Non shifted primary processing is quite simple | |
| 7153 if(!shifted) { | |
| 7154 for(;;) { | |
| 7155 // We fetch CEs until we hit a non ignorable primary or end. | |
| 7156 uint32_t sPrimary; | |
| 7157 do { | |
| 7158 // We get the next CE | |
| 7159 sOrder = ucol_IGetNextCE(coll, sColl, status); | |
| 7160 // Stuff it in the buffer | |
| 7161 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7162 // And keep just the primary part. | |
| 7163 sPrimary = sOrder & UCOL_PRIMARYMASK; | |
| 7164 } while(sPrimary == 0); | |
| 7165 | |
| 7166 // see the comments on the above block | |
| 7167 uint32_t tPrimary; | |
| 7168 do { | |
| 7169 tOrder = ucol_IGetNextCE(coll, tColl, status); | |
| 7170 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7171 tPrimary = tOrder & UCOL_PRIMARYMASK; | |
| 7172 } while(tPrimary == 0); | |
| 7173 | |
| 7174 // if both primaries are the same | |
| 7175 if(sPrimary == tPrimary) { | |
| 7176 // and there are no more CEs, we advance to the next level | |
| 7177 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) { | |
| 7178 break; | |
| 7179 } | |
// Only the first Hiragana mismatch is remembered; it is applied after the
// tertiary level, and only if no earlier level decided the result.
| 7180 if(doHiragana && hirResult == UCOL_EQUAL) { | |
| 7181 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO
L_WAS_HIRAGANA)) { | |
| 7182 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl
->flags & UCOL_WAS_HIRAGANA)) | |
| 7183 ? UCOL_LESS:UCOL_GREATER; | |
| 7184 } | |
| 7185 } | |
| 7186 } else { | |
| 7187 // only need to check one for continuation | |
| 7188 // if one is then the other must be or the preceding CE would be
a prefix of the other | |
| 7189 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO
rder)) { | |
| 7190 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] <<
24) | (sPrimary & 0x00FFFFFF); | |
| 7191 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] <<
24) | (tPrimary & 0x00FFFFFF); | |
| 7192 } | |
| 7193 // if two primaries are different, we are done | |
| 7194 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER; | |
| 7195 goto commonReturn; | |
| 7196 } | |
| 7197 } // no primary difference... do the rest from the buffers | |
| 7198 } else { // shifted - do a slightly more complicated processing :) | |
| 7199 for(;;) { | |
| 7200 UBool sInShifted = FALSE; | |
| 7201 UBool tInShifted = FALSE; | |
| 7202 // This version of code can be refactored. However, it seems easier
to understand this way. | |
| 7203 // Source loop. Same as the target loop. | |
| 7204 for(;;) { | |
| 7205 sOrder = ucol_IGetNextCE(coll, sColl, status); | |
| 7206 if(sOrder == UCOL_NO_MORE_CES) { | |
| 7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7208 break; | |
| 7209 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMA
SK) == 0)) { | |
| 7210 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ | |
| 7211 continue; | |
| 7212 } else if(isContinuation(sOrder)) { | |
| 7213 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ | |
| 7214 if(sInShifted) { | |
| 7215 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ | |
| 7216 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7217 continue; | |
| 7218 } else { | |
| 7219 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7220 break; | |
| 7221 } | |
| 7222 } else { /* Just lower level values */ | |
| 7223 if(sInShifted) { | |
| 7224 continue; | |
| 7225 } else { | |
| 7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7227 continue; | |
| 7228 } | |
| 7229 } | |
| 7230 } else { /* regular */ | |
| 7231 if(coll->leadBytePermutationTable != NULL){ | |
| 7232 sOrder = (coll->leadBytePermutationTable[sOrder>>24] <<
24) | (sOrder & 0x00FFFFFF); | |
| 7233 } | |
| 7234 if((sOrder & UCOL_PRIMARYMASK) > LVT) { | |
| 7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7236 break; | |
| 7237 } else { | |
| 7238 if((sOrder & UCOL_PRIMARYMASK) > 0) { | |
| 7239 sInShifted = TRUE; | |
| 7240 sOrder &= UCOL_PRIMARYMASK; | |
| 7241 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7242 continue; | |
| 7243 } else { | |
| 7244 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
| 7245 sInShifted = FALSE; | |
| 7246 continue; | |
| 7247 } | |
| 7248 } | |
| 7249 } | |
| 7250 } | |
// Only the primary bits of the CE that ended the source loop take part in
// the comparison further down.
| 7251 sOrder &= UCOL_PRIMARYMASK; | |
| 7252 sInShifted = FALSE; | |
| 7253 | |
| 7254 for(;;) { | |
| 7255 tOrder = ucol_IGetNextCE(coll, tColl, status); | |
| 7256 if(tOrder == UCOL_NO_MORE_CES) { | |
| 7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7258 break; | |
| 7259 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMA
SK) == 0)) { | |
| 7260 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ | |
| 7261 continue; | |
| 7262 } else if(isContinuation(tOrder)) { | |
| 7263 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ | |
| 7264 if(tInShifted) { | |
| 7265 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ | |
| 7266 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7267 continue; | |
| 7268 } else { | |
| 7269 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7270 break; | |
| 7271 } | |
| 7272 } else { /* Just lower level values */ | |
| 7273 if(tInShifted) { | |
| 7274 continue; | |
| 7275 } else { | |
| 7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7277 continue; | |
| 7278 } | |
| 7279 } | |
| 7280 } else { /* regular */ | |
| 7281 if(coll->leadBytePermutationTable != NULL){ | |
| 7282 tOrder = (coll->leadBytePermutationTable[tOrder>>24] <<
24) | (tOrder & 0x00FFFFFF); | |
| 7283 } | |
| 7284 if((tOrder & UCOL_PRIMARYMASK) > LVT) { | |
| 7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7286 break; | |
| 7287 } else { | |
| 7288 if((tOrder & UCOL_PRIMARYMASK) > 0) { | |
| 7289 tInShifted = TRUE; | |
| 7290 tOrder &= UCOL_PRIMARYMASK; | |
| 7291 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7292 continue; | |
| 7293 } else { | |
| 7294 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
| 7295 tInShifted = FALSE; | |
| 7296 continue; | |
| 7297 } | |
| 7298 } | |
| 7299 } | |
| 7300 } | |
| 7301 tOrder &= UCOL_PRIMARYMASK; | |
| 7302 tInShifted = FALSE; | |
| 7303 | |
| 7304 if(sOrder == tOrder) { | |
| 7305 /* | |
| 7306 if(doHiragana && hirResult == UCOL_EQUAL) { | |
| 7307 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_
HIRAGANA)) { | |
| 7308 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags &
UCOL_WAS_HIRAGANA)) | |
| 7309 ? UCOL_LESS:UCOL_GREATER; | |
| 7310 } | |
| 7311 } | |
| 7312 */ | |
| 7313 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { | |
| 7314 break; | |
| 7315 } else { | |
| 7316 sOrder = 0; | |
| 7317 tOrder = 0; | |
| 7318 continue; | |
| 7319 } | |
| 7320 } else { | |
| 7321 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; | |
| 7322 goto commonReturn; | |
| 7323 } | |
| 7324 } /* no primary difference... do the rest from the buffers */ | |
| 7325 } | |
| 7326 | |
| 7327 /* now, we're gonna reexamine collected CEs */ | |
| 7328 uint32_t *sCE; | |
| 7329 uint32_t *tCE; | |
| 7330 | |
| 7331 /* This is the secondary level of comparison */ | |
| 7332 if(checkSecTer) { | |
| 7333 if(!isFrenchSec) { /* normal */ | |
| 7334 sCE = sCEs.buf; | |
| 7335 tCE = tCEs.buf; | |
| 7336 for(;;) { | |
| 7337 while (secS == 0) { | |
| 7338 secS = *(sCE++) & UCOL_SECONDARYMASK; | |
| 7339 } | |
| 7340 | |
| 7341 while(secT == 0) { | |
| 7342 secT = *(tCE++) & UCOL_SECONDARYMASK; | |
| 7343 } | |
| 7344 | |
| 7345 if(secS == secT) { | |
| 7346 if(secS == UCOL_NO_MORE_CES_SECONDARY) { | |
| 7347 break; | |
| 7348 } else { | |
| 7349 secS = 0; secT = 0; | |
| 7350 continue; | |
| 7351 } | |
| 7352 } else { | |
| 7353 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
| 7354 goto commonReturn; | |
| 7355 } | |
| 7356 } | |
| 7357 } else { /* do the French */ | |
| 7358 uint32_t *sCESave = NULL; | |
| 7359 uint32_t *tCESave = NULL; | |
// French secondaries are read back to front. pos is one past the last CE
// stored by UCOL_CEBUF_PUT(); presumably pos-1 holds the end-of-CEs marker,
// which is why the walk starts at pos-2 - TODO confirm.
| 7360 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi
zed */ | |
| 7361 tCE = tCEs.pos-2; | |
| 7362 for(;;) { | |
| 7363 while (secS == 0 && sCE >= sCEs.buf) { | |
| 7364 if(sCESave == NULL) { | |
| 7365 secS = *(sCE--); | |
| 7366 if(isContinuation(secS)) { | |
| 7367 while(isContinuation(secS = *(sCE--))) | |
| 7368 ; | |
| 7369 /* after this, secS has the start of continuation, a
nd sCEs points before that */ | |
| 7370 sCESave = sCE; /* we save it, so that we know where
to come back AND that we need to go forward */ | |
| 7371 sCE+=2; /* need to point to the first continuation
CP */ | |
| 7372 /* However, now you can just continue doing stuff */ | |
| 7373 } | |
| 7374 } else { | |
| 7375 secS = *(sCE++); | |
| 7376 if(!isContinuation(secS)) { /* This means we have finish
ed with this cont */ | |
| 7377 sCE = sCESave; /* reset the pointer to be
fore continuation */ | |
| 7378 sCESave = NULL; | |
| 7379 secS = 0; /* Fetch a fresh CE before the continuati
on sequence. */ | |
| 7380 continue; | |
| 7381 } | |
| 7382 } | |
| 7383 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ | |
| 7384 } | |
| 7385 | |
| 7386 while(secT == 0 && tCE >= tCEs.buf) { | |
| 7387 if(tCESave == NULL) { | |
| 7388 secT = *(tCE--); | |
| 7389 if(isContinuation(secT)) { | |
| 7390 while(isContinuation(secT = *(tCE--))) | |
| 7391 ; | |
| 7392 /* after this, secS has the start of continuation, a
nd sCEs points before that */ | |
| 7393 tCESave = tCE; /* we save it, so that we know where
to come back AND that we need to go forward */ | |
| 7394 tCE+=2; /* need to point to the first continuation
CP */ | |
| 7395 /* However, now you can just continue doing stuff */ | |
| 7396 } | |
| 7397 } else { | |
| 7398 secT = *(tCE++); | |
| 7399 if(!isContinuation(secT)) { /* This means we have finish
ed with this cont */ | |
| 7400 tCE = tCESave; /* reset the pointer to befo
re continuation */ | |
| 7401 tCESave = NULL; | |
| 7402 secT = 0; /* Fetch a fresh CE before the continuati
on sequence. */ | |
| 7403 continue; | |
| 7404 } | |
| 7405 } | |
| 7406 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ | |
| 7407 } | |
| 7408 | |
| 7409 if(secS == secT) { | |
| 7410 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf &&
tCE < tCEs.buf)) { | |
| 7411 break; | |
| 7412 } else { | |
| 7413 secS = 0; secT = 0; | |
| 7414 continue; | |
| 7415 } | |
| 7416 } else { | |
| 7417 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
| 7418 goto commonReturn; | |
| 7419 } | |
| 7420 } | |
| 7421 } | |
| 7422 } | |
| 7423 | |
| 7424 /* doing the case bit */ | |
| 7425 if(checkCase) { | |
| 7426 sCE = sCEs.buf; | |
| 7427 tCE = tCEs.buf; | |
| 7428 for(;;) { | |
| 7429 while((secS & UCOL_REMOVE_CASE) == 0) { | |
| 7430 if(!isContinuation(*sCE++)) { | |
| 7431 secS =*(sCE-1); | |
| 7432 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { | |
| 7433 // primary ignorables should not be considered on the ca
se level when the strength is primary | |
| 7434 // otherwise, the CEs stop being well-formed | |
| 7435 secS &= UCOL_TERT_CASE_MASK; | |
| 7436 secS ^= caseSwitch; | |
| 7437 } else { | |
| 7438 secS = 0; | |
| 7439 } | |
| 7440 } else { | |
| 7441 secS = 0; | |
| 7442 } | |
| 7443 } | |
| 7444 | |
| 7445 while((secT & UCOL_REMOVE_CASE) == 0) { | |
| 7446 if(!isContinuation(*tCE++)) { | |
| 7447 secT = *(tCE-1); | |
| 7448 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { | |
| 7449 // primary ignorables should not be considered on the ca
se level when the strength is primary | |
| 7450 // otherwise, the CEs stop being well-formed | |
| 7451 secT &= UCOL_TERT_CASE_MASK; | |
| 7452 secT ^= caseSwitch; | |
| 7453 } else { | |
| 7454 secT = 0; | |
| 7455 } | |
| 7456 } else { | |
| 7457 secT = 0; | |
| 7458 } | |
| 7459 } | |
| 7460 | |
| 7461 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { | |
| 7462 result = UCOL_LESS; | |
| 7463 goto commonReturn; | |
| 7464 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK))
{ | |
| 7465 result = UCOL_GREATER; | |
| 7466 goto commonReturn; | |
| 7467 } | |
| 7468 | |
| 7469 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT &
UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { | |
| 7470 break; | |
| 7471 } else { | |
| 7472 secS = 0; | |
| 7473 secT = 0; | |
| 7474 } | |
| 7475 } | |
| 7476 } | |
| 7477 | |
| 7478 /* Tertiary level */ | |
| 7479 if(checkTertiary) { | |
| 7480 secS = 0; | |
| 7481 secT = 0; | |
| 7482 sCE = sCEs.buf; | |
| 7483 tCE = tCEs.buf; | |
| 7484 for(;;) { | |
| 7485 while((secS & UCOL_REMOVE_CASE) == 0) { | |
| 7486 sOrder = *sCE++; | |
| 7487 secS = sOrder & tertiaryMask; | |
| 7488 if(!isContinuation(sOrder)) { | |
| 7489 secS ^= caseSwitch; | |
| 7490 } else { | |
| 7491 secS &= UCOL_REMOVE_CASE; | |
| 7492 } | |
| 7493 } | |
| 7494 | |
| 7495 while((secT & UCOL_REMOVE_CASE) == 0) { | |
| 7496 tOrder = *tCE++; | |
| 7497 secT = tOrder & tertiaryMask; | |
| 7498 if(!isContinuation(tOrder)) { | |
| 7499 secT ^= caseSwitch; | |
| 7500 } else { | |
| 7501 secT &= UCOL_REMOVE_CASE; | |
| 7502 } | |
| 7503 } | |
| 7504 | |
| 7505 if(secS == secT) { | |
| 7506 if((secS & UCOL_REMOVE_CASE) == 1) { | |
| 7507 break; | |
| 7508 } else { | |
| 7509 secS = 0; secT = 0; | |
| 7510 continue; | |
| 7511 } | |
| 7512 } else { | |
| 7513 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
| 7514 goto commonReturn; | |
| 7515 } | |
| 7516 } | |
| 7517 } | |
| 7518 | |
| 7519 | |
// Quaternary level: runs only when alternate handling is shifted and the
// strength is at least quaternary (qShifted = shifted && checkQuad).
| 7520 if(qShifted /*checkQuad*/) { | |
| 7521 UBool sInShifted = TRUE; | |
| 7522 UBool tInShifted = TRUE; | |
| 7523 secS = 0; | |
| 7524 secT = 0; | |
| 7525 sCE = sCEs.buf; | |
| 7526 tCE = tCEs.buf; | |
| 7527 for(;;) { | |
| 7528 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(sec
S) && !sInShifted)) { | |
| 7529 secS = *(sCE++); | |
| 7530 if(isContinuation(secS)) { | |
| 7531 if(!sInShifted) { | |
| 7532 continue; | |
| 7533 } | |
| 7534 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non
continuation */ | |
| 7535 secS = UCOL_PRIMARYMASK; | |
| 7536 sInShifted = FALSE; | |
| 7537 } else { | |
| 7538 sInShifted = TRUE; | |
| 7539 } | |
| 7540 } | |
| 7541 secS &= UCOL_PRIMARYMASK; | |
| 7542 | |
| 7543 | |
| 7544 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(sec
T) && !tInShifted)) { | |
| 7545 secT = *(tCE++); | |
| 7546 if(isContinuation(secT)) { | |
| 7547 if(!tInShifted) { | |
| 7548 continue; | |
| 7549 } | |
| 7550 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { | |
| 7551 secT = UCOL_PRIMARYMASK; | |
| 7552 tInShifted = FALSE; | |
| 7553 } else { | |
| 7554 tInShifted = TRUE; | |
| 7555 } | |
| 7556 } | |
| 7557 secT &= UCOL_PRIMARYMASK; | |
| 7558 | |
| 7559 if(secS == secT) { | |
| 7560 if(secS == UCOL_NO_MORE_CES_PRIMARY) { | |
| 7561 break; | |
| 7562 } else { | |
| 7563 secS = 0; secT = 0; | |
| 7564 continue; | |
| 7565 } | |
| 7566 } else { | |
| 7567 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
| 7568 goto commonReturn; | |
| 7569 } | |
| 7570 } | |
| 7571 } else if(doHiragana && hirResult != UCOL_EQUAL) { | |
| 7572 // If we're fine on quaternaries, we might be different | |
| 7573 // on Hiragana. This, however, might fail us in shifted. | |
| 7574 result = hirResult; | |
| 7575 goto commonReturn; | |
| 7576 } | |
| 7577 | |
| 7578 /* For IDENTICAL comparisons, we use a bitwise character comparison */ | |
| 7579 /* as a tiebreaker if all else is equal. */ | |
| 7580 /* Getting here should be quite rare - strings are not identical - */ | |
| 7581 /* that is checked first, but compared == through all other checks. */ | |
| 7582 if(checkIdent) | |
| 7583 { | |
| 7584 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC
OL_ON); | |
| 7585 result = ucol_checkIdent(sColl, tColl, TRUE, status); | |
| 7586 } | |
| 7587 | |
// Single exit point once the CE buffers exist: a buffer is freed only when
// an iterator carries UCOL_ITER_ALLOCATED (set by ucol_CEBuf_Expand) and the
// buffer no longer points at its embedded localArray.
| 7588 commonReturn: | |
| 7589 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { | |
| 7590 if (sCEs.buf != sCEs.localArray ) { | |
| 7591 uprv_free(sCEs.buf); | |
| 7592 } | |
| 7593 if (tCEs.buf != tCEs.localArray ) { | |
| 7594 uprv_free(tCEs.buf); | |
| 7595 } | |
| 7596 } | |
| 7597 | |
| 7598 return result; | |
| 7599 } | |
| 7600 | |
| 7601 static UCollationResult | |
| 7602 ucol_strcollRegular(const UCollator *coll, | |
| 7603 const UChar *source, int32_t sourceLength, | |
| 7604 const UChar *target, int32_t targetLength, | |
| 7605 UErrorCode *status) { | |
| 7606 collIterate sColl, tColl; | |
| 7607 // Preparing the context objects for iterating over strings | |
| 7608 IInit_collIterate(coll, source, sourceLength, &sColl, status); | |
| 7609 IInit_collIterate(coll, target, targetLength, &tColl, status); | |
| 7610 if(U_FAILURE(*status)) { | |
| 7611 return UCOL_LESS; | |
| 7612 } | |
| 7613 return ucol_strcollRegular(&sColl, &tColl, status); | |
| 7614 } | |
| 7615 | |
| 7616 static inline uint32_t | |
| 7617 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, | |
| 7618 uint32_t CE, const UChar *s, int32_t *index, int32_t l
en) | |
| 7619 { | |
| 7620 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); | |
| 7621 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; | |
| 7622 int32_t offset = 1; | |
| 7623 UChar schar = 0, tchar = 0; | |
| 7624 | |
| 7625 for(;;) { | |
| 7626 if(len == -1) { | |
| 7627 if(s[*index] == 0) { // end of string | |
| 7628 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); | |
| 7629 } else { | |
| 7630 schar = s[*index]; | |
| 7631 } | |
| 7632 } else { | |
| 7633 if(*index == len) { | |
| 7634 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); | |
| 7635 } else { | |
| 7636 schar = s[*index]; | |
| 7637 } | |
| 7638 } | |
| 7639 | |
| 7640 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio
n codepoints should be ordered, we skip all that are smaller */ | |
| 7641 offset++; | |
| 7642 } | |
| 7643 | |
| 7644 if (schar == tchar) { | |
| 7645 (*index)++; | |
| 7646 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set+offset]); | |
| 7647 } | |
| 7648 else | |
| 7649 { | |
| 7650 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { | |
| 7651 return UCOL_BAIL_OUT_CE; | |
| 7652 } | |
| 7653 // skip completely ignorables | |
| 7654 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); | |
| 7655 if(isZeroCE == 0) { // we have to ignore completely ignorables | |
| 7656 (*index)++; | |
| 7657 continue; | |
| 7658 } | |
| 7659 | |
| 7660 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
| 7661 } | |
| 7662 } | |
| 7663 } | |
| 7664 | |
| 7665 | |
| 7666 /** | |
| 7667 * This is a fast strcoll, geared towards text in Latin-1. | |
| 7668 * It supports contractions of size two, French secondaries | |
| 7669 * and case switching. You can use it with strengths primary | |
| 7670 * to tertiary. It does not support shifted and case level. | |
| 7671 * It relies on the table built by setupLatin1Table. If it | |
| 7672 * doesn't understand something, it will go to the regular | |
| 7673 * strcoll. | |
| 7674 */ | |
| 7675 static UCollationResult | |
| 7676 ucol_strcollUseLatin1( const UCollator *coll, | |
| 7677 const UChar *source, | |
| 7678 int32_t sLen, | |
| 7679 const UChar *target, | |
| 7680 int32_t tLen, | |
| 7681 UErrorCode *status) | |
| 7682 { | |
| 7683 U_ALIGN_CODE(16); | |
| 7684 int32_t strength = coll->strength; | |
| 7685 | |
| 7686 int32_t sIndex = 0, tIndex = 0; | |
| 7687 UChar sChar = 0, tChar = 0; | |
| 7688 uint32_t sOrder=0, tOrder=0; | |
| 7689 | |
| 7690 UBool endOfSource = FALSE; | |
| 7691 | |
| 7692 uint32_t *elements = coll->latinOneCEs; | |
| 7693 | |
| 7694 UBool haveContractions = FALSE; // if we have contractions in our string | |
| 7695 // we cannot do French secondary | |
| 7696 | |
| 7697 // Do the primary level | |
| 7698 for(;;) { | |
| 7699 while(sOrder==0) { // this loop skips primary ignorables | |
| 7700 // sOrder=getNextlatinOneCE(source); | |
| 7701 if(sLen==-1) { // handling zero terminated strings | |
| 7702 sChar=source[sIndex++]; | |
| 7703 if(sChar==0) { | |
| 7704 endOfSource = TRUE; | |
| 7705 break; | |
| 7706 } | |
| 7707 } else { // handling strings with known length | |
| 7708 if(sIndex==sLen) { | |
| 7709 endOfSource = TRUE; | |
| 7710 break; | |
| 7711 } | |
| 7712 sChar=source[sIndex++]; | |
| 7713 } | |
| 7714 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) | |
| 7715 //fprintf(stderr, "R"); | |
| 7716 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
| 7717 } | |
| 7718 sOrder = elements[sChar]; | |
| 7719 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special | |
| 7720 // specials can basically be either contractions or bail-out sig
ns. If we get anything | |
| 7721 // else, we'll bail out anywasy | |
| 7722 if(getCETag(sOrder) == CONTRACTION_TAG) { | |
| 7723 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr
der, source, &sIndex, sLen); | |
| 7724 haveContractions = TRUE; // if there are contractions, we ca
nnot do French secondary | |
| 7725 // However, if there are contractions in the table, but we a
lways use just one char, | |
| 7726 // we might be able to do French. This should be checked out
. | |
| 7727 } | |
| 7728 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
| 7729 //fprintf(stderr, "S"); | |
| 7730 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); | |
| 7731 } | |
| 7732 } | |
| 7733 } | |
| 7734 | |
| 7735 while(tOrder==0) { // this loop skips primary ignorables | |
| 7736 // tOrder=getNextlatinOneCE(target); | |
| 7737 if(tLen==-1) { // handling zero terminated strings | |
| 7738 tChar=target[tIndex++]; | |
| 7739 if(tChar==0) { | |
| 7740 if(endOfSource) { // this is different than source loop, | |
| 7741 // as we already know that source loop is done here, | |
| 7742 // so we can either finish the primary loop if both | |
| 7743 // strings are done or anounce the result if only | |
| 7744 // target is done. Same below. | |
| 7745 goto endOfPrimLoop; | |
| 7746 } else { | |
| 7747 return UCOL_GREATER; | |
| 7748 } | |
| 7749 } | |
| 7750 } else { // handling strings with known length | |
| 7751 if(tIndex==tLen) { | |
| 7752 if(endOfSource) { | |
| 7753 goto endOfPrimLoop; | |
| 7754 } else { | |
| 7755 return UCOL_GREATER; | |
| 7756 } | |
| 7757 } | |
| 7758 tChar=target[tIndex++]; | |
| 7759 } | |
| 7760 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) | |
| 7761 //fprintf(stderr, "R"); | |
| 7762 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
| 7763 } | |
| 7764 tOrder = elements[tChar]; | |
| 7765 if(tOrder >= UCOL_NOT_FOUND) { | |
| 7766 // Handling specials, see the comments for source | |
| 7767 if(getCETag(tOrder) == CONTRACTION_TAG) { | |
| 7768 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr
der, target, &tIndex, tLen); | |
| 7769 haveContractions = TRUE; | |
| 7770 } | |
| 7771 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
| 7772 //fprintf(stderr, "S"); | |
| 7773 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); | |
| 7774 } | |
| 7775 } | |
| 7776 } | |
| 7777 if(endOfSource) { // source is finished, but target is not, say the resu
lt. | |
| 7778 return UCOL_LESS; | |
| 7779 } | |
| 7780 | |
| 7781 if(sOrder == tOrder) { // if we have same CEs, we continue the loop | |
| 7782 sOrder = 0; tOrder = 0; | |
| 7783 continue; | |
| 7784 } else { | |
| 7785 // compare current top bytes | |
| 7786 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 7787 // top bytes differ, return difference | |
| 7788 if(sOrder < tOrder) { | |
| 7789 return UCOL_LESS; | |
| 7790 } else if(sOrder > tOrder) { | |
| 7791 return UCOL_GREATER; | |
| 7792 } | |
| 7793 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24)
; | |
| 7794 // since we must return enum value | |
| 7795 } | |
| 7796 | |
| 7797 // top bytes match, continue with following bytes | |
| 7798 sOrder<<=8; | |
| 7799 tOrder<<=8; | |
| 7800 } | |
| 7801 } | |
| 7802 | |
| 7803 endOfPrimLoop: | |
| 7804 // after primary loop, we definitely know the sizes of strings, | |
| 7805 // so we set it and use simpler loop for secondaries and tertiaries | |
| 7806 sLen = sIndex; tLen = tIndex; | |
| 7807 if(strength >= UCOL_SECONDARY) { | |
| 7808 // adjust the table beggining | |
| 7809 elements += coll->latinOneTableLen; | |
| 7810 endOfSource = FALSE; | |
| 7811 | |
| 7812 if(coll->frenchCollation == UCOL_OFF) { // non French | |
| 7813 // This loop is a simplified copy of primary loop | |
| 7814 // at this point we know that whole strings are latin-1, so we don't | |
| 7815 // check for that. We also know that we only have contractions as | |
| 7816 // specials. | |
| 7817 sIndex = 0; tIndex = 0; | |
| 7818 for(;;) { | |
| 7819 while(sOrder==0) { | |
| 7820 if(sIndex==sLen) { | |
| 7821 endOfSource = TRUE; | |
| 7822 break; | |
| 7823 } | |
| 7824 sChar=source[sIndex++]; | |
| 7825 sOrder = elements[sChar]; | |
| 7826 if(sOrder > UCOL_NOT_FOUND) { | |
| 7827 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, sOrder, source, &sIndex, sLen); | |
| 7828 } | |
| 7829 } | |
| 7830 | |
| 7831 while(tOrder==0) { | |
| 7832 if(tIndex==tLen) { | |
| 7833 if(endOfSource) { | |
| 7834 goto endOfSecLoop; | |
| 7835 } else { | |
| 7836 return UCOL_GREATER; | |
| 7837 } | |
| 7838 } | |
| 7839 tChar=target[tIndex++]; | |
| 7840 tOrder = elements[tChar]; | |
| 7841 if(tOrder > UCOL_NOT_FOUND) { | |
| 7842 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, tOrder, target, &tIndex, tLen); | |
| 7843 } | |
| 7844 } | |
| 7845 if(endOfSource) { | |
| 7846 return UCOL_LESS; | |
| 7847 } | |
| 7848 | |
| 7849 if(sOrder == tOrder) { | |
| 7850 sOrder = 0; tOrder = 0; | |
| 7851 continue; | |
| 7852 } else { | |
| 7853 // see primary loop for comments on this | |
| 7854 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 7855 if(sOrder < tOrder) { | |
| 7856 return UCOL_LESS; | |
| 7857 } else if(sOrder > tOrder) { | |
| 7858 return UCOL_GREATER; | |
| 7859 } | |
| 7860 } | |
| 7861 sOrder<<=8; | |
| 7862 tOrder<<=8; | |
| 7863 } | |
| 7864 } | |
| 7865 } else { // French | |
| 7866 if(haveContractions) { // if we have contractions, we have to bail o
ut | |
| 7867 // since we don't really know how to handle them here | |
| 7868 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
| 7869 } | |
| 7870 // For French, we go backwards | |
| 7871 sIndex = sLen; tIndex = tLen; | |
| 7872 for(;;) { | |
| 7873 while(sOrder==0) { | |
| 7874 if(sIndex==0) { | |
| 7875 endOfSource = TRUE; | |
| 7876 break; | |
| 7877 } | |
| 7878 sChar=source[--sIndex]; | |
| 7879 sOrder = elements[sChar]; | |
| 7880 // don't even look for contractions | |
| 7881 } | |
| 7882 | |
| 7883 while(tOrder==0) { | |
| 7884 if(tIndex==0) { | |
| 7885 if(endOfSource) { | |
| 7886 goto endOfSecLoop; | |
| 7887 } else { | |
| 7888 return UCOL_GREATER; | |
| 7889 } | |
| 7890 } | |
| 7891 tChar=target[--tIndex]; | |
| 7892 tOrder = elements[tChar]; | |
| 7893 // don't even look for contractions | |
| 7894 } | |
| 7895 if(endOfSource) { | |
| 7896 return UCOL_LESS; | |
| 7897 } | |
| 7898 | |
| 7899 if(sOrder == tOrder) { | |
| 7900 sOrder = 0; tOrder = 0; | |
| 7901 continue; | |
| 7902 } else { | |
| 7903 // see the primary loop for comments | |
| 7904 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 7905 if(sOrder < tOrder) { | |
| 7906 return UCOL_LESS; | |
| 7907 } else if(sOrder > tOrder) { | |
| 7908 return UCOL_GREATER; | |
| 7909 } | |
| 7910 } | |
| 7911 sOrder<<=8; | |
| 7912 tOrder<<=8; | |
| 7913 } | |
| 7914 } | |
| 7915 } | |
| 7916 } | |
| 7917 | |
| 7918 endOfSecLoop: | |
| 7919 if(strength >= UCOL_TERTIARY) { | |
| 7920 // tertiary loop is the same as secondary (except no French) | |
| 7921 elements += coll->latinOneTableLen; | |
| 7922 sIndex = 0; tIndex = 0; | |
| 7923 endOfSource = FALSE; | |
| 7924 for(;;) { | |
| 7925 while(sOrder==0) { | |
| 7926 if(sIndex==sLen) { | |
| 7927 endOfSource = TRUE; | |
| 7928 break; | |
| 7929 } | |
| 7930 sChar=source[sIndex++]; | |
| 7931 sOrder = elements[sChar]; | |
| 7932 if(sOrder > UCOL_NOT_FOUND) { | |
| 7933 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO
rder, source, &sIndex, sLen); | |
| 7934 } | |
| 7935 } | |
| 7936 while(tOrder==0) { | |
| 7937 if(tIndex==tLen) { | |
| 7938 if(endOfSource) { | |
| 7939 return UCOL_EQUAL; // if both strings are at the end, th
ey are equal | |
| 7940 } else { | |
| 7941 return UCOL_GREATER; | |
| 7942 } | |
| 7943 } | |
| 7944 tChar=target[tIndex++]; | |
| 7945 tOrder = elements[tChar]; | |
| 7946 if(tOrder > UCOL_NOT_FOUND) { | |
| 7947 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO
rder, target, &tIndex, tLen); | |
| 7948 } | |
| 7949 } | |
| 7950 if(endOfSource) { | |
| 7951 return UCOL_LESS; | |
| 7952 } | |
| 7953 if(sOrder == tOrder) { | |
| 7954 sOrder = 0; tOrder = 0; | |
| 7955 continue; | |
| 7956 } else { | |
| 7957 if(((sOrder^tOrder)&0xff000000)!=0) { | |
| 7958 if(sOrder < tOrder) { | |
| 7959 return UCOL_LESS; | |
| 7960 } else if(sOrder > tOrder) { | |
| 7961 return UCOL_GREATER; | |
| 7962 } | |
| 7963 } | |
| 7964 sOrder<<=8; | |
| 7965 tOrder<<=8; | |
| 7966 } | |
| 7967 } | |
| 7968 } | |
| 7969 return UCOL_EQUAL; | |
| 7970 } | |
| 7971 | |
| 7972 /* | |
| 7973 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of | |
| 7974 null terminated input string takes extra amount of CPU cycles. | |
| 7975 */ | |
| 7976 static UCollationResult | |
| 7977 ucol_strcollRegularUTF8( | |
| 7978 const UCollator *coll, | |
| 7979 const char *source, | |
| 7980 int32_t sourceLength, | |
| 7981 const char *target, | |
| 7982 int32_t targetLength, | |
| 7983 UErrorCode *status) | |
| 7984 { | |
| 7985 UCharIterator src; | |
| 7986 UCharIterator tgt; | |
| 7987 | |
| 7988 uiter_setUTF8(&src, source, sourceLength); | |
| 7989 uiter_setUTF8(&tgt, target, targetLength); | |
| 7990 | |
| 7991 // Preparing the context objects for iterating over strings | |
| 7992 collIterate sColl, tColl; | |
| 7993 IInit_collIterate(coll, NULL, -1, &sColl, status); | |
| 7994 IInit_collIterate(coll, NULL, -1, &tColl, status); | |
| 7995 if(U_FAILURE(*status)) { | |
| 7996 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
| 7997 return UCOL_EQUAL; | |
| 7998 } | |
| 7999 // The division for the array length may truncate the array size to | |
| 8000 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
| 8001 // for all platforms anyway. | |
| 8002 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 8003 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 8004 UNormIterator *sNormIter = NULL, *tNormIter = NULL; | |
| 8005 | |
| 8006 sColl.iterator = &src; | |
| 8007 sColl.flags |= UCOL_USE_ITERATOR; | |
| 8008 tColl.flags |= UCOL_USE_ITERATOR; | |
| 8009 tColl.iterator = &tgt; | |
| 8010 | |
| 8011 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { | |
| 8012 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu
s); | |
| 8013 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); | |
| 8014 sColl.flags &= ~UCOL_ITER_NORM; | |
| 8015 | |
| 8016 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu
s); | |
| 8017 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); | |
| 8018 tColl.flags &= ~UCOL_ITER_NORM; | |
| 8019 } | |
| 8020 | |
| 8021 return ucol_strcollRegular(&sColl, &tColl, status); | |
| 8022 } | |
| 8023 | |
| 8024 static inline uint32_t | |
| 8025 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, | |
| 8026 uint32_t CE, const char *s, int32_t *index, int32_t le
n) | |
| 8027 { | |
| 8028 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); | |
| 8029 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; | |
| 8030 int32_t offset = 1; | |
| 8031 UChar32 schar = 0, tchar = 0; | |
| 8032 | |
| 8033 for(;;) { | |
| 8034 if (*index == len) { | |
| 8035 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
| 8036 } | |
| 8037 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); | |
| 8038 if (len < 0 && schar == 0) { | |
| 8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
| 8040 } | |
| 8041 | |
| 8042 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio
n codepoints should be ordered, we skip all that are smaller */ | |
| 8043 offset++; | |
| 8044 } | |
| 8045 | |
| 8046 if (schar == tchar) { | |
| 8047 U8_FWD_1(s, *index, len); | |
| 8048 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set+offset]); | |
| 8049 } | |
| 8050 else | |
| 8051 { | |
| 8052 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { | |
| 8053 return UCOL_BAIL_OUT_CE; | |
| 8054 } | |
| 8055 // skip completely ignorables | |
| 8056 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); | |
| 8057 if(isZeroCE == 0) { // we have to ignore completely ignorables | |
| 8058 U8_FWD_1(s, *index, len); | |
| 8059 continue; | |
| 8060 } | |
| 8061 | |
| 8062 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
| 8063 } | |
| 8064 } | |
| 8065 } | |
| 8066 | |
| 8067 static inline UCollationResult | |
| 8068 ucol_strcollUseLatin1UTF8( | |
| 8069 const UCollator *coll, | |
| 8070 const char *source, | |
| 8071 int32_t sLen, | |
| 8072 const char *target, | |
| 8073 int32_t tLen, | |
| 8074 UErrorCode *status) | |
| 8075 { | |
| 8076 U_ALIGN_CODE(16); | |
| 8077 int32_t strength = coll->strength; | |
| 8078 | |
| 8079 int32_t sIndex = 0, tIndex = 0; | |
| 8080 UChar32 sChar = 0, tChar = 0; | |
| 8081 uint32_t sOrder=0, tOrder=0; | |
| 8082 | |
| 8083 UBool endOfSource = FALSE; | |
| 8084 | |
| 8085 uint32_t *elements = coll->latinOneCEs; | |
| 8086 | |
| 8087 UBool haveContractions = FALSE; // if we have contractions in our string | |
| 8088 // we cannot do French secondary | |
| 8089 | |
| 8090 // Do the primary level | |
| 8091 for(;;) { | |
| 8092 while(sOrder==0) { // this loop skips primary ignorables | |
| 8093 // sOrder=getNextlatinOneCE(source); | |
| 8094 if (sIndex == sLen) { | |
| 8095 endOfSource = TRUE; | |
| 8096 break; | |
| 8097 } | |
| 8098 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); | |
| 8099 if (sLen < 0 && sChar == 0) { | |
| 8100 endOfSource = TRUE; | |
| 8101 sLen = sIndex; | |
| 8102 break; | |
| 8103 } | |
| 8104 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (
sChar > 0xFF, but this is faster on win32) | |
| 8105 //fprintf(stderr, "R"); | |
| 8106 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
| 8107 } | |
| 8108 sOrder = elements[sChar]; | |
| 8109 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special | |
| 8110 // specials can basically be either contractions or bail-out sig
ns. If we get anything | |
| 8111 // else, we'll bail out anywasy | |
| 8112 if(getCETag(sOrder) == CONTRACTION_TAG) { | |
| 8113 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY,
sOrder, source, &sIndex, sLen); | |
| 8114 haveContractions = TRUE; // if there are contractions, we ca
nnot do French secondary | |
| 8115 // However, if there are contractions in the table, but we a
lways use just one char, | |
| 8116 // we might be able to do French. This should be checked out
. | |
| 8117 } | |
| 8118 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
| 8119 //fprintf(stderr, "S"); | |
| 8120 return ucol_strcollRegularUTF8(coll, source, sLen, target, t
Len, status); | |
| 8121 } | |
| 8122 } | |
| 8123 } | |
| 8124 | |
| 8125 while(tOrder==0) { // this loop skips primary ignorables | |
| 8126 // tOrder=getNextlatinOneCE(target); | |
| 8127 if (tIndex == tLen) { | |
| 8128 if(endOfSource) { | |
| 8129 goto endOfPrimLoopU8; | |
| 8130 } else { | |
| 8131 return UCOL_GREATER; | |
| 8132 } | |
| 8133 } | |
| 8134 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
| 8135 if (tLen < 0 && tChar == 0) { | |
| 8136 if(endOfSource) { | |
| 8137 tLen = tIndex; | |
| 8138 goto endOfPrimLoopU8; | |
| 8139 } else { | |
| 8140 return UCOL_GREATER; | |
| 8141 } | |
| 8142 } | |
| 8143 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (
sChar > 0xFF, but this is faster on win32) | |
| 8144 //fprintf(stderr, "R"); | |
| 8145 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
| 8146 } | |
| 8147 tOrder = elements[tChar]; | |
| 8148 if(tOrder >= UCOL_NOT_FOUND) { | |
| 8149 // Handling specials, see the comments for source | |
| 8150 if(getCETag(tOrder) == CONTRACTION_TAG) { | |
| 8151 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY,
tOrder, target, &tIndex, tLen); | |
| 8152 haveContractions = TRUE; | |
| 8153 } | |
| 8154 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
| 8155 //fprintf(stderr, "S"); | |
| 8156 return ucol_strcollRegularUTF8(coll, source, sLen, target, t
Len, status); | |
| 8157 } | |
| 8158 } | |
| 8159 } | |
| 8160 if(endOfSource) { // source is finished, but target is not, say the resu
lt. | |
| 8161 return UCOL_LESS; | |
| 8162 } | |
| 8163 | |
| 8164 if(sOrder == tOrder) { // if we have same CEs, we continue the loop | |
| 8165 sOrder = 0; tOrder = 0; | |
| 8166 continue; | |
| 8167 } else { | |
| 8168 // compare current top bytes | |
| 8169 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 8170 // top bytes differ, return difference | |
| 8171 if(sOrder < tOrder) { | |
| 8172 return UCOL_LESS; | |
| 8173 } else if(sOrder > tOrder) { | |
| 8174 return UCOL_GREATER; | |
| 8175 } | |
| 8176 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24)
; | |
| 8177 // since we must return enum value | |
| 8178 } | |
| 8179 | |
| 8180 // top bytes match, continue with following bytes | |
| 8181 sOrder<<=8; | |
| 8182 tOrder<<=8; | |
| 8183 } | |
| 8184 } | |
| 8185 | |
| 8186 endOfPrimLoopU8: | |
| 8187 // after primary loop, we definitely know the sizes of strings, | |
| 8188 // so we set it and use simpler loop for secondaries and tertiaries | |
| 8189 sLen = sIndex; tLen = tIndex; | |
| 8190 if(strength >= UCOL_SECONDARY) { | |
| 8191 // adjust the table beggining | |
| 8192 elements += coll->latinOneTableLen; | |
| 8193 endOfSource = FALSE; | |
| 8194 | |
| 8195 if(coll->frenchCollation == UCOL_OFF) { // non French | |
| 8196 // This loop is a simplified copy of primary loop | |
| 8197 // at this point we know that whole strings are latin-1, so we don't | |
| 8198 // check for that. We also know that we only have contractions as | |
| 8199 // specials. | |
| 8200 sIndex = 0; tIndex = 0; | |
| 8201 for(;;) { | |
| 8202 while(sOrder==0) { | |
| 8203 if(sIndex==sLen) { | |
| 8204 endOfSource = TRUE; | |
| 8205 break; | |
| 8206 } | |
| 8207 U_ASSERT(sLen >= 0); | |
| 8208 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); | |
| 8209 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
| 8210 sOrder = elements[sChar]; | |
| 8211 if(sOrder > UCOL_NOT_FOUND) { | |
| 8212 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO
NDARY, sOrder, source, &sIndex, sLen); | |
| 8213 } | |
| 8214 } | |
| 8215 | |
| 8216 while(tOrder==0) { | |
| 8217 if(tIndex==tLen) { | |
| 8218 if(endOfSource) { | |
| 8219 goto endOfSecLoopU8; | |
| 8220 } else { | |
| 8221 return UCOL_GREATER; | |
| 8222 } | |
| 8223 } | |
| 8224 U_ASSERT(tLen >= 0); | |
| 8225 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
| 8226 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
| 8227 tOrder = elements[tChar]; | |
| 8228 if(tOrder > UCOL_NOT_FOUND) { | |
| 8229 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO
NDARY, tOrder, target, &tIndex, tLen); | |
| 8230 } | |
| 8231 } | |
| 8232 if(endOfSource) { | |
| 8233 return UCOL_LESS; | |
| 8234 } | |
| 8235 | |
| 8236 if(sOrder == tOrder) { | |
| 8237 sOrder = 0; tOrder = 0; | |
| 8238 continue; | |
| 8239 } else { | |
| 8240 // see primary loop for comments on this | |
| 8241 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 8242 if(sOrder < tOrder) { | |
| 8243 return UCOL_LESS; | |
| 8244 } else if(sOrder > tOrder) { | |
| 8245 return UCOL_GREATER; | |
| 8246 } | |
| 8247 } | |
| 8248 sOrder<<=8; | |
| 8249 tOrder<<=8; | |
| 8250 } | |
| 8251 } | |
| 8252 } else { // French | |
| 8253 if(haveContractions) { // if we have contractions, we have to bail o
ut | |
| 8254 // since we don't really know how to handle them here | |
| 8255 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
| 8256 } | |
| 8257 // For French, we go backwards | |
| 8258 sIndex = sLen; tIndex = tLen; | |
| 8259 for(;;) { | |
| 8260 while(sOrder==0) { | |
| 8261 if(sIndex==0) { | |
| 8262 endOfSource = TRUE; | |
| 8263 break; | |
| 8264 } | |
| 8265 U8_PREV_OR_FFFD(source, 0, sIndex, sChar); | |
| 8266 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
| 8267 sOrder = elements[sChar]; | |
| 8268 // don't even look for contractions | |
| 8269 } | |
| 8270 | |
| 8271 while(tOrder==0) { | |
| 8272 if(tIndex==0) { | |
| 8273 if(endOfSource) { | |
| 8274 goto endOfSecLoopU8; | |
| 8275 } else { | |
| 8276 return UCOL_GREATER; | |
| 8277 } | |
| 8278 } | |
| 8279 U8_PREV_OR_FFFD(target, 0, tIndex, tChar); | |
| 8280 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
| 8281 tOrder = elements[tChar]; | |
| 8282 // don't even look for contractions | |
| 8283 } | |
| 8284 if(endOfSource) { | |
| 8285 return UCOL_LESS; | |
| 8286 } | |
| 8287 | |
| 8288 if(sOrder == tOrder) { | |
| 8289 sOrder = 0; tOrder = 0; | |
| 8290 continue; | |
| 8291 } else { | |
| 8292 // see the primary loop for comments | |
| 8293 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
| 8294 if(sOrder < tOrder) { | |
| 8295 return UCOL_LESS; | |
| 8296 } else if(sOrder > tOrder) { | |
| 8297 return UCOL_GREATER; | |
| 8298 } | |
| 8299 } | |
| 8300 sOrder<<=8; | |
| 8301 tOrder<<=8; | |
| 8302 } | |
| 8303 } | |
| 8304 } | |
| 8305 } | |
| 8306 | |
| 8307 endOfSecLoopU8: | |
| 8308 if(strength >= UCOL_TERTIARY) { | |
| 8309 // tertiary loop is the same as secondary (except no French) | |
| 8310 elements += coll->latinOneTableLen; | |
| 8311 sIndex = 0; tIndex = 0; | |
| 8312 endOfSource = FALSE; | |
| 8313 for(;;) { | |
| 8314 while(sOrder==0) { | |
| 8315 if(sIndex==sLen) { | |
| 8316 endOfSource = TRUE; | |
| 8317 break; | |
| 8318 } | |
| 8319 U_ASSERT(sLen >= 0); | |
| 8320 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); | |
| 8321 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
| 8322 sOrder = elements[sChar]; | |
| 8323 if(sOrder > UCOL_NOT_FOUND) { | |
| 8324 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY
, sOrder, source, &sIndex, sLen); | |
| 8325 } | |
| 8326 } | |
| 8327 while(tOrder==0) { | |
| 8328 if(tIndex==tLen) { | |
| 8329 if(endOfSource) { | |
| 8330 return UCOL_EQUAL; // if both strings are at the end, th
ey are equal | |
| 8331 } else { | |
| 8332 return UCOL_GREATER; | |
| 8333 } | |
| 8334 } | |
| 8335 U_ASSERT(tLen >= 0); | |
| 8336 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
| 8337 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
| 8338 tOrder = elements[tChar]; | |
| 8339 if(tOrder > UCOL_NOT_FOUND) { | |
| 8340 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY
, tOrder, target, &tIndex, tLen); | |
| 8341 } | |
| 8342 } | |
| 8343 if(endOfSource) { | |
| 8344 return UCOL_LESS; | |
| 8345 } | |
| 8346 if(sOrder == tOrder) { | |
| 8347 sOrder = 0; tOrder = 0; | |
| 8348 continue; | |
| 8349 } else { | |
| 8350 if(((sOrder^tOrder)&0xff000000)!=0) { | |
| 8351 if(sOrder < tOrder) { | |
| 8352 return UCOL_LESS; | |
| 8353 } else if(sOrder > tOrder) { | |
| 8354 return UCOL_GREATER; | |
| 8355 } | |
| 8356 } | |
| 8357 sOrder<<=8; | |
| 8358 tOrder<<=8; | |
| 8359 } | |
| 8360 } | |
| 8361 } | |
| 8362 return UCOL_EQUAL; | |
| 8363 } | 406 } |
| 8364 | 407 |
| 8365 U_CAPI UCollationResult U_EXPORT2 | 408 U_CAPI UCollationResult U_EXPORT2 |
| 8366 ucol_strcollIter( const UCollator *coll, | 409 ucol_strcollIter( const UCollator *coll, |
| 8367 UCharIterator *sIter, | 410 UCharIterator *sIter, |
| 8368 UCharIterator *tIter, | 411 UCharIterator *tIter, |
| 8369 UErrorCode *status) | 412 UErrorCode *status) |
| 8370 { | 413 { |
| 8371 if(!status || U_FAILURE(*status)) { | 414 if(!status || U_FAILURE(*status)) { |
| 8372 return UCOL_EQUAL; | 415 return UCOL_EQUAL; |
| 8373 } | 416 } |
| 8374 | 417 |
| 8375 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); | 418 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); |
| 8376 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt
er); | 419 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt
er); |
| 8377 | 420 |
| 8378 if (sIter == tIter) { | |
| 8379 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
| 8380 return UCOL_EQUAL; | |
| 8381 } | |
| 8382 if(sIter == NULL || tIter == NULL || coll == NULL) { | 421 if(sIter == NULL || tIter == NULL || coll == NULL) { |
| 8383 *status = U_ILLEGAL_ARGUMENT_ERROR; | 422 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 8384 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | 423 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
| 8385 return UCOL_EQUAL; | 424 return UCOL_EQUAL; |
| 8386 } | 425 } |
| 8387 | 426 |
| 8388 UCollationResult result = UCOL_EQUAL; | 427 UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tI
ter, *status); |
| 8389 | 428 |
| 8390 // Preparing the context objects for iterating over strings | 429 UTRACE_EXIT_VALUE_STATUS(result, *status); |
| 8391 collIterate sColl, tColl; | |
| 8392 IInit_collIterate(coll, NULL, -1, &sColl, status); | |
| 8393 IInit_collIterate(coll, NULL, -1, &tColl, status); | |
| 8394 if(U_FAILURE(*status)) { | |
| 8395 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
| 8396 return UCOL_EQUAL; | |
| 8397 } | |
| 8398 // The division for the array length may truncate the array size to | |
| 8399 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
| 8400 // for all platforms anyway. | |
| 8401 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 8402 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
| 8403 UNormIterator *sNormIter = NULL, *tNormIter = NULL; | |
| 8404 | |
| 8405 sColl.iterator = sIter; | |
| 8406 sColl.flags |= UCOL_USE_ITERATOR; | |
| 8407 tColl.flags |= UCOL_USE_ITERATOR; | |
| 8408 tColl.iterator = tIter; | |
| 8409 | |
| 8410 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { | |
| 8411 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu
s); | |
| 8412 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); | |
| 8413 sColl.flags &= ~UCOL_ITER_NORM; | |
| 8414 | |
| 8415 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu
s); | |
| 8416 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); | |
| 8417 tColl.flags &= ~UCOL_ITER_NORM; | |
| 8418 } | |
| 8419 | |
| 8420 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; | |
| 8421 | |
| 8422 while((sChar = sColl.iterator->next(sColl.iterator)) == | |
| 8423 (tChar = tColl.iterator->next(tColl.iterator))) { | |
| 8424 if(sChar == U_SENTINEL) { | |
| 8425 result = UCOL_EQUAL; | |
| 8426 goto end_compare; | |
| 8427 } | |
| 8428 } | |
| 8429 | |
| 8430 if(sChar == U_SENTINEL) { | |
| 8431 tChar = tColl.iterator->previous(tColl.iterator); | |
| 8432 } | |
| 8433 | |
| 8434 if(tChar == U_SENTINEL) { | |
| 8435 sChar = sColl.iterator->previous(sColl.iterator); | |
| 8436 } | |
| 8437 | |
| 8438 sChar = sColl.iterator->previous(sColl.iterator); | |
| 8439 tChar = tColl.iterator->previous(tColl.iterator); | |
| 8440 | |
| 8441 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) | |
| 8442 { | |
| 8443 // We are stopped in the middle of a contraction. | |
| 8444 // Scan backwards through the == part of the string looking for the star
t of the contraction. | |
| 8445 // It doesn't matter which string we scan, since they are the same in
this region. | |
| 8446 do | |
| 8447 { | |
| 8448 sChar = sColl.iterator->previous(sColl.iterator); | |
| 8449 tChar = tColl.iterator->previous(tColl.iterator); | |
| 8450 } | |
| 8451 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); | |
| 8452 } | |
| 8453 | |
| 8454 | |
| 8455 if(U_SUCCESS(*status)) { | |
| 8456 result = ucol_strcollRegular(&sColl, &tColl, status); | |
| 8457 } | |
| 8458 | |
| 8459 end_compare: | |
| 8460 if(sNormIter || tNormIter) { | |
| 8461 unorm_closeIter(sNormIter); | |
| 8462 unorm_closeIter(tNormIter); | |
| 8463 } | |
| 8464 | |
| 8465 UTRACE_EXIT_VALUE_STATUS(result, *status) | |
| 8466 return result; | 430 return result; |
| 8467 } | 431 } |
| 8468 | 432 |
| 8469 | 433 |
| 8470 /* */ | 434 /* */ |
| 8471 /* ucol_strcoll Main public API string comparison function */ | 435 /* ucol_strcoll Main public API string comparison function */ |
| 8472 /* */ | 436 /* */ |
| 8473 U_CAPI UCollationResult U_EXPORT2 | 437 U_CAPI UCollationResult U_EXPORT2 |
| 8474 ucol_strcoll( const UCollator *coll, | 438 ucol_strcoll( const UCollator *coll, |
| 8475 const UChar *source, | 439 const UChar *source, |
| 8476 int32_t sourceLength, | 440 int32_t sourceLength, |
| 8477 const UChar *target, | 441 const UChar *target, |
| 8478 int32_t targetLength) | 442 int32_t targetLength) |
| 8479 { | 443 { |
| 8480 U_ALIGN_CODE(16); | 444 U_ALIGN_CODE(16); |
| 8481 | 445 |
| 8482 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); | 446 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); |
| 8483 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 447 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
| 8484 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); | 448 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); |
| 8485 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt
h); | 449 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt
h); |
| 8486 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt
h); | 450 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt
h); |
| 8487 } | 451 } |
| 8488 | 452 |
| 8489 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength
!= 0)) { | |
| 8490 // do not crash, but return. Should have | |
| 8491 // status argument to return error. | |
| 8492 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
| 8493 return UCOL_EQUAL; | |
| 8494 } | |
| 8495 | |
| 8496 /* Quick check if source and target are same strings. */ | |
| 8497 /* They should either both be NULL terminated or the explicit length should
be set on both. */ | |
| 8498 if (source==target && sourceLength==targetLength) { | |
| 8499 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
| 8500 return UCOL_EQUAL; | |
| 8501 } | |
| 8502 | |
| 8503 if(coll->delegate != NULL) { | |
| 8504 UErrorCode status = U_ZERO_ERROR; | |
| 8505 return ((const Collator*)coll->delegate)->compare(source,sourceLength,targ
et,targetLength, status); | |
| 8506 } | |
| 8507 | |
| 8508 /* Scan the strings. Find:
*/ | |
| 8509 /* The length of any leading portion that is equal
*/ | |
| 8510 /* Whether they are exactly equal. (in which case we just return)
*/ | |
| 8511 const UChar *pSrc = source; | |
| 8512 const UChar *pTarg = target; | |
| 8513 int32_t equalLength; | |
| 8514 | |
| 8515 if (sourceLength == -1 && targetLength == -1) { | |
| 8516 // Both strings are null terminated. | |
| 8517 // Scan through any leading equal portion. | |
| 8518 while (*pSrc == *pTarg && *pSrc != 0) { | |
| 8519 pSrc++; | |
| 8520 pTarg++; | |
| 8521 } | |
| 8522 if (*pSrc == 0 && *pTarg == 0) { | |
| 8523 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
| 8524 return UCOL_EQUAL; | |
| 8525 } | |
| 8526 equalLength = (int32_t)(pSrc - source); | |
| 8527 } | |
| 8528 else | |
| 8529 { | |
| 8530 // One or both strings has an explicit length. | |
| 8531 const UChar *pSrcEnd = source + sourceLength; | |
| 8532 const UChar *pTargEnd = target + targetLength; | |
| 8533 | |
| 8534 // Scan while the strings are bitwise ==, or until one is exhausted. | |
| 8535 for (;;) { | |
| 8536 if (pSrc == pSrcEnd || pTarg == pTargEnd) { | |
| 8537 break; | |
| 8538 } | |
| 8539 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng
th == -1)) { | |
| 8540 break; | |
| 8541 } | |
| 8542 if (*pSrc != *pTarg) { | |
| 8543 break; | |
| 8544 } | |
| 8545 pSrc++; | |
| 8546 pTarg++; | |
| 8547 } | |
| 8548 equalLength = (int32_t)(pSrc - source); | |
| 8549 | |
| 8550 // If we made it all the way through both strings, we are done. They ar
e == | |
| 8551 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of
src string, however it was specified. */ | |
| 8552 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also
at end of dest string */ | |
| 8553 { | |
| 8554 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
| 8555 return UCOL_EQUAL; | |
| 8556 } | |
| 8557 } | |
| 8558 if (equalLength > 0) { | |
| 8559 /* There is an identical portion at the beginning of the two strings.
*/ | |
| 8560 /* If the identical portion ends within a contraction or a comibining
*/ | |
| 8561 /* character sequence, back up to the start of that sequence.
*/ | |
| 8562 | |
| 8563 // These values should already be set by the code above. | |
| 8564 //pSrc = source + equalLength; /* point to the first differing c
hars */ | |
| 8565 //pTarg = target + equalLength; | |
| 8566 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || | |
| 8567 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) | |
| 8568 { | |
| 8569 // We are stopped in the middle of a contraction. | |
| 8570 // Scan backwards through the == part of the string looking for the
start of the contraction. | |
| 8571 // It doesn't matter which string we scan, since they are the same
in this region. | |
| 8572 do | |
| 8573 { | |
| 8574 equalLength--; | |
| 8575 pSrc--; | |
| 8576 } | |
| 8577 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); | |
| 8578 } | |
| 8579 | |
| 8580 source += equalLength; | |
| 8581 target += equalLength; | |
| 8582 if (sourceLength > 0) { | |
| 8583 sourceLength -= equalLength; | |
| 8584 } | |
| 8585 if (targetLength > 0) { | |
| 8586 targetLength -= equalLength; | |
| 8587 } | |
| 8588 } | |
| 8589 | |
| 8590 UErrorCode status = U_ZERO_ERROR; | 453 UErrorCode status = U_ZERO_ERROR; |
| 8591 UCollationResult returnVal; | 454 UCollationResult returnVal = Collator::fromUCollator(coll)-> |
| 8592 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLeng
th > 0 && *target&0xff00)) { | 455 compare(source, sourceLength, target, targetLength, status); |
| 8593 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ
etLength, &status); | 456 UTRACE_EXIT_VALUE_STATUS(returnVal, status); |
| 8594 } else { | |
| 8595 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta
rgetLength, &status); | |
| 8596 } | |
| 8597 UTRACE_EXIT_VALUE(returnVal); | |
| 8598 return returnVal; | 457 return returnVal; |
| 8599 } | 458 } |
| 8600 | 459 |
| 8601 U_CAPI UCollationResult U_EXPORT2 | 460 U_CAPI UCollationResult U_EXPORT2 |
| 8602 ucol_strcollUTF8( | 461 ucol_strcollUTF8( |
| 8603 const UCollator *coll, | 462 const UCollator *coll, |
| 8604 const char *source, | 463 const char *source, |
| 8605 int32_t sourceLength, | 464 int32_t sourceLength, |
| 8606 const char *target, | 465 const char *target, |
| 8607 int32_t targetLength, | 466 int32_t targetLength, |
| 8608 UErrorCode *status) | 467 UErrorCode *status) |
| 8609 { | 468 { |
| 8610 U_ALIGN_CODE(16); | 469 U_ALIGN_CODE(16); |
| 8611 | 470 |
| 8612 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); | 471 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); |
| 8613 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 472 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
| 8614 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); | 473 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); |
| 8615 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt
h); | 474 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt
h); |
| 8616 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt
h); | 475 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt
h); |
| 8617 } | 476 } |
| 8618 | 477 |
| 8619 if (U_FAILURE(*status)) { | 478 if (U_FAILURE(*status)) { |
| 8620 /* do nothing */ | 479 /* do nothing */ |
| 8621 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | 480 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
| 8622 return UCOL_EQUAL; | 481 return UCOL_EQUAL; |
| 8623 } | 482 } |
| 8624 | 483 |
| 8625 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength
!= 0)) { | 484 UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareU
TF8( |
| 8626 *status = U_ILLEGAL_ARGUMENT_ERROR; | 485 source, sourceLength, target, targetLength, *status); |
| 8627 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
| 8628 return UCOL_EQUAL; | |
| 8629 } | |
| 8630 | |
| 8631 /* Quick check if source and target are same strings. */ | |
| 8632 /* They should either both be NULL terminated or the explicit length should
be set on both. */ | |
| 8633 if (source==target && sourceLength==targetLength) { | |
| 8634 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
| 8635 return UCOL_EQUAL; | |
| 8636 } | |
| 8637 | |
| 8638 if(coll->delegate != NULL) { | |
| 8639 return ((const Collator*)coll->delegate)->compareUTF8( | |
| 8640 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourc
eLength), | |
| 8641 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targe
tLength), | |
| 8642 *status); | |
| 8643 } | |
| 8644 | |
| 8645 /* Scan the strings. Find:
*/ | |
| 8646 /* The length of any leading portion that is equal
*/ | |
| 8647 /* Whether they are exactly equal. (in which case we just return)
*/ | |
| 8648 const char *pSrc = source; | |
| 8649 const char *pTarg = target; | |
| 8650 UBool bSrcLimit = FALSE; | |
| 8651 UBool bTargLimit = FALSE; | |
| 8652 | |
| 8653 if (sourceLength == -1 && targetLength == -1) { | |
| 8654 // Both strings are null terminated. | |
| 8655 // Scan through any leading equal portion. | |
| 8656 while (*pSrc == *pTarg && *pSrc != 0) { | |
| 8657 pSrc++; | |
| 8658 pTarg++; | |
| 8659 } | |
| 8660 if (*pSrc == 0 && *pTarg == 0) { | |
| 8661 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
| 8662 return UCOL_EQUAL; | |
| 8663 } | |
| 8664 bSrcLimit = (*pSrc == 0); | |
| 8665 bTargLimit = (*pTarg == 0); | |
| 8666 } | |
| 8667 else | |
| 8668 { | |
| 8669 // One or both strings has an explicit length. | |
| 8670 const char *pSrcEnd = source + sourceLength; | |
| 8671 const char *pTargEnd = target + targetLength; | |
| 8672 | |
| 8673 // Scan while the strings are bitwise ==, or until one is exhausted. | |
| 8674 for (;;) { | |
| 8675 if (pSrc == pSrcEnd || pTarg == pTargEnd) { | |
| 8676 break; | |
| 8677 } | |
| 8678 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng
th == -1)) { | |
| 8679 break; | |
| 8680 } | |
| 8681 if (*pSrc != *pTarg) { | |
| 8682 break; | |
| 8683 } | |
| 8684 pSrc++; | |
| 8685 pTarg++; | |
| 8686 } | |
| 8687 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); | |
| 8688 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); | |
| 8689 | |
| 8690 // If we made it all the way through both strings, we are done. They ar
e == | |
| 8691 if (bSrcLimit && /* At end of src string, however it was specified. *
/ | |
| 8692 bTargLimit) /* and also at end of dest string *
/ | |
| 8693 { | |
| 8694 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
| 8695 return UCOL_EQUAL; | |
| 8696 } | |
| 8697 } | |
| 8698 | |
| 8699 U_ASSERT(!(bSrcLimit && bTargLimit)); | |
| 8700 | |
| 8701 int32_t equalLength = pSrc - source; | |
| 8702 UBool bSawNonLatin1 = FALSE; | |
| 8703 | |
| 8704 if (equalLength > 0) { | |
| 8705 // Align position to the start of UTF-8 code point. | |
| 8706 if (bTargLimit) { | |
| 8707 U8_SET_CP_START((const uint8_t*)source, 0, equalLength); | |
| 8708 } else { | |
| 8709 U8_SET_CP_START((const uint8_t*)target, 0, equalLength); | |
| 8710 } | |
| 8711 pSrc = source + equalLength; | |
| 8712 pTarg = target + equalLength; | |
| 8713 } | |
| 8714 | |
| 8715 if (equalLength > 0) { | |
| 8716 /* There is an identical portion at the beginning of the two strings.
*/ | |
| 8717 /* If the identical portion ends within a contraction or a comibining
*/ | |
| 8718 /* character sequence, back up to the start of that sequence.
*/ | |
| 8719 UBool bUnsafeCP = FALSE; | |
| 8720 UChar32 uc32 = -1; | |
| 8721 | |
| 8722 if (!bSrcLimit) { | |
| 8723 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength,
uc32); | |
| 8724 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { | |
| 8725 bUnsafeCP = TRUE; | |
| 8726 } | |
| 8727 bSawNonLatin1 |= (uc32 > 0xff); | |
| 8728 } | |
| 8729 if (!bTargLimit) { | |
| 8730 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength,
uc32); | |
| 8731 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { | |
| 8732 bUnsafeCP = TRUE; | |
| 8733 } | |
| 8734 bSawNonLatin1 |= (uc32 > 0xff); | |
| 8735 } | |
| 8736 | |
| 8737 if (bUnsafeCP) { | |
| 8738 while (equalLength > 0) { | |
| 8739 // We are stopped in the middle of a contraction. | |
| 8740 // Scan backwards through the == part of the string looking for
the start of the contraction. | |
| 8741 // It doesn't matter which string we scan, since they are the
same in this region. | |
| 8742 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); | |
| 8743 bSawNonLatin1 |= (uc32 > 0xff); | |
| 8744 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { | |
| 8745 break; | |
| 8746 } | |
| 8747 } | |
| 8748 } | |
| 8749 source += equalLength; | |
| 8750 target += equalLength; | |
| 8751 if (sourceLength > 0) { | |
| 8752 sourceLength -= equalLength; | |
| 8753 } | |
| 8754 if (targetLength > 0) { | |
| 8755 targetLength -= equalLength; | |
| 8756 } | |
| 8757 } else { | |
| 8758 // Lead byte of Latin 1 character is 0x00 - 0xC3 | |
| 8759 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc
3); | |
| 8760 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0x
c3); | |
| 8761 } | |
| 8762 | |
| 8763 UCollationResult returnVal; | |
| 8764 | |
| 8765 if(!coll->latinOneUse || bSawNonLatin1) { | |
| 8766 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target,
targetLength, status); | |
| 8767 } else { | |
| 8768 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target
, targetLength, status); | |
| 8769 } | |
| 8770 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); | 486 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); |
| 8771 return returnVal; | 487 return returnVal; |
| 8772 } | 488 } |
| 8773 | 489 |
| 8774 | 490 |
| 8775 /* convenience function for comparing strings */ | 491 /* convenience function for comparing strings */ |
| 8776 U_CAPI UBool U_EXPORT2 | 492 U_CAPI UBool U_EXPORT2 |
| 8777 ucol_greater( const UCollator *coll, | 493 ucol_greater( const UCollator *coll, |
| 8778 const UChar *source, | 494 const UChar *source, |
| 8779 int32_t sourceLength, | 495 int32_t sourceLength, |
| (...skipping 23 matching lines...) Expand all Loading... |
| 8803 int32_t sourceLength, | 519 int32_t sourceLength, |
| 8804 const UChar *target, | 520 const UChar *target, |
| 8805 int32_t targetLength) | 521 int32_t targetLength) |
| 8806 { | 522 { |
| 8807 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) | 523 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| 8808 == UCOL_EQUAL); | 524 == UCOL_EQUAL); |
| 8809 } | 525 } |
| 8810 | 526 |
| 8811 U_CAPI void U_EXPORT2 | 527 U_CAPI void U_EXPORT2 |
| 8812 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { | 528 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { |
| 8813 if(coll && coll->UCA) { | 529 const Collator *c = Collator::fromUCollator(coll); |
| 8814 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); | 530 if(c != NULL) { |
| 531 UVersionInfo v; |
| 532 c->getVersion(v); |
| 533 // Note: This is tied to how the current implementation encodes the UCA
version |
| 534 // in the overall getVersion(). |
| 535 // Alternatively, we could load the root collator and get at lower-level
data from there. |
| 536 // Either way, it will reflect the input collator's UCA version only |
| 537 // if it is a known implementation. |
| 538 // It would be cleaner to make this a virtual Collator method. |
| 539 info[0] = v[1] >> 3; |
| 540 info[1] = v[1] & 7; |
| 541 info[2] = v[2] >> 6; |
| 542 info[3] = 0; |
| 8815 } | 543 } |
| 8816 } | 544 } |
| 8817 | 545 |
| 546 U_CAPI const UChar * U_EXPORT2 |
| 547 ucol_getRules(const UCollator *coll, int32_t *length) { |
| 548 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 549 // OK to crash if coll==NULL: We do not want to check "this" pointers. |
| 550 if(rbc != NULL || coll == NULL) { |
| 551 const UnicodeString &rules = rbc->getRules(); |
| 552 U_ASSERT(rules.getBuffer()[rules.length()] == 0); |
| 553 *length = rules.length(); |
| 554 return rules.getBuffer(); |
| 555 } |
| 556 static const UChar _NUL = 0; |
| 557 *length = 0; |
| 558 return &_NUL; |
| 559 } |
| 560 |
| 561 U_CAPI int32_t U_EXPORT2 |
| 562 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int3
2_t bufferLen) { |
| 563 UnicodeString rules; |
| 564 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 565 if(rbc != NULL || coll == NULL) { |
| 566 rbc->getRules(delta, rules); |
| 567 } |
| 568 if(buffer != NULL && bufferLen > 0) { |
| 569 UErrorCode errorCode = U_ZERO_ERROR; |
| 570 return rules.extract(buffer, bufferLen, errorCode); |
| 571 } else { |
| 572 return rules.length(); |
| 573 } |
| 574 } |
| 575 |
| 576 U_CAPI const char * U_EXPORT2 |
| 577 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *statu
s) { |
| 578 return ucol_getLocaleByType(coll, type, status); |
| 579 } |
| 580 |
| 581 U_CAPI const char * U_EXPORT2 |
| 582 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode
*status) { |
| 583 if(U_FAILURE(*status)) { |
| 584 return NULL; |
| 585 } |
| 586 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE); |
| 587 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll); |
| 588 |
| 589 const char *result; |
| 590 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 591 if(rbc == NULL && coll != NULL) { |
| 592 *status = U_UNSUPPORTED_ERROR; |
| 593 result = NULL; |
| 594 } else { |
| 595 result = rbc->internalGetLocaleID(type, *status); |
| 596 } |
| 597 |
| 598 UTRACE_DATA1(UTRACE_INFO, "result = %s", result); |
| 599 UTRACE_EXIT_STATUS(*status); |
| 600 return result; |
| 601 } |
| 602 |
| 603 U_CAPI USet * U_EXPORT2 |
| 604 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) { |
| 605 if(U_FAILURE(*status)) { |
| 606 return NULL; |
| 607 } |
| 608 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status); |
| 609 if(U_FAILURE(*status)) { |
| 610 delete set; |
| 611 return NULL; |
| 612 } |
| 613 return set->toUSet(); |
| 614 } |
| 615 |
| 616 U_CAPI UBool U_EXPORT2 |
| 617 ucol_equals(const UCollator *source, const UCollator *target) { |
| 618 return source == target || |
| 619 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target))
; |
| 620 } |
| 621 |
| 8818 #endif /* #if !UCONFIG_NO_COLLATION */ | 622 #endif /* #if !UCONFIG_NO_COLLATION */ |
| OLD | NEW |