source/i18n/ucol.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/ucol.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 *******************************************************************************	2 *******************************************************************************

3 * Copyright (C) 1996-2013, International Business Machines	3 * Copyright (C) 1996-2014, International Business Machines

4 * Corporation and others. All Rights Reserved.	4 * Corporation and others. All Rights Reserved.

5 *******************************************************************************	5 *******************************************************************************

6 * file name: ucol.cpp	6 * file name: ucol.cpp

7 * encoding: US-ASCII	7 * encoding: US-ASCII

8 * tab size: 8 (not used)	8 * tab size: 8 (not used)

9 * indentation:4	9 * indentation:4

10 *	10 *

11 * Modification history	11 * Modification history

12 * Date Name Comments	12 * Date Name Comments

13 * 1996-1999 various members of ICU team maintained C API for collation framework	13 * 1996-1999 various members of ICU team maintained C API for collation framework

14 * 02/16/2001 synwee Added internal method getPrevSpecialCE	14 * 02/16/2001 synwee Added internal method getPrevSpecialCE

15 * 03/01/2001 synwee Added maxexpansion functionality.	15 * 03/01/2001 synwee Added maxexpansion functionality.

16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant	16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant

	17 * 2012-2014 markus Rewritten in C++ again.

17 */	18 */

18	19

19 #include "unicode/utypes.h"	20 #include "unicode/utypes.h"

20	21

21 #if !UCONFIG_NO_COLLATION	22 #if !UCONFIG_NO_COLLATION

22	23

	24 #include "unicode/coll.h"

	25 #include "unicode/tblcoll.h"

23 #include "unicode/bytestream.h"	26 #include "unicode/bytestream.h"

24 #include "unicode/coleitr.h"	27 #include "unicode/coleitr.h"

25 #include "unicode/unorm.h"	28 #include "unicode/ucoleitr.h"

26 #include "unicode/udata.h"

27 #include "unicode/ustring.h"	29 #include "unicode/ustring.h"

28 #include "unicode/utf8.h"

29

30 #include "ucol_imp.h"

31 #include "bocsu.h"

32

33 #include "normalizer2impl.h"

34 #include "unorm_it.h"

35 #include "umutex.h"

36 #include "cmemory.h"	30 #include "cmemory.h"

37 #include "ucln_in.h"	31 #include "collation.h"

38 #include "cstring.h"	32 #include "cstring.h"

39 #include "utracimp.h"

40 #include "putilimp.h"	33 #include "putilimp.h"

41 #include "uassert.h"	34 #include "uassert.h"

42 #include "unicode/coll.h"	35 #include "utracimp.h"

43

44 #ifdef UCOL_DEBUG

45 #include <stdio.h>

46 #endif

47	36

48 U_NAMESPACE_USE	37 U_NAMESPACE_USE

49	38

50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

51

52 #define LAST_BYTE_MASK_ 0xFF

53 #define SECOND_LAST_BYTE_SHIFT_ 8

54

55 #define ZERO_CC_LIMIT_ 0xC0

56

57 // These are static pointers to the NFC/NFD implementation instance.

58 // Each of them is always the same between calls to u_cleanup

59 // and therefore writing to it is not synchronized.

60 // They are cleaned in ucol_cleanup

61 static const Normalizer2 *g_nfd = NULL;

62 static const Normalizer2Impl *g_nfcImpl = NULL;

63

64 // These are values from UCA required for

65 // implicit generation and supressing sort key compression

66 // they should regularly be in the UCA, but if one

67 // is running without UCA, it could be a problem

68 static const int32_t maxRegularPrimary = 0x7A;

69 static const int32_t minImplicitPrimary = 0xE0;

70 static const int32_t maxImplicitPrimary = 0xE4;

71

72 U_CDECL_BEGIN

73 static UBool U_CALLCONV

74 ucol_cleanup(void)

75 {

76 g_nfd = NULL;

77 g_nfcImpl = NULL;

78 return TRUE;

79 }

80

81 static int32_t U_CALLCONV

82 _getFoldingOffset(uint32_t data) {

83 return (int32_t)(data&0xFFFFFF);

84 }

85

86 U_CDECL_END

87

88 static inline

89 UBool initializeNFD(UErrorCode *status) {

90 if (g_nfd != NULL) {

91 return TRUE;

92 } else {

93 // The result is constant, until the library is reloaded.

94 g_nfd = Normalizer2Factory::getNFDInstance(*status);

95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);

96 return U_SUCCESS(*status);

97 }

98 }

99

100 // init FCD data

101 static inline

102 UBool initializeFCD(UErrorCode *status) {

103 if (g_nfcImpl != NULL) {

104 return TRUE;

105 } else {

106 // The result is constant, until the library is reloaded.

107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);

108 // Note: Alternatively, we could also store this pointer in each collIterate struct,

109 // same as Normalizer2Factory::getImpl(collIterate->nfd).

110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);

111 return U_SUCCESS(*status);

112 }

113 }

114

115 static

116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,

117 int32_t sourceLen, collIterate *s,

118 UErrorCode *status)

119 {

120 (s)->string = (s)->pos = sourceString;

121 (s)->origFlags = 0;

122 (s)->flags = 0;

123 if (sourceLen >= 0) {

124 s->flags |= UCOL_ITER_HASLEN;

125 (s)->endp = (UChar *)sourceString+sourceLen;

126 }

127 else {

128 /* change to enable easier checking for end of string for fcdPosition */

129 (s)->endp = NULL;

130 }

131 (s)->extendCEs = NULL;

132 (s)->extendCEsSize = 0;

133 (s)->CEpos = (s)->toReturn = (s)->CEs;

134 (s)->offsetBuffer = NULL;

135 (s)->offsetBufferSize = 0;

136 (s)->offsetReturn = (s)->offsetStore = NULL;

137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;

138 (s)->coll = (collator);

139 if (initializeNFD(status)) {

140 (s)->nfd = g_nfd;

141 } else {

142 return;

143 }

144 (s)->fcdPosition = 0;

145 if(collator->normalizationMode == UCOL_ON) {

146 (s)->flags |= UCOL_ITER_NORM;

147 }

148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {

149 (s)->flags |= UCOL_HIRAGANA_Q;

150 }

151 (s)->iterator = NULL;

152 //(s)->iteratorIndex = 0;

153 }

154

155 U_CAPI void U_EXPORT2

156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,

157 int32_t sourceLen, collIterate *s,

158 UErrorCode *status) {

159 /* Out-of-line version for use from other files. */

160 IInit_collIterate(collator, sourceString, sourceLen, s, status);

161 }

162

163 U_CAPI collIterate * U_EXPORT2

164 uprv_new_collIterate(UErrorCode *status) {

165 if(U_FAILURE(*status)) {

166 return NULL;

167 }

168 collIterate *s = new collIterate;

169 if(s == NULL) {

170 *status = U_MEMORY_ALLOCATION_ERROR;

171 return NULL;

172 }

173 return s;

174 }

175

176 U_CAPI void U_EXPORT2

177 uprv_delete_collIterate(collIterate *s) {

178 delete s;

179 }

180

181 U_CAPI UBool U_EXPORT2

182 uprv_collIterateAtEnd(collIterate *s) {

183 return s == NULL || s->pos == s->endp;

184 }

185

186 /**

187 * Backup the state of the collIterate struct data

188 * @param data collIterate to backup

189 * @param backup storage

190 */

191 static

192 inline void backupState(const collIterate *data, collIterateState *backup)

193 {

194 backup->fcdPosition = data->fcdPosition;

195 backup->flags = data->flags;

196 backup->origFlags = data->origFlags;

197 backup->pos = data->pos;

198 backup->bufferaddress = data->writableBuffer.getBuffer();

199 backup->buffersize = data->writableBuffer.length();

200 backup->iteratorMove = 0;

201 backup->iteratorIndex = 0;

202 if(data->iterator != NULL) {

203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);

204 backup->iteratorIndex = data->iterator->getState(data->iterator);

205 // now we try to fix up if we're using a normalizing iterator and we get UITER_NO_STATE

206 if(backup->iteratorIndex == UITER_NO_STATE) {

207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {

208 backup->iteratorMove++;

209 data->iterator->move(data->iterator, -1, UITER_CURRENT);

210 }

211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);

212 }

213 }

214 }

215

216 /**

217 * Loads the state into the collIterate struct data

218 * @param data collIterate to backup

219 * @param backup storage

220 * @param forwards boolean to indicate if forwards iteration is used,

221 * false indicates backwards iteration

222 */

223 static

224 inline void loadState(collIterate *data, const collIterateState *backup,

225 UBool forwards)

226 {

227 UErrorCode status = U_ZERO_ERROR;

228 data->flags = backup->flags;

229 data->origFlags = backup->origFlags;

230 if(data->iterator != NULL) {

231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);

232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);

233 if(backup->iteratorMove != 0) {

234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);

235 }

236 }

237 data->pos = backup->pos;

238

239 if ((data->flags & UCOL_ITER_INNORMBUF) &&

240 data->writableBuffer.getBuffer() != backup->bufferaddress) {

241 /*

242 this is when a new buffer has been reallocated and we'll have to

243 calculate the new position.

244 note the new buffer has to contain the contents of the old buffer.

245 */

246 if (forwards) {

247 data->pos = data->writableBuffer.getTerminatedBuffer() +

248 (data->pos - backup->bufferaddress);

249 }

250 else {

251 /* backwards direction */

252 int32_t temp = backup->buffersize -

253 (int32_t)(data->pos - backup->bufferaddress);

254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);

255 }

256 }

257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {

258 /*

259 this is a little tricky.

260 if we are initially not in the normalization buffer, even if we

261 normalize in the later stage, the data in the buffer will be

262 ignored, since we skip back up to the data string.

263 however if we are already in the normalization buffer, any

264 further normalization will pull data into the normalization

265 buffer and modify the fcdPosition.

266 since we are keeping the data in the buffer for use, the

267 fcdPosition can not be reverted back.

268 arrgghh....

269 */

270 data->fcdPosition = backup->fcdPosition;

271 }

272 }

273

274 static UBool

275 reallocCEs(collIterate *data, int32_t newCapacity) {

276 uint32_t *oldCEs = data->extendCEs;

277 if(oldCEs == NULL) {

278 oldCEs = data->CEs;

279 }

280 int32_t length = data->CEpos - oldCEs;

281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);

282 if(newCEs == NULL) {

283 return FALSE;

284 }

285 uprv_memcpy(newCEs, oldCEs, length * 4);

286 uprv_free(data->extendCEs);

287 data->extendCEs = newCEs;

288 data->extendCEsSize = newCapacity;

289 data->CEpos = newCEs + length;

290 return TRUE;

291 }

292

293 static UBool

294 increaseCEsCapacity(collIterate *data) {

295 int32_t oldCapacity;

296 if(data->extendCEs != NULL) {

297 oldCapacity = data->extendCEsSize;

298 } else {

299 oldCapacity = LENGTHOF(data->CEs);

300 }

301 return reallocCEs(data, 2 * oldCapacity);

302 }

303

304 static UBool

305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {

306 int32_t oldCapacity;

307 if(data->extendCEs != NULL) {

308 oldCapacity = data->extendCEsSize;

309 } else {

310 oldCapacity = LENGTHOF(data->CEs);

311 }

312 if(minCapacity <= oldCapacity) {

313 return TRUE;

314 }

315 oldCapacity *= 2;

316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);

317 }

318

319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {

320 if(U_FAILURE(errorCode)) {

321 return;

322 }

323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);

324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);

325 if(length >= offsetBufferSize) {

326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;

327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));

328 if(newBuffer == NULL) {

329 errorCode = U_MEMORY_ALLOCATION_ERROR;

330 return;

331 }

332 if(length > 0) {

333 uprv_memcpy(newBuffer, offsetBuffer, length * 4);

334 }

335 uprv_free(offsetBuffer);

336 offsetBuffer = newBuffer;

337 offsetStore = offsetBuffer + length;

338 offsetBufferSize = newCapacity;

339 }

340 *offsetStore++ = offset;

341 }

342

343 /*

344 * collIter_eos()

345 * Checks for a collIterate being positioned at the end of

346 * its source string.

347 *

348 */

349 static

350 inline UBool collIter_eos(collIterate *s) {

351 if(s->flags & UCOL_USE_ITERATOR) {

352 return !(s->iterator->hasNext(s->iterator));

353 }

354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {

355 // Null terminated string, but not at null, so not at end.

356 // Whether in main or normalization buffer doesn't matter.

357 return FALSE;

358 }

359

360 // String with length. Can't be in normalization buffer, which is always

361 // null terminated.

362 if (s->flags & UCOL_ITER_HASLEN) {

363 return (s->pos == s->endp);

364 }

365

366 // We are at a null termination, could be either normalization buffer or main string.

367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {

368 // At null at end of main string.

369 return TRUE;

370 }

371

372 // At null at end of normalization buffer. Need to check whether there are

373 // any characters left in the main buffer.

374 if(s->origFlags & UCOL_USE_ITERATOR) {

375 return !(s->iterator->hasNext(s->iterator));

376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {

377 // Null terminated main string. fcdPosition is the 'return' position into main buf.

378 return (*s->fcdPosition == 0);

379 }

380 else {

381 // Main string with an end pointer.

382 return s->fcdPosition == s->endp;

383 }

384 }

385

386 /*

387 * collIter_bos()

388 * Checks for a collIterate being positioned at the start of

389 * its source string.

390 *

391 */

392 static

393 inline UBool collIter_bos(collIterate *source) {

394 // if we're going backwards, we need to know whether there is more in the

395 // iterator, even if we are in the side buffer

396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {

397 return !source->iterator->hasPrevious(source->iterator);

398 }

399 if (source->pos <= source->string ||

400 ((source->flags & UCOL_ITER_INNORMBUF) &&

401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {

402 return TRUE;

403 }

404 return FALSE;

405 }

406

407 /*static

408 inline UBool collIter_SimpleBos(collIterate *source) {

409 // if we're going backwards, we need to know whether there is more in the

410 // iterator, even if we are in the side buffer

411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {

412 return !source->iterator->hasPrevious(source->iterator);

413 }

414 if (source->pos == source->string) {

415 return TRUE;

416 }

417 return FALSE;

418 }*/

419 //return (data->pos == data->string) ||

420

421

422 /****************************************************************************/

423 /* Following are the open/close functions */

424 /* */

425 /****************************************************************************/

426

427 static UCollator*

428 ucol_initFromBinary(const uint8_t *bin, int32_t length,

429 const UCollator *base,

430 UCollator *fillIn,

431 UErrorCode *status)

432 {

433 UCollator *result = fillIn;

434 if(U_FAILURE(*status)) {

435 return NULL;

436 }

437 /*

438 if(base == NULL) {

439 // we don't support null base yet

440 *status = U_ILLEGAL_ARGUMENT_ERROR;

441 return NULL;

442 }

443 */

444 // We need these and we could be running without UCA

445 uprv_uca_initImplicitConstants(status);

446 UCATableHeader *colData = (UCATableHeader *)bin;

447 // do we want version check here? We're trying to figure out whether collators are compatible

448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||

449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||

450 colData->version[0] != UCOL_BUILDER_VERSION)

451 {

452 *status = U_COLLATOR_VERSION_MISMATCH;

453 return NULL;

454 }

455 else {

456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {

457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);

458 if(U_FAILURE(*status)){

459 return NULL;

460 }

461 result->hasRealData = TRUE;

462 }

463 else {

464 if(base) {

465 result = ucol_initCollator(base->image, result, base, status);

466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);

467 if(U_FAILURE(*status)){

468 return NULL;

469 }

470 result->hasRealData = FALSE;

471 }

472 else {

473 *status = U_USELESS_COLLATOR_ERROR;

474 return NULL;

475 }

476 }

477 result->freeImageOnClose = FALSE;

478 }

479 result->actualLocale = NULL;

480 result->validLocale = NULL;

481 result->requestedLocale = NULL;

482 result->rules = NULL;

483 result->rulesLength = 0;

484 result->freeRulesOnClose = FALSE;

485 result->ucaRules = NULL;

486 return result;

487 }

488

489 U_CAPI UCollator* U_EXPORT2	39 U_CAPI UCollator* U_EXPORT2

490 ucol_openBinary(const uint8_t *bin, int32_t length,	40 ucol_openBinary(const uint8_t *bin, int32_t length,

491 const UCollator *base,	41 const UCollator *base,

492 UErrorCode *status)	42 UErrorCode *status)

493 {	43 {

494 return ucol_initFromBinary(bin, length, base, NULL, status);	44 if(U_FAILURE(*status)) { return NULL; }

	45 RuleBasedCollator *coll = new RuleBasedCollator(

	46 bin, length,

	47 RuleBasedCollator::rbcFromUCollator(base),

	48 *status);

	49 if(coll == NULL) {

	50 *status = U_MEMORY_ALLOCATION_ERROR;

	51 return NULL;

	52 }

	53 if(U_FAILURE(*status)) {

	54 delete coll;

	55 return NULL;

	56 }

	57 return coll->toUCollator();

495 }	58 }

496	59

497 U_CAPI int32_t U_EXPORT2	60 U_CAPI int32_t U_EXPORT2

498 ucol_cloneBinary(const UCollator *coll,	61 ucol_cloneBinary(const UCollator *coll,

499 uint8_t *buffer, int32_t capacity,	62 uint8_t *buffer, int32_t capacity,

500 UErrorCode *status)	63 UErrorCode *status)

501 {	64 {

502 int32_t length = 0;

503 if(U_FAILURE(*status)) {	65 if(U_FAILURE(*status)) {

504 return length;	66 return 0;

505 }	67 }

506 if(capacity < 0) {	68 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);

507 *status = U_ILLEGAL_ARGUMENT_ERROR;	69 if(rbc == NULL && coll != NULL) {

508 return length;	70 *status = U_UNSUPPORTED_ERROR;

	71 return 0;

509 }	72 }

510 if(coll->hasRealData == TRUE) {	73 return rbc->cloneBinary(buffer, capacity, *status);

511 length = coll->image->size;

512 if(length <= capacity) {

513 uprv_memcpy(buffer, coll->image, length);

514 } else {

515 *status = U_BUFFER_OVERFLOW_ERROR;

516 }

517 } else {

518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));

519 if(length <= capacity) {

520 /* build the UCATableHeader with minimal entries */

521 /* do not copy the header from the UCA file because its values are wrong! */

522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */

523

524 /* reset everything */

525 uprv_memset(buffer, 0, length);

526

527 /* set the tailoring-specific values */

528 UCATableHeader *myData = (UCATableHeader *)buffer;

529 myData->size = length;

530

531 /* offset for the options, the only part of the data that is present after the header */

532 myData->options = sizeof(UCATableHeader);

533

534 /* need to always set the expansion value for an upper bound of the options */

535 myData->expansion = myData->options + sizeof(UColOptionSet);

536

537 myData->magic = UCOL_HEADER_MAGIC;

538 myData->isBigEndian = U_IS_BIG_ENDIAN;

539 myData->charSetFamily = U_CHARSET_FAMILY;

540

541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */

542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn fo));

543

544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer sionInfo));

545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer sionInfo));

546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo f(UVersionInfo));

547 myData->jamoSpecial = coll->image->jamoSpecial;

548

549 /* copy the collator options */

550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options , sizeof(UColOptionSet));

551 } else {

552 *status = U_BUFFER_OVERFLOW_ERROR;

553 }

554 }

555 return length;

556 }	74 }

557	75

558 U_CAPI UCollator* U_EXPORT2	76 U_CAPI UCollator* U_EXPORT2

559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)	77 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)

560 {	78 {

561 UCollator * localCollator;

562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);

563 int32_t imageSize = 0;

564 int32_t rulesSize = 0;

565 int32_t rulesPadding = 0;

566 int32_t defaultReorderCodesSize = 0;

567 int32_t reorderCodesSize = 0;

568 uint8_t *image;

569 UChar *rules;

570 int32_t* defaultReorderCodes;

571 int32_t* reorderCodes;

572 uint8_t* leadBytePermutationTable;

573 UBool imageAllocated = FALSE;

574

575 if (status == NULL || U_FAILURE(*status)){	79 if (status == NULL || U_FAILURE(*status)){

576 return NULL;	80 return NULL;

577 }	81 }

578 if (coll == NULL) {	82 if (coll == NULL) {

579 *status = U_ILLEGAL_ARGUMENT_ERROR;	83 *status = U_ILLEGAL_ARGUMENT_ERROR;

580 return NULL;	84 return NULL;

581 }	85 }

582

583 if (coll->rules && coll->freeRulesOnClose) {

584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);

585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));

586 bufferSizeNeeded += rulesSize + rulesPadding;

587 }

588 // no padding for alignment needed from here since the next two are 4 byte quantities

589 if (coll->defaultReorderCodes) {

590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);

591 bufferSizeNeeded += defaultReorderCodesSize;

592 }

593 if (coll->reorderCodes) {

594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);

595 bufferSizeNeeded += reorderCodesSize;

596 }

597 if (coll->leadBytePermutationTable) {

598 bufferSizeNeeded += 256 * sizeof(uint8_t);

599 }

600

601 if (pBufferSize != NULL) {	86 if (pBufferSize != NULL) {

602 int32_t inputSize = *pBufferSize;	87 int32_t inputSize = *pBufferSize;

603 *pBufferSize = 1;	88 *pBufferSize = 1;

604 if (inputSize == 0) {	89 if (inputSize == 0) {

605 return NULL; // preflighting for deprecated functionality	90 return NULL; // preflighting for deprecated functionality

606 }	91 }

607 }	92 }

608	93 Collator *newColl = Collator::fromUCollator(coll)->clone();

609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);	94 if (newColl == NULL) {

610 // Null pointer check.

611 if (stackBufferChars == NULL) {

612 *status = U_MEMORY_ALLOCATION_ERROR;	95 *status = U_MEMORY_ALLOCATION_ERROR;

613 return NULL;	96 } else {

	97 *status = U_SAFECLONE_ALLOCATED_WARNING;

614 }	98 }

615 *status = U_SAFECLONE_ALLOCATED_WARNING;	99 return newColl->toUCollator();

616

617 localCollator = (UCollator *)stackBufferChars;

618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);

619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);

620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);

621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;

622

623 {

624 UErrorCode tempStatus = U_ZERO_ERROR;

625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);

626 }

627 if (coll->freeImageOnClose) {

628 image = (uint8_t *)uprv_malloc(imageSize);

629 // Null pointer check

630 if (image == NULL) {

631 *status = U_MEMORY_ALLOCATION_ERROR;

632 return NULL;

633 }

634 ucol_cloneBinary(coll, image, imageSize, status);

635 imageAllocated = TRUE;

636 }

637 else {

638 image = (uint8_t *)coll->image;

639 }

640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);

641 if (U_FAILURE(*status)) {

642 return NULL;

643 }

644

645 if (coll->rules) {

646 if (coll->freeRulesOnClose) {

647 localCollator->rules = u_strcpy(rules, coll->rules);

648 //bufferEnd += rulesSize;

649 }

650 else {

651 localCollator->rules = coll->rules;

652 }

653 localCollator->freeRulesOnClose = FALSE;

654 localCollator->rulesLength = coll->rulesLength;

655 }

656

657 // collator reordering

658 if (coll->defaultReorderCodes) {

659 localCollator->defaultReorderCodes =

660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));

661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;

662 localCollator->freeDefaultReorderCodesOnClose = FALSE;

663 }

664 if (coll->reorderCodes) {

665 localCollator->reorderCodes =

666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));

667 localCollator->reorderCodesLength = coll->reorderCodesLength;

668 localCollator->freeReorderCodesOnClose = FALSE;

669 }

670 if (coll->leadBytePermutationTable) {

671 localCollator->leadBytePermutationTable =

672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);

673 localCollator->freeLeadBytePermutationTableOnClose = FALSE;

674 }

675

676 int32_t i;

677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {

678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);

679 }

680 // zero copies of pointers

681 localCollator->actualLocale = NULL;

682 localCollator->validLocale = NULL;

683 localCollator->requestedLocale = NULL;

684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.

685 localCollator->freeOnClose = TRUE;

686 localCollator->freeImageOnClose = imageAllocated;

687 return localCollator;

688 }	100 }

689	101

690 U_CAPI void U_EXPORT2	102 U_CAPI void U_EXPORT2

691 ucol_close(UCollator *coll)	103 ucol_close(UCollator *coll)

692 {	104 {

693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);	105 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);

694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);	106 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);

695 if(coll != NULL) {	107 if(coll != NULL) {

696 // these are always owned by each UCollator struct,	108 delete Collator::fromUCollator(coll);

697 // so we always free them

698 if(coll->validLocale != NULL) {

699 uprv_free(coll->validLocale);

700 }

701 if(coll->actualLocale != NULL) {

702 uprv_free(coll->actualLocale);

703 }

704 if(coll->requestedLocale != NULL) {

705 uprv_free(coll->requestedLocale);

706 }

707 if(coll->latinOneCEs != NULL) {

708 uprv_free(coll->latinOneCEs);

709 }

710 if(coll->options != NULL && coll->freeOptionsOnClose) {

711 uprv_free(coll->options);

712 }

713 if(coll->rules != NULL && coll->freeRulesOnClose) {

714 uprv_free((UChar *)coll->rules);

715 }

716 if(coll->image != NULL && coll->freeImageOnClose) {

717 uprv_free((UCATableHeader *)coll->image);

718 }

719

720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {

721 uprv_free(coll->leadBytePermutationTable);

722 }

723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {

724 uprv_free(coll->defaultReorderCodes);

725 }

726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {

727 uprv_free(coll->reorderCodes);

728 }

729

730 if(coll->delegate != NULL) {

731 delete (Collator*)coll->delegate;

732 }

733

734 /* Here, it would be advisable to close: */

735 /* - UData for UCA (unless we stuff it in the root resb */

736 /* Again, do we need additional housekeeping... HMMM! */

737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);

738 if(coll->freeOnClose){

739 /* for safeClone, if freeOnClose is FALSE,

740 don't free the other instance data */

741 uprv_free(coll);

742 }

743 }	109 }

744 UTRACE_EXIT();	110 UTRACE_EXIT();

745 }	111 }

746	112

747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {

748 if(U_FAILURE(*status)) {

749 return;

750 }

751 result->caseFirst = (UColAttributeValue)opts->caseFirst;

752 result->caseLevel = (UColAttributeValue)opts->caseLevel;

753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;

754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;

755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {

756 return;

757 }

758 result->strength = (UColAttributeValue)opts->strength;

759 result->variableTopValue = opts->variableTopValue;

760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;

761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;

762 result->numericCollation = (UColAttributeValue)opts->numericCollation;

763 result->caseFirstisDefault = TRUE;

764 result->caseLevelisDefault = TRUE;

765 result->frenchCollationisDefault = TRUE;

766 result->normalizationModeisDefault = TRUE;

767 result->strengthisDefault = TRUE;

768 result->variableTopValueisDefault = TRUE;

769 result->alternateHandlingisDefault = TRUE;

770 result->hiraganaQisDefault = TRUE;

771 result->numericCollationisDefault = TRUE;

772

773 ucol_updateInternalState(result, status);

774

775 result->options = opts;

776 }

777

778

779 /**

780 * Approximate determination if a character is at a contraction end.

781 * Guaranteed to be TRUE if a character is at the end of a contraction,

782 * otherwise it is not deterministic.

783 * @param c character to be determined

784 * @param coll collator

785 */

786 static

787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {

788 if (c < coll->minContrEndCP) {

789 return FALSE;

790 }

791

792 int32_t hash = c;

793 uint8_t htbyte;

794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {

795 if (U16_IS_TRAIL(c)) {

796 return TRUE;

797 }

798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;

799 }

800 htbyte = coll->contrEndCP[hash>>3];

801 return (((htbyte >> (hash & 7)) & 1) == 1);

802 }

803

804

805

806 /*

807 * i_getCombiningClass()

808 * A fast, at least partly inline version of u_getCombiningClass()

809 * This is a candidate for further optimization. Used heavily

810 * in contraction processing.

811 */

812 static

813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {

814 uint8_t sCC = 0;

815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {

816 sCC = u_getCombiningClass(c);

817 }

818 return sCC;

819 }

820

821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {

822 UChar c;

823 UCollator *result = fillIn;

824 if(U_FAILURE(*status) || image == NULL) {

825 return NULL;

826 }

827

828 if(result == NULL) {

829 result = (UCollator *)uprv_malloc(sizeof(UCollator));

830 if(result == NULL) {

831 *status = U_MEMORY_ALLOCATION_ERROR;

832 return result;

833 }

834 result->freeOnClose = TRUE;

835 } else {

836 result->freeOnClose = FALSE;

837 }

838

839 result->delegate = NULL;

840

841 result->image = image;

842 result->mapping.getFoldingOffset = _getFoldingOffset;

843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;

844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);

845 if(U_FAILURE(*status)) {

846 if(result->freeOnClose == TRUE) {

847 uprv_free(result);

848 result = NULL;

849 }

850 return result;

851 }

852

853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);

854 result->contractionCEs = (uint32_t)((uint8_t)result->image+result->image-> contractionCEs);

855 result->contractionIndex = (UChar)((uint8_t)result->image+result->image->c ontractionIndex);

856 result->expansion = (uint32_t)((uint8_t)result->image+result->image->expan sion);

857 result->rules = NULL;

858 result->rulesLength = 0;

859 result->freeRulesOnClose = FALSE;

860 result->defaultReorderCodes = NULL;

861 result->defaultReorderCodesLength = 0;

862 result->freeDefaultReorderCodesOnClose = FALSE;

863 result->reorderCodes = NULL;

864 result->reorderCodesLength = 0;

865 result->freeReorderCodesOnClose = FALSE;

866 result->leadBytePermutationTable = NULL;

867 result->freeLeadBytePermutationTableOnClose = FALSE;

868

869 /* get the version info from UCATableHeader and populate the Collator struct */

870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/

871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v ersion*/

872 result->dataVersion[2] = 0;

873 result->dataVersion[3] = 0;

874

875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;

876 result->minUnsafeCP = 0;

877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.

878 if (ucol_unsafeCP(c, result)) break;

879 }

880 result->minUnsafeCP = c;

881

882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;

883 result->minContrEndCP = 0;

884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.

885 if (ucol_contractionEndCP(c, result)) break;

886 }

887 result->minContrEndCP = c;

888

889 /* max expansion tables */

890 result->endExpansionCE = (uint32_t)((uint8_t)result->image +

891 result->image->endExpansionCE);

892 result->lastEndExpansionCE = result->endExpansionCE +

893 result->image->endExpansionCECount - 1;

894 result->expansionCESize = (uint8_t*)result->image +

895 result->image->expansionCESize;

896

897

898 //result->errorCode = *status;

899

900 result->latinOneCEs = NULL;

901

902 result->latinOneRegenTable = FALSE;

903 result->latinOneFailed = FALSE;

904 result->UCA = UCA;

905

906 /* Normally these will be set correctly later. This is the default if you us e UCA or the default. */

907 result->ucaRules = NULL;

908 result->actualLocale = NULL;

909 result->validLocale = NULL;

910 result->requestedLocale = NULL;

911 result->hasRealData = FALSE; // real data lives in .dat file...

912 result->freeImageOnClose = FALSE;

913

914 /* set attributes */

915 ucol_setOptionsFromHeader(

916 result,

917 (UColOptionSet)((uint8_t)result->image+result->image->options),

918 status);

919 result->freeOptionsOnClose = FALSE;

920

921 return result;

922 }

923

924 /* new Mark's code */

925

926 /**

927 * For generation of Implicit CEs

928 * @author Davis

929 *

930 * Cleaned up so that changes can be made more easily.

931 * Old values:

932 # First Implicit: E26A792D

933 # Last Implicit: E3DC70C0

934 # First CJK: E0030300

935 # Last CJK: E0A9DD00

936 # First CJK_A: E0A9DF00

937 # Last CJK_A: E0DE3100

938 */

939 /* Following is a port of Mark's code for new treatment of implicits.

940 * It is positioned here, since ucol_initUCA need to initialize the

941 * variables below according to the data in the fractional UCA.

942 */

943

944 /**

945 * Function used to:

946 * a) collapse the 2 different Han ranges from UCA into one (in the right order) , and

947 * b) bump any non-CJK characters by 10FFFF.

948 * The relevant blocks are:

949 * A: 4E00..9FFF; CJK Unified Ideographs

950 * F900..FAFF; CJK Compatibility Ideographs

951 * B: 3400..4DBF; CJK Unified Ideographs Extension A

952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)

953 * As long as

954 * no new B characters are allocated between 4E00 and FAFF, and

955 * no new A characters are outside of this range,

956 * (very high probability) this simple code will work.

957 * The reordered blocks are:

958 * Block1 is CJK

959 * Block2 is CJK_COMPAT_USED

960 * Block3 is CJK_A

961 * (all contiguous)

962 * Any other CJK gets its normal code point

963 * Any non-CJK gets +10FFFF

964 * When we reorder Block1, we make sure that it is at the very start,

965 * so that it will use a 3-byte form.

966 * Warning: we only pick up the compatibility characters that are

967 * NOT decomposed, so that block is smaller!

968 */

969

970 // CONSTANTS

971 static const UChar32

972 NON_CJK_OFFSET = 0x110000,

973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2

974

/**
 * Parameters of the implicit-weight encoding.
 * All start out as 0 and are filled in by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;   // spacing between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;   // spacing between used final bytes, 4-byte form
static int32_t final3Count = 0;        // number of final-byte values, 3-byte form
static int32_t final4Count = 0;        // number of final-byte values, 4-byte form
static int32_t medialCount = 0;        // number of values for a middle byte
static int32_t min3Primary = 0;        // first lead byte of the 3-byte form
static int32_t min4Primary = 0;        // first lead byte of the 4-byte form
static int32_t max4Primary = 0;        // last lead byte of the 4-byte form
static int32_t minTrail = 0;           // smallest trailing byte
static int32_t maxTrail = 0;           // largest trailing byte
static int32_t max3Trail = 0;          // largest final byte used by the 3-byte form
static int32_t max4Trail = 0;          // largest final byte used by the 4-byte form
static int32_t min4Boundary = 0;       // first raw value encoded with 4 bytes

992

993 static const UChar32

994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;

995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)

996 CJK_BASE = 0x4E00,

997 CJK_LIMIT = 0x9FCC+1,

998 // Unified CJK ideographs in the compatibility ideographs block.

999 CJK_COMPAT_USED_BASE = 0xFA0E,

1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,

1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;

1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;

1003 CJK_A_BASE = 0x3400,

1004 CJK_A_LIMIT = 0x4DB5+1,

1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;

1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;

1007 CJK_B_BASE = 0x20000,

1008 CJK_B_LIMIT = 0x2A6D6+1,

1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;

1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;

1011 CJK_C_BASE = 0x2A700,

1012 CJK_C_LIMIT = 0x2B734+1,

1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;

1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;

1015 CJK_D_BASE = 0x2B740,

1016 CJK_D_LIMIT = 0x2B81D+1;

1017 // when adding to this list, look for all occurrences (in project)

1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing !!!!

1019

1020 static UChar32 swapCJK(UChar32 i) {

1021 if (i < CJK_A_BASE) {

1022 // non-CJK

1023 } else if (i < CJK_A_LIMIT) {

1024 // Extension A has lower code points than the original Unihan+compat

1025 // but sorts higher.

1026 return i - CJK_A_BASE

1027 + (CJK_LIMIT - CJK_BASE)

1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);

1029 } else if (i < CJK_BASE) {

1030 // non-CJK

1031 } else if (i < CJK_LIMIT) {

1032 return i - CJK_BASE;

1033 } else if (i < CJK_COMPAT_USED_BASE) {

1034 // non-CJK

1035 } else if (i < CJK_COMPAT_USED_LIMIT) {

1036 return i - CJK_COMPAT_USED_BASE

1037 + (CJK_LIMIT - CJK_BASE);

1038 } else if (i < CJK_B_BASE) {

1039 // non-CJK

1040 } else if (i < CJK_B_LIMIT) {

1041 return i; // non-BMP-CJK

1042 } else if (i < CJK_C_BASE) {

1043 // non-CJK

1044 } else if (i < CJK_C_LIMIT) {

1045 return i; // non-BMP-CJK

1046 } else if (i < CJK_D_BASE) {

1047 // non-CJK

1048 } else if (i < CJK_D_LIMIT) {

1049 return i; // non-BMP-CJK

1050 }

1051 return i + NON_CJK_OFFSET; // non-CJK

1052 }

1053

1054 U_CAPI UChar32 U_EXPORT2

1055 uprv_uca_getRawFromCodePoint(UChar32 i) {

1056 return swapCJK(i)+1;

1057 }

1058

1059 U_CAPI UChar32 U_EXPORT2

1060 uprv_uca_getCodePointFromRaw(UChar32 i) {

1061 i--;

1062 UChar32 result = 0;

1063 if(i >= NON_CJK_OFFSET) {

1064 result = i - NON_CJK_OFFSET;

1065 } else if(i >= CJK_B_BASE) {

1066 result = i;

1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted

1068 if(i < CJK_LIMIT - CJK_BASE) {

1069 result = i + CJK_BASE;

1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP AT_USED_BASE)) {

1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);

1072 } else {

1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_ LIMIT - CJK_COMPAT_USED_BASE);

1074 }

1075 } else {

1076 result = -1;

1077 }

1078 return result;

1079 }

1080

1081 // GET IMPLICIT PRIMARY WEIGHTS

1082 // Return value is left justified primary key

1083 U_CAPI uint32_t U_EXPORT2

1084 uprv_uca_getImplicitFromRaw(UChar32 cp) {

1085 /*

1086 if (cp < 0 \|\| cp > UCOL_MAX_INPUT) {

1087 throw new IllegalArgumentException("Code point out of range " + Utility. hex(cp));

1088 }

1089 */

1090 int32_t last0 = cp - min4Boundary;

1091 if (last0 < 0) {

1092 int32_t last1 = cp / final3Count;

1093 last0 = cp % final3Count;

1094

1095 int32_t last2 = last1 / medialCount;

1096 last1 %= medialCount;

1097

1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start

1099 last1 = minTrail + last1; // offset

1100 last2 = min3Primary + last2; // offset

1101 /*

1102 if (last2 >= min4Primary) {

1103 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last2));

1104 }

1105 */

1106 return (last2 << 24) + (last1 << 16) + (last0 << 8);

1107 } else {

1108 int32_t last1 = last0 / final4Count;

1109 last0 %= final4Count;

1110

1111 int32_t last2 = last1 / medialCount;

1112 last1 %= medialCount;

1113

1114 int32_t last3 = last2 / medialCount;

1115 last2 %= medialCount;

1116

1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start

1118 last1 = minTrail + last1; // offset

1119 last2 = minTrail + last2; // offset

1120 last3 = min4Primary + last3; // offset

1121 /*

1122 if (last3 > max4Primary) {

1123 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last3));

1124 }

1125 */

1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;

1127 }

1128 }

1129

1130 static uint32_t U_EXPORT2

1131 uprv_uca_getImplicitPrimary(UChar32 cp) {

1132 //fprintf(stdout, "Incoming: %04x\n", cp);

1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));

1134

1135 cp = swapCJK(cp);

1136 cp++;

1137 // we now have a range of numbers from 0 to 21FFFF.

1138

1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));

1140 //fprintf(stdout, "CJK swapped: %04x\n", cp);

1141

1142 return uprv_uca_getImplicitFromRaw(cp);

1143 }

1144

1145 /**

1146 * Converts implicit CE into raw integer ("code point")

1147 * @param implicit

1148 * @return -1 if illegal format

1149 */

1150 U_CAPI UChar32 U_EXPORT2

1151 uprv_uca_getRawFromImplicit(uint32_t implicit) {

1152 UChar32 result;

1153 UChar32 b3 = implicit & 0xFF;

1154 UChar32 b2 = (implicit >> 8) & 0xFF;

1155 UChar32 b1 = (implicit >> 16) & 0xFF;

1156 UChar32 b0 = (implicit >> 24) & 0xFF;

1157

1158 // simple parameter checks

1159 if (b0 < min3Primary \|\| b0 > max4Primary

1160 \|\| b1 < minTrail \|\| b1 > maxTrail)

1161 return -1;

1162 // normal offsets

1163 b1 -= minTrail;

1164

1165 // take care of the final values, and compose

1166 if (b0 < min4Primary) {

1167 if (b2 < minTrail \|\| b2 > max3Trail \|\| b3 != 0)

1168 return -1;

1169 b2 -= minTrail;

1170 UChar32 remainder = b2 % final3Multiplier;

1171 if (remainder != 0)

1172 return -1;

1173 b0 -= min3Primary;

1174 b2 /= final3Multiplier;

1175 result = ((b0 * medialCount) + b1) * final3Count + b2;

1176 } else {

1177 if (b2 < minTrail \|\| b2 > maxTrail

1178 \|\| b3 < minTrail \|\| b3 > max4Trail)

1179 return -1;

1180 b2 -= minTrail;

1181 b3 -= minTrail;

1182 UChar32 remainder = b3 % final4Multiplier;

1183 if (remainder != 0)

1184 return -1;

1185 b3 /= final4Multiplier;

1186 b0 -= min4Primary;

1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;

1188 }

1189 // final check

1190 if (result < 0 \|\| result > UCOL_MAX_INPUT)

1191 return -1;

1192 return result;

1193 }

1194

1195

/*
 * Integer division rounded up, for positive a and b.
 * For a <= 0 the result is in general not the mathematical ceiling.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int fullStepsBelow = (a - 1) / b;
    return fullStepsBelow + 1;
}

1199

1200 /* this function is either called from initUCA or from genUCA before

1201 * doing canonical closure for the UCA.

1202 */

1203

1204 /**

1205 * Set up to generate implicits.

1206 * Maintenance Note: this function may end up being called more than once, due

1207 * to threading races during initialization. Make sure that

1208 * none of the Constants is ever transiently assigned an

1209 * incorrect value.

1210 * @param minPrimary

1211 * @param maxPrimary

1212 * @param minTrail final byte

1213 * @param maxTrail final byte

1214 * @param gap3 the gap we leave for tailoring for 3-byte forms

1215 * @param gap4 the gap we leave for tailoring for 4-byte forms

1216 */

1217 static void initImplicitConstants(int minPrimary, int maxPrimary,

1218 int minTrailIn, int maxTrailIn,

1219 int gap3, int primaries3count,

1220 UErrorCode *status) {

1221 // some simple parameter checks

1222 if ((minPrimary < 0 \|\| minPrimary >= maxPrimary \|\| maxPrimary > 0xFF)

1223 \|\| (minTrailIn < 0 \|\| minTrailIn >= maxTrailIn \|\| maxTrailIn > 0xFF)

1224 \|\| (primaries3count < 1))

1225 {

1226 *status = U_ILLEGAL_ARGUMENT_ERROR;

1227 return;

1228 };

1229

1230 minTrail = minTrailIn;

1231 maxTrail = maxTrailIn;

1232

1233 min3Primary = minPrimary;

1234 max4Primary = maxPrimary;

1235 // compute constants for use later.

1236 // number of values we can use in trailing bytes

1237 // leave room for empty values between AND above, e.g. if gap = 2

1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value

1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values

1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values

1241 final3Multiplier = gap3 + 1;

1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;

1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;

1244

1245 // medials can use full range

1246 medialCount = (maxTrail - minTrail + 1);

1247 // find out how many values fit in each form

1248 int32_t threeByteCount = medialCount * final3Count;

1249 // now determine where the 3/4 boundary is.

1250 // we use 3 bytes below the boundary, and 4 above

1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1;

1252 int32_t primaries4count = primariesAvailable - primaries3count;

1253

1254

1255 int32_t min3ByteCoverage = primaries3count * threeByteCount;

1256 min4Primary = minPrimary + primaries3count;

1257 min4Boundary = min3ByteCoverage;

1258 // Now expand out the multiplier for the 4 bytes, and redo.

1259

1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;

1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count );

1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo unt * medialCount);

1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;

1264 if (gap4 < 1) {

1265 *status = U_ILLEGAL_ARGUMENT_ERROR;

1266 return;

1267 }

1268 final4Multiplier = gap4 + 1;

1269 final4Count = neededPerFinalByte;

1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;

1271 }

1272

1273 /**

1274 * Supply parameters for generating implicit CEs

1275 */

1276 U_CAPI void U_EXPORT2

1277 uprv_uca_initImplicitConstants(UErrorCode *status) {

1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms .

1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);

1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);

1281 }

1282

1283

1284 /* collIterNormalize Incremental Normalization happens here. */

1285 /* pick up the range of chars identifed by FCD, */

1286 /* normalize it into the collIterate's writable buffer, */

1287 /* switch the collIterate's state to use the writable b uffer. */

1288 /* */

1289 static

1290 void collIterNormalize(collIterate *collationSource)

1291 {

1292 UErrorCode status = U_ZERO_ERROR;

1293 const UChar srcP = collationSource->pos - 1; / Start of chars to nor malize */

1294 const UChar endP = collationSource->fcdPosition; / End of region to norma lize+1 */

1295

1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),

1297 collationSource->writableBuffer,

1298 status);

1299 if (U_FAILURE(status)) {

1300 #ifdef UCOL_DEBUG

1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro rName(status));

1302 #endif

1303 return;

1304 }

1305

1306 collationSource->pos = collationSource->writableBuffer.getTerminatedB uffer();

1307 collationSource->origFlags = collationSource->flags;

1308 collationSource->flags \|= UCOL_ITER_INNORMBUF;

1309 collationSource->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN \| UCOL_USE _ITERATOR);

1310 }

1311

1312

1313 // This function takes the iterator and extracts normalized stuff up to the next boundary

1314 // It is similar in the end results to the collIterNormalize, but for the cases when we

1315 // use an iterator

1316 /*static

1317 inline void normalizeIterator(collIterate *collationSource) {

1318 UErrorCode status = U_ZERO_ERROR;

1319 UBool wasNormalized = FALSE;

1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite rator, UITER_CURRENT);

1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter ator);

1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa bleBuffer,

1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);

1324 if(status == U_BUFFER_OVERFLOW_ERROR \|\| normLen == (int32_t)collationSource->w ritableBufSize) {

1325 // reallocate and terminate

1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,

1327 &collationSource->writableBuffer,

1328 (int32_t *)&collationSource->writableBufSize, nor mLen + 1,

1329 0)

1330 ) {

1331 #ifdef UCOL_DEBUG

1332 fprintf(stderr, "normalizeIterator(), out of memory\n");

1333 #endif

1334 return;

1335 }

1336 status = U_ZERO_ERROR;

1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE R_ZERO);

1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s tatus);

1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf fer,

1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);

1341 }

1342 // Terminate the buffer - we already checked that it is big enough

1343 collationSource->writableBuffer[normLen] = 0;

1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {

1345 collationSource->flags \|= UCOL_ITER_ALLOCATED;

1346 }

1347 collationSource->pos = collationSource->writableBuffer;

1348 collationSource->origFlags = collationSource->flags;

1349 collationSource->flags \|= UCOL_ITER_INNORMBUF;

1350 collationSource->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN \| UCOL_USE_I TERATOR);

1351 }*/

1352

1353

1354 /* Incremental FCD check and normalize */

1355 /* Called from getNextCE when normalization state is suspect. */

1356 /* When entering, the state is known to be this: */

1357 /* o We are working in the main buffer of the collIterate, not the side */

1358 /* writable buffer. When in the side buffer, normalization mode is alw ays off, */

1359 /* so we won't get here. */

1360 /* o The leading combining class from the current character is 0 or */

1361 /* the trailing combining class of the previous char was zero. */

1362 /* True because the previous call to this function will have always exi ted */

1363 /* that way, and we get called for every char where cc might be non-zer o. */

1364 static

1365 inline UBool collIterFCD(collIterate *collationSource) {

1366 const UChar srcP, endP;

1367 uint8_t leadingCC;

1368 uint8_t prevTrailingCC = 0;

1369 uint16_t fcd;

1370 UBool needNormalize = FALSE;

1371

1372 srcP = collationSource->pos-1;

1373

1374 if (collationSource->flags & UCOL_ITER_HASLEN) {

1375 endP = collationSource->endp;

1376 } else {

1377 endP = NULL;

1378 }

1379

1380 // Get the trailing combining class of the current character. If it's zero, we are OK.

1381 fcd = g_nfcImpl->nextFCD16(srcP, endP);

1382 if (fcd != 0) {

1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

1384

1385 if (prevTrailingCC != 0) {

1386 // The current char has a non-zero trailing CC. Scan forward until we find

1387 // a char with a leading cc of zero.

1388 while (endP == NULL \|\| srcP != endP)

1389 {

1390 const UChar *savedSrcP = srcP;

1391

1392 fcd = g_nfcImpl->nextFCD16(srcP, endP);

1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);

1394 if (leadingCC == 0) {

1395 srcP = savedSrcP; // Hit char that is not part of combi ning sequence.

1396 // back up over it. (Could be surr ogate pair!)

1397 break;

1398 }

1399

1400 if (leadingCC < prevTrailingCC) {

1401 needNormalize = TRUE;

1402 }

1403

1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

1405 }

1406 }

1407 }

1408

1409 collationSource->fcdPosition = (UChar *)srcP;

1410

1411 return needNormalize;

1412 }

1413

1414 /****************************************************************************/

1415 /* Following are the CE retrieval functions */

1416 /* */

1417 /****************************************************************************/

1418

1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);

1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

1421

1422 /* there should be a macro version of this function in the header file */

1423 /* This is the first function that tries to fetch a collation element */

1424 /* If it's not succesfull or it encounters a more difficult situation */

1425 /* some more sofisticated and slower functions are invoked */

1426 static

1427 inline uint32_t ucol_IGetNextCE(const UCollator coll, collIterate collationSou rce, UErrorCode *status) {

1428 uint32_t order = 0;

1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */

1430 order = (collationSource->toReturn++); / if so , return them */

1431 if(collationSource->CEpos == collationSource->toReturn) {

1432 collationSource->CEpos = collationSource->toReturn = collationSource ->extendCEs ? collationSource->extendCEs : collationSource->CEs;

1433 }

1434 return order;

1435 }

1436

1437 UChar ch = 0;

1438 collationSource->offsetReturn = NULL;

1439

1440 do {

1441 for (;;) /* Loop handles case when incremental normalize switches */

1442 { /* to or from the side buffer / ori ginal string, and we */

1443 /* need to start again to get the next character. */

1444

1445 if ((collationSource->flags & (UCOL_ITER_HASLEN \| UCOL_ITER_INNORMBU F \| UCOL_ITER_NORM \| UCOL_HIRAGANA_Q \| UCOL_USE_ITERATOR)) == 0)

1446 {

1447 // The source string is null terminated and we're not working fr om the side buffer,

1448 // and we're not normalizing. This is the fast path.

1449 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)

1450 ch = *collationSource->pos++;

1451 if (ch != 0) {

1452 break;

1453 }

1454 else {

1455 return UCOL_NO_MORE_CES;

1456 }

1457 }

1458

1459 if (collationSource->flags & UCOL_ITER_HASLEN) {

1460 // Normal path for strings when length is specified.

1461 // (We can't be in side buffer because it is always null termi nated.)

1462 if (collationSource->pos >= collationSource->endp) {

1463 // Ran off of the end of the main source string. We're done .

1464 return UCOL_NO_MORE_CES;

1465 }

1466 ch = *collationSource->pos++;

1467 }

1468 else if(collationSource->flags & UCOL_USE_ITERATOR) {

1469 UChar32 iterCh = collationSource->iterator->next(collationSource ->iterator);

1470 if(iterCh == U_SENTINEL) {

1471 return UCOL_NO_MORE_CES;

1472 }

1473 ch = (UChar)iterCh;

1474 }

1475 else

1476 {

1477 // Null terminated string.

1478 ch = *collationSource->pos++;

1479 if (ch == 0) {

1480 // Ran off end of buffer.

1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {

1482 // Ran off end of main string. backing up one character.

1483 collationSource->pos--;

1484 return UCOL_NO_MORE_CES;

1485 }

1486 else

1487 {

1488 // Hit null in the normalize side buffer.

1489 // Usually this means the end of the normalized data,

1490 // except for one odd case: a null followed by combining chars,

1491 // which is the case if we are at the start of the buf fer.

1492 if (collationSource->pos == collationSource->writableBuf fer.getBuffer()+1) {

1493 break;

1494 }

1495

1496 // Null marked end of side buffer.

1497 // Revert to the main string and

1498 // loop back to top to try again to get a character.

1499 collationSource->pos = collationSource->fcdPosition;

1500 collationSource->flags = collationSource->origFlags;

1501 continue;

1502 }

1503 }

1504 }

1505

1506 if(collationSource->flags&UCOL_HIRAGANA_Q) {

1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag

1508 * based on whether the previous codepoint was Hiragana or Katak ana.

1509 */

1510 if(((ch>=0x3040 && ch<=0x3096) \|\| (ch >= 0x309d && ch <= 0x309f) ) \|\|

1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {

1512 collationSource->flags \|= UCOL_WAS_HIRAGANA;

1513 } else {

1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA;

1515 }

1516 }

1517

1518 // We've got a character. See if there's any fcd and/or normalizati on stuff to do.

1519 // Note that UCOL_ITER_NORM flag is always zero when we are in th e side buffer.

1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {

1521 break;

1522 }

1523

1524 if (collationSource->fcdPosition >= collationSource->pos) {

1525 // An earlier FCD check has already covered the current characte r.

1526 // We can go ahead and process this char.

1527 break;

1528 }

1529

1530 if (ch < ZERO_CC_LIMIT_ ) {

1531 // Fast fcd safe path. Trailing combining class == 0. This cha r is OK.

1532 break;

1533 }

1534

1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {

1536 // We need to peek at the next character in order to tell if we are FCD

1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSour ce->pos >= collationSource->endp) {

1538 // We are at the last char of source string.

1539 // It is always OK for FCD check.

1540 break;

1541 }

1542

1543 // Not at last char of source string (or we'll check against ter minating null). Do the FCD fast test

1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {

1545 break;

1546 }

1547 }

1548

1549

1550 // Need a more complete FCD check and possible normalization.

1551 if (collIterFCD(collationSource)) {

1552 collIterNormalize(collationSource);

1553 }

1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {

1555 // No normalization was needed. Go ahead and process the char we already had.

1556 break;

1557 }

1558

1559 // Some normalization happened. Next loop iteration will pick up a char

1560 // from the normalization buffer.

1561

1562 } // end for (;;)

1563

1564

1565 if (ch <= 0xFF) {

1566 /* For latin-1 characters we never need to fall back to the UCA tab le */

1567 /* because all of the UCA data is replicated in the latinOneMappi ng array */

1568 order = coll->latinOneMapping[ch];

1569 if (order > UCOL_NOT_FOUND) {

1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);

1571 }

1572 }

1573 else

1574 {

1575 // Always use UCA for Han, Hangul

1576 // (Han extension A is before main Han block)

1577 // ** Han compatibility chars ?? **

1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&

1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {

1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {

1581 // between the two target ranges; do normal lookup

1582 // ** this range is YI, Modifier tone letters, **

1583 // ** Latin-D, Syloti Nagari, Phagas-pa. **

1584 // ** Latin-D might be tailored, so we need to **

1585 // ** do the normal lookup for these guys. **

1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);

1587 } else {

1588 // in one of the target ranges; use UCA

1589 order = UCOL_NOT_FOUND;

1590 }

1591 } else {

1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);

1593 }

1594

1595 if(order > UCOL_NOT_FOUND) { / * if a CE is special */

1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */

1597 }

1598

1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a g ood CE in the tailoring */

1600 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */

1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

1602

1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE * /

1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collatio nSource, status);

1605 }

1606 }

1607 }

1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_L AST_HANGUL );

1609

1610 if(order == UCOL_NOT_FOUND) {

1611 order = getImplicit(ch, collationSource);

1612 }

1613 return order; /* return the CE */

1614 }

1615

1616 /* ucol_getNextCE, out-of-line version for use from other files. */

1617 U_CAPI uint32_t U_EXPORT2

1618 ucol_getNextCE(const UCollator coll, collIterate collationSource, UErrorCode * status) {

1619 return ucol_IGetNextCE(coll, collationSource, status);

1620 }

1621

1622

1623 /**

1624 * Incremental previous normalization happens here. Pick up the range of chars

1625 * identifed by FCD, normalize it into the collIterate's writable buffer,

1626 * switch the collIterate's state to use the writable buffer.

1627 * @param data collation iterator data

1628 */

1629 static

1630 void collPrevIterNormalize(collIterate *data)

1631 {

1632 UErrorCode status = U_ZERO_ERROR;

1633 const UChar pEnd = data->pos; / End normalize + 1 */

1634 const UChar *pStart;

1635

1636 /* Start normalize */

1637 if (data->fcdPosition == NULL) {

1638 pStart = data->string;

1639 }

1640 else {

1641 pStart = data->fcdPosition + 1;

1642 }

1643

1644 int32_t normLen =

1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta rt) + 1)),

1646 data->writableBuffer,

1647 status).

1648 length();

1649 if(U_FAILURE(status)) {

1650 return;

1651 }

1652 /*

1653 this puts the null termination infront of the normalized string instead

1654 of the end

1655 */

1656 data->writableBuffer.insert(0, (UChar)0);

1657

1658 /*

1659 * The usual case at this point is that we've got a base

1660 * character followed by marks that were normalized. If

1661 * fcdPosition is NULL, that means that we backed up to

1662 * the beginning of the string and there's no base character.

1663 *

1664 * Forward processing will usually normalize when it sees

1665 * the first mark, so that mark will get it's natural offset

1666 * and the rest will get the offset of the character following

1667 * the marks. The base character will also get its natural offset.

1668 *

1669 * We write the offset of the base character, if there is one,

1670 * followed by the offset of the first mark and then the offsets

1671 * of the rest of the marks.

1672 */

1673 int32_t firstMarkOffset = 0;

1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);

1675 int32_t trailCount = normLen - 1;

1676

1677 if (data->fcdPosition != NULL) {

1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);

1679 UChar baseChar = *data->fcdPosition;

1680

1681 firstMarkOffset = baseOffset + 1;

1682

1683 /*

1684 * If the base character is the start of a contraction, forward processi ng

1685 * will normalize the marks while checking for the contraction, which me ans

1686 * that the offset of the first mark will the same as the other marks.

1687 *

1688 * ** THIS IS PROBABLY NOT A COMPLETE TEST **

1689 */

1690 if (baseChar >= 0x100) {

1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas eChar);

1692

1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {

1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas eChar);

1695 }

1696

1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION _TAG) {

1698 firstMarkOffset = trailOffset;

1699 }

1700 }

1701

1702 data->appendOffset(baseOffset, status);

1703 }

1704

1705 data->appendOffset(firstMarkOffset, status);

1706

1707 for (int32_t i = 0; i < trailCount; i += 1) {

1708 data->appendOffset(trailOffset, status);

1709 }

1710

1711 data->offsetRepeatValue = trailOffset;

1712

1713 data->offsetReturn = data->offsetStore - 1;

1714 if (data->offsetReturn == data->offsetBuffer) {

1715 data->offsetStore = data->offsetBuffer;

1716 }

1717

1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;

1719 data->origFlags = data->flags;

1720 data->flags \|= UCOL_ITER_INNORMBUF;

1721 data->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN);

1722 }

1723

1724

1725 /**

1726 * Incremental FCD check for previous iteration and normalize. Called from

1727 * getPrevCE when normalization state is suspect.

1728 * When entering, the state is known to be this:

1729 * o We are working in the main buffer of the collIterate, not the side

1730 * writable buffer. When in the side buffer, normalization mode is always

1731 * off, so we won't get here.

1732 * o The leading combining class from the current character is 0 or the

1733 * trailing combining class of the previous char was zero.

1734 * True because the previous call to this function will have always exited

1735 * that way, and we get called for every char where cc might be non-zero.

1736 * @param data collation iterate struct

1737 * @return normalization status, TRUE for normalization to be done, FALSE

1738 * otherwise

1739 */

1740 static

1741 inline UBool collPrevIterFCD(collIterate *data)

1742 {

1743 const UChar src, start;

1744 uint8_t leadingCC;

1745 uint8_t trailingCC = 0;

1746 uint16_t fcd;

1747 UBool result = FALSE;

1748

1749 start = data->string;

1750 src = data->pos + 1;

1751

1752 /* Get the trailing combining class of the current character. */

1753 fcd = g_nfcImpl->previousFCD16(start, src);

1754

1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);

1756

1757 if (leadingCC != 0) {

1758 /*

1759 The current char has a non-zero leading combining class.

1760 Scan backward until we find a char with a trailing cc of zero.

1761 */

1762 for (;;)

1763 {

1764 if (start == src) {

1765 data->fcdPosition = NULL;

1766 return result;

1767 }

1768

1769 fcd = g_nfcImpl->previousFCD16(start, src);

1770

1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

1772

1773 if (trailingCC == 0) {

1774 break;

1775 }

1776

1777 if (leadingCC < trailingCC) {

1778 result = TRUE;

1779 }

1780

1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);

1782 }

1783 }

1784

1785 data->fcdPosition = (UChar *)src;

1786

1787 return result;

1788 }

1789

1790 /** gets a code unit from the string at a given offset

1791 * Handles both normal and iterative cases.

1792 * No error checking - caller beware!

1793 */

1794 static inline

1795 UChar peekCodeUnit(collIterate *source, int32_t offset) {

1796 if(source->pos != NULL) {

1797 return *(source->pos + offset);

1798 } else if(source->iterator != NULL) {

1799 UChar32 c;

1800 if(offset != 0) {

1801 source->iterator->move(source->iterator, offset, UITER_CURRENT);

1802 c = source->iterator->next(source->iterator);

1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);

1804 } else {

1805 c = source->iterator->current(source->iterator);

1806 }

1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.

1808 } else {

1809 return 0xfffd;

1810 }

1811 }

1812

1813 // Code point version. Treats the offset as a _code point_ delta.

1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for med UTF-16.

1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limi t of the buffer.

1816 static inline

1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) {

1818 UChar32 c;

1819 if(source->pos != NULL) {

1820 const UChar *p = source->pos;

1821 if(offset >= 0) {

1822 // Skip forward over (offset-1) code points.

1823 while(--offset >= 0) {

1824 if(U16_IS_LEAD(p++) && U16_IS_TRAIL(p)) {

1825 ++p;

1826 }

1827 }

1828 // Read the code point there.

1829 c = *p++;

1830 UChar trail;

1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {

1832 c = U16_GET_SUPPLEMENTARY(c, trail);

1833 }

1834 } else /* offset<0 */ {

1835 // Skip backward over (offset-1) code points.

1836 while(++offset < 0) {

1837 if(U16_IS_TRAIL(--p) && U16_IS_LEAD((p - 1))) {

1838 --p;

1839 }

1840 }

1841 // Read the code point before that.

1842 c = *--p;

1843 UChar lead;

1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {

1845 c = U16_GET_SUPPLEMENTARY(lead, c);

1846 }

1847 }

1848 } else if(source->iterator != NULL) {

1849 if(offset >= 0) {

1850 // Skip forward over (offset-1) code points.

1851 int32_t fwd = offset;

1852 while(fwd-- > 0) {

1853 uiter_next32(source->iterator);

1854 }

1855 // Read the code point there.

1856 c = uiter_current32(source->iterator);

1857 // Return to the starting point, skipping backward over (offset-1) c ode points.

1858 while(offset-- > 0) {

1859 uiter_previous32(source->iterator);

1860 }

1861 } else /* offset<0 */ {

1862 // Read backward, reading offset code points, remember only the last -read one.

1863 int32_t back = offset;

1864 do {

1865 c = uiter_previous32(source->iterator);

1866 } while(++back < 0);

1867 // Return to the starting position, skipping forward over offset cod e points.

1868 do {

1869 uiter_next32(source->iterator);

1870 } while(++offset < 0);

1871 }

1872 } else {

1873 c = U_SENTINEL;

1874 }

1875 return c;

1876 }

1877

1878 /**

1879 * Determines if we are at the start of the data string in the backwards

1880 * collation iterator

1881 * @param data collation iterator

1882 * @return TRUE if we are at the start

1883 */

1884 static

1885 inline UBool isAtStartPrevIterate(collIterate *data) {

1886 if(data->pos == NULL && data->iterator != NULL) {

1887 return !data->iterator->hasPrevious(data->iterator);

1888 }

1889 //return (collIter_bos(data)) \|\|

1890 return (data->pos == data->string) \|\|

1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&

1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL);

1893 }

1894

1895 static

1896 inline void goBackOne(collIterate *data) {

1897 # if 0

1898 // somehow, it looks like we need to keep iterator synced up

1899 // at all times, as above.

1900 if(data->pos) {

1901 data->pos--;

1902 }

1903 if(data->iterator) {

1904 data->iterator->previous(data->iterator);

1905 }

1906 #endif

1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {

1908 data->iterator->previous(data->iterator);

1909 }

1910 if(data->pos) {

1911 data->pos --;

1912 }

1913 }

1914

1915 /**

1916 * Inline function that gets a simple CE.

1917 * So what it does is that it will first check the expansion buffer. If the

1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer

1919 * is different from the string pointer, we return the collation element at the

1920 * return pointer and decrement it.

1921 * For more complicated CEs it resorts to getComplicatedCE.

1922 * @param coll collator data

1923 * @param data collation iterator struct

1924 * @param status error status

1925 */

1926 static

1927 inline uint32_t ucol_IGetPrevCE(const UCollator coll, collIterate data,

1928 UErrorCode *status)

1929 {

1930 uint32_t result = (uint32_t)UCOL_NULLORDER;

1931

1932 if (data->offsetReturn != NULL) {

1933 if (data->offsetRepeatCount > 0) {

1934 data->offsetRepeatCount -= 1;

1935 } else {

1936 if (data->offsetReturn == data->offsetBuffer) {

1937 data->offsetReturn = NULL;

1938 data->offsetStore = data->offsetBuffer;

1939 } else {

1940 data->offsetReturn -= 1;

1941 }

1942 }

1943 }

1944

1945 if ((data->extendCEs && data->toReturn > data->extendCEs) \|\|

1946 (!data->extendCEs && data->toReturn > data->CEs))

1947 {

1948 data->toReturn -= 1;

1949 result = *(data->toReturn);

1950 if (data->CEs == data->toReturn \|\| data->extendCEs == data->toReturn) {

1951 data->CEpos = data->toReturn;

1952 }

1953 }

1954 else {

1955 UChar ch = 0;

1956

1957 do {

1958 /*

1959 Loop handles case when incremental normalize switches to or from the

1960 side buffer / original string, and we need to start again to get the

1961 next character.

1962 */

1963 for (;;) {

1964 if (data->flags & UCOL_ITER_HASLEN) {

1965 /*

1966 Normal path for strings when length is specified.

1967 Not in side buffer because it is always null terminated.

1968 */

1969 if (data->pos <= data->string) {

1970 /* End of the main source string */

1971 return UCOL_NO_MORE_CES;

1972 }

1973 data->pos --;

1974 ch = *data->pos;

1975 }

1976 // we are using an iterator to go back. Pray for us!

1977 else if (data->flags & UCOL_USE_ITERATOR) {

1978 UChar32 iterCh = data->iterator->previous(data->iterator);

1979 if(iterCh == U_SENTINEL) {

1980 return UCOL_NO_MORE_CES;

1981 } else {

1982 ch = (UChar)iterCh;

1983 }

1984 }

1985 else {

1986 data->pos --;

1987 ch = *data->pos;

1988 /* we are in the side buffer. */

1989 if (ch == 0) {

1990 /*

1991 At the start of the normalize side buffer.

1992 Go back to string.

1993 Because pointer points to the last accessed character,

1994 hence we have to increment it by one here.

1995 */

1996 data->flags = data->origFlags;

1997 data->offsetRepeatValue = 0;

1998

1999 if (data->fcdPosition == NULL) {

2000 data->pos = data->string;

2001 return UCOL_NO_MORE_CES;

2002 }

2003 else {

2004 data->pos = data->fcdPosition + 1;

2005 }

2006

2007 continue;

2008 }

2009 }

2010

2011 if(data->flags&UCOL_HIRAGANA_Q) {

2012 if(ch>=0x3040 && ch<=0x309f) {

2013 data->flags \|= UCOL_WAS_HIRAGANA;

2014 } else {

2015 data->flags &= ~UCOL_WAS_HIRAGANA;

2016 }

2017 }

2018

2019 /*

2020 * got a character to determine if there's fcd and/or normalizati on

2021 * stuff to do.

2022 * if the current character is not fcd.

2023 * if current character is at the start of the string

2024 * Trailing combining class == 0.

2025 * Note if pos is in the writablebuffer, norm is always 0

2026 */

2027 if (ch < ZERO_CC_LIMIT_ \|\|

2028 // this should propel us out of the loop in the iterator case

2029 (data->flags & UCOL_ITER_NORM) == 0 \|\|

2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos )

2031 \|\| data->string == data->pos) {

2032 break;

2033 }

2034

2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {

2036 /* if next character is FCD */

2037 if (data->pos == data->string) {

2038 /* First char of string is always OK for FCD check */

2039 break;

2040 }

2041

2042 /* Not first char of string, do the FCD fast test */

2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {

2044 break;

2045 }

2046 }

2047

2048 /* Need a more complete FCD check and possible normalization. */

2049 if (collPrevIterFCD(data)) {

2050 collPrevIterNormalize(data);

2051 }

2052

2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {

2054 /* No normalization. Go ahead and process the char. */

2055 break;

2056 }

2057

2058 /*

2059 Some normalization happened.

2060 Next loop picks up a char from the normalization buffer.

2061 */

2062 }

2063

2064 /* attempt to handle contractions, after removal of the backwards

2065 contraction

2066 */

2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {

2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, d ata, status);

2069 } else {

2070 if (ch <= 0xFF) {

2071 result = coll->latinOneMapping[ch];

2072 }

2073 else {

2074 // Always use UCA for [3400..9FFF], [AC00..D7AF]

2075 // ** [FA0E..FA2F] ?? **

2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&

2077 (ch >= 0x3400 && ch <= 0xD7AF)) {

2078 if (ch > 0x9FFF && ch < 0xAC00) {

2079 // between the two target ranges; do normal lookup

2080 // ** this range is YI, Modifier tone letters, * *

2081 // ** Latin-D, Syloti Nagari, Phagas-pa. * *

2082 // ** Latin-D might be tailored, so we need to * *

2083 // ** do the normal lookup for these guys. * *

2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);

2085 } else {

2086 result = UCOL_NOT_FOUND;

2087 }

2088 } else {

2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);

2090 }

2091 }

2092 if (result > UCOL_NOT_FOUND) {

2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, s tatus);

2094 }

2095 if (result == UCOL_NOT_FOUND) { // Not found in master list

2096 if (!isAtStartPrevIterate(data) &&

2097 ucol_contractionEndCP(ch, data->coll))

2098 {

2099 result = UCOL_CONTRACTION;

2100 } else {

2101 if(coll->UCA) {

2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

2103 }

2104 }

2105

2106 if (result > UCOL_NOT_FOUND) {

2107 if(coll->UCA) {

2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, re sult, data, status);

2109 }

2110 }

2111 }

2112 }

2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= U COL_LAST_HANGUL );

2114

2115 if(result == UCOL_NOT_FOUND) {

2116 result = getPrevImplicit(ch, data);

2117 }

2118 }

2119

2120 return result;

2121 }

2122

2123

2124 /* ucol_getPrevCE, out-of-line version for use from other files. */

2125 U_CFUNC uint32_t U_EXPORT2

2126 ucol_getPrevCE(const UCollator coll, collIterate data,

2127 UErrorCode *status) {

2128 return ucol_IGetPrevCE(coll, data, status);

2129 }

2130

2131

2132 /* this should be connected to special Jamo handling */

2133 U_CFUNC uint32_t U_EXPORT2

2134 ucol_getFirstCE(const UCollator coll, UChar u, UErrorCode status) {

2135 collIterate colIt;

2136 IInit_collIterate(coll, &u, 1, &colIt, status);

2137 if(U_FAILURE(*status)) {

2138 return 0;

2139 }

2140 return ucol_IGetNextCE(coll, &colIt, status);

2141 }

2142

2143 /**

2144 * Inserts the argument character into the end of the buffer pushing back the

2145 * null terminator.

2146 * @param data collIterate struct data

2147 * @param ch character to be appended

2148 * @return the position of the new addition

2149 */

2150 static

2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)

2152 {

2153 int32_t oldLength = data->writableBuffer.length();

2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;

2155 }

2156

2157 /**

2158 * Inserts the argument string into the end of the buffer pushing back the

2159 * null terminator.

2160 * @param data collIterate struct data

2161 * @param string to be appended

2162 * @param length of the string to be appended

2163 * @return the position of the new addition

2164 */

2165 static

2166 inline const UChar * insertBufferEnd(collIterate data, const UChar str, int32_ t length)

2167 {

2168 int32_t oldLength = data->writableBuffer.length();

2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL ength;

2170 }

2171

2172 /**

2173 * Special normalization function for contraction in the forwards iterator.

2174 * This normalization sequence will place the current character at source->pos

2175 * and its following normalized sequence into the buffer.

2176 * The fcd position, pos will be changed.

2177 * pos will now point to positions in the buffer.

2178 * Flags will be changed accordingly.

2179 * @param data collation iterator data

2180 */

2181 static

2182 inline void normalizeNextContraction(collIterate *data)

2183 {

2184 int32_t strsize;

2185 UErrorCode status = U_ZERO_ERROR;

2186 /* because the pointer points to the next character */

2187 const UChar *pStart = data->pos - 1;

2188 const UChar *pEnd;

2189

2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {

2191 data->writableBuffer.setTo(*(pStart - 1));

2192 strsize = 1;

2193 }

2194 else {

2195 strsize = data->writableBuffer.length();

2196 }

2197

2198 pEnd = data->fcdPosition;

2199

2200 data->writableBuffer.append(

2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)), status));

2202 if(U_FAILURE(status)) {

2203 return;

2204 }

2205

2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;

2207 data->origFlags = data->flags;

2208 data->flags \|= UCOL_ITER_INNORMBUF;

2209 data->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN);

2210 }

2211

2212 /**

2213 * Contraction character management function that returns the next character

2214 * for the forwards iterator.

2215 * Does nothing if the next character is in buffer and not the first character

2216 * in it.

2217 * Else it checks next character in data string to see if it is normalizable.

2218 * If it is not, the character is simply copied into the buffer, else

2219 * the whole normalized substring is copied into the buffer, including the

2220 * current character.

2221 * @param data collation element iterator data

2222 * @return next character

2223 */

2224 static

2225 inline UChar getNextNormalizedChar(collIterate *data)

2226 {

2227 UChar nextch;

2228 UChar ch;

2229 // Here we need to add the iterator code. One problem is the way

2230 // end of string is handled. If we just return next char, it could

2231 // be the sentinel. Most of the cases already check for this, but we

2232 // need to be sure.

2233 if ((data->flags & (UCOL_ITER_NORM \| UCOL_ITER_INNORMBUF)) == 0 ) {

2234 /* if no normalization and not in buffer. */

2235 if(data->flags & UCOL_USE_ITERATOR) {

2236 return (UChar)data->iterator->next(data->iterator);

2237 } else {

2238 return *(data->pos ++);

2239 }

2240 }

2241

2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {

2243 //normalizeIterator(data);

2244 //}

2245

2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);

2247 if ((innormbuf && *data->pos != 0) \|\|

2248 (data->fcdPosition != NULL && !innormbuf &&

2249 data->pos < data->fcdPosition)) {

2250 /*

2251 if next character is in normalized buffer, no further normalization

2252 is required

2253 */

2254 return *(data->pos ++);

2255 }

2256

2257 if (data->flags & UCOL_ITER_HASLEN) {

2258 /* in data string */

2259 if (data->pos + 1 == data->endp) {

2260 return *(data->pos ++);

2261 }

2262 if (data->pos >= data->endp) {

2263 return (UChar) -1; // return U+FFFF (non-char) to indicate an error

2264 }

2265 }

2266 else {

2267 if (innormbuf) {

2268 // inside the normalization buffer, but at the end

2269 // (since we encountered zero). This means, in the

2270 // case we're using char iterator, that we need to

2271 // do another round of normalization.

2272 //if(data->origFlags & UCOL_USE_ITERATOR) {

2273 // we need to restore original flags,

2274 // otherwise, we'll lose them

2275 //data->flags = data->origFlags;

2276 //normalizeIterator(data);

2277 //return *(data->pos++);

2278 //} else {

2279 /*

2280 in writable buffer, at this point fcdPosition can not be

2281 pointing to the end of the data string. see contracting tag.

2282 */

2283 if(data->fcdPosition) {

2284 if (*(data->fcdPosition + 1) == 0 \|\|

2285 data->fcdPosition + 1 == data->endp) {

2286 /* at the end of the string, dump it into the normalizer */

2287 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;

2288 // Check if data->pos received a null pointer

2289 if (data->pos == NULL) {

2290 return (UChar)-1; // Return to indicate error.

2291 }

2292 return *(data->fcdPosition ++);

2293 }

2294 data->pos = data->fcdPosition;

2295 } else if(data->origFlags & UCOL_USE_ITERATOR) {

2296 // if we are here, we're using a normalizing iterator.

2297 // we should just continue further.

2298 data->flags = data->origFlags;

2299 data->pos = NULL;

2300 return (UChar)data->iterator->next(data->iterator);

2301 }

2302 //}

2303 }

2304 else {

2305 if (*(data->pos + 1) == 0) {

2306 return *(data->pos ++);

2307 }

2308 }

2309 }

2310

2311 ch = *data->pos ++;

2312 nextch = *data->pos;

2313

2314 /*

2315 * if the current character is not fcd.

2316 * Trailing combining class == 0.

2317 */

2318 if ((data->fcdPosition == NULL \|\| data->fcdPosition < data->pos) &&

2319 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ \|\|

2320 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {

2321 /*

2322 Need a more complete FCD check and possible normalization.

2323 normalize substring will be appended to buffer

2324 */

2325 if (collIterFCD(data)) {

2326 normalizeNextContraction(data);

2327 return *(data->pos ++);

2328 }

2329 else if (innormbuf) {

2330 /* fcdposition shifted even when there's no normalization, if we

2331 don't input the rest into this, we'll get the wrong position when

2332 we reach the end of the writableBuffer */

2333 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);

2334 data->pos = insertBufferEnd(data, data->pos - 1, length);

2335 // Check if data->pos received a null pointer

2336 if (data->pos == NULL) {

2337 return (UChar)-1; // Return to indicate error.

2338 }

2339 return *(data->pos ++);

2340 }

2341 }

2342

2343 if (innormbuf) {

2344 /*

2345 no normalization is to be done hence only one character will be

2346 appended to the buffer.

2347 */

2348 data->pos = insertBufferEnd(data, ch) + 1;

2349 // Check if data->pos received a null pointer

2350 if (data->pos == NULL) {

2351 return (UChar)-1; // Return to indicate error.

2352 }

2353 }

2354

2355 /* points back to the pos in string */

2356 return ch;

2357 }

2358

2359

2360

2361 /**

2362 * Function to copy the buffer into writableBuffer and sets the fcd position to

2363 * the correct position

2364 * @param source data string source

2365 * @param buffer character buffer

2366 */

2367 static

2368 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b uffer)

2369 {

2370 /* okay confusing part here. to ensure that the skipped characters are

2371 considered later, we need to place it in the appropriate position in the

2372 normalization buffer and reassign the pos pointer. simple case if pos

2373 reside in string, simply copy to normalization buffer and

2374 fcdposition = pos, pos = start of normalization buffer. if pos in

2375 normalization buffer, we'll insert the copy infront of pos and point pos

2376 to the start of the normalization buffer. why am i doing these copies?

2377 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial CE does

2378 not require any changes, which be really painful. */

2379 if (source->flags & UCOL_ITER_INNORMBUF) {

2380 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer() ;

2381 source->writableBuffer.replace(0, replaceLength, buffer);

2382 }

2383 else {

2384 source->fcdPosition = source->pos;

2385 source->origFlags = source->flags;

2386 source->flags \|= UCOL_ITER_INNORMBUF;

2387 source->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN \| UCOL_USE_IT ERATOR);

2388 source->writableBuffer = buffer;

2389 }

2390

2391 source->pos = source->writableBuffer.getTerminatedBuffer();

2392 }

2393

2394 /**

2395 * Function to get the discontiguos collation element within the source.

2396 * Note this function will set the position to the appropriate places.

2397 * @param coll current collator used

2398 * @param source data string source

2399 * @param constart index to the start character in the contraction table

2400 * @return discontiguos collation element offset

2401 */

2402 static

2403 uint32_t getDiscontiguous(const UCollator coll, collIterate source,

2404 const UChar *constart)

2405 {

2406 /* source->pos currently points to the second combining character after

2407 the start character */

2408 const UChar *temppos = source->pos;

2409 UnicodeString buffer;

2410 const UChar *tempconstart = constart;

2411 uint8_t tempflags = source->flags;

2412 UBool multicontraction = FALSE;

2413 collIterateState discState;

2414

2415 backupState(source, &discState);

2416

2417 buffer.setTo(peekCodePoint(source, -1));

2418 for (;;) {

2419 UChar *UCharOffset;

2420 UChar schar,

2421 tchar;

2422 uint32_t result;

2423

2424 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)

2425 \|\| (peekCodeUnit(source, 0) == 0 &&

2426 //\|\| (*source->pos == 0 &&

2427 ((source->flags & UCOL_ITER_INNORMBUF) == 0 \|\|

2428 source->fcdPosition == NULL \|\|

2429 source->fcdPosition == source->endp \|\|

2430 *(source->fcdPosition) == 0 \|\|

2431 u_getCombiningClass(*(source->fcdPosition)) == 0)) \|\|

2432 /* end of string in null terminated string or stopped by a

2433 null character, note fcd does not always point to a base

2434 character after the discontiguos change */

2435 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {

2436 //u_getCombiningClass(*(source->pos)) == 0) {

2437 //constart = (UChar *)coll->image + getContractOffset(CE);

2438 if (multicontraction) {

2439 source->pos = temppos - 1;

2440 setDiscontiguosAttribute(source, buffer);

2441 return *(coll->contractionCEs +

2442 (tempconstart - coll->contractionIndex));

2443 }

2444 constart = tempconstart;

2445 break;

2446 }

2447

2448 UCharOffset = (UChar )(tempconstart + 1); / skip the backward offset*/

2449 schar = getNextNormalizedChar(source);

2450

2451 while (schar > (tchar = *UCharOffset)) {

2452 UCharOffset++;

2453 }

2454

2455 if (schar != tchar) {

2456 /* not the correct codepoint. we stuff the current codepoint into

2457 the discontiguos buffer and try the next character */

2458 buffer.append(schar);

2459 continue;

2460 }

2461 else {

2462 if (u_getCombiningClass(schar) ==

2463 u_getCombiningClass(peekCodePoint(source, -2))) {

2464 buffer.append(schar);

2465 continue;

2466 }

2467 result = *(coll->contractionCEs +

2468 (UCharOffset - coll->contractionIndex));

2469 }

2470

2471 if (result == UCOL_NOT_FOUND) {

2472 break;

2473 } else if (isContraction(result)) {

2474 /* this is a multi-contraction*/

2475 tempconstart = (UChar *)coll->image + getContractOffset(result);

2476 if (*(coll->contractionCEs + (constart - coll->contractionIndex))

2477 != UCOL_NOT_FOUND) {

2478 multicontraction = TRUE;

2479 temppos = source->pos + 1;

2480 }

2481 } else {

2482 setDiscontiguosAttribute(source, buffer);

2483 return result;

2484 }

2485 }

2486

2487 /* no problems simply reverting just like that,

2488 if we are in string before getting into this function, points back to

2489 string hence no problem.

2490 if we are in normalization buffer before getting into this function,

2491 since we'll never use another normalization within this function, we

2492 know that fcdposition points to a base character. the normalization buffer

2493 never change, hence this revert works. */

2494 loadState(source, &discState, TRUE);

2495 goBackOne(source);

2496

2497 //source->pos = temppos - 1;

2498 source->flags = tempflags;

2499 return *(coll->contractionCEs + (constart - coll->contractionIndex));

2500 }

2501

2502 /* now uses Mark's getImplicitPrimary code */

2503 static

2504 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {

2505 uint32_t r = uprv_uca_getImplicitPrimary(cp);

2506 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) \| 0x000000C0;

2507 collationSource->offsetRepeatCount += 1;

2508 return (r & UCOL_PRIMARYMASK) \| 0x00000505; // This was 'order'

2509 }

2510

2511 /**

2512 * Inserts the argument character into the front of the buffer replacing the

2513 * front null terminator.

2514 * @param data collation element iterator data

2515 * @param ch character to be appended

2516 */

2517 static

2518 inline void insertBufferFront(collIterate *data, UChar ch)

2519 {

2520 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer minatedBuffer() + 2;

2521 }

2522

2523 /**

2524 * Special normalization function for contraction in the previous iterator.

2525 * This normalization sequence will place the current character at source->pos

2526 * and its following normalized sequence into the buffer.

2527 * The fcd position, pos will be changed.

2528 * pos will now point to positions in the buffer.

2529 * Flags will be changed accordingly.

2530 * @param data collation iterator data

2531 */

2532 static

2533 inline void normalizePrevContraction(collIterate data, UErrorCode status)

2534 {

2535 const UChar pEnd = data->pos + 1; / End normalize + 1 */

2536 const UChar *pStart;

2537

2538 UnicodeString endOfBuffer;

2539 if (data->flags & UCOL_ITER_HASLEN) {

2540 /*

2541 normalization buffer not used yet, we'll pull down the next

2542 character into the end of the buffer

2543 */

2544 endOfBuffer.setTo(*pEnd);

2545 }

2546 else {

2547 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL

2548 }

2549

2550 if (data->fcdPosition == NULL) {

2551 pStart = data->string;

2552 }

2553 else {

2554 pStart = data->fcdPosition + 1;

2555 }

2556 int32_t normLen =

2557 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)),

2558 data->writableBuffer,

2559 *status).

2560 length();

2561 if(U_FAILURE(*status)) {

2562 return;

2563 }

2564 /*

2565 this puts the null termination infront of the normalized string instead

2566 of the end

2567 */

2568 data->pos =

2569 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat edBuffer() +

2570 1 + normLen;

2571 data->origFlags = data->flags;

2572 data->flags \|= UCOL_ITER_INNORMBUF;

2573 data->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN);

2574 }

2575

2576 /**

2577 * Contraction character management function that returns the previous character

2578 * for the backwards iterator.

2579 * Does nothing if the previous character is in buffer and not the first

2580 * character in it.

2581 * Else it checks previous character in data string to see if it is

2582 * normalizable.

2583 * If it is not, the character is simply copied into the buffer, else

2584 * the whole normalized substring is copied into the buffer, including the

2585 * current character.

2586 * @param data collation element iterator data

2587 * @return previous character

2588 */

2589 static

2590 inline UChar getPrevNormalizedChar(collIterate data, UErrorCode status)

2591 {

2592 UChar prevch;

2593 UChar ch;

2594 const UChar *start;

2595 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);

2596 if ((data->flags & (UCOL_ITER_NORM \| UCOL_ITER_INNORMBUF)) == 0 \|\|

2597 (innormbuf && *(data->pos - 1) != 0)) {

2598 /*

2599 if no normalization.

2600 if previous character is in normalized buffer, no further normalization

2601 is required

2602 */

2603 if(data->flags & UCOL_USE_ITERATOR) {

2604 data->iterator->move(data->iterator, -1, UITER_CURRENT);

2605 return (UChar)data->iterator->next(data->iterator);

2606 } else {

2607 return *(data->pos - 1);

2608 }

2609 }

2610

2611 start = data->pos;

2612 if ((data->fcdPosition==NULL)\|\|(data->flags & UCOL_ITER_HASLEN)) {

2613 /* in data string */

2614 if ((start - 1) == data->string) {

2615 return *(start - 1);

2616 }

2617 start --;

2618 ch = *start;

2619 prevch = *(start - 1);

2620 }

2621 else {

2622 /*

2623 in writable buffer, at this point fcdPosition can not be NULL.

2624 see contracting tag.

2625 */

2626 if (data->fcdPosition == data->string) {

2627 /* at the start of the string, just dump it into the normalizer */

2628 insertBufferFront(data, *(data->fcdPosition));

2629 data->fcdPosition = NULL;

2630 return *(data->pos - 1);

2631 }

2632 start = data->fcdPosition;

2633 ch = *start;

2634 prevch = *(start - 1);

2635 }

2636 /*

2637 * if the current character is not fcd.

2638 * Trailing combining class == 0.

2639 */

2640 if (data->fcdPosition > start &&

2641 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ \|\| prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))

2642 {

2643 /*

2644 Need a more complete FCD check and possible normalization.

2645 normalize substring will be appended to buffer

2646 */

2647 const UChar *backuppos = data->pos;

2648 data->pos = start;

2649 if (collPrevIterFCD(data)) {

2650 normalizePrevContraction(data, status);

2651 return *(data->pos - 1);

2652 }

2653 data->pos = backuppos;

2654 data->fcdPosition ++;

2655 }

2656

2657 if (innormbuf) {

2658 /*

2659 no normalization is to be done hence only one character will be

2660 appended to the buffer.

2661 */

2662 insertBufferFront(data, ch);

2663 data->fcdPosition --;

2664 }

2665

2666 return ch;

2667 }

2668

2669 /* This function handles the special CEs like contractions, expansions, surrogat es, Thai */

2670 /* It is called by getNextCE */

2671

2672 /* The following should be even */

2673 #define UCOL_MAX_DIGITS_FOR_NUMBER 254

2674

2675 uint32_t ucol_prv_getSpecialCE(const UCollator coll, UChar ch, uint32_t CE, col lIterate source, UErrorCode *status) {

2676 collIterateState entryState;

2677 backupState(source, &entryState);

2678 UChar32 cp = ch;

2679

2680 for (;;) {

2681 // This loop will repeat only in the case of contractions, and only when a contraction

2682 // is found and the first CE resulting from that contraction is itself a special

2683 // (an expansion, for example.) All other special CE types are fully handled the

2684 // first time through, and the loop exits.

2685

2686 const uint32_t *CEOffset = NULL;

2687 switch(getCETag(CE)) {

2688 case NOT_FOUND_TAG:

2689 /* This one is not found, and we'll let somebody else bother about i t... no more games */

2690 return CE;

2691 case SPEC_PROC_TAG:

2692 {

2693 // Special processing is getting a CE that is preceded by a cert ain prefix

2694 // Currently this is only needed for optimizing Japanese length and iteration marks.

2695 // When we encouter a special processing tag, we go backwards an d try to see if

2696 // we have a match.

2697 // Contraction tables are used - so the whole process is not unl ike contraction.

2698 // prefix data is stored backwards in the table.

2699 const UChar *UCharOffset;

2700 UChar schar, tchar;

2701 collIterateState prefixState;

2702 backupState(source, &prefixState);

2703 loadState(source, &entryState, TRUE);

2704 goBackOne(source); // We want to look at the point where we ente red - actually one

2705 // before that...

2706

2707 for(;;) {

2708 // This loop will run once per source string character, for as long as we

2709 // are matching a potential contraction sequence

2710

2711 // First we position ourselves at the begining of contractio n sequence

2712 const UChar ContractionStart = UCharOffset = (UChar )coll- >image+getContractOffset(CE);

2713 if (collIter_bos(source)) {

2714 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));

2715 break;

2716 }

2717 schar = getPrevNormalizedChar(source, status);

2718 goBackOne(source);

2719

2720 while(schar > (tchar = UCharOffset)) { / since the contrac tion codepoints should be ordered, we skip all that are smaller */

2721 UCharOffset++;

2722 }

2723

2724 if (schar == tchar) {

2725 // Found the source string char in the table.

2726 // Pick up the corresponding CE from the table.

2727 CE = *(coll->contractionCEs +

2728 (UCharOffset - coll->contractionIndex));

2729 }

2730 else

2731 {

2732 // Source string char was not in the table.

2733 // We have not found the prefix.

2734 CE = *(coll->contractionCEs +

2735 (ContractionStart - coll->contractionIndex));

2736 }

2737

2738 if(!isPrefix(CE)) {

2739 // The source string char was in the contraction table, and the corresponding

2740 // CE is not a prefix CE. We found the prefix, break

2741 // out of loop, this CE will end up being returned. T his is the normal

2742 // way out of prefix handling when the source actually contained

2743 // the prefix.

2744 break;

2745 }

2746 }

2747 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri lly continue

2748 loadState(source, &prefixState, TRUE);

2749 if(source->origFlags & UCOL_USE_ITERATOR) {

2750 source->flags = source->origFlags;

2751 }

2752 } else { // prefix search was a failure, we have to backup all t he way to the start

2753 loadState(source, &entryState, TRUE);

2754 }

2755 break;

2756 }

2757 case CONTRACTION_TAG:

2758 {

2759 /* This should handle contractions */

2760 collIterateState state;

2761 backupState(source, &state);

2762 uint32_t firstCE = (coll->contractionCEs + ((UChar )coll->imag e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;

2763 const UChar *UCharOffset;

2764 UChar schar, tchar;

2765

2766 for (;;) {

2767 /* This loop will run once per source string character, for as long as we */

2768 /* are matching a potential contraction sequence */

2769

2770 /* First we position ourselves at the begining of contractio n sequence */

2771 const UChar ContractionStart = UCharOffset = (UChar )coll- >image+getContractOffset(CE);

2772

2773 if (collIter_eos(source)) {

2774 // Ran off the end of the source string.

2775 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));

2776 // So we'll pick whatever we have at the point...

2777 if (CE == UCOL_NOT_FOUND) {

2778 // back up the source over all the chars we scanned going into this contraction.

2779 CE = firstCE;

2780 loadState(source, &state, TRUE);

2781 if(source->origFlags & UCOL_USE_ITERATOR) {

2782 source->flags = source->origFlags;

2783 }

2784 }

2785 break;

2786 }

2787

2788 uint8_t maxCC = (uint8_t)((UCharOffset)&0xFF); /get the di scontiguos stuff / / skip the backward offset, see above */

2789 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);

2790

2791 schar = getNextNormalizedChar(source);

2792 while(schar > (tchar = UCharOffset)) { / since the contrac tion codepoints should be ordered, we skip all that are smaller */

2793 UCharOffset++;

2794 }

2795

2796 if (schar == tchar) {

2797 // Found the source string char in the contraction table .

2798 // Pick up the corresponding CE from the table.

2799 CE = *(coll->contractionCEs +

2800 (UCharOffset - coll->contractionIndex));

2801 }

2802 else

2803 {

2804 // Source string char was not in contraction table.

2805 // Unless we have a discontiguous contraction, we have finished

2806 // with this contraction.

2807 // in order to do the proper detection, we

2808 // need to see if we're dealing with a supplementary

2809 /* We test whether the next two char are surrogate pairs .

2810 * This test is done if the iterator is not NULL.

2811 * If there is no surrogate pair, the iterator

2812 * goes back one if needed. */

2813 UChar32 miss = schar;

2814 if (source->iterator) {

2815 UChar32 surrNextChar; /* the next char in the iterat ion to test */

2816 int32_t prevPos; /* holds the previous position befo re move forward of the source iterator */

2817 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s ource->iterator)) {

2818 prevPos = source->iterator->index;

2819 surrNextChar = getNextNormalizedChar(source);

2820 if (U16_IS_TRAIL(surrNextChar)) {

2821 miss = U16_GET_SUPPLEMENTARY(schar, surrNext Char);

2822 } else if (prevPos < source->iterator->index){

2823 goBackOne(source);

2824 }

2825 }

2826 } else if (U16_IS_LEAD(schar) && source->pos + 1 < sourc e->endp) {

2827 const UChar* prevPos = source->pos;

2828 UChar nextChar = getNextNormalizedChar(source);

2829 if (U16_IS_TRAIL(nextChar)) {

2830 miss = U16_GET_SUPPLEMENTARY(schar, nextChar);

2831 } else if (prevPos < source->pos) {

2832 goBackOne(source);

2833 }

2834 }

2835

2836 uint8_t sCC;

2837 if (miss < 0x300 \|\|

2838 maxCC == 0 \|\|

2839 (sCC = i_getCombiningClass(miss, coll)) == 0 \|\|

2840 sCC>maxCC \|\|

2841 (allSame != 0 && sCC == maxCC) \|\|

2842 collIter_eos(source))

2843 {

2844 // Contraction can not be discontiguous.

2845 goBackOne(source); // back up the source string by one,

2846 // because the character we just looked at was

2847 // not part of the contraction. */

2848 if(U_IS_SUPPLEMENTARY(miss)) {

2849 goBackOne(source);

2850 }

2851 CE = *(coll->contractionCEs +

2852 (ContractionStart - coll->contractionIndex));

2853 } else {

2854 //

2855 // Contraction is possibly discontiguous.

2856 // Scan more of source string looking for a match

2857 //

2858 UChar tempchar;

2859 /* find the next character if schar is not a base ch aracter

2860 and we are not yet at the end of the string */

2861 tempchar = getNextNormalizedChar(source);

2862 // probably need another supplementary thingie here

2863 goBackOne(source);

2864 if (i_getCombiningClass(tempchar, coll) == 0) {

2865 goBackOne(source);

2866 if(U_IS_SUPPLEMENTARY(miss)) {

2867 goBackOne(source);

2868 }

2869 /* Spit out the last char of the string, wasn't tasty enough */

2870 CE = *(coll->contractionCEs +

2871 (ContractionStart - coll->contractionIndex)) ;

2872 } else {

2873 CE = getDiscontiguous(coll, source, ContractionS tart);

2874 }

2875 }

2876 } // else after if(schar == tchar)

2877

2878 if(CE == UCOL_NOT_FOUND) {

2879 /* The Source string did not match the contraction that we were checking. */

2880 /* Back up the source position to undo the effects of h aving partially */

2881 /* scanned through what ultimately proved to not be a contraction. */

2882 loadState(source, &state, TRUE);

2883 CE = firstCE;

2884 break;

2885 }

2886

2887 if(!isContraction(CE)) {

2888 // The source string char was in the contraction table, and the corresponding

2889 // CE is not a contraction CE. We completed the contr action, break

2890 // out of loop, this CE will end up being returned. T his is the normal

2891 // way out of contraction handling when the source act ually contained

2892 // the contraction.

2893 break;

2894 }

2895

2896

2897 // The source string char was in the contraction table, and the corresponding

2898 // CE is IS a contraction CE. We will continue looping t o check the source

2899 // string for the remaining chars in the contraction.

2900 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));

2901 if(tempCE != UCOL_NOT_FOUND) {

2902 // We have scanned a a section of source string for whic h there is a

2903 // CE from the contraction table. Remember the CE and scan position, so

2904 // that we can return to this point if further scanning fails to

2905 // match a longer contraction sequence.

2906 firstCE = tempCE;

2907

2908 goBackOne(source);

2909 backupState(source, &state);

2910 getNextNormalizedChar(source);

2911

2912 // Another way to do this is:

2913 //collIterateState tempState;

2914 //backupState(source, &tempState);

2915 //goBackOne(source);

2916 //backupState(source, &state);

2917 //loadState(source, &tempState, TRUE);

2918

2919 // The problem is that for incomplete contractions we ha ve to remember the previous

2920 // position. Before, the only thing I needed to do was s tate.pos--;

2921 // After iterator introduction and especially after intr oduction of normalizing

2922 // iterators, it became much more difficult to decrease the saved state.

2923 // I'm not yet sure which of the two methods above is fa ster.

2924 }

2925 } // for(;;)

2926 break;

2927 } // case CONTRACTION_TAG:

2928 case LONG_PRIMARY_TAG:

2929 {

2930 *(source->CEpos++) = ((CE & 0xFF)<<24)\|UCOL_CONTINUATION_MARKER;

2931 CE = ((CE & 0xFFFF00) << 8) \| (UCOL_BYTE_COMMON << 8) \| UCOL_BYT E_COMMON;

2932 source->offsetRepeatCount += 1;

2933 return CE;

2934 }

2935 case EXPANSION_TAG:

2936 {

2937 /* This should handle expansion. */

2938 /* NOTE: we can encounter both continuations and expansions in a n expansion! */

2939 /* I have to decide where continuations are going to be dealt wi th */

2940 uint32_t size;

2941 uint32_t i; /* general counter */

2942

2943 CEOffset = (uint32_t )coll->image+getExpansionOffset(CE); / fi nd the offset to expansion table */

2944 size = getExpansionCount(CE);

2945 CE = *CEOffset++;

2946 //source->offsetRepeatCount = -1;

2947

2948 if(size != 0) { /* if there are less than 16 elements in expansi on, we don't terminate */

2949 for(i = 1; i<size; i++) {

2950 (source->CEpos++) = CEOffset++;

2951 source->offsetRepeatCount += 1;

2952 }

2953 } else { /* else, we do */

2954 while(*CEOffset != 0) {

2955 (source->CEpos++) = CEOffset++;

2956 source->offsetRepeatCount += 1;

2957 }

2958 }

2959

2960 return CE;

2961 }

2962 case DIGIT_TAG:

2963 {

2964 /*

2965 We do a check to see if we want to collate digits as numbers; if so we generate

2966 a custom collation key. Otherwise we pull out the value stored i n the expansion table.

2967 */

2968 //uint32_t size;

2969 uint32_t i; /* general counter */

2970

2971 if (source->coll->numericCollation == UCOL_ON){

2972 collIterateState digitState = {0,0,0,0,0,0,0,0,0};

2973 UChar32 char32 = 0;

2974 int32_t digVal = 0;

2975

2976 uint32_t digIndx = 0;

2977 uint32_t endIndex = 0;

2978 uint32_t trailingZeroIndex = 0;

2979

2980 uint8_t collateVal = 0;

2981

2982 UBool nonZeroValReached = FALSE;

2983

2984 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j ust need a temporary place to store my generated CEs.

2985 /*

2986 We parse the source string until we hit a char that's N OT a digit.

2987 Use this u_charDigitValue. This might be slow because we have to

2988 handle surrogates...

2989 */

2990 /*

2991 if (U16_IS_LEAD(ch)){

2992 if (!collIter_eos(source)) {

2993 backupState(source, &digitState);

2994 UChar trail = getNextNormalizedChar(source);

2995 if(U16_IS_TRAIL(trail)) {

2996 char32 = U16_GET_SUPPLEMENTARY(ch, trail);

2997 } else {

2998 loadState(source, &digitState, TRUE);

2999 char32 = ch;

3000 }

3001 } else {

3002 char32 = ch;

3003 }

3004 } else {

3005 char32 = ch;

3006 }

3007 digVal = u_charDigitValue(char32);

3008 */

3009 digVal = u_charDigitValue(cp); // if we have arrived here, w e have

3010 // already processed possible supplementaries that trigered the digit tag -

3011 // all supplementaries are marked in the UCA.

3012 /*

3013 We pad a zero in front of the first element anyways. Th is takes

3014 care of the (probably) most common case where people are sorting things followed

3015 by a single digit

3016 */

3017 digIndx++;

3018 for(;;){

3019 // Make sure we have enough space. No longer needed;

3020 // at this point digIndx now has a max value of UCOL_MAX _DIGITS_FOR_NUMBER

3021 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough

3022 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).

3023

3024 // Skipping over leading zeroes.

3025 if (digVal != 0) {

3026 nonZeroValReached = TRUE;

3027 }

3028 if (nonZeroValReached) {

3029 /*

3030 We parse the digit string into base 100 numbers (thi s fits into a byte).

3031 We only add to the buffer in twos, thus if we are pa rsing an odd character,

3032 that serves as the 'tens' digit while the if we are parsing an even one, that

3033 is the 'ones' digit. We dumped the parsed base 100 v alue (collateVal) into

3034 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid

3035 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less

3036 than all the other bytes.

3037 */

3038

3039 if (digIndx % 2 == 1){

3040 collateVal += (uint8_t)digVal;

3041

3042 // We don't enter the low-order-digit case unles s we've already seen

3043 // the high order, or for the first digit, which is always non-zero.

3044 if (collateVal != 0)

3045 trailingZeroIndex = 0;

3046

3047 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;

3048 collateVal = 0;

3049 }

3050 else{

3051 // We drop the collation value into the buffer s o if we need to do

3052 // a "front patch" we don't have to check to see if we're hitting the

3053 // last element.

3054 collateVal = (uint8_t)(digVal * 10);

3055

3056 // Check for trailing zeroes.

3057 if (collateVal == 0)

3058 {

3059 if (!trailingZeroIndex)

3060 trailingZeroIndex = (digIndx/2) + 2;

3061 }

3062 else

3063 trailingZeroIndex = 0;

3064

3065 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;

3066 }

3067 digIndx++;

3068 }

3069

3070 // Get next character.

3071 if (!collIter_eos(source)){

3072 ch = getNextNormalizedChar(source);

3073 if (U16_IS_LEAD(ch)){

3074 if (!collIter_eos(source)) {

3075 backupState(source, &digitState);

3076 UChar trail = getNextNormalizedChar(source);

3077 if(U16_IS_TRAIL(trail)) {

3078 char32 = U16_GET_SUPPLEMENTARY(ch, trail );

3079 } else {

3080 loadState(source, &digitState, TRUE);

3081 char32 = ch;

3082 }

3083 }

3084 } else {

3085 char32 = ch;

3086 }

3087

3088 if ((digVal = u_charDigitValue(char32)) == -1 \|\| dig Indx > UCOL_MAX_DIGITS_FOR_NUMBER){

3089 // Resetting position to point to the next unpro cessed char. We

3090 // overshot it when doing our test/set for numbe rs.

3091 if (char32 > 0xFFFF) { // For surrogates.

3092 loadState(source, &digitState, TRUE);

3093 //goBackOne(source);

3094 }

3095 goBackOne(source);

3096 break;

3097 }

3098 } else {

3099 break;

3100 }

3101 }

3102

3103 if (nonZeroValReached == FALSE){

3104 digIndx = 2;

3105 numTempBuf[2] = 6;

3106 }

3107

3108 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx /2) + 2) ;

3109 if (digIndx % 2 != 0){

3110 /*

3111 We missed a value. Since digIndx isn't even, stuck too m any values into the buffer (this is what

3112 we get for padding the first byte with a zero). "Front-p atch" now by pushing all nybbles forward.

3113 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a

3114 single pass and optimizes for strings with single digits . I'm just assuming that's the more common case.

3115 */

3116

3117 for(i = 2; i < endIndex; i++){

3118 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +

3119 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;

3120 }

3121 --digIndx;

3122 }

3123

3124 // Subtract one off of the last byte.

3125 numTempBuf[endIndex-1] -= 1;

3126

3127 /*

3128 We want to skip over the first two slots in the buffer. The first slot

3129 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the

3130 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.

3131 */

3132 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;

3133 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));

3134

3135 // Now transfer the collation key to our collIterate struct.

3136 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.

3137 //size = ((endIndex+1) & ~1)/2;

3138 CE = (((numTempBuf[0] << 8) \| numTempBuf[1]) << UCOL_PRIMARY ORDERSHIFT) \| //Primary weight

3139 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) \| // Seco ndary weight

3140 UCOL_BYTE_COMMON; // Tertiary weight.

3141 i = 2; // Reset the index into the buffer.

3142 while(i < endIndex)

3143 {

3144 uint32_t primWeight = numTempBuf[i++] << 8;

3145 if ( i < endIndex)

3146 primWeight \|= numTempBuf[i++];

3147 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) \| UCOL_CONTINUATION_MARKER;

3148 }

3149

3150 } else {

3151 // no numeric mode, we'll just switch to whatever we stashed and continue

3152 CEOffset = (uint32_t )coll->image+getExpansionOffset(CE); / find the offset to expansion table */

3153 CE = *CEOffset++;

3154 break;

3155 }

3156 return CE;

3157 }

3158 /* various implicits optimization */

3159 case IMPLICIT_TAG: /* everything that is not defined otherwise */

3160 /* UCA is filled with these. Tailorings are NOT_FOUND */

3161 return getImplicit(cp, source);

3162 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */

3163 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl icit

3164 return getImplicit(cp, source);

3165 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/

3166 {

3167 static const uint32_t

3168 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;

3169 //const uint32_t LCount = 19;

3170 static const uint32_t VCount = 21;

3171 static const uint32_t TCount = 28;

3172 //const uint32_t NCount = VCount * TCount; // 588

3173 //const uint32_t SCount = LCount * NCount; // 11172

3174 uint32_t L = ch - SBase;

3175

3176 // divide into pieces

3177

3178 uint32_t T = L % TCount; // we do it in this order since some co mpilers can do % and / in one operation

3179 L /= TCount;

3180 uint32_t V = L % VCount;

3181 L /= VCount;

3182

3183 // offset them

3184

3185 L += LBase;

3186 V += VBase;

3187 T += TBase;

3188

3189 // return the first CE, but first put the rest into the expansio n buffer

3190 if (!source->coll->image->jamoSpecial) { // FAST PATH

3191

3192 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );

3193 if (T != TBase) {

3194 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);

3195 }

3196

3197 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);

3198

3199 } else { // Jamo is Special

3200 // Since Hanguls pass the FCD check, it is

3201 // guaranteed that we won't be in

3202 // the normalization buffer if something like this happens

3203

3204 // However, if we are using a uchar iterator and normalizati on

3205 // is ON, the Hangul that lead us here is going to be in tha t

3206 // normalization buffer. Here we want to restore the uchar

3207 // iterator state and pull out of the normalization buffer

3208 if(source->iterator != NULL && source->flags & UCOL_ITER_INN ORMBUF) {

3209 source->flags = source->origFlags; // restore the iterat or

3210 source->pos = NULL;

3211 }

3212

3213 // Move Jamos into normalization buffer

3214 UChar *buffer = source->writableBuffer.getBuffer(4);

3215 int32_t bufferLength;

3216 buffer[0] = (UChar)L;

3217 buffer[1] = (UChar)V;

3218 if (T != TBase) {

3219 buffer[2] = (UChar)T;

3220 bufferLength = 3;

3221 } else {

3222 bufferLength = 2;

3223 }

3224 source->writableBuffer.releaseBuffer(bufferLength);

3225

3226 // Indicate where to continue in main input string after exh austing the writableBuffer

3227 source->fcdPosition = source->pos;

3228

3229 source->pos = source->writableBuffer.getTerminatedBuffer() ;

3230 source->origFlags = source->flags;

3231 source->flags \|= UCOL_ITER_INNORMBUF;

3232 source->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HASLEN);

3233

3234 return(UCOL_IGNORABLE);

3235 }

3236 }

3237 case SURROGATE_TAG:

3238 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */

3239 /* two things can happen here: next code point can be a trailing sur rogate - we will use it */

3240 /* to retrieve the CE, or it is not a trailing surrogate (or the str ing is done). In that case */

3241 /* we treat it like an unassigned code point. */

3242 {

3243 UChar trail;

3244 collIterateState state;

3245 backupState(source, &state);

3246 if (collIter_eos(source) \|\| !(U16_IS_TRAIL((trail = getNextNorma lizedChar(source))))) {

3247 // we chould have stepped one char forward and it might have turned that it

3248 // was not a trail surrogate. In that case, we have to backu p.

3249 loadState(source, &state, TRUE);

3250 return UCOL_NOT_FOUND;

3251 } else {

3252 /* TODO: CE contain the data from the previous CE + the mask . It should at least be unmasked */

3253 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF FF, trail);

3254 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.

3255 // We need to backup

3256 loadState(source, &state, TRUE);

3257 return CE;

3258 }

3259 // calculate the supplementary code point value, if surrogat e was not tailored

3260 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U L)+0xdc00-0x10000));

3261 }

3262 }

3263 break;

3264 case LEAD_SURROGATE_TAG: /* D800-DBFF*/

3265 UChar nextChar;

3266 if( source->flags & UCOL_USE_ITERATOR) {

3267 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source ->iterator))) {

3268 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);

3269 source->iterator->next(source->iterator);

3270 return getImplicit(cp, source);

3271 }

3272 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) \|\| (source->po s<source->endp)) &&

3273 U_IS_TRAIL((nextChar=*source->pos))) {

3274 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);

3275 source->pos++;

3276 return getImplicit(cp, source);

3277 }

3278 return UCOL_NOT_FOUND;

3279 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/

3280 return UCOL_NOT_FOUND; /* broken surrogate sequence */

3281 case CHARSET_TAG:

3282 /* not yet implemented */

3283 /* probably after 1.8 */

3284 return UCOL_NOT_FOUND;

3285 default:

3286 *status = U_INTERNAL_PROGRAM_ERROR;

3287 CE=0;

3288 break;

3289 }

3290 if (CE <= UCOL_NOT_FOUND) break;

3291 }

3292 return CE;

3293 }

3294

3295

3296 /* now uses Mark's getImplicitPrimary code */

3297 static

3298 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {

3299 uint32_t r = uprv_uca_getImplicitPrimary(cp);

3300

3301 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) \| 0x00000505;

3302 collationSource->toReturn = collationSource->CEpos;

3303

3304 // ** doesn't work if using iterator **

3305 if (collationSource->flags & UCOL_ITER_INNORMBUF) {

3306 collationSource->offsetRepeatCount = 1;

3307 } else {

3308 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource-> string);

3309

3310 UErrorCode errorCode = U_ZERO_ERROR;

3311 collationSource->appendOffset(firstOffset, errorCode);

3312 collationSource->appendOffset(firstOffset + 1, errorCode);

3313

3314 collationSource->offsetReturn = collationSource->offsetStore - 1;

3315 *(collationSource->offsetBuffer) = firstOffset;

3316 if (collationSource->offsetReturn == collationSource->offsetBuffer) {

3317 collationSource->offsetStore = collationSource->offsetBuffer;

3318 }

3319 }

3320

3321 return ((r & 0x0000FFFF)<<16) \| 0x000000C0;

3322 }

3323

3324 /**

3325 * This function handles the special CEs like contractions, expansions,

3326 * surrogates, Thai.

3327 * It is called by both getPrevCE

3328 */

3329 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,

3330 collIterate *source,

3331 UErrorCode *status)

3332 {

3333 const uint32_t *CEOffset = NULL;

3334 UChar *UCharOffset = NULL;

3335 UChar schar;

3336 const UChar *constart = NULL;

3337 uint32_t size;

3338 UChar buffer[UCOL_MAX_BUFFER];

3339 uint32_t *endCEBuffer;

3340 UChar *strbuffer;

3341 int32_t noChars = 0;

3342 int32_t CECount = 0;

3343

3344 for(;;)

3345 {

3346 /* the only ces that loops are thai and contractions */

3347 switch (getCETag(CE))

3348 {

3349 case NOT_FOUND_TAG: /* this tag always returns */

3350 return CE;

3351

3352 case SPEC_PROC_TAG:

3353 {

3354 // Special processing is getting a CE that is preceded by a cert ain prefix

3355 // Currently this is only needed for optimizing Japanese length and iteration marks.

3356 // When we encouter a special processing tag, we go backwards an d try to see if

3357 // we have a match.

3358 // Contraction tables are used - so the whole process is not unl ike contraction.

3359 // prefix data is stored backwards in the table.

3360 const UChar *UCharOffset;

3361 UChar schar, tchar;

3362 collIterateState prefixState;

3363 backupState(source, &prefixState);

3364 for(;;) {

3365 // This loop will run once per source string character, for as long as we

3366 // are matching a potential contraction sequence

3367

3368 // First we position ourselves at the begining of contractio n sequence

3369 const UChar ContractionStart = UCharOffset = (UChar )coll- >image+getContractOffset(CE);

3370

3371 if (collIter_bos(source)) {

3372 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));

3373 break;

3374 }

3375 schar = getPrevNormalizedChar(source, status);

3376 goBackOne(source);

3377

3378 while(schar > (tchar = UCharOffset)) { / since the contrac tion codepoints should be ordered, we skip all that are smaller */

3379 UCharOffset++;

3380 }

3381

3382 if (schar == tchar) {

3383 // Found the source string char in the table.

3384 // Pick up the corresponding CE from the table.

3385 CE = *(coll->contractionCEs +

3386 (UCharOffset - coll->contractionIndex));

3387 }

3388 else

3389 {

3390 // if there is a completely ignorable code point in the middle of

3391 // a prefix, we need to act as if it's not there

3392 // assumption: 'real' noncharacters (fffe, ffff, fdd0- fdef are set to zero)

3393 // lone surrogates cannot be set to zero as it would bre ak other processing

3394 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping , schar);

3395 // it's easy for BMP code points

3396 if(isZeroCE == 0) {

3397 continue;

3398 } else if(U16_IS_SURROGATE(schar)) {

3399 // for supplementary code points, we have to check t he next one

3400 // situations where we are going to ignore

3401 // 1. beginning of the string: schar is a lone surro gate

3402 // 2. schar is a lone surrogate

3403 // 3. schar is a trail surrogate in a valid surrogat e sequence

3404 // that is explicitly set to zero.

3405 if (!collIter_bos(source)) {

3406 UChar lead;

3407 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD( lead = getPrevNormalizedChar(source, status))) {

3408 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp ing, lead);

3409 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {

3410 uint32_t finalCE = UTRIE_GET32_FROM_OFFS ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);

3411 if(finalCE == 0) {

3412 // this is a real, assigned complete ly ignorable code point

3413 goBackOne(source);

3414 continue;

3415 }

3416 }

3417 } else {

3418 // lone surrogate, treat like unassigned

3419 return UCOL_NOT_FOUND;

3420 }

3421 } else {

3422 // lone surrogate at the beggining, treat like u nassigned

3423 return UCOL_NOT_FOUND;

3424 }

3425 }

3426 // Source string char was not in the table.

3427 // We have not found the prefix.

3428 CE = *(coll->contractionCEs +

3429 (ContractionStart - coll->contractionIndex));

3430 }

3431

3432 if(!isPrefix(CE)) {

3433 // The source string char was in the contraction table, and the corresponding

3434 // CE is not a prefix CE. We found the prefix, break

3435 // out of loop, this CE will end up being returned. T his is the normal

3436 // way out of prefix handling when the source actually contained

3437 // the prefix.

3438 break;

3439 }

3440 }

3441 loadState(source, &prefixState, TRUE);

3442 break;

3443 }

3444

3445 case CONTRACTION_TAG: {

3446 /* to ensure that the backwards and forwards iteration matches, we

3447 take the current region of most possible match and pass it through

3448 the forward iteration. this will ensure that the obstinate problem o f

3449 overlapping contractions will not occur.

3450 */

3451 schar = peekCodeUnit(source, 0);

3452 constart = (UChar *)coll->image + getContractOffset(CE);

3453 if (isAtStartPrevIterate(source)

3454 /* commented away contraction end checks after adding the checks

3455 in getPrevCE */) {

3456 /* start of string or this is not the end of any contraction */

3457 CE = *(coll->contractionCEs +

3458 (constart - coll->contractionIndex));

3459 break;

3460 }

3461 strbuffer = buffer;

3462 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);

3463 *(UCharOffset --) = 0;

3464 noChars = 0;

3465 // have to swap thai characters

3466 while (ucol_unsafeCP(schar, coll)) {

3467 *(UCharOffset) = schar;

3468 noChars++;

3469 UCharOffset --;

3470 schar = getPrevNormalizedChar(source, status);

3471 goBackOne(source);

3472 // TODO: when we exhaust the contraction buffer,

3473 // it needs to get reallocated. The problem is

3474 // that the size depends on the string which is

3475 // not iterated over. However, since we're travelling

3476 // backwards, we already had to set the iterator at

3477 // the end - so we might as well know where we are?

3478 if (UCharOffset + 1 == buffer) {

3479 /* we have exhausted the buffer */

3480 int32_t newsize = 0;

3481 if(source->pos) { // actually dealing with a position

3482 newsize = (int32_t)(source->pos - source->string + 1);

3483 } else { // iterator

3484 newsize = 4 * UCOL_MAX_BUFFER;

3485 }

3486 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *

3487 (newsize + UCOL_MAX_BUFFER));

3488 /* test for NULL */

3489 if (strbuffer == NULL) {

3490 *status = U_MEMORY_ALLOCATION_ERROR;

3491 return UCOL_NO_MORE_CES;

3492 }

3493 UCharOffset = strbuffer + newsize;

3494 uprv_memcpy(UCharOffset, buffer,

3495 UCOL_MAX_BUFFER * sizeof(UChar));

3496 UCharOffset --;

3497 }

3498 if ((source->pos && (source->pos == source->string \|\|

3499 ((source->flags & UCOL_ITER_INNORMBUF) &&

3500 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))

3501 \|\| (source->iterator && !source->iterator->hasPrevious(sourc e->iterator))) {

3502 break;

3503 }

3504 }

3505 /* adds the initial base character to the string */

3506 *(UCharOffset) = schar;

3507 noChars++;

3508

3509 int32_t offsetBias;

3510

3511 // ** doesn't work if using iterator **

3512 if (source->flags & UCOL_ITER_INNORMBUF) {

3513 offsetBias = -1;

3514 } else {

3515 offsetBias = (int32_t)(source->pos - source->string);

3516 }

3517

3518 /* a new collIterate is used to simplify things, since using the current

3519 collIterate will mean that the forward and backwards iteration will

3520 share and change the same buffers. we don't want to get into that. */

3521 collIterate temp;

3522 int32_t rawOffset;

3523

3524 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);

3525 if(U_FAILURE(*status)) {

3526 return (uint32_t)UCOL_NULLORDER;

3527 }

3528 temp.flags &= ~UCOL_ITER_NORM;

3529 temp.flags \|= source->flags & UCOL_FORCE_HAN_IMPLICIT;

3530

3531 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z ero?

3532 CE = ucol_IGetNextCE(coll, &temp, status);

3533

3534 if (source->extendCEs) {

3535 endCEBuffer = source->extendCEs + source->extendCEsSize;

3536 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u int32_t));

3537 } else {

3538 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;

3539 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_ t));

3540 }

3541

3542 while (CE != UCOL_NO_MORE_CES) {

3543 *(source->CEpos ++) = CE;

3544

3545 if (offsetBias >= 0) {

3546 source->appendOffset(rawOffset + offsetBias, *status);

3547 }

3548

3549 CECount++;

3550 if (source->CEpos == endCEBuffer) {

3551 /* ran out of CE space, reallocate to new buffer.

3552 If reallocation fails, reset pointers and bail out,

3553 there's no guarantee of the right character position after

3554 this bail*/

3555 if (!increaseCEsCapacity(source)) {

3556 *status = U_MEMORY_ALLOCATION_ERROR;

3557 break;

3558 }

3559

3560 endCEBuffer = source->extendCEs + source->extendCEsSize;

3561 }

3562

3563 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {

3564 rawOffset = (int32_t)(temp.fcdPosition - temp.string);

3565 } else {

3566 rawOffset = (int32_t)(temp.pos - temp.string);

3567 }

3568

3569 CE = ucol_IGetNextCE(coll, &temp, status);

3570 }

3571

3572 if (strbuffer != buffer) {

3573 uprv_free(strbuffer);

3574 }

3575 if (U_FAILURE(*status)) {

3576 return (uint32_t)UCOL_NULLORDER;

3577 }

3578

3579 if (source->offsetRepeatValue != 0) {

3580 if (CECount > noChars) {

3581 source->offsetRepeatCount += temp.offsetRepeatCount;

3582 } else {

3583 // ** does this really skip the right offsets? **

3584 source->offsetReturn -= (noChars - CECount);

3585 }

3586 }

3587

3588 if (offsetBias >= 0) {

3589 source->offsetReturn = source->offsetStore - 1;

3590 if (source->offsetReturn == source->offsetBuffer) {

3591 source->offsetStore = source->offsetBuffer;

3592 }

3593 }

3594

3595 source->toReturn = source->CEpos - 1;

3596 if (source->toReturn == source->CEs) {

3597 source->CEpos = source->CEs;

3598 }

3599

3600 return *(source->toReturn);

3601 }

3602 case LONG_PRIMARY_TAG:

3603 {

3604 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) \| (UCOL_BYTE_COMMON << 8) \| UCOL_BYTE_COMMON;

3605 *(source->CEpos++) = ((CE & 0xFF)<<24)\|UCOL_CONTINUATION_MARKER;

3606 source->toReturn = source->CEpos - 1;

3607

3608 if (source->flags & UCOL_ITER_INNORMBUF) {

3609 source->offsetRepeatCount = 1;

3610 } else {

3611 int32_t firstOffset = (int32_t)(source->pos - source->string );

3612

3613 source->appendOffset(firstOffset, *status);

3614 source->appendOffset(firstOffset + 1, *status);

3615

3616 source->offsetReturn = source->offsetStore - 1;

3617 *(source->offsetBuffer) = firstOffset;

3618 if (source->offsetReturn == source->offsetBuffer) {

3619 source->offsetStore = source->offsetBuffer;

3620 }

3621 }

3622

3623

3624 return *(source->toReturn);

3625 }

3626

3627 case EXPANSION_TAG: /* this tag always returns */

3628 {

3629 /*

3630 This should handle expansion.

3631 NOTE: we can encounter both continuations and expansions in an expansion!

3632 I have to decide where continuations are going to be dealt with

3633 */

3634 int32_t firstOffset = (int32_t)(source->pos - source->string);

3635

3636 // ** doesn't work if using iterator **

3637 if (source->offsetReturn != NULL) {

3638 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet urn == source->offsetBuffer) {

3639 source->offsetStore = source->offsetBuffer;

3640 }else {

3641 firstOffset = -1;

3642 }

3643 }

3644

3645 /* find the offset to expansion table */

3646 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);

3647 size = getExpansionCount(CE);

3648 if (size != 0) {

3649 /*

3650 if there are less than 16 elements in expansion, we don't terminate

3651 */

3652 uint32_t count;

3653

3654 for (count = 0; count < size; count++) {

3655 *(source->CEpos ++) = *CEOffset++;

3656

3657 if (firstOffset >= 0) {

3658 source->appendOffset(firstOffset + 1, *status);

3659 }

3660 }

3661 } else {

3662 /* else, we do */

3663 while (*CEOffset != 0) {

3664 *(source->CEpos ++) = *CEOffset ++;

3665

3666 if (firstOffset >= 0) {

3667 source->appendOffset(firstOffset + 1, *status);

3668 }

3669 }

3670 }

3671

3672 if (firstOffset >= 0) {

3673 source->offsetReturn = source->offsetStore - 1;

3674 *(source->offsetBuffer) = firstOffset;

3675 if (source->offsetReturn == source->offsetBuffer) {

3676 source->offsetStore = source->offsetBuffer;

3677 }

3678 } else {

3679 source->offsetRepeatCount += size - 1;

3680 }

3681

3682 source->toReturn = source->CEpos - 1;

3683 // in case of one element expansion, we

3684 // want to immediately return CEpos

3685 if(source->toReturn == source->CEs) {

3686 source->CEpos = source->CEs;

3687 }

3688

3689 return *(source->toReturn);

3690 }

3691

3692 case DIGIT_TAG:

3693 {

3694 /*

3695 We do a check to see if we want to collate digits as numbers; if so we generate

3696 a custom collation key. Otherwise we pull out the value stored i n the expansion table.

3697 */

3698 uint32_t i; /* general counter */

3699

3700 if (source->coll->numericCollation == UCOL_ON){

3701 uint32_t digIndx = 0;

3702 uint32_t endIndex = 0;

3703 uint32_t leadingZeroIndex = 0;

3704 uint32_t trailingZeroCount = 0;

3705

3706 uint8_t collateVal = 0;

3707

3708 UBool nonZeroValReached = FALSE;

3709

3710 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j ust need a temporary place to store my generated CEs.

3711 /*

3712 We parse the source string until we hit a char that's NOT a digit.

3713 Use this u_charDigitValue. This might be slow because we hav e to

3714 handle surrogates...

3715 */

3716 /*

3717 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,

3718 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation

3719 element we process when going backward. To determine how long that chunk might be, we may need to make

3720 two passes through the loop that collects digits - one to see how long the string is (and how much is

3721 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has

3722 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation

3723 element chunk after resetting the state to the initialState at the right side of the digit string.

3724 */

3725 uint32_t ceLimit = 0;

3726 UChar initial_ch = ch;

3727 collIterateState initialState = {0,0,0,0,0,0,0,0,0};

3728 backupState(source, &initialState);

3729

3730 for(;;) {

3731 collIterateState state = {0,0,0,0,0,0,0,0,0};

3732 UChar32 char32 = 0;

3733 int32_t digVal = 0;

3734

3735 if (U16_IS_TRAIL (ch)) {

3736 if (!collIter_bos(source)){

3737 UChar lead = getPrevNormalizedChar(source, statu s);

3738 if(U16_IS_LEAD(lead)) {

3739 char32 = U16_GET_SUPPLEMENTARY(lead,ch);

3740 goBackOne(source);

3741 } else {

3742 char32 = ch;

3743 }

3744 } else {

3745 char32 = ch;

3746 }

3747 } else {

3748 char32 = ch;

3749 }

3750 digVal = u_charDigitValue(char32);

3751

3752 for(;;) {

3753 // Make sure we have enough space. No longer needed;

3754 // at this point the largest value of digIndx when w e need to save data in numTempBuf

3755 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post- incremented) so we just ensure

3756 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO R_NUMBER/2 + 2).

3757

3758 // Skip over trailing zeroes, and keep a count of th em.

3759 if (digVal != 0)

3760 nonZeroValReached = TRUE;

3761

3762 if (nonZeroValReached) {

3763 /*

3764 We parse the digit string into base 100 numbers (this fits into a byte).

3765 We only add to the buffer in twos, thus if we are parsing an odd character,

3766 that serves as the 'tens' digit while the if we are parsing an even one, that

3767 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into

3768 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid

3769 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less

3770 than all the other bytes.

3771

3772 Since we're doing in this reverse we want to put the first digit encountered into the

3773 ones place and the second digit encountered into the tens place.

3774 */

3775

3776 if ((digIndx + trailingZeroCount) % 2 == 1) {

3777 // High-order digit case (tens place)

3778 collateVal += (uint8_t)(digVal * 10);

3779

3780 // We cannot set leadingZeroIndex unless it has been set for the

3781 // low-order digit. Therefore, all we can do for the high-order

3782 // digit is turn it off, never on.

3783 // The only time we will have a high digit w ithout a low is for

3784 // the very first non-zero digit, so no zero check is necessary.

3785 if (collateVal != 0)

3786 leadingZeroIndex = 0;

3787

3788 // The first pass through, digIndx may excee d the limit, but in that case

3789 // we no longer care about numTempBuf conten ts since they will be discarded

3790 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {

3791 numTempBuf[(digIndx/2) + 2] = collateVal *2 + 6;

3792 }

3793 collateVal = 0;

3794 } else {

3795 // Low-order digit case (ones place)

3796 collateVal = (uint8_t)digVal;

3797

3798 // Check for leading zeroes.

3799 if (collateVal == 0) {

3800 if (!leadingZeroIndex)

3801 leadingZeroIndex = (digIndx/2) + 2;

3802 } else

3803 leadingZeroIndex = 0;

3804

3805 // No need to write to buffer; the case of a last odd digit

3806 // is handled below.

3807 }

3808 ++digIndx;

3809 } else

3810 ++trailingZeroCount;

3811

3812 if (!collIter_bos(source)) {

3813 ch = getPrevNormalizedChar(source, status);

3814 //goBackOne(source);

3815 if (U16_IS_TRAIL(ch)) {

3816 backupState(source, &state);

3817 if (!collIter_bos(source)) {

3818 goBackOne(source);

3819 UChar lead = getPrevNormalizedChar(sourc e, status);

3820

3821 if(U16_IS_LEAD(lead)) {

3822 char32 = U16_GET_SUPPLEMENTARY(lead, ch);

3823 } else {

3824 loadState(source, &state, FALSE);

3825 char32 = ch;

3826 }

3827 }

3828 } else

3829 char32 = ch;

3830

3831 if ((digVal = u_charDigitValue(char32)) == -1 \|\| (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {

3832 if (char32 > 0xFFFF) {// For surrogates.

3833 loadState(source, &state, FALSE);

3834 }

3835 // Don't need to "reverse" the goBackOne cal l,

3836 // as this points to the next position to pr ocess..

3837 //if (char32 > 0xFFFF) // For surrogates.

3838 //getNextNormalizedChar(source);

3839 break;

3840 }

3841

3842 goBackOne(source);

3843 }else

3844 break;

3845 }

3846

3847 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N UMBER) {

3848 // our collation element is not too big, go ahead an d finish with it

3849 break;

3850 }

3851 // our digit string is too long for a collation element;

3852 // set the limit for it, reset the state and begin again

3853 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT S_FOR_NUMBER;

3854 if ( ceLimit == 0 ) {

3855 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;

3856 }

3857 ch = initial_ch;

3858 loadState(source, &initialState, FALSE);

3859 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun t = 0;

3860 collateVal = 0;

3861 nonZeroValReached = FALSE;

3862 }

3863

3864 if (! nonZeroValReached) {

3865 digIndx = 2;

3866 trailingZeroCount = 0;

3867 numTempBuf[2] = 6;

3868 }

3869

3870 if ((digIndx + trailingZeroCount) % 2 != 0) {

3871 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;

3872 digIndx += 1; // The implicit leading zero

3873 }

3874 if (trailingZeroCount % 2 != 0) {

3875 // We had to consume one trailing zero for the low digit

3876 // of the least significant byte

3877 digIndx += 1; // The trailing zero not in the expo nent

3878 trailingZeroCount -= 1;

3879 }

3880

3881 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2 ) + 2) ;

3882

3883 // Subtract one off of the last byte. Really the first byte here, but it's reversed...

3884 numTempBuf[2] -= 1;

3885

3886 /*

3887 We want to skip over the first two slots in the buffer. The first slot

3888 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the

3889 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.

3890 The exponent must be adjusted by the number of leading zeroe s, and the number of

3891 trailing zeroes.

3892 */

3893 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;

3894 uint32_t exponent = (digIndx+trailingZeroCount)/2;

3895 if (leadingZeroIndex)

3896 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);

3897 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));

3898

3899 // Now transfer the collation key to our collIterate struct.

3900 // The total size for our collation key is half of endIndex, rounded up.

3901 int32_t size = (endIndex+1)/2;

3902 if(!ensureCEsCapacity(source, size)) {

3903 return (uint32_t)UCOL_NULLORDER;

3904 }

3905 *(source->CEpos++) = (((numTempBuf[0] << 8) \| numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) \| //Primary weight

3906 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) \| // Seco ndary weight

3907 UCOL_BYTE_COMMON; // Tertiary weight.

3908 i = endIndex - 1; // Reset the index into the buffer.

3909 while(i >= 2) {

3910 uint32_t primWeight = numTempBuf[i--] << 8;

3911 if ( i >= 2)

3912 primWeight \|= numTempBuf[i--];

3913 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) \| UCOL_CONTINUATION_MARKER;

3914 }

3915

3916 source->toReturn = source->CEpos -1;

3917 return *(source->toReturn);

3918 } else {

3919 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);

3920 CE = *(CEOffset++);

3921 break;

3922 }

3923 }

3924

3925 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/

3926 {

3927 static const uint32_t

3928 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;

3929 //const uint32_t LCount = 19;

3930 static const uint32_t VCount = 21;

3931 static const uint32_t TCount = 28;

3932 //const uint32_t NCount = VCount * TCount; /* 588 */

3933 //const uint32_t SCount = LCount * NCount; /* 11172 */

3934

3935 uint32_t L = ch - SBase;

3936 /*

3937 divide into pieces.

3938 we do it in this order since some compilers can do % and / in on e

3939 operation

3940 */

3941 uint32_t T = L % TCount;

3942 L /= TCount;

3943 uint32_t V = L % VCount;

3944 L /= VCount;

3945

3946 /* offset them */

3947 L += LBase;

3948 V += VBase;

3949 T += TBase;

3950

3951 int32_t firstOffset = (int32_t)(source->pos - source->string);

3952 source->appendOffset(firstOffset, *status);

3953

3954 /*

3955 * return the first CE, but first put the rest into the expansio n buffer

3956 */

3957 if (!source->coll->image->jamoSpecial) {

3958 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L );

3959 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );

3960 source->appendOffset(firstOffset + 1, *status);

3961

3962 if (T != TBase) {

3963 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);

3964 source->appendOffset(firstOffset + 1, *status);

3965 }

3966

3967 source->toReturn = source->CEpos - 1;

3968

3969 source->offsetReturn = source->offsetStore - 1;

3970 if (source->offsetReturn == source->offsetBuffer) {

3971 source->offsetStore = source->offsetBuffer;

3972 }

3973

3974 return *(source->toReturn);

3975 } else {

3976 // Since Hanguls pass the FCD check, it is

3977 // guaranteed that we won't be in

3978 // the normalization buffer if something like this happens

3979

3980 // Move Jamos into normalization buffer

3981 UChar *tempbuffer = source->writableBuffer.getBuffer(5);

3982 int32_t tempbufferLength, jamoOffset;

3983 tempbuffer[0] = 0;

3984 tempbuffer[1] = (UChar)L;

3985 tempbuffer[2] = (UChar)V;

3986 if (T != TBase) {

3987 tempbuffer[3] = (UChar)T;

3988 tempbufferLength = 4;

3989 } else {

3990 tempbufferLength = 3;

3991 }

3992 source->writableBuffer.releaseBuffer(tempbufferLength);

3993

3994 // Indicate where to continue in main input string after exh austing the writableBuffer

3995 if (source->pos == source->string) {

3996 jamoOffset = 0;

3997 source->fcdPosition = NULL;

3998 } else {

3999 jamoOffset = source->pos - source->string;

4000 source->fcdPosition = source->pos-1;

4001 }

4002

4003 // Append offsets for the additional chars

4004 // (not the 0, and not the L whose offsets match the origina l Hangul)

4005 int32_t jamoRemaining = tempbufferLength - 2;

4006 jamoOffset++; // appended offsets should match end of origin al Hangul

4007 while (jamoRemaining-- > 0) {

4008 source->appendOffset(jamoOffset, *status);

4009 }

4010

4011 source->offsetRepeatValue = jamoOffset;

4012

4013 source->offsetReturn = source->offsetStore - 1;

4014 if (source->offsetReturn == source->offsetBuffer) {

4015 source->offsetStore = source->offsetBuffer;

4016 }

4017

4018 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;

4019 source->origFlags = source->flags;

4020 source->flags \|= UCOL_ITER_INNORMBUF;

4021 source->flags &= ~(UCOL_ITER_NORM \| UCOL_ITER_HAS LEN);

4022

4023 return(UCOL_IGNORABLE);

4024 }

4025 }

4026

4027 case IMPLICIT_TAG: /* everything that is not defined otherwise */

4028 return getPrevImplicit(ch, source);

4029

4030 // TODO: Remove CJK implicits as they are handled by the getImplicit Primary function

4031 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */

4032 return getPrevImplicit(ch, source);

4033

4034 case SURROGATE_TAG: /* This is a surrogate pair */

4035 /* essentially an engaged lead surrogate. */

4036 /* if you have encountered it here, it means that a */

4037 /* broken sequence was encountered and this is an error */

4038 return UCOL_NOT_FOUND;

4039

4040 case LEAD_SURROGATE_TAG: /* D800-DBFF*/

4041 return UCOL_NOT_FOUND; /* broken surrogate sequence */

4042

4043 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/

4044 {

4045 UChar32 cp = 0;

4046 UChar prevChar;

4047 const UChar *prev;

4048 if (isAtStartPrevIterate(source)) {

4049 /* we are at the start of the string, wrong place to be at */

4050 return UCOL_NOT_FOUND;

4051 }

4052 if (source->pos != source->writableBuffer.getBuffer()) {

4053 prev = source->pos - 1;

4054 } else {

4055 prev = source->fcdPosition;

4056 }

4057 prevChar = *prev;

4058

4059 /* Handles Han and Supplementary characters here.*/

4060 if (U16_IS_LEAD(prevChar)) {

4061 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<< 10UL)+0xdc00-0x10000));

4062 source->pos = prev;

4063 } else {

4064 return UCOL_NOT_FOUND; /* like unassigned */

4065 }

4066

4067 return getPrevImplicit(cp, source);

4068 }

4069

4070 /* UCA is filled with these. Tailorings are NOT_FOUND */

4071 /* not yet implemented */

4072 case CHARSET_TAG: /* this tag always returns */

4073 /* probably after 1.8 */

4074 return UCOL_NOT_FOUND;

4075

4076 default: /* this tag always returns */

4077 *status = U_INTERNAL_PROGRAM_ERROR;

4078 CE=0;

4079 break;

4080 }

4081

4082 if (CE <= UCOL_NOT_FOUND) {

4083 break;

4084 }

4085 }

4086

4087 return CE;

4088 }

4089

4090 /* This should really be a macro */

4091 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */

4092 /* secondaries in French */

4093 /*

4094 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {

4095 uint8_t temp;

4096 while(start<end) {

4097 temp = *start;

4098 *start++ = *end;

4099 *end-- = temp;

4100 }

4101 }

4102 */

4103

4104 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \

4105 TYPE tempA; \

4106 while((start)<(end)) { \

4107 tempA = *(start); \

4108 *(start)++ = *(end); \

4109 *(end)-- = tempA; \

4110 } \

4111 }

4112

4113 /****************************************************************************/

4114 /* Following are the sortkey generation functions */

4115 /* */

4116 /****************************************************************************/

4117

4118 U_CAPI int32_t U_EXPORT2	113 U_CAPI int32_t U_EXPORT2

4119 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,	114 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,

4120 const uint8_t *src2, int32_t src2Length,	115 const uint8_t *src2, int32_t src2Length,

4121 uint8_t *dest, int32_t destCapacity) {	116 uint8_t *dest, int32_t destCapacity) {

4122 /* check arguments */	117 /* check arguments */

4123 if( src1==NULL \|\| src1Length<-1 \|\| src1Length==0 \|\| (src1Length>0 && src1[sr c1Length-1]!=0) \|\|	118 if( src1==NULL \|\| src1Length<-1 \|\| src1Length==0 \|\| (src1Length>0 && src1[sr c1Length-1]!=0) \|\|

4124 src2==NULL \|\| src2Length<-1 \|\| src2Length==0 \|\| (src2Length>0 && src2[sr c2Length-1]!=0) \|\|	119 src2==NULL \|\| src2Length<-1 \|\| src2Length==0 \|\| (src2Length>0 && src2[sr c2Length-1]!=0) \|\|

4125 destCapacity<0 \|\| (destCapacity>0 && dest==NULL)	120 destCapacity<0 \|\| (destCapacity>0 && dest==NULL)

4126 ) {	121 ) {

4127 /* error, attempt to write a zero byte and return 0 */	122 /* error, attempt to write a zero byte and return 0 */

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4183 /* src1 is not finished, therefore src2==0, and src1 is appended */	178 /* src1 is not finished, therefore src2==0, and src1 is appended */

4184 src2=src1;	179 src2=src1;

4185 }	180 }

4186 /* append src2, "the other, unfinished sort key" */	181 /* append src2, "the other, unfinished sort key" */

4187 while((*p++=*src2++)!=0) {}	182 while((*p++=*src2++)!=0) {}

4188	183

4189 /* the actual length might be less than destLength if either sort key contai ned illegally embedded zero bytes */	184 /* the actual length might be less than destLength if either sort key contai ned illegally embedded zero bytes */

4190 return (int32_t)(p-dest);	185 return (int32_t)(p-dest);

4191 }	186 }

4192	187

4193 U_NAMESPACE_BEGIN

4194

4195 class SortKeyByteSink : public ByteSink {

4196 public:

4197 SortKeyByteSink(char *dest, int32_t destCapacity)

4198 : buffer_(dest), capacity_(destCapacity),

4199 appended_(0) {

4200 if (buffer_ == NULL) {

4201 capacity_ = 0;

4202 } else if(capacity_ < 0) {

4203 buffer_ = NULL;

4204 capacity_ = 0;

4205 }

4206 }

4207 virtual ~SortKeyByteSink();

4208

4209 virtual void Append(const char *bytes, int32_t n);

4210 void Append(uint32_t b) {

4211 if (appended_ < capacity_ \|\| Resize(1, appended_)) {

4212 buffer_[appended_] = (char)b;

4213 }

4214 ++appended_;

4215 }

4216 void Append(uint32_t b1, uint32_t b2) {

4217 int32_t a2 = appended_ + 2;

4218 if (a2 <= capacity_ \|\| Resize(2, appended_)) {

4219 buffer_[appended_] = (char)b1;

4220 buffer_[appended_ + 1] = (char)b2;

4221 } else if(appended_ < capacity_) {

4222 buffer_[appended_] = (char)b1;

4223 }

4224 appended_ = a2;

4225 }

4226 virtual char *GetAppendBuffer(int32_t min_capacity,

4227 int32_t desired_capacity_hint,

4228 char *scratch, int32_t scratch_capacity,

4229 int32_t *result_capacity);

4230 int32_t NumberOfBytesAppended() const { return appended_; }

4231 /** @return FALSE if memory allocation failed */

4232 UBool IsOk() const { return buffer_ != NULL; }

4233

4234 protected:

4235 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th) = 0;

4236 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;

4237

4238 void SetNotOk() {

4239 buffer_ = NULL;

4240 capacity_ = 0;

4241 }

4242

4243 char *buffer_;

4244 int32_t capacity_;

4245 int32_t appended_;

4246

4247 private:

4248 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemente d

4249 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented

4250 };

4251

4252 SortKeyByteSink::~SortKeyByteSink() {}

4253

4254 void

4255 SortKeyByteSink::Append(const char *bytes, int32_t n) {

4256 if (n <= 0 \|\| bytes == NULL) {

4257 return;

4258 }

4259 int32_t length = appended_;

4260 appended_ += n;

4261 if ((buffer_ + length) == bytes) {

4262 return; // the caller used GetAppendBuffer() and wrote the bytes alread y

4263 }

4264 int32_t available = capacity_ - length;

4265 if (n <= available) {

4266 uprv_memcpy(buffer_ + length, bytes, n);

4267 } else {

4268 AppendBeyondCapacity(bytes, n, length);

4269 }

4270 }

4271

4272 char *

4273 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,

4274 int32_t desired_capacity_hint,

4275 char *scratch,

4276 int32_t scratch_capacity,

4277 int32_t *result_capacity) {

4278 if (min_capacity < 1 \|\| scratch_capacity < min_capacity) {

4279 *result_capacity = 0;

4280 return NULL;

4281 }

4282 int32_t available = capacity_ - appended_;

4283 if (available >= min_capacity) {

4284 *result_capacity = available;

4285 return buffer_ + appended_;

4286 } else if (Resize(desired_capacity_hint, appended_)) {

4287 *result_capacity = capacity_ - appended_;

4288 return buffer_ + appended_;

4289 } else {

4290 *result_capacity = scratch_capacity;

4291 return scratch;

4292 }

4293 }

4294

4295 class FixedSortKeyByteSink : public SortKeyByteSink {

4296 public:

4297 FixedSortKeyByteSink(char *dest, int32_t destCapacity)

4298 : SortKeyByteSink(dest, destCapacity) {}

4299 virtual ~FixedSortKeyByteSink();

4300

4301 private:

4302 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);

4303 virtual UBool Resize(int32_t appendCapacity, int32_t length);

4304 };

4305

4306 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}

4307

4308 void

4309 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {

4310 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_

4311 // Fill the buffer completely.

4312 int32_t available = capacity_ - length;

4313 if (available > 0) {

4314 uprv_memcpy(buffer_ + length, bytes, available);

4315 }

4316 }

4317

4318 UBool

4319 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {

4320 return FALSE;

4321 }

4322

4323 class CollationKeyByteSink : public SortKeyByteSink {

4324 public:

4325 CollationKeyByteSink(CollationKey &key)

4326 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getC apacity()),

4327 key_(key) {}

4328 virtual ~CollationKeyByteSink();

4329

4330 private:

4331 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);

4332 virtual UBool Resize(int32_t appendCapacity, int32_t length);

4333

4334 CollationKey &key_;

4335 };

4336

4337 CollationKeyByteSink::~CollationKeyByteSink() {}

4338

4339 void

4340 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {

4341 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_

4342 if (Resize(n, length)) {

4343 uprv_memcpy(buffer_ + length, bytes, n);

4344 }

4345 }

4346

4347 UBool

4348 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {

4349 if (buffer_ == NULL) {

4350 return FALSE; // allocation failed before already

4351 }

4352 int32_t newCapacity = 2 * capacity_;

4353 int32_t altCapacity = length + 2 * appendCapacity;

4354 if (newCapacity < altCapacity) {

4355 newCapacity = altCapacity;

4356 }

4357 if (newCapacity < 200) {

4358 newCapacity = 200;

4359 }

4360 uint8_t *newBuffer = key_.reallocate(newCapacity, length);

4361 if (newBuffer == NULL) {

4362 SetNotOk();

4363 return FALSE;

4364 }

4365 buffer_ = reinterpret_cast<char *>(newBuffer);

4366 capacity_ = newCapacity;

4367 return TRUE;

4368 }

4369

4370 /**

4371 * uint8_t byte buffer, similar to CharString but simpler.

4372 */

4373 class SortKeyLevel : public UMemory {

4374 public:

4375 SortKeyLevel() : len(0), ok(TRUE) {}

4376 ~SortKeyLevel() {}

4377

4378 /** @return FALSE if memory allocation failed */

4379 UBool isOk() const { return ok; }

4380 UBool isEmpty() const { return len == 0; }

4381 int32_t length() const { return len; }

4382 const uint8_t *data() const { return buffer.getAlias(); }

4383 uint8_t operator[](int32_t index) const { return buffer[index]; }

4384

4385 void appendByte(uint32_t b);

4386

4387 void appendTo(ByteSink &sink) const {

4388 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);

4389 }

4390

4391 uint8_t &lastByte() {

4392 U_ASSERT(len > 0);

4393 return buffer[len - 1];

4394 }

4395

4396 uint8_t *getLastFewBytes(int32_t n) {

4397 if (ok && len >= n) {

4398 return buffer.getAlias() + len - n;

4399 } else {

4400 return NULL;

4401 }

4402 }

4403

4404 private:

4405 MaybeStackArray<uint8_t, 40> buffer;

4406 int32_t len;

4407 UBool ok;

4408

4409 UBool ensureCapacity(int32_t appendCapacity);

4410

4411 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class

4412 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of thi s class

4413 };

4414

4415 void SortKeyLevel::appendByte(uint32_t b) {

4416 if(len < buffer.getCapacity() \|\| ensureCapacity(1)) {

4417 buffer[len++] = (uint8_t)b;

4418 }

4419 }

4420

4421 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {

4422 if(!ok) {

4423 return FALSE;

4424 }

4425 int32_t newCapacity = 2 * buffer.getCapacity();

4426 int32_t altCapacity = len + 2 * appendCapacity;

4427 if (newCapacity < altCapacity) {

4428 newCapacity = altCapacity;

4429 }

4430 if (newCapacity < 200) {

4431 newCapacity = 200;

4432 }

4433 if(buffer.resize(newCapacity, len)==NULL) {

4434 return ok = FALSE;

4435 }

4436 return TRUE;

4437 }

4438

4439 U_NAMESPACE_END

4440

4441 /* sortkey API */

4442 U_CAPI int32_t U_EXPORT2	188 U_CAPI int32_t U_EXPORT2

4443 ucol_getSortKey(const UCollator *coll,	189 ucol_getSortKey(const UCollator *coll,

4444 const UChar *source,	190 const UChar *source,

4445 int32_t sourceLength,	191 int32_t sourceLength,

4446 uint8_t *result,	192 uint8_t *result,

4447 int32_t resultLength)	193 int32_t resultLength)

4448 {	194 {

4449 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);	195 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);

4450 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {	196 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {

4451 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour ce,	197 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour ce,

4452 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt h));	198 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt h));

4453 }	199 }

4454	200

4455 if(coll->delegate != NULL) {	201 int32_t keySize = Collator::fromUCollator(coll)->

4456 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);	202 getSortKey(source, sourceLength, result, resultLength);

4457 }	203

4458

4459 UErrorCode status = U_ZERO_ERROR;

4460 int32_t keySize = 0;

4461

4462 if(source != NULL) {

4463 // source == NULL is actually an error situation, but we would need to

4464 // have an error code to return it. Until we introduce a new

4465 // API, it stays like this

4466

4467 /* this uses the function pointer that is set in updateinternalstate */

4468 /* currently, there are two funcs: */

4469 /*ucol_calcSortKey(...);*/

4470 /*ucol_calcSortKeySimpleTertiary(...);*/

4471

4472 uint8_t noDest[1] = { 0 };

4473 if(result == NULL) {

4474 // Distinguish pure preflighting from an allocation error.

4475 result = noDest;

4476 resultLength = 0;

4477 }

4478 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength );

4479 coll->sortKeyGen(coll, source, sourceLength, sink, &status);

4480 if(U_SUCCESS(status)) {

4481 keySize = sink.NumberOfBytesAppended();

4482 }

4483 }

4484 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);	204 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);

4485 UTRACE_EXIT_STATUS(status);	205 UTRACE_EXIT_VALUE(keySize);

4486 return keySize;	206 return keySize;

4487 }	207 }

4488	208

4489 U_CFUNC int32_t

4490 ucol_getCollationKey(const UCollator *coll,

4491 const UChar *source, int32_t sourceLength,

4492 CollationKey &key,

4493 UErrorCode &errorCode) {

4494 CollationKeyByteSink sink(key);

4495 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);

4496 return sink.NumberOfBytesAppended();

4497 }

4498

4499 // Is this primary weight compressible?

4500 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).

4501 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.

4502 static inline UBool

4503 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {

4504 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;

4505 }

4506

4507 static

4508 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {

4509 if (caseShift == 0) {

4510 cases.appendByte(UCOL_CASE_BYTE_START);

4511 caseShift = UCOL_CASE_SHIFT_START;

4512 }

4513 }

4514

4515 // Packs the secondary buffer when processing French locale.

4516 static void

4517 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {

4518 secondaries += secsize; // We read the secondary-level bytes back to front.

4519 uint8_t secondary;

4520 int32_t count2 = 0;

4521 int32_t i = 0;

4522 // we use i here since the key size already accounts for terminators, so we'll discard the increment

4523 for(i = 0; i<secsize; i++) {

4524 secondary = *(secondaries-i-1);

4525 /* This is compression code. */

4526 if (secondary == UCOL_COMMON2) {

4527 ++count2;

4528 } else {

4529 if (count2 > 0) {

4530 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.

4531 while (count2 > UCOL_TOP_COUNT2) {

4532 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);

4533 count2 -= (uint32_t)UCOL_TOP_COUNT2;

4534 }

4535 result.Append(UCOL_COMMON_TOP2 - (count2-1));

4536 } else {

4537 while (count2 > UCOL_BOT_COUNT2) {

4538 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);

4539 count2 -= (uint32_t)UCOL_BOT_COUNT2;

4540 }

4541 result.Append(UCOL_COMMON_BOT2 + (count2-1));

4542 }

4543 count2 = 0;

4544 }

4545 result.Append(secondary);

4546 }

4547 }

4548 if (count2 > 0) {

4549 while (count2 > UCOL_BOT_COUNT2) {

4550 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);

4551 count2 -= (uint32_t)UCOL_BOT_COUNT2;

4552 }

4553 result.Append(UCOL_COMMON_BOT2 + (count2-1));

4554 }

4555 }

4556

4557 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0

4558

4559 /* This is the sortkey work horse function */

4560 U_CFUNC void U_CALLCONV

4561 ucol_calcSortKey(const UCollator *coll,

4562 const UChar *source,

4563 int32_t sourceLength,

4564 SortKeyByteSink &result,

4565 UErrorCode *status)

4566 {

4567 if(U_FAILURE(*status)) {

4568 return;

4569 }

4570

4571 SortKeyByteSink &primaries = result;

4572 SortKeyLevel secondaries;

4573 SortKeyLevel tertiaries;

4574 SortKeyLevel cases;

4575 SortKeyLevel quads;

4576

4577 UnicodeString normSource;

4578

4579 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

4580

4581 UColAttributeValue strength = coll->strength;

4582

4583 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);

4584 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);

4585 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);

4586 UBool compareIdent = (strength == UCOL_IDENTICAL);

4587 UBool doCase = (coll->caseLevel == UCOL_ON);

4588 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0) ;

4589 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);

4590 //UBool qShifted = shifted && (compareQuad == 0);

4591 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);

4592

4593 uint32_t variableTopValue = coll->variableTopValue;

4594 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no

4595 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.

4596 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);

4597 uint8_t UCOL_HIRAGANA_QUAD = 0;

4598 if(doHiragana) {

4599 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;

4600 /* allocate one more space for hiragana, value for hiragana */

4601 }

4602 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);

4603

4604 /* support for special features like caselevel and funky secondaries */

4605 int32_t lastSecondaryLength = 0;

4606 uint32_t caseShift = 0;

4607

4608 /* If we need to normalize, we'll do it all at once at the beginning! */

4609 const Normalizer2 *norm2;

4610 if(compareIdent) {

4611 norm2 = Normalizer2Factory::getNFDInstance(*status);

4612 } else if(coll->normalizationMode != UCOL_OFF) {

4613 norm2 = Normalizer2Factory::getFCDInstance(*status);

4614 } else {

4615 norm2 = NULL;

4616 }

4617 if(norm2 != NULL) {

4618 normSource.setTo(FALSE, source, len);

4619 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);

4620 if(qcYesLength != len) {

4621 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);

4622 normSource.truncate(qcYesLength);

4623 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);

4624 source = normSource.getBuffer();

4625 len = normSource.length();

4626 }

4627 }

4628 collIterate s;

4629 IInit_collIterate(coll, source, len, &s, status);

4630 if(U_FAILURE(*status)) {

4631 return;

4632 }

4633 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.

4634

4635 uint32_t order = 0;

4636

4637 uint8_t primary1 = 0;

4638 uint8_t primary2 = 0;

4639 uint8_t secondary = 0;

4640 uint8_t tertiary = 0;

4641 uint8_t caseSwitch = coll->caseSwitch;

4642 uint8_t tertiaryMask = coll->tertiaryMask;

4643 int8_t tertiaryAddition = coll->tertiaryAddition;

4644 uint8_t tertiaryTop = coll->tertiaryTop;

4645 uint8_t tertiaryBottom = coll->tertiaryBottom;

4646 uint8_t tertiaryCommon = coll->tertiaryCommon;

4647 uint8_t caseBits = 0;

4648

4649 UBool wasShifted = FALSE;

4650 UBool notIsContinuation = FALSE;

4651

4652 uint32_t count2 = 0, count3 = 0, count4 = 0;

4653 uint8_t leadPrimary = 0;

4654

4655 for(;;) {

4656 order = ucol_IGetNextCE(coll, &s, status);

4657 if(order == UCOL_NO_MORE_CES) {

4658 break;

4659 }

4660

4661 if(order == 0) {

4662 continue;

4663 }

4664

4665 notIsContinuation = !isContinuation(order);

4666

4667 if(notIsContinuation) {

4668 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);

4669 } else {

4670 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));

4671 }

4672

4673 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);

4674 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);

4675 primary1 = (uint8_t)(order >> 8);

4676

4677 uint8_t originalPrimary1 = primary1;

4678 if(notIsContinuation && coll->leadBytePermutationTable != NULL) {

4679 primary1 = coll->leadBytePermutationTable[primary1];

4680 }

4681

4682 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)

4683 || (!notIsContinuation && wasShifted)))

4684 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */

4685 {

4686 /* and other ignorables should be removed if following a shifted code point */

4687 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */

4688 /* we should just completely ignore it */

4689 continue;

4690 }

4691 if(compareQuad == 0) {

4692 if(count4 > 0) {

4693 while (count4 > UCOL_BOT_COUNT4) {

4694 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);

4695 count4 -= UCOL_BOT_COUNT4;

4696 }

4697 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));

4698 count4 = 0;

4699 }

4700 /* We are dealing with a variable and we're treating them as shifted */

4701 /* This is a shifted ignorable */

4702 if(primary1 != 0) { /* we need to check this since we could be in continuation */

4703 quads.appendByte(primary1);

4704 }

4705 if(primary2 != 0) {

4706 quads.appendByte(primary2);

4707 }

4708 }

4709 wasShifted = TRUE;

4710 } else {

4711 wasShifted = FALSE;

4712 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */

4713 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */

4714 /* regular and simple sortkey calc */

4715 if(primary1 != UCOL_IGNORABLE) {

4716 if(notIsContinuation) {

4717 if(leadPrimary == primary1) {

4718 primaries.Append(primary2);

4719 } else {

4720 if(leadPrimary != 0) {

4721 primaries.Append((primary1 > leadPrimary) ? UCOL_BYT E_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);

4722 }

4723 if(primary2 == UCOL_IGNORABLE) {

4724 /* one byter, not compressed */

4725 primaries.Append(primary1);

4726 leadPrimary = 0;

4727 } else if(isCompressible(coll, originalPrimary1)) {

4728 /* compress */

4729 primaries.Append(leadPrimary = primary1, primary2);

4730 } else {

4731 leadPrimary = 0;

4732 primaries.Append(primary1, primary2);

4733 }

4734 }

4735 } else { /* we are in continuation, so we're gonna add primary t o the key don't care about compression */

4736 if(primary2 == UCOL_IGNORABLE) {

4737 primaries.Append(primary1);

4738 } else {

4739 primaries.Append(primary1, primary2);

4740 }

4741 }

4742 }

4743

4744 if(secondary > compareSec) {

4745 if(!isFrenchSec) {

4746 /* This is compression code. */

4747 if (secondary == UCOL_COMMON2 && notIsContinuation) {

4748 ++count2;

4749 } else {

4750 if (count2 > 0) {

4751 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.

4752 while (count2 > UCOL_TOP_COUNT2) {

4753 secondaries.appendByte(UCOL_COMMON_TOP2 - UC OL_TOP_COUNT2);

4754 count2 -= (uint32_t)UCOL_TOP_COUNT2;

4755 }

4756 secondaries.appendByte(UCOL_COMMON_TOP2 - (count 2-1));

4757 } else {

4758 while (count2 > UCOL_BOT_COUNT2) {

4759 secondaries.appendByte(UCOL_COMMON_BOT2 + UC OL_BOT_COUNT2);

4760 count2 -= (uint32_t)UCOL_BOT_COUNT2;

4761 }

4762 secondaries.appendByte(UCOL_COMMON_BOT2 + (count 2-1));

4763 }

4764 count2 = 0;

4765 }

4766 secondaries.appendByte(secondary);

4767 }

4768 } else {

4769 /* Do the special handling for French secondaries */

4770 /* We need to get continuation elements and do intermediate restore */

4771 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */

4772 if(notIsContinuation) {

4773 if (lastSecondaryLength > 1) {

4774 uint8_t *frenchStartPtr = secondaries.getLastFewByte s(lastSecondaryLength);

4775 if (frenchStartPtr != NULL) {

4776 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */

4777 uint8_t *frenchEndPtr = frenchStartPtr + lastSec ondaryLength - 1;

4778 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr , frenchEndPtr);

4779 }

4780 }

4781 lastSecondaryLength = 1;

4782 } else {

4783 ++lastSecondaryLength;

4784 }

4785 secondaries.appendByte(secondary);

4786 }

4787 }

4788

4789 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {

4790 // do the case level if we need to do it. We don't want to calculate

4791 // case level for primary ignorables if we have only primary strength and case level

4792 // otherwise we would break well formedness of CEs

4793 doCaseShift(cases, caseShift);

4794 if(notIsContinuation) {

4795 caseBits = (uint8_t)(tertiary & 0xC0);

4796

4797 if(tertiary != 0) {

4798 if(coll->caseFirst == UCOL_UPPER_FIRST) {

4799 if((caseBits & 0xC0) == 0) {

4800 cases.lastByte() |= 1 << (--caseShift);

4801 } else {

4802 cases.lastByte() |= 0 << (--caseShift);

4803 /* second bit */

4804 doCaseShift(cases, caseShift);

4805 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);

4806 }

4807 } else {

4808 if((caseBits & 0xC0) == 0) {

4809 cases.lastByte() |= 0 << (--caseShift);

4810 } else {

4811 cases.lastByte() |= 1 << (--caseShift);

4812 /* second bit */

4813 doCaseShift(cases, caseShift);

4814 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);

4815 }

4816 }

4817 }

4818 }

4819 } else {

4820 if(notIsContinuation) {

4821 tertiary ^= caseSwitch;

4822 }

4823 }

4824

4825 tertiary &= tertiaryMask;

4826 if(tertiary > compareTer) {

4827 /* This is compression code. */

4828 /* sequence size check is included in the if clause */

4829 if (tertiary == tertiaryCommon && notIsContinuation) {

4830 ++count3;

4831 } else {

4832 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMO N3_NORMAL) {

4833 tertiary += tertiaryAddition;

4834 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UC OL_COMMON3_UPPERFIRST) {

4835 tertiary -= tertiaryAddition;

4836 }

4837 if (count3 > 0) {

4838 if ((tertiary > tertiaryCommon)) {

4839 while (count3 > coll->tertiaryTopCount) {

4840 tertiaries.appendByte(tertiaryTop - coll->tertia ryTopCount);

4841 count3 -= (uint32_t)coll->tertiaryTopCount;

4842 }

4843 tertiaries.appendByte(tertiaryTop - (count3-1));

4844 } else {

4845 while (count3 > coll->tertiaryBottomCount) {

4846 tertiaries.appendByte(tertiaryBottom + coll->ter tiaryBottomCount);

4847 count3 -= (uint32_t)coll->tertiaryBottomCount;

4848 }

4849 tertiaries.appendByte(tertiaryBottom + (count3-1));

4850 }

4851 count3 = 0;

4852 }

4853 tertiaries.appendByte(tertiary);

4854 }

4855 }

4856

4857 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {

4858 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it

4859 if(count4>0) { // Close this part

4860 while (count4 > UCOL_BOT_COUNT4) {

4861 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4) ;

4862 count4 -= UCOL_BOT_COUNT4;

4863 }

4864 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));

4865 count4 = 0;

4866 }

4867 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana

4868 } else { // This wasn't Hiragana, so we can continue adding stuf f

4869 count4++;

4870 }

4871 }

4872 }

4873 }

4874

4875 /* Here, we are generally done with processing */

4876 /* bailing out would not be too productive */

4877

4878 UBool ok = TRUE;

4879 if(U_SUCCESS(*status)) {

4880 /* we have done all the CE's, now let's put them together to form a key */

4881 if(compareSec == 0) {

4882 if (count2 > 0) {

4883 while (count2 > UCOL_BOT_COUNT2) {

4884 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);

4885 count2 -= (uint32_t)UCOL_BOT_COUNT2;

4886 }

4887 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));

4888 }

4889 result.Append(UCOL_LEVELTERMINATOR);

4890 if(!secondaries.isOk()) {

4891 ok = FALSE;

4892 } else if(!isFrenchSec) {

4893 secondaries.appendTo(result);

4894 } else {

4895 // If there are any unresolved continuation secondaries,

4896 // reverse them here so that we can reverse the whole secondary thing.

4897 if (lastSecondaryLength > 1) {

4898 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSe condaryLength);

4899 if (frenchStartPtr != NULL) {

4900 /* reverse secondaries from frenchStartPtr up to frenchE ndPtr */

4901 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLe ngth - 1;

4902 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, french EndPtr);

4903 }

4904 }

4905 packFrench(secondaries.data(), secondaries.length(), result);

4906 }

4907 }

4908

4909 if(doCase) {

4910 ok &= cases.isOk();

4911 result.Append(UCOL_LEVELTERMINATOR);

4912 cases.appendTo(result);

4913 }

4914

4915 if(compareTer == 0) {

4916 if (count3 > 0) {

4917 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {

4918 while (count3 >= coll->tertiaryTopCount) {

4919 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCou nt);

4920 count3 -= (uint32_t)coll->tertiaryTopCount;

4921 }

4922 tertiaries.appendByte(tertiaryTop - count3);

4923 } else {

4924 while (count3 > coll->tertiaryBottomCount) {

4925 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBot tomCount);

4926 count3 -= (uint32_t)coll->tertiaryBottomCount;

4927 }

4928 tertiaries.appendByte(tertiaryBottom + (count3-1));

4929 }

4930 }

4931 ok &= tertiaries.isOk();

4932 result.Append(UCOL_LEVELTERMINATOR);

4933 tertiaries.appendTo(result);

4934

4935 if(compareQuad == 0/*qShifted == TRUE*/) {

4936 if(count4 > 0) {

4937 while (count4 > UCOL_BOT_COUNT4) {

4938 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);

4939 count4 -= UCOL_BOT_COUNT4;

4940 }

4941 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));

4942 }

4943 ok &= quads.isOk();

4944 result.Append(UCOL_LEVELTERMINATOR);

4945 quads.appendTo(result);

4946 }

4947

4948 if(compareIdent) {

4949 result.Append(UCOL_LEVELTERMINATOR);

4950 u_writeIdenticalLevelRun(s.string, len, result);

4951 }

4952 }

4953 result.Append(0);

4954 }

4955

4956 /* To avoid memory leak, free the offset buffer if necessary. */

4957 ucol_freeOffsetBuffer(&s);

4958

4959 ok &= result.IsOk();

4960 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }

4961 }

4962

4963

4964 U_CFUNC void U_CALLCONV

4965 ucol_calcSortKeySimpleTertiary(const UCollator *coll,

4966 const UChar *source,

4967 int32_t sourceLength,

4968 SortKeyByteSink &result,

4969 UErrorCode *status)

4970 {

4971 U_ALIGN_CODE(16);

4972

4973 if(U_FAILURE(*status)) {

4974 return;

4975 }

4976

4977 SortKeyByteSink &primaries = result;

4978 SortKeyLevel secondaries;

4979 SortKeyLevel tertiaries;

4980

4981 UnicodeString normSource;

4982

4983 int32_t len = sourceLength;

4984

4985 /* If we need to normalize, we'll do it all at once at the beginning! */

4986 if(coll->normalizationMode != UCOL_OFF) {

4987 normSource.setTo(len < 0, source, len);

4988 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);

4989 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);

4990 if(qcYesLength != normSource.length()) {

4991 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);

4992 normSource.truncate(qcYesLength);

4993 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);

4994 source = normSource.getBuffer();

4995 len = normSource.length();

4996 }

4997 }

4998 collIterate s;

4999 IInit_collIterate(coll, (UChar *)source, len, &s, status);

5000 if(U_FAILURE(*status)) {

5001 return;

5002 }

5003 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.

5004

5005 uint32_t order = 0;

5006

5007 uint8_t primary1 = 0;

5008 uint8_t primary2 = 0;

5009 uint8_t secondary = 0;

5010 uint8_t tertiary = 0;

5011 uint8_t caseSwitch = coll->caseSwitch;

5012 uint8_t tertiaryMask = coll->tertiaryMask;

5013 int8_t tertiaryAddition = coll->tertiaryAddition;

5014 uint8_t tertiaryTop = coll->tertiaryTop;

5015 uint8_t tertiaryBottom = coll->tertiaryBottom;

5016 uint8_t tertiaryCommon = coll->tertiaryCommon;

5017

5018 UBool notIsContinuation = FALSE;

5019

5020 uint32_t count2 = 0, count3 = 0;

5021 uint8_t leadPrimary = 0;

5022

5023 for(;;) {

5024 order = ucol_IGetNextCE(coll, &s, status);

5025

5026 if(order == 0) {

5027 continue;

5028 }

5029

5030 if(order == UCOL_NO_MORE_CES) {

5031 break;

5032 }

5033

5034 notIsContinuation = !isContinuation(order);

5035

5036 if(notIsContinuation) {

5037 tertiary = (uint8_t)((order & tertiaryMask));

5038 } else {

5039 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));

5040 }

5041

5042 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);

5043 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);

5044 primary1 = (uint8_t)(order >> 8);

5045

5046 uint8_t originalPrimary1 = primary1;

5047 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {

5048 primary1 = coll->leadBytePermutationTable[primary1];

5049 }

5050

5051 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */

5052 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */

5053 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */

5054 /* regular and simple sortkey calc */

5055 if(primary1 != UCOL_IGNORABLE) {

5056 if(notIsContinuation) {

5057 if(leadPrimary == primary1) {

5058 primaries.Append(primary2);

5059 } else {

5060 if(leadPrimary != 0) {

5061 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UN SHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);

5062 }

5063 if(primary2 == UCOL_IGNORABLE) {

5064 /* one byter, not compressed */

5065 primaries.Append(primary1);

5066 leadPrimary = 0;

5067 } else if(isCompressible(coll, originalPrimary1)) {

5068 /* compress */

5069 primaries.Append(leadPrimary = primary1, primary2);

5070 } else {

5071 leadPrimary = 0;

5072 primaries.Append(primary1, primary2);

5073 }

5074 }

5075 } else { /* we are in continuation, so we're gonna add primary to th e key don't care about compression */

5076 if(primary2 == UCOL_IGNORABLE) {

5077 primaries.Append(primary1);

5078 } else {

5079 primaries.Append(primary1, primary2);

5080 }

5081 }

5082 }

5083

5084 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */

5085 /* This is compression code. */

5086 if (secondary == UCOL_COMMON2 && notIsContinuation) {

5087 ++count2;

5088 } else {

5089 if (count2 > 0) {

5090 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev el.

5091 while (count2 > UCOL_TOP_COUNT2) {

5092 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_C OUNT2);

5093 count2 -= (uint32_t)UCOL_TOP_COUNT2;

5094 }

5095 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));

5096 } else {

5097 while (count2 > UCOL_BOT_COUNT2) {

5098 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_C OUNT2);

5099 count2 -= (uint32_t)UCOL_BOT_COUNT2;

5100 }

5101 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));

5102 }

5103 count2 = 0;

5104 }

5105 secondaries.appendByte(secondary);

5106 }

5107 }

5108

5109 if(notIsContinuation) {

5110 tertiary ^= caseSwitch;

5111 }

5112

5113 if(tertiary > 0) {

5114 /* This is compression code. */

5115 /* sequence size check is included in the if clause */

5116 if (tertiary == tertiaryCommon && notIsContinuation) {

5117 ++count3;

5118 } else {

5119 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_N ORMAL) {

5120 tertiary += tertiaryAddition;

5121 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_ COMMON3_UPPERFIRST) {

5122 tertiary -= tertiaryAddition;

5123 }

5124 if (count3 > 0) {

5125 if ((tertiary > tertiaryCommon)) {

5126 while (count3 > coll->tertiaryTopCount) {

5127 tertiaries.appendByte(tertiaryTop - coll->tertiaryTo pCount);

5128 count3 -= (uint32_t)coll->tertiaryTopCount;

5129 }

5130 tertiaries.appendByte(tertiaryTop - (count3-1));

5131 } else {

5132 while (count3 > coll->tertiaryBottomCount) {

5133 tertiaries.appendByte(tertiaryBottom + coll->tertiar yBottomCount);

5134 count3 -= (uint32_t)coll->tertiaryBottomCount;

5135 }

5136 tertiaries.appendByte(tertiaryBottom + (count3-1));

5137 }

5138 count3 = 0;

5139 }

5140 tertiaries.appendByte(tertiary);

5141 }

5142 }

5143 }

5144

5145 UBool ok = TRUE;

5146 if(U_SUCCESS(*status)) {

5147 /* we have done all the CE's, now let's put them together to form a key */

5148 if (count2 > 0) {

5149 while (count2 > UCOL_BOT_COUNT2) {

5150 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);

5151 count2 -= (uint32_t)UCOL_BOT_COUNT2;

5152 }

5153 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));

5154 }

5155 ok &= secondaries.isOk();

5156 result.Append(UCOL_LEVELTERMINATOR);

5157 secondaries.appendTo(result);

5158

5159 if (count3 > 0) {

5160 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {

5161 while (count3 >= coll->tertiaryTopCount) {

5162 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);

5163 count3 -= (uint32_t)coll->tertiaryTopCount;

5164 }

5165 tertiaries.appendByte(tertiaryTop - count3);

5166 } else {

5167 while (count3 > coll->tertiaryBottomCount) {

5168 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomC ount);

5169 count3 -= (uint32_t)coll->tertiaryBottomCount;

5170 }

5171 tertiaries.appendByte(tertiaryBottom + (count3-1));

5172 }

5173 }

5174 ok &= tertiaries.isOk();

5175 result.Append(UCOL_LEVELTERMINATOR);

5176 tertiaries.appendTo(result);

5177

5178 result.Append(0);

5179 }

5180

5181 /* To avoid memory leak, free the offset buffer if necessary. */

5182 ucol_freeOffsetBuffer(&s);

5183

5184 ok &= result.IsOk();

5185 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }

5186 }

5187

5188 static inline

5189 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {

5190 UBool notIsContinuation = !isContinuation(CE);

5191 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);

5192 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)

5193 || (!notIsContinuation && *wasShifted)))

5194 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */

5195 {

5196 // The stuff below should probably be in the sortkey code... maybe not...

5197 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */

5198 /* we should just completely ignore it */

5199 *wasShifted = TRUE;

5200 //continue;

5201 }

5202 //*wasShifted = TRUE;

5203 return TRUE;

5204 } else {

5205 *wasShifted = FALSE;

5206 return FALSE;

5207 }

5208 }

5209 static inline

5210 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {

5211 if(level < maxLevel) {

5212 dest[i++] = UCOL_LEVELTERMINATOR;

5213 } else {

5214 dest[i++] = 0;

5215 }

5216 }

5217

5218 /** enumeration of level identifiers for partial sort key generation */

5219 enum {

5220 UCOL_PSK_PRIMARY = 0,

5221 UCOL_PSK_SECONDARY = 1,

5222 UCOL_PSK_CASE = 2,

5223 UCOL_PSK_TERTIARY = 3,

5224 UCOL_PSK_QUATERNARY = 4,

5225 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */

5226 UCOL_PSK_IDENTICAL = 6,

5227 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */

5228 UCOL_PSK_LIMIT

5229 };

5230

5231 /** collation state enum. *_SHIFT value is how much to shift right

5232 * to get the state piece to the right. *_MASK value should be

5233 * ANDed with the shifted state. This data is stored in state[1]

5234 * field.

5235 */

5236 enum {

5237 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */

5238 UCOL_PSK_LEVEL_MASK = 7, /** three bits */

5239 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */

5240 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,

5241 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary

5242 * This field is also used to denote that the French secondary level is finished

5243 */

5244 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */

5245 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */

5246 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */

5247 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */

5248 /** When we do French we need to reverse secondary values. However, continuations

5249 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba

5250 */

5251 UCOL_PSK_BOCSU_BYTES_SHIFT = 7,

5252 UCOL_PSK_BOCSU_BYTES_MASK = 3,

5253 UCOL_PSK_CONSUMED_CES_SHIFT = 9,

5254 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF

5255 };

5256

5257 // macro calculating the number of expansion CEs available

5258 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn

5259

5260

5261 /** main sortkey part procedure. On the first call,

5262 * you should pass in a collator, an iterator, empty state

5263 * state[0] == state[1] == 0, a buffer to hold results

5264 * number of bytes you need and an error code pointer.

5265 * Make sure your buffer is big enough to hold the wanted

5266 * number of sortkey bytes. I don't check.

5267 * The only meaningful status you can get back is

5268 * U_BUFFER_OVERFLOW_ERROR, which basically means that you

5269 * have been dealt a raw deal and that you probably won't

5270 * be able to use partial sortkey generation for this

5271 * particular combination of string and collator. This

5272 * is highly unlikely, but you should still check the error code.

5273 * Any other status means that you're not in a sane situation

5274 * anymore. After the first call, preserve state values and

5275 * use them on subsequent calls to obtain more bytes of a sortkey.

5276 * Use until the number of bytes written is smaller than the requested

5277 * number of bytes. Generated sortkey is not compatible with the

5278 * one generated by ucol_getSortKey, as we don't do any compression.

5279 * However, levels are still terminated by a 1 (one) and the sortkey

5280 * is terminated by a 0 (zero). Identical level is the same as in the

5281 * regular sortkey - internal bocu-1 implementation is used.

5282 * For curious, although you cannot do much about this, here is

5283 * the structure of state words.

5284 * state[0] - iterator state. Depends on the iterator implementation,

5285 * but allows the iterator to continue where it stopped in

5286 * the last iteration.

5287 * state[1] - collation processing state. Here is the distribution

5288 * of the bits:

5289 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary

5290 * quaternary, quin (we don't use this one), identical and

5291 * null (producing only zeroes - first one to terminate the

5292 * sortkey and subsequent to fill the buffer).

5293 * 3 - byte count. Number of bytes written on the primary level.

5294 * 4 - was shifted. Whether the previous iteration finished in the

5295 * shifted state.

5296 * 5, 6 - French continuation bytes written. See the comment in the enum

5297 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on

5298 * the identical level.

5299 * 9..31 - CEs consumed. Number of getCE or next32 operations performed

5300 * since the last successful update of the iterator state.

5301 */

5302 U_CAPI int32_t U_EXPORT2	209 U_CAPI int32_t U_EXPORT2

5303 ucol_nextSortKeyPart(const UCollator *coll,	210 ucol_nextSortKeyPart(const UCollator *coll,

5304 UCharIterator *iter,	211 UCharIterator *iter,

5305 uint32_t state[2],	212 uint32_t state[2],

5306 uint8_t *dest, int32_t count,	213 uint8_t *dest, int32_t count,

5307 UErrorCode *status)	214 UErrorCode *status)

5308 {	215 {

5309 /* error checking */	216 /* error checking */

5310 if(status==NULL \|\| U_FAILURE(*status)) {	217 if(status==NULL \|\| U_FAILURE(*status)) {

5311 return 0;	218 return 0;

5312 }	219 }

5313 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);	220 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);

5314 if( coll==NULL \|\| iter==NULL \|\|

5315 state==NULL \|\|

5316 count<0 \|\| (count>0 && dest==NULL)

5317 ) {

5318 *status=U_ILLEGAL_ARGUMENT_ERROR;

5319 UTRACE_EXIT_STATUS(status);

5320 return 0;

5321 }

5322

5323 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count= %d",	221 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count= %d",

5324 coll, iter, state[0], state[1], dest, count);	222 coll, iter, state[0], state[1], dest, count);

5325	223

5326 if(count==0) {	224 int32_t i = Collator::fromUCollator(coll)->

5327 /* nothing to do */	225 internalNextSortKeyPart(iter, state, dest, count, *status);

5328 UTRACE_EXIT_VALUE(0);	226

5329 return 0;

5330 }

5331 /** Setting up situation according to the state we got from the previous ite ration */

5332 // The state of the iterator from the previous invocation

5333 uint32_t iterState = state[0];

5334 // Has the last iteration ended in the shifted state

5335 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_ SHIFTED_MASK)?TRUE:FALSE;

5336 // What is the current level of the sortkey?

5337 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;

5338 // Have we written only one byte from a two byte primary in the previous ite ration?

5339 // Also on secondary level - have we finished with the French secondary?

5340 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;

5341 // number of bytes in the continuation buffer for French

5342 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE D_FRENCH_MASK;

5343 // Number of bytes already written from a bocsu sequence. Since

5344 // the longes bocsu sequence is 4 long, this can be up to 3.

5345 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK _BOCSU_BYTES_MASK;

5346 // Number of elements that need to be consumed in this iteration because

5347 // the iterator returned UITER_NO_STATE at the end of the last iteration,

5348 // so we had to save the last valid state.

5349 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED _CES_MASK;

5350

5351 /** values that depend on the collator attributes */

5352 // strength of the collator.

5353 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);

5354 // maximal level of the partial sortkey. Need to take whether case level is done

5355 int32_t maxLevel = 0;

5356 if(strength < UCOL_TERTIARY) {

5357 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {

5358 maxLevel = UCOL_PSK_CASE;

5359 } else {

5360 maxLevel = strength;

5361 }

5362 } else {

5363 if(strength == UCOL_TERTIARY) {

5364 maxLevel = UCOL_PSK_TERTIARY;

5365 } else if(strength == UCOL_QUATERNARY) {

5366 maxLevel = UCOL_PSK_QUATERNARY;

5367 } else { // identical

5368 maxLevel = UCOL_IDENTICAL;

5369 }

5370 }

5371 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation

5372 uint8_t UCOL_HIRAGANA_QUAD =

5373 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON )?0xFE:0xFF;

5374 // Boundary value that decides whether a CE is shifted or not

5375 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV alue<<16):0;

5376 // Are we doing French collation?

5377 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);

5378

5379 /** initializing the collation state */

5380 UBool notIsContinuation = FALSE;

5381 uint32_t CE = UCOL_NO_MORE_CES;

5382

5383 collIterate s;

5384 IInit_collIterate(coll, NULL, -1, &s, status);

5385 if(U_FAILURE(*status)) {

5386 UTRACE_EXIT_STATUS(*status);

5387 return 0;

5388 }

5389 s.iterator = iter;

5390 s.flags \|= UCOL_USE_ITERATOR;

5391 // This variable tells us whether we have produced some other levels in this iteration

5392 // before we moved to the identical level. In that case, we need to switch t he

5393 // type of the iterator.

5394 UBool doingIdenticalFromStart = FALSE;

5395 // Normalizing iterator

5396 // The division for the array length may truncate the array size to

5397 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high

5398 // for all platforms anyway.

5399 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

5400 UNormIterator *normIter = NULL;

5401 // If the normalization is turned on for the collator and we are below ident ical level

5402 // we will use a FCD normalizing iterator

5403 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le vel < UCOL_PSK_IDENTICAL) {

5404 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);

5405 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);

5406 s.flags &= ~UCOL_ITER_NORM;

5407 if(U_FAILURE(*status)) {

5408 UTRACE_EXIT_STATUS(*status);

5409 return 0;

5410 }

5411 } else if(level == UCOL_PSK_IDENTICAL) {

5412 // for identical level, we need a NFD iterator. We need to instantiate i t here, since we

5413 // will be updating the state - and this cannot be done on an ordinary i terator.

5414 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);

5415 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);

5416 s.flags &= ~UCOL_ITER_NORM;

5417 if(U_FAILURE(*status)) {

5418 UTRACE_EXIT_STATUS(*status);

5419 return 0;

5420 }

5421 doingIdenticalFromStart = TRUE;

5422 }

5423

5424 // This is the tentative new state of the iterator. The problem

5425 // is that the iterator might return an undefined state, in

5426 // which case we should save the last valid state and increase

5427 // the iterator skip value.

5428 uint32_t newState = 0;

5429

5430 // First, we set the iterator to the last valid position

5431 // from the last iteration. This was saved in state[0].

5432 if(iterState == 0) {

5433 /* initial state */

5434 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {

5435 s.iterator->move(s.iterator, 0, UITER_LIMIT);

5436 } else {

5437 s.iterator->move(s.iterator, 0, UITER_START);

5438 }

5439 } else {

5440 /* reset to previous state */

5441 s.iterator->setState(s.iterator, iterState, status);

5442 if(U_FAILURE(*status)) {

5443 UTRACE_EXIT_STATUS(*status);

5444 return 0;

5445 }

5446 }

5447

5448

5449

5450 // This variable tells us whether we can attempt to update the state

5451 // of iterator. Situations where we don't want to update iterator state

5452 // are the existence of expansion CEs that are not yet processed, and

5453 // finishing the case level without enough space in the buffer to insert

5454 // a level terminator.

5455 UBool canUpdateState = TRUE;

5456

5457 // Consume all the CEs that were consumed at the end of the previous

5458 // iteration without updating the iterator state. On identical level,

5459 // consume the code points.

5460 int32_t counter = cces;

5461 if(level < UCOL_PSK_IDENTICAL) {

5462 while(counter-->0) {

5463 // If we're doing French and we are on the secondary level,

5464 // we go backwards.

5465 if(level == UCOL_PSK_SECONDARY && doingFrench) {

5466 CE = ucol_IGetPrevCE(coll, &s, status);

5467 } else {

5468 CE = ucol_IGetNextCE(coll, &s, status);

5469 }

5470 if(CE==UCOL_NO_MORE_CES) {

5471 /* should not happen */

5472 *status=U_INTERNAL_PROGRAM_ERROR;

5473 UTRACE_EXIT_STATUS(*status);

5474 return 0;

5475 }

5476 if(uprv_numAvailableExpCEs(s)) {

5477 canUpdateState = FALSE;

5478 }

5479 }

5480 } else {

5481 while(counter-->0) {

5482 uiter_next32(s.iterator);

5483 }

5484 }

5485

5486 // French secondary needs to know whether the iterator state of zero came fr om previous level OR

5487 // from a new invocation...

5488 UBool wasDoingPrimary = FALSE;

5489 // destination buffer byte counter. When this guy

5490 // gets to count, we're done with the iteration

5491 int32_t i = 0;

5492 // used to count the zero bytes written after we

5493 // have finished with the sort key

5494 int32_t j = 0;

5495

5496

5497 // Hm.... I think we're ready to plunge in. Basic story is as following:

5498 // we have a fall through case based on level. This is used for initial

5499 // positioning on iteration start. Every level processor contains a

5500 // for(;;) which will be broken when we exhaust all the CEs. Other

5501 // way to exit is a goto saveState, which happens when we have filled

5502 // out our buffer.

5503 switch(level) {

5504 case UCOL_PSK_PRIMARY:

5505 wasDoingPrimary = TRUE;

5506 for(;;) {

5507 if(i==count) {

5508 goto saveState;

5509 }

5510 // We should save the state only if we

5511 // are sure that we are done with the

5512 // previous iterator state

5513 if(canUpdateState && byteCountOrFrenchDone == 0) {

5514 newState = s.iterator->getState(s.iterator);

5515 if(newState != UITER_NO_STATE) {

5516 iterState = newState;

5517 cces = 0;

5518 }

5519 }

5520 CE = ucol_IGetNextCE(coll, &s, status);

5521 cces++;

5522 if(CE==UCOL_NO_MORE_CES) {

5523 // Add the level separator

5524 terminatePSKLevel(level, maxLevel, i, dest);

5525 byteCountOrFrenchDone=0;

5526 // Restart the iteration an move to the

5527 // second level

5528 s.iterator->move(s.iterator, 0, UITER_START);

5529 cces = 0;

5530 level = UCOL_PSK_SECONDARY;

5531 break;

5532 }

5533 if(!isContinuation(CE)){

5534 if(coll->leadBytePermutationTable != NULL){

5535 CE = (coll->leadBytePermutationTable[CE>>24] << 24) \| (CE & 0x00FFFFFF);

5536 }

5537 }

5538 if(!isShiftedCE(CE, LVT, &wasShifted)) {

5539 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */

5540 if(CE != 0) {

5541 if(byteCountOrFrenchDone == 0) {

5542 // get the second byte of primary

5543 dest[i++]=(uint8_t)(CE >> 8);

5544 } else {

5545 byteCountOrFrenchDone = 0;

5546 }

5547 if((CE &=0xff)!=0) {

5548 if(i==count) {

5549 /* overflow */

5550 byteCountOrFrenchDone = 1;

5551 cces--;

5552 goto saveState;

5553 }

5554 dest[i++]=(uint8_t)CE;

5555 }

5556 }

5557 }

5558 if(uprv_numAvailableExpCEs(s)) {

5559 canUpdateState = FALSE;

5560 } else {

5561 canUpdateState = TRUE;

5562 }

5563 }

5564 /* fall through to next level */

5565 case UCOL_PSK_SECONDARY:

5566 if(strength >= UCOL_SECONDARY) {

5567 if(!doingFrench) {

5568 for(;;) {

5569 if(i == count) {

5570 goto saveState;

5571 }

5572 // We should save the state only if we

5573 // are sure that we are done with the

5574 // previous iterator state

5575 if(canUpdateState) {

5576 newState = s.iterator->getState(s.iterator);

5577 if(newState != UITER_NO_STATE) {

5578 iterState = newState;

5579 cces = 0;

5580 }

5581 }

5582 CE = ucol_IGetNextCE(coll, &s, status);

5583 cces++;

5584 if(CE==UCOL_NO_MORE_CES) {

5585 // Add the level separator

5586 terminatePSKLevel(level, maxLevel, i, dest);

5587 byteCountOrFrenchDone = 0;

5588 // Restart the iteration an move to the

5589 // second level

5590 s.iterator->move(s.iterator, 0, UITER_START);

5591 cces = 0;

5592 level = UCOL_PSK_CASE;

5593 break;

5594 }

5595 if(!isShiftedCE(CE, LVT, &wasShifted)) {

5596 CE >>= 8; /* get secondary */

5597 if(CE != 0) {

5598 dest[i++]=(uint8_t)CE;

5599 }

5600 }

5601 if(uprv_numAvailableExpCEs(s)) {

5602 canUpdateState = FALSE;

5603 } else {

5604 canUpdateState = TRUE;

5605 }

5606 }

5607 } else { // French secondary processing

5608 uint8_t frenchBuff[UCOL_MAX_BUFFER];

5609 int32_t frenchIndex = 0;

5610 // Here we are going backwards.

5611 // If the iterator is at the beggining, it should be

5612 // moved to end.

5613 if(wasDoingPrimary) {

5614 s.iterator->move(s.iterator, 0, UITER_LIMIT);

5615 cces = 0;

5616 }

5617 for(;;) {

5618 if(i == count) {

5619 goto saveState;

5620 }

5621 if(canUpdateState) {

5622 newState = s.iterator->getState(s.iterator);

5623 if(newState != UITER_NO_STATE) {

5624 iterState = newState;

5625 cces = 0;

5626 }

5627 }

5628 CE = ucol_IGetPrevCE(coll, &s, status);

5629 cces++;

5630 if(CE==UCOL_NO_MORE_CES) {

5631 // Add the level separator

5632 terminatePSKLevel(level, maxLevel, i, dest);

5633 byteCountOrFrenchDone = 0;

5634 // Restart the iteration an move to the next level

5635 s.iterator->move(s.iterator, 0, UITER_START);

5636 level = UCOL_PSK_CASE;

5637 break;

5638 }

5639 if(isContinuation(CE)) { // if it's a continuation, we want to save it and

5640 // reverse when we get a first non-continuation CE.

5641 CE >>= 8;

5642 frenchBuff[frenchIndex++] = (uint8_t)CE;

5643 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {

5644 CE >>= 8; /* get secondary */

5645 if(!frenchIndex) {

5646 if(CE != 0) {

5647 dest[i++]=(uint8_t)CE;

5648 }

5649 } else {

5650 frenchBuff[frenchIndex++] = (uint8_t)CE;

5651 frenchIndex -= usedFrench;

5652 usedFrench = 0;

5653 while(i < count && frenchIndex) {

5654 dest[i++] = frenchBuff[--frenchIndex];

5655 usedFrench++;

5656 }

5657 }

5658 }

5659 if(uprv_numAvailableExpCEs(s)) {

5660 canUpdateState = FALSE;

5661 } else {

5662 canUpdateState = TRUE;

5663 }

5664 }

5665 }

5666 } else {

5667 level = UCOL_PSK_CASE;

5668 }

5669 /* fall through to next level */

5670 case UCOL_PSK_CASE:

5671 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {

5672 uint32_t caseShift = UCOL_CASE_SHIFT_START;

5673 uint8_t caseByte = UCOL_CASE_BYTE_START;

5674 uint8_t caseBits = 0;

5675

5676 for(;;) {

5677 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);

5678 if(i == count) {

5679 goto saveState;

5680 }

5681 // We should save the state only if we

5682 // are sure that we are done with the

5683 // previous iterator state

5684 if(canUpdateState) {

5685 newState = s.iterator->getState(s.iterator);

5686 if(newState != UITER_NO_STATE) {

5687 iterState = newState;

5688 cces = 0;

5689 }

5690 }

5691 CE = ucol_IGetNextCE(coll, &s, status);

5692 cces++;

5693 if(CE==UCOL_NO_MORE_CES) {

5694 // On the case level we might have an unfinished

5695 // case byte. Add one if it's started.

5696 if(caseShift != UCOL_CASE_SHIFT_START) {

5697 dest[i++] = caseByte;

5698 }

5699 cces = 0;

5700 // We have finished processing CEs on this level.

5701 // However, we don't know if we have enough space

5702 // to add a case level terminator.

5703 if(i < count) {

5704 // Add the level separator

5705 terminatePSKLevel(level, maxLevel, i, dest);

5706 // Restart the iteration and move to the

5707 // next level

5708 s.iterator->move(s.iterator, 0, UITER_START);

5709 level = UCOL_PSK_TERTIARY;

5710 } else {

5711 canUpdateState = FALSE;

5712 }

5713 break;

5714 }

5715

5716 if(!isShiftedCE(CE, LVT, &wasShifted)) {

5717 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 \|\| s trength > UCOL_PRIMARY)) {

5718 // do the case level if we need to do it. We don't want to calculate

5719 // case level for primary ignorables if we have only pri mary strength and case level

5720 // otherwise we would break well formedness of CEs

5721 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);

5722 caseBits = (uint8_t)(CE & 0xC0);

5723 // this copies the case level logic from the

5724 // sort key generation code

5725 if(CE != 0) {

5726 if (caseShift == 0) {

5727 dest[i++] = caseByte;

5728 caseShift = UCOL_CASE_SHIFT_START;

5729 caseByte = UCOL_CASE_BYTE_START;

5730 }

5731 if(coll->caseFirst == UCOL_UPPER_FIRST) {

5732 if((caseBits & 0xC0) == 0) {

5733 caseByte \|= 1 << (--caseShift);

5734 } else {

5735 caseByte \|= 0 << (--caseShift);

5736 /* second bit */

5737 if(caseShift == 0) {

5738 dest[i++] = caseByte;

5739 caseShift = UCOL_CASE_SHIFT_START;

5740 caseByte = UCOL_CASE_BYTE_START;

5741 }

5742 caseByte \|= ((caseBits>>6)&1) << (--caseShif t);

5743 }

5744 } else {

5745 if((caseBits & 0xC0) == 0) {

5746 caseByte \|= 0 << (--caseShift);

5747 } else {

5748 caseByte \|= 1 << (--caseShift);

5749 /* second bit */

5750 if(caseShift == 0) {

5751 dest[i++] = caseByte;

5752 caseShift = UCOL_CASE_SHIFT_START;

5753 caseByte = UCOL_CASE_BYTE_START;

5754 }

5755 caseByte \|= ((caseBits>>7)&1) << (--caseShif t);

5756 }

5757 }

5758 }

5759

5760 }

5761 }

5762 // Not sure this is correct for the case level - revisit

5763 if(uprv_numAvailableExpCEs(s)) {

5764 canUpdateState = FALSE;

5765 } else {

5766 canUpdateState = TRUE;

5767 }

5768 }

5769 } else {

5770 level = UCOL_PSK_TERTIARY;

5771 }

5772 /* fall through to next level */

5773 case UCOL_PSK_TERTIARY:

5774 if(strength >= UCOL_TERTIARY) {

5775 for(;;) {

5776 if(i == count) {

5777 goto saveState;

5778 }

5779 // We should save the state only if we

5780 // are sure that we are done with the

5781 // previous iterator state

5782 if(canUpdateState) {

5783 newState = s.iterator->getState(s.iterator);

5784 if(newState != UITER_NO_STATE) {

5785 iterState = newState;

5786 cces = 0;

5787 }

5788 }

5789 CE = ucol_IGetNextCE(coll, &s, status);

5790 cces++;

5791 if(CE==UCOL_NO_MORE_CES) {

5792 // Add the level separator

5793 terminatePSKLevel(level, maxLevel, i, dest);

5794 byteCountOrFrenchDone = 0;

5795 // Restart the iteration an move to the

5796 // second level

5797 s.iterator->move(s.iterator, 0, UITER_START);

5798 cces = 0;

5799 level = UCOL_PSK_QUATERNARY;

5800 break;

5801 }

5802 if(!isShiftedCE(CE, LVT, &wasShifted)) {

5803 notIsContinuation = !isContinuation(CE);

5804

5805 if(notIsContinuation) {

5806 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);

5807 CE ^= coll->caseSwitch;

5808 CE &= coll->tertiaryMask;

5809 } else {

5810 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));

5811 }

5812

5813 if(CE != 0) {

5814 dest[i++]=(uint8_t)CE;

5815 }

5816 }

5817 if(uprv_numAvailableExpCEs(s)) {

5818 canUpdateState = FALSE;

5819 } else {

5820 canUpdateState = TRUE;

5821 }

5822 }

5823 } else {

5824 // if we're not doing tertiary

5825 // skip to the end

5826 level = UCOL_PSK_NULL;

5827 }

5828 /* fall through to next level */

5829 case UCOL_PSK_QUATERNARY:

5830 if(strength >= UCOL_QUATERNARY) {

5831 for(;;) {

5832 if(i == count) {

5833 goto saveState;

5834 }

5835 // We should save the state only if we

5836 // are sure that we are done with the

5837 // previous iterator state

5838 if(canUpdateState) {

5839 newState = s.iterator->getState(s.iterator);

5840 if(newState != UITER_NO_STATE) {

5841 iterState = newState;

5842 cces = 0;

5843 }

5844 }

5845 CE = ucol_IGetNextCE(coll, &s, status);

5846 cces++;

5847 if(CE==UCOL_NO_MORE_CES) {

5848 // Add the level separator

5849 terminatePSKLevel(level, maxLevel, i, dest);

5850 //dest[i++] = UCOL_LEVELTERMINATOR;

5851 byteCountOrFrenchDone = 0;

5852 // Restart the iteration an move to the

5853 // second level

5854 s.iterator->move(s.iterator, 0, UITER_START);

5855 cces = 0;

5856 level = UCOL_PSK_QUIN;

5857 break;

5858 }

5859 if(CE==0)

5860 continue;

5861 if(isShiftedCE(CE, LVT, &wasShifted)) {

5862 CE >>= 16; /* get primary */

5863 if(CE != 0) {

5864 if(byteCountOrFrenchDone == 0) {

5865 dest[i++]=(uint8_t)(CE >> 8);

5866 } else {

5867 byteCountOrFrenchDone = 0;

5868 }

5869 if((CE &=0xff)!=0) {

5870 if(i==count) {

5871 /* overflow */

5872 byteCountOrFrenchDone = 1;

5873 goto saveState;

5874 }

5875 dest[i++]=(uint8_t)CE;

5876 }

5877 }

5878 } else {

5879 notIsContinuation = !isContinuation(CE);

5880 if(notIsContinuation) {

5881 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a nd we need to note it

5882 dest[i++] = UCOL_HIRAGANA_QUAD;

5883 } else {

5884 dest[i++] = 0xFF;

5885 }

5886 }

5887 }

5888 if(uprv_numAvailableExpCEs(s)) {

5889 canUpdateState = FALSE;

5890 } else {

5891 canUpdateState = TRUE;

5892 }

5893 }

5894 } else {

5895 // if we're not doing quaternary

5896 // skip to the end

5897 level = UCOL_PSK_NULL;

5898 }

5899 /* fall through to next level */

5900 case UCOL_PSK_QUIN:

5901 level = UCOL_PSK_IDENTICAL;

5902 /* fall through to next level */

5903 case UCOL_PSK_IDENTICAL:

5904 if(strength >= UCOL_IDENTICAL) {

5905 UChar32 first, second;

5906 int32_t bocsuBytesWritten = 0;

5907 // We always need to do identical on

5908 // the NFD form of the string.

5909 if(normIter == NULL) {

5910 // we arrived from the level below and

5911 // normalization was not turned on.

5912 // therefore, we need to make a fresh NFD iterator

5913 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);

5914 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);

5915 } else if(!doingIdenticalFromStart) {

5916 // there is an iterator, but we did some other levels.

5917 // therefore, we have a FCD iterator - need to make

5918 // a NFD one.

5919 // normIter being at the beginning does not guarantee

5920 // that the underlying iterator is at the beginning

5921 iter->move(iter, 0, UITER_START);

5922 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);

5923 }

5924 // At this point we have a NFD iterator that is positioned

5925 // in the right place

5926 if(U_FAILURE(*status)) {

5927 UTRACE_EXIT_STATUS(*status);

5928 return 0;

5929 }

5930 first = uiter_previous32(s.iterator);

5931 // maybe we're at the start of the string

5932 if(first == U_SENTINEL) {

5933 first = 0;

5934 } else {

5935 uiter_next32(s.iterator);

5936 }

5937

5938 j = 0;

5939 for(;;) {

5940 if(i == count) {

5941 if(j+1 < bocsuBytesWritten) {

5942 bocsuBytesUsed = j+1;

5943 }

5944 goto saveState;

5945 }

5946

5947 // On identical level, we will always save

5948 // the state if we reach this point, since

5949 // we don't depend on getNextCE for content

5950 // all the content is in our buffer and we

5951 // already either stored the full buffer OR

5952 // otherwise we won't arrive here.

5953 newState = s.iterator->getState(s.iterator);

5954 if(newState != UITER_NO_STATE) {

5955 iterState = newState;

5956 cces = 0;

5957 }

5958

5959 uint8_t buff[4];

5960 second = uiter_next32(s.iterator);

5961 cces++;

5962

5963 // end condition for identical level

5964 if(second == U_SENTINEL) {

5965 terminatePSKLevel(level, maxLevel, i, dest);

5966 level = UCOL_PSK_NULL;

5967 break;

5968 }

5969 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco nd, buff);

5970 first = second;

5971

5972 j = 0;

5973 if(bocsuBytesUsed != 0) {

5974 while(bocsuBytesUsed-->0) {

5975 j++;

5976 }

5977 }

5978

5979 while(i < count && j < bocsuBytesWritten) {

5980 dest[i++] = buff[j++];

5981 }

5982 }

5983

5984 } else {

5985 level = UCOL_PSK_NULL;

5986 }

5987 /* fall through to next level */

5988 case UCOL_PSK_NULL:

5989 j = i;

5990 while(j<count) {

5991 dest[j++]=0;

5992 }

5993 break;

5994 default:

5995 *status = U_INTERNAL_PROGRAM_ERROR;

5996 UTRACE_EXIT_STATUS(*status);

5997 return 0;

5998 }

5999

6000 saveState:

6001 // Now we need to return stuff. First we want to see whether we have

6002 // done everything for the current state of iterator.

6003 if(byteCountOrFrenchDone

6004 \|\| canUpdateState == FALSE

6005 \|\| (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)

6006 {

6007 // Any of above mean that the previous transaction

6008 // wasn't finished and that we should store the

6009 // previous iterator state.

6010 state[0] = iterState;

6011 } else {

6012 // The transaction is complete. We will continue in the next iteration.

6013 state[0] = s.iterator->getState(s.iterator);

6014 cces = 0;

6015 }

6016 // Store the number of bocsu bytes written.

6017 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {

6018 *status = U_INDEX_OUTOFBOUNDS_ERROR;

6019 }

6020 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY TES_SHIFT;

6021

6022 // Next we put in the level of comparison

6023 state[1] \|= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);

6024

6025 // If we are doing French, we need to store whether we have just finished th e French level

6026 if(level == UCOL_PSK_SECONDARY && doingFrench) {

6027 state[1] \|= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_D ONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);

6028 } else {

6029 state[1] \|= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE _MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);

6030 }

6031

6032 // Was the latest CE shifted

6033 if(wasShifted) {

6034 state[1] \|= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;

6035 }

6036 // Check for cces overflow

6037 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {

6038 *status = U_INDEX_OUTOFBOUNDS_ERROR;

6039 }

6040 // Store cces

6041 state[1] \|= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH IFT);

6042

6043 // Check for French overflow

6044 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {

6045 *status = U_INDEX_OUTOFBOUNDS_ERROR;

6046 }

6047 // Store number of bytes written in the French secondary continuation sequen ce

6048 state[1] \|= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC H_SHIFT);

6049

6050

6051 // If we have used normalizing iterator, get rid of it

6052 if(normIter != NULL) {

6053 unorm_closeIter(normIter);

6054 }

6055

6056 /* To avoid memory leak, free the offset buffer if necessary. */

6057 ucol_freeOffsetBuffer(&s);

6058

6059 // Return number of meaningful sortkey bytes.	227 // Return number of meaningful sortkey bytes.

6060 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",	228 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",

6061 dest,i, state[0], state[1]);	229 dest,i, state[0], state[1]);

6062 UTRACE_EXIT_VALUE(i);	230 UTRACE_EXIT_VALUE_STATUS(i, *status);

6063 return i;	231 return i;

6064 }	232 }

6065	233

6066 /**	234 /**

6067 * Produce a bound for a given sortkey and a number of levels.	235 * Produce a bound for a given sortkey and a number of levels.

6068 */	236 */

6069 U_CAPI int32_t U_EXPORT2	237 U_CAPI int32_t U_EXPORT2

6070 ucol_getBound(const uint8_t *source,	238 ucol_getBound(const uint8_t *source,

6071 int32_t sourceLength,	239 int32_t sourceLength,

6072 UColBoundMode boundType,	240 UColBoundMode boundType,

6073 uint32_t noOfLevels,	241 uint32_t noOfLevels,

6074 uint8_t *result,	242 uint8_t *result,

6075 int32_t resultLength,	243 int32_t resultLength,

6076 UErrorCode *status)	244 UErrorCode *status)

6077 {	245 {

6078 // consistency checks	246 // consistency checks

6079 if(status == NULL \|\| U_FAILURE(*status)) {	247 if(status == NULL \|\| U_FAILURE(*status)) {

6080 return 0;	248 return 0;

6081 }	249 }

6082 if(source == NULL) {	250 if(source == NULL) {

6083 *status = U_ILLEGAL_ARGUMENT_ERROR;	251 *status = U_ILLEGAL_ARGUMENT_ERROR;

6084 return 0;	252 return 0;

6085 }	253 }

6086	254

6087 int32_t sourceIndex = 0;	255 int32_t sourceIndex = 0;

6088 // Scan the string until we skip enough of the key OR reach the end of the k ey	256 // Scan the string until we skip enough of the key OR reach the end of the k ey

6089 do {	257 do {

6090 sourceIndex++;	258 sourceIndex++;

6091 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {	259 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) {

6092 noOfLevels--;	260 noOfLevels--;

6093 }	261 }

6094 } while (noOfLevels > 0	262 } while (noOfLevels > 0

6095 && (source[sourceIndex] != 0 \|\| sourceIndex < sourceLength));	263 && (source[sourceIndex] != 0 \|\| sourceIndex < sourceLength));

6096	264

6097 if((source[sourceIndex] == 0 \|\| sourceIndex == sourceLength)	265 if((source[sourceIndex] == 0 \|\| sourceIndex == sourceLength)

6098 && noOfLevels > 0) {	266 && noOfLevels > 0) {

6099 *status = U_SORT_KEY_TOO_SHORT_WARNING;	267 *status = U_SORT_KEY_TOO_SHORT_WARNING;

6100 }	268 }

6101	269

(...skipping 22 matching lines...) Expand all Loading...
6124 return 0;	292 return 0;

6125 }	293 }

6126 result[sourceIndex++] = 0;	294 result[sourceIndex++] = 0;

6127	295

6128 return sourceIndex;	296 return sourceIndex;

6129 } else {	297 } else {

6130 return sourceIndex+boundType+1;	298 return sourceIndex+boundType+1;

6131 }	299 }

6132 }	300 }

6133	301

6134 /****************************************************************************/	302 U_CAPI void U_EXPORT2

6135 /* Following are the functions that deal with the properties of a collator */	303 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCo de) {

6136 /* there are new APIs and some compatibility APIs */	304 if(U_FAILURE(*pErrorCode)) { return; }

6137 /****************************************************************************/	305 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode);

6138	306 }

6139 static inline void	307

6140 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,	308 U_CAPI UColReorderCode U_EXPORT2

6141 int32_t *primShift, int32_t *secShift, int32_t *terShift)	309 ucol_getMaxVariable(const UCollator *coll) {

6142 {	310 return Collator::fromUCollator(coll)->getMaxVariable();

6143 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;

6144 UBool reverseSecondary = FALSE;

6145 UBool continuation = isContinuation(CE);

6146 if(!continuation) {

6147 tertiary = (uint8_t)((CE & coll->tertiaryMask));

6148 tertiary ^= coll->caseSwitch;

6149 reverseSecondary = TRUE;

6150 } else {

6151 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));

6152 tertiary &= UCOL_REMOVE_CASE;

6153 reverseSecondary = FALSE;

6154 }

6155

6156 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);

6157 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);

6158 primary1 = (uint8_t)(CE >> 8);

6159

6160 if(primary1 != 0) {

6161 if (coll->leadBytePermutationTable != NULL && !continuation) {

6162 primary1 = coll->leadBytePermutationTable[primary1];

6163 }

6164

6165 coll->latinOneCEs[ch] \|= (primary1 << *primShift);

6166 *primShift -= 8;

6167 }

6168 if(primary2 != 0) {

6169 if(*primShift < 0) {

6170 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;

6171 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;

6172 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;

6173 return;

6174 }

6175 coll->latinOneCEs[ch] \|= (primary2 << *primShift);

6176 *primShift -= 8;

6177 }

6178 if(secondary != 0) {

6179 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se condary

6180 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo r secondary

6181 coll->latinOneCEs[coll->latinOneTableLen+ch] \|= (secondary << 24);

6182 } else { // normal case

6183 coll->latinOneCEs[coll->latinOneTableLen+ch] \|= (secondary << *secSh ift);

6184 }

6185 *secShift -= 8;

6186 }

6187 if(tertiary != 0) {

6188 coll->latinOneCEs[2*coll->latinOneTableLen+ch] \|= (tertiary << *terShift );

6189 *terShift -= 8;

6190 }

6191 }

6192

6193 static inline UBool

6194 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {

6195 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);

6196 if(newTable == NULL) {

6197 *status = U_MEMORY_ALLOCATION_ERROR;

6198 coll->latinOneFailed = TRUE;

6199 return FALSE;

6200 }

6201 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable Len)*sizeof(uint32_t);

6202 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);

6203 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);

6204 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC opy);

6205 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz eToCopy);

6206 coll->latinOneTableLen = size;

6207 uprv_free(coll->latinOneCEs);

6208 coll->latinOneCEs = newTable;

6209 return TRUE;

6210 }

6211

6212 static UBool

6213 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {

6214 UBool result = TRUE;

6215 if(coll->latinOneCEs == NULL) {

6216 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);

6217 if(coll->latinOneCEs == NULL) {

6218 *status = U_MEMORY_ALLOCATION_ERROR;

6219 return FALSE;

6220 }

6221 coll->latinOneTableLen = UCOL_LATINONETABLELEN;

6222 }

6223 UChar ch = 0;

6224 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);

6225 // Check for null pointer

6226 if (U_FAILURE(*status)) {

6227 ucol_closeElements(it);

6228 return FALSE;

6229 }

6230 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

6231

6232 int32_t primShift = 24, secShift = 24, terShift = 24;

6233 uint32_t CE = 0;

6234 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

6235

6236 // TODO: make safe if you get more than you wanted...

6237 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {

6238 primShift = 24; secShift = 24; terShift = 24;

6239 if(ch < 0x100) {

6240 CE = coll->latinOneMapping[ch];

6241 } else {

6242 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);

6243 if(CE == UCOL_NOT_FOUND && coll->UCA) {

6244 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

6245 }

6246 }

6247 if(CE < UCOL_NOT_FOUND) {

6248 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift );

6249 } else {

6250 switch (getCETag(CE)) {

6251 case EXPANSION_TAG:

6252 case DIGIT_TAG:

6253 ucol_setText(it, &ch, 1, status);

6254 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {

6255 if(primShift < 0 \|\| secShift < 0 \|\| terShift < 0) {

6256 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;

6257 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL _OUT_CE;

6258 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA IL_OUT_CE;

6259 break;

6260 }

6261 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, & terShift);

6262 }

6263 break;

6264 case CONTRACTION_TAG:

6265 // here is the trick

6266 // F2 is contraction. We do something very similar to contractio ns

6267 // but have two indices, one in the real contraction table and t he

6268 // other to where we stuffed things. This hopes that we don't ha ve

6269 // many contractions (this should work for latin-1 tables).

6270 {

6271 if((CE & 0x00FFF000) != 0) {

6272 *status = U_UNSUPPORTED_ERROR;

6273 goto cleanup_after_failure;

6274 }

6275

6276 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

6277

6278 CE \|= (contractionOffset & 0xFFF) << 12; // insert the offse t in latin-1 table

6279

6280 coll->latinOneCEs[ch] = CE;

6281 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;

6282 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

6283

6284 // We're going to jump into contraction table, pick the elem ents

6285 // and use them

6286 do {

6287 CE = *(coll->contractionCEs +

6288 (UCharOffset - coll->contractionIndex));

6289 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {

6290 uint32_t size;

6291 uint32_t i; /* general counter */

6292 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */

6293 size = getExpansionCount(CE);

6294 //CE = *CEOffset++;

6295 if(size != 0) { /* if there are less than 16 element s in expansion, we don't terminate */

6296 for(i = 0; i<size; i++) {

6297 if(primShift < 0 \|\| secShift < 0 \|\| terShift < 0) {

6298 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;

6299 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;

6300 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;

6301 break;

6302 }

6303 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);

6304 }

6305 } else { /* else, we do */

6306 while(*CEOffset != 0) {

6307 if(primShift < 0 \|\| secShift < 0 \|\| terShift < 0) {

6308 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;

6309 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;

6310 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;

6311 break;

6312 }

6313 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);

6314 }

6315 }

6316 contractionOffset++;

6317 } else if(CE < UCOL_NOT_FOUND) {

6318 ucol_addLatinOneEntry(coll, (UChar)contractionOffset ++, CE, &primShift, &secShift, &terShift);

6319 } else {

6320 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B AIL_OUT_CE;

6321 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont ractionOffset] = UCOL_BAIL_OUT_CE;

6322 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co ntractionOffset] = UCOL_BAIL_OUT_CE;

6323 contractionOffset++;

6324 }

6325 UCharOffset++;

6326 primShift = 24; secShift = 24; terShift = 24;

6327 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate

6328 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT ableLen, status)) {

6329 goto cleanup_after_failure;

6330 }

6331 }

6332 } while(*UCharOffset != 0xFFFF);

6333 }

6334 break;;

6335 case SPEC_PROC_TAG:

6336 {

6337 // 0xB7 is a precontext character defined in UCA5.1, a speci al

6338 // handle is implemeted in order to save LatinOne table for

6339 // most locales.

6340 if (ch==0xb7) {

6341 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif t, &terShift);

6342 }

6343 else {

6344 goto cleanup_after_failure;

6345 }

6346 }

6347 break;

6348 default:

6349 goto cleanup_after_failure;

6350 }

6351 }

6352 }

6353 // compact table

6354 if(contractionOffset < coll->latinOneTableLen) {

6355 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {

6356 goto cleanup_after_failure;

6357 }

6358 }

6359 ucol_closeElements(it);

6360 return result;

6361

6362 cleanup_after_failure:

6363 // status should already be set before arriving here.

6364 coll->latinOneFailed = TRUE;

6365 ucol_closeElements(it);

6366 return FALSE;

6367 }

6368

6369 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {

6370 if(U_SUCCESS(*status)) {

6371 if(coll->caseFirst == UCOL_UPPER_FIRST) {

6372 coll->caseSwitch = UCOL_CASE_SWITCH;

6373 } else {

6374 coll->caseSwitch = UCOL_NO_CASE_SWITCH;

6375 }

6376

6377 if(coll->caseLevel == UCOL_ON \|\| coll->caseFirst == UCOL_OFF) {

6378 coll->tertiaryMask = UCOL_REMOVE_CASE;

6379 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;

6380 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */

6381 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;

6382 coll->tertiaryBottom = UCOL_COMMON_BOT3;

6383 } else {

6384 coll->tertiaryMask = UCOL_KEEP_CASE;

6385 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;

6386 if(coll->caseFirst == UCOL_UPPER_FIRST) {

6387 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;

6388 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;

6389 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;

6390 } else {

6391 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;

6392 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;

6393 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;

6394 }

6395 }

6396

6397 /* Set the compression values */

6398 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBott om - 1);

6399 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */

6400 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC ount);

6401

6402 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY

6403 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U COL_NON_IGNORABLE)

6404 {

6405 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;

6406 } else {

6407 coll->sortKeyGen = ucol_calcSortKey;

6408 }

6409 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col l->numericCollation == UCOL_OFF

6410 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF ailed)

6411 {

6412 if(coll->latinOneCEs == NULL \|\| coll->latinOneRegenTable) {

6413 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build ing latin1 table, we'll use it

6414 //fprintf(stderr, "F");

6415 coll->latinOneUse = TRUE;

6416 } else {

6417 coll->latinOneUse = FALSE;

6418 }

6419 if(*status == U_UNSUPPORTED_ERROR) {

6420 *status = U_ZERO_ERROR;

6421 }

6422 } else { // latin1Table exists and it doesn't need to be regenerated , just use it

6423 coll->latinOneUse = TRUE;

6424 }

6425 } else {

6426 coll->latinOneUse = FALSE;

6427 }

6428 }

6429 }	311 }

6430	312

6431 U_CAPI uint32_t U_EXPORT2	313 U_CAPI uint32_t U_EXPORT2

6432 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {	314 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {

6433 if(U_FAILURE(*status) \|\| coll == NULL) {	315 if(U_FAILURE(*status) \|\| coll == NULL) {

6434 return 0;	316 return 0;

6435 }	317 }

6436 if(len == -1) {	318 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status);

6437 len = u_strlen(varTop);

6438 }

6439 if(len == 0) {

6440 *status = U_ILLEGAL_ARGUMENT_ERROR;

6441 return 0;

6442 }

6443

6444 if(coll->delegate!=NULL) {

6445 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);

6446 }

6447

6448

6449 collIterate s;

6450 IInit_collIterate(coll, varTop, len, &s, status);

6451 if(U_FAILURE(*status)) {

6452 return 0;

6453 }

6454

6455 uint32_t CE = ucol_IGetNextCE(coll, &s, status);

6456

6457 /* here we check if we have consumed all characters */

6458 /* you can put in either one character or a contraction */

6459 /* you shouldn't put more... */

6460 if(s.pos != s.endp \|\| CE == UCOL_NO_MORE_CES) {

6461 *status = U_CE_NOT_FOUND_ERROR;

6462 return 0;

6463 }

6464

6465 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);

6466

6467 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {

6468 *status = U_PRIMARY_TOO_LONG_ERROR;

6469 return 0;

6470 }

6471 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {

6472 coll->variableTopValueisDefault = FALSE;

6473 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;

6474 }

6475

6476 /* To avoid memory leak, free the offset buffer if necessary. */

6477 ucol_freeOffsetBuffer(&s);

6478

6479 return CE & UCOL_PRIMARYMASK;

6480 }	319 }

6481	320

6482 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {	321 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {

6483 if(U_FAILURE(*status) \|\| coll == NULL) {	322 if(U_FAILURE(*status) \|\| coll == NULL) {

6484 return 0;	323 return 0;

6485 }	324 }

6486 if(coll->delegate!=NULL) {	325 return Collator::fromUCollator(coll)->getVariableTop(*status);

6487 return ((const Collator*)coll->delegate)->getVariableTop(*status);

6488 }

6489 return coll->variableTopValue<<16;

6490 }	326 }

6491	327

6492 U_CAPI void U_EXPORT2	328 U_CAPI void U_EXPORT2

6493 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {	329 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {

6494 if(U_FAILURE(*status) \|\| coll == NULL) {	330 if(U_FAILURE(*status) \|\| coll == NULL) {

6495 return;	331 return;

6496 }	332 }

6497	333 Collator::fromUCollator(coll)->setVariableTop(varTop, *status);

6498 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {	334 }

6499 coll->variableTopValueisDefault = FALSE;	335

6500 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;

6501 }

6502 }

6503 /* Attribute setter API */

6504 U_CAPI void U_EXPORT2	336 U_CAPI void U_EXPORT2

6505 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {	337 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {

6506 if(U_FAILURE(*status) \|\| coll == NULL) {	338 if(U_FAILURE(*status) \|\| coll == NULL) {

6507 return;	339 return;

6508 }	340 }

6509	341

6510 if(coll->delegate != NULL) {	342 Collator::fromUCollator(coll)->setAttribute(attr, value, *status);

6511 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);

6512 return;

6513 }

6514

6515 UColAttributeValue oldFrench = coll->frenchCollation;

6516 UColAttributeValue oldCaseFirst = coll->caseFirst;

6517 switch(attr) {

6518 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */

6519 if(value == UCOL_ON) {

6520 coll->numericCollation = UCOL_ON;

6521 coll->numericCollationisDefault = FALSE;

6522 } else if (value == UCOL_OFF) {

6523 coll->numericCollation = UCOL_OFF;

6524 coll->numericCollationisDefault = FALSE;

6525 } else if (value == UCOL_DEFAULT) {

6526 coll->numericCollationisDefault = TRUE;

6527 coll->numericCollation = (UColAttributeValue)coll->options->numericC ollation;

6528 } else {

6529 *status = U_ILLEGAL_ARGUMENT_ERROR;

6530 }

6531 break;

6532 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan a */

6533 if(value == UCOL_ON \|\| value == UCOL_OFF \|\| value == UCOL_DEFAULT) {

6534 // This attribute is an implementation detail of the CLDR Japanese t ailoring.

6535 // The implementation might change to use a different mechanism

6536 // to achieve the same Japanese sort order.

6537 // Since ICU 50, this attribute is not settable any more via API fun ctions.

6538 } else {

6539 *status = U_ILLEGAL_ARGUMENT_ERROR;

6540 }

6541 break;

6542 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /

6543 if(value == UCOL_ON) {

6544 coll->frenchCollation = UCOL_ON;

6545 coll->frenchCollationisDefault = FALSE;

6546 } else if (value == UCOL_OFF) {

6547 coll->frenchCollation = UCOL_OFF;

6548 coll->frenchCollationisDefault = FALSE;

6549 } else if (value == UCOL_DEFAULT) {

6550 coll->frenchCollationisDefault = TRUE;

6551 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol lation;

6552 } else {

6553 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6554 }

6555 break;

6556 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/

6557 if(value == UCOL_SHIFTED) {

6558 coll->alternateHandling = UCOL_SHIFTED;

6559 coll->alternateHandlingisDefault = FALSE;

6560 } else if (value == UCOL_NON_IGNORABLE) {

6561 coll->alternateHandling = UCOL_NON_IGNORABLE;

6562 coll->alternateHandlingisDefault = FALSE;

6563 } else if (value == UCOL_DEFAULT) {

6564 coll->alternateHandlingisDefault = TRUE;

6565 coll->alternateHandling = (UColAttributeValue)coll->options->alterna teHandling ;

6566 } else {

6567 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6568 }

6569 break;

6570 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */

6571 if(value == UCOL_LOWER_FIRST) {

6572 coll->caseFirst = UCOL_LOWER_FIRST;

6573 coll->caseFirstisDefault = FALSE;

6574 } else if (value == UCOL_UPPER_FIRST) {

6575 coll->caseFirst = UCOL_UPPER_FIRST;

6576 coll->caseFirstisDefault = FALSE;

6577 } else if (value == UCOL_OFF) {

6578 coll->caseFirst = UCOL_OFF;

6579 coll->caseFirstisDefault = FALSE;

6580 } else if (value == UCOL_DEFAULT) {

6581 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;

6582 coll->caseFirstisDefault = TRUE;

6583 } else {

6584 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6585 }

6586 break;

6587 case UCOL_CASE_LEVEL: /* do we have an extra case level */

6588 if(value == UCOL_ON) {

6589 coll->caseLevel = UCOL_ON;

6590 coll->caseLevelisDefault = FALSE;

6591 } else if (value == UCOL_OFF) {

6592 coll->caseLevel = UCOL_OFF;

6593 coll->caseLevelisDefault = FALSE;

6594 } else if (value == UCOL_DEFAULT) {

6595 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;

6596 coll->caseLevelisDefault = TRUE;

6597 } else {

6598 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6599 }

6600 break;

6601 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */

6602 if(value == UCOL_ON) {

6603 coll->normalizationMode = UCOL_ON;

6604 coll->normalizationModeisDefault = FALSE;

6605 initializeFCD(status);

6606 } else if (value == UCOL_OFF) {

6607 coll->normalizationMode = UCOL_OFF;

6608 coll->normalizationModeisDefault = FALSE;

6609 } else if (value == UCOL_DEFAULT) {

6610 coll->normalizationModeisDefault = TRUE;

6611 coll->normalizationMode = (UColAttributeValue)coll->options->normali zationMode;

6612 if(coll->normalizationMode == UCOL_ON) {

6613 initializeFCD(status);

6614 }

6615 } else {

6616 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6617 }

6618 break;

6619 case UCOL_STRENGTH: /* attribute for strength */

6620 if (value == UCOL_DEFAULT) {

6621 coll->strengthisDefault = TRUE;

6622 coll->strength = (UColAttributeValue)coll->options->strength;

6623 } else if (value <= UCOL_IDENTICAL) {

6624 coll->strengthisDefault = FALSE;

6625 coll->strength = value;

6626 } else {

6627 *status = U_ILLEGAL_ARGUMENT_ERROR ;

6628 }

6629 break;

6630 case UCOL_ATTRIBUTE_COUNT:

6631 default:

6632 *status = U_ILLEGAL_ARGUMENT_ERROR;

6633 break;

6634 }

6635 if(oldFrench != coll->frenchCollation \|\| oldCaseFirst != coll->caseFirst) {

6636 coll->latinOneRegenTable = TRUE;

6637 } else {

6638 coll->latinOneRegenTable = FALSE;

6639 }

6640 ucol_updateInternalState(coll, status);

6641 }	343 }

6642	344

6643 U_CAPI UColAttributeValue U_EXPORT2	345 U_CAPI UColAttributeValue U_EXPORT2

6644 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {	346 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {

6645 if(U_FAILURE(*status) \|\| coll == NULL) {	347 if(U_FAILURE(*status) \|\| coll == NULL) {

6646 return UCOL_DEFAULT;	348 return UCOL_DEFAULT;

6647 }	349 }

6648	350

6649 if(coll->delegate != NULL) {	351 return Collator::fromUCollator(coll)->getAttribute(attr, *status);

6650 return ((Collator*)coll->delegate)->getAttribute(attr,*status);

6651 }

6652

6653 switch(attr) {

6654 case UCOL_NUMERIC_COLLATION:

6655 return coll->numericCollation;

6656 case UCOL_HIRAGANA_QUATERNARY_MODE:

6657 return coll->hiraganaQ;

6658 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /

6659 return coll->frenchCollation;

6660 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/

6661 return coll->alternateHandling;

6662 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */

6663 return coll->caseFirst;

6664 case UCOL_CASE_LEVEL: /* do we have an extra case level */

6665 return coll->caseLevel;

6666 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */

6667 return coll->normalizationMode;

6668 case UCOL_STRENGTH: /* attribute for strength */

6669 return coll->strength;

6670 case UCOL_ATTRIBUTE_COUNT:

6671 default:

6672 *status = U_ILLEGAL_ARGUMENT_ERROR;

6673 break;

6674 }

6675 return UCOL_DEFAULT;

6676 }	352 }

6677	353

6678 U_CAPI void U_EXPORT2	354 U_CAPI void U_EXPORT2

6679 ucol_setStrength( UCollator *coll,	355 ucol_setStrength( UCollator *coll,

6680 UCollationStrength strength)	356 UCollationStrength strength)

6681 {	357 {

6682 UErrorCode status = U_ZERO_ERROR;	358 UErrorCode status = U_ZERO_ERROR;

6683 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);	359 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);

6684 }	360 }

6685	361

6686 U_CAPI UCollationStrength U_EXPORT2	362 U_CAPI UCollationStrength U_EXPORT2

6687 ucol_getStrength(const UCollator *coll)	363 ucol_getStrength(const UCollator *coll)

6688 {	364 {

6689 UErrorCode status = U_ZERO_ERROR;	365 UErrorCode status = U_ZERO_ERROR;

6690 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);	366 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);

6691 }	367 }

6692	368

6693 U_CAPI int32_t U_EXPORT2	369 U_CAPI int32_t U_EXPORT2

6694 ucol_getReorderCodes(const UCollator *coll,	370 ucol_getReorderCodes(const UCollator *coll,

6695 int32_t *dest,	371 int32_t *dest,

6696 int32_t destCapacity,	372 int32_t destCapacity,

6697 UErrorCode *status) {	373 UErrorCode *status) {

6698 if (U_FAILURE(*status)) {	374 if (U_FAILURE(*status)) {

6699 return 0;	375 return 0;

6700 }	376 }

6701	377

6702 if(coll->delegate!=NULL) {	378 return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *s tatus);

6703 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);

6704 }

6705

6706 if (destCapacity < 0 \|\| (destCapacity > 0 && dest == NULL)) {

6707 *status = U_ILLEGAL_ARGUMENT_ERROR;

6708 return 0;

6709 }

6710

6711 #ifdef UCOL_DEBUG

6712 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);

6713 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLe ngth);

6714 #endif

6715

6716 if (coll->reorderCodesLength > destCapacity) {

6717 *status = U_BUFFER_OVERFLOW_ERROR;

6718 return coll->reorderCodesLength;

6719 }

6720 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {

6721 dest[i] = coll->reorderCodes[i];

6722 }

6723 return coll->reorderCodesLength;

6724 }	379 }

6725	380

6726 U_CAPI void U_EXPORT2	381 U_CAPI void U_EXPORT2

6727 ucol_setReorderCodes(UCollator* coll,	382 ucol_setReorderCodes(UCollator* coll,

6728 const int32_t* reorderCodes,	383 const int32_t* reorderCodes,

6729 int32_t reorderCodesLength,	384 int32_t reorderCodesLength,

6730 UErrorCode *status) {	385 UErrorCode *status) {

6731 if (U_FAILURE(*status)) {	386 if (U_FAILURE(*status)) {

6732 return;	387 return;

6733 }	388 }

6734	389

6735 if (reorderCodesLength < 0 \|\| (reorderCodesLength > 0 && reorderCodes == NUL L)) {	390 Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLen gth, *status);

6736 *status = U_ILLEGAL_ARGUMENT_ERROR;

6737 return;

6738 }

6739

6740 if(coll->delegate!=NULL) {

6741 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);

6742 return;

6743 }

6744

6745 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {

6746 uprv_free(coll->reorderCodes);

6747 }

6748 coll->reorderCodes = NULL;

6749 coll->freeReorderCodesOnClose = FALSE;

6750 coll->reorderCodesLength = 0;

6751 if (reorderCodesLength == 0) {

6752 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutat ionTableOnClose == TRUE) {

6753 uprv_free(coll->leadBytePermutationTable);

6754 }

6755 coll->leadBytePermutationTable = NULL;

6756 coll->freeLeadBytePermutationTableOnClose = FALSE;

6757 return;

6758 }

6759 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));

6760 if (coll->reorderCodes == NULL) {

6761 *status = U_MEMORY_ALLOCATION_ERROR;

6762 return;

6763 }

6764 coll->freeReorderCodesOnClose = TRUE;

6765 for (int32_t i = 0; i < reorderCodesLength; i++) {

6766 coll->reorderCodes[i] = reorderCodes[i];

6767 }

6768 coll->reorderCodesLength = reorderCodesLength;

6769 ucol_buildPermutationTable(coll, status);

6770 }	391 }

6771	392

6772 U_CAPI int32_t U_EXPORT2	393 U_CAPI int32_t U_EXPORT2

6773 ucol_getEquivalentReorderCodes(int32_t reorderCode,	394 ucol_getEquivalentReorderCodes(int32_t reorderCode,

6774 int32_t* dest,	395 int32_t* dest,

6775 int32_t destCapacity,	396 int32_t destCapacity,

6776 UErrorCode *pErrorCode) {	397 UErrorCode *pErrorCode) {

6777 bool equivalentCodesSet[USCRIPT_CODE_LIMIT];	398 return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);

6778 uint16_t leadBytes[256];	399 }

6779 int leadBytesCount;

6780 int leadByteIndex;

6781 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];

6782 int reorderCodesForLeadByteCount;

6783 int reorderCodeIndex;

6784

6785 int32_t equivalentCodesCount = 0;

6786 int setIndex;

6787

6788 if (U_FAILURE(*pErrorCode)) {

6789 return 0;

6790 }

6791

6792 if (destCapacity < 0 \|\| (destCapacity > 0 && dest == NULL)) {

6793 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

6794 return 0;

6795 }

6796

6797 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));

6798

6799 const UCollator* uca = ucol_initUCA(pErrorCode);

6800 if (U_FAILURE(*pErrorCode)) {

6801 » return 0;

6802 }

6803 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes , 256);

6804 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {

6805 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(

6806 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE _LIMIT);

6807 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCou nt; reorderCodeIndex++) {

6808 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true ;

6809 }

6810 }

6811

6812 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {

6813 if (equivalentCodesSet[setIndex] == true) {

6814 equivalentCodesCount++;

6815 }

6816 }

6817

6818 if (destCapacity == 0) {

6819 return equivalentCodesCount;

6820 }

6821

6822 equivalentCodesCount = 0;

6823 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {

6824 if (equivalentCodesSet[setIndex] == true) {

6825 dest[equivalentCodesCount++] = setIndex;

6826 if (equivalentCodesCount >= destCapacity) {

6827 break;

6828 }

6829 }

6830 }

6831 return equivalentCodesCount;

6832 }

6833

6834

6835 /****************************************************************************/

6836 /* Following are misc functions */

6837 /* there are new APIs and some compatibility APIs */

6838 /****************************************************************************/

6839	400

6840 U_CAPI void U_EXPORT2	401 U_CAPI void U_EXPORT2

6841 ucol_getVersion(const UCollator* coll,	402 ucol_getVersion(const UCollator* coll,

6842 UVersionInfo versionInfo)	403 UVersionInfo versionInfo)

6843 {	404 {

6844 if(coll->delegate!=NULL) {	405 Collator::fromUCollator(coll)->getVersion(versionInfo);

6845 ((const Collator*)coll->delegate)->getVersion(versionInfo);

6846 return;

6847 }

6848 /* RunTime version */

6849 uint8_t rtVersion = UCOL_RUNTIME_VERSION;

6850 /* Builder version*/

6851 uint8_t bdVersion = coll->image->version[0];

6852

6853 /* Charset Version. Need to get the version from cnv files

6854 * makeconv should populate cnv files with version and

6855 * an api has to be provided in ucnv.h to obtain this version

6856 */

6857 uint8_t csVersion = 0;

6858

6859 /* combine the version info */

6860 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) \| (bdVersion<<6) \| (csVersi on));

6861

6862 /* Tailoring rules */

6863 versionInfo[0] = (uint8_t)(cmbVersion>>8);

6864 versionInfo[1] = (uint8_t)cmbVersion;

6865 versionInfo[2] = coll->image->version[1];

6866 if(coll->UCA) {

6867 /* Include the minor number when getting the UCA version. (major & 1f) < < 3 \| (minor & 7) */

6868 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 \| (coll-> UCA->image->UCAVersion[1] & 0x07);

6869 } else {

6870 versionInfo[3] = 0;

6871 }

6872 }

6873

6874

6875 /* This internal API checks whether a character is tailored or not */

6876 U_CAPI UBool U_EXPORT2

6877 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {

6878 if(U_FAILURE(*status) \|\| coll == NULL \|\| coll == coll->UCA) {

6879 return FALSE;

6880 }

6881

6882 uint32_t CE = UCOL_NOT_FOUND;

6883 const UChar *ContractionStart = NULL;

6884 if(u < 0x100) { /* latin-1 */

6885 CE = coll->latinOneMapping[u];

6886 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {

6887 return FALSE;

6888 }

6889 } else { /* regular */

6890 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);

6891 }

6892

6893 if(isContraction(CE)) {

6894 ContractionStart = (UChar *)coll->image+getContractOffset(CE);

6895 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex) );

6896 }

6897

6898 return (UBool)(CE != UCOL_NOT_FOUND);

6899 }

6900

6901

6902 /****************************************************************************/

6903 /* Following are the string compare functions */

6904 /* */

6905 /****************************************************************************/

6906

6907

6908 /* ucol_checkIdent internal function. Does byte level string compare. */

6909 /* Used by strcoll if strength == identical and strings */

6910 /* are otherwise equal. */

6911 /* */

6912 /* Comparison must be done on NFD normalized strings. */

6913 /* FCD is not good enough. */

6914

6915 static

6916 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)

6917 {

6918 // When we arrive here, we can have normal strings or UCharIterators. Curren tly they are both

6919 // of same type, but that doesn't really mean that it will stay that way.

6920 int32_t comparison;

6921

6922 if (sColl->flags & UCOL_USE_ITERATOR) {

6923 // The division for the array length may truncate the array size to

6924 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high

6925 // for all platforms anyway.

6926 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

6927 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

6928 UNormIterator *sNIt = NULL, *tNIt = NULL;

6929 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);

6930 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);

6931 sColl->iterator->move(sColl->iterator, 0, UITER_START);

6932 tColl->iterator->move(tColl->iterator, 0, UITER_START);

6933 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta tus);

6934 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta tus);

6935 comparison = u_strCompareIter(sIt, tIt, TRUE);

6936 unorm_closeIter(sNIt);

6937 unorm_closeIter(tNIt);

6938 } else {

6939 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl- >endp - sColl->string) : -1;

6940 const UChar *sBuf = sColl->string;

6941 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl- >endp - tColl->string) : -1;

6942 const UChar *tBuf = tColl->string;

6943

6944 if (normalize) {

6945 *status = U_ZERO_ERROR;

6946 // Note: We could use Normalizer::compare() or similar, but for shor t strings

6947 // which may not be in FCD it might be faster to just NFD them.

6948 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha n

6949 // NFD'ing immediately might be faster for long strings,

6950 // but string comparison is usually done on relatively short strings .

6951 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN ) == 0, sBuf, sLen),

6952 sColl->writableBuffer,

6953 *status);

6954 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN ) == 0, tBuf, tLen),

6955 tColl->writableBuffer,

6956 *status);

6957 if(U_FAILURE(*status)) {

6958 return UCOL_LESS;

6959 }

6960 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ ableBuffer);

6961 } else {

6962 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);

6963 }

6964 }

6965

6966 if (comparison < 0) {

6967 return UCOL_LESS;

6968 } else if (comparison == 0) {

6969 return UCOL_EQUAL;

6970 } else /* comparison > 0 */ {

6971 return UCOL_GREATER;

6972 }

6973 }

6974

6975 /* CEBuf - A struct and some inline functions to handle the saving */

6976 /* of CEs in a buffer within ucol_strcoll */

6977

6978 #define UCOL_CEBUF_SIZE 512

6979 typedef struct ucol_CEBuf {

6980 uint32_t *buf;

6981 uint32_t *endp;

6982 uint32_t *pos;

6983 uint32_t localArray[UCOL_CEBUF_SIZE];

6984 } ucol_CEBuf;

6985

6986

6987 static

6988 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {

6989 (b)->buf = (b)->pos = (b)->localArray;

6990 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;

6991 }

6992

6993 static

6994 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {

6995 uint32_t oldSize;

6996 uint32_t newSize;

6997 uint32_t *newBuf;

6998

6999 ci->flags \|= UCOL_ITER_ALLOCATED;

7000 oldSize = (uint32_t)(b->pos - b->buf);

7001 newSize = oldSize * 2;

7002 newBuf = (uint32_t )uprv_malloc(newSize sizeof(uint32_t));

7003 if(newBuf == NULL) {

7004 *status = U_MEMORY_ALLOCATION_ERROR;

7005 }

7006 else {

7007 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));

7008 if (b->buf != b->localArray) {

7009 uprv_free(b->buf);

7010 }

7011 b->buf = newBuf;

7012 b->endp = b->buf + newSize;

7013 b->pos = b->buf + oldSize;

7014 }

7015 }

7016

7017 static

7018 inline void UCOL_CEBUF_PUT(ucol_CEBuf b, uint32_t ce, collIterate ci, UErrorCo de *status) {

7019 if (b->pos == b->endp) {

7020 ucol_CEBuf_Expand(b, ci, status);

7021 }

7022 if (U_SUCCESS(*status)) {

7023 *(b)->pos++ = ce;

7024 }

7025 }

7026

7027 /* This is a trick string compare function that goes in and uses sortkeys to com pare */

7028 /* It is used when compare gets in trouble and needs to bail out */

7029 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,

7030 collIterate *tColl,

7031 UErrorCode *status)

7032 {

7033 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];

7034 uint8_t *sourceKeyP = sourceKey;

7035 uint8_t *targetKeyP = targetKey;

7036 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;

7037 const UCollator *coll = sColl->coll;

7038 const UChar *source = NULL;

7039 const UChar *target = NULL;

7040 int32_t result = UCOL_EQUAL;

7041 UnicodeString sourceString, targetString;

7042 int32_t sourceLength;

7043 int32_t targetLength;

7044

7045 if(sColl->flags & UCOL_USE_ITERATOR) {

7046 sColl->iterator->move(sColl->iterator, 0, UITER_START);

7047 tColl->iterator->move(tColl->iterator, 0, UITER_START);

7048 UChar32 c;

7049 while((c=sColl->iterator->next(sColl->iterator))>=0) {

7050 sourceString.append((UChar)c);

7051 }

7052 while((c=tColl->iterator->next(tColl->iterator))>=0) {

7053 targetString.append((UChar)c);

7054 }

7055 source = sourceString.getBuffer();

7056 sourceLength = sourceString.length();

7057 target = targetString.getBuffer();

7058 targetLength = targetString.length();

7059 } else { // no iterators

7060 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo ll->string):-1;

7061 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo ll->string):-1;

7062 source = sColl->string;

7063 target = tColl->string;

7064 }

7065

7066

7067

7068 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc eKeyLen);

7069 if(sourceKeyLen > UCOL_MAX_BUFFER) {

7070 sourceKeyP = (uint8_t)uprv_malloc(sourceKeyLensizeof(uint8_t));

7071 if(sourceKeyP == NULL) {

7072 *status = U_MEMORY_ALLOCATION_ERROR;

7073 goto cleanup_and_do_compare;

7074 }

7075 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s ourceKeyLen);

7076 }

7077

7078 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe tKeyLen);

7079 if(targetKeyLen > UCOL_MAX_BUFFER) {

7080 targetKeyP = (uint8_t)uprv_malloc(targetKeyLensizeof(uint8_t));

7081 if(targetKeyP == NULL) {

7082 *status = U_MEMORY_ALLOCATION_ERROR;

7083 goto cleanup_and_do_compare;

7084 }

7085 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t argetKeyLen);

7086 }

7087

7088 result = uprv_strcmp((const char)sourceKeyP, (const char)targetKeyP);

7089

7090 cleanup_and_do_compare:

7091 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {

7092 uprv_free(sourceKeyP);

7093 }

7094

7095 if(targetKeyP != NULL && targetKeyP != targetKey) {

7096 uprv_free(targetKeyP);

7097 }

7098

7099 if(result<0) {

7100 return UCOL_LESS;

7101 } else if(result>0) {

7102 return UCOL_GREATER;

7103 } else {

7104 return UCOL_EQUAL;

7105 }

7106 }

7107

7108

7109 static UCollationResult

7110 ucol_strcollRegular(collIterate sColl, collIterate tColl, UErrorCode *status)

7111 {

7112 U_ALIGN_CODE(16);

7113

7114 const UCollator *coll = sColl->coll;

7115

7116

7117 // setting up the collator parameters

7118 UColAttributeValue strength = coll->strength;

7119 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);

7120

7121 UBool checkSecTer = initialCheckSecTer;

7122 UBool checkTertiary = (strength >= UCOL_TERTIARY);

7123 UBool checkQuad = (strength >= UCOL_QUATERNARY);

7124 UBool checkIdent = (strength == UCOL_IDENTICAL);

7125 UBool checkCase = (coll->caseLevel == UCOL_ON);

7126 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;

7127 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);

7128 UBool qShifted = shifted && checkQuad;

7129 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;

7130

7131 if(doHiragana && shifted) {

7132 return (ucol_compareUsingSortKeys(sColl, tColl, status));

7133 }

7134 uint8_t caseSwitch = coll->caseSwitch;

7135 uint8_t tertiaryMask = coll->tertiaryMask;

7136

7137 // This is the lowest primary value that will not be ignored if shifted

7138 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;

7139

7140 UCollationResult result = UCOL_EQUAL;

7141 UCollationResult hirResult = UCOL_EQUAL;

7142

7143 // Preparing the CE buffers. They will be filled during the primary phase

7144 ucol_CEBuf sCEs;

7145 ucol_CEBuf tCEs;

7146 UCOL_INIT_CEBUF(&sCEs);

7147 UCOL_INIT_CEBUF(&tCEs);

7148

7149 uint32_t secS = 0, secT = 0;

7150 uint32_t sOrder=0, tOrder=0;

7151

7152 // Non shifted primary processing is quite simple

7153 if(!shifted) {

7154 for(;;) {

7155 // We fetch CEs until we hit a non ignorable primary or end.

7156 uint32_t sPrimary;

7157 do {

7158 // We get the next CE

7159 sOrder = ucol_IGetNextCE(coll, sColl, status);

7160 // Stuff it in the buffer

7161 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7162 // And keep just the primary part.

7163 sPrimary = sOrder & UCOL_PRIMARYMASK;

7164 } while(sPrimary == 0);

7165

7166 // see the comments on the above block

7167 uint32_t tPrimary;

7168 do {

7169 tOrder = ucol_IGetNextCE(coll, tColl, status);

7170 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7171 tPrimary = tOrder & UCOL_PRIMARYMASK;

7172 } while(tPrimary == 0);

7173

7174 // if both primaries are the same

7175 if(sPrimary == tPrimary) {

7176 // and there are no more CEs, we advance to the next level

7177 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {

7178 break;

7179 }

7180 if(doHiragana && hirResult == UCOL_EQUAL) {

7181 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO L_WAS_HIRAGANA)) {

7182 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl ->flags & UCOL_WAS_HIRAGANA))

7183 ? UCOL_LESS:UCOL_GREATER;

7184 }

7185 }

7186 } else {

7187 // only need to check one for continuation

7188 // if one is then the other must be or the preceding CE would be a prefix of the other

7189 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO rder)) {

7190 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) \| (sPrimary & 0x00FFFFFF);

7191 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) \| (tPrimary & 0x00FFFFFF);

7192 }

7193 // if two primaries are different, we are done

7194 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER;

7195 goto commonReturn;

7196 }

7197 } // no primary difference... do the rest from the buffers

7198 } else { // shifted - do a slightly more complicated processing :)

7199 for(;;) {

7200 UBool sInShifted = FALSE;

7201 UBool tInShifted = FALSE;

7202 // This version of code can be refactored. However, it seems easier to understand this way.

7203 // Source loop. Same as the target loop.

7204 for(;;) {

7205 sOrder = ucol_IGetNextCE(coll, sColl, status);

7206 if(sOrder == UCOL_NO_MORE_CES) {

7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7208 break;

7209 } else if(sOrder == 0 \|\| (sInShifted && (sOrder & UCOL_PRIMARYMA SK) == 0)) {

7210 /* UCA amendment - ignore ignorables that follow shifted cod e points */

7211 continue;

7212 } else if(isContinuation(sOrder)) {

7213 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */

7214 if(sInShifted) {

7215 sOrder = (sOrder & UCOL_PRIMARYMASK) \| 0xC0; /* pres erve interesting continuation */

7216 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7217 continue;

7218 } else {

7219 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7220 break;

7221 }

7222 } else { /* Just lower level values */

7223 if(sInShifted) {

7224 continue;

7225 } else {

7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7227 continue;

7228 }

7229 }

7230 } else { /* regular */

7231 if(coll->leadBytePermutationTable != NULL){

7232 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) \| (sOrder & 0x00FFFFFF);

7233 }

7234 if((sOrder & UCOL_PRIMARYMASK) > LVT) {

7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7236 break;

7237 } else {

7238 if((sOrder & UCOL_PRIMARYMASK) > 0) {

7239 sInShifted = TRUE;

7240 sOrder &= UCOL_PRIMARYMASK;

7241 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7242 continue;

7243 } else {

7244 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);

7245 sInShifted = FALSE;

7246 continue;

7247 }

7248 }

7249 }

7250 }

7251 sOrder &= UCOL_PRIMARYMASK;

7252 sInShifted = FALSE;

7253

7254 for(;;) {

7255 tOrder = ucol_IGetNextCE(coll, tColl, status);

7256 if(tOrder == UCOL_NO_MORE_CES) {

7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7258 break;

7259 } else if(tOrder == 0 \|\| (tInShifted && (tOrder & UCOL_PRIMARYMA SK) == 0)) {

7260 /* UCA amendment - ignore ignorables that follow shifted cod e points */

7261 continue;

7262 } else if(isContinuation(tOrder)) {

7263 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */

7264 if(tInShifted) {

7265 tOrder = (tOrder & UCOL_PRIMARYMASK) \| 0xC0; /* pres erve interesting continuation */

7266 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7267 continue;

7268 } else {

7269 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7270 break;

7271 }

7272 } else { /* Just lower level values */

7273 if(tInShifted) {

7274 continue;

7275 } else {

7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7277 continue;

7278 }

7279 }

7280 } else { /* regular */

7281 if(coll->leadBytePermutationTable != NULL){

7282 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) \| (tOrder & 0x00FFFFFF);

7283 }

7284 if((tOrder & UCOL_PRIMARYMASK) > LVT) {

7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7286 break;

7287 } else {

7288 if((tOrder & UCOL_PRIMARYMASK) > 0) {

7289 tInShifted = TRUE;

7290 tOrder &= UCOL_PRIMARYMASK;

7291 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7292 continue;

7293 } else {

7294 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);

7295 tInShifted = FALSE;

7296 continue;

7297 }

7298 }

7299 }

7300 }

7301 tOrder &= UCOL_PRIMARYMASK;

7302 tInShifted = FALSE;

7303

7304 if(sOrder == tOrder) {

7305 /*

7306 if(doHiragana && hirResult == UCOL_EQUAL) {

7307 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_ HIRAGANA)) {

7308 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))

7309 ? UCOL_LESS:UCOL_GREATER;

7310 }

7311 }

7312 */

7313 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {

7314 break;

7315 } else {

7316 sOrder = 0;

7317 tOrder = 0;

7318 continue;

7319 }

7320 } else {

7321 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;

7322 goto commonReturn;

7323 }

7324 } /* no primary difference... do the rest from the buffers */

7325 }

7326

7327 /* now, we're gonna reexamine collected CEs */

7328 uint32_t *sCE;

7329 uint32_t *tCE;

7330

7331 /* This is the secondary level of comparison */

7332 if(checkSecTer) {

7333 if(!isFrenchSec) { /* normal */

7334 sCE = sCEs.buf;

7335 tCE = tCEs.buf;

7336 for(;;) {

7337 while (secS == 0) {

7338 secS = *(sCE++) & UCOL_SECONDARYMASK;

7339 }

7340

7341 while(secT == 0) {

7342 secT = *(tCE++) & UCOL_SECONDARYMASK;

7343 }

7344

7345 if(secS == secT) {

7346 if(secS == UCOL_NO_MORE_CES_SECONDARY) {

7347 break;

7348 } else {

7349 secS = 0; secT = 0;

7350 continue;

7351 }

7352 } else {

7353 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;

7354 goto commonReturn;

7355 }

7356 }

7357 } else { /* do the French */

7358 uint32_t *sCESave = NULL;

7359 uint32_t *tCESave = NULL;

7360 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi zed */

7361 tCE = tCEs.pos-2;

7362 for(;;) {

7363 while (secS == 0 && sCE >= sCEs.buf) {

7364 if(sCESave == NULL) {

7365 secS = *(sCE--);

7366 if(isContinuation(secS)) {

7367 while(isContinuation(secS = *(sCE--)))

7368 ;

7369 /* after this, secS has the start of continuation, a nd sCEs points before that */

7370 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */

7371 sCE+=2; /* need to point to the first continuation CP */

7372 /* However, now you can just continue doing stuff */

7373 }

7374 } else {

7375 secS = *(sCE++);

7376 if(!isContinuation(secS)) { /* This means we have finish ed with this cont */

7377 sCE = sCESave; /* reset the pointer to be fore continuation */

7378 sCESave = NULL;

7379 secS = 0; /* Fetch a fresh CE before the continuati on sequence. */

7380 continue;

7381 }

7382 }

7383 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit * /

7384 }

7385

7386 while(secT == 0 && tCE >= tCEs.buf) {

7387 if(tCESave == NULL) {

7388 secT = *(tCE--);

7389 if(isContinuation(secT)) {

7390 while(isContinuation(secT = *(tCE--)))

7391 ;

7392 /* after this, secS has the start of continuation, a nd sCEs points before that */

7393 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */

7394 tCE+=2; /* need to point to the first continuation CP */

7395 /* However, now you can just continue doing stuff */

7396 }

7397 } else {

7398 secT = *(tCE++);

7399 if(!isContinuation(secT)) { /* This means we have finish ed with this cont */

7400 tCE = tCESave; /* reset the pointer to befo re continuation */

7401 tCESave = NULL;

7402 secT = 0; /* Fetch a fresh CE before the continuati on sequence. */

7403 continue;

7404 }

7405 }

7406 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit * /

7407 }

7408

7409 if(secS == secT) {

7410 if(secS == UCOL_NO_MORE_CES_SECONDARY \|\| (sCE < sCEs.buf && tCE < tCEs.buf)) {

7411 break;

7412 } else {

7413 secS = 0; secT = 0;

7414 continue;

7415 }

7416 } else {

7417 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;

7418 goto commonReturn;

7419 }

7420 }

7421 }

7422 }

7423

7424 /* doing the case bit */

7425 if(checkCase) {

7426 sCE = sCEs.buf;

7427 tCE = tCEs.buf;

7428 for(;;) {

7429 while((secS & UCOL_REMOVE_CASE) == 0) {

7430 if(!isContinuation(*sCE++)) {

7431 secS =*(sCE-1);

7432 if(((secS & UCOL_PRIMARYMASK) != 0) \|\| strength > UCOL_PRIMA RY) {

7433 // primary ignorables should not be considered on the ca se level when the strength is primary

7434 // otherwise, the CEs stop being well-formed

7435 secS &= UCOL_TERT_CASE_MASK;

7436 secS ^= caseSwitch;

7437 } else {

7438 secS = 0;

7439 }

7440 } else {

7441 secS = 0;

7442 }

7443 }

7444

7445 while((secT & UCOL_REMOVE_CASE) == 0) {

7446 if(!isContinuation(*tCE++)) {

7447 secT = *(tCE-1);

7448 if(((secT & UCOL_PRIMARYMASK) != 0) \|\| strength > UCOL_PRIMA RY) {

7449 // primary ignorables should not be considered on the ca se level when the strength is primary

7450 // otherwise, the CEs stop being well-formed

7451 secT &= UCOL_TERT_CASE_MASK;

7452 secT ^= caseSwitch;

7453 } else {

7454 secT = 0;

7455 }

7456 } else {

7457 secT = 0;

7458 }

7459 }

7460

7461 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {

7462 result = UCOL_LESS;

7463 goto commonReturn;

7464 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {

7465 result = UCOL_GREATER;

7466 goto commonReturn;

7467 }

7468

7469 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY \|\| (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {

7470 break;

7471 } else {

7472 secS = 0;

7473 secT = 0;

7474 }

7475 }

7476 }

7477

7478 /* Tertiary level */

7479 if(checkTertiary) {

7480 secS = 0;

7481 secT = 0;

7482 sCE = sCEs.buf;

7483 tCE = tCEs.buf;

7484 for(;;) {

7485 while((secS & UCOL_REMOVE_CASE) == 0) {

7486 sOrder = *sCE++;

7487 secS = sOrder & tertiaryMask;

7488 if(!isContinuation(sOrder)) {

7489 secS ^= caseSwitch;

7490 } else {

7491 secS &= UCOL_REMOVE_CASE;

7492 }

7493 }

7494

7495 while((secT & UCOL_REMOVE_CASE) == 0) {

7496 tOrder = *tCE++;

7497 secT = tOrder & tertiaryMask;

7498 if(!isContinuation(tOrder)) {

7499 secT ^= caseSwitch;

7500 } else {

7501 secT &= UCOL_REMOVE_CASE;

7502 }

7503 }

7504

7505 if(secS == secT) {

7506 if((secS & UCOL_REMOVE_CASE) == 1) {

7507 break;

7508 } else {

7509 secS = 0; secT = 0;

7510 continue;

7511 }

7512 } else {

7513 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;

7514 goto commonReturn;

7515 }

7516 }

7517 }

7518

7519

7520 if(qShifted /checkQuad/) {

7521 UBool sInShifted = TRUE;

7522 UBool tInShifted = TRUE;

7523 secS = 0;

7524 secT = 0;

7525 sCE = sCEs.buf;

7526 tCE = tCEs.buf;

7527 for(;;) {

7528 while((secS == 0 && secS != UCOL_NO_MORE_CES) \|\| (isContinuation(sec S) && !sInShifted)) {

7529 secS = *(sCE++);

7530 if(isContinuation(secS)) {

7531 if(!sInShifted) {

7532 continue;

7533 }

7534 } else if(secS > LVT \|\| (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */

7535 secS = UCOL_PRIMARYMASK;

7536 sInShifted = FALSE;

7537 } else {

7538 sInShifted = TRUE;

7539 }

7540 }

7541 secS &= UCOL_PRIMARYMASK;

7542

7543

7544 while((secT == 0 && secT != UCOL_NO_MORE_CES) \|\| (isContinuation(sec T) && !tInShifted)) {

7545 secT = *(tCE++);

7546 if(isContinuation(secT)) {

7547 if(!tInShifted) {

7548 continue;

7549 }

7550 } else if(secT > LVT \|\| (secT & UCOL_PRIMARYMASK) == 0) {

7551 secT = UCOL_PRIMARYMASK;

7552 tInShifted = FALSE;

7553 } else {

7554 tInShifted = TRUE;

7555 }

7556 }

7557 secT &= UCOL_PRIMARYMASK;

7558

7559 if(secS == secT) {

7560 if(secS == UCOL_NO_MORE_CES_PRIMARY) {

7561 break;

7562 } else {

7563 secS = 0; secT = 0;

7564 continue;

7565 }

7566 } else {

7567 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;

7568 goto commonReturn;

7569 }

7570 }

7571 } else if(doHiragana && hirResult != UCOL_EQUAL) {

7572 // If we're fine on quaternaries, we might be different

7573 // on Hiragana. This, however, might fail us in shifted.

7574 result = hirResult;

7575 goto commonReturn;

7576 }

7577

7578 /* For IDENTICAL comparisons, we use a bitwise character comparison */

7579 /* as a tiebreaker if all else is equal. */

7580 /* Getting here should be quite rare - strings are not identical - */

7581 /* that is checked first, but compared == through all other checks. */

7582 if(checkIdent)

7583 {

7584 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC OL_ON);

7585 result = ucol_checkIdent(sColl, tColl, TRUE, status);

7586 }

7587

7588 commonReturn:

7589 if ((sColl->flags \| tColl->flags) & UCOL_ITER_ALLOCATED) {

7590 if (sCEs.buf != sCEs.localArray ) {

7591 uprv_free(sCEs.buf);

7592 }

7593 if (tCEs.buf != tCEs.localArray ) {

7594 uprv_free(tCEs.buf);

7595 }

7596 }

7597

7598 return result;

7599 }

7600

7601 static UCollationResult

7602 ucol_strcollRegular(const UCollator *coll,

7603 const UChar *source, int32_t sourceLength,

7604 const UChar *target, int32_t targetLength,

7605 UErrorCode *status) {

7606 collIterate sColl, tColl;

7607 // Preparing the context objects for iterating over strings

7608 IInit_collIterate(coll, source, sourceLength, &sColl, status);

7609 IInit_collIterate(coll, target, targetLength, &tColl, status);

7610 if(U_FAILURE(*status)) {

7611 return UCOL_LESS;

7612 }

7613 return ucol_strcollRegular(&sColl, &tColl, status);

7614 }

7615

7616 static inline uint32_t

7617 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,

7618 uint32_t CE, const UChar s, int32_t index, int32_t l en)

7619 {

7620 const UChar UCharOffset = (UChar )coll->image+getContractOffset(CE&0xFFF);

7621 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;

7622 int32_t offset = 1;

7623 UChar schar = 0, tchar = 0;

7624

7625 for(;;) {

7626 if(len == -1) {

7627 if(s[*index] == 0) { // end of string

7628 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);

7629 } else {

7630 schar = s[*index];

7631 }

7632 } else {

7633 if(*index == len) {

7634 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);

7635 } else {

7636 schar = s[*index];

7637 }

7638 }

7639

7640 while(schar > (tchar = (UCharOffset+offset))) { / since the contractio n codepoints should be ordered, we skip all that are smaller */

7641 offset++;

7642 }

7643

7644 if (schar == tchar) {

7645 (*index)++;

7646 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set+offset]);

7647 }

7648 else

7649 {

7650 if(schar & 0xFF00 /> UCOL_ENDOFLATIN1RANGE/) {

7651 return UCOL_BAIL_OUT_CE;

7652 }

7653 // skip completely ignorables

7654 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);

7655 if(isZeroCE == 0) { // we have to ignore completely ignorables

7656 (*index)++;

7657 continue;

7658 }

7659

7660 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);

7661 }

7662 }

7663 }

7664

7665

7666 /**

7667 * This is a fast strcoll, geared towards text in Latin-1.

7668 * It supports contractions of size two, French secondaries

7669 * and case switching. You can use it with strengths primary

7670 * to tertiary. It does not support shifted and case level.

7671 * It relies on the table build by setupLatin1Table. If it

7672 * doesn't understand something, it will go to the regular

7673 * strcoll.

7674 */

7675 static UCollationResult

7676 ucol_strcollUseLatin1( const UCollator *coll,

7677 const UChar *source,

7678 int32_t sLen,

7679 const UChar *target,

7680 int32_t tLen,

7681 UErrorCode *status)

7682 {

7683 U_ALIGN_CODE(16);

7684 int32_t strength = coll->strength;

7685

7686 int32_t sIndex = 0, tIndex = 0;

7687 UChar sChar = 0, tChar = 0;

7688 uint32_t sOrder=0, tOrder=0;

7689

7690 UBool endOfSource = FALSE;

7691

7692 uint32_t *elements = coll->latinOneCEs;

7693

7694 UBool haveContractions = FALSE; // if we have contractions in our string

7695 // we cannot do French secondary

7696

7697 // Do the primary level

7698 for(;;) {

7699 while(sOrder==0) { // this loop skips primary ignorables

7700 // sOrder=getNextlatinOneCE(source);

7701 if(sLen==-1) { // handling zero terminated strings

7702 sChar=source[sIndex++];

7703 if(sChar==0) {

7704 endOfSource = TRUE;

7705 break;

7706 }

7707 } else { // handling strings with known length

7708 if(sIndex==sLen) {

7709 endOfSource = TRUE;

7710 break;

7711 }

7712 sChar=source[sIndex++];

7713 }

7714 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha r > 0xFF, but this is faster on win32)

7715 //fprintf(stderr, "R");

7716 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);

7717 }

7718 sOrder = elements[sChar];

7719 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special

7720 // specials can basically be either contractions or bail-out sig ns. If we get anything

7721 // else, we'll bail out anywasy

7722 if(getCETag(sOrder) == CONTRACTION_TAG) {

7723 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr der, source, &sIndex, sLen);

7724 haveContractions = TRUE; // if there are contractions, we ca nnot do French secondary

7725 // However, if there are contractions in the table, but we a lways use just one char,

7726 // we might be able to do French. This should be checked out .

7727 }

7728 if(sOrder >= UCOL_NOT_FOUND /== UCOL_BAIL_OUT_CE/) {

7729 //fprintf(stderr, "S");

7730 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);

7731 }

7732 }

7733 }

7734

7735 while(tOrder==0) { // this loop skips primary ignorables

7736 // tOrder=getNextlatinOneCE(target);

7737 if(tLen==-1) { // handling zero terminated strings

7738 tChar=target[tIndex++];

7739 if(tChar==0) {

7740 if(endOfSource) { // this is different than source loop,

7741 // as we already know that source loop is done here,

7742 // so we can either finish the primary loop if both

7743 // strings are done or anounce the result if only

7744 // target is done. Same below.

7745 goto endOfPrimLoop;

7746 } else {

7747 return UCOL_GREATER;

7748 }

7749 }

7750 } else { // handling strings with known length

7751 if(tIndex==tLen) {

7752 if(endOfSource) {

7753 goto endOfPrimLoop;

7754 } else {

7755 return UCOL_GREATER;

7756 }

7757 }

7758 tChar=target[tIndex++];

7759 }

7760 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha r > 0xFF, but this is faster on win32)

7761 //fprintf(stderr, "R");

7762 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);

7763 }

7764 tOrder = elements[tChar];

7765 if(tOrder >= UCOL_NOT_FOUND) {

7766 // Handling specials, see the comments for source

7767 if(getCETag(tOrder) == CONTRACTION_TAG) {

7768 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr der, target, &tIndex, tLen);

7769 haveContractions = TRUE;

7770 }

7771 if(tOrder >= UCOL_NOT_FOUND /== UCOL_BAIL_OUT_CE/) {

7772 //fprintf(stderr, "S");

7773 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);

7774 }

7775 }

7776 }

7777 if(endOfSource) { // source is finished, but target is not, say the resu lt.

7778 return UCOL_LESS;

7779 }

7780

7781 if(sOrder == tOrder) { // if we have same CEs, we continue the loop

7782 sOrder = 0; tOrder = 0;

7783 continue;

7784 } else {

7785 // compare current top bytes

7786 if(((sOrder^tOrder)&0xFF000000)!=0) {

7787 // top bytes differ, return difference

7788 if(sOrder < tOrder) {

7789 return UCOL_LESS;

7790 } else if(sOrder > tOrder) {

7791 return UCOL_GREATER;

7792 }

7793 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24) ;

7794 // since we must return enum value

7795 }

7796

7797 // top bytes match, continue with following bytes

7798 sOrder<<=8;

7799 tOrder<<=8;

7800 }

7801 }

7802

7803 endOfPrimLoop:

7804 // after primary loop, we definitely know the sizes of strings,

7805 // so we set it and use simpler loop for secondaries and tertiaries

7806 sLen = sIndex; tLen = tIndex;

7807 if(strength >= UCOL_SECONDARY) {

7808 // adjust the table beggining

7809 elements += coll->latinOneTableLen;

7810 endOfSource = FALSE;

7811

7812 if(coll->frenchCollation == UCOL_OFF) { // non French

7813 // This loop is a simplified copy of primary loop

7814 // at this point we know that whole strings are latin-1, so we don't

7815 // check for that. We also know that we only have contractions as

7816 // specials.

7817 sIndex = 0; tIndex = 0;

7818 for(;;) {

7819 while(sOrder==0) {

7820 if(sIndex==sLen) {

7821 endOfSource = TRUE;

7822 break;

7823 }

7824 sChar=source[sIndex++];

7825 sOrder = elements[sChar];

7826 if(sOrder > UCOL_NOT_FOUND) {

7827 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, sOrder, source, &sIndex, sLen);

7828 }

7829 }

7830

7831 while(tOrder==0) {

7832 if(tIndex==tLen) {

7833 if(endOfSource) {

7834 goto endOfSecLoop;

7835 } else {

7836 return UCOL_GREATER;

7837 }

7838 }

7839 tChar=target[tIndex++];

7840 tOrder = elements[tChar];

7841 if(tOrder > UCOL_NOT_FOUND) {

7842 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, tOrder, target, &tIndex, tLen);

7843 }

7844 }

7845 if(endOfSource) {

7846 return UCOL_LESS;

7847 }

7848

7849 if(sOrder == tOrder) {

7850 sOrder = 0; tOrder = 0;

7851 continue;

7852 } else {

7853 // see primary loop for comments on this

7854 if(((sOrder^tOrder)&0xFF000000)!=0) {

7855 if(sOrder < tOrder) {

7856 return UCOL_LESS;

7857 } else if(sOrder > tOrder) {

7858 return UCOL_GREATER;

7859 }

7860 }

7861 sOrder<<=8;

7862 tOrder<<=8;

7863 }

7864 }

7865 } else { // French

7866 if(haveContractions) { // if we have contractions, we have to bail o ut

7867 // since we don't really know how to handle them here

7868 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);

7869 }

7870 // For French, we go backwards

7871 sIndex = sLen; tIndex = tLen;

7872 for(;;) {

7873 while(sOrder==0) {

7874 if(sIndex==0) {

7875 endOfSource = TRUE;

7876 break;

7877 }

7878 sChar=source[--sIndex];

7879 sOrder = elements[sChar];

7880 // don't even look for contractions

7881 }

7882

7883 while(tOrder==0) {

7884 if(tIndex==0) {

7885 if(endOfSource) {

7886 goto endOfSecLoop;

7887 } else {

7888 return UCOL_GREATER;

7889 }

7890 }

7891 tChar=target[--tIndex];

7892 tOrder = elements[tChar];

7893 // don't even look for contractions

7894 }

7895 if(endOfSource) {

7896 return UCOL_LESS;

7897 }

7898

7899 if(sOrder == tOrder) {

7900 sOrder = 0; tOrder = 0;

7901 continue;

7902 } else {

7903 // see the primary loop for comments

7904 if(((sOrder^tOrder)&0xFF000000)!=0) {

7905 if(sOrder < tOrder) {

7906 return UCOL_LESS;

7907 } else if(sOrder > tOrder) {

7908 return UCOL_GREATER;

7909 }

7910 }

7911 sOrder<<=8;

7912 tOrder<<=8;

7913 }

7914 }

7915 }

7916 }

7917

7918 endOfSecLoop:

7919 if(strength >= UCOL_TERTIARY) {

7920 // tertiary loop is the same as secondary (except no French)

7921 elements += coll->latinOneTableLen;

7922 sIndex = 0; tIndex = 0;

7923 endOfSource = FALSE;

7924 for(;;) {

7925 while(sOrder==0) {

7926 if(sIndex==sLen) {

7927 endOfSource = TRUE;

7928 break;

7929 }

7930 sChar=source[sIndex++];

7931 sOrder = elements[sChar];

7932 if(sOrder > UCOL_NOT_FOUND) {

7933 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO rder, source, &sIndex, sLen);

7934 }

7935 }

7936 while(tOrder==0) {

7937 if(tIndex==tLen) {

7938 if(endOfSource) {

7939 return UCOL_EQUAL; // if both strings are at the end, th ey are equal

7940 } else {

7941 return UCOL_GREATER;

7942 }

7943 }

7944 tChar=target[tIndex++];

7945 tOrder = elements[tChar];

7946 if(tOrder > UCOL_NOT_FOUND) {

7947 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO rder, target, &tIndex, tLen);

7948 }

7949 }

7950 if(endOfSource) {

7951 return UCOL_LESS;

7952 }

7953 if(sOrder == tOrder) {

7954 sOrder = 0; tOrder = 0;

7955 continue;

7956 } else {

7957 if(((sOrder^tOrder)&0xff000000)!=0) {

7958 if(sOrder < tOrder) {

7959 return UCOL_LESS;

7960 } else if(sOrder > tOrder) {

7961 return UCOL_GREATER;

7962 }

7963 }

7964 sOrder<<=8;

7965 tOrder<<=8;

7966 }

7967 }

7968 }

7969 return UCOL_EQUAL;

7970 }

7971

7972 /*

7973 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of

7974 null terminated input string takes extra amount of CPU cycles.

7975 */

7976 static UCollationResult

7977 ucol_strcollRegularUTF8(

7978 const UCollator *coll,

7979 const char *source,

7980 int32_t sourceLength,

7981 const char *target,

7982 int32_t targetLength,

7983 UErrorCode *status)

7984 {

7985 UCharIterator src;

7986 UCharIterator tgt;

7987

7988 uiter_setUTF8(&src, source, sourceLength);

7989 uiter_setUTF8(&tgt, target, targetLength);

7990

7991 // Preparing the context objects for iterating over strings

7992 collIterate sColl, tColl;

7993 IInit_collIterate(coll, NULL, -1, &sColl, status);

7994 IInit_collIterate(coll, NULL, -1, &tColl, status);

7995 if(U_FAILURE(*status)) {

7996 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)

7997 return UCOL_EQUAL;

7998 }

7999 // The division for the array length may truncate the array size to

8000 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high

8001 // for all platforms anyway.

8002 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

8003 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

8004 UNormIterator sNormIter = NULL, tNormIter = NULL;

8005

8006 sColl.iterator = &src;

8007 sColl.flags \|= UCOL_USE_ITERATOR;

8008 tColl.flags \|= UCOL_USE_ITERATOR;

8009 tColl.iterator = &tgt;

8010

8011 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {

8012 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu s);

8013 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);

8014 sColl.flags &= ~UCOL_ITER_NORM;

8015

8016 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu s);

8017 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);

8018 tColl.flags &= ~UCOL_ITER_NORM;

8019 }

8020

8021 return ucol_strcollRegular(&sColl, &tColl, status);

8022 }

8023

8024 static inline uint32_t

8025 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,

8026 uint32_t CE, const char s, int32_t index, int32_t le n)

8027 {

8028 const UChar UCharOffset = (UChar )coll->image+getContractOffset(CE&0xFFF);

8029 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;

8030 int32_t offset = 1;

8031 UChar32 schar = 0, tchar = 0;

8032

8033 for(;;) {

8034 if (*index == len) {

8035 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);

8036 }

8037 U8_GET_OR_FFFD((const uint8_t)s, 0, index, len, schar);

8038 if (len < 0 && schar == 0) {

8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);

8040 }

8041

8042 while(schar > (tchar = (UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */

8043 offset++;

8044 }

8045

8046 if (schar == tchar) {

8047 U8_FWD_1(s, *index, len);

8048 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set+offset]);

8049 }

8050 else

8051 {

8052 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {

8053 return UCOL_BAIL_OUT_CE;

8054 }

8055 // skip completely ignorables

8056 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);

8057 if(isZeroCE == 0) { // we have to ignore completely ignorables

8058 U8_FWD_1(s, *index, len);

8059 continue;

8060 }

8061

8062 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);

8063 }

8064 }

8065 }

8066

8067 static inline UCollationResult

8068 ucol_strcollUseLatin1UTF8(

8069 const UCollator *coll,

8070 const char *source,

8071 int32_t sLen,

8072 const char *target,

8073 int32_t tLen,

8074 UErrorCode *status)

8075 {

8076 U_ALIGN_CODE(16);

8077 int32_t strength = coll->strength;

8078

8079 int32_t sIndex = 0, tIndex = 0;

8080 UChar32 sChar = 0, tChar = 0;

8081 uint32_t sOrder=0, tOrder=0;

8082

8083 UBool endOfSource = FALSE;

8084

8085 uint32_t *elements = coll->latinOneCEs;

8086

8087 UBool haveContractions = FALSE; // if we have contractions in our string

8088 // we cannot do French secondary

8089

8090 // Do the primary level

8091 for(;;) {

8092 while(sOrder==0) { // this loop skips primary ignorables

8093 // sOrder=getNextlatinOneCE(source);

8094 if (sIndex == sLen) {

8095 endOfSource = TRUE;

8096 break;

8097 }

8098 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);

8099 if (sLen < 0 && sChar == 0) {

8100 endOfSource = TRUE;

8101 sLen = sIndex;

8102 break;

8103 }

8104 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out ( sChar > 0xFF, but this is faster on win32)

8105 //fprintf(stderr, "R");

8106 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);

8107 }

8108 sOrder = elements[sChar];

8109 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special

8110 // specials can basically be either contractions or bail-out signs. If we get anything

8111 // else, we'll bail out anyway

8112 if(getCETag(sOrder) == CONTRACTION_TAG) {

8113 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);

8114 haveContractions = TRUE; // if there are contractions, we cannot do French secondary

8115 // However, if there are contractions in the table, but we always use just one char,

8116 // we might be able to do French. This should be checked out.

8117 }

8118 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {

8119 //fprintf(stderr, "S");

8120 return ucol_strcollRegularUTF8(coll, source, sLen, target, t Len, status);

8121 }

8122 }

8123 }

8124

8125 while(tOrder==0) { // this loop skips primary ignorables

8126 // tOrder=getNextlatinOneCE(target);

8127 if (tIndex == tLen) {

8128 if(endOfSource) {

8129 goto endOfPrimLoopU8;

8130 } else {

8131 return UCOL_GREATER;

8132 }

8133 }

8134 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);

8135 if (tLen < 0 && tChar == 0) {

8136 if(endOfSource) {

8137 tLen = tIndex;

8138 goto endOfPrimLoopU8;

8139 } else {

8140 return UCOL_GREATER;

8141 }

8142 }

8143 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32)

8144 //fprintf(stderr, "R");

8145 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);

8146 }

8147 tOrder = elements[tChar];

8148 if(tOrder >= UCOL_NOT_FOUND) {

8149 // Handling specials, see the comments for source

8150 if(getCETag(tOrder) == CONTRACTION_TAG) {

8151 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);

8152 haveContractions = TRUE;

8153 }

8154 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {

8155 //fprintf(stderr, "S");

8156 return ucol_strcollRegularUTF8(coll, source, sLen, target, t Len, status);

8157 }

8158 }

8159 }

8160 if(endOfSource) { // source is finished, but target is not, say the result.

8161 return UCOL_LESS;

8162 }

8163

8164 if(sOrder == tOrder) { // if we have same CEs, we continue the loop

8165 sOrder = 0; tOrder = 0;

8166 continue;

8167 } else {

8168 // compare current top bytes

8169 if(((sOrder^tOrder)&0xFF000000)!=0) {

8170 // top bytes differ, return difference

8171 if(sOrder < tOrder) {

8172 return UCOL_LESS;

8173 } else if(sOrder > tOrder) {

8174 return UCOL_GREATER;

8175 }

8176 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24) ;

8177 // since we must return enum value

8178 }

8179

8180 // top bytes match, continue with following bytes

8181 sOrder<<=8;

8182 tOrder<<=8;

8183 }

8184 }

8185

8186 endOfPrimLoopU8:

8187 // after primary loop, we definitely know the sizes of strings,

8188 // so we set it and use simpler loop for secondaries and tertiaries

8189 sLen = sIndex; tLen = tIndex;

8190 if(strength >= UCOL_SECONDARY) {

8191 // adjust the table beginning

8192 elements += coll->latinOneTableLen;

8193 endOfSource = FALSE;

8194

8195 if(coll->frenchCollation == UCOL_OFF) { // non French

8196 // This loop is a simplified copy of primary loop

8197 // at this point we know that whole strings are latin-1, so we don't

8198 // check for that. We also know that we only have contractions as

8199 // specials.

8200 sIndex = 0; tIndex = 0;

8201 for(;;) {

8202 while(sOrder==0) {

8203 if(sIndex==sLen) {

8204 endOfSource = TRUE;

8205 break;

8206 }

8207 U_ASSERT(sLen >= 0);

8208 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);

8209 U_ASSERT(sChar >= 0 && sChar <= 0xFF);

8210 sOrder = elements[sChar];

8211 if(sOrder > UCOL_NOT_FOUND) {

8212 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO NDARY, sOrder, source, &sIndex, sLen);

8213 }

8214 }

8215

8216 while(tOrder==0) {

8217 if(tIndex==tLen) {

8218 if(endOfSource) {

8219 goto endOfSecLoopU8;

8220 } else {

8221 return UCOL_GREATER;

8222 }

8223 }

8224 U_ASSERT(tLen >= 0);

8225 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);

8226 U_ASSERT(tChar >= 0 && tChar <= 0xFF);

8227 tOrder = elements[tChar];

8228 if(tOrder > UCOL_NOT_FOUND) {

8229 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO NDARY, tOrder, target, &tIndex, tLen);

8230 }

8231 }

8232 if(endOfSource) {

8233 return UCOL_LESS;

8234 }

8235

8236 if(sOrder == tOrder) {

8237 sOrder = 0; tOrder = 0;

8238 continue;

8239 } else {

8240 // see primary loop for comments on this

8241 if(((sOrder^tOrder)&0xFF000000)!=0) {

8242 if(sOrder < tOrder) {

8243 return UCOL_LESS;

8244 } else if(sOrder > tOrder) {

8245 return UCOL_GREATER;

8246 }

8247 }

8248 sOrder<<=8;

8249 tOrder<<=8;

8250 }

8251 }

8252 } else { // French

8253 if(haveContractions) { // if we have contractions, we have to bail out

8254 // since we don't really know how to handle them here

8255 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);

8256 }

8257 // For French, we go backwards

8258 sIndex = sLen; tIndex = tLen;

8259 for(;;) {

8260 while(sOrder==0) {

8261 if(sIndex==0) {

8262 endOfSource = TRUE;

8263 break;

8264 }

8265 U8_PREV_OR_FFFD(source, 0, sIndex, sChar);

8266 U_ASSERT(sChar >= 0 && sChar <= 0xFF);

8267 sOrder = elements[sChar];

8268 // don't even look for contractions

8269 }

8270

8271 while(tOrder==0) {

8272 if(tIndex==0) {

8273 if(endOfSource) {

8274 goto endOfSecLoopU8;

8275 } else {

8276 return UCOL_GREATER;

8277 }

8278 }

8279 U8_PREV_OR_FFFD(target, 0, tIndex, tChar);

8280 U_ASSERT(tChar >= 0 && tChar <= 0xFF);

8281 tOrder = elements[tChar];

8282 // don't even look for contractions

8283 }

8284 if(endOfSource) {

8285 return UCOL_LESS;

8286 }

8287

8288 if(sOrder == tOrder) {

8289 sOrder = 0; tOrder = 0;

8290 continue;

8291 } else {

8292 // see the primary loop for comments

8293 if(((sOrder^tOrder)&0xFF000000)!=0) {

8294 if(sOrder < tOrder) {

8295 return UCOL_LESS;

8296 } else if(sOrder > tOrder) {

8297 return UCOL_GREATER;

8298 }

8299 }

8300 sOrder<<=8;

8301 tOrder<<=8;

8302 }

8303 }

8304 }

8305 }

8306

8307 endOfSecLoopU8:

8308 if(strength >= UCOL_TERTIARY) {

8309 // tertiary loop is the same as secondary (except no French)

8310 elements += coll->latinOneTableLen;

8311 sIndex = 0; tIndex = 0;

8312 endOfSource = FALSE;

8313 for(;;) {

8314 while(sOrder==0) {

8315 if(sIndex==sLen) {

8316 endOfSource = TRUE;

8317 break;

8318 }

8319 U_ASSERT(sLen >= 0);

8320 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);

8321 U_ASSERT(sChar >= 0 && sChar <= 0xFF);

8322 sOrder = elements[sChar];

8323 if(sOrder > UCOL_NOT_FOUND) {

8324 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY , sOrder, source, &sIndex, sLen);

8325 }

8326 }

8327 while(tOrder==0) {

8328 if(tIndex==tLen) {

8329 if(endOfSource) {

8330 return UCOL_EQUAL; // if both strings are at the end, they are equal

8331 } else {

8332 return UCOL_GREATER;

8333 }

8334 }

8335 U_ASSERT(tLen >= 0);

8336 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);

8337 U_ASSERT(tChar >= 0 && tChar <= 0xFF);

8338 tOrder = elements[tChar];

8339 if(tOrder > UCOL_NOT_FOUND) {

8340 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY , tOrder, target, &tIndex, tLen);

8341 }

8342 }

8343 if(endOfSource) {

8344 return UCOL_LESS;

8345 }

8346 if(sOrder == tOrder) {

8347 sOrder = 0; tOrder = 0;

8348 continue;

8349 } else {

8350 if(((sOrder^tOrder)&0xff000000)!=0) {

8351 if(sOrder < tOrder) {

8352 return UCOL_LESS;

8353 } else if(sOrder > tOrder) {

8354 return UCOL_GREATER;

8355 }

8356 }

8357 sOrder<<=8;

8358 tOrder<<=8;

8359 }

8360 }

8361 }

8362 return UCOL_EQUAL;

8363 }	406 }

8364	407

8365 U_CAPI UCollationResult U_EXPORT2	408 U_CAPI UCollationResult U_EXPORT2

8366 ucol_strcollIter( const UCollator *coll,	409 ucol_strcollIter( const UCollator *coll,

8367 UCharIterator *sIter,	410 UCharIterator *sIter,

8368 UCharIterator *tIter,	411 UCharIterator *tIter,

8369 UErrorCode *status)	412 UErrorCode *status)

8370 {	413 {

8371 if(!status \|\| U_FAILURE(*status)) {	414 if(!status \|\| U_FAILURE(*status)) {

8372 return UCOL_EQUAL;	415 return UCOL_EQUAL;

8373 }	416 }

8374	417

8375 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);	418 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);

8376 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt er);	419 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt er);

8377	420

8378 if (sIter == tIter) {

8379 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)

8380 return UCOL_EQUAL;

8381 }

8382 if(sIter == NULL \|\| tIter == NULL \|\| coll == NULL) {	421 if(sIter == NULL \|\| tIter == NULL \|\| coll == NULL) {

8383 *status = U_ILLEGAL_ARGUMENT_ERROR;	422 *status = U_ILLEGAL_ARGUMENT_ERROR;

8384 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)	423 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8385 return UCOL_EQUAL;	424 return UCOL_EQUAL;

8386 }	425 }

8387	426

8388 UCollationResult result = UCOL_EQUAL;	427 UCollationResult result = Collator::fromUCollator(coll)->compare(sIter, tI ter, *status);

8389	428

8390 // Preparing the context objects for iterating over strings	429 UTRACE_EXIT_VALUE_STATUS(result, *status);

8391 collIterate sColl, tColl;

8392 IInit_collIterate(coll, NULL, -1, &sColl, status);

8393 IInit_collIterate(coll, NULL, -1, &tColl, status);

8394 if(U_FAILURE(*status)) {

8395 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)

8396 return UCOL_EQUAL;

8397 }

8398 // The division for the array length may truncate the array size to

8399 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high

8400 // for all platforms anyway.

8401 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

8402 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];

8403 UNormIterator sNormIter = NULL, tNormIter = NULL;

8404

8405 sColl.iterator = sIter;

8406 sColl.flags \|= UCOL_USE_ITERATOR;

8407 tColl.flags \|= UCOL_USE_ITERATOR;

8408 tColl.iterator = tIter;

8409

8410 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {

8411 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu s);

8412 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);

8413 sColl.flags &= ~UCOL_ITER_NORM;

8414

8415 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu s);

8416 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);

8417 tColl.flags &= ~UCOL_ITER_NORM;

8418 }

8419

8420 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;

8421

8422 while((sChar = sColl.iterator->next(sColl.iterator)) ==

8423 (tChar = tColl.iterator->next(tColl.iterator))) {

8424 if(sChar == U_SENTINEL) {

8425 result = UCOL_EQUAL;

8426 goto end_compare;

8427 }

8428 }

8429

8430 if(sChar == U_SENTINEL) {

8431 tChar = tColl.iterator->previous(tColl.iterator);

8432 }

8433

8434 if(tChar == U_SENTINEL) {

8435 sChar = sColl.iterator->previous(sColl.iterator);

8436 }

8437

8438 sChar = sColl.iterator->previous(sColl.iterator);

8439 tChar = tColl.iterator->previous(tColl.iterator);

8440

8441 if (ucol_unsafeCP((UChar)sChar, coll) \|\| ucol_unsafeCP((UChar)tChar, coll))

8442 {

8443 // We are stopped in the middle of a contraction.

8444 // Scan backwards through the == part of the string looking for the start of the contraction.

8445 // It doesn't matter which string we scan, since they are the same in this region.

8446 do

8447 {

8448 sChar = sColl.iterator->previous(sColl.iterator);

8449 tChar = tColl.iterator->previous(tColl.iterator);

8450 }

8451 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));

8452 }

8453

8454

8455 if(U_SUCCESS(*status)) {

8456 result = ucol_strcollRegular(&sColl, &tColl, status);

8457 }

8458

8459 end_compare:

8460 if(sNormIter \|\| tNormIter) {

8461 unorm_closeIter(sNormIter);

8462 unorm_closeIter(tNormIter);

8463 }

8464

8465 UTRACE_EXIT_VALUE_STATUS(result, *status)

8466 return result;	430 return result;

8467 }	431 }

8468	432

8469	433

8470 /* */	434 /* */

8471 /* ucol_strcoll Main public API string comparison function */	435 /* ucol_strcoll Main public API string comparison function */

8472 /* */	436 /* */

8473 U_CAPI UCollationResult U_EXPORT2	437 U_CAPI UCollationResult U_EXPORT2

8474 ucol_strcoll( const UCollator *coll,	438 ucol_strcoll( const UCollator *coll,

8475 const UChar *source,	439 const UChar *source,

8476 int32_t sourceLength,	440 int32_t sourceLength,

8477 const UChar *target,	441 const UChar *target,

8478 int32_t targetLength)	442 int32_t targetLength)

8479 {	443 {

8480 U_ALIGN_CODE(16);	444 U_ALIGN_CODE(16);

8481	445

8482 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);	446 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);

8483 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {	447 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {

8484 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);	448 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);

8485 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt h);	449 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt h);

8486 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt h);	450 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt h);

8487 }	451 }

8488	452

8489 if((source == NULL && sourceLength != 0) \|\| (target == NULL && targetLength != 0)) {

8490 // do not crash, but return. Should have

8491 // status argument to return error.

8492 UTRACE_EXIT_VALUE(UCOL_EQUAL);

8493 return UCOL_EQUAL;

8494 }

8495

8496 /* Quick check if source and target are same strings. */

8497 /* They should either both be NULL terminated or the explicit length should be set on both. */

8498 if (source==target && sourceLength==targetLength) {

8499 UTRACE_EXIT_VALUE(UCOL_EQUAL);

8500 return UCOL_EQUAL;

8501 }

8502

8503 if(coll->delegate != NULL) {

8504 UErrorCode status = U_ZERO_ERROR;

8505 return ((const Collator*)coll->delegate)->compare(source,sourceLength,targ et,targetLength, status);

8506 }

8507

8508 /* Scan the strings. Find: */

8509 /* The length of any leading portion that is equal */

8510 /* Whether they are exactly equal. (in which case we just return) */

8511 const UChar *pSrc = source;

8512 const UChar *pTarg = target;

8513 int32_t equalLength;

8514

8515 if (sourceLength == -1 && targetLength == -1) {

8516 // Both strings are null terminated.

8517 // Scan through any leading equal portion.

8518 while (pSrc == pTarg && *pSrc != 0) {

8519 pSrc++;

8520 pTarg++;

8521 }

8522 if (pSrc == 0 && pTarg == 0) {

8523 UTRACE_EXIT_VALUE(UCOL_EQUAL);

8524 return UCOL_EQUAL;

8525 }

8526 equalLength = (int32_t)(pSrc - source);

8527 }

8528 else

8529 {

8530 // One or both strings has an explicit length.

8531 const UChar *pSrcEnd = source + sourceLength;

8532 const UChar *pTargEnd = target + targetLength;

8533

8534 // Scan while the strings are bitwise ==, or until one is exhausted.

8535 for (;;) {

8536 if (pSrc == pSrcEnd \|\| pTarg == pTargEnd) {

8537 break;

8538 }

8539 if ((pSrc == 0 && sourceLength == -1) \|\| (pTarg == 0 && targetLeng th == -1)) {

8540 break;

8541 }

8542 if (pSrc != pTarg) {

8543 break;

8544 }

8545 pSrc++;

8546 pTarg++;

8547 }

8548 equalLength = (int32_t)(pSrc - source);

8549

8550 // If we made it all the way through both strings, we are done. They are ==

8551 if ((pSrc ==pSrcEnd \|\| (pSrcEnd <pSrc && pSrc==0)) && /* At end of src string, however it was specified. */

8552 (pTarg==pTargEnd \|\| (pTargEnd<pTarg && pTarg==0))) /* and also at end of dest string */

8553 {

8554 UTRACE_EXIT_VALUE(UCOL_EQUAL);

8555 return UCOL_EQUAL;

8556 }

8557 }

8558 if (equalLength > 0) {

8559 /* There is an identical portion at the beginning of the two strings. */

8560 /* If the identical portion ends within a contraction or a combining */

8561 /* character sequence, back up to the start of that sequence. */

8562

8563 // These values should already be set by the code above.

8564 //pSrc = source + equalLength; /* point to the first differing chars */

8565 //pTarg = target + equalLength;

8566 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) \|\|

8567 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))

8568 {

8569 // We are stopped in the middle of a contraction.

8570 // Scan backwards through the == part of the string looking for the start of the contraction.

8571 // It doesn't matter which string we scan, since they are the same in this region.

8572 do

8573 {

8574 equalLength--;

8575 pSrc--;

8576 }

8577 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));

8578 }

8579

8580 source += equalLength;

8581 target += equalLength;

8582 if (sourceLength > 0) {

8583 sourceLength -= equalLength;

8584 }

8585 if (targetLength > 0) {

8586 targetLength -= equalLength;

8587 }

8588 }

8589

8590 UErrorCode status = U_ZERO_ERROR;	453 UErrorCode status = U_ZERO_ERROR;

8591 UCollationResult returnVal;	454 UCollationResult returnVal = Collator::fromUCollator(coll)->

8592 if(!coll->latinOneUse \|\| (sourceLength > 0 && source&0xff00) \|\| (targetLeng th > 0 && target&0xff00)) {	455 compare(source, sourceLength, target, targetLength, status);

8593 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ etLength, &status);	456 UTRACE_EXIT_VALUE_STATUS(returnVal, status);

8594 } else {

8595 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta rgetLength, &status);

8596 }

8597 UTRACE_EXIT_VALUE(returnVal);

8598 return returnVal;	457 return returnVal;

8599 }	458 }

8600	459

8601 U_CAPI UCollationResult U_EXPORT2	460 U_CAPI UCollationResult U_EXPORT2

8602 ucol_strcollUTF8(	461 ucol_strcollUTF8(

8603 const UCollator *coll,	462 const UCollator *coll,

8604 const char *source,	463 const char *source,

8605 int32_t sourceLength,	464 int32_t sourceLength,

8606 const char *target,	465 const char *target,

8607 int32_t targetLength,	466 int32_t targetLength,

8608 UErrorCode *status)	467 UErrorCode *status)

8609 {	468 {

8610 U_ALIGN_CODE(16);	469 U_ALIGN_CODE(16);

8611	470

8612 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);	471 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);

8613 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {	472 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {

8614 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);	473 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);

8615 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt h);	474 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt h);

8616 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt h);	475 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt h);

8617 }	476 }

8618	477

8619 if (U_FAILURE(*status)) {	478 if (U_FAILURE(*status)) {

8620 /* do nothing */	479 /* do nothing */

8621 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);	480 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8622 return UCOL_EQUAL;	481 return UCOL_EQUAL;

8623 }	482 }

8624	483

8625 if((source == NULL && sourceLength != 0) \|\| (target == NULL && targetLength != 0)) {	484 UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareU TF8(

8626 *status = U_ILLEGAL_ARGUMENT_ERROR;	485 source, sourceLength, target, targetLength, *status);

8627 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8628 return UCOL_EQUAL;

8629 }

8630

8631 /* Quick check if source and target are same strings. */

8632 /* They should either both be NULL terminated or the explicit length should be set on both. */

8633 if (source==target && sourceLength==targetLength) {

8634 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8635 return UCOL_EQUAL;

8636 }

8637

8638 if(coll->delegate != NULL) {

8639 return ((const Collator*)coll->delegate)->compareUTF8(

8640 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourc eLength),

8641 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targe tLength),

8642 *status);

8643 }

8644

8645 /* Scan the strings. Find: */

8646 /* The length of any leading portion that is equal */

8647 /* Whether they are exactly equal. (in which case we just return) */

8648 const char *pSrc = source;

8649 const char *pTarg = target;

8650 UBool bSrcLimit = FALSE;

8651 UBool bTargLimit = FALSE;

8652

8653 if (sourceLength == -1 && targetLength == -1) {

8654 // Both strings are null terminated.

8655 // Scan through any leading equal portion.

8656 while (pSrc == pTarg && *pSrc != 0) {

8657 pSrc++;

8658 pTarg++;

8659 }

8660 if (pSrc == 0 && pTarg == 0) {

8661 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8662 return UCOL_EQUAL;

8663 }

8664 bSrcLimit = (*pSrc == 0);

8665 bTargLimit = (*pTarg == 0);

8666 }

8667 else

8668 {

8669 // One or both strings has an explicit length.

8670 const char *pSrcEnd = source + sourceLength;

8671 const char *pTargEnd = target + targetLength;

8672

8673 // Scan while the strings are bitwise ==, or until one is exhausted.

8674 for (;;) {

8675 if (pSrc == pSrcEnd \|\| pTarg == pTargEnd) {

8676 break;

8677 }

8678 if ((pSrc == 0 && sourceLength == -1) \|\| (pTarg == 0 && targetLeng th == -1)) {

8679 break;

8680 }

8681 if (pSrc != pTarg) {

8682 break;

8683 }

8684 pSrc++;

8685 pTarg++;

8686 }

8687 bSrcLimit = (pSrc ==pSrcEnd \|\| (pSrcEnd <pSrc && *pSrc==0));

8688 bTargLimit = (pTarg==pTargEnd \|\| (pTargEnd<pTarg && *pTarg==0));

8689

8690 // If we made it all the way through both strings, we are done. They are ==

8691 if (bSrcLimit && /* At end of src string, however it was specified. */

8692 bTargLimit) /* and also at end of dest string */

8693 {

8694 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);

8695 return UCOL_EQUAL;

8696 }

8697 }

8698

8699 U_ASSERT(!(bSrcLimit && bTargLimit));

8700

8701 int32_t equalLength = pSrc - source;

8702 UBool bSawNonLatin1 = FALSE;

8703

8704 if (equalLength > 0) {

8705 // Align position to the start of UTF-8 code point.

8706 if (bTargLimit) {

8707 U8_SET_CP_START((const uint8_t*)source, 0, equalLength);

8708 } else {

8709 U8_SET_CP_START((const uint8_t*)target, 0, equalLength);

8710 }

8711 pSrc = source + equalLength;

8712 pTarg = target + equalLength;

8713 }

8714

8715 if (equalLength > 0) {

8716 /* There is an identical portion at the beginning of the two strings. */

8717 /* If the identical portion ends within a contraction or a combining */

8718 /* character sequence, back up to the start of that sequence. */

8719 UBool bUnsafeCP = FALSE;

8720 UChar32 uc32 = -1;

8721

8722 if (!bSrcLimit) {

8723 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);

8724 if (uc32 >= 0x10000 \|\| ucol_unsafeCP((UChar)uc32, coll)) {

8725 bUnsafeCP = TRUE;

8726 }

8727 bSawNonLatin1 \|= (uc32 > 0xff);

8728 }

8729 if (!bTargLimit) {

8730 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);

8731 if (uc32 >= 0x10000 \|\| ucol_unsafeCP((UChar)uc32, coll)) {

8732 bUnsafeCP = TRUE;

8733 }

8734 bSawNonLatin1 \|= (uc32 > 0xff);

8735 }

8736

8737 if (bUnsafeCP) {

8738 while (equalLength > 0) {

8739 // We are stopped in the middle of a contraction.

8740 // Scan backwards through the == part of the string looking for the start of the contraction.

8741 // It doesn't matter which string we scan, since they are the same in this region.

8742 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);

8743 bSawNonLatin1 \|= (uc32 > 0xff);

8744 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {

8745 break;

8746 }

8747 }

8748 }

8749 source += equalLength;

8750 target += equalLength;

8751 if (sourceLength > 0) {

8752 sourceLength -= equalLength;

8753 }

8754 if (targetLength > 0) {

8755 targetLength -= equalLength;

8756 }

8757 } else {

8758 // Lead byte of Latin 1 character is 0x00 - 0xC3

8759 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc 3);

8760 bSawNonLatin1 \|= (target && (targetLength != 0) && (uint8_t)*target > 0x c3);

8761 }

8762

8763 UCollationResult returnVal;

8764

8765 if(!coll->latinOneUse \|\| bSawNonLatin1) {

8766 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);

8767 } else {

8768 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target , targetLength, status);

8769 }

8770 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);	486 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);

8771 return returnVal;	487 return returnVal;

8772 }	488 }

8773	489

8774	490

8775 /* convenience function for comparing strings */	491 /* convenience function for comparing strings */

8776 U_CAPI UBool U_EXPORT2	492 U_CAPI UBool U_EXPORT2

8777 ucol_greater( const UCollator *coll,	493 ucol_greater( const UCollator *coll,

8778 const UChar *source,	494 const UChar *source,

8779 int32_t sourceLength,	495 int32_t sourceLength,

(...skipping 23 matching lines...) Expand all Loading...
8803 int32_t sourceLength,	519 int32_t sourceLength,

8804 const UChar *target,	520 const UChar *target,

8805 int32_t targetLength)	521 int32_t targetLength)

8806 {	522 {

8807 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)	523 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)

8808 == UCOL_EQUAL);	524 == UCOL_EQUAL);

8809 }	525 }

8810	526

8811 U_CAPI void U_EXPORT2	527 U_CAPI void U_EXPORT2

8812 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {	528 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {

8813 if(coll && coll->UCA) {	529 const Collator *c = Collator::fromUCollator(coll);

8814 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));	530 if(c != NULL) {

	531 UVersionInfo v;

	532 c->getVersion(v);

	533 // Note: This is tied to how the current implementation encodes the UCA version

	534 // in the overall getVersion().

	535 // Alternatively, we could load the root collator and get at lower-level data from there.

	536 // Either way, it will reflect the input collator's UCA version only

	537 // if it is a known implementation.

	538 // It would be cleaner to make this a virtual Collator method.

	539 info[0] = v[1] >> 3;

	540 info[1] = v[1] & 7;

	541 info[2] = v[2] >> 6;

	542 info[3] = 0;

8815 }	543 }

8816 }	544 }

8817	545

	546 U_CAPI const UChar * U_EXPORT2

	547 ucol_getRules(const UCollator coll, int32_t length) {

	548 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);

	549 // OK to crash if coll==NULL: We do not want to check "this" pointers.

	550 if(rbc != NULL \|\| coll == NULL) {

	551 const UnicodeString &rules = rbc->getRules();

	552 U_ASSERT(rules.getBuffer()[rules.length()] == 0);

	553 *length = rules.length();

	554 return rules.getBuffer();

	555 }

	556 static const UChar _NUL = 0;

	557 *length = 0;

	558 return &_NUL;

	559 }

	560

	561 U_CAPI int32_t U_EXPORT2

	562 ucol_getRulesEx(const UCollator coll, UColRuleOption delta, UChar buffer, int3 2_t bufferLen) {

	563 UnicodeString rules;

	564 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);

	565 if(rbc != NULL \|\| coll == NULL) {

	566 rbc->getRules(delta, rules);

	567 }

	568 if(buffer != NULL && bufferLen > 0) {

	569 UErrorCode errorCode = U_ZERO_ERROR;

	570 return rules.extract(buffer, bufferLen, errorCode);

	571 } else {

	572 return rules.length();

	573 }

	574 }

	575

	576 U_CAPI const char * U_EXPORT2

	577 ucol_getLocale(const UCollator coll, ULocDataLocaleType type, UErrorCode statu s) {

	578 return ucol_getLocaleByType(coll, type, status);

	579 }

	580

	581 U_CAPI const char * U_EXPORT2

	582 ucol_getLocaleByType(const UCollator coll, ULocDataLocaleType type, UErrorCode status) {

	583 if(U_FAILURE(*status)) {

	584 return NULL;

	585 }

	586 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);

	587 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);

	588

	589 const char *result;

	590 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);

	591 if(rbc == NULL && coll != NULL) {

	592 *status = U_UNSUPPORTED_ERROR;

	593 result = NULL;

	594 } else {

	595 result = rbc->internalGetLocaleID(type, *status);

	596 }

	597

	598 UTRACE_DATA1(UTRACE_INFO, "result = %s", result);

	599 UTRACE_EXIT_STATUS(*status);

	600 return result;

	601 }

	602

	603 U_CAPI USet * U_EXPORT2

	604 ucol_getTailoredSet(const UCollator coll, UErrorCode status) {

	605 if(U_FAILURE(*status)) {

	606 return NULL;

	607 }

	608 UnicodeSet set = Collator::fromUCollator(coll)->getTailoredSet(status);

	609 if(U_FAILURE(*status)) {

	610 delete set;

	611 return NULL;

	612 }

	613 return set->toUSet();

	614 }

	615

	616 U_CAPI UBool U_EXPORT2

	617 ucol_equals(const UCollator source, const UCollator target) {

	618 return source == target \|\|

	619 (Collator::fromUCollator(source)) == (Collator::fromUCollator(target)) ;

	620 }

	621

8818 #endif /* #if !UCONFIG_NO_COLLATION */	622 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « source/i18n/ucln_in.cpp ('k') | source/i18n/ucol_bld.h » ('j') | no next file with comments »