Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1426)

Side by Side Diff: icu46/source/i18n/ucol.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/i18n/ucln_in.c ('k') | icu46/source/i18n/ucol_bld.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framewo rk
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl iant
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
27
28 #include "ucol_imp.h"
29 #include "bocsu.h"
30
31 #include "normalizer2impl.h"
32 #include "unorm_it.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35 #include "ucln_in.h"
36 #include "cstring.h"
37 #include "utracimp.h"
38 #include "putilimp.h"
39 #include "uassert.h"
40
41 #ifdef UCOL_DEBUG
42 #include <stdio.h>
43 #endif
44
45 U_NAMESPACE_USE
46
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression next to sizeof or postfix operators.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8

#define ZERO_CC_LIMIT_ 0xC0
53
54 // this is static pointer to the normalizer fcdTrieIndex
55 // it is always the same between calls to u_cleanup
56 // and therefore writing to it is not synchronized.
57 // It is cleaned in ucol_cleanup
58 static const uint16_t *fcdTrieIndex=NULL;
59 // Code points at fcdHighStart and above have a zero FCD value.
60 static UChar32 fcdHighStart = 0;
61
62 // These are values from UCA required for
63 // implicit generation and supressing sort key compression
64 // they should regularly be in the UCA, but if one
65 // is running without UCA, it could be a problem
66 static const int32_t maxRegularPrimary = 0x7A;
67 static const int32_t minImplicitPrimary = 0xE0;
68 static const int32_t maxImplicitPrimary = 0xE4;
69
70 U_CDECL_BEGIN
71 static UBool U_CALLCONV
72 ucol_cleanup(void)
73 {
74 fcdTrieIndex = NULL;
75 return TRUE;
76 }
77
78 static int32_t U_CALLCONV
79 _getFoldingOffset(uint32_t data) {
80 return (int32_t)(data&0xFFFFFF);
81 }
82
83 U_CDECL_END
84
85 // init FCD data
86 static inline
87 UBool initializeFCD(UErrorCode *status) {
88 if (fcdTrieIndex != NULL) {
89 return TRUE;
90 } else {
91 // The result is constant, until the library is reloaded.
92 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
93 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
94 return U_SUCCESS(*status);
95 }
96 }
97
98 static
99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri ng,
100 int32_t sourceLen, collIterate *s,
101 UErrorCode *status)
102 {
103 (s)->string = (s)->pos = sourceString;
104 (s)->origFlags = 0;
105 (s)->flags = 0;
106 if (sourceLen >= 0) {
107 s->flags |= UCOL_ITER_HASLEN;
108 (s)->endp = (UChar *)sourceString+sourceLen;
109 }
110 else {
111 /* change to enable easier checking for end of string for fcdpositon */
112 (s)->endp = NULL;
113 }
114 (s)->extendCEs = NULL;
115 (s)->extendCEsSize = 0;
116 (s)->CEpos = (s)->toReturn = (s)->CEs;
117 (s)->offsetBuffer = NULL;
118 (s)->offsetBufferSize = 0;
119 (s)->offsetReturn = (s)->offsetStore = NULL;
120 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
121 (s)->coll = (collator);
122 (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
123 (s)->fcdPosition = 0;
124 if(collator->normalizationMode == UCOL_ON) {
125 (s)->flags |= UCOL_ITER_NORM;
126 }
127 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
128 (s)->flags |= UCOL_HIRAGANA_Q;
129 }
130 (s)->iterator = NULL;
131 //(s)->iteratorIndex = 0;
132 }
133
134 U_CAPI void U_EXPORT2
135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
136 int32_t sourceLen, collIterate *s,
137 UErrorCode *status) {
138 /* Out-of-line version for use from other files. */
139 IInit_collIterate(collator, sourceString, sourceLen, s, status);
140 }
141
142 U_CAPI collIterate * U_EXPORT2
143 uprv_new_collIterate(UErrorCode *status) {
144 if(U_FAILURE(*status)) {
145 return NULL;
146 }
147 collIterate *s = new collIterate;
148 if(s == NULL) {
149 *status = U_MEMORY_ALLOCATION_ERROR;
150 return NULL;
151 }
152 return s;
153 }
154
155 U_CAPI void U_EXPORT2
156 uprv_delete_collIterate(collIterate *s) {
157 delete s;
158 }
159
160 U_CAPI UBool U_EXPORT2
161 uprv_collIterateAtEnd(collIterate *s) {
162 return s == NULL || s->pos == s->endp;
163 }
164
165 /**
166 * Backup the state of the collIterate struct data
167 * @param data collIterate to backup
168 * @param backup storage
169 */
170 static
171 inline void backupState(const collIterate *data, collIterateState *backup)
172 {
173 backup->fcdPosition = data->fcdPosition;
174 backup->flags = data->flags;
175 backup->origFlags = data->origFlags;
176 backup->pos = data->pos;
177 backup->bufferaddress = data->writableBuffer.getBuffer();
178 backup->buffersize = data->writableBuffer.length();
179 backup->iteratorMove = 0;
180 backup->iteratorIndex = 0;
181 if(data->iterator != NULL) {
182 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER _CURRENT);
183 backup->iteratorIndex = data->iterator->getState(data->iterator);
184 // no we try to fixup if we're using a normalizing iterator and we get U ITER_NO_STATE
185 if(backup->iteratorIndex == UITER_NO_STATE) {
186 while((backup->iteratorIndex = data->iterator->getState(data->iterat or)) == UITER_NO_STATE) {
187 backup->iteratorMove++;
188 data->iterator->move(data->iterator, -1, UITER_CURRENT);
189 }
190 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR RENT);
191 }
192 }
193 }
194
195 /**
196 * Loads the state into the collIterate struct data
197 * @param data collIterate to backup
198 * @param backup storage
199 * @param forwards boolean to indicate if forwards iteration is used,
200 * false indicates backwards iteration
201 */
202 static
203 inline void loadState(collIterate *data, const collIterateState *backup,
204 UBool forwards)
205 {
206 UErrorCode status = U_ZERO_ERROR;
207 data->flags = backup->flags;
208 data->origFlags = backup->origFlags;
209 if(data->iterator != NULL) {
210 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO );
211 data->iterator->setState(data->iterator, backup->iteratorIndex, &status) ;
212 if(backup->iteratorMove != 0) {
213 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR RENT);
214 }
215 }
216 data->pos = backup->pos;
217
218 if ((data->flags & UCOL_ITER_INNORMBUF) &&
219 data->writableBuffer.getBuffer() != backup->bufferaddress) {
220 /*
221 this is when a new buffer has been reallocated and we'll have to
222 calculate the new position.
223 note the new buffer has to contain the contents of the old buffer.
224 */
225 if (forwards) {
226 data->pos = data->writableBuffer.getTerminatedBuffer() +
227 (data->pos - backup->bufferaddress);
228 }
229 else {
230 /* backwards direction */
231 int32_t temp = backup->buffersize -
232 (int32_t)(data->pos - backup->bufferaddress);
233 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writ ableBuffer.length() - temp);
234 }
235 }
236 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
237 /*
238 this is alittle tricky.
239 if we are initially not in the normalization buffer, even if we
240 normalize in the later stage, the data in the buffer will be
241 ignored, since we skip back up to the data string.
242 however if we are already in the normalization buffer, any
243 further normalization will pull data into the normalization
244 buffer and modify the fcdPosition.
245 since we are keeping the data in the buffer for use, the
246 fcdPosition can not be reverted back.
247 arrgghh....
248 */
249 data->fcdPosition = backup->fcdPosition;
250 }
251 }
252
253 static UBool
254 reallocCEs(collIterate *data, int32_t newCapacity) {
255 uint32_t *oldCEs = data->extendCEs;
256 if(oldCEs == NULL) {
257 oldCEs = data->CEs;
258 }
259 int32_t length = data->CEpos - oldCEs;
260 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
261 if(newCEs == NULL) {
262 return FALSE;
263 }
264 uprv_memcpy(newCEs, oldCEs, length * 4);
265 uprv_free(data->extendCEs);
266 data->extendCEs = newCEs;
267 data->extendCEsSize = newCapacity;
268 data->CEpos = newCEs + length;
269 return TRUE;
270 }
271
272 static UBool
273 increaseCEsCapacity(collIterate *data) {
274 int32_t oldCapacity;
275 if(data->extendCEs != NULL) {
276 oldCapacity = data->extendCEsSize;
277 } else {
278 oldCapacity = LENGTHOF(data->CEs);
279 }
280 return reallocCEs(data, 2 * oldCapacity);
281 }
282
283 static UBool
284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
285 int32_t oldCapacity;
286 if(data->extendCEs != NULL) {
287 oldCapacity = data->extendCEsSize;
288 } else {
289 oldCapacity = LENGTHOF(data->CEs);
290 }
291 if(minCapacity <= oldCapacity) {
292 return TRUE;
293 }
294 oldCapacity *= 2;
295 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacit y);
296 }
297
298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
299 if(U_FAILURE(errorCode)) {
300 return;
301 }
302 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuf fer);
303 if(length >= offsetBufferSize) {
304 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
305 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
306 if(newBuffer == NULL) {
307 errorCode = U_MEMORY_ALLOCATION_ERROR;
308 return;
309 }
310 if(length > 0) {
311 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
312 }
313 uprv_free(offsetBuffer);
314 offsetBuffer = newBuffer;
315 offsetStore = offsetBuffer + length;
316 offsetBufferSize = newCapacity;
317 }
318 *offsetStore++ = offset;
319 }
320
321 /*
322 * collIter_eos()
323 * Checks for a collIterate being positioned at the end of
324 * its source string.
325 *
326 */
327 static
328 inline UBool collIter_eos(collIterate *s) {
329 if(s->flags & UCOL_USE_ITERATOR) {
330 return !(s->iterator->hasNext(s->iterator));
331 }
332 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
333 // Null terminated string, but not at null, so not at end.
334 // Whether in main or normalization buffer doesn't matter.
335 return FALSE;
336 }
337
338 // String with length. Can't be in normalization buffer, which is always
339 // null termintated.
340 if (s->flags & UCOL_ITER_HASLEN) {
341 return (s->pos == s->endp);
342 }
343
344 // We are at a null termination, could be either normalization buffer or mai n string.
345 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
346 // At null at end of main string.
347 return TRUE;
348 }
349
350 // At null at end of normalization buffer. Need to check whether there ther e are
351 // any characters left in the main buffer.
352 if(s->origFlags & UCOL_USE_ITERATOR) {
353 return !(s->iterator->hasNext(s->iterator));
354 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
355 // Null terminated main string. fcdPosition is the 'return' position in to main buf.
356 return (*s->fcdPosition == 0);
357 }
358 else {
359 // Main string with an end pointer.
360 return s->fcdPosition == s->endp;
361 }
362 }
363
364 /*
365 * collIter_bos()
366 * Checks for a collIterate being positioned at the start of
367 * its source string.
368 *
369 */
370 static
371 inline UBool collIter_bos(collIterate *source) {
372 // if we're going backwards, we need to know whether there is more in the
373 // iterator, even if we are in the side buffer
374 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
375 return !source->iterator->hasPrevious(source->iterator);
376 }
377 if (source->pos <= source->string ||
378 ((source->flags & UCOL_ITER_INNORMBUF) &&
379 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
380 return TRUE;
381 }
382 return FALSE;
383 }
384
385 /*static
386 inline UBool collIter_SimpleBos(collIterate *source) {
387 // if we're going backwards, we need to know whether there is more in the
388 // iterator, even if we are in the side buffer
389 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
390 return !source->iterator->hasPrevious(source->iterator);
391 }
392 if (source->pos == source->string) {
393 return TRUE;
394 }
395 return FALSE;
396 }*/
397 //return (data->pos == data->string) ||
398
399
400 /****************************************************************************/
401 /* Following are the open/close functions */
402 /* */
403 /****************************************************************************/
404
405 static UCollator*
406 ucol_initFromBinary(const uint8_t *bin, int32_t length,
407 const UCollator *base,
408 UCollator *fillIn,
409 UErrorCode *status)
410 {
411 UCollator *result = fillIn;
412 if(U_FAILURE(*status)) {
413 return NULL;
414 }
415 /*
416 if(base == NULL) {
417 // we don't support null base yet
418 *status = U_ILLEGAL_ARGUMENT_ERROR;
419 return NULL;
420 }
421 */
422 // We need these and we could be running without UCA
423 uprv_uca_initImplicitConstants(status);
424 UCATableHeader *colData = (UCATableHeader *)bin;
425 // do we want version check here? We're trying to figure out whether collato rs are compatible
426 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeo f(UVersionInfo)) != 0 ||
427 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersio nInfo)) != 0)) ||
428 colData->version[0] != UCOL_BUILDER_VERSION)
429 {
430 *status = U_COLLATOR_VERSION_MISMATCH;
431 return NULL;
432 }
433 else {
434 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(s izeof(UColOptionSet)))) {
435 result = ucol_initCollator((const UCATableHeader *)bin, result, base , status);
436 if(U_FAILURE(*status)){
437 return NULL;
438 }
439 result->hasRealData = TRUE;
440 }
441 else {
442 if(base) {
443 result = ucol_initCollator(base->image, result, base, status);
444 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
445 if(U_FAILURE(*status)){
446 return NULL;
447 }
448 result->hasRealData = FALSE;
449 }
450 else {
451 *status = U_USELESS_COLLATOR_ERROR;
452 return NULL;
453 }
454 }
455 result->freeImageOnClose = FALSE;
456 }
457 result->actualLocale = NULL;
458 result->validLocale = NULL;
459 result->requestedLocale = NULL;
460 result->rules = NULL;
461 result->rulesLength = 0;
462 result->freeRulesOnClose = FALSE;
463 result->ucaRules = NULL;
464 return result;
465 }
466
467 U_CAPI UCollator* U_EXPORT2
468 ucol_openBinary(const uint8_t *bin, int32_t length,
469 const UCollator *base,
470 UErrorCode *status)
471 {
472 return ucol_initFromBinary(bin, length, base, NULL, status);
473 }
474
475 U_CAPI int32_t U_EXPORT2
476 ucol_cloneBinary(const UCollator *coll,
477 uint8_t *buffer, int32_t capacity,
478 UErrorCode *status)
479 {
480 int32_t length = 0;
481 if(U_FAILURE(*status)) {
482 return length;
483 }
484 if(capacity < 0) {
485 *status = U_ILLEGAL_ARGUMENT_ERROR;
486 return length;
487 }
488 if(coll->hasRealData == TRUE) {
489 length = coll->image->size;
490 if(length <= capacity) {
491 uprv_memcpy(buffer, coll->image, length);
492 } else {
493 *status = U_BUFFER_OVERFLOW_ERROR;
494 }
495 } else {
496 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof( UColOptionSet)));
497 if(length <= capacity) {
498 /* build the UCATableHeader with minimal entries */
499 /* do not copy the header from the UCA file because its values are w rong! */
500 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
501
502 /* reset everything */
503 uprv_memset(buffer, 0, length);
504
505 /* set the tailoring-specific values */
506 UCATableHeader *myData = (UCATableHeader *)buffer;
507 myData->size = length;
508
509 /* offset for the options, the only part of the data that is present after the header */
510 myData->options = sizeof(UCATableHeader);
511
512 /* need to always set the expansion value for an upper bound of the options */
513 myData->expansion = myData->options + sizeof(UColOptionSet);
514
515 myData->magic = UCOL_HEADER_MAGIC;
516 myData->isBigEndian = U_IS_BIG_ENDIAN;
517 myData->charSetFamily = U_CHARSET_FAMILY;
518
519 /* copy UCA's version; genrb will override all but the builder versi on with tailoring data */
520 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn fo));
521
522 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer sionInfo));
523 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer sionInfo));
524 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo f(UVersionInfo));
525 myData->jamoSpecial = coll->image->jamoSpecial;
526
527 /* copy the collator options */
528 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options , sizeof(UColOptionSet));
529 } else {
530 *status = U_BUFFER_OVERFLOW_ERROR;
531 }
532 }
533 return length;
534 }
535
536 U_CAPI UCollator* U_EXPORT2
537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
538 {
539 UCollator * localCollator;
540 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
541 char *stackBufferChars = (char *)stackBuffer;
542 int32_t imageSize = 0;
543 int32_t rulesSize = 0;
544 int32_t rulesPadding = 0;
545 uint8_t *image;
546 UChar *rules;
547 UBool colAllocated = FALSE;
548 UBool imageAllocated = FALSE;
549
550 if (status == NULL || U_FAILURE(*status)){
551 return 0;
552 }
553 if ((stackBuffer && !pBufferSize) || !coll){
554 *status = U_ILLEGAL_ARGUMENT_ERROR;
555 return 0;
556 }
557 if (coll->rules && coll->freeRulesOnClose) {
558 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
559 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
560 bufferSizeNeeded += rulesSize + rulesPadding;
561 }
562
563 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set neede d size into *pBufferSize */
564 *pBufferSize = bufferSizeNeeded;
565 return 0;
566 }
567
568 /* Pointers on 64-bit platforms need to be aligned
569 * on a 64-bit boundry in memory.
570 */
571 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
572 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
573 if (*pBufferSize > offsetUp) {
574 *pBufferSize -= offsetUp;
575 stackBufferChars += offsetUp;
576 }
577 else {
578 /* prevent using the stack buffer but keep the size > 0 so that we d o not just preflight */
579 *pBufferSize = 1;
580 }
581 }
582 stackBuffer = (void *)stackBufferChars;
583
584 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
585 /* allocate one here...*/
586 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
587 // Null pointer check.
588 if (stackBufferChars == NULL) {
589 *status = U_MEMORY_ALLOCATION_ERROR;
590 return NULL;
591 }
592 colAllocated = TRUE;
593 if (U_SUCCESS(*status)) {
594 *status = U_SAFECLONE_ALLOCATED_WARNING;
595 }
596 }
597 localCollator = (UCollator *)stackBufferChars;
598 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
599 {
600 UErrorCode tempStatus = U_ZERO_ERROR;
601 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
602 }
603 if (coll->freeImageOnClose) {
604 image = (uint8_t *)uprv_malloc(imageSize);
605 // Null pointer check
606 if (image == NULL) {
607 *status = U_MEMORY_ALLOCATION_ERROR;
608 return NULL;
609 }
610 ucol_cloneBinary(coll, image, imageSize, status);
611 imageAllocated = TRUE;
612 }
613 else {
614 image = (uint8_t *)coll->image;
615 }
616 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollat or, status);
617 if (U_FAILURE(*status)) {
618 return NULL;
619 }
620
621 if (coll->rules) {
622 if (coll->freeRulesOnClose) {
623 localCollator->rules = u_strcpy(rules, coll->rules);
624 //bufferEnd += rulesSize;
625 }
626 else {
627 localCollator->rules = coll->rules;
628 }
629 localCollator->freeRulesOnClose = FALSE;
630 localCollator->rulesLength = coll->rulesLength;
631 }
632
633 int32_t i;
634 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
635 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(col l, (UColAttribute)i, status), status);
636 }
637 // zero copies of pointers
638 localCollator->actualLocale = NULL;
639 localCollator->validLocale = NULL;
640 localCollator->requestedLocale = NULL;
641 localCollator->ucaRules = coll->ucaRules; // There should only be one copy h ere.
642 localCollator->freeOnClose = colAllocated;
643 localCollator->freeImageOnClose = imageAllocated;
644 return localCollator;
645 }
646
647 U_CAPI void U_EXPORT2
648 ucol_close(UCollator *coll)
649 {
650 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
651 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
652 if(coll != NULL) {
653 // these are always owned by each UCollator struct,
654 // so we always free them
655 if(coll->validLocale != NULL) {
656 uprv_free(coll->validLocale);
657 }
658 if(coll->actualLocale != NULL) {
659 uprv_free(coll->actualLocale);
660 }
661 if(coll->requestedLocale != NULL) {
662 uprv_free(coll->requestedLocale);
663 }
664 if(coll->latinOneCEs != NULL) {
665 uprv_free(coll->latinOneCEs);
666 }
667 if(coll->options != NULL && coll->freeOptionsOnClose) {
668 uprv_free(coll->options);
669 }
670 if(coll->rules != NULL && coll->freeRulesOnClose) {
671 uprv_free((UChar *)coll->rules);
672 }
673 if(coll->image != NULL && coll->freeImageOnClose) {
674 uprv_free((UCATableHeader *)coll->image);
675 }
676 if(coll->leadBytePermutationTable != NULL) {
677 uprv_free(coll->leadBytePermutationTable);
678 }
679 if(coll->reorderCodes != NULL) {
680 uprv_free(coll->reorderCodes);
681 }
682
683 /* Here, it would be advisable to close: */
684 /* - UData for UCA (unless we stuff it in the root resb */
685 /* Again, do we need additional housekeeping... HMMM! */
686 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
687 if(coll->freeOnClose){
688 /* for safeClone, if freeOnClose is FALSE,
689 don't free the other instance data */
690 uprv_free(coll);
691 }
692 }
693 UTRACE_EXIT();
694 }
695
696 /* This one is currently used by genrb & tests. After constructing from rules (t ailoring),*/
697 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
698 U_CFUNC uint8_t* U_EXPORT2
699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
700 {
701 uint8_t *result = NULL;
702 if(U_FAILURE(*status)) {
703 return NULL;
704 }
705 if(coll->hasRealData == TRUE) {
706 *length = coll->image->size;
707 result = (uint8_t *)uprv_malloc(*length);
708 /* test for NULL */
709 if (result == NULL) {
710 *status = U_MEMORY_ALLOCATION_ERROR;
711 return NULL;
712 }
713 uprv_memcpy(result, coll->image, *length);
714 } else {
715 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof (UColOptionSet)));
716 result = (uint8_t *)uprv_malloc(*length);
717 /* test for NULL */
718 if (result == NULL) {
719 *status = U_MEMORY_ALLOCATION_ERROR;
720 return NULL;
721 }
722
723 /* build the UCATableHeader with minimal entries */
724 /* do not copy the header from the UCA file because its values are wrong ! */
725 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
726
727 /* reset everything */
728 uprv_memset(result, 0, *length);
729
730 /* set the tailoring-specific values */
731 UCATableHeader *myData = (UCATableHeader *)result;
732 myData->size = *length;
733
734 /* offset for the options, the only part of the data that is present aft er the header */
735 myData->options = sizeof(UCATableHeader);
736
737 /* need to always set the expansion value for an upper bound of the opti ons */
738 myData->expansion = myData->options + sizeof(UColOptionSet);
739
740 myData->magic = UCOL_HEADER_MAGIC;
741 myData->isBigEndian = U_IS_BIG_ENDIAN;
742 myData->charSetFamily = U_CHARSET_FAMILY;
743
744 /* copy UCA's version; genrb will override all but the builder version w ith tailoring data */
745 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)) ;
746
747 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersion Info));
748 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersion Info));
749 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UV ersionInfo));
750 myData->jamoSpecial = coll->image->jamoSpecial;
751
752 /* copy the collator options */
753 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, si zeof(UColOptionSet));
754 }
755 return result;
756 }
757
758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo de *status) {
759 if(U_FAILURE(*status)) {
760 return;
761 }
762 result->caseFirst = (UColAttributeValue)opts->caseFirst;
763 result->caseLevel = (UColAttributeValue)opts->caseLevel;
764 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
765 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
766 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
767 return;
768 }
769 result->strength = (UColAttributeValue)opts->strength;
770 result->variableTopValue = opts->variableTopValue;
771 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
772 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
773 result->numericCollation = (UColAttributeValue)opts->numericCollation;
774 result->caseFirstisDefault = TRUE;
775 result->caseLevelisDefault = TRUE;
776 result->frenchCollationisDefault = TRUE;
777 result->normalizationModeisDefault = TRUE;
778 result->strengthisDefault = TRUE;
779 result->variableTopValueisDefault = TRUE;
780 result->alternateHandlingisDefault = TRUE;
781 result->hiraganaQisDefault = TRUE;
782 result->numericCollationisDefault = TRUE;
783
784 ucol_updateInternalState(result, status);
785
786 result->options = opts;
787 }
788
789
790 /**
791 * Approximate determination if a character is at a contraction end.
792 * Guaranteed to be TRUE if a character is at the end of a contraction,
793 * otherwise it is not deterministic.
794 * @param c character to be determined
795 * @param coll collator
796 */
797 static
798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
799 if (c < coll->minContrEndCP) {
800 return FALSE;
801 }
802
803 int32_t hash = c;
804 uint8_t htbyte;
805 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
806 if (U16_IS_TRAIL(c)) {
807 return TRUE;
808 }
809 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
810 }
811 htbyte = coll->contrEndCP[hash>>3];
812 return (((htbyte >> (hash & 7)) & 1) == 1);
813 }
814
815
816
817 /*
818 * i_getCombiningClass()
819 * A fast, at least partly inline version of u_getCombiningClass()
820 * This is a candidate for further optimization. Used heavily
821 * in contraction processing.
822 */
823 static
824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
825 uint8_t sCC = 0;
826 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
827 sCC = u_getCombiningClass(c);
828 }
829 return sCC;
830 }
831
832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con st UCollator *UCA, UErrorCode *status) {
833 UChar c;
834 UCollator *result = fillIn;
835 if(U_FAILURE(*status) || image == NULL) {
836 return NULL;
837 }
838
839 if(result == NULL) {
840 result = (UCollator *)uprv_malloc(sizeof(UCollator));
841 if(result == NULL) {
842 *status = U_MEMORY_ALLOCATION_ERROR;
843 return result;
844 }
845 result->freeOnClose = TRUE;
846 } else {
847 result->freeOnClose = FALSE;
848 }
849
850 result->image = image;
851 result->mapping.getFoldingOffset = _getFoldingOffset;
852 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosit ion;
853 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
854 if(U_FAILURE(*status)) {
855 if(result->freeOnClose == TRUE) {
856 uprv_free(result);
857 result = NULL;
858 }
859 return result;
860 }
861
862 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
863 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image-> contractionCEs);
864 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->c ontractionIndex);
865 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expan sion);
866 result->rules = NULL;
867 result->rulesLength = 0;
868 result->freeRulesOnClose = FALSE;
869 result->reorderCodes = NULL;
870 result->reorderCodesLength = 0;
871 result->leadBytePermutationTable = NULL;
872
873 /* get the version info from UCATableHeader and populate the Collator struct */
874 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
875 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v ersion*/
876 result->dataVersion[2] = 0;
877 result->dataVersion[3] = 0;
878
879 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
880 result->minUnsafeCP = 0;
881 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
882 if (ucol_unsafeCP(c, result)) break;
883 }
884 result->minUnsafeCP = c;
885
886 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
887 result->minContrEndCP = 0;
888 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
889 if (ucol_contractionEndCP(c, result)) break;
890 }
891 result->minContrEndCP = c;
892
893 /* max expansion tables */
894 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
895 result->image->endExpansionCE);
896 result->lastEndExpansionCE = result->endExpansionCE +
897 result->image->endExpansionCECount - 1;
898 result->expansionCESize = (uint8_t*)result->image +
899 result->image->expansionCESize;
900
901
902 //result->errorCode = *status;
903
904 result->latinOneCEs = NULL;
905
906 result->latinOneRegenTable = FALSE;
907 result->latinOneFailed = FALSE;
908 result->UCA = UCA;
909
910 /* Normally these will be set correctly later. This is the default if you us e UCA or the default. */
911 result->ucaRules = NULL;
912 result->actualLocale = NULL;
913 result->validLocale = NULL;
914 result->requestedLocale = NULL;
915 result->hasRealData = FALSE; // real data lives in .dat file...
916 result->freeImageOnClose = FALSE;
917
918 /* set attributes */
919 ucol_setOptionsFromHeader(
920 result,
921 (UColOptionSet*)((uint8_t*)result->image+result->image->options),
922 status);
923 result->freeOptionsOnClose = FALSE;
924
925 return result;
926 }
927
928 /* new Mark's code */
929
930 /**
931 * For generation of Implicit CEs
932 * @author Davis
933 *
934 * Cleaned up so that changes can be made more easily.
935 * Old values:
936 # First Implicit: E26A792D
937 # Last Implicit: E3DC70C0
938 # First CJK: E0030300
939 # Last CJK: E0A9DD00
940 # First CJK_A: E0A9DF00
941 # Last CJK_A: E0DE3100
942 */
943 /* Following is a port of Mark's code for new treatment of implicits.
944 * It is positioned here, since ucol_initUCA need to initialize the
945 * variables below according to the data in the fractional UCA.
946 */
947
948 /**
949 * Function used to:
950 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
951 * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
952 * The relevant blocks are:
953 * A: 4E00..9FFF; CJK Unified Ideographs
954 * F900..FAFF; CJK Compatibility Ideographs
955 * B: 3400..4DBF; CJK Unified Ideographs Extension A
956 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
957 * As long as
958 * no new B characters are allocated between 4E00 and FAFF, and
959 * no new A characters are outside of this range,
960 * (very high probability) this simple code will work.
961 * The reordered blocks are:
962 * Block1 is CJK
963 * Block2 is CJK_COMPAT_USED
964 * Block3 is CJK_A
965 * (all contiguous)
966 * Any other CJK gets its normal code point
967 * Any non-CJK gets +0x110000 (NON_CJK_OFFSET)
968 * When we reorder Block1, we make sure that it is at the very start,
969 * so that it will use a 3-byte form.
970 * Warning: we only pick up the compatibility characters that are
971 * NOT decomposed, so that block is smaller!
972 */
973
974 // CONSTANTS
975 static const UChar32
976 NON_CJK_OFFSET = 0x110000,
977 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
978
/**
 * Parameters of the implicit-weight byte layout.
 * All of them start at zero and are filled in by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;  // step between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;  // step between used final bytes, 4-byte form
static int32_t final3Count = 0;       // number of final-byte values, 3-byte form
static int32_t final4Count = 0;       // number of final-byte values, 4-byte form
static int32_t medialCount = 0;       // number of values available to a middle byte
static int32_t min3Primary = 0;       // first lead byte of the 3-byte form
static int32_t min4Primary = 0;       // first lead byte of the 4-byte form
static int32_t max4Primary = 0;       // last lead byte of the 4-byte form
static int32_t minTrail = 0;          // smallest trail byte
static int32_t maxTrail = 0;          // largest trail byte
static int32_t max3Trail = 0;         // largest final byte actually used, 3-byte form
static int32_t max4Trail = 0;         // largest final byte actually used, 4-byte form
static int32_t min4Boundary = 0;      // first raw value that needs the 4-byte form
996
997 static const UChar32
998 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
999 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1000 CJK_BASE = 0x4E00,
1001 CJK_LIMIT = 0x9FCB+1,
1002 // Unified CJK ideographs in the compatibility ideographs block.
1003 CJK_COMPAT_USED_BASE = 0xFA0E,
1004 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1005 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1006 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1007 CJK_A_BASE = 0x3400,
1008 CJK_A_LIMIT = 0x4DB5+1,
1009 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1010 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1011 CJK_B_BASE = 0x20000,
1012 CJK_B_LIMIT = 0x2A6D6+1,
1013 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1014 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1015 CJK_C_BASE = 0x2A700,
1016 CJK_C_LIMIT = 0x2B734+1,
1017 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1018 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1019 CJK_D_BASE = 0x2B740,
1020 CJK_D_LIMIT = 0x2B81D+1;
1021 // when adding to this list, look for all occurrences (in project)
1022 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing !!!!
1023
1024 static UChar32 swapCJK(UChar32 i) {
1025 if (i < CJK_A_BASE) {
1026 // non-CJK
1027 } else if (i < CJK_A_LIMIT) {
1028 // Extension A has lower code points than the original Unihan+compat
1029 // but sorts higher.
1030 return i - CJK_A_BASE
1031 + (CJK_LIMIT - CJK_BASE)
1032 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1033 } else if (i < CJK_BASE) {
1034 // non-CJK
1035 } else if (i < CJK_LIMIT) {
1036 return i - CJK_BASE;
1037 } else if (i < CJK_COMPAT_USED_BASE) {
1038 // non-CJK
1039 } else if (i < CJK_COMPAT_USED_LIMIT) {
1040 return i - CJK_COMPAT_USED_BASE
1041 + (CJK_LIMIT - CJK_BASE);
1042 } else if (i < CJK_B_BASE) {
1043 // non-CJK
1044 } else if (i < CJK_B_LIMIT) {
1045 return i; // non-BMP-CJK
1046 } else if (i < CJK_C_BASE) {
1047 // non-CJK
1048 } else if (i < CJK_C_LIMIT) {
1049 return i; // non-BMP-CJK
1050 } else if (i < CJK_D_BASE) {
1051 // non-CJK
1052 } else if (i < CJK_D_LIMIT) {
1053 return i; // non-BMP-CJK
1054 }
1055 return i + NON_CJK_OFFSET; // non-CJK
1056 }
1057
1058 U_CAPI UChar32 U_EXPORT2
1059 uprv_uca_getRawFromCodePoint(UChar32 i) {
1060 return swapCJK(i)+1;
1061 }
1062
1063 U_CAPI UChar32 U_EXPORT2
1064 uprv_uca_getCodePointFromRaw(UChar32 i) {
1065 i--;
1066 UChar32 result = 0;
1067 if(i >= NON_CJK_OFFSET) {
1068 result = i - NON_CJK_OFFSET;
1069 } else if(i >= CJK_B_BASE) {
1070 result = i;
1071 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1072 if(i < CJK_LIMIT - CJK_BASE) {
1073 result = i + CJK_BASE;
1074 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP AT_USED_BASE)) {
1075 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1076 } else {
1077 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_ LIMIT - CJK_COMPAT_USED_BASE);
1078 }
1079 } else {
1080 result = -1;
1081 }
1082 return result;
1083 }
1084
1085 // GET IMPLICIT PRIMARY WEIGHTS
1086 // Return value is left justified primary key
1087 U_CAPI uint32_t U_EXPORT2
1088 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1089 /*
1090 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1091 throw new IllegalArgumentException("Code point out of range " + Utility. hex(cp));
1092 }
1093 */
1094 int32_t last0 = cp - min4Boundary;
1095 if (last0 < 0) {
1096 int32_t last1 = cp / final3Count;
1097 last0 = cp % final3Count;
1098
1099 int32_t last2 = last1 / medialCount;
1100 last1 %= medialCount;
1101
1102 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1103 last1 = minTrail + last1; // offset
1104 last2 = min3Primary + last2; // offset
1105 /*
1106 if (last2 >= min4Primary) {
1107 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last2));
1108 }
1109 */
1110 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1111 } else {
1112 int32_t last1 = last0 / final4Count;
1113 last0 %= final4Count;
1114
1115 int32_t last2 = last1 / medialCount;
1116 last1 %= medialCount;
1117
1118 int32_t last3 = last2 / medialCount;
1119 last2 %= medialCount;
1120
1121 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1122 last1 = minTrail + last1; // offset
1123 last2 = minTrail + last2; // offset
1124 last3 = min4Primary + last3; // offset
1125 /*
1126 if (last3 > max4Primary) {
1127 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last3));
1128 }
1129 */
1130 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1131 }
1132 }
1133
1134 static uint32_t U_EXPORT2
1135 uprv_uca_getImplicitPrimary(UChar32 cp) {
1136 //fprintf(stdout, "Incoming: %04x\n", cp);
1137 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1138
1139 cp = swapCJK(cp);
1140 cp++;
1141 // we now have a range of numbers from 0 to 21FFFF.
1142
1143 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1144 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1145
1146 return uprv_uca_getImplicitFromRaw(cp);
1147 }
1148
1149 /**
1150 * Converts implicit CE into raw integer ("code point")
1151 * @param implicit
1152 * @return -1 if illegal format
1153 */
1154 U_CAPI UChar32 U_EXPORT2
1155 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1156 UChar32 result;
1157 UChar32 b3 = implicit & 0xFF;
1158 UChar32 b2 = (implicit >> 8) & 0xFF;
1159 UChar32 b1 = (implicit >> 16) & 0xFF;
1160 UChar32 b0 = (implicit >> 24) & 0xFF;
1161
1162 // simple parameter checks
1163 if (b0 < min3Primary || b0 > max4Primary
1164 || b1 < minTrail || b1 > maxTrail)
1165 return -1;
1166 // normal offsets
1167 b1 -= minTrail;
1168
1169 // take care of the final values, and compose
1170 if (b0 < min4Primary) {
1171 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1172 return -1;
1173 b2 -= minTrail;
1174 UChar32 remainder = b2 % final3Multiplier;
1175 if (remainder != 0)
1176 return -1;
1177 b0 -= min3Primary;
1178 b2 /= final3Multiplier;
1179 result = ((b0 * medialCount) + b1) * final3Count + b2;
1180 } else {
1181 if (b2 < minTrail || b2 > maxTrail
1182 || b3 < minTrail || b3 > max4Trail)
1183 return -1;
1184 b2 -= minTrail;
1185 b3 -= minTrail;
1186 UChar32 remainder = b3 % final4Multiplier;
1187 if (remainder != 0)
1188 return -1;
1189 b3 /= final4Multiplier;
1190 b0 -= min4Primary;
1191 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1192 }
1193 // final check
1194 if (result < 0 || result > UCOL_MAX_INPUT)
1195 return -1;
1196 return result;
1197 }
1198
1199
/**
 * Integer division rounding towards positive infinity.
 * @param a dividend
 * @param b divisor; must be positive
 * @return the smallest integer not less than a/b
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    if (a > 0) {
        // Written this way (rather than (a + b - 1) / b) so the sum cannot overflow.
        return 1 + (a-1)/b;
    }
    // For a <= 0 truncating division already rounds up (towards zero);
    // the formula above would wrongly yield 1 for a == 0.
    return a/b;
}
1203
1204 /* this function is either called from initUCA or from genUCA before
1205 * doing canonical closure for the UCA.
1206 */
1207
1208 /**
1209 * Set up to generate implicits.
1210 * Maintenance Note: this function may end up being called more than once, due
1211 * to threading races during initialization. Make sure that
1212 * none of the Constants is ever transiently assigned an
1213 * incorrect value.
1214 * @param minPrimary
1215 * @param maxPrimary
1216 * @param minTrail final byte
1217 * @param maxTrail final byte
1218 * @param gap3 the gap we leave for tailoring for 3-byte forms
1219 * @param gap4 the gap we leave for tailoring for 4-byte forms
1220 */
1221 static void initImplicitConstants(int minPrimary, int maxPrimary,
1222 int minTrailIn, int maxTrailIn,
1223 int gap3, int primaries3count,
1224 UErrorCode *status) {
1225 // some simple parameter checks
1226 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1227 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1228 || (primaries3count < 1))
1229 {
1230 *status = U_ILLEGAL_ARGUMENT_ERROR;
1231 return;
1232 };
1233
1234 minTrail = minTrailIn;
1235 maxTrail = maxTrailIn;
1236
1237 min3Primary = minPrimary;
1238 max4Primary = maxPrimary;
1239 // compute constants for use later.
1240 // number of values we can use in trailing bytes
1241 // leave room for empty values between AND above, e.g. if gap = 2
1242 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1243 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1244 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1245 final3Multiplier = gap3 + 1;
1246 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1247 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1248
1249 // medials can use full range
1250 medialCount = (maxTrail - minTrail + 1);
1251 // find out how many values fit in each form
1252 int32_t threeByteCount = medialCount * final3Count;
1253 // now determine where the 3/4 boundary is.
1254 // we use 3 bytes below the boundary, and 4 above
1255 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1256 int32_t primaries4count = primariesAvailable - primaries3count;
1257
1258
1259 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1260 min4Primary = minPrimary + primaries3count;
1261 min4Boundary = min3ByteCoverage;
1262 // Now expand out the multiplier for the 4 bytes, and redo.
1263
1264 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1265 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count );
1266 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo unt * medialCount);
1267 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1268 if (gap4 < 1) {
1269 *status = U_ILLEGAL_ARGUMENT_ERROR;
1270 return;
1271 }
1272 final4Multiplier = gap4 + 1;
1273 final4Count = neededPerFinalByte;
1274 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1275 }
1276
1277 /**
1278 * Supply parameters for generating implicit CEs
1279 */
1280 U_CAPI void U_EXPORT2
1281 uprv_uca_initImplicitConstants(UErrorCode *status) {
1282 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms .
1283 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1284 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1285 }
1286
1287
1288 /* collIterNormalize Incremental Normalization happens here. */
1289 /* pick up the range of chars identifed by FCD, */
1290 /* normalize it into the collIterate's writable buffer, */
1291 /* switch the collIterate's state to use the writable b uffer. */
1292 /* */
1293 static
1294 void collIterNormalize(collIterate *collationSource)
1295 {
1296 UErrorCode status = U_ZERO_ERROR;
1297 const UChar *srcP = collationSource->pos - 1; /* Start of chars to nor malize */
1298 const UChar *endP = collationSource->fcdPosition; /* End of region to norma lize+1 */
1299
1300 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1301 collationSource->writableBuffer,
1302 status);
1303 if (U_FAILURE(status)) {
1304 #ifdef UCOL_DEBUG
1305 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro rName(status));
1306 #endif
1307 return;
1308 }
1309
1310 collationSource->pos = collationSource->writableBuffer.getTerminatedB uffer();
1311 collationSource->origFlags = collationSource->flags;
1312 collationSource->flags |= UCOL_ITER_INNORMBUF;
1313 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE _ITERATOR);
1314 }
1315
1316
1317 // This function takes the iterator and extracts normalized stuff up to the next boundary
1318 // It is similar in the end results to the collIterNormalize, but for the cases when we
1319 // use an iterator
1320 /*static
1321 inline void normalizeIterator(collIterate *collationSource) {
1322 UErrorCode status = U_ZERO_ERROR;
1323 UBool wasNormalized = FALSE;
1324 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite rator, UITER_CURRENT);
1325 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter ator);
1326 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa bleBuffer,
1327 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);
1328 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->w ritableBufSize) {
1329 // reallocate and terminate
1330 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1331 &collationSource->writableBuffer,
1332 (int32_t *)&collationSource->writableBufSize, nor mLen + 1,
1333 0)
1334 ) {
1335 #ifdef UCOL_DEBUG
1336 fprintf(stderr, "normalizeIterator(), out of memory\n");
1337 #endif
1338 return;
1339 }
1340 status = U_ZERO_ERROR;
1341 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE R_ZERO);
1342 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s tatus);
1343 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf fer,
1344 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);
1345 }
1346 // Terminate the buffer - we already checked that it is big enough
1347 collationSource->writableBuffer[normLen] = 0;
1348 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1349 collationSource->flags |= UCOL_ITER_ALLOCATED;
1350 }
1351 collationSource->pos = collationSource->writableBuffer;
1352 collationSource->origFlags = collationSource->flags;
1353 collationSource->flags |= UCOL_ITER_INNORMBUF;
1354 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_I TERATOR);
1355 }*/
1356
1357
1358 /* Incremental FCD check and normalize */
1359 /* Called from getNextCE when normalization state is suspect. */
1360 /* When entering, the state is known to be this: */
1361 /* o We are working in the main buffer of the collIterate, not the side */
1362 /* writable buffer. When in the side buffer, normalization mode is alw ays off, */
1363 /* so we won't get here. */
1364 /* o The leading combining class from the current character is 0 or */
1365 /* the trailing combining class of the previous char was zero. */
1366 /* True because the previous call to this function will have always exi ted */
1367 /* that way, and we get called for every char where cc might be non-zer o. */
1368 static
1369 inline UBool collIterFCD(collIterate *collationSource) {
1370 const UChar *srcP, *endP;
1371 uint8_t leadingCC;
1372 uint8_t prevTrailingCC = 0;
1373 uint16_t fcd;
1374 UBool needNormalize = FALSE;
1375
1376 srcP = collationSource->pos-1;
1377
1378 if (collationSource->flags & UCOL_ITER_HASLEN) {
1379 endP = collationSource->endp;
1380 } else {
1381 endP = NULL;
1382 }
1383
1384 // Get the trailing combining class of the current character. If it's zero,
1385 // we are OK.
1386 /* trie access */
1387 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1388 if (fcd != 0) {
1389 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1390
1391 if (prevTrailingCC != 0) {
1392 // The current char has a non-zero trailing CC. Scan forward until we find
1393 // a char with a leading cc of zero.
1394 while (endP == NULL || srcP != endP)
1395 {
1396 const UChar *savedSrcP = srcP;
1397
1398 /* trie access */
1399 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1400 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1401 if (leadingCC == 0) {
1402 srcP = savedSrcP; // Hit char that is not part of combi ning sequence.
1403 // back up over it. (Could be surr ogate pair!)
1404 break;
1405 }
1406
1407 if (leadingCC < prevTrailingCC) {
1408 needNormalize = TRUE;
1409 }
1410
1411 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1412 }
1413 }
1414 }
1415
1416 collationSource->fcdPosition = (UChar *)srcP;
1417
1418 return needNormalize;
1419 }
1420
1421 /****************************************************************************/
1422 /* Following are the CE retrieval functions */
1423 /* */
1424 /****************************************************************************/
1425
1426 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1427 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1428
1429 /* there should be a macro version of this function in the header file */
1430 /* This is the first function that tries to fetch a collation element */
1431 /* If it's not succesfull or it encounters a more difficult situation */
1432 /* some more sofisticated and slower functions are invoked */
1433 static
1434 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou rce, UErrorCode *status) {
1435 uint32_t order = 0;
1436 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1437 order = *(collationSource->toReturn++); /* if so , return them */
1438 if(collationSource->CEpos == collationSource->toReturn) {
1439 collationSource->CEpos = collationSource->toReturn = collationSource ->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1440 }
1441 return order;
1442 }
1443
1444 UChar ch = 0;
1445 collationSource->offsetReturn = NULL;
1446
1447 for (;;) /* Loop handles case when incremental nor malize switches */
1448 { /* to or from the side buffer / origina l string, and we */
1449 /* need to start again to get the next character. */
1450
1451 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1452 {
1453 // The source string is null terminated and we're not working from t he side buffer,
1454 // and we're not normalizing. This is the fast path.
1455 // (We can be in the side buffer for Thai pre-vowel reordering eve n when not normalizing.)
1456 ch = *collationSource->pos++;
1457 if (ch != 0) {
1458 break;
1459 }
1460 else {
1461 return UCOL_NO_MORE_CES;
1462 }
1463 }
1464
1465 if (collationSource->flags & UCOL_ITER_HASLEN) {
1466 // Normal path for strings when length is specified.
1467 // (We can't be in side buffer because it is always null terminate d.)
1468 if (collationSource->pos >= collationSource->endp) {
1469 // Ran off of the end of the main source string. We're done.
1470 return UCOL_NO_MORE_CES;
1471 }
1472 ch = *collationSource->pos++;
1473 }
1474 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1475 UChar32 iterCh = collationSource->iterator->next(collationSource->it erator);
1476 if(iterCh == U_SENTINEL) {
1477 return UCOL_NO_MORE_CES;
1478 }
1479 ch = (UChar)iterCh;
1480 }
1481 else
1482 {
1483 // Null terminated string.
1484 ch = *collationSource->pos++;
1485 if (ch == 0) {
1486 // Ran off end of buffer.
1487 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1488 // Ran off end of main string. backing up one character.
1489 collationSource->pos--;
1490 return UCOL_NO_MORE_CES;
1491 }
1492 else
1493 {
1494 // Hit null in the normalize side buffer.
1495 // Usually this means the end of the normalized data,
1496 // except for one odd case: a null followed by combining cha rs,
1497 // which is the case if we are at the start of the buffer.
1498 if (collationSource->pos == collationSource->writableBuffer. getBuffer()+1) {
1499 break;
1500 }
1501
1502 // Null marked end of side buffer.
1503 // Revert to the main string and
1504 // loop back to top to try again to get a character.
1505 collationSource->pos = collationSource->fcdPosition;
1506 collationSource->flags = collationSource->origFlags;
1507 continue;
1508 }
1509 }
1510 }
1511
1512 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1513 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1514 * based on whether the previous codepoint was Hiragana or Katakana.
1515 */
1516 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1517 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x30 99 && ch <= 0x309C))) {
1518 collationSource->flags |= UCOL_WAS_HIRAGANA;
1519 } else {
1520 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1521 }
1522 }
1523
1524 // We've got a character. See if there's any fcd and/or normalization s tuff to do.
1525 // Note that UCOL_ITER_NORM flag is always zero when we are in the si de buffer.
1526 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1527 break;
1528 }
1529
1530 if (collationSource->fcdPosition >= collationSource->pos) {
1531 // An earlier FCD check has already covered the current character.
1532 // We can go ahead and process this char.
1533 break;
1534 }
1535
1536 if (ch < ZERO_CC_LIMIT_ ) {
1537 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1538 break;
1539 }
1540
1541 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1542 // We need to peek at the next character in order to tell if we are FCD
1543 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource-> pos >= collationSource->endp) {
1544 // We are at the last char of source string.
1545 // It is always OK for FCD check.
1546 break;
1547 }
1548
1549 // Not at last char of source string (or we'll check against termina ting null). Do the FCD fast test
1550 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1551 break;
1552 }
1553 }
1554
1555
1556 // Need a more complete FCD check and possible normalization.
1557 if (collIterFCD(collationSource)) {
1558 collIterNormalize(collationSource);
1559 }
1560 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1561 // No normalization was needed. Go ahead and process the char we a lready had.
1562 break;
1563 }
1564
1565 // Some normalization happened. Next loop iteration will pick up a char
1566 // from the normalization buffer.
1567
1568 } // end for (;;)
1569
1570
1571 if (ch <= 0xFF) {
1572 /* For latin-1 characters we never need to fall back to the UCA table */
1573 /* because all of the UCA data is replicated in the latinOneMapping a rray */
1574 order = coll->latinOneMapping[ch];
1575 if (order > UCOL_NOT_FOUND) {
1576 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, stat us);
1577 }
1578 }
1579 else
1580 {
1581 // Always use UCA for Han, Hangul
1582 // (Han extension A is before main Han block)
1583 // **** Han compatibility chars ?? ****
1584 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1585 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1586 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1587 // between the two target ranges; do normal lookup
1588 // **** this range is YI, Modifier tone letters, ****
1589 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1590 // **** Latin-D might be tailored, so we need to ****
1591 // **** do the normal lookup for these guys. ****
1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1593 } else {
1594 // in one of the target ranges; use UCA
1595 order = UCOL_NOT_FOUND;
1596 }
1597 } else {
1598 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1599 }
1600
1601 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1602 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, stat us); /* and try to get the special CE */
1603 }
1604
1605 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1606 /* if we got here, the codepoint MUST be over 0xFF - so we look dire ctly in the trie */
1607 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1608
1609 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1610 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSou rce, status);
1611 }
1612 }
1613 }
1614 if(order == UCOL_NOT_FOUND) {
1615 order = getImplicit(ch, collationSource);
1616 }
1617 return order; /* return the CE */
1618 }
1619
1620 /* ucol_getNextCE, out-of-line version for use from other files. */
1621 U_CAPI uint32_t U_EXPORT2
1622 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode * status) {
1623 return ucol_IGetNextCE(coll, collationSource, status);
1624 }
1625
1626
1627 /**
1628 * Incremental previous normalization happens here. Pick up the range of chars
1629 * identifed by FCD, normalize it into the collIterate's writable buffer,
1630 * switch the collIterate's state to use the writable buffer.
1631 * @param data collation iterator data
1632 */
1633 static
1634 void collPrevIterNormalize(collIterate *data)
1635 {
1636 UErrorCode status = U_ZERO_ERROR;
1637 const UChar *pEnd = data->pos; /* End normalize + 1 */
1638 const UChar *pStart;
1639
1640 /* Start normalize */
1641 if (data->fcdPosition == NULL) {
1642 pStart = data->string;
1643 }
1644 else {
1645 pStart = data->fcdPosition + 1;
1646 }
1647
1648 int32_t normLen =
1649 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta rt) + 1)),
1650 data->writableBuffer,
1651 status).
1652 length();
1653 if(U_FAILURE(status)) {
1654 return;
1655 }
1656 /*
1657 this puts the null termination infront of the normalized string instead
1658 of the end
1659 */
1660 data->writableBuffer.insert(0, (UChar)0);
1661
1662 /*
1663 * The usual case at this point is that we've got a base
1664 * character followed by marks that were normalized. If
1665 * fcdPosition is NULL, that means that we backed up to
1666 * the beginning of the string and there's no base character.
1667 *
1668 * Forward processing will usually normalize when it sees
1669 * the first mark, so that mark will get it's natural offset
1670 * and the rest will get the offset of the character following
1671 * the marks. The base character will also get its natural offset.
1672 *
1673 * We write the offset of the base character, if there is one,
1674 * followed by the offset of the first mark and then the offsets
1675 * of the rest of the marks.
1676 */
1677 int32_t firstMarkOffset = 0;
1678 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1679 int32_t trailCount = normLen - 1;
1680
1681 if (data->fcdPosition != NULL) {
1682 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1683 UChar baseChar = *data->fcdPosition;
1684
1685 firstMarkOffset = baseOffset + 1;
1686
1687 /*
1688 * If the base character is the start of a contraction, forward processi ng
1689 * will normalize the marks while checking for the contraction, which me ans
1690 * that the offset of the first mark will the same as the other marks.
1691 *
1692 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1693 */
1694 if (baseChar >= 0x100) {
1695 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas eChar);
1696
1697 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1698 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas eChar);
1699 }
1700
1701 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION _TAG) {
1702 firstMarkOffset = trailOffset;
1703 }
1704 }
1705
1706 data->appendOffset(baseOffset, status);
1707 }
1708
1709 data->appendOffset(firstMarkOffset, status);
1710
1711 for (int32_t i = 0; i < trailCount; i += 1) {
1712 data->appendOffset(trailOffset, status);
1713 }
1714
1715 data->offsetRepeatValue = trailOffset;
1716
1717 data->offsetReturn = data->offsetStore - 1;
1718 if (data->offsetReturn == data->offsetBuffer) {
1719 data->offsetStore = data->offsetBuffer;
1720 }
1721
1722 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1723 data->origFlags = data->flags;
1724 data->flags |= UCOL_ITER_INNORMBUF;
1725 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1726 }
1727
1728
1729 /**
1730 * Incremental FCD check for previous iteration and normalize. Called from
1731 * getPrevCE when normalization state is suspect.
1732 * When entering, the state is known to be this:
1733 * o We are working in the main buffer of the collIterate, not the side
1734 * writable buffer. When in the side buffer, normalization mode is always
1735 * off, so we won't get here.
1736 * o The leading combining class from the current character is 0 or the
1737 * trailing combining class of the previous char was zero.
1738 * True because the previous call to this function will have always exited
1739 * that way, and we get called for every char where cc might be non-zero.
1740 * @param data collation iterate struct
1741 * @return normalization status, TRUE for normalization to be done, FALSE
1742 * otherwise
1743 */
1744 static
1745 inline UBool collPrevIterFCD(collIterate *data)
1746 {
1747 const UChar *src, *start;
1748 uint8_t leadingCC;
1749 uint8_t trailingCC = 0;
1750 uint16_t fcd;
1751 UBool result = FALSE;
1752
1753 start = data->string;
1754 src = data->pos + 1;
1755
1756 /* Get the trailing combining class of the current character. */
1757 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1758
1759 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1760
1761 if (leadingCC != 0) {
1762 /*
1763 The current char has a non-zero leading combining class.
1764 Scan backward until we find a char with a trailing cc of zero.
1765 */
1766 for (;;)
1767 {
1768 if (start == src) {
1769 data->fcdPosition = NULL;
1770 return result;
1771 }
1772
1773 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1774
1775 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1776
1777 if (trailingCC == 0) {
1778 break;
1779 }
1780
1781 if (leadingCC < trailingCC) {
1782 result = TRUE;
1783 }
1784
1785 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1786 }
1787 }
1788
1789 data->fcdPosition = (UChar *)src;
1790
1791 return result;
1792 }
1793
1794 /** gets a code unit from the string at a given offset
1795 * Handles both normal and iterative cases.
1796 * No error checking - caller beware!
1797 */
1798 static inline
1799 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1800 if(source->pos != NULL) {
1801 return *(source->pos + offset);
1802 } else if(source->iterator != NULL) {
1803 UChar32 c;
1804 if(offset != 0) {
1805 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1806 c = source->iterator->next(source->iterator);
1807 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1808 } else {
1809 c = source->iterator->current(source->iterator);
1810 }
1811 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
1812 } else {
1813 return 0xfffd;
1814 }
1815 }
1816
1817 // Code point version. Treats the offset as a _code point_ delta.
1818 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for med UTF-16.
1819 // We cannot use U16_FWD_1 and similar because we do not know the start and limi t of the buffer.
1820 static inline
1821 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1822 UChar32 c;
1823 if(source->pos != NULL) {
1824 const UChar *p = source->pos;
1825 if(offset >= 0) {
1826 // Skip forward over (offset-1) code points.
1827 while(--offset >= 0) {
1828 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1829 ++p;
1830 }
1831 }
1832 // Read the code point there.
1833 c = *p++;
1834 UChar trail;
1835 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1836 c = U16_GET_SUPPLEMENTARY(c, trail);
1837 }
1838 } else /* offset<0 */ {
1839 // Skip backward over (offset-1) code points.
1840 while(++offset < 0) {
1841 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1842 --p;
1843 }
1844 }
1845 // Read the code point before that.
1846 c = *--p;
1847 UChar lead;
1848 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1849 c = U16_GET_SUPPLEMENTARY(lead, c);
1850 }
1851 }
1852 } else if(source->iterator != NULL) {
1853 if(offset >= 0) {
1854 // Skip forward over (offset-1) code points.
1855 int32_t fwd = offset;
1856 while(fwd-- > 0) {
1857 uiter_next32(source->iterator);
1858 }
1859 // Read the code point there.
1860 c = uiter_current32(source->iterator);
1861 // Return to the starting point, skipping backward over (offset-1) c ode points.
1862 while(offset-- > 0) {
1863 uiter_previous32(source->iterator);
1864 }
1865 } else /* offset<0 */ {
1866 // Read backward, reading offset code points, remember only the last -read one.
1867 int32_t back = offset;
1868 do {
1869 c = uiter_previous32(source->iterator);
1870 } while(++back < 0);
1871 // Return to the starting position, skipping forward over offset cod e points.
1872 do {
1873 uiter_next32(source->iterator);
1874 } while(++offset < 0);
1875 }
1876 } else {
1877 c = U_SENTINEL;
1878 }
1879 return c;
1880 }
1881
1882 /**
1883 * Determines if we are at the start of the data string in the backwards
1884 * collation iterator
1885 * @param data collation iterator
1886 * @return TRUE if we are at the start
1887 */
1888 static
1889 inline UBool isAtStartPrevIterate(collIterate *data) {
1890 if(data->pos == NULL && data->iterator != NULL) {
1891 return !data->iterator->hasPrevious(data->iterator);
1892 }
1893 //return (collIter_bos(data)) ||
1894 return (data->pos == data->string) ||
1895 ((data->flags & UCOL_ITER_INNORMBUF) &&
1896 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1897 }
1898
1899 static
1900 inline void goBackOne(collIterate *data) {
1901 # if 0
1902 // somehow, it looks like we need to keep iterator synced up
1903 // at all times, as above.
1904 if(data->pos) {
1905 data->pos--;
1906 }
1907 if(data->iterator) {
1908 data->iterator->previous(data->iterator);
1909 }
1910 #endif
1911 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1912 data->iterator->previous(data->iterator);
1913 }
1914 if(data->pos) {
1915 data->pos --;
1916 }
1917 }
1918
1919 /**
1920 * Inline function that gets a simple CE.
1921 * So what it does is that it will first check the expansion buffer. If the
1922 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1923 * is different from the string pointer, we return the collation element at the
1924 * return pointer and decrement it.
1925 * For more complicated CEs it resorts to getComplicatedCE.
 * (In the code below the special cases are handed to ucol_prv_getSpecialPrevCE.)
 *
 * Summary of the visible flow:
 *  1. The offset bookkeeping (offsetReturn / offsetRepeatCount) is stepped back.
 *  2. If toReturn is above the start of the active CE buffer (extendCEs when it
 *     is set, otherwise CEs), the CE just below toReturn is returned.
 *  3. Otherwise the previous code unit is read from the string, from the
 *     UCharIterator or from the normalization side buffer, the backward FCD
 *     check / normalization is run when needed, and the code unit is looked up
 *     in the collator's tables (tailoring first, then coll->UCA, then implicit).
1926 * @param coll collator data
1927 * @param data collation iterator struct
1928 * @param status error status
 * @return the previous collation element, or UCOL_NO_MORE_CES when the start of
 *         the text has been reached
1929 */
1930 static
1931 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1932 UErrorCode *status)
1933 {
1934 uint32_t result = (uint32_t)UCOL_NULLORDER;
1935
// Step the offset bookkeeping back: consume one pending repeat if there is one,
// otherwise move offsetReturn down, resetting it once the buffer start is reached.
1936 if (data->offsetReturn != NULL) {
1937 if (data->offsetRepeatCount > 0) {
1938 data->offsetRepeatCount -= 1;
1939 } else {
1940 if (data->offsetReturn == data->offsetBuffer) {
1941 data->offsetReturn = NULL;
1942 data->offsetStore = data->offsetBuffer;
1943 } else {
1944 data->offsetReturn -= 1;
1945 }
1946 }
1947 }
1948
// CEs are still buffered above the start of the active buffer: pop the top one.
1949 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1950 (!data->extendCEs && data->toReturn > data->CEs))
1951 {
1952 data->toReturn -= 1;
1953 result = *(data->toReturn);
// Buffer drained: bring the write position (CEpos) back to the start as well.
1954 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1955 data->CEpos = data->toReturn;
1956 }
1957 }
1958 else {
1959 UChar ch = 0;
1960
1961 /*
1962 Loop handles case when incremental normalize switches to or from the
1963 side buffer / original string, and we need to start again to get the
1964 next character.
1965 */
1966 for (;;) {
1967 if (data->flags & UCOL_ITER_HASLEN) {
1968 /*
1969 Normal path for strings when length is specified.
1970 Not in side buffer because it is always null terminated.
1971 */
1972 if (data->pos <= data->string) {
1973 /* End of the main source string */
1974 return UCOL_NO_MORE_CES;
1975 }
1976 data->pos --;
1977 ch = *data->pos;
1978 }
1979 // we are using an iterator to go back. Pray for us!
1980 else if (data->flags & UCOL_USE_ITERATOR) {
1981 UChar32 iterCh = data->iterator->previous(data->iterator);
1982 if(iterCh == U_SENTINEL) {
1983 return UCOL_NO_MORE_CES;
1984 } else {
1985 ch = (UChar)iterCh;
1986 }
1987 }
1988 else {
// Neither HASLEN nor USE_ITERATOR: pos is stepped back without a bounds test;
// a 0 code unit is what signals the start of the side buffer here.
1989 data->pos --;
1990 ch = *data->pos;
1991 /* we are in the side buffer. */
1992 if (ch == 0) {
1993 /*
1994 At the start of the normalize side buffer.
1995 Go back to string.
1996 Because pointer points to the last accessed character,
1997 hence we have to increment it by one here.
1998 */
1999 data->flags = data->origFlags;
2000 data->offsetRepeatValue = 0;
2001
// fcdPosition == NULL means nothing of the original string is left before the buffer.
2002 if (data->fcdPosition == NULL) {
2003 data->pos = data->string;
2004 return UCOL_NO_MORE_CES;
2005 }
2006 else {
2007 data->pos = data->fcdPosition + 1;
2008 }
2009
2010 continue;
2011 }
2012 }
2013
// Track whether the code unit just read lies in U+3040..U+309F.
2014 if(data->flags&UCOL_HIRAGANA_Q) {
2015 if(ch>=0x3040 && ch<=0x309f) {
2016 data->flags |= UCOL_WAS_HIRAGANA;
2017 } else {
2018 data->flags &= ~UCOL_WAS_HIRAGANA;
2019 }
2020 }
2021
2022 /*
2023 * got a character to determine if there's fcd and/or normalization
2024 * stuff to do.
2025 * if the current character is not fcd.
2026 * if current character is at the start of the string
2027 * Trailing combining class == 0.
2028 * Note if pos is in the writablebuffer, norm is always 0
2029 */
2030 if (ch < ZERO_CC_LIMIT_ ||
2031 // this should propel us out of the loop in the iterator case
2032 (data->flags & UCOL_ITER_NORM) == 0 ||
2033 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2034 || data->string == data->pos) {
2035 break;
2036 }
2037
2038 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2039 /* if next character is FCD */
2040 if (data->pos == data->string) {
2041 /* First char of string is always OK for FCD check */
2042 break;
2043 }
2044
2045 /* Not first char of string, do the FCD fast test */
2046 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2047 break;
2048 }
2049 }
2050
2051 /* Need a more complete FCD check and possible normalization. */
2052 if (collPrevIterFCD(data)) {
2053 collPrevIterNormalize(data);
2054 }
2055
2056 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2057 /* No normalization. Go ahead and process the char. */
2058 break;
2059 }
2060
2061 /*
2062 Some normalization happened.
2063 Next loop picks up a char from the normalization buffer.
2064 */
2065 }
2066
2067 /* attempt to handle contractions, after removal of the backwards
2068 contraction
2069 */
2070 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2071 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2072 } else {
// Code units up to 0xFF are looked up in the latinOneMapping array; everything else goes through the trie.
2073 if (ch <= 0xFF) {
2074 result = coll->latinOneMapping[ch];
2075 }
2076 else {
2077 // Always use UCA for [3400..9FFF], [AC00..D7AF]
2078 // **** [FA0E..FA2F] ?? ****
2079 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2080 (ch >= 0x3400 && ch <= 0xD7AF)) {
2081 if (ch > 0x9FFF && ch < 0xAC00) {
2082 // between the two target ranges; do normal lookup
2083 // **** this range is YI, Modifier tone letters, ****
2084 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
2085 // **** Latin-D might be tailored, so we need to ****
2086 // **** do the normal lookup for these guys. ****
2087 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2088 } else {
2089 result = UCOL_NOT_FOUND;
2090 }
2091 } else {
2092 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2093 }
2094 }
// Values above UCOL_NOT_FOUND are special CEs and need further processing.
2095 if (result > UCOL_NOT_FOUND) {
2096 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, statu s);
2097 }
2098 if (result == UCOL_NOT_FOUND) { // Not found in master list
// NOTE(review): this test uses data->coll while the rest of the function uses
// coll - confirm that both always refer to the same collator.
2099 if (!isAtStartPrevIterate(data) &&
2100 ucol_contractionEndCP(ch, data->coll))
2101 {
2102 result = UCOL_CONTRACTION;
2103 } else {
2104 if(coll->UCA) {
2105 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2106 }
2107 }
2108
2109 if (result > UCOL_NOT_FOUND) {
2110 if(coll->UCA) {
2111 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result , data, status);
2112 }
2113 }
2114 }
2115 }
2116
// Still nothing in any table: fall back to an implicit CE for this code unit.
2117 if(result == UCOL_NOT_FOUND) {
2118 result = getPrevImplicit(ch, data);
2119 }
2120 }
2121
2122 return result;
2123 }
2124
2125
2126 /* ucol_getPrevCE, out-of-line version for use from other files. */
2127 U_CFUNC uint32_t U_EXPORT2
2128 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2129 UErrorCode *status) {
2130 return ucol_IGetPrevCE(coll, data, status);
2131 }
2132
2133
2134 /* this should be connected to special Jamo handling */
2135 U_CFUNC uint32_t U_EXPORT2
2136 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2137 collIterate colIt;
2138 IInit_collIterate(coll, &u, 1, &colIt, status);
2139 if(U_FAILURE(*status)) {
2140 return 0;
2141 }
2142 return ucol_IGetNextCE(coll, &colIt, status);
2143 }
2144
2145 /**
2146 * Inserts the argument character into the end of the buffer pushing back the
2147 * null terminator.
2148 * @param data collIterate struct data
2149 * @param ch character to be appended
2150 * @return the position of the new addition
2151 */
2152 static
2153 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2154 {
2155 int32_t oldLength = data->writableBuffer.length();
2156 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2157 }
2158
2159 /**
2160 * Inserts the argument string into the end of the buffer pushing back the
2161 * null terminator.
2162 * @param data collIterate struct data
2163 * @param string to be appended
2164 * @param length of the string to be appended
2165 * @return the position of the new addition
2166 */
2167 static
2168 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_ t length)
2169 {
2170 int32_t oldLength = data->writableBuffer.length();
2171 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL ength;
2172 }
2173
2174 /**
2175 * Special normalization function for contraction in the forwards iterator.
2176 * This normalization sequence will place the current character at source->pos
2177 * and its following normalized sequence into the buffer.
2178 * The fcd position, pos will be changed.
2179 * pos will now point to positions in the buffer.
2180 * Flags will be changed accordingly.
2181 * @param data collation iterator data
2182 */
2183 static
2184 inline void normalizeNextContraction(collIterate *data)
2185 {
2186 int32_t strsize;
2187 UErrorCode status = U_ZERO_ERROR;
2188 /* because the pointer points to the next character */
2189 const UChar *pStart = data->pos - 1;
2190 const UChar *pEnd;
2191
2192 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2193 data->writableBuffer.setTo(*(pStart - 1));
2194 strsize = 1;
2195 }
2196 else {
2197 strsize = data->writableBuffer.length();
2198 }
2199
2200 pEnd = data->fcdPosition;
2201
2202 data->writableBuffer.append(
2203 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)), status));
2204 if(U_FAILURE(status)) {
2205 return;
2206 }
2207
2208 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2209 data->origFlags = data->flags;
2210 data->flags |= UCOL_ITER_INNORMBUF;
2211 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2212 }
2213
2214 /**
2215 * Contraction character management function that returns the next character
2216 * for the forwards iterator.
2217 * Does nothing if the next character is in buffer and not the first character
2218 * in it.
2219 * Else it checks next character in data string to see if it is normalizable.
2220 * If it is not, the character is simply copied into the buffer, else
2221 * the whole normalized substring is copied into the buffer, including the
2222 * current character.
2223 * @param data collation element iterator data
2224 * @return next character
2225 */
2226 static
2227 inline UChar getNextNormalizedChar(collIterate *data)
2228 {
2229 UChar nextch;
2230 UChar ch;
2231 // Here we need to add the iterator code. One problem is the way
2232 // end of string is handled. If we just return next char, it could
2233 // be the sentinel. Most of the cases already check for this, but we
2234 // need to be sure.
2235 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2236 /* if no normalization and not in buffer. */
2237 if(data->flags & UCOL_USE_ITERATOR) {
2238 return (UChar)data->iterator->next(data->iterator);
2239 } else {
2240 return *(data->pos ++);
2241 }
2242 }
2243
2244 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2245 //normalizeIterator(data);
2246 //}
2247
2248 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2249 if ((innormbuf && *data->pos != 0) ||
2250 (data->fcdPosition != NULL && !innormbuf &&
2251 data->pos < data->fcdPosition)) {
2252 /*
2253 if next character is in normalized buffer, no further normalization
2254 is required
2255 */
2256 return *(data->pos ++);
2257 }
2258
2259 if (data->flags & UCOL_ITER_HASLEN) {
2260 /* in data string */
2261 if (data->pos + 1 == data->endp) {
2262 return *(data->pos ++);
2263 }
2264 }
2265 else {
2266 if (innormbuf) {
2267 // inside the normalization buffer, but at the end
2268 // (since we encountered zero). This means, in the
2269 // case we're using char iterator, that we need to
2270 // do another round of normalization.
2271 //if(data->origFlags & UCOL_USE_ITERATOR) {
2272 // we need to restore original flags,
2273 // otherwise, we'll lose them
2274 //data->flags = data->origFlags;
2275 //normalizeIterator(data);
2276 //return *(data->pos++);
2277 //} else {
2278 /*
2279 in writable buffer, at this point fcdPosition can not be
2280 pointing to the end of the data string. see contracting tag.
2281 */
2282 if(data->fcdPosition) {
2283 if (*(data->fcdPosition + 1) == 0 ||
2284 data->fcdPosition + 1 == data->endp) {
2285 /* at the end of the string, dump it into the normalizer */
2286 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2287 // Check if data->pos received a null pointer
2288 if (data->pos == NULL) {
2289 return (UChar)-1; // Return to indicate error.
2290 }
2291 return *(data->fcdPosition ++);
2292 }
2293 data->pos = data->fcdPosition;
2294 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2295 // if we are here, we're using a normalizing iterator.
2296 // we should just continue further.
2297 data->flags = data->origFlags;
2298 data->pos = NULL;
2299 return (UChar)data->iterator->next(data->iterator);
2300 }
2301 //}
2302 }
2303 else {
2304 if (*(data->pos + 1) == 0) {
2305 return *(data->pos ++);
2306 }
2307 }
2308 }
2309
2310 ch = *data->pos ++;
2311 nextch = *data->pos;
2312
2313 /*
2314 * if the current character is not fcd.
2315 * Trailing combining class == 0.
2316 */
2317 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2318 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2319 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2320 /*
2321 Need a more complete FCD check and possible normalization.
2322 normalize substring will be appended to buffer
2323 */
2324 if (collIterFCD(data)) {
2325 normalizeNextContraction(data);
2326 return *(data->pos ++);
2327 }
2328 else if (innormbuf) {
2329 /* fcdposition shifted even when there's no normalization, if we
2330 don't input the rest into this, we'll get the wrong position when
2331 we reach the end of the writableBuffer */
2332 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2333 data->pos = insertBufferEnd(data, data->pos - 1, length);
2334 // Check if data->pos received a null pointer
2335 if (data->pos == NULL) {
2336 return (UChar)-1; // Return to indicate error.
2337 }
2338 return *(data->pos ++);
2339 }
2340 }
2341
2342 if (innormbuf) {
2343 /*
2344 no normalization is to be done hence only one character will be
2345 appended to the buffer.
2346 */
2347 data->pos = insertBufferEnd(data, ch) + 1;
2348 // Check if data->pos received a null pointer
2349 if (data->pos == NULL) {
2350 return (UChar)-1; // Return to indicate error.
2351 }
2352 }
2353
2354 /* points back to the pos in string */
2355 return ch;
2356 }
2357
2358
2359
2360 /**
2361 * Function to copy the buffer into writableBuffer and sets the fcd position to
2362 * the correct position
2363 * @param source data string source
2364 * @param buffer character buffer
2365 */
2366 static
2367 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b uffer)
2368 {
2369 /* okay confusing part here. to ensure that the skipped characters are
2370 considered later, we need to place it in the appropriate position in the
2371 normalization buffer and reassign the pos pointer. simple case if pos
2372 reside in string, simply copy to normalization buffer and
2373 fcdposition = pos, pos = start of normalization buffer. if pos in
2374 normalization buffer, we'll insert the copy infront of pos and point pos
2375 to the start of the normalization buffer. why am i doing these copies?
2376 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial CE does
2377 not require any changes, which be really painful. */
2378 if (source->flags & UCOL_ITER_INNORMBUF) {
2379 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer() ;
2380 source->writableBuffer.replace(0, replaceLength, buffer);
2381 }
2382 else {
2383 source->fcdPosition = source->pos;
2384 source->origFlags = source->flags;
2385 source->flags |= UCOL_ITER_INNORMBUF;
2386 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_IT ERATOR);
2387 source->writableBuffer = buffer;
2388 }
2389
2390 source->pos = source->writableBuffer.getTerminatedBuffer();
2391 }
2392
2393 /**
2394 * Function to get the discontiguos collation element within the source.
2395 * Note this function will set the position to the appropriate places.
 *
 * Visible behaviour: characters are read with getNextNormalizedChar() and
 * searched in the contraction table entry that starts at constart. Characters
 * that do not take part in the match are collected in a local buffer. When a
 * final CE is found, the collected characters are handed to
 * setDiscontiguosAttribute() so that they are read again afterwards. When no
 * match is found the iterator state saved on entry is restored, the iterator
 * is moved back by one and the CE stored for constart is returned.
2396 * @param coll current collator used
2397 * @param source data string source
2398 * @param constart index to the start character in the contraction table
2399 * @return discontiguos collation element offset
 *         (the value is read from coll->contractionCEs)
2400 */
2401 static
2402 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2403 const UChar *constart)
2404 {
2405 /* source->pos currently points to the second combining character after
2406 the start character */
2407 const UChar *temppos = source->pos;
2408 UnicodeString buffer;
2409 const UChar *tempconstart = constart;
2410 uint8_t tempflags = source->flags;
2411 UBool multicontraction = FALSE;
2412 collIterateState discState;
2413
// Remember the entry state so that it can be restored if nothing matches.
2414 backupState(source, &discState);
2415
// The skipped-character buffer starts with the code point just before the current position.
2416 buffer.setTo(peekCodePoint(source, -1));
2417 for (;;) {
2418 UChar *UCharOffset;
2419 UChar schar,
2420 tchar;
2421 uint32_t result;
2422
// Stop scanning at the end of the text, or when the next code point has combining class 0.
2423 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2424 || (peekCodeUnit(source, 0) == 0 &&
2425 //|| (*source->pos == 0 &&
2426 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2427 source->fcdPosition == NULL ||
2428 source->fcdPosition == source->endp ||
2429 *(source->fcdPosition) == 0 ||
2430 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2431 /* end of string in null terminated string or stopped by a
2432 null character, note fcd does not always point to a base
2433 character after the discontiguos change */
2434 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2435 //u_getCombiningClass(*(source->pos)) == 0) {
2436 //constart = (UChar *)coll->image + getContractOffset(CE);
// An intermediate contraction was matched earlier: go back to the position
// recorded for it, install the skipped characters and return its CE.
2437 if (multicontraction) {
2438 source->pos = temppos - 1;
2439 setDiscontiguosAttribute(source, buffer);
2440 return *(coll->contractionCEs +
2441 (tempconstart - coll->contractionIndex));
2442 }
2443 constart = tempconstart;
2444 break;
2445 }
2446
2447 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2448 schar = getNextNormalizedChar(source);
2449
// Advance through the table entries while they are smaller than schar.
2450 while (schar > (tchar = *UCharOffset)) {
2451 UCharOffset++;
2452 }
2453
2454 if (schar != tchar) {
2455 /* not the correct codepoint. we stuff the current codepoint into
2456 the discontiguos buffer and try the next character */
2457 buffer.append(schar);
2458 continue;
2459 }
2460 else {
// A table hit whose combining class equals that of the code point two back is
// treated like a miss (presumably because that earlier mark blocks it - confirm).
2461 if (u_getCombiningClass(schar) ==
2462 u_getCombiningClass(peekCodePoint(source, -2))) {
2463 buffer.append(schar);
2464 continue;
2465 }
2466 result = *(coll->contractionCEs +
2467 (UCharOffset - coll->contractionIndex));
2468 }
2469
2470 if (result == UCOL_NOT_FOUND) {
2471 break;
2472 } else if (isContraction(result)) {
2473 /* this is a multi-contraction*/
2474 tempconstart = (UChar *)coll->image + getContractOffset(result);
// NOTE(review): the CE tested here is the one stored for constart, not for the
// new tempconstart - confirm that this is intended.
2475 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2476 != UCOL_NOT_FOUND) {
2477 multicontraction = TRUE;
2478 temppos = source->pos + 1;
2479 }
2480 } else {
// Final CE found: make the skipped characters readable again and return it.
2481 setDiscontiguosAttribute(source, buffer);
2482 return result;
2483 }
2484 }
2485
2486 /* no problems simply reverting just like that,
2487 if we are in string before getting into this function, points back to
2488 string hence no problem.
2489 if we are in normalization buffer before getting into this function,
2490 since we'll never use another normalization within this function, we
2491 know that fcdposition points to a base character. the normalization buffer
2492 never change, hence this revert works. */
2493 loadState(source, &discState, TRUE);
2494 goBackOne(source);
2495
2496 //source->pos = temppos - 1;
2497 source->flags = tempflags;
2498 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2499 }
2500
2501 /* now uses Mark's getImplicitPrimary code */
2502 static
2503 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2504 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2505 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2506 collationSource->offsetRepeatCount += 1;
2507 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2508 }
2509
2510 /**
2511 * Inserts the argument character into the front of the buffer replacing the
2512 * front null terminator.
2513 * @param data collation element iterator data
2514 * @param ch character to be appended
2515 */
2516 static
2517 inline void insertBufferFront(collIterate *data, UChar ch)
2518 {
2519 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer minatedBuffer() + 2;
2520 }
2521
2522 /**
2523 * Special normalization function for contraction in the previous iterator.
2524 * This normalization sequence will place the current character at source->pos
2525 * and its following normalized sequence into the buffer.
2526 * The fcd position, pos will be changed.
2527 * pos will now point to positions in the buffer.
2528 * Flags will be changed accordingly.
2529 * @param data collation iterator data
2530 */
2531 static
2532 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2533 {
2534 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2535 const UChar *pStart;
2536
2537 UnicodeString endOfBuffer;
2538 if (data->flags & UCOL_ITER_HASLEN) {
2539 /*
2540 normalization buffer not used yet, we'll pull down the next
2541 character into the end of the buffer
2542 */
2543 endOfBuffer.setTo(*pEnd);
2544 }
2545 else {
2546 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2547 }
2548
2549 if (data->fcdPosition == NULL) {
2550 pStart = data->string;
2551 }
2552 else {
2553 pStart = data->fcdPosition + 1;
2554 }
2555 int32_t normLen =
2556 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)),
2557 data->writableBuffer,
2558 *status).
2559 length();
2560 if(U_FAILURE(*status)) {
2561 return;
2562 }
2563 /*
2564 this puts the null termination infront of the normalized string instead
2565 of the end
2566 */
2567 data->pos =
2568 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat edBuffer() +
2569 1 + normLen;
2570 data->origFlags = data->flags;
2571 data->flags |= UCOL_ITER_INNORMBUF;
2572 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2573 }
2574
2575 /**
2576 * Contraction character management function that returns the previous character
2577 * for the backwards iterator.
2578 * Does nothing if the previous character is in buffer and not the first
2579 * character in it.
2580 * Else it checks previous character in data string to see if it is
2581 * normalizable.
2582 * If it is not, the character is simply copied into the buffer, else
2583 * the whole normalized substring is copied into the buffer, including the
2584 * current character.
 * In the simple paths the character before data->pos is returned without
 * moving data->pos; the caller visible in this file, ucol_prv_getSpecialCE,
 * follows the call with goBackOne().
2585 * @param data collation element iterator data
 * @param status error status, passed on to normalizePrevContraction
2586 * @return previous character
2587 */
2588 static
2589 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2590 {
2591 UChar prevch;
2592 UChar ch;
2593 const UChar *start;
// Captured once on entry; the flags may be changed further down by normalizePrevContraction.
2594 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2595 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2596 (innormbuf && *(data->pos - 1) != 0)) {
2597 /*
2598 if no normalization.
2599 if previous character is in normalized buffer, no further normalization
2600 is required
2601 */
2602 if(data->flags & UCOL_USE_ITERATOR) {
// Step back one code unit and read it again going forward.
2603 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2604 return (UChar)data->iterator->next(data->iterator);
2605 } else {
2606 return *(data->pos - 1);
2607 }
2608 }
2609
2610 start = data->pos;
2611 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2612 /* in data string */
// Only one character is left before pos: return it, no lookbehind is possible.
2613 if ((start - 1) == data->string) {
2614 return *(start - 1);
2615 }
2616 start --;
2617 ch = *start;
2618 prevch = *(start - 1);
2619 }
2620 else {
2621 /*
2622 in writable buffer, at this point fcdPosition can not be NULL.
2623 see contracting tag.
2624 */
2625 if (data->fcdPosition == data->string) {
2626 /* at the start of the string, just dump it into the normalizer */
2627 insertBufferFront(data, *(data->fcdPosition));
2628 data->fcdPosition = NULL;
2629 return *(data->pos - 1);
2630 }
2631 start = data->fcdPosition;
2632 ch = *start;
2633 prevch = *(start - 1);
2634 }
2635 /*
2636 * if the current character is not fcd.
2637 * Trailing combining class == 0.
2638 */
2639 if (data->fcdPosition > start &&
2640 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2641 {
2642 /*
2643 Need a more complete FCD check and possible normalization.
2644 normalize substring will be appended to buffer
2645 */
// collPrevIterFCD works from data->pos, so pos is pointed at start for the
// check and put back afterwards when no normalization is needed.
2646 const UChar *backuppos = data->pos;
2647 data->pos = start;
2648 if (collPrevIterFCD(data)) {
2649 normalizePrevContraction(data, status);
2650 return *(data->pos - 1);
2651 }
2652 data->pos = backuppos;
2653 data->fcdPosition ++;
2654 }
2655
2656 if (innormbuf) {
2657 /*
2658 no normalization is to be done hence only one character will be
2659 appended to the buffer.
2660 */
// The character is placed at the front of the buffer and fcdPosition moves back by one.
2661 insertBufferFront(data, ch);
2662 data->fcdPosition --;
2663 }
2664
2665 return ch;
2666 }
2667
2668 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2669 /* It is called by getNextCE */
2670
2671 /* The following should be even */
2672 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2673
2674 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col lIterate *source, UErrorCode *status) {
2675 collIterateState entryState;
2676 backupState(source, &entryState);
2677 UChar32 cp = ch;
2678
2679 for (;;) {
2680 // This loop will repeat only in the case of contractions, and only when a contraction
2681 // is found and the first CE resulting from that contraction is itself a special
2682 // (an expansion, for example.) All other special CE types are fully handled the
2683 // first time through, and the loop exits.
2684
2685 const uint32_t *CEOffset = NULL;
2686 switch(getCETag(CE)) {
2687 case NOT_FOUND_TAG:
2688 /* This one is not found, and we'll let somebody else bother about i t... no more games */
2689 return CE;
2690 case SPEC_PROC_TAG:
2691 {
2692 // Special processing is getting a CE that is preceded by a cert ain prefix
2693 // Currently this is only needed for optimizing Japanese length and iteration marks.
2694 // When we encouter a special processing tag, we go backwards an d try to see if
2695 // we have a match.
2696 // Contraction tables are used - so the whole process is not unl ike contraction.
2697 // prefix data is stored backwards in the table.
2698 const UChar *UCharOffset;
2699 UChar schar, tchar;
2700 collIterateState prefixState;
2701 backupState(source, &prefixState);
2702 loadState(source, &entryState, TRUE);
2703 goBackOne(source); // We want to look at the point where we ente red - actually one
2704 // before that...
2705
2706 for(;;) {
2707 // This loop will run once per source string character, for as long as we
2708 // are matching a potential contraction sequence
2709
2710 // First we position ourselves at the begining of contractio n sequence
2711 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
2712 if (collIter_bos(source)) {
2713 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
2714 break;
2715 }
2716 schar = getPrevNormalizedChar(source, status);
2717 goBackOne(source);
2718
2719 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
2720 UCharOffset++;
2721 }
2722
2723 if (schar == tchar) {
2724 // Found the source string char in the table.
2725 // Pick up the corresponding CE from the table.
2726 CE = *(coll->contractionCEs +
2727 (UCharOffset - coll->contractionIndex));
2728 }
2729 else
2730 {
2731 // Source string char was not in the table.
2732 // We have not found the prefix.
2733 CE = *(coll->contractionCEs +
2734 (ContractionStart - coll->contractionIndex));
2735 }
2736
2737 if(!isPrefix(CE)) {
2738 // The source string char was in the contraction table, and the corresponding
2739 // CE is not a prefix CE. We found the prefix, break
2740 // out of loop, this CE will end up being returned. T his is the normal
2741 // way out of prefix handling when the source actually contained
2742 // the prefix.
2743 break;
2744 }
2745 }
2746 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri lly continue
2747 loadState(source, &prefixState, TRUE);
2748 if(source->origFlags & UCOL_USE_ITERATOR) {
2749 source->flags = source->origFlags;
2750 }
2751 } else { // prefix search was a failure, we have to backup all t he way to the start
2752 loadState(source, &entryState, TRUE);
2753 }
2754 break;
2755 }
2756 case CONTRACTION_TAG:
2757 {
2758 /* This should handle contractions */
2759 collIterateState state;
2760 backupState(source, &state);
2761 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->imag e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2762 const UChar *UCharOffset;
2763 UChar schar, tchar;
2764
2765 for (;;) {
2766 /* This loop will run once per source string character, for as long as we */
2767 /* are matching a potential contraction sequence */
2768
2769 /* First we position ourselves at the begining of contractio n sequence */
2770 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
2771
2772 if (collIter_eos(source)) {
2773 // Ran off the end of the source string.
2774 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
2775 // So we'll pick whatever we have at the point...
2776 if (CE == UCOL_NOT_FOUND) {
2777 // back up the source over all the chars we scanned going into this contraction.
2778 CE = firstCE;
2779 loadState(source, &state, TRUE);
2780 if(source->origFlags & UCOL_USE_ITERATOR) {
2781 source->flags = source->origFlags;
2782 }
2783 }
2784 break;
2785 }
2786
2787 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the di scontiguos stuff */ /* skip the backward offset, see above */
2788 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2789
2790 schar = getNextNormalizedChar(source);
2791 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
2792 UCharOffset++;
2793 }
2794
2795 if (schar == tchar) {
2796 // Found the source string char in the contraction table .
2797 // Pick up the corresponding CE from the table.
2798 CE = *(coll->contractionCEs +
2799 (UCharOffset - coll->contractionIndex));
2800 }
2801 else
2802 {
2803 // Source string char was not in contraction table.
2804 // Unless we have a discontiguous contraction, we have finished
2805 // with this contraction.
2806 // in order to do the proper detection, we
2807 // need to see if we're dealing with a supplementary
2808 /* We test whether the next two char are surrogate pairs .
2809 * This test is done if the iterator is not NULL.
2810 * If there is no surrogate pair, the iterator
2811 * goes back one if needed. */
2812 UChar32 miss = schar;
2813 if (source->iterator) {
2814 UChar32 surrNextChar; /* the next char in the iterat ion to test */
2815 int32_t prevPos; /* holds the previous position befo re move forward of the source iterator */
2816 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s ource->iterator)) {
2817 prevPos = source->iterator->index;
2818 surrNextChar = getNextNormalizedChar(source);
2819 if (U16_IS_TRAIL(surrNextChar)) {
2820 miss = U16_GET_SUPPLEMENTARY(schar, surrNext Char);
2821 } else if (prevPos < source->iterator->index){
2822 goBackOne(source);
2823 }
2824 }
2825 } else if (U16_IS_LEAD(schar)) {
2826 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalize dChar(source));
2827 }
2828
2829 uint8_t sCC;
2830 if (miss < 0x300 ||
2831 maxCC == 0 ||
2832 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2833 sCC>maxCC ||
2834 (allSame != 0 && sCC == maxCC) ||
2835 collIter_eos(source))
2836 {
2837 // Contraction can not be discontiguous.
2838 goBackOne(source); // back up the source string by one,
2839 // because the character we just looked at was
2840 // not part of the contraction. */
2841 if(U_IS_SUPPLEMENTARY(miss)) {
2842 goBackOne(source);
2843 }
2844 CE = *(coll->contractionCEs +
2845 (ContractionStart - coll->contractionIndex));
2846 } else {
2847 //
2848 // Contraction is possibly discontiguous.
2849 // Scan more of source string looking for a match
2850 //
2851 UChar tempchar;
2852 /* find the next character if schar is not a base ch aracter
2853 and we are not yet at the end of the string */
2854 tempchar = getNextNormalizedChar(source);
2855 // probably need another supplementary thingie here
2856 goBackOne(source);
2857 if (i_getCombiningClass(tempchar, coll) == 0) {
2858 goBackOne(source);
2859 if(U_IS_SUPPLEMENTARY(miss)) {
2860 goBackOne(source);
2861 }
2862 /* Spit out the last char of the string, wasn't tasty enough */
2863 CE = *(coll->contractionCEs +
2864 (ContractionStart - coll->contractionIndex)) ;
2865 } else {
2866 CE = getDiscontiguous(coll, source, ContractionS tart);
2867 }
2868 }
2869 } // else after if(schar == tchar)
2870
2871 if(CE == UCOL_NOT_FOUND) {
2872 /* The Source string did not match the contraction that we were checking. */
2873 /* Back up the source position to undo the effects of h aving partially */
2874 /* scanned through what ultimately proved to not be a contraction. */
2875 loadState(source, &state, TRUE);
2876 CE = firstCE;
2877 break;
2878 }
2879
2880 if(!isContraction(CE)) {
2881 // The source string char was in the contraction table, and the corresponding
2882 // CE is not a contraction CE. We completed the contr action, break
2883 // out of loop, this CE will end up being returned. T his is the normal
2884 // way out of contraction handling when the source act ually contained
2885 // the contraction.
2886 break;
2887 }
2888
2889
2890 // The source string char was in the contraction table, and the corresponding
2891 // CE is IS a contraction CE. We will continue looping t o check the source
2892 // string for the remaining chars in the contraction.
2893 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2894 if(tempCE != UCOL_NOT_FOUND) {
2895 // We have scanned a a section of source string for whic h there is a
2896 // CE from the contraction table. Remember the CE and scan position, so
2897 // that we can return to this point if further scanning fails to
2898 // match a longer contraction sequence.
2899 firstCE = tempCE;
2900
2901 goBackOne(source);
2902 backupState(source, &state);
2903 getNextNormalizedChar(source);
2904
2905 // Another way to do this is:
2906 //collIterateState tempState;
2907 //backupState(source, &tempState);
2908 //goBackOne(source);
2909 //backupState(source, &state);
2910 //loadState(source, &tempState, TRUE);
2911
2912 // The problem is that for incomplete contractions we ha ve to remember the previous
2913 // position. Before, the only thing I needed to do was s tate.pos--;
2914 // After iterator introduction and especially after intr oduction of normalizing
2915 // iterators, it became much more difficult to decrease the saved state.
2916 // I'm not yet sure which of the two methods above is fa ster.
2917 }
2918 } // for(;;)
2919 break;
2920 } // case CONTRACTION_TAG:
2921 case LONG_PRIMARY_TAG:
2922 {
2923 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2924 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYT E_COMMON;
2925 source->offsetRepeatCount += 1;
2926 return CE;
2927 }
2928 case EXPANSION_TAG:
2929 {
2930 /* This should handle expansion. */
2931 /* NOTE: we can encounter both continuations and expansions in a n expansion! */
2932 /* I have to decide where continuations are going to be dealt wi th */
2933 uint32_t size;
2934 uint32_t i; /* general counter */
2935
2936 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* fi nd the offset to expansion table */
2937 size = getExpansionCount(CE);
2938 CE = *CEOffset++;
2939 //source->offsetRepeatCount = -1;
2940
2941 if(size != 0) { /* if there are less than 16 elements in expansi on, we don't terminate */
2942 for(i = 1; i<size; i++) {
2943 *(source->CEpos++) = *CEOffset++;
2944 source->offsetRepeatCount += 1;
2945 }
2946 } else { /* else, we do */
2947 while(*CEOffset != 0) {
2948 *(source->CEpos++) = *CEOffset++;
2949 source->offsetRepeatCount += 1;
2950 }
2951 }
2952
2953 return CE;
2954 }
2955 case DIGIT_TAG:
2956 {
2957 /*
2958 We do a check to see if we want to collate digits as numbers; if so we generate
2959 a custom collation key. Otherwise we pull out the value stored i n the expansion table.
2960 */
2961 //uint32_t size;
2962 uint32_t i; /* general counter */
2963
2964 if (source->coll->numericCollation == UCOL_ON){
2965 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2966 UChar32 char32 = 0;
2967 int32_t digVal = 0;
2968
2969 uint32_t digIndx = 0;
2970 uint32_t endIndex = 0;
2971 uint32_t trailingZeroIndex = 0;
2972
2973 uint8_t collateVal = 0;
2974
2975 UBool nonZeroValReached = FALSE;
2976
2977 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j ust need a temporary place to store my generated CEs.
2978 /*
2979 We parse the source string until we hit a char that's N OT a digit.
2980 Use this u_charDigitValue. This might be slow because we have to
2981 handle surrogates...
2982 */
2983 /*
2984 if (U16_IS_LEAD(ch)){
2985 if (!collIter_eos(source)) {
2986 backupState(source, &digitState);
2987 UChar trail = getNextNormalizedChar(source);
2988 if(U16_IS_TRAIL(trail)) {
2989 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2990 } else {
2991 loadState(source, &digitState, TRUE);
2992 char32 = ch;
2993 }
2994 } else {
2995 char32 = ch;
2996 }
2997 } else {
2998 char32 = ch;
2999 }
3000 digVal = u_charDigitValue(char32);
3001 */
3002 digVal = u_charDigitValue(cp); // if we have arrived here, w e have
3003 // already processed possible supplementaries that trigered the digit tag -
3004 // all supplementaries are marked in the UCA.
3005 /*
3006 We pad a zero in front of the first element anyways. Th is takes
3007 care of the (probably) most common case where people are sorting things followed
3008 by a single digit
3009 */
3010 digIndx++;
3011 for(;;){
3012 // Make sure we have enough space. No longer needed;
3013 // at this point digIndx now has a max value of UCOL_MAX _DIGITS_FOR_NUMBER
3014 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3015 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3016
3017 // Skipping over leading zeroes.
3018 if (digVal != 0) {
3019 nonZeroValReached = TRUE;
3020 }
3021 if (nonZeroValReached) {
3022 /*
3023 We parse the digit string into base 100 numbers (thi s fits into a byte).
3024 We only add to the buffer in twos, thus if we are pa rsing an odd character,
3025 that serves as the 'tens' digit while the if we are parsing an even one, that
3026 is the 'ones' digit. We dumped the parsed base 100 v alue (collateVal) into
3027 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3028 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3029 than all the other bytes.
3030 */
3031
3032 if (digIndx % 2 == 1){
3033 collateVal += (uint8_t)digVal;
3034
3035 // We don't enter the low-order-digit case unles s we've already seen
3036 // the high order, or for the first digit, which is always non-zero.
3037 if (collateVal != 0)
3038 trailingZeroIndex = 0;
3039
3040 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3041 collateVal = 0;
3042 }
3043 else{
3044 // We drop the collation value into the buffer s o if we need to do
3045 // a "front patch" we don't have to check to see if we're hitting the
3046 // last element.
3047 collateVal = (uint8_t)(digVal * 10);
3048
3049 // Check for trailing zeroes.
3050 if (collateVal == 0)
3051 {
3052 if (!trailingZeroIndex)
3053 trailingZeroIndex = (digIndx/2) + 2;
3054 }
3055 else
3056 trailingZeroIndex = 0;
3057
3058 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3059 }
3060 digIndx++;
3061 }
3062
3063 // Get next character.
3064 if (!collIter_eos(source)){
3065 ch = getNextNormalizedChar(source);
3066 if (U16_IS_LEAD(ch)){
3067 if (!collIter_eos(source)) {
3068 backupState(source, &digitState);
3069 UChar trail = getNextNormalizedChar(source);
3070 if(U16_IS_TRAIL(trail)) {
3071 char32 = U16_GET_SUPPLEMENTARY(ch, trail );
3072 } else {
3073 loadState(source, &digitState, TRUE);
3074 char32 = ch;
3075 }
3076 }
3077 } else {
3078 char32 = ch;
3079 }
3080
3081 if ((digVal = u_charDigitValue(char32)) == -1 || dig Indx > UCOL_MAX_DIGITS_FOR_NUMBER){
3082 // Resetting position to point to the next unpro cessed char. We
3083 // overshot it when doing our test/set for numbe rs.
3084 if (char32 > 0xFFFF) { // For surrogates.
3085 loadState(source, &digitState, TRUE);
3086 //goBackOne(source);
3087 }
3088 goBackOne(source);
3089 break;
3090 }
3091 } else {
3092 break;
3093 }
3094 }
3095
3096 if (nonZeroValReached == FALSE){
3097 digIndx = 2;
3098 numTempBuf[2] = 6;
3099 }
3100
3101 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx /2) + 2) ;
3102 if (digIndx % 2 != 0){
3103 /*
3104 We missed a value. Since digIndx isn't even, stuck too m any values into the buffer (this is what
3105 we get for padding the first byte with a zero). "Front-p atch" now by pushing all nybbles forward.
3106 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3107 single pass and optimizes for strings with single digits . I'm just assuming that's the more common case.
3108 */
3109
3110 for(i = 2; i < endIndex; i++){
3111 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3112 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3113 }
3114 --digIndx;
3115 }
3116
3117 // Subtract one off of the last byte.
3118 numTempBuf[endIndex-1] -= 1;
3119
3120 /*
3121 We want to skip over the first two slots in the buffer. The first slot
3122 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3123 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3124 */
3125 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3126 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3127
3128 // Now transfer the collation key to our collIterate struct.
3129 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3130 //size = ((endIndex+1) & ~1)/2;
3131 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARY ORDERSHIFT) | //Primary weight
3132 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco ndary weight
3133 UCOL_BYTE_COMMON; // Tertiary weight.
3134 i = 2; // Reset the index into the buffer.
3135 while(i < endIndex)
3136 {
3137 uint32_t primWeight = numTempBuf[i++] << 8;
3138 if ( i < endIndex)
3139 primWeight |= numTempBuf[i++];
3140 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) | UCOL_CONTINUATION_MARKER;
3141 }
3142
3143 } else {
3144 // no numeric mode, we'll just switch to whatever we stashed and continue
3145 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); / * find the offset to expansion table */
3146 CE = *CEOffset++;
3147 break;
3148 }
3149 return CE;
3150 }
3151 /* various implicits optimization */
3152 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3153 /* UCA is filled with these. Tailorings are NOT_FOUND */
3154 return getImplicit(cp, source);
3155 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */
3156 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl icit
3157 return getImplicit(cp, source);
3158 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3159 {
3160 static const uint32_t
3161 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;
3162 //const uint32_t LCount = 19;
3163 static const uint32_t VCount = 21;
3164 static const uint32_t TCount = 28;
3165 //const uint32_t NCount = VCount * TCount; // 588
3166 //const uint32_t SCount = LCount * NCount; // 11172
3167 uint32_t L = ch - SBase;
3168
3169 // divide into pieces
3170
3171 uint32_t T = L % TCount; // we do it in this order since some co mpilers can do % and / in one operation
3172 L /= TCount;
3173 uint32_t V = L % VCount;
3174 L /= VCount;
3175
3176 // offset them
3177
3178 L += LBase;
3179 V += VBase;
3180 T += TBase;
3181
3182 // return the first CE, but first put the rest into the expansio n buffer
3183 if (!source->coll->image->jamoSpecial) { // FAST PATH
3184
3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );
3186 if (T != TBase) {
3187 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);
3188 }
3189
3190 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3191
3192 } else { // Jamo is Special
3193 // Since Hanguls pass the FCD check, it is
3194 // guaranteed that we won't be in
3195 // the normalization buffer if something like this happens
3196 // However, if we are using a uchar iterator and normalizati on
3197 // is ON, the Hangul that lead us here is going to be in tha t
3198 // normalization buffer. Here we want to restore the uchar
3199 // iterator state and pull out of the normalization buffer
3200 if(source->iterator != NULL && source->flags & UCOL_ITER_INN ORMBUF) {
3201 source->flags = source->origFlags; // restore the iterat or
3202 source->pos = NULL;
3203 }
3204 // Move Jamos into normalization buffer
3205 UChar *buffer = source->writableBuffer.getBuffer(4);
3206 int32_t bufferLength;
3207 buffer[0] = (UChar)L;
3208 buffer[1] = (UChar)V;
3209 if (T != TBase) {
3210 buffer[2] = (UChar)T;
3211 bufferLength = 3;
3212 } else {
3213 bufferLength = 2;
3214 }
3215 source->writableBuffer.releaseBuffer(bufferLength);
3216
3217 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3218 // after exhausting the writableBuffer
3219 source->pos = source->writableBuffer.getTerminatedBuffer() ;
3220 source->origFlags = source->flags;
3221 source->flags |= UCOL_ITER_INNORMBUF;
3222 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3223
3224 return(UCOL_IGNORABLE);
3225 }
3226 }
3227 case SURROGATE_TAG:
3228 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3229 /* two things can happen here: next code point can be a trailing sur rogate - we will use it */
3230 /* to retrieve the CE, or it is not a trailing surrogate (or the str ing is done). In that case */
3231 /* we treat it like an unassigned code point. */
3232 {
3233 UChar trail;
3234 collIterateState state;
3235 backupState(source, &state);
3236 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNorma lizedChar(source))))) {
3237 // we chould have stepped one char forward and it might have turned that it
3238 // was not a trail surrogate. In that case, we have to backu p.
3239 loadState(source, &state, TRUE);
3240 return UCOL_NOT_FOUND;
3241 } else {
3242 /* TODO: CE contain the data from the previous CE + the mask . It should at least be unmasked */
3243 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF FF, trail);
3244 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3245 // We need to backup
3246 loadState(source, &state, TRUE);
3247 return CE;
3248 }
3249 // calculate the supplementary code point value, if surrogat e was not tailored
3250 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U L)+0xdc00-0x10000));
3251 }
3252 }
3253 break;
3254 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3255 UChar nextChar;
3256 if( source->flags & UCOL_USE_ITERATOR) {
3257 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source ->iterator))) {
3258 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3259 source->iterator->next(source->iterator);
3260 return getImplicit(cp, source);
3261 }
3262 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->po s<source->endp)) &&
3263 U_IS_TRAIL((nextChar=*source->pos))) {
3264 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3265 source->pos++;
3266 return getImplicit(cp, source);
3267 }
3268 return UCOL_NOT_FOUND;
3269 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3270 return UCOL_NOT_FOUND; /* broken surrogate sequence */
3271 case CHARSET_TAG:
3272 /* not yet implemented */
3273 /* probably after 1.8 */
3274 return UCOL_NOT_FOUND;
3275 default:
3276 *status = U_INTERNAL_PROGRAM_ERROR;
3277 CE=0;
3278 break;
3279 }
3280 if (CE <= UCOL_NOT_FOUND) break;
3281 }
3282 return CE;
3283 }
3284
3285
3286 /* now uses Mark's getImplicitPrimary code */
/*
 * Builds the pair of collation elements for code point cp from the implicit
 * primary weight r returned by uprv_uca_getImplicitPrimary(cp).
 * Presumably the backward-iteration counterpart of getImplicit() (judging by
 * the name; no call site is visible in this part of the file).
 *
 * - The CE built from (r & UCOL_PRIMARYMASK) is written into the CE buffer at
 *   CEpos (which is advanced), and toReturn is set to the slot just past it.
 * - The CE built from the low 16 bits of r is the return value.
 * - The offset bookkeeping of collationSource is updated as well: either
 *   offsetRepeatCount, or offsetBuffer/offsetStore/offsetReturn, depending on
 *   the UCOL_ITER_INNORMBUF flag.
 *
 * @param cp               code point handed to uprv_uca_getImplicitPrimary()
 * @param collationSource  iterator whose CE buffer and offset fields are updated
 * @return ((r & 0x0000FFFF)<<16) | 0x000000C0
 *
 * NOTE(review): 0x00000505 is presumably the common secondary/tertiary weight
 * pair and 0x000000C0 presumably the continuation marker - confirm against
 * the constants in ucol_imp.h.
 */
3287 static
3288 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3289 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3290
     // Buffer the CE carrying the masked primary bits of r; toReturn then points
     // one slot past the CE that was just stored.
3291 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3292 collationSource->toReturn = collationSource->CEpos;
3293
3294 // **** doesn't work if using iterator ****
3295 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
     // UCOL_ITER_INNORMBUF is set: no offsets are appended, only the repeat
     // count is set.
3296 collationSource->offsetRepeatCount = 1;
3297 } else {
     // Offset of pos relative to the start of the string.
3298 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource-> string);
3299
     // NOTE(review): errorCode is local and never inspected afterwards, so a
     // failure reported by appendOffset() is dropped here - confirm that the
     // pointer arithmetic and the store below are still safe in that case.
3300 UErrorCode errorCode = U_ZERO_ERROR;
3301 collationSource->appendOffset(firstOffset, errorCode);
3302 collationSource->appendOffset(firstOffset + 1, errorCode);
3303
     // offsetReturn is set to the slot just before offsetStore, then the first
     // slot of offsetBuffer is overwritten with firstOffset.
3304 collationSource->offsetReturn = collationSource->offsetStore - 1;
3305 *(collationSource->offsetBuffer) = firstOffset;
     // When offsetReturn already sits on the first slot, offsetStore is reset
     // to the start of the buffer as well.
3306 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3307 collationSource->offsetStore = collationSource->offsetBuffer;
3308 }
3309 }
3310
     // The low 16 bits of r are moved into the top half of the returned CE.
3311 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3312 }
3313
3314 /**
3315 * This function handles the special CEs like contractions, expansions,
3316 * surrogates, Thai.
3317 * It is called by both getPrevCE
3318 */
3319 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3320 collIterate *source,
3321 UErrorCode *status)
3322 {
3323 const uint32_t *CEOffset = NULL;
3324 UChar *UCharOffset = NULL;
3325 UChar schar;
3326 const UChar *constart = NULL;
3327 uint32_t size;
3328 UChar buffer[UCOL_MAX_BUFFER];
3329 uint32_t *endCEBuffer;
3330 UChar *strbuffer;
3331 int32_t noChars = 0;
3332 int32_t CECount = 0;
3333
3334 for(;;)
3335 {
3336 /* the only ces that loops are thai and contractions */
3337 switch (getCETag(CE))
3338 {
3339 case NOT_FOUND_TAG: /* this tag always returns */
3340 return CE;
3341
3342 case SPEC_PROC_TAG:
3343 {
3344 // Special processing is getting a CE that is preceded by a cert ain prefix
3345 // Currently this is only needed for optimizing Japanese length and iteration marks.
3346 // When we encouter a special processing tag, we go backwards an d try to see if
3347 // we have a match.
3348 // Contraction tables are used - so the whole process is not unl ike contraction.
3349 // prefix data is stored backwards in the table.
3350 const UChar *UCharOffset;
3351 UChar schar, tchar;
3352 collIterateState prefixState;
3353 backupState(source, &prefixState);
3354 for(;;) {
3355 // This loop will run once per source string character, for as long as we
3356 // are matching a potential contraction sequence
3357
3358 // First we position ourselves at the begining of contractio n sequence
3359 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
3360
3361 if (collIter_bos(source)) {
3362 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
3363 break;
3364 }
3365 schar = getPrevNormalizedChar(source, status);
3366 goBackOne(source);
3367
3368 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
3369 UCharOffset++;
3370 }
3371
3372 if (schar == tchar) {
3373 // Found the source string char in the table.
3374 // Pick up the corresponding CE from the table.
3375 CE = *(coll->contractionCEs +
3376 (UCharOffset - coll->contractionIndex));
3377 }
3378 else
3379 {
3380 // if there is a completely ignorable code point in the middle of
3381 // a prefix, we need to act as if it's not there
3382 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0- fdef are set to zero)
3383 // lone surrogates cannot be set to zero as it would bre ak other processing
3384 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping , schar);
3385 // it's easy for BMP code points
3386 if(isZeroCE == 0) {
3387 continue;
3388 } else if(U16_IS_SURROGATE(schar)) {
3389 // for supplementary code points, we have to check t he next one
3390 // situations where we are going to ignore
3391 // 1. beginning of the string: schar is a lone surro gate
3392 // 2. schar is a lone surrogate
3393 // 3. schar is a trail surrogate in a valid surrogat e sequence
3394 // that is explicitly set to zero.
3395 if (!collIter_bos(source)) {
3396 UChar lead;
3397 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD( lead = getPrevNormalizedChar(source, status))) {
3398 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp ing, lead);
3399 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3400 uint32_t finalCE = UTRIE_GET32_FROM_OFFS ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3401 if(finalCE == 0) {
3402 // this is a real, assigned complete ly ignorable code point
3403 goBackOne(source);
3404 continue;
3405 }
3406 }
3407 } else {
3408 // lone surrogate, treat like unassigned
3409 return UCOL_NOT_FOUND;
3410 }
3411 } else {
3412 // lone surrogate at the beggining, treat like u nassigned
3413 return UCOL_NOT_FOUND;
3414 }
3415 }
3416 // Source string char was not in the table.
3417 // We have not found the prefix.
3418 CE = *(coll->contractionCEs +
3419 (ContractionStart - coll->contractionIndex));
3420 }
3421
3422 if(!isPrefix(CE)) {
3423 // The source string char was in the contraction table, and the corresponding
3424 // CE is not a prefix CE. We found the prefix, break
3425 // out of loop, this CE will end up being returned. T his is the normal
3426 // way out of prefix handling when the source actually contained
3427 // the prefix.
3428 break;
3429 }
3430 }
3431 loadState(source, &prefixState, TRUE);
3432 break;
3433 }
3434
3435 case CONTRACTION_TAG: {
3436 /* to ensure that the backwards and forwards iteration matches, we
3437 take the current region of most possible match and pass it through
3438 the forward iteration. this will ensure that the obstinate problem o f
3439 overlapping contractions will not occur.
3440 */
3441 schar = peekCodeUnit(source, 0);
3442 constart = (UChar *)coll->image + getContractOffset(CE);
3443 if (isAtStartPrevIterate(source)
3444 /* commented away contraction end checks after adding the checks
3445 in getPrevCE */) {
3446 /* start of string or this is not the end of any contraction */
3447 CE = *(coll->contractionCEs +
3448 (constart - coll->contractionIndex));
3449 break;
3450 }
3451 strbuffer = buffer;
3452 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3453 *(UCharOffset --) = 0;
3454 noChars = 0;
3455 // have to swap thai characters
3456 while (ucol_unsafeCP(schar, coll)) {
3457 *(UCharOffset) = schar;
3458 noChars++;
3459 UCharOffset --;
3460 schar = getPrevNormalizedChar(source, status);
3461 goBackOne(source);
3462 // TODO: when we exhaust the contraction buffer,
3463 // it needs to get reallocated. The problem is
3464 // that the size depends on the string which is
3465 // not iterated over. However, since we're travelling
3466 // backwards, we already had to set the iterator at
3467 // the end - so we might as well know where we are?
3468 if (UCharOffset + 1 == buffer) {
3469 /* we have exhausted the buffer */
3470 int32_t newsize = 0;
3471 if(source->pos) { // actually dealing with a position
3472 newsize = (int32_t)(source->pos - source->string + 1);
3473 } else { // iterator
3474 newsize = 4 * UCOL_MAX_BUFFER;
3475 }
3476 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3477 (newsize + UCOL_MAX_BUFFER));
3478 /* test for NULL */
3479 if (strbuffer == NULL) {
3480 *status = U_MEMORY_ALLOCATION_ERROR;
3481 return UCOL_NO_MORE_CES;
3482 }
3483 UCharOffset = strbuffer + newsize;
3484 uprv_memcpy(UCharOffset, buffer,
3485 UCOL_MAX_BUFFER * sizeof(UChar));
3486 UCharOffset --;
3487 }
3488 if ((source->pos && (source->pos == source->string ||
3489 ((source->flags & UCOL_ITER_INNORMBUF) &&
3490 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3491 || (source->iterator && !source->iterator->hasPrevious(sourc e->iterator))) {
3492 break;
3493 }
3494 }
3495 /* adds the initial base character to the string */
3496 *(UCharOffset) = schar;
3497 noChars++;
3498
3499 int32_t offsetBias;
3500
3501 // **** doesn't work if using iterator ****
3502 if (source->flags & UCOL_ITER_INNORMBUF) {
3503 offsetBias = -1;
3504 } else {
3505 offsetBias = (int32_t)(source->pos - source->string);
3506 }
3507
3508 /* a new collIterate is used to simplify things, since using the cur rent
3509 collIterate will mean that the forward and backwards iteration will
3510 share and change the same buffers. we don't want to get into that. * /
3511 collIterate temp;
3512 int32_t rawOffset;
3513
3514 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3515 if(U_FAILURE(*status)) {
3516 return UCOL_NULLORDER;
3517 }
3518 temp.flags &= ~UCOL_ITER_NORM;
3519 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3520
3521 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z ero?
3522 CE = ucol_IGetNextCE(coll, &temp, status);
3523
3524 if (source->extendCEs) {
3525 endCEBuffer = source->extendCEs + source->extendCEsSize;
3526 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u int32_t));
3527 } else {
3528 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3529 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_ t));
3530 }
3531
3532 while (CE != UCOL_NO_MORE_CES) {
3533 *(source->CEpos ++) = CE;
3534
3535 if (offsetBias >= 0) {
3536 source->appendOffset(rawOffset + offsetBias, *status);
3537 }
3538
3539 CECount++;
3540 if (source->CEpos == endCEBuffer) {
3541 /* ran out of CE space, reallocate to new buffer.
3542 If reallocation fails, reset pointers and bail out,
3543 there's no guarantee of the right character position after
3544 this bail*/
3545 if (!increaseCEsCapacity(source)) {
3546 *status = U_MEMORY_ALLOCATION_ERROR;
3547 break;
3548 }
3549
3550 endCEBuffer = source->extendCEs + source->extendCEsSize;
3551 }
3552
3553 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3554 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3555 } else {
3556 rawOffset = (int32_t)(temp.pos - temp.string);
3557 }
3558
3559 CE = ucol_IGetNextCE(coll, &temp, status);
3560 }
3561
3562 if (strbuffer != buffer) {
3563 uprv_free(strbuffer);
3564 }
3565 if (U_FAILURE(*status)) {
3566 return (uint32_t)UCOL_NULLORDER;
3567 }
3568
3569 if (source->offsetRepeatValue != 0) {
3570 if (CECount > noChars) {
3571 source->offsetRepeatCount += temp.offsetRepeatCount;
3572 } else {
3573 // **** does this really skip the right offsets? ****
3574 source->offsetReturn -= (noChars - CECount);
3575 }
3576 }
3577
3578 if (offsetBias >= 0) {
3579 source->offsetReturn = source->offsetStore - 1;
3580 if (source->offsetReturn == source->offsetBuffer) {
3581 source->offsetStore = source->offsetBuffer;
3582 }
3583 }
3584
3585 source->toReturn = source->CEpos - 1;
3586 if (source->toReturn == source->CEs) {
3587 source->CEpos = source->CEs;
3588 }
3589
3590 return *(source->toReturn);
3591 }
3592 case LONG_PRIMARY_TAG:
3593 {
3594 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3595 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3596 source->toReturn = source->CEpos - 1;
3597
3598 if (source->flags & UCOL_ITER_INNORMBUF) {
3599 source->offsetRepeatCount = 1;
3600 } else {
3601 int32_t firstOffset = (int32_t)(source->pos - source->string );
3602
3603 source->appendOffset(firstOffset, *status);
3604 source->appendOffset(firstOffset + 1, *status);
3605
3606 source->offsetReturn = source->offsetStore - 1;
3607 *(source->offsetBuffer) = firstOffset;
3608 if (source->offsetReturn == source->offsetBuffer) {
3609 source->offsetStore = source->offsetBuffer;
3610 }
3611 }
3612
3613
3614 return *(source->toReturn);
3615 }
3616
3617 case EXPANSION_TAG: /* this tag always returns */
3618 {
3619 /*
3620 This should handle expansion.
3621 NOTE: we can encounter both continuations and expansions in an expan sion!
3622 I have to decide where continuations are going to be dealt with
3623 */
3624 int32_t firstOffset = (int32_t)(source->pos - source->string);
3625
3626 // **** doesn't work if using iterator ****
3627 if (source->offsetReturn != NULL) {
3628 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet urn == source->offsetBuffer) {
3629 source->offsetStore = source->offsetBuffer;
3630 }else {
3631 firstOffset = -1;
3632 }
3633 }
3634
3635 /* find the offset to expansion table */
3636 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3637 size = getExpansionCount(CE);
3638 if (size != 0) {
3639 /*
3640 if there are less than 16 elements in expansion, we don't termin ate
3641 */
3642 uint32_t count;
3643
3644 for (count = 0; count < size; count++) {
3645 *(source->CEpos ++) = *CEOffset++;
3646
3647 if (firstOffset >= 0) {
3648 source->appendOffset(firstOffset + 1, *status);
3649 }
3650 }
3651 } else {
3652 /* else, we do */
3653 while (*CEOffset != 0) {
3654 *(source->CEpos ++) = *CEOffset ++;
3655
3656 if (firstOffset >= 0) {
3657 source->appendOffset(firstOffset + 1, *status);
3658 }
3659 }
3660 }
3661
3662 if (firstOffset >= 0) {
3663 source->offsetReturn = source->offsetStore - 1;
3664 *(source->offsetBuffer) = firstOffset;
3665 if (source->offsetReturn == source->offsetBuffer) {
3666 source->offsetStore = source->offsetBuffer;
3667 }
3668 } else {
3669 source->offsetRepeatCount += size - 1;
3670 }
3671
3672 source->toReturn = source->CEpos - 1;
3673 // in case of one element expansion, we
3674 // want to immediately return CEpos
3675 if(source->toReturn == source->CEs) {
3676 source->CEpos = source->CEs;
3677 }
3678
3679 return *(source->toReturn);
3680 }
3681
3682 case DIGIT_TAG:
3683 {
3684 /*
3685 We do a check to see if we want to collate digits as numbers; if so we generate
3686 a custom collation key. Otherwise we pull out the value stored i n the expansion table.
3687 */
3688 uint32_t i; /* general counter */
3689
3690 if (source->coll->numericCollation == UCOL_ON){
3691 uint32_t digIndx = 0;
3692 uint32_t endIndex = 0;
3693 uint32_t leadingZeroIndex = 0;
3694 uint32_t trailingZeroCount = 0;
3695
3696 uint8_t collateVal = 0;
3697
3698 UBool nonZeroValReached = FALSE;
3699
3700 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j ust need a temporary place to store my generated CEs.
3701 /*
3702 We parse the source string until we hit a char that's NOT a digit.
3703 Use this u_charDigitValue. This might be slow because we hav e to
3704 handle surrogates...
3705 */
3706 /*
3707 We need to break up the digit string into collection element s of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3708 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3709 element we process when going backward. To determine how lon g that chunk might be, we may need to make
3710 two passes through the loop that collects digits - one to se e how long the string is (and how much is
3711 leading zeros) to determine the length of that right-hand ch unk, and a second (if the whole string has
3712 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits ) to actually process that collation
3713 element chunk after resetting the state to the initialState at the right side of the digit string.
3714 */
3715 uint32_t ceLimit = 0;
3716 UChar initial_ch = ch;
3717 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3718 backupState(source, &initialState);
3719
3720 for(;;) {
3721 collIterateState state = {0,0,0,0,0,0,0,0,0};
3722 UChar32 char32 = 0;
3723 int32_t digVal = 0;
3724
3725 if (U16_IS_TRAIL (ch)) {
3726 if (!collIter_bos(source)){
3727 UChar lead = getPrevNormalizedChar(source, statu s);
3728 if(U16_IS_LEAD(lead)) {
3729 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3730 goBackOne(source);
3731 } else {
3732 char32 = ch;
3733 }
3734 } else {
3735 char32 = ch;
3736 }
3737 } else {
3738 char32 = ch;
3739 }
3740 digVal = u_charDigitValue(char32);
3741
3742 for(;;) {
3743 // Make sure we have enough space. No longer needed;
3744 // at this point the largest value of digIndx when w e need to save data in numTempBuf
3745 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post- incremented) so we just ensure
3746 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO R_NUMBER/2 + 2).
3747
3748 // Skip over trailing zeroes, and keep a count of th em.
3749 if (digVal != 0)
3750 nonZeroValReached = TRUE;
3751
3752 if (nonZeroValReached) {
3753 /*
3754 We parse the digit string into base 100 numbers (this fits into a byte).
3755 We only add to the buffer in twos, thus if we ar e parsing an odd character,
3756 that serves as the 'tens' digit while the if we are parsing an even one, that
3757 is the 'ones' digit. We dumped the parsed base 1 00 value (collateVal) into
3758 a buffer. We multiply each collateVal by 2 (to g ive us room) and add 5 (to avoid
3759 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3760 than all the other bytes.
3761
3762 Since we're doing in this reverse we want to put the first digit encountered into the
3763 ones place and the second digit encountered into the tens place.
3764 */
3765
3766 if ((digIndx + trailingZeroCount) % 2 == 1) {
3767 // High-order digit case (tens place)
3768 collateVal += (uint8_t)(digVal * 10);
3769
3770 // We cannot set leadingZeroIndex unless it has been set for the
3771 // low-order digit. Therefore, all we can do for the high-order
3772 // digit is turn it off, never on.
3773 // The only time we will have a high digit w ithout a low is for
3774 // the very first non-zero digit, so no zero check is necessary.
3775 if (collateVal != 0)
3776 leadingZeroIndex = 0;
3777
3778 // The first pass through, digIndx may excee d the limit, but in that case
3779 // we no longer care about numTempBuf conten ts since they will be discarded
3780 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3781 numTempBuf[(digIndx/2) + 2] = collateVal *2 + 6;
3782 }
3783 collateVal = 0;
3784 } else {
3785 // Low-order digit case (ones place)
3786 collateVal = (uint8_t)digVal;
3787
3788 // Check for leading zeroes.
3789 if (collateVal == 0) {
3790 if (!leadingZeroIndex)
3791 leadingZeroIndex = (digIndx/2) + 2;
3792 } else
3793 leadingZeroIndex = 0;
3794
3795 // No need to write to buffer; the case of a last odd digit
3796 // is handled below.
3797 }
3798 ++digIndx;
3799 } else
3800 ++trailingZeroCount;
3801
3802 if (!collIter_bos(source)) {
3803 ch = getPrevNormalizedChar(source, status);
3804 //goBackOne(source);
3805 if (U16_IS_TRAIL(ch)) {
3806 backupState(source, &state);
3807 if (!collIter_bos(source)) {
3808 goBackOne(source);
3809 UChar lead = getPrevNormalizedChar(sourc e, status);
3810
3811 if(U16_IS_LEAD(lead)) {
3812 char32 = U16_GET_SUPPLEMENTARY(lead, ch);
3813 } else {
3814 loadState(source, &state, FALSE);
3815 char32 = ch;
3816 }
3817 }
3818 } else
3819 char32 = ch;
3820
3821 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3822 if (char32 > 0xFFFF) {// For surrogates.
3823 loadState(source, &state, FALSE);
3824 }
3825 // Don't need to "reverse" the goBackOne cal l,
3826 // as this points to the next position to pr ocess..
3827 //if (char32 > 0xFFFF) // For surrogates.
3828 //getNextNormalizedChar(source);
3829 break;
3830 }
3831
3832 goBackOne(source);
3833 }else
3834 break;
3835 }
3836
3837 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N UMBER) {
3838 // our collation element is not too big, go ahead an d finish with it
3839 break;
3840 }
3841 // our digit string is too long for a collation element;
3842 // set the limit for it, reset the state and begin again
3843 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT S_FOR_NUMBER;
3844 if ( ceLimit == 0 ) {
3845 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3846 }
3847 ch = initial_ch;
3848 loadState(source, &initialState, FALSE);
3849 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun t = 0;
3850 collateVal = 0;
3851 nonZeroValReached = FALSE;
3852 }
3853
3854 if (! nonZeroValReached) {
3855 digIndx = 2;
3856 trailingZeroCount = 0;
3857 numTempBuf[2] = 6;
3858 }
3859
3860 if ((digIndx + trailingZeroCount) % 2 != 0) {
3861 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3862 digIndx += 1; // The implicit leading zero
3863 }
3864 if (trailingZeroCount % 2 != 0) {
3865 // We had to consume one trailing zero for the low digit
3866 // of the least significant byte
3867 digIndx += 1; // The trailing zero not in the expo nent
3868 trailingZeroCount -= 1;
3869 }
3870
3871 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2 ) + 2) ;
3872
3873 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3874 numTempBuf[2] -= 1;
3875
3876 /*
3877 We want to skip over the first two slots in the buffer. The first slot
3878 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3879 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3880 The exponent must be adjusted by the number of leading zeroe s, and the number of
3881 trailing zeroes.
3882 */
3883 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3884 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3885 if (leadingZeroIndex)
3886 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3887 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3888
3889 // Now transfer the collation key to our collIterate struct.
3890 // The total size for our collation key is half of endIndex, rounded up.
3891 int32_t size = (endIndex+1)/2;
3892 if(!ensureCEsCapacity(source, size)) {
3893 return UCOL_NULLORDER;
3894 }
3895 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3896 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco ndary weight
3897 UCOL_BYTE_COMMON; // Tertiary weight.
3898 i = endIndex - 1; // Reset the index into the buffer.
3899 while(i >= 2) {
3900 uint32_t primWeight = numTempBuf[i--] << 8;
3901 if ( i >= 2)
3902 primWeight |= numTempBuf[i--];
3903 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) | UCOL_CONTINUATION_MARKER;
3904 }
3905
3906 source->toReturn = source->CEpos -1;
3907 return *(source->toReturn);
3908 } else {
3909 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3910 CE = *(CEOffset++);
3911 break;
3912 }
3913 }
3914
3915 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3916 {
3917 static const uint32_t
3918 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;
3919 //const uint32_t LCount = 19;
3920 static const uint32_t VCount = 21;
3921 static const uint32_t TCount = 28;
3922 //const uint32_t NCount = VCount * TCount; /* 588 */
3923 //const uint32_t SCount = LCount * NCount; /* 11172 */
3924
3925 uint32_t L = ch - SBase;
3926 /*
3927 divide into pieces.
3928 we do it in this order since some compilers can do % and / in on e
3929 operation
3930 */
3931 uint32_t T = L % TCount;
3932 L /= TCount;
3933 uint32_t V = L % VCount;
3934 L /= VCount;
3935
3936 /* offset them */
3937 L += LBase;
3938 V += VBase;
3939 T += TBase;
3940
3941 int32_t firstOffset = (int32_t)(source->pos - source->string);
3942 source->appendOffset(firstOffset, *status);
3943
3944 /*
3945 * return the first CE, but first put the rest into the expansio n buffer
3946 */
3947 if (!source->coll->image->jamoSpecial) {
3948 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L );
3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );
3950 source->appendOffset(firstOffset + 1, *status);
3951
3952 if (T != TBase) {
3953 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);
3954 source->appendOffset(firstOffset + 1, *status);
3955 }
3956
3957 source->toReturn = source->CEpos - 1;
3958
3959 source->offsetReturn = source->offsetStore - 1;
3960 if (source->offsetReturn == source->offsetBuffer) {
3961 source->offsetStore = source->offsetBuffer;
3962 }
3963
3964 return *(source->toReturn);
3965 } else {
3966 // Since Hanguls pass the FCD check, it is
3967 // guaranteed that we won't be in
3968 // the normalization buffer if something like this happens
3969 // Move Jamos into normalization buffer
3970 /*
3971 Move the Jamos into the
3972 normalization buffer
3973 */
3974 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3975 int32_t tempbufferLength;
3976 tempbuffer[0] = 0;
3977 tempbuffer[1] = (UChar)L;
3978 tempbuffer[2] = (UChar)V;
3979 if (T != TBase) {
3980 tempbuffer[3] = (UChar)T;
3981 tempbufferLength = 4;
3982 } else {
3983 tempbufferLength = 3;
3984 }
3985 source->writableBuffer.releaseBuffer(tempbufferLength);
3986
3987 /*
3988 Indicate where to continue in main input string after exhaus ting
3989 the writableBuffer
3990 */
3991 if (source->pos == source->string) {
3992 source->fcdPosition = NULL;
3993 } else {
3994 source->fcdPosition = source->pos-1;
3995 }
3996
3997 source->pos = source->writableBuffer.getTermin atedBuffer() + tempbufferLength;
3998 source->origFlags = source->flags;
3999 source->flags |= UCOL_ITER_INNORMBUF;
4000 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HAS LEN);
4001
4002 return(UCOL_IGNORABLE);
4003 }
4004 }
4005
4006 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4007 return getPrevImplicit(ch, source);
4008
4009 // TODO: Remove CJK implicits as they are handled by the getImplicit Primary function
4010 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */
4011 return getPrevImplicit(ch, source);
4012
4013 case SURROGATE_TAG: /* This is a surrogate pair */
4014 /* essentially an engaged lead surrogate. */
4015 /* if you have encountered it here, it means that a */
4016 /* broken sequence was encountered and this is an error */
4017 return UCOL_NOT_FOUND;
4018
4019 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4020 return UCOL_NOT_FOUND; /* broken surrogate sequence */
4021
4022 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4023 {
4024 UChar32 cp = 0;
4025 UChar prevChar;
4026 const UChar *prev;
4027 if (isAtStartPrevIterate(source)) {
4028 /* we are at the start of the string, wrong place to be at * /
4029 return UCOL_NOT_FOUND;
4030 }
4031 if (source->pos != source->writableBuffer.getBuffer()) {
4032 prev = source->pos - 1;
4033 } else {
4034 prev = source->fcdPosition;
4035 }
4036 prevChar = *prev;
4037
4038 /* Handles Han and Supplementary characters here.*/
4039 if (U16_IS_LEAD(prevChar)) {
4040 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<< 10UL)+0xdc00-0x10000));
4041 source->pos = prev;
4042 } else {
4043 return UCOL_NOT_FOUND; /* like unassigned */
4044 }
4045
4046 return getPrevImplicit(cp, source);
4047 }
4048
4049 /* UCA is filled with these. Tailorings are NOT_FOUND */
4050 /* not yet implemented */
4051 case CHARSET_TAG: /* this tag always returns */
4052 /* probably after 1.8 */
4053 return UCOL_NOT_FOUND;
4054
4055 default: /* this tag always returns */
4056 *status = U_INTERNAL_PROGRAM_ERROR;
4057 CE=0;
4058 break;
4059 }
4060
4061 if (CE <= UCOL_NOT_FOUND) {
4062 break;
4063 }
4064 }
4065
4066 return CE;
4067 }
4068
4069 /* This should really be a macro */
4070 /* However, it is used only when stack buffers are not sufficiently big, and the n we're messed up performance wise */
4071 /* anyway */
4072 static
4073 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *sec ond, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4074 #ifdef UCOL_DEBUG
4075 fprintf(stderr, ".");
4076 #endif
4077 uint8_t *newStart = NULL;
4078 uint32_t offset = (uint32_t)(*secondaries-secStart);
4079
4080 if(secStart==second) {
4081 newStart=(uint8_t*)uprv_malloc(newSize);
4082 if(newStart==NULL) {
4083 *status = U_MEMORY_ALLOCATION_ERROR;
4084 return NULL;
4085 }
4086 uprv_memcpy(newStart, secStart, *secondaries-secStart);
4087 } else {
4088 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4089 if(newStart==NULL) {
4090 *status = U_MEMORY_ALLOCATION_ERROR;
4091 /* Since we're reallocating, return original reference so we don't l oose it. */
4092 return secStart;
4093 }
4094 }
4095 *secondaries=newStart+offset;
4096 *secSize=newSize;
4097 return newStart;
4098 }
4099
4100
/*
 * Reverses, in place, the elements of type TYPE between the pointers `start`
 * and `end` (both inclusive). Needed when handling continuation secondaries
 * in French collation.
 *
 * `start` and `end` must be modifiable pointer lvalues: they are evaluated
 * several times and are moved towards each other while swapping, so after
 * the macro has run they no longer delimit the original range.
 *
 * The body is wrapped in do { } while (0) so that the expansion followed by
 * a semicolon is exactly one statement and can be used safely as the
 * un-braced branch of an if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
4123
4124 /****************************************************************************/
4125 /* Following are the sortkey generation functions */
4126 /* */
4127 /****************************************************************************/
4128
4129 /**
4130 * Merge two sort keys.
4131 * This is useful, for example, to combine sort keys from first and last names
4132 * to sort such pairs.
4133 * Merged sort keys consider on each collation level the first part first entire ly,
4134 * then the second one.
4135 * It is possible to merge multiple sort keys by consecutively merging
4136 * another one with the intermediate result.
4137 *
4138 * The length of the merge result is the sum of the lengths of the input sort ke ys
4139 * minus 1.
4140 *
4141 * @param src1 the first sort key
4142 * @param src1Length the length of the first sort key, including the zero byte a t the end;
4143 * can be -1 if the function is to find the length
4144 * @param src2 the second sort key
4145 * @param src2Length the length of the second sort key, including the zero byte at the end;
4146 * can be -1 if the function is to find the length
4147 * @param dest the buffer where the merged sort key is written,
4148 * can be NULL if destCapacity==0
4149 * @param destCapacity the number of bytes in the dest buffer
4150 * @return the length of the merged sort key, src1Length+src2Length-1;
4151 * can be larger than destCapacity, or 0 if an error occurs (only for il legal arguments),
4152 * in which cases the contents of dest is undefined
4153 *
4154 * @draft
4155 */
4156 U_CAPI int32_t U_EXPORT2
4157 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4158 const uint8_t *src2, int32_t src2Length,
4159 uint8_t *dest, int32_t destCapacity) {
4160 int32_t destLength;
4161 uint8_t b;
4162
4163 /* check arguments */
4164 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[sr c1Length-1]!=0) ||
4165 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[sr c2Length-1]!=0) ||
4166 destCapacity<0 || (destCapacity>0 && dest==NULL)
4167 ) {
4168 /* error, attempt to write a zero byte and return 0 */
4169 if(dest!=NULL && destCapacity>0) {
4170 *dest=0;
4171 }
4172 return 0;
4173 }
4174
4175 /* check lengths and capacity */
4176 if(src1Length<0) {
4177 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4178 }
4179 if(src2Length<0) {
4180 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4181 }
4182
4183 destLength=src1Length+src2Length-1;
4184 if(destLength>destCapacity) {
4185 /* the merged sort key does not fit into the destination */
4186 return destLength;
4187 }
4188
4189 /* merge the sort keys with the same number of levels */
4190 while(*src1!=0 && *src2!=0) { /* while both have another level */
4191 /* copy level from src1 not including 00 or 01 */
4192 while((b=*src1)>=2) {
4193 ++src1;
4194 *dest++=b;
4195 }
4196
4197 /* add a 02 merge separator */
4198 *dest++=2;
4199
4200 /* copy level from src2 not including 00 or 01 */
4201 while((b=*src2)>=2) {
4202 ++src2;
4203 *dest++=b;
4204 }
4205
4206 /* if both sort keys have another level, then add a 01 level separator a nd continue */
4207 if(*src1==1 && *src2==1) {
4208 ++src1;
4209 ++src2;
4210 *dest++=1;
4211 }
4212 }
4213
4214 /*
4215 * here, at least one sort key is finished now, but the other one
4216 * might have some contents left from containing more levels;
4217 * that contents is just appended to the result
4218 */
4219 if(*src1!=0) {
4220 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4221 src2=src1;
4222 }
4223 /* append src2, "the other, unfinished sort key" */
4224 uprv_strcpy((char *)dest, (const char *)src2);
4225
4226 /* trust that neither sort key contained illegally embedded zero bytes */
4227 return destLength;
4228 }
4229
4230 /* sortkey API */
4231 U_CAPI int32_t U_EXPORT2
4232 ucol_getSortKey(const UCollator *coll,
4233 const UChar *source,
4234 int32_t sourceLength,
4235 uint8_t *result,
4236 int32_t resultLength)
4237 {
4238 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4239 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4240 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour ce,
4241 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt h));
4242 }
4243
4244 UErrorCode status = U_ZERO_ERROR;
4245 int32_t keySize = 0;
4246
4247 if(source != NULL) {
4248 // source == NULL is actually an error situation, but we would need to
4249 // have an error code to return it. Until we introduce a new
4250 // API, it stays like this
4251
4252 /* this uses the function pointer that is set in updateinternalstate */
4253 /* currently, there are two funcs: */
4254 /*ucol_calcSortKey(...);*/
4255 /*ucol_calcSortKeySimpleTertiary(...);*/
4256
4257 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLe ngth, FALSE, &status);
4258 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result & & resultLength > 0) {
4259 // That's not good. Something unusual happened.
4260 // We don't know how much we initialized before we failed.
4261 // NULL terminate for safety.
4262 // We have no way say that we have generated a partial sort key.
4263 //result[0] = 0;
4264 //keySize = 0;
4265 //}
4266 }
4267 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4268 UTRACE_EXIT_STATUS(status);
4269 return keySize;
4270 }
4271
4272 /* this function is called by the C++ API for sortkey generation */
4273 U_CFUNC int32_t
4274 ucol_getSortKeyWithAllocation(const UCollator *coll,
4275 const UChar *source, int32_t sourceLength,
4276 uint8_t **pResult,
4277 UErrorCode *pErrorCode) {
4278 *pResult = 0;
4279 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pError Code);
4280 }
4281
4282 #define UCOL_FSEC_BUF_SIZE 256
4283
4284 // Is this primary weight compressible?
4285 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4286 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4287 static inline UBool
4288 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4289 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegul arPrimary;
4290 }
4291
4292 /* This function tries to get the size of a sortkey. It will be invoked if the s ize of resulting buffer is 0 */
4293 /* or if we run out of space while making a sortkey and want to return ASAP */
4294 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre ntSize, UColAttributeValue strength, int32_t len) {
4295 UErrorCode status = U_ZERO_ERROR;
4296 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima ge + coll->image->UCAConsts);
4297 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4298 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4299 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4300 UBool compareIdent = (strength == UCOL_IDENTICAL);
4301 UBool doCase = (coll->caseLevel == UCOL_ON);
4302 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4303 //UBool qShifted = shifted && (compareQuad == 0);
4304 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4305 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0) ;
4306 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4307 uint8_t *fSecs = fSecsBuff;
4308 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4309 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4310
4311 uint32_t variableTopValue = coll->variableTopValue;
4312 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4313 if(doHiragana) {
4314 UCOL_COMMON_BOT4++;
4315 /* allocate one more space for hiragana */
4316 }
4317 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4318
4319 uint32_t order = UCOL_NO_MORE_CES;
4320 uint8_t primary1 = 0;
4321 uint8_t primary2 = 0;
4322 uint8_t secondary = 0;
4323 uint8_t tertiary = 0;
4324 int32_t caseShift = 0;
4325 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4326
4327 uint8_t caseSwitch = coll->caseSwitch;
4328 uint8_t tertiaryMask = coll->tertiaryMask;
4329 uint8_t tertiaryCommon = coll->tertiaryCommon;
4330
4331 UBool wasShifted = FALSE;
4332 UBool notIsContinuation = FALSE;
4333 uint8_t leadPrimary = 0;
4334
4335
4336 for(;;) {
4337 order = ucol_IGetNextCE(coll, s, &status);
4338 if(order == UCOL_NO_MORE_CES) {
4339 break;
4340 }
4341
4342 if(order == 0) {
4343 continue;
4344 }
4345
4346 notIsContinuation = !isContinuation(order);
4347
4348
4349 if(notIsContinuation) {
4350 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4351 } else {
4352 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4353 }
4354 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4355 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4356 primary1 = (uint8_t)(order >> 8);
4357
4358 /* no need to permute since the actual code values don't matter
4359 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
4360 primary1 = coll->leadBytePermutationTable[primary1];
4361 }
4362 */
4363
4364 if((shifted && ((notIsContinuation && order <= variableTopValue && prima ry1 > 0)
4365 || (!notIsContinuation && wasShifted)))
4366 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says tha t primary ignorables */
4367 /* and other ignorables should be removed if following a shifted code point */
4368 if(primary1 == 0) { /* if we were shifted and we got an ignorabl e code point */
4369 /* we should just completely ignore it */
4370 continue;
4371 }
4372 if(compareQuad == 0) {
4373 if(c4 > 0) {
4374 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4375 c4 = 0;
4376 }
4377 currentSize++;
4378 if(primary2 != 0) {
4379 currentSize++;
4380 }
4381 }
4382 wasShifted = TRUE;
4383 } else {
4384 wasShifted = FALSE;
4385 /* Note: This code assumes that the table is well built i.e. not hav ing 0 bytes where they are not supposed to be. */
4386 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
4387 /* calculate sortkey size */
4388 if(primary1 != UCOL_IGNORABLE) {
4389 if(notIsContinuation) {
4390 if(leadPrimary == primary1) {
4391 currentSize++;
4392 } else {
4393 if(leadPrimary != 0) {
4394 currentSize++;
4395 }
4396 if(primary2 == UCOL_IGNORABLE) {
4397 /* one byter, not compressed */
4398 currentSize++;
4399 leadPrimary = 0;
4400 } else if(isCompressible(coll, primary1)) {
4401 /* compress */
4402 leadPrimary = primary1;
4403 currentSize+=2;
4404 } else {
4405 leadPrimary = 0;
4406 currentSize+=2;
4407 }
4408 }
4409 } else { /* we are in continuation, so we're gonna add primary t o the key don't care about compression */
4410 currentSize++;
4411 if(primary2 != UCOL_IGNORABLE) {
4412 currentSize++;
4413 }
4414 }
4415 }
4416
4417 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4418 if(!isFrenchSec){
4419 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4420 c2++;
4421 } else {
4422 if(c2 > 0) {
4423 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4424 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4425 } else {
4426 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4427 }
4428 c2 = 0;
4429 }
4430 currentSize++;
4431 }
4432 } else {
4433 fSecs[fSecsLen++] = secondary;
4434 if(fSecsLen == fSecsMaxLen) {
4435 uint8_t *fSecsTemp;
4436 if(fSecs == fSecsBuff) {
4437 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4438 } else {
4439 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLe n);
4440 }
4441 if(fSecsTemp == NULL) {
4442 status = U_MEMORY_ALLOCATION_ERROR;
4443 return 0;
4444 }
4445 fSecs = fSecsTemp;
4446 fSecsMaxLen *= 2;
4447 }
4448 if(notIsContinuation) {
4449 if (frenchStartPtr != NULL) {
4450 /* reverse secondaries from frenchStartPtr up to fre nchEndPtr */
4451 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, fr enchEndPtr);
4452 frenchStartPtr = NULL;
4453 }
4454 } else {
4455 if (frenchStartPtr == NULL) {
4456 frenchStartPtr = fSecs+fSecsLen-2;
4457 }
4458 frenchEndPtr = fSecs+fSecsLen-1;
4459 }
4460 }
4461 }
4462
4463 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4464 // do the case level if we need to do it. We don't want to calcu late
4465 // case level for primary ignorables if we have only primary str ength and case level
4466 // otherwise we would break well formedness of CEs
4467 if (caseShift == 0) {
4468 currentSize++;
4469 caseShift = UCOL_CASE_SHIFT_START;
4470 }
4471 if((tertiary&0x3F) > 0 && notIsContinuation) {
4472 caseShift--;
4473 if((tertiary &0xC0) != 0) {
4474 if (caseShift == 0) {
4475 currentSize++;
4476 caseShift = UCOL_CASE_SHIFT_START;
4477 }
4478 caseShift--;
4479 }
4480 }
4481 } else {
4482 if(notIsContinuation) {
4483 tertiary ^= caseSwitch;
4484 }
4485 }
4486
4487 tertiary &= tertiaryMask;
4488 if(tertiary > compareTer) { /* I think that != 0 test should be != I GNORABLE */
4489 if (tertiary == tertiaryCommon && notIsContinuation) {
4490 c3++;
4491 } else {
4492 if(c3 > 0) {
4493 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_ COMMON3_NORMAL)
4494 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4495 currentSize += (c3/(uint32_t)coll->tertiaryTopCo unt)+1;
4496 } else {
4497 currentSize += (c3/(uint32_t)coll->tertiaryBottomCou nt)+1;
4498 }
4499 c3 = 0;
4500 }
4501 currentSize++;
4502 }
4503 }
4504
4505 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4506 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we n eed to note it
4507 if(c4>0) { // Close this part
4508 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4509 c4 = 0;
4510 }
4511 currentSize++; // Add the Hiragana
4512 } else { // This wasn't Hiragana, so we can continue adding stuf f
4513 c4++;
4514 }
4515 }
4516 }
4517 }
4518
4519 if(!isFrenchSec){
4520 if(c2 > 0) {
4521 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BO T_COUNT2 != 0)?1:0);
4522 }
4523 } else {
4524 uint32_t i = 0;
4525 if(frenchStartPtr != NULL) {
4526 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4527 }
4528 for(i = 0; i<fSecsLen; i++) {
4529 secondary = *(fSecs+fSecsLen-i-1);
4530 /* This is compression code. */
4531 if (secondary == UCOL_COMMON2) {
4532 ++c2;
4533 } else {
4534 if(c2 > 0) {
4535 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev el.
4536 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint 32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4537 } else {
4538 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint 32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4539 }
4540 c2 = 0;
4541 }
4542 currentSize++;
4543 }
4544 }
4545 if(c2 > 0) {
4546 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BO T_COUNT2 != 0)?1:0);
4547 }
4548 if(fSecs != fSecsBuff) {
4549 uprv_free(fSecs);
4550 }
4551 }
4552
4553 if(c3 > 0) {
4554 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t )coll->tertiaryBottomCount != 0)?1:0);
4555 }
4556
4557 if(c4 > 0 && compareQuad == 0) {
4558 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_CO UNT4 != 0)?1:0);
4559 }
4560
4561 if(compareIdent) {
4562 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4563 }
4564 return currentSize;
4565 }
4566
4567 static
4568 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4569 if (caseShift == 0) {
4570 *(*cases)++ = UCOL_CASE_BYTE_START;
4571 caseShift = UCOL_CASE_SHIFT_START;
4572 }
4573 }
4574
// Appends value at primaries and advances primaries, but only while
// primaries is still below limit. size is incremented on every call, written
// or not, so it records how many values were requested in total even when
// not all of them fitted.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // No room left: only the request is counted.
        return;
    }
    *primaries = value;
    ++primaries;
}
4584
4585 // Packs the secondary buffer when processing French locale. Adds the terminator .
4586 static
4587 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *second aries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4588 uint8_t secondary;
4589 int32_t count2 = 0;
4590 uint32_t i = 0, size = 0;
4591 // we use i here since the key size already accounts for terminators, so we' ll discard the increment
4592 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4593 /* If there are any unresolved continuation secondaries, reverse them here s o that we can reverse the whole secondary thing */
4594 if(frenchStartPtr != NULL) {
4595 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4596 }
4597 for(i = 0; i<*secsize; i++) {
4598 secondary = *(secondaries-i-1);
4599 /* This is compression code. */
4600 if (secondary == UCOL_COMMON2) {
4601 ++count2;
4602 } else {
4603 if (count2 > 0) {
4604 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4605 while (count2 > UCOL_TOP_COUNT2) {
4606 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCO L_COMMON_TOP2 - UCOL_TOP_COUNT2));
4607 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4608 }
4609 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_CO MMON_TOP2 - (count2-1)));
4610 } else {
4611 while (count2 > UCOL_BOT_COUNT2) {
4612 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCO L_COMMON_BOT2 + UCOL_BOT_COUNT2));
4613 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4614 }
4615 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_CO MMON_BOT2 + (count2-1)));
4616 }
4617 count2 = 0;
4618 }
4619 addWithIncrement(primaries, primEnd, size, secondary);
4620 }
4621 }
4622 if (count2 > 0) {
4623 while (count2 > UCOL_BOT_COUNT2) {
4624 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT 2 + UCOL_BOT_COUNT2));
4625 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4626 }
4627 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4628 }
4629 *secsize = size;
4630 return primaries;
4631 }
4632
4633 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4634
4635 /* This is the sortkey work horse function */
4636 U_CFUNC int32_t U_CALLCONV
4637 ucol_calcSortKey(const UCollator *coll,
4638 const UChar *source,
4639 int32_t sourceLength,
4640 uint8_t **result,
4641 uint32_t resultLength,
4642 UBool allocateSKBuffer,
4643 UErrorCode *status)
4644 {
4645 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima ge + coll->image->UCAConsts);
4646
4647 uint32_t i = 0; /* general purpose counter */
4648
4649 /* Stack allocated buffers for buffers we use */
4650 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], te rt[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BU FFER];
4651
4652 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *ca ses = caseB, *quads = quad;
4653
4654 if(U_FAILURE(*status)) {
4655 return 0;
4656 }
4657
4658 if(primaries == NULL && allocateSKBuffer == TRUE) {
4659 primaries = *result = prim;
4660 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4661 }
4662
4663 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BU FFER,
4664 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4665
4666 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4667
4668 UnicodeString normSource;
4669
4670 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4671
4672 UColAttributeValue strength = coll->strength;
4673
4674 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4675 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4676 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4677 UBool compareIdent = (strength == UCOL_IDENTICAL);
4678 UBool doCase = (coll->caseLevel == UCOL_ON);
4679 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0) ;
4680 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4681 //UBool qShifted = shifted && (compareQuad == 0);
4682 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4683
4684 uint32_t variableTopValue = coll->variableTopValue;
4685 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4686 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4687 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4688 uint8_t UCOL_HIRAGANA_QUAD = 0;
4689 if(doHiragana) {
4690 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4691 /* allocate one more space for hiragana, value for hiragana */
4692 }
4693 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4694
4695 /* support for special features like caselevel and funky secondaries */
4696 uint8_t *frenchStartPtr = NULL;
4697 uint8_t *frenchEndPtr = NULL;
4698 uint32_t caseShift = 0;
4699
4700 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShi fted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4701
4702 /* If we need to normalize, we'll do it all at once at the beginning! */
4703 const Normalizer2 *norm2;
4704 if(compareIdent) {
4705 norm2 = Normalizer2Factory::getNFDInstance(*status);
4706 } else if(coll->normalizationMode != UCOL_OFF) {
4707 norm2 = Normalizer2Factory::getFCDInstance(*status);
4708 } else {
4709 norm2 = NULL;
4710 }
4711 if(norm2 != NULL) {
4712 normSource.setTo(FALSE, source, len);
4713 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4714 if(qcYesLength != len) {
4715 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4716 normSource.truncate(qcYesLength);
4717 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4718 source = normSource.getBuffer();
4719 len = normSource.length();
4720 }
4721 }
4722 collIterate s;
4723 IInit_collIterate(coll, source, len, &s, status);
4724 if(U_FAILURE(*status)) {
4725 return 0;
4726 }
4727 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma lized.
4728
4729 if(resultLength == 0 || primaries == NULL) {
4730 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4731 }
4732 uint8_t *primarySafeEnd = primaries + resultLength - 1;
4733 if(strength > UCOL_PRIMARY) {
4734 primarySafeEnd--;
4735 }
4736
4737 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4738
4739 uint8_t *primStart = primaries;
4740 uint8_t *secStart = secondaries;
4741 uint8_t *terStart = tertiaries;
4742 uint8_t *caseStart = cases;
4743 uint8_t *quadStart = quads;
4744
4745 uint32_t order = 0;
4746
4747 uint8_t primary1 = 0;
4748 uint8_t primary2 = 0;
4749 uint8_t secondary = 0;
4750 uint8_t tertiary = 0;
4751 uint8_t caseSwitch = coll->caseSwitch;
4752 uint8_t tertiaryMask = coll->tertiaryMask;
4753 int8_t tertiaryAddition = coll->tertiaryAddition;
4754 uint8_t tertiaryTop = coll->tertiaryTop;
4755 uint8_t tertiaryBottom = coll->tertiaryBottom;
4756 uint8_t tertiaryCommon = coll->tertiaryCommon;
4757 uint8_t caseBits = 0;
4758
4759 UBool finished = FALSE;
4760 UBool wasShifted = FALSE;
4761 UBool notIsContinuation = FALSE;
4762
4763 uint32_t prevBuffSize = 0;
4764
4765 uint32_t count2 = 0, count3 = 0, count4 = 0;
4766 uint8_t leadPrimary = 0;
4767
4768 for(;;) {
4769 for(i=prevBuffSize; i<minBufferSize; ++i) {
4770
4771 order = ucol_IGetNextCE(coll, &s, status);
4772 if(order == UCOL_NO_MORE_CES) {
4773 finished = TRUE;
4774 break;
4775 }
4776
4777 if(order == 0) {
4778 continue;
4779 }
4780
4781 notIsContinuation = !isContinuation(order);
4782
4783 if(notIsContinuation) {
4784 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4785 } else {
4786 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4787 }
4788
4789 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4790 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4791 primary1 = (uint8_t)(order >> 8);
4792
4793 uint8_t originalPrimary1 = primary1;
4794 if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4795 primary1 = coll->leadBytePermutationTable[primary1];
4796 }
4797
4798 if((shifted && ((notIsContinuation && order <= variableTopValue && p rimary1 > 0)
4799 || (!notIsContinuation && wasShifted)))
4800 || (wasShifted && primary1 == 0)) /* amendment to the UCA says t hat primary ignorables */
4801 {
4802 /* and other ignorables should be removed if following a shifted code point */
4803 if(primary1 == 0) { /* if we were shifted and we got an ignorabl e code point */
4804 /* we should just completely ignore it */
4805 continue;
4806 }
4807 if(compareQuad == 0) {
4808 if(count4 > 0) {
4809 while (count4 > UCOL_BOT_COUNT4) {
4810 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COU NT4);
4811 count4 -= UCOL_BOT_COUNT4;
4812 }
4813 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4814 count4 = 0;
4815 }
4816 /* We are dealing with a variable and we're treating them as shifted */
4817 /* This is a shifted ignorable */
4818 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4819 *quads++ = primary1;
4820 }
4821 if(primary2 != 0) {
4822 *quads++ = primary2;
4823 }
4824 }
4825 wasShifted = TRUE;
4826 } else {
4827 wasShifted = FALSE;
4828 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4829 /* Usually, we'll have non-zero primary1 & primary2, except in c ases of a-z and friends, when primary2 will */
4830 /* regular and simple sortkey calc */
4831 if(primary1 != UCOL_IGNORABLE) {
4832 if(notIsContinuation) {
4833 if(leadPrimary == primary1) {
4834 *primaries++ = primary2;
4835 } else {
4836 if(leadPrimary != 0) {
4837 *primaries++ = (uint8_t)((primary1 > leadPrimary ) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4838 }
4839 if(primary2 == UCOL_IGNORABLE) {
4840 /* one byter, not compressed */
4841 *primaries++ = primary1;
4842 leadPrimary = 0;
4843 } else if(isCompressible(coll, originalPrimary1)) {
4844 /* compress */
4845 *primaries++ = leadPrimary = primary1;
4846 if(primaries <= primarySafeEnd) {
4847 *primaries++ = primary2;
4848 }
4849 } else {
4850 leadPrimary = 0;
4851 *primaries++ = primary1;
4852 if(primaries <= primarySafeEnd) {
4853 *primaries++ = primary2;
4854 }
4855 }
4856 }
4857 } else { /* we are in continuation, so we're gonna add prima ry to the key don't care about compression */
4858 *primaries++ = primary1;
4859 if((primary2 != UCOL_IGNORABLE) && (primaries <= primary SafeEnd)) {
4860 *primaries++ = primary2; /* second part */
4861 }
4862 }
4863 }
4864
4865 if(secondary > compareSec) {
4866 if(!isFrenchSec) {
4867 /* This is compression code. */
4868 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4869 ++count2;
4870 } else {
4871 if (count2 > 0) {
4872 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4873 while (count2 > UCOL_TOP_COUNT2) {
4874 *secondaries++ = (uint8_t)(UCOL_COMMON_T OP2 - UCOL_TOP_COUNT2);
4875 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4876 }
4877 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4878 } else {
4879 while (count2 > UCOL_BOT_COUNT2) {
4880 *secondaries++ = (uint8_t)(UCOL_COMMON_B OT2 + UCOL_BOT_COUNT2);
4881 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4882 }
4883 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4884 }
4885 count2 = 0;
4886 }
4887 *secondaries++ = secondary;
4888 }
4889 } else {
4890 *secondaries++ = secondary;
4891 /* Do the special handling for French secondaries */
4892 /* We need to get continuation elements and do intermedi ate restore */
4893 /* abc1c2c3de with french secondaries need to be edc1c2c 3ba NOT edc3c2c1ba */
4894 if(notIsContinuation) {
4895 if (frenchStartPtr != NULL) {
4896 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4897 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr , frenchEndPtr);
4898 frenchStartPtr = NULL;
4899 }
4900 } else {
4901 if (frenchStartPtr == NULL) {
4902 frenchStartPtr = secondaries - 2;
4903 }
4904 frenchEndPtr = secondaries-1;
4905 }
4906 }
4907 }
4908
4909 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4910 // do the case level if we need to do it. We don't want to c alculate
4911 // case level for primary ignorables if we have only primary strength and case level
4912 // otherwise we would break well formedness of CEs
4913 doCaseShift(&cases, caseShift);
4914 if(notIsContinuation) {
4915 caseBits = (uint8_t)(tertiary & 0xC0);
4916
4917 if(tertiary != 0) {
4918 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4919 if((caseBits & 0xC0) == 0) {
4920 *(cases-1) |= 1 << (--caseShift);
4921 } else {
4922 *(cases-1) |= 0 << (--caseShift);
4923 /* second bit */
4924 doCaseShift(&cases, caseShift);
4925 *(cases-1) |= ((caseBits>>6)&1) << (--caseSh ift);
4926 }
4927 } else {
4928 if((caseBits & 0xC0) == 0) {
4929 *(cases-1) |= 0 << (--caseShift);
4930 } else {
4931 *(cases-1) |= 1 << (--caseShift);
4932 /* second bit */
4933 doCaseShift(&cases, caseShift);
4934 *(cases-1) |= ((caseBits>>7)&1) << (--caseSh ift);
4935 }
4936 }
4937 }
4938
4939 }
4940 } else {
4941 if(notIsContinuation) {
4942 tertiary ^= caseSwitch;
4943 }
4944 }
4945
4946 tertiary &= tertiaryMask;
4947 if(tertiary > compareTer) {
4948 /* This is compression code. */
4949 /* sequence size check is included in the if clause */
4950 if (tertiary == tertiaryCommon && notIsContinuation) {
4951 ++count3;
4952 } else {
4953 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_C OMMON3_NORMAL) {
4954 tertiary += tertiaryAddition;
4955 } else if(tertiary <= tertiaryCommon && tertiaryCommon = = UCOL_COMMON3_UPPERFIRST) {
4956 tertiary -= tertiaryAddition;
4957 }
4958 if (count3 > 0) {
4959 if ((tertiary > tertiaryCommon)) {
4960 while (count3 > coll->tertiaryTopCount) {
4961 *tertiaries++ = (uint8_t)(tertiaryTop - coll ->tertiaryTopCount);
4962 count3 -= (uint32_t)coll->tertiaryTopCount;
4963 }
4964 *tertiaries++ = (uint8_t)(tertiaryTop - (count3- 1));
4965 } else {
4966 while (count3 > coll->tertiaryBottomCount) {
4967 *tertiaries++ = (uint8_t)(tertiaryBottom + c oll->tertiaryBottomCount);
4968 count3 -= (uint32_t)coll->tertiaryBottomCoun t;
4969 }
4970 *tertiaries++ = (uint8_t)(tertiaryBottom + (coun t3-1));
4971 }
4972 count3 = 0;
4973 }
4974 *tertiaries++ = tertiary;
4975 }
4976 }
4977
4978 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4979 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and w e need to note it
4980 if(count4>0) { // Close this part
4981 while (count4 > UCOL_BOT_COUNT4) {
4982 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT _COUNT4);
4983 count4 -= UCOL_BOT_COUNT4;
4984 }
4985 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4986 count4 = 0;
4987 }
4988 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4989 } else { // This wasn't Hiragana, so we can continue adding stuff
4990 count4++;
4991 }
4992 }
4993 }
4994
4995 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4996 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4997 IInit_collIterate(coll, (UChar *)source, len, &s, status);
4998 if(U_FAILURE(*status)) {
4999 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5000 finished = TRUE;
5001 break;
5002 }
5003 s.flags &= ~UCOL_ITER_NORM;
5004 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, str ength, len);
5005 *status = U_BUFFER_OVERFLOW_ERROR;
5006 finished = TRUE;
5007 break;
5008 } else { /* It's much nicer if we can actually reallocate */
5009 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+ (secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadSt art));
5010 primStart = reallocateBuffer(&primaries, *result, prim, &res ultLength, 2*sks, status);
5011 if(U_SUCCESS(*status)) {
5012 *result = primStart;
5013 primarySafeEnd = primStart + resultLength - 1;
5014 if(strength > UCOL_PRIMARY) {
5015 primarySafeEnd--;
5016 }
5017 } else {
5018 /* We ran out of memory!? We can't recover. */
5019 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5020 finished = TRUE;
5021 break;
5022 }
5023 }
5024 }
5025 }
5026 if(finished) {
5027 break;
5028 } else {
5029 prevBuffSize = minBufferSize;
5030
5031 uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5032 if (frenchStartPtr != NULL) {
5033 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
5034 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
5035 }
5036 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize , 2*secSize, status);
5037 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2 *terSize, status);
5038 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2* caseSize, status);
5039 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*q uadSize, status);
5040 if(U_FAILURE(*status)) {
5041 /* We ran out of memory!? We can't recover. */
5042 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5043 break;
5044 }
5045 if (frenchStartPtr != NULL) {
5046 frenchStartPtr = secStart + frenchStartOffset;
5047 frenchEndPtr = secStart + frenchEndOffset;
5048 }
5049 minBufferSize *= 2;
5050 }
5051 }
5052
5053 /* Here, we are generally done with processing */
5054 /* bailing out would not be too productive */
5055
5056 if(U_SUCCESS(*status)) {
5057 sortKeySize += (uint32_t)(primaries - primStart);
5058 /* we have done all the CE's, now let's put them together to form a key */
5059 if(compareSec == 0) {
5060 if (count2 > 0) {
5061 while (count2 > UCOL_BOT_COUNT2) {
5062 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT 2);
5063 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5064 }
5065 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5066 }
5067 uint32_t secsize = (uint32_t)(secondaries-secStart);
5068 if(!isFrenchSec) { // Regular situation, we know the length of secon daries
5069 sortKeySize += secsize;
5070 if(sortKeySize <= resultLength) {
5071 *(primaries++) = UCOL_LEVELTERMINATOR;
5072 uprv_memcpy(primaries, secStart, secsize);
5073 primaries += secsize;
5074 } else {
5075 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5076 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5077 if(U_SUCCESS(*status)) {
5078 *result = primStart;
5079 *(primaries++) = UCOL_LEVELTERMINATOR;
5080 uprv_memcpy(primaries, secStart, secsize);
5081 primaries += secsize;
5082 }
5083 else {
5084 /* We ran out of memory!? We can't recover. */
5085 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5086 goto cleanup;
5087 }
5088 } else {
5089 *status = U_BUFFER_OVERFLOW_ERROR;
5090 }
5091 }
5092 } else { // French secondary is on. We will need to pack French. pac kFrench will add the level terminator
5093 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5094 sortKeySize += secsize;
5095 if(sortKeySize <= resultLength) { // if we managed to pack fine
5096 primaries = newPrim; // update the primary pointer
5097 } else { // overflow, need to reallocate and redo
5098 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5099 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5100 if(U_SUCCESS(*status)) {
5101 primaries = packFrench(primaries, primStart+resultLe ngth, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5102 }
5103 else {
5104 /* We ran out of memory!? We can't recover. */
5105 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5106 goto cleanup;
5107 }
5108 } else {
5109 *status = U_BUFFER_OVERFLOW_ERROR;
5110 }
5111 }
5112 }
5113 }
5114
5115 if(doCase) {
5116 uint32_t casesize = (uint32_t)(cases - caseStart);
5117 sortKeySize += casesize;
5118 if(sortKeySize <= resultLength) {
5119 *(primaries++) = UCOL_LEVELTERMINATOR;
5120 uprv_memcpy(primaries, caseStart, casesize);
5121 primaries += casesize;
5122 } else {
5123 if(allocateSKBuffer == TRUE) {
5124 primStart = reallocateBuffer(&primaries, *result, prim, &res ultLength, 2*sortKeySize, status);
5125 if(U_SUCCESS(*status)) {
5126 *result = primStart;
5127 *(primaries++) = UCOL_LEVELTERMINATOR;
5128 uprv_memcpy(primaries, caseStart, casesize);
5129 }
5130 else {
5131 /* We ran out of memory!? We can't recover. */
5132 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5133 goto cleanup;
5134 }
5135 } else {
5136 *status = U_BUFFER_OVERFLOW_ERROR;
5137 }
5138 }
5139 }
5140
5141 if(compareTer == 0) {
5142 if (count3 > 0) {
5143 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5144 while (count3 >= coll->tertiaryTopCount) {
5145 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTo pCount);
5146 count3 -= (uint32_t)coll->tertiaryTopCount;
5147 }
5148 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5149 } else {
5150 while (count3 > coll->tertiaryBottomCount) {
5151 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiar yBottomCount);
5152 count3 -= (uint32_t)coll->tertiaryBottomCount;
5153 }
5154 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5155 }
5156 }
5157 uint32_t tersize = (uint32_t)(tertiaries - terStart);
5158 sortKeySize += tersize;
5159 if(sortKeySize <= resultLength) {
5160 *(primaries++) = UCOL_LEVELTERMINATOR;
5161 uprv_memcpy(primaries, terStart, tersize);
5162 primaries += tersize;
5163 } else {
5164 if(allocateSKBuffer == TRUE) {
5165 primStart = reallocateBuffer(&primaries, *result, prim, &res ultLength, 2*sortKeySize, status);
5166 if(U_SUCCESS(*status)) {
5167 *result = primStart;
5168 *(primaries++) = UCOL_LEVELTERMINATOR;
5169 uprv_memcpy(primaries, terStart, tersize);
5170 }
5171 else {
5172 /* We ran out of memory!? We can't recover. */
5173 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5174 goto cleanup;
5175 }
5176 } else {
5177 *status = U_BUFFER_OVERFLOW_ERROR;
5178 }
5179 }
5180
5181 if(compareQuad == 0/*qShifted == TRUE*/) {
5182 if(count4 > 0) {
5183 while (count4 > UCOL_BOT_COUNT4) {
5184 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4) ;
5185 count4 -= UCOL_BOT_COUNT4;
5186 }
5187 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5188 }
5189 uint32_t quadsize = (uint32_t)(quads - quadStart);
5190 sortKeySize += quadsize;
5191 if(sortKeySize <= resultLength) {
5192 *(primaries++) = UCOL_LEVELTERMINATOR;
5193 uprv_memcpy(primaries, quadStart, quadsize);
5194 primaries += quadsize;
5195 } else {
5196 if(allocateSKBuffer == TRUE) {
5197 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5198 if(U_SUCCESS(*status)) {
5199 *result = primStart;
5200 *(primaries++) = UCOL_LEVELTERMINATOR;
5201 uprv_memcpy(primaries, quadStart, quadsize);
5202 }
5203 else {
5204 /* We ran out of memory!? We can't recover. */
5205 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5206 goto cleanup;
5207 }
5208 } else {
5209 *status = U_BUFFER_OVERFLOW_ERROR;
5210 }
5211 }
5212 }
5213
5214 if(compareIdent) {
5215 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5216 if(sortKeySize <= resultLength) {
5217 *(primaries++) = UCOL_LEVELTERMINATOR;
5218 primaries += u_writeIdenticalLevelRun(s.string, len, primari es);
5219 } else {
5220 if(allocateSKBuffer == TRUE) {
5221 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5222 if(U_SUCCESS(*status)) {
5223 *result = primStart;
5224 *(primaries++) = UCOL_LEVELTERMINATOR;
5225 u_writeIdenticalLevelRun(s.string, len, primaries);
5226 }
5227 else {
5228 /* We ran out of memory!? We can't recover. */
5229 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5230 goto cleanup;
5231 }
5232 } else {
5233 *status = U_BUFFER_OVERFLOW_ERROR;
5234 }
5235 }
5236 }
5237 }
5238 *(primaries++) = '\0';
5239 }
5240
5241 if(allocateSKBuffer == TRUE) {
5242 *result = (uint8_t*)uprv_malloc(sortKeySize);
5243 /* test for NULL */
5244 if (*result == NULL) {
5245 *status = U_MEMORY_ALLOCATION_ERROR;
5246 goto cleanup;
5247 }
5248 uprv_memcpy(*result, primStart, sortKeySize);
5249 if(primStart != prim) {
5250 uprv_free(primStart);
5251 }
5252 }
5253
5254 cleanup:
5255 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && * status != U_BUFFER_OVERFLOW_ERROR) {
5256 /* NULL terminate for safety */
5257 **result = 0;
5258 }
5259 if(terStart != tert) {
5260 uprv_free(terStart);
5261 uprv_free(secStart);
5262 uprv_free(caseStart);
5263 uprv_free(quadStart);
5264 }
5265
5266 /* To avoid memory leak, free the offset buffer if necessary. */
5267 ucol_freeOffsetBuffer(&s);
5268
5269 return sortKeySize;
5270 }
5271
5272
5273 U_CFUNC int32_t U_CALLCONV
5274 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5275 const UChar *source,
5276 int32_t sourceLength,
5277 uint8_t **result,
5278 uint32_t resultLength,
5279 UBool allocateSKBuffer,
5280 UErrorCode *status)
5281 {
5282 U_ALIGN_CODE(16);
5283
5284 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->ima ge + coll->image->UCAConsts);
5285 uint32_t i = 0; /* general purpose counter */
5286
5287 /* Stack allocated buffers for buffers we use */
5288 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], te rt[UCOL_TERTIARY_MAX_BUFFER];
5289
5290 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5291
5292 if(U_FAILURE(*status)) {
5293 return 0;
5294 }
5295
5296 if(primaries == NULL && allocateSKBuffer == TRUE) {
5297 primaries = *result = prim;
5298 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5299 }
5300
5301 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BU FFER;
5302
5303 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5304
5305 UnicodeString normSource;
5306
5307 int32_t len = sourceLength;
5308
5309 /* If we need to normalize, we'll do it all at once at the beginning! */
5310 if(coll->normalizationMode != UCOL_OFF) {
5311 normSource.setTo(len < 0, source, len);
5312 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5313 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5314 if(qcYesLength != normSource.length()) {
5315 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5316 normSource.truncate(qcYesLength);
5317 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5318 source = normSource.getBuffer();
5319 len = normSource.length();
5320 }
5321 }
5322 collIterate s;
5323 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5324 if(U_FAILURE(*status)) {
5325 return 0;
5326 }
5327 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma lized.
5328
5329 if(resultLength == 0 || primaries == NULL) {
5330 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5331 }
5332
5333 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5334
5335 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5336
5337 uint8_t *primStart = primaries;
5338 uint8_t *secStart = secondaries;
5339 uint8_t *terStart = tertiaries;
5340
5341 uint32_t order = 0;
5342
5343 uint8_t primary1 = 0;
5344 uint8_t primary2 = 0;
5345 uint8_t secondary = 0;
5346 uint8_t tertiary = 0;
5347 uint8_t caseSwitch = coll->caseSwitch;
5348 uint8_t tertiaryMask = coll->tertiaryMask;
5349 int8_t tertiaryAddition = coll->tertiaryAddition;
5350 uint8_t tertiaryTop = coll->tertiaryTop;
5351 uint8_t tertiaryBottom = coll->tertiaryBottom;
5352 uint8_t tertiaryCommon = coll->tertiaryCommon;
5353
5354 uint32_t prevBuffSize = 0;
5355
5356 UBool finished = FALSE;
5357 UBool notIsContinuation = FALSE;
5358
5359 uint32_t count2 = 0, count3 = 0;
5360 uint8_t leadPrimary = 0;
5361
5362 for(;;) {
5363 for(i=prevBuffSize; i<minBufferSize; ++i) {
5364
5365 order = ucol_IGetNextCE(coll, &s, status);
5366
5367 if(order == 0) {
5368 continue;
5369 }
5370
5371 if(order == UCOL_NO_MORE_CES) {
5372 finished = TRUE;
5373 break;
5374 }
5375
5376 notIsContinuation = !isContinuation(order);
5377
5378 if(notIsContinuation) {
5379 tertiary = (uint8_t)((order & tertiaryMask));
5380 } else {
5381 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5382 }
5383
5384 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5385 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5386 primary1 = (uint8_t)(order >> 8);
5387
5388 uint8_t originalPrimary1 = primary1;
5389 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5390 primary1 = coll->leadBytePermutationTable[primary1];
5391 }
5392
5393 /* Note: This code assumes that the table is well built i.e. not hav ing 0 bytes where they are not supposed to be. */
5394 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5395 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5396 /* regular and simple sortkey calc */
5397 if(primary1 != UCOL_IGNORABLE) {
5398 if(notIsContinuation) {
5399 if(leadPrimary == primary1) {
5400 *primaries++ = primary2;
5401 } else {
5402 if(leadPrimary != 0) {
5403 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5404 }
5405 if(primary2 == UCOL_IGNORABLE) {
5406 /* one byter, not compressed */
5407 *primaries++ = primary1;
5408 leadPrimary = 0;
5409 } else if(isCompressible(coll, originalPrimary1)) {
5410 /* compress */
5411 *primaries++ = leadPrimary = primary1;
5412 *primaries++ = primary2;
5413 } else {
5414 leadPrimary = 0;
5415 *primaries++ = primary1;
5416 *primaries++ = primary2;
5417 }
5418 }
5419 } else { /* we are in continuation, so we're gonna add primary t o the key don't care about compression */
5420 *primaries++ = primary1;
5421 if(primary2 != UCOL_IGNORABLE) {
5422 *primaries++ = primary2; /* second part */
5423 }
5424 }
5425 }
5426
5427 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5428 /* This is compression code. */
5429 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5430 ++count2;
5431 } else {
5432 if (count2 > 0) {
5433 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5434 while (count2 > UCOL_TOP_COUNT2) {
5435 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UC OL_TOP_COUNT2);
5436 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5437 }
5438 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count 2-1));
5439 } else {
5440 while (count2 > UCOL_BOT_COUNT2) {
5441 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UC OL_BOT_COUNT2);
5442 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5443 }
5444 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count 2-1));
5445 }
5446 count2 = 0;
5447 }
5448 *secondaries++ = secondary;
5449 }
5450 }
5451
5452 if(notIsContinuation) {
5453 tertiary ^= caseSwitch;
5454 }
5455
5456 if(tertiary > 0) {
5457 /* This is compression code. */
5458 /* sequence size check is included in the if clause */
5459 if (tertiary == tertiaryCommon && notIsContinuation) {
5460 ++count3;
5461 } else {
5462 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMO N3_NORMAL) {
5463 tertiary += tertiaryAddition;
5464 } else if (tertiary <= tertiaryCommon && tertiaryCommon == U COL_COMMON3_UPPERFIRST) {
5465 tertiary -= tertiaryAddition;
5466 }
5467 if (count3 > 0) {
5468 if ((tertiary > tertiaryCommon)) {
5469 while (count3 > coll->tertiaryTopCount) {
5470 *tertiaries++ = (uint8_t)(tertiaryTop - coll->te rtiaryTopCount);
5471 count3 -= (uint32_t)coll->tertiaryTopCount;
5472 }
5473 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5474 } else {
5475 while (count3 > coll->tertiaryBottomCount) {
5476 *tertiaries++ = (uint8_t)(tertiaryBottom + coll- >tertiaryBottomCount);
5477 count3 -= (uint32_t)coll->tertiaryBottomCount;
5478 }
5479 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1 ));
5480 }
5481 count3 = 0;
5482 }
5483 *tertiaries++ = tertiary;
5484 }
5485 }
5486
5487 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5488 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5489 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5490 if(U_FAILURE(*status)) {
5491 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5492 finished = TRUE;
5493 break;
5494 }
5495 s.flags &= ~UCOL_ITER_NORM;
5496 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, col l->strength, len);
5497 *status = U_BUFFER_OVERFLOW_ERROR;
5498 finished = TRUE;
5499 break;
5500 } else { /* It's much nicer if we can actually reallocate */
5501 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+ (secondaries - secStart)+(tertiaries - terStart));
5502 primStart = reallocateBuffer(&primaries, *result, prim, &res ultLength, 2*sks, status);
5503 if(U_SUCCESS(*status)) {
5504 *result = primStart;
5505 primarySafeEnd = primStart + resultLength - 2;
5506 } else {
5507 /* We ran out of memory!? We can't recover. */
5508 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5509 finished = TRUE;
5510 break;
5511 }
5512 }
5513 }
5514 }
5515 if(finished) {
5516 break;
5517 } else {
5518 prevBuffSize = minBufferSize;
5519 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize , 2*secSize, status);
5520 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2 *terSize, status);
5521 minBufferSize *= 2;
5522 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5523 /* We ran out of memory!? We can't recover. */
5524 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5525 break;
5526 }
5527 }
5528 }
5529
5530 if(U_SUCCESS(*status)) {
5531 sortKeySize += (uint32_t)(primaries - primStart);
5532 /* we have done all the CE's, now let's put them together to form a key */
5533 if (count2 > 0) {
5534 while (count2 > UCOL_BOT_COUNT2) {
5535 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5536 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5537 }
5538 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5539 }
5540 uint32_t secsize = (uint32_t)(secondaries-secStart);
5541 sortKeySize += secsize;
5542 if(sortKeySize <= resultLength) {
5543 *(primaries++) = UCOL_LEVELTERMINATOR;
5544 uprv_memcpy(primaries, secStart, secsize);
5545 primaries += secsize;
5546 } else {
5547 if(allocateSKBuffer == TRUE) {
5548 primStart = reallocateBuffer(&primaries, *result, prim, &resultL ength, 2*sortKeySize, status);
5549 if(U_SUCCESS(*status)) {
5550 *(primaries++) = UCOL_LEVELTERMINATOR;
5551 *result = primStart;
5552 uprv_memcpy(primaries, secStart, secsize);
5553 }
5554 else {
5555 /* We ran out of memory!? We can't recover. */
5556 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5557 goto cleanup;
5558 }
5559 } else {
5560 *status = U_BUFFER_OVERFLOW_ERROR;
5561 }
5562 }
5563
5564 if (count3 > 0) {
5565 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5566 while (count3 >= coll->tertiaryTopCount) {
5567 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCou nt);
5568 count3 -= (uint32_t)coll->tertiaryTopCount;
5569 }
5570 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5571 } else {
5572 while (count3 > coll->tertiaryBottomCount) {
5573 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBot tomCount);
5574 count3 -= (uint32_t)coll->tertiaryBottomCount;
5575 }
5576 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5577 }
5578 }
5579 uint32_t tersize = (uint32_t)(tertiaries - terStart);
5580 sortKeySize += tersize;
5581 if(sortKeySize <= resultLength) {
5582 *(primaries++) = UCOL_LEVELTERMINATOR;
5583 uprv_memcpy(primaries, terStart, tersize);
5584 primaries += tersize;
5585 } else {
5586 if(allocateSKBuffer == TRUE) {
5587 primStart = reallocateBuffer(&primaries, *result, prim, &resultL ength, 2*sortKeySize, status);
5588 if(U_SUCCESS(*status)) {
5589 *result = primStart;
5590 *(primaries++) = UCOL_LEVELTERMINATOR;
5591 uprv_memcpy(primaries, terStart, tersize);
5592 }
5593 else {
5594 /* We ran out of memory!? We can't recover. */
5595 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5596 goto cleanup;
5597 }
5598 } else {
5599 *status = U_BUFFER_OVERFLOW_ERROR;
5600 }
5601 }
5602
5603 *(primaries++) = '\0';
5604 }
5605
5606 if(allocateSKBuffer == TRUE) {
5607 *result = (uint8_t*)uprv_malloc(sortKeySize);
5608 /* test for NULL */
5609 if (*result == NULL) {
5610 *status = U_MEMORY_ALLOCATION_ERROR;
5611 goto cleanup;
5612 }
5613 uprv_memcpy(*result, primStart, sortKeySize);
5614 if(primStart != prim) {
5615 uprv_free(primStart);
5616 }
5617 }
5618
5619 cleanup:
5620 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && * status != U_BUFFER_OVERFLOW_ERROR) {
5621 /* NULL terminate for safety */
5622 **result = 0;
5623 }
5624 if(terStart != tert) {
5625 uprv_free(terStart);
5626 uprv_free(secStart);
5627 }
5628
5629 /* To avoid memory leak, free the offset buffer if necessary. */
5630 ucol_freeOffsetBuffer(&s);
5631
5632 return sortKeySize;
5633 }
5634
5635 static inline
5636 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5637 UBool notIsContinuation = !isContinuation(CE);
5638 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5639 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5640 || (!notIsContinuation && *wasShifted)))
5641 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that pri mary ignorables */
5642 {
5643 // The stuff below should probably be in the sortkey code... maybe not.. .
5644 if(primary1 != 0) { /* if we were shifted and we got an ignorable code p oint */
5645 /* we should just completely ignore it */
5646 *wasShifted = TRUE;
5647 //continue;
5648 }
5649 //*wasShifted = TRUE;
5650 return TRUE;
5651 } else {
5652 *wasShifted = FALSE;
5653 return FALSE;
5654 }
5655 }
5656 static inline
5657 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *des t) {
5658 if(level < maxLevel) {
5659 dest[i++] = UCOL_LEVELTERMINATOR;
5660 } else {
5661 dest[i++] = 0;
5662 }
5663 }
5664
/**
 * Level identifiers used while generating a partial sort key.
 * The values are stored in three bits of the state word, so they must
 * stay within 0..7.
 */
enum {
    /** primary weights */
    UCOL_PSK_PRIMARY    = 0,
    /** secondary weights */
    UCOL_PSK_SECONDARY  = 1,
    /** case level */
    UCOL_PSK_CASE       = 2,
    /** tertiary weights */
    UCOL_PSK_TERTIARY   = 3,
    /** quaternary weights */
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT      = 8
};
5677
/**
 * Layout of the collation state kept in state[1] between calls.
 * For every field, *_SHIFT is the number of bits to shift the state word
 * right so that the field ends up in the lowest bits, and *_MASK is the
 * value to AND with the shifted word to isolate the field.
 */
enum {
    /* bits 0..2: level identifier, one of the UCOL_PSK_* level values */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /*
     * bit 3: number of bytes of a primary or quaternary weight already
     * written. Can be only 0 or 1, since we get up to two bytes from
     * primary or quaternary. The same bit also records that the French
     * secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /* bit 4: Boolean, was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /*
     * bits 5..6: how many French bytes have we already written.
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /* bits 7..8: bytes already used from a bocsu sequence on the identical level */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /* from bit 9 up, 19 bits wide: number of consumed CEs (or code points) */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5703
// Macro calculating the number of expansion CEs available, i.e. the distance
// between the CEpos and toReturn members of the iterator (s).CEpos - (s).toReturn.
// The expansion is fully parenthesized so that the macro can be used safely
// as an operand of a larger expression.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5706
5707
5708 /** main sortkey part procedure. On the first call,
5709 * you should pass in a collator, an iterator, empty state
5710 * state[0] == state[1] == 0, a buffer to hold results
5711 * number of bytes you need and an error code pointer.
5712 * Make sure your buffer is big enough to hold the wanted
5713 * number of sortkey bytes. I don't check.
5714 * The only meaningful status you can get back is
5715 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5716 * have been dealt a raw deal and that you probably won't
5717 * be able to use partial sortkey generation for this
5718 * particular combination of string and collator. This
5719 * is highly unlikely, but you should still check the error code.
5720 * Any other status means that you're not in a sane situation
5721 * anymore. After the first call, preserve state values and
5722 * use them on subsequent calls to obtain more bytes of a sortkey.
5723 * Use until the number of bytes written is smaller than the requested
5724 * number of bytes. Generated sortkey is not compatible with the
5725 * one generated by ucol_getSortKey, as we don't do any compression.
5726 * However, levels are still terminated by a 1 (one) and the sortkey
5727 * is terminated by a 0 (zero). Identical level is the same as in the
5728 * regular sortkey - internal bocu-1 implementation is used.
5729 * For curious, although you cannot do much about this, here is
5730 * the structure of state words.
5731 * state[0] - iterator state. Depends on the iterator implementation,
5732 * but allows the iterator to continue where it stopped in
5733 * the last iteration.
5734 * state[1] - collation processing state. Here is the distribution
5735 * of the bits:
5736 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5737 * quaternary, quin (we don't use this one), identical and
5738 * null (producing only zeroes - first one to terminate the
5739 * sortkey and subsequent to fill the buffer).
5740 * 3 - byte count. Number of bytes written on the primary level.
5741 * 4 - was shifted. Whether the previous iteration finished in the
5742 * shifted state.
5743 * 5, 6 - French continuation bytes written. See the comment in the enum
5744 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5745 * the identical level.
5746 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5747 * since thes last successful update of the iterator state.
5748 */
5749 U_CAPI int32_t U_EXPORT2
5750 ucol_nextSortKeyPart(const UCollator *coll,
5751 UCharIterator *iter,
5752 uint32_t state[2],
5753 uint8_t *dest, int32_t count,
5754 UErrorCode *status)
5755 {
5756 /* error checking */
5757 if(status==NULL || U_FAILURE(*status)) {
5758 return 0;
5759 }
5760 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5761 if( coll==NULL || iter==NULL ||
5762 state==NULL ||
5763 count<0 || (count>0 && dest==NULL)
5764 ) {
5765 *status=U_ILLEGAL_ARGUMENT_ERROR;
5766 UTRACE_EXIT_STATUS(status);
5767 return 0;
5768 }
5769
5770 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count= %d",
5771 coll, iter, state[0], state[1], dest, count);
5772
5773 if(count==0) {
5774 /* nothing to do */
5775 UTRACE_EXIT_VALUE(0);
5776 return 0;
5777 }
5778 /** Setting up situation according to the state we got from the previous ite ration */
5779 // The state of the iterator from the previous invocation
5780 uint32_t iterState = state[0];
5781 // Has the last iteration ended in the shifted state
5782 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_ SHIFTED_MASK)?TRUE:FALSE;
5783 // What is the current level of the sortkey?
5784 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5785 // Have we written only one byte from a two byte primary in the previous ite ration?
5786 // Also on secondary level - have we finished with the French secondary?
5787 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5788 // number of bytes in the continuation buffer for French
5789 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE D_FRENCH_MASK;
5790 // Number of bytes already written from a bocsu sequence. Since
5791 // the longes bocsu sequence is 4 long, this can be up to 3.
5792 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK _BOCSU_BYTES_MASK;
5793 // Number of elements that need to be consumed in this iteration because
5794 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5795 // so we had to save the last valid state.
5796 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED _CES_MASK;
5797
5798 /** values that depend on the collator attributes */
5799 // strength of the collator.
5800 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5801 // maximal level of the partial sortkey. Need to take whether case level is done
5802 int32_t maxLevel = 0;
5803 if(strength < UCOL_TERTIARY) {
5804 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5805 maxLevel = UCOL_PSK_CASE;
5806 } else {
5807 maxLevel = strength;
5808 }
5809 } else {
5810 if(strength == UCOL_TERTIARY) {
5811 maxLevel = UCOL_PSK_TERTIARY;
5812 } else if(strength == UCOL_QUATERNARY) {
5813 maxLevel = UCOL_PSK_QUATERNARY;
5814 } else { // identical
5815 maxLevel = UCOL_IDENTICAL;
5816 }
5817 }
5818 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5819 uint8_t UCOL_HIRAGANA_QUAD =
5820 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON )?0xFE:0xFF;
5821 // Boundary value that decides whether a CE is shifted or not
5822 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV alue<<16):0;
5823 // Are we doing French collation?
5824 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5825
5826 /** initializing the collation state */
5827 UBool notIsContinuation = FALSE;
5828 uint32_t CE = UCOL_NO_MORE_CES;
5829
5830 collIterate s;
5831 IInit_collIterate(coll, NULL, -1, &s, status);
5832 if(U_FAILURE(*status)) {
5833 UTRACE_EXIT_STATUS(*status);
5834 return 0;
5835 }
5836 s.iterator = iter;
5837 s.flags |= UCOL_USE_ITERATOR;
5838 // This variable tells us whether we have produced some other levels in this iteration
5839 // before we moved to the identical level. In that case, we need to switch t he
5840 // type of the iterator.
5841 UBool doingIdenticalFromStart = FALSE;
5842 // Normalizing iterator
5843 // The division for the array length may truncate the array size to
5844 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5845 // for all platforms anyway.
5846 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5847 UNormIterator *normIter = NULL;
5848 // If the normalization is turned on for the collator and we are below ident ical level
5849 // we will use a FCD normalizing iterator
5850 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le vel < UCOL_PSK_IDENTICAL) {
5851 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5852 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5853 s.flags &= ~UCOL_ITER_NORM;
5854 if(U_FAILURE(*status)) {
5855 UTRACE_EXIT_STATUS(*status);
5856 return 0;
5857 }
5858 } else if(level == UCOL_PSK_IDENTICAL) {
5859 // for identical level, we need a NFD iterator. We need to instantiate i t here, since we
5860 // will be updating the state - and this cannot be done on an ordinary i terator.
5861 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5862 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5863 s.flags &= ~UCOL_ITER_NORM;
5864 if(U_FAILURE(*status)) {
5865 UTRACE_EXIT_STATUS(*status);
5866 return 0;
5867 }
5868 doingIdenticalFromStart = TRUE;
5869 }
5870
5871 // This is the tentative new state of the iterator. The problem
5872 // is that the iterator might return an undefined state, in
5873 // which case we should save the last valid state and increase
5874 // the iterator skip value.
5875 uint32_t newState = 0;
5876
5877 // First, we set the iterator to the last valid position
5878 // from the last iteration. This was saved in state[0].
5879 if(iterState == 0) {
5880 /* initial state */
5881 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5882 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5883 } else {
5884 s.iterator->move(s.iterator, 0, UITER_START);
5885 }
5886 } else {
5887 /* reset to previous state */
5888 s.iterator->setState(s.iterator, iterState, status);
5889 if(U_FAILURE(*status)) {
5890 UTRACE_EXIT_STATUS(*status);
5891 return 0;
5892 }
5893 }
5894
5895
5896
5897 // This variable tells us whether we can attempt to update the state
5898 // of iterator. Situations where we don't want to update iterator state
5899 // are the existence of expansion CEs that are not yet processed, and
5900 // finishing the case level without enough space in the buffer to insert
5901 // a level terminator.
5902 UBool canUpdateState = TRUE;
5903
5904 // Consume all the CEs that were consumed at the end of the previous
5905 // iteration without updating the iterator state. On identical level,
5906 // consume the code points.
5907 int32_t counter = cces;
5908 if(level < UCOL_PSK_IDENTICAL) {
5909 while(counter-->0) {
5910 // If we're doing French and we are on the secondary level,
5911 // we go backwards.
5912 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5913 CE = ucol_IGetPrevCE(coll, &s, status);
5914 } else {
5915 CE = ucol_IGetNextCE(coll, &s, status);
5916 }
5917 if(CE==UCOL_NO_MORE_CES) {
5918 /* should not happen */
5919 *status=U_INTERNAL_PROGRAM_ERROR;
5920 UTRACE_EXIT_STATUS(*status);
5921 return 0;
5922 }
5923 if(uprv_numAvailableExpCEs(s)) {
5924 canUpdateState = FALSE;
5925 }
5926 }
5927 } else {
5928 while(counter-->0) {
5929 uiter_next32(s.iterator);
5930 }
5931 }
5932
5933 // French secondary needs to know whether the iterator state of zero came fr om previous level OR
5934 // from a new invocation...
5935 UBool wasDoingPrimary = FALSE;
5936 // destination buffer byte counter. When this guy
5937 // gets to count, we're done with the iteration
5938 int32_t i = 0;
5939 // used to count the zero bytes written after we
5940 // have finished with the sort key
5941 int32_t j = 0;
5942
5943
5944 // Hm.... I think we're ready to plunge in. Basic story is as following:
5945 // we have a fall through case based on level. This is used for initial
5946 // positioning on iteration start. Every level processor contains a
5947 // for(;;) which will be broken when we exhaust all the CEs. Other
5948 // way to exit is a goto saveState, which happens when we have filled
5949 // out our buffer.
5950 switch(level) {
5951 case UCOL_PSK_PRIMARY:
5952 wasDoingPrimary = TRUE;
5953 for(;;) {
5954 if(i==count) {
5955 goto saveState;
5956 }
5957 // We should save the state only if we
5958 // are sure that we are done with the
5959 // previous iterator state
5960 if(canUpdateState && byteCountOrFrenchDone == 0) {
5961 newState = s.iterator->getState(s.iterator);
5962 if(newState != UITER_NO_STATE) {
5963 iterState = newState;
5964 cces = 0;
5965 }
5966 }
5967 CE = ucol_IGetNextCE(coll, &s, status);
5968 cces++;
5969 if(CE==UCOL_NO_MORE_CES) {
5970 // Add the level separator
5971 terminatePSKLevel(level, maxLevel, i, dest);
5972 byteCountOrFrenchDone=0;
5973 // Restart the iteration an move to the
5974 // second level
5975 s.iterator->move(s.iterator, 0, UITER_START);
5976 cces = 0;
5977 level = UCOL_PSK_SECONDARY;
5978 break;
5979 }
5980 if(!isContinuation(CE)){
5981 if(coll->leadBytePermutationTable != NULL){
5982 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5983 }
5984 }
5985 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5986 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5987 if(CE != 0) {
5988 if(byteCountOrFrenchDone == 0) {
5989 // get the second byte of primary
5990 dest[i++]=(uint8_t)(CE >> 8);
5991 } else {
5992 byteCountOrFrenchDone = 0;
5993 }
5994 if((CE &=0xff)!=0) {
5995 if(i==count) {
5996 /* overflow */
5997 byteCountOrFrenchDone = 1;
5998 cces--;
5999 goto saveState;
6000 }
6001 dest[i++]=(uint8_t)CE;
6002 }
6003 }
6004 }
6005 if(uprv_numAvailableExpCEs(s)) {
6006 canUpdateState = FALSE;
6007 } else {
6008 canUpdateState = TRUE;
6009 }
6010 }
6011 /* fall through to next level */
6012 case UCOL_PSK_SECONDARY:
6013 if(strength >= UCOL_SECONDARY) {
6014 if(!doingFrench) {
6015 for(;;) {
6016 if(i == count) {
6017 goto saveState;
6018 }
6019 // We should save the state only if we
6020 // are sure that we are done with the
6021 // previous iterator state
6022 if(canUpdateState) {
6023 newState = s.iterator->getState(s.iterator);
6024 if(newState != UITER_NO_STATE) {
6025 iterState = newState;
6026 cces = 0;
6027 }
6028 }
6029 CE = ucol_IGetNextCE(coll, &s, status);
6030 cces++;
6031 if(CE==UCOL_NO_MORE_CES) {
6032 // Add the level separator
6033 terminatePSKLevel(level, maxLevel, i, dest);
6034 byteCountOrFrenchDone = 0;
6035 // Restart the iteration an move to the
6036 // second level
6037 s.iterator->move(s.iterator, 0, UITER_START);
6038 cces = 0;
6039 level = UCOL_PSK_CASE;
6040 break;
6041 }
6042 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6043 CE >>= 8; /* get secondary */
6044 if(CE != 0) {
6045 dest[i++]=(uint8_t)CE;
6046 }
6047 }
6048 if(uprv_numAvailableExpCEs(s)) {
6049 canUpdateState = FALSE;
6050 } else {
6051 canUpdateState = TRUE;
6052 }
6053 }
6054 } else { // French secondary processing
6055 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6056 int32_t frenchIndex = 0;
6057 // Here we are going backwards.
6058 // If the iterator is at the beggining, it should be
6059 // moved to end.
6060 if(wasDoingPrimary) {
6061 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6062 cces = 0;
6063 }
6064 for(;;) {
6065 if(i == count) {
6066 goto saveState;
6067 }
6068 if(canUpdateState) {
6069 newState = s.iterator->getState(s.iterator);
6070 if(newState != UITER_NO_STATE) {
6071 iterState = newState;
6072 cces = 0;
6073 }
6074 }
6075 CE = ucol_IGetPrevCE(coll, &s, status);
6076 cces++;
6077 if(CE==UCOL_NO_MORE_CES) {
6078 // Add the level separator
6079 terminatePSKLevel(level, maxLevel, i, dest);
6080 byteCountOrFrenchDone = 0;
6081 // Restart the iteration an move to the next level
6082 s.iterator->move(s.iterator, 0, UITER_START);
6083 level = UCOL_PSK_CASE;
6084 break;
6085 }
6086 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6087 // reverse when we get a first non-continuation CE.
6088 CE >>= 8;
6089 frenchBuff[frenchIndex++] = (uint8_t)CE;
6090 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6091 CE >>= 8; /* get secondary */
6092 if(!frenchIndex) {
6093 if(CE != 0) {
6094 dest[i++]=(uint8_t)CE;
6095 }
6096 } else {
6097 frenchBuff[frenchIndex++] = (uint8_t)CE;
6098 frenchIndex -= usedFrench;
6099 usedFrench = 0;
6100 while(i < count && frenchIndex) {
6101 dest[i++] = frenchBuff[--frenchIndex];
6102 usedFrench++;
6103 }
6104 }
6105 }
6106 if(uprv_numAvailableExpCEs(s)) {
6107 canUpdateState = FALSE;
6108 } else {
6109 canUpdateState = TRUE;
6110 }
6111 }
6112 }
6113 } else {
6114 level = UCOL_PSK_CASE;
6115 }
6116 /* fall through to next level */
6117 case UCOL_PSK_CASE:
6118 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6119 uint32_t caseShift = UCOL_CASE_SHIFT_START;
6120 uint8_t caseByte = UCOL_CASE_BYTE_START;
6121 uint8_t caseBits = 0;
6122
6123 for(;;) {
6124 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
6125 if(i == count) {
6126 goto saveState;
6127 }
6128 // We should save the state only if we
6129 // are sure that we are done with the
6130 // previous iterator state
6131 if(canUpdateState) {
6132 newState = s.iterator->getState(s.iterator);
6133 if(newState != UITER_NO_STATE) {
6134 iterState = newState;
6135 cces = 0;
6136 }
6137 }
6138 CE = ucol_IGetNextCE(coll, &s, status);
6139 cces++;
6140 if(CE==UCOL_NO_MORE_CES) {
6141 // On the case level we might have an unfinished
6142 // case byte. Add one if it's started.
6143 if(caseShift != UCOL_CASE_SHIFT_START) {
6144 dest[i++] = caseByte;
6145 }
6146 cces = 0;
6147 // We have finished processing CEs on this level.
6148 // However, we don't know if we have enough space
6149 // to add a case level terminator.
6150 if(i < count) {
6151 // Add the level separator
6152 terminatePSKLevel(level, maxLevel, i, dest);
6153 // Restart the iteration and move to the
6154 // next level
6155 s.iterator->move(s.iterator, 0, UITER_START);
6156 level = UCOL_PSK_TERTIARY;
6157 } else {
6158 canUpdateState = FALSE;
6159 }
6160 break;
6161 }
6162
6163 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6164 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || s trength > UCOL_PRIMARY)) {
6165 // do the case level if we need to do it. We don't want to calculate
6166 // case level for primary ignorables if we have only pri mary strength and case level
6167 // otherwise we would break well formedness of CEs
6168 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6169 caseBits = (uint8_t)(CE & 0xC0);
6170 // this copies the case level logic from the
6171 // sort key generation code
6172 if(CE != 0) {
6173 if (caseShift == 0) {
6174 dest[i++] = caseByte;
6175 caseShift = UCOL_CASE_SHIFT_START;
6176 caseByte = UCOL_CASE_BYTE_START;
6177 }
6178 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6179 if((caseBits & 0xC0) == 0) {
6180 caseByte |= 1 << (--caseShift);
6181 } else {
6182 caseByte |= 0 << (--caseShift);
6183 /* second bit */
6184 if(caseShift == 0) {
6185 dest[i++] = caseByte;
6186 caseShift = UCOL_CASE_SHIFT_START;
6187 caseByte = UCOL_CASE_BYTE_START;
6188 }
6189 caseByte |= ((caseBits>>6)&1) << (--caseShif t);
6190 }
6191 } else {
6192 if((caseBits & 0xC0) == 0) {
6193 caseByte |= 0 << (--caseShift);
6194 } else {
6195 caseByte |= 1 << (--caseShift);
6196 /* second bit */
6197 if(caseShift == 0) {
6198 dest[i++] = caseByte;
6199 caseShift = UCOL_CASE_SHIFT_START;
6200 caseByte = UCOL_CASE_BYTE_START;
6201 }
6202 caseByte |= ((caseBits>>7)&1) << (--caseShif t);
6203 }
6204 }
6205 }
6206
6207 }
6208 }
6209 // Not sure this is correct for the case level - revisit
6210 if(uprv_numAvailableExpCEs(s)) {
6211 canUpdateState = FALSE;
6212 } else {
6213 canUpdateState = TRUE;
6214 }
6215 }
6216 } else {
6217 level = UCOL_PSK_TERTIARY;
6218 }
6219 /* fall through to next level */
6220 case UCOL_PSK_TERTIARY:
6221 if(strength >= UCOL_TERTIARY) {
6222 for(;;) {
6223 if(i == count) {
6224 goto saveState;
6225 }
6226 // We should save the state only if we
6227 // are sure that we are done with the
6228 // previous iterator state
6229 if(canUpdateState) {
6230 newState = s.iterator->getState(s.iterator);
6231 if(newState != UITER_NO_STATE) {
6232 iterState = newState;
6233 cces = 0;
6234 }
6235 }
6236 CE = ucol_IGetNextCE(coll, &s, status);
6237 cces++;
6238 if(CE==UCOL_NO_MORE_CES) {
6239 // Add the level separator
6240 terminatePSKLevel(level, maxLevel, i, dest);
6241 byteCountOrFrenchDone = 0;
6242 // Restart the iteration an move to the
6243 // second level
6244 s.iterator->move(s.iterator, 0, UITER_START);
6245 cces = 0;
6246 level = UCOL_PSK_QUATERNARY;
6247 break;
6248 }
6249 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6250 notIsContinuation = !isContinuation(CE);
6251
6252 if(notIsContinuation) {
6253 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6254 CE ^= coll->caseSwitch;
6255 CE &= coll->tertiaryMask;
6256 } else {
6257 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6258 }
6259
6260 if(CE != 0) {
6261 dest[i++]=(uint8_t)CE;
6262 }
6263 }
6264 if(uprv_numAvailableExpCEs(s)) {
6265 canUpdateState = FALSE;
6266 } else {
6267 canUpdateState = TRUE;
6268 }
6269 }
6270 } else {
6271 // if we're not doing tertiary
6272 // skip to the end
6273 level = UCOL_PSK_NULL;
6274 }
6275 /* fall through to next level */
6276 case UCOL_PSK_QUATERNARY:
6277 if(strength >= UCOL_QUATERNARY) {
6278 for(;;) {
6279 if(i == count) {
6280 goto saveState;
6281 }
6282 // We should save the state only if we
6283 // are sure that we are done with the
6284 // previous iterator state
6285 if(canUpdateState) {
6286 newState = s.iterator->getState(s.iterator);
6287 if(newState != UITER_NO_STATE) {
6288 iterState = newState;
6289 cces = 0;
6290 }
6291 }
6292 CE = ucol_IGetNextCE(coll, &s, status);
6293 cces++;
6294 if(CE==UCOL_NO_MORE_CES) {
6295 // Add the level separator
6296 terminatePSKLevel(level, maxLevel, i, dest);
6297 //dest[i++] = UCOL_LEVELTERMINATOR;
6298 byteCountOrFrenchDone = 0;
6299 // Restart the iteration an move to the
6300 // second level
6301 s.iterator->move(s.iterator, 0, UITER_START);
6302 cces = 0;
6303 level = UCOL_PSK_QUIN;
6304 break;
6305 }
6306 if(CE==0)
6307 continue;
6308 if(isShiftedCE(CE, LVT, &wasShifted)) {
6309 CE >>= 16; /* get primary */
6310 if(CE != 0) {
6311 if(byteCountOrFrenchDone == 0) {
6312 dest[i++]=(uint8_t)(CE >> 8);
6313 } else {
6314 byteCountOrFrenchDone = 0;
6315 }
6316 if((CE &=0xff)!=0) {
6317 if(i==count) {
6318 /* overflow */
6319 byteCountOrFrenchDone = 1;
6320 goto saveState;
6321 }
6322 dest[i++]=(uint8_t)CE;
6323 }
6324 }
6325 } else {
6326 notIsContinuation = !isContinuation(CE);
6327 if(notIsContinuation) {
6328 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a nd we need to note it
6329 dest[i++] = UCOL_HIRAGANA_QUAD;
6330 } else {
6331 dest[i++] = 0xFF;
6332 }
6333 }
6334 }
6335 if(uprv_numAvailableExpCEs(s)) {
6336 canUpdateState = FALSE;
6337 } else {
6338 canUpdateState = TRUE;
6339 }
6340 }
6341 } else {
6342 // if we're not doing quaternary
6343 // skip to the end
6344 level = UCOL_PSK_NULL;
6345 }
6346 /* fall through to next level */
6347 case UCOL_PSK_QUIN:
6348 level = UCOL_PSK_IDENTICAL;
6349 /* fall through to next level */
6350 case UCOL_PSK_IDENTICAL:
6351 if(strength >= UCOL_IDENTICAL) {
6352 UChar32 first, second;
6353 int32_t bocsuBytesWritten = 0;
6354 // We always need to do identical on
6355 // the NFD form of the string.
6356 if(normIter == NULL) {
6357 // we arrived from the level below and
6358 // normalization was not turned on.
6359 // therefore, we need to make a fresh NFD iterator
6360 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6361 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6362 } else if(!doingIdenticalFromStart) {
6363 // there is an iterator, but we did some other levels.
6364 // therefore, we have a FCD iterator - need to make
6365 // a NFD one.
6366 // normIter being at the beginning does not guarantee
6367 // that the underlying iterator is at the beginning
6368 iter->move(iter, 0, UITER_START);
6369 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6370 }
6371 // At this point we have a NFD iterator that is positioned
6372 // in the right place
6373 if(U_FAILURE(*status)) {
6374 UTRACE_EXIT_STATUS(*status);
6375 return 0;
6376 }
6377 first = uiter_previous32(s.iterator);
6378 // maybe we're at the start of the string
6379 if(first == U_SENTINEL) {
6380 first = 0;
6381 } else {
6382 uiter_next32(s.iterator);
6383 }
6384
6385 j = 0;
6386 for(;;) {
6387 if(i == count) {
6388 if(j+1 < bocsuBytesWritten) {
6389 bocsuBytesUsed = j+1;
6390 }
6391 goto saveState;
6392 }
6393
6394 // On identical level, we will always save
6395 // the state if we reach this point, since
6396 // we don't depend on getNextCE for content
6397 // all the content is in our buffer and we
6398 // already either stored the full buffer OR
6399 // otherwise we won't arrive here.
6400 newState = s.iterator->getState(s.iterator);
6401 if(newState != UITER_NO_STATE) {
6402 iterState = newState;
6403 cces = 0;
6404 }
6405
6406 uint8_t buff[4];
6407 second = uiter_next32(s.iterator);
6408 cces++;
6409
6410 // end condition for identical level
6411 if(second == U_SENTINEL) {
6412 terminatePSKLevel(level, maxLevel, i, dest);
6413 level = UCOL_PSK_NULL;
6414 break;
6415 }
6416 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco nd, buff);
6417 first = second;
6418
6419 j = 0;
6420 if(bocsuBytesUsed != 0) {
6421 while(bocsuBytesUsed-->0) {
6422 j++;
6423 }
6424 }
6425
6426 while(i < count && j < bocsuBytesWritten) {
6427 dest[i++] = buff[j++];
6428 }
6429 }
6430
6431 } else {
6432 level = UCOL_PSK_NULL;
6433 }
6434 /* fall through to next level */
6435 case UCOL_PSK_NULL:
6436 j = i;
6437 while(j<count) {
6438 dest[j++]=0;
6439 }
6440 break;
6441 default:
6442 *status = U_INTERNAL_PROGRAM_ERROR;
6443 UTRACE_EXIT_STATUS(*status);
6444 return 0;
6445 }
6446
6447 saveState:
6448 // Now we need to return stuff. First we want to see whether we have
6449 // done everything for the current state of iterator.
6450 if(byteCountOrFrenchDone
6451 || canUpdateState == FALSE
6452 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6453 {
6454 // Any of above mean that the previous transaction
6455 // wasn't finished and that we should store the
6456 // previous iterator state.
6457 state[0] = iterState;
6458 } else {
6459 // The transaction is complete. We will continue in the next iteration.
6460 state[0] = s.iterator->getState(s.iterator);
6461 cces = 0;
6462 }
6463 // Store the number of bocsu bytes written.
6464 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6465 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6466 }
6467 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY TES_SHIFT;
6468
6469 // Next we put in the level of comparison
6470 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6471
6472 // If we are doing French, we need to store whether we have just finished th e French level
6473 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6474 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6475 } else {
6476 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE _MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6477 }
6478
6479 // Was the latest CE shifted
6480 if(wasShifted) {
6481 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6482 }
6483 // Check for cces overflow
6484 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6485 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6486 }
6487 // Store cces
6488 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH IFT);
6489
6490 // Check for French overflow
6491 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6492 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6493 }
6494 // Store number of bytes written in the French secondary continuation sequen ce
6495 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC H_SHIFT);
6496
6497
6498 // If we have used normalizing iterator, get rid of it
6499 if(normIter != NULL) {
6500 unorm_closeIter(normIter);
6501 }
6502
6503 /* To avoid memory leak, free the offset buffer if necessary. */
6504 ucol_freeOffsetBuffer(&s);
6505
6506 // Return number of meaningful sortkey bytes.
6507 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6508 dest,i, state[0], state[1]);
6509 UTRACE_EXIT_VALUE(i);
6510 return i;
6511 }
6512
6513 /**
6514 * Produce a bound for a given sortkey and a number of levels.
6515 */
6516 U_CAPI int32_t U_EXPORT2
6517 ucol_getBound(const uint8_t *source,
6518 int32_t sourceLength,
6519 UColBoundMode boundType,
6520 uint32_t noOfLevels,
6521 uint8_t *result,
6522 int32_t resultLength,
6523 UErrorCode *status)
6524 {
6525 // consistency checks
6526 if(status == NULL || U_FAILURE(*status)) {
6527 return 0;
6528 }
6529 if(source == NULL) {
6530 *status = U_ILLEGAL_ARGUMENT_ERROR;
6531 return 0;
6532 }
6533
6534 int32_t sourceIndex = 0;
6535 // Scan the string until we skip enough of the key OR reach the end of the k ey
6536 do {
6537 sourceIndex++;
6538 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6539 noOfLevels--;
6540 }
6541 } while (noOfLevels > 0
6542 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6543
6544 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6545 && noOfLevels > 0) {
6546 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6547 }
6548
6549
6550 // READ ME: this code assumes that the values for boundType
6551 // enum will not changes. They are set so that the enum value
6552 // corresponds to the number of extra bytes each bound type
6553 // needs.
6554 if(result != NULL && resultLength >= sourceIndex+boundType) {
6555 uprv_memcpy(result, source, sourceIndex);
6556 switch(boundType) {
6557 // Lower bound just gets terminated. No extra bytes
6558 case UCOL_BOUND_LOWER: // = 0
6559 break;
6560 // Upper bound needs one extra byte
6561 case UCOL_BOUND_UPPER: // = 1
6562 result[sourceIndex++] = 2;
6563 break;
6564 // Upper long bound needs two extra bytes
6565 case UCOL_BOUND_UPPER_LONG: // = 2
6566 result[sourceIndex++] = 0xFF;
6567 result[sourceIndex++] = 0xFF;
6568 break;
6569 default:
6570 *status = U_ILLEGAL_ARGUMENT_ERROR;
6571 return 0;
6572 }
6573 result[sourceIndex++] = 0;
6574
6575 return sourceIndex;
6576 } else {
6577 return sourceIndex+boundType+1;
6578 }
6579 }
6580
6581 /****************************************************************************/
6582 /* Following are the functions that deal with the properties of a collator */
6583 /* there are new APIs and some compatibility APIs */
6584 /****************************************************************************/
6585
6586 static inline void
6587 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6588 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6589 {
6590 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6591 UBool reverseSecondary = FALSE;
6592 UBool continuation = isContinuation(CE);
6593 if(!continuation) {
6594 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6595 tertiary ^= coll->caseSwitch;
6596 reverseSecondary = TRUE;
6597 } else {
6598 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6599 tertiary &= UCOL_REMOVE_CASE;
6600 reverseSecondary = FALSE;
6601 }
6602
6603 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6604 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6605 primary1 = (uint8_t)(CE >> 8);
6606
6607 if(primary1 != 0) {
6608 if (coll->leadBytePermutationTable != NULL && !continuation) {
6609 primary1 = coll->leadBytePermutationTable[primary1];
6610 }
6611
6612 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6613 *primShift -= 8;
6614 }
6615 if(primary2 != 0) {
6616 if(*primShift < 0) {
6617 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6618 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6619 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6620 return;
6621 }
6622 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6623 *primShift -= 8;
6624 }
6625 if(secondary != 0) {
6626 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se condary
6627 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo r secondary
6628 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6629 } else { // normal case
6630 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secSh ift);
6631 }
6632 *secShift -= 8;
6633 }
6634 if(tertiary != 0) {
6635 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift );
6636 *terShift -= 8;
6637 }
6638 }
6639
6640 static inline UBool
6641 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6642 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6643 if(newTable == NULL) {
6644 *status = U_MEMORY_ALLOCATION_ERROR;
6645 coll->latinOneFailed = TRUE;
6646 return FALSE;
6647 }
6648 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable Len)*sizeof(uint32_t);
6649 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6650 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6651 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC opy);
6652 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz eToCopy);
6653 coll->latinOneTableLen = size;
6654 uprv_free(coll->latinOneCEs);
6655 coll->latinOneCEs = newTable;
6656 return TRUE;
6657 }
6658
/**
 * Builds the Latin-1 fast-path table coll->latinOneCEs for the code units
 * 0..UCOL_ENDOFLATINONERANGE.
 *
 * The table is allocated here on first use and consists of three consecutive
 * sections of coll->latinOneTableLen entries (filled through
 * ucol_addLatinOneEntry). Slots from UCOL_ENDOFLATINONERANGE+1 upwards are
 * handed out to the results of contractions; the table is doubled when these
 * slots run out and trimmed to the used size at the end.
 *
 * @param coll   collator whose table is (re)built
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR when an
 *               allocation fails and to U_UNSUPPORTED_ERROR for a contraction
 *               CE whose bits 12..23 are already in use
 * @return TRUE if the table was built, FALSE otherwise. Failures that go
 *         through cleanup_after_failure also set coll->latinOneFailed; the
 *         two early returns below do not.
 */
6659 static UBool
6660 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6661 UBool result = TRUE;
     // First use: allocate room for three sections of UCOL_LATINONETABLELEN entries.
6662 if(coll->latinOneCEs == NULL) {
6663 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINO NETABLELEN*3);
6664 if(coll->latinOneCEs == NULL) {
6665 *status = U_MEMORY_ALLOCATION_ERROR;
6666 return FALSE;
6667 }
6668 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6669 }
6670 UChar ch = 0;
     // Element iterator used below to enumerate the CEs of expansions and digits.
6671 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6672 // Give up if the status reports a failure (e.g. the iterator could not be opened)
6673 if (U_FAILURE(*status)) {
6674 return FALSE;
6675 }
6676 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3) ;
6677
6678 int32_t primShift = 24, secShift = 24, terShift = 24;
6679 uint32_t CE = 0;
     // Next free slot for contraction results; starts right after the Latin-1 range.
6680 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6681
6682 // TODO: make safe if you get more than you wanted...
6683 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
     // Every character starts filling its entry from the top byte.
6684 primShift = 24; secShift = 24; terShift = 24;
6685 if(ch < 0x100) {
6686 CE = coll->latinOneMapping[ch];
6687 } else {
6688 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
     // Not in this collator's mapping: fall back to the UCA mapping if there is one.
6689 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6690 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6691 }
6692 }
     // Values below UCOL_NOT_FOUND are stored directly; anything else is dispatched on its tag.
6693 if(CE < UCOL_NOT_FOUND) {
6694 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift );
6695 } else {
6696 switch (getCETag(CE)) {
6697 case EXPANSION_TAG:
6698 case DIGIT_TAG:
     // Let the element iterator produce the CEs and add them one by one.
6699 ucol_setText(it, &ch, 1, status);
6700 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
     // A negative shift means some level already ran out of room: mark the character as bail-out.
6701 if(primShift < 0 || secShift < 0 || terShift < 0) {
6702 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6703 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL _OUT_CE;
6704 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA IL_OUT_CE;
6705 break;
6706 }
6707 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, & terShift);
6708 }
6709 break;
6710 case CONTRACTION_TAG:
6711 // here is the trick
6712 // F2 is contraction. We do something very similar to contractions
6713 // but have two indices, one in the real contraction table and the
6714 // other to where we stuffed things. This hopes that we don't have
6715 // many contractions (this should work for latin-1 tables).
6716 {
     // Bits 12..23 of the CE must be free, because the table offset is stored there below.
6717 if((CE & 0x00FFF000) != 0) {
6718 *status = U_UNSUPPORTED_ERROR;
6719 goto cleanup_after_failure;
6720 }
6721
6722 const UChar *UCharOffset = (UChar *)coll->image+getContractO ffset(CE);
6723
6724 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6725
     // All three sections of ch get the same contraction CE.
6726 coll->latinOneCEs[ch] = CE;
6727 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6728 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6729
6730 // We're going to jump into contraction table, pick the elements
6731 // and use them
6732 do {
6733 CE = *(coll->contractionCEs +
6734 (UCharOffset - coll->contractionIndex));
6735 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6736 uint32_t size;
6737 uint32_t i; /* general counter */
6738 uint32_t *CEOffset = (uint32_t *)coll->image+getExpa nsionOffset(CE); /* find the offset to expansion table */
6739 size = getExpansionCount(CE);
6740 //CE = *CEOffset++;
6741 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6742 for(i = 0; i<size; i++) {
6743 if(primShift < 0 || secShift < 0 || terShift < 0) {
6744 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;
6745 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6746 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6747 break;
6748 }
6749 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);
6750 }
6751 } else { /* else, we do */
6752 while(*CEOffset != 0) {
6753 if(primShift < 0 || secShift < 0 || terShift < 0) {
6754 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;
6755 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6756 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6757 break;
6758 }
6759 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);
6760 }
6761 }
6762 contractionOffset++;
6763 } else if(CE < UCOL_NOT_FOUND) {
6764 ucol_addLatinOneEntry(coll, (UChar)contractionOffset ++, CE, &primShift, &secShift, &terShift);
6765 } else {
     // Any other special CE inside a contraction: mark the slot as bail-out.
6766 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B AIL_OUT_CE;
6767 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont ractionOffset] = UCOL_BAIL_OUT_CE;
6768 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co ntractionOffset] = UCOL_BAIL_OUT_CE;
6769 contractionOffset++;
6770 }
     // Move on to the next contraction entry with fresh shifts.
6771 UCharOffset++;
6772 primShift = 24; secShift = 24; terShift = 24;
6773 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6774 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT ableLen, status)) {
6775 goto cleanup_after_failure;
6776 }
6777 }
     // The contraction entries of this character end at a 0xFFFF code unit.
6778 } while(*UCharOffset != 0xFFFF);
6779 }
6780 break;;
6781 case SPEC_PROC_TAG:
6782 {
6783 // 0xB7 is a precontext character defined in UCA5.1, a special
6784 // handle is implemented in order to save LatinOne table for
6785 // most locales.
6786 if (ch==0xb7) {
6787 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif t, &terShift);
6788 }
6789 else {
6790 goto cleanup_after_failure;
6791 }
6792 }
6793 break;
6794 default:
6795 goto cleanup_after_failure;
6796 }
6797 }
6798 }
6799 // compact table: drop the unused slots after the last contraction result
6800 if(contractionOffset < coll->latinOneTableLen) {
6801 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6802 goto cleanup_after_failure;
6803 }
6804 }
6805 ucol_closeElements(it);
6806 return result;
6807
6808 cleanup_after_failure:
6809 // status should already be set before arriving here.
     // NOTE(review): the SPEC_PROC_TAG (ch != 0xb7) and default branches jump here
     // without assigning *status in this function - confirm that this is intended.
6810 coll->latinOneFailed = TRUE;
6811 ucol_closeElements(it);
6812 return FALSE;
6813 }
6814
6815 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6816 if(U_SUCCESS(*status)) {
6817 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6818 coll->caseSwitch = UCOL_CASE_SWITCH;
6819 } else {
6820 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6821 }
6822
6823 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6824 coll->tertiaryMask = UCOL_REMOVE_CASE;
6825 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6826 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6827 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6828 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6829 } else {
6830 coll->tertiaryMask = UCOL_KEEP_CASE;
6831 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6832 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6833 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6834 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6835 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6836 } else {
6837 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6838 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6839 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6840 }
6841 }
6842
6843 /* Set the compression values */
6844 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1 );
6845 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* w e multilply double with int, but need only int */
6846 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC ount);
6847
6848 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6849 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U COL_NON_IGNORABLE)
6850 {
6851 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6852 } else {
6853 coll->sortKeyGen = ucol_calcSortKey;
6854 }
6855 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col l->numericCollation == UCOL_OFF
6856 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF ailed)
6857 {
6858 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6859 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build ing latin1 table, we'll use it
6860 //fprintf(stderr, "F");
6861 coll->latinOneUse = TRUE;
6862 } else {
6863 coll->latinOneUse = FALSE;
6864 }
6865 if(*status == U_UNSUPPORTED_ERROR) {
6866 *status = U_ZERO_ERROR;
6867 }
6868 } else { // latin1Table exists and it doesn't need to be regenerated , just use it
6869 coll->latinOneUse = TRUE;
6870 }
6871 } else {
6872 coll->latinOneUse = FALSE;
6873 }
6874 }
6875 }
6876
6877 U_CAPI uint32_t U_EXPORT2
6878 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod e *status) {
6879 if(U_FAILURE(*status) || coll == NULL) {
6880 return 0;
6881 }
6882 if(len == -1) {
6883 len = u_strlen(varTop);
6884 }
6885 if(len == 0) {
6886 *status = U_ILLEGAL_ARGUMENT_ERROR;
6887 return 0;
6888 }
6889
6890 collIterate s;
6891 IInit_collIterate(coll, varTop, len, &s, status);
6892 if(U_FAILURE(*status)) {
6893 return 0;
6894 }
6895
6896 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6897
6898 /* here we check if we have consumed all characters */
6899 /* you can put in either one character or a contraction */
6900 /* you shouldn't put more... */
6901 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6902 *status = U_CE_NOT_FOUND_ERROR;
6903 return 0;
6904 }
6905
6906 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6907
6908 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6909 *status = U_PRIMARY_TOO_LONG_ERROR;
6910 return 0;
6911 }
6912 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6913 coll->variableTopValueisDefault = FALSE;
6914 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6915 }
6916
6917 /* To avoid memory leak, free the offset buffer if necessary. */
6918 ucol_freeOffsetBuffer(&s);
6919
6920 return CE & UCOL_PRIMARYMASK;
6921 }
6922
6923 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6924 if(U_FAILURE(*status) || coll == NULL) {
6925 return 0;
6926 }
6927 return coll->variableTopValue<<16;
6928 }
6929
6930 U_CAPI void U_EXPORT2
6931 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat us) {
6932 if(U_FAILURE(*status) || coll == NULL) {
6933 return;
6934 }
6935
6936 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6937 coll->variableTopValueisDefault = FALSE;
6938 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6939 }
6940 }
6941 /* Attribute setter API */
6942 U_CAPI void U_EXPORT2
6943 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6944 if(U_FAILURE(*status) || coll == NULL) {
6945 return;
6946 }
6947 UColAttributeValue oldFrench = coll->frenchCollation;
6948 UColAttributeValue oldCaseFirst = coll->caseFirst;
6949 switch(attr) {
6950 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6951 if(value == UCOL_ON) {
6952 coll->numericCollation = UCOL_ON;
6953 coll->numericCollationisDefault = FALSE;
6954 } else if (value == UCOL_OFF) {
6955 coll->numericCollation = UCOL_OFF;
6956 coll->numericCollationisDefault = FALSE;
6957 } else if (value == UCOL_DEFAULT) {
6958 coll->numericCollationisDefault = TRUE;
6959 coll->numericCollation = (UColAttributeValue)coll->options->numericC ollation;
6960 } else {
6961 *status = U_ILLEGAL_ARGUMENT_ERROR;
6962 }
6963 break;
6964 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan a */
6965 if(value == UCOL_ON) {
6966 coll->hiraganaQ = UCOL_ON;
6967 coll->hiraganaQisDefault = FALSE;
6968 } else if (value == UCOL_OFF) {
6969 coll->hiraganaQ = UCOL_OFF;
6970 coll->hiraganaQisDefault = FALSE;
6971 } else if (value == UCOL_DEFAULT) {
6972 coll->hiraganaQisDefault = TRUE;
6973 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6974 } else {
6975 *status = U_ILLEGAL_ARGUMENT_ERROR;
6976 }
6977 break;
6978 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /
6979 if(value == UCOL_ON) {
6980 coll->frenchCollation = UCOL_ON;
6981 coll->frenchCollationisDefault = FALSE;
6982 } else if (value == UCOL_OFF) {
6983 coll->frenchCollation = UCOL_OFF;
6984 coll->frenchCollationisDefault = FALSE;
6985 } else if (value == UCOL_DEFAULT) {
6986 coll->frenchCollationisDefault = TRUE;
6987 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol lation;
6988 } else {
6989 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6990 }
6991 break;
6992 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6993 if(value == UCOL_SHIFTED) {
6994 coll->alternateHandling = UCOL_SHIFTED;
6995 coll->alternateHandlingisDefault = FALSE;
6996 } else if (value == UCOL_NON_IGNORABLE) {
6997 coll->alternateHandling = UCOL_NON_IGNORABLE;
6998 coll->alternateHandlingisDefault = FALSE;
6999 } else if (value == UCOL_DEFAULT) {
7000 coll->alternateHandlingisDefault = TRUE;
7001 coll->alternateHandling = (UColAttributeValue)coll->options->alterna teHandling ;
7002 } else {
7003 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7004 }
7005 break;
7006 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7007 if(value == UCOL_LOWER_FIRST) {
7008 coll->caseFirst = UCOL_LOWER_FIRST;
7009 coll->caseFirstisDefault = FALSE;
7010 } else if (value == UCOL_UPPER_FIRST) {
7011 coll->caseFirst = UCOL_UPPER_FIRST;
7012 coll->caseFirstisDefault = FALSE;
7013 } else if (value == UCOL_OFF) {
7014 coll->caseFirst = UCOL_OFF;
7015 coll->caseFirstisDefault = FALSE;
7016 } else if (value == UCOL_DEFAULT) {
7017 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7018 coll->caseFirstisDefault = TRUE;
7019 } else {
7020 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7021 }
7022 break;
7023 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7024 if(value == UCOL_ON) {
7025 coll->caseLevel = UCOL_ON;
7026 coll->caseLevelisDefault = FALSE;
7027 } else if (value == UCOL_OFF) {
7028 coll->caseLevel = UCOL_OFF;
7029 coll->caseLevelisDefault = FALSE;
7030 } else if (value == UCOL_DEFAULT) {
7031 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7032 coll->caseLevelisDefault = TRUE;
7033 } else {
7034 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7035 }
7036 break;
7037 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7038 if(value == UCOL_ON) {
7039 coll->normalizationMode = UCOL_ON;
7040 coll->normalizationModeisDefault = FALSE;
7041 initializeFCD(status);
7042 } else if (value == UCOL_OFF) {
7043 coll->normalizationMode = UCOL_OFF;
7044 coll->normalizationModeisDefault = FALSE;
7045 } else if (value == UCOL_DEFAULT) {
7046 coll->normalizationModeisDefault = TRUE;
7047 coll->normalizationMode = (UColAttributeValue)coll->options->normali zationMode;
7048 if(coll->normalizationMode == UCOL_ON) {
7049 initializeFCD(status);
7050 }
7051 } else {
7052 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7053 }
7054 break;
7055 case UCOL_STRENGTH: /* attribute for strength */
7056 if (value == UCOL_DEFAULT) {
7057 coll->strengthisDefault = TRUE;
7058 coll->strength = (UColAttributeValue)coll->options->strength;
7059 } else if (value <= UCOL_IDENTICAL) {
7060 coll->strengthisDefault = FALSE;
7061 coll->strength = value;
7062 } else {
7063 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7064 }
7065 break;
7066 case UCOL_ATTRIBUTE_COUNT:
7067 default:
7068 *status = U_ILLEGAL_ARGUMENT_ERROR;
7069 break;
7070 }
7071 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7072 coll->latinOneRegenTable = TRUE;
7073 } else {
7074 coll->latinOneRegenTable = FALSE;
7075 }
7076 ucol_updateInternalState(coll, status);
7077 }
7078
7079 U_CAPI UColAttributeValue U_EXPORT2
7080 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7081 if(U_FAILURE(*status) || coll == NULL) {
7082 return UCOL_DEFAULT;
7083 }
7084 switch(attr) {
7085 case UCOL_NUMERIC_COLLATION:
7086 return coll->numericCollation;
7087 case UCOL_HIRAGANA_QUATERNARY_MODE:
7088 return coll->hiraganaQ;
7089 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /
7090 return coll->frenchCollation;
7091 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7092 return coll->alternateHandling;
7093 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7094 return coll->caseFirst;
7095 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7096 return coll->caseLevel;
7097 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7098 return coll->normalizationMode;
7099 case UCOL_STRENGTH: /* attribute for strength */
7100 return coll->strength;
7101 case UCOL_ATTRIBUTE_COUNT:
7102 default:
7103 *status = U_ILLEGAL_ARGUMENT_ERROR;
7104 break;
7105 }
7106 return UCOL_DEFAULT;
7107 }
7108
7109 U_CAPI void U_EXPORT2
7110 ucol_setStrength( UCollator *coll,
7111 UCollationStrength strength)
7112 {
7113 UErrorCode status = U_ZERO_ERROR;
7114 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7115 }
7116
7117 U_CAPI UCollationStrength U_EXPORT2
7118 ucol_getStrength(const UCollator *coll)
7119 {
7120 UErrorCode status = U_ZERO_ERROR;
7121 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7122 }
7123
7124 U_INTERNAL int32_t U_EXPORT2
7125 ucol_getReorderCodes(const UCollator *coll,
7126 int32_t *dest,
7127 int32_t destCapacity,
7128 UErrorCode *pErrorCode) {
7129 if (U_FAILURE(*pErrorCode)) {
7130 return 0;
7131 }
7132
7133 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
7134 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
7135 return 0;
7136 }
7137
7138 if (coll->reorderCodesLength > destCapacity) {
7139 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
7140 return coll->reorderCodesLength;
7141 }
7142 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
7143 dest[i] = coll->reorderCodes[i];
7144 }
7145 return coll->reorderCodesLength;
7146 }
7147
7148 U_INTERNAL void U_EXPORT2
7149 ucol_setReorderCodes(UCollator *coll,
7150 const int32_t *reorderCodes,
7151 int32_t reorderCodesLength,
7152 UErrorCode *pErrorCode) {
7153 if (U_FAILURE(*pErrorCode)) {
7154 return;
7155 }
7156
7157 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NUL L)) {
7158 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
7159 return;
7160 }
7161
7162 uprv_free(coll->reorderCodes);
7163 coll->reorderCodes = NULL;
7164 coll->reorderCodesLength = 0;
7165 if (reorderCodesLength == 0) {
7166 uprv_free(coll->leadBytePermutationTable);
7167 coll->leadBytePermutationTable = NULL;
7168 return;
7169 }
7170 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int3 2_t));
7171 if (coll->reorderCodes == NULL) {
7172 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
7173 return;
7174 }
7175 for (int32_t i = 0; i < reorderCodesLength; i++) {
7176 coll->reorderCodes[i] = reorderCodes[i];
7177 }
7178 coll->reorderCodesLength = reorderCodesLength;
7179 ucol_buildPermutationTable(coll, pErrorCode);
7180 if (U_FAILURE(*pErrorCode)) {
7181 uprv_free(coll->reorderCodes);
7182 coll->reorderCodes = NULL;
7183 coll->reorderCodesLength = 0;
7184 }
7185 }
7186
7187
7188 /****************************************************************************/
7189 /* Following are misc functions */
7190 /* there are new APIs and some compatibility APIs */
7191 /****************************************************************************/
7192
7193 U_CAPI void U_EXPORT2
7194 ucol_getVersion(const UCollator* coll,
7195 UVersionInfo versionInfo)
7196 {
7197 /* RunTime version */
7198 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7199 /* Builder version*/
7200 uint8_t bdVersion = coll->image->version[0];
7201
7202 /* Charset Version. Need to get the version from cnv files
7203 * makeconv should populate cnv files with version and
7204 * an api has to be provided in ucnv.h to obtain this version
7205 */
7206 uint8_t csVersion = 0;
7207
7208 /* combine the version info */
7209 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersi on));
7210
7211 /* Tailoring rules */
7212 versionInfo[0] = (uint8_t)(cmbVersion>>8);
7213 versionInfo[1] = (uint8_t)cmbVersion;
7214 versionInfo[2] = coll->image->version[1];
7215 if(coll->UCA) {
7216 /* Include the minor number when getting the UCA version. (major & 1f) < < 3 | (minor & 7) */
7217 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll-> UCA->image->UCAVersion[1] & 0x07);
7218 } else {
7219 versionInfo[3] = 0;
7220 }
7221 }
7222
7223
7224 /* This internal API checks whether a character is tailored or not */
7225 U_CAPI UBool U_EXPORT2
7226 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7227 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7228 return FALSE;
7229 }
7230
7231 uint32_t CE = UCOL_NOT_FOUND;
7232 const UChar *ContractionStart = NULL;
7233 if(u < 0x100) { /* latin-1 */
7234 CE = coll->latinOneMapping[u];
7235 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7236 return FALSE;
7237 }
7238 } else { /* regular */
7239 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7240 }
7241
7242 if(isContraction(CE)) {
7243 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7244 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex) );
7245 }
7246
7247 return (UBool)(CE != UCOL_NOT_FOUND);
7248 }
7249
7250
7251 /****************************************************************************/
7252 /* Following are the string compare functions */
7253 /* */
7254 /****************************************************************************/
7255
7256
7257 /* ucol_checkIdent internal function. Does byte level string compare. */
7258 /* Used by strcoll if strength == identical and strings */
7259 /* are otherwise equal. */
7260 /* */
7261 /* Comparison must be done on NFD normalized strings. */
7262 /* FCD is not good enough. */
7263
7264 static
7265 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo l normalize, UErrorCode *status)
7266 {
7267 // When we arrive here, we can have normal strings or UCharIterators. Curren tly they are both
7268 // of same type, but that doesn't really mean that it will stay that way.
7269 int32_t comparison;
7270
7271 if (sColl->flags & UCOL_USE_ITERATOR) {
7272 // The division for the array length may truncate the array size to
7273 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7274 // for all platforms anyway.
7275 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7276 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7277 UNormIterator *sNIt = NULL, *tNIt = NULL;
7278 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7279 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7280 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7281 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7282 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta tus);
7283 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta tus);
7284 comparison = u_strCompareIter(sIt, tIt, TRUE);
7285 unorm_closeIter(sNIt);
7286 unorm_closeIter(tNIt);
7287 } else {
7288 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl- >endp - sColl->string) : -1;
7289 const UChar *sBuf = sColl->string;
7290 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl- >endp - tColl->string) : -1;
7291 const UChar *tBuf = tColl->string;
7292
7293 if (normalize) {
7294 *status = U_ZERO_ERROR;
7295 // Note: We could use Normalizer::compare() or similar, but for shor t strings
7296 // which may not be in FCD it might be faster to just NFD them.
7297 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha n
7298 // NFD'ing immediately might be faster for long strings,
7299 // but string comparison is usually done on relatively short strings .
7300 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN ) == 0, sBuf, sLen),
7301 sColl->writableBuffer,
7302 *status);
7303 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN ) == 0, tBuf, tLen),
7304 tColl->writableBuffer,
7305 *status);
7306 if(U_FAILURE(*status)) {
7307 return UCOL_LESS;
7308 }
7309 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ ableBuffer);
7310 } else {
7311 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7312 }
7313 }
7314
7315 if (comparison < 0) {
7316 return UCOL_LESS;
7317 } else if (comparison == 0) {
7318 return UCOL_EQUAL;
7319 } else /* comparison > 0 */ {
7320 return UCOL_GREATER;
7321 }
7322 }
7323
7324 /* CEBuf - A struct and some inline functions to handle the saving */
7325 /* of CEs in a buffer within ucol_strcoll */
     /* The buffer starts out in the embedded localArray (UCOL_INIT_CEBUF) and */
     /* is moved to a heap block of twice the size by ucol_CEBuf_Expand once */
     /* pos reaches endp. */
7326
7327 #define UCOL_CEBUF_SIZE 512 /* number of CEs that fit into the embedded array */
7328 typedef struct ucol_CEBuf {
7329 uint32_t *buf; /* start of the storage currently in use */
7330 uint32_t *endp; /* one past the last slot of that storage */
7331 uint32_t *pos; /* slot the next CE will be written to */
7332 uint32_t localArray[UCOL_CEBUF_SIZE]; /* initial storage, embedded in the struct */
7333 } ucol_CEBuf;
7334
7335
7336 static
7337 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7338 (b)->buf = (b)->pos = (b)->localArray;
7339 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7340 }
7341
7342 static
7343 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7344 uint32_t oldSize;
7345 uint32_t newSize;
7346 uint32_t *newBuf;
7347
7348 ci->flags |= UCOL_ITER_ALLOCATED;
7349 oldSize = (uint32_t)(b->pos - b->buf);
7350 newSize = oldSize * 2;
7351 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7352 if(newBuf == NULL) {
7353 *status = U_MEMORY_ALLOCATION_ERROR;
7354 }
7355 else {
7356 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7357 if (b->buf != b->localArray) {
7358 uprv_free(b->buf);
7359 }
7360 b->buf = newBuf;
7361 b->endp = b->buf + newSize;
7362 b->pos = b->buf + oldSize;
7363 }
7364 }
7365
7366 static
7367 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCo de *status) {
7368 if (b->pos == b->endp) {
7369 ucol_CEBuf_Expand(b, ci, status);
7370 }
7371 if (U_SUCCESS(*status)) {
7372 *(b)->pos++ = ce;
7373 }
7374 }
7375
7376 /* This is a trick string compare function that goes in and uses sortkeys to com pare */
7377 /* It is used when compare gets in trouble and needs to bail out */
7378 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7379 collIterate *tColl,
7380 UErrorCode *status)
7381 {
7382 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7383 uint8_t *sourceKeyP = sourceKey;
7384 uint8_t *targetKeyP = targetKey;
7385 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7386 const UCollator *coll = sColl->coll;
7387 const UChar *source = NULL;
7388 const UChar *target = NULL;
7389 int32_t result = UCOL_EQUAL;
7390 UnicodeString sourceString, targetString;
7391 int32_t sourceLength;
7392 int32_t targetLength;
7393
7394 if(sColl->flags & UCOL_USE_ITERATOR) {
7395 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7396 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7397 UChar32 c;
7398 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7399 sourceString.append((UChar)c);
7400 }
7401 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7402 targetString.append((UChar)c);
7403 }
7404 source = sourceString.getBuffer();
7405 sourceLength = sourceString.length();
7406 target = targetString.getBuffer();
7407 targetLength = targetString.length();
7408 } else { // no iterators
7409 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo ll->string):-1;
7410 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo ll->string):-1;
7411 source = sColl->string;
7412 target = tColl->string;
7413 }
7414
7415
7416
7417 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc eKeyLen);
7418 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7419 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7420 if(sourceKeyP == NULL) {
7421 *status = U_MEMORY_ALLOCATION_ERROR;
7422 goto cleanup_and_do_compare;
7423 }
7424 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s ourceKeyLen);
7425 }
7426
7427 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe tKeyLen);
7428 if(targetKeyLen > UCOL_MAX_BUFFER) {
7429 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7430 if(targetKeyP == NULL) {
7431 *status = U_MEMORY_ALLOCATION_ERROR;
7432 goto cleanup_and_do_compare;
7433 }
7434 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t argetKeyLen);
7435 }
7436
7437 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7438
7439 cleanup_and_do_compare:
7440 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7441 uprv_free(sourceKeyP);
7442 }
7443
7444 if(targetKeyP != NULL && targetKeyP != targetKey) {
7445 uprv_free(targetKeyP);
7446 }
7447
7448 if(result<0) {
7449 return UCOL_LESS;
7450 } else if(result>0) {
7451 return UCOL_GREATER;
7452 } else {
7453 return UCOL_EQUAL;
7454 }
7455 }
7456
7457
7458 static UCollationResult
7459 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7460 {
7461 U_ALIGN_CODE(16);
7462
7463 const UCollator *coll = sColl->coll;
7464
7465
7466 // setting up the collator parameters
7467 UColAttributeValue strength = coll->strength;
7468 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7469
7470 UBool checkSecTer = initialCheckSecTer;
7471 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7472 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7473 UBool checkIdent = (strength == UCOL_IDENTICAL);
7474 UBool checkCase = (coll->caseLevel == UCOL_ON);
7475 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7476 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7477 UBool qShifted = shifted && checkQuad;
7478 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7479
7480 if(doHiragana && shifted) {
7481 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7482 }
7483 uint8_t caseSwitch = coll->caseSwitch;
7484 uint8_t tertiaryMask = coll->tertiaryMask;
7485
7486 // This is the lowest primary value that will not be ignored if shifted
7487 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7488
7489 UCollationResult result = UCOL_EQUAL;
7490 UCollationResult hirResult = UCOL_EQUAL;
7491
7492 // Preparing the CE buffers. They will be filled during the primary phase
7493 ucol_CEBuf sCEs;
7494 ucol_CEBuf tCEs;
7495 UCOL_INIT_CEBUF(&sCEs);
7496 UCOL_INIT_CEBUF(&tCEs);
7497
7498 uint32_t secS = 0, secT = 0;
7499 uint32_t sOrder=0, tOrder=0;
7500
7501 // Non shifted primary processing is quite simple
7502 if(!shifted) {
7503 for(;;) {
7504
7505 // We fetch CEs until we hit a non ignorable primary or end.
7506 do {
7507 // We get the next CE
7508 sOrder = ucol_IGetNextCE(coll, sColl, status);
7509 // Stuff it in the buffer
7510 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7511 // And keep just the primary part.
7512 sOrder &= UCOL_PRIMARYMASK;
7513 } while(sOrder == 0);
7514
7515 // see the comments on the above block
7516 do {
7517 tOrder = ucol_IGetNextCE(coll, tColl, status);
7518 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7519 tOrder &= UCOL_PRIMARYMASK;
7520 } while(tOrder == 0);
7521
7522 // if both primaries are the same
7523 if(sOrder == tOrder) {
7524 // and there are no more CEs, we advance to the next level
7525 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7526 break;
7527 }
7528 if(doHiragana && hirResult == UCOL_EQUAL) {
7529 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO L_WAS_HIRAGANA)) {
7530 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl ->flags & UCOL_WAS_HIRAGANA))
7531 ? UCOL_LESS:UCOL_GREATER;
7532 }
7533 }
7534 } else {
7535 // only need to check one for continuation
7536 // if one is then the other must be or the preceding CE would be a prefix of the other
7537 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO rder)) {
7538 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7539 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7540 }
7541 // if two primaries are different, we are done
7542 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7543 goto commonReturn;
7544 }
7545 } // no primary difference... do the rest from the buffers
7546 } else { // shifted - do a slightly more complicated processing :)
7547 for(;;) {
7548 UBool sInShifted = FALSE;
7549 UBool tInShifted = FALSE;
7550 // This version of code can be refactored. However, it seems easier to understand this way.
7551 // Source loop. Sam as the target loop.
7552 for(;;) {
7553 sOrder = ucol_IGetNextCE(coll, sColl, status);
7554 if(sOrder == UCOL_NO_MORE_CES) {
7555 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7556 break;
7557 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMA SK) == 0)) {
7558 /* UCA amendment - ignore ignorables that follow shifted cod e points */
7559 continue;
7560 } else if(isContinuation(sOrder)) {
7561 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */
7562 if(sInShifted) {
7563 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres erve interesting continuation */
7564 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7565 continue;
7566 } else {
7567 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7568 break;
7569 }
7570 } else { /* Just lower level values */
7571 if(sInShifted) {
7572 continue;
7573 } else {
7574 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7575 continue;
7576 }
7577 }
7578 } else { /* regular */
7579 if(coll->leadBytePermutationTable != NULL){
7580 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7581 }
7582 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7583 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7584 break;
7585 } else {
7586 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7587 sInShifted = TRUE;
7588 sOrder &= UCOL_PRIMARYMASK;
7589 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7590 continue;
7591 } else {
7592 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7593 sInShifted = FALSE;
7594 continue;
7595 }
7596 }
7597 }
7598 }
7599 sOrder &= UCOL_PRIMARYMASK;
7600 sInShifted = FALSE;
7601
7602 for(;;) {
7603 tOrder = ucol_IGetNextCE(coll, tColl, status);
7604 if(tOrder == UCOL_NO_MORE_CES) {
7605 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7606 break;
7607 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMA SK) == 0)) {
7608 /* UCA amendment - ignore ignorables that follow shifted cod e points */
7609 continue;
7610 } else if(isContinuation(tOrder)) {
7611 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */
7612 if(tInShifted) {
7613 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres erve interesting continuation */
7614 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7615 continue;
7616 } else {
7617 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7618 break;
7619 }
7620 } else { /* Just lower level values */
7621 if(tInShifted) {
7622 continue;
7623 } else {
7624 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7625 continue;
7626 }
7627 }
7628 } else { /* regular */
7629 if(coll->leadBytePermutationTable != NULL){
7630 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7631 }
7632 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7633 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7634 break;
7635 } else {
7636 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7637 tInShifted = TRUE;
7638 tOrder &= UCOL_PRIMARYMASK;
7639 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7640 continue;
7641 } else {
7642 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7643 tInShifted = FALSE;
7644 continue;
7645 }
7646 }
7647 }
7648 }
7649 tOrder &= UCOL_PRIMARYMASK;
7650 tInShifted = FALSE;
7651
7652 if(sOrder == tOrder) {
7653 /*
7654 if(doHiragana && hirResult == UCOL_EQUAL) {
7655 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_ HIRAGANA)) {
7656 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7657 ? UCOL_LESS:UCOL_GREATER;
7658 }
7659 }
7660 */
7661 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7662 break;
7663 } else {
7664 sOrder = 0;
7665 tOrder = 0;
7666 continue;
7667 }
7668 } else {
7669 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7670 goto commonReturn;
7671 }
7672 } /* no primary difference... do the rest from the buffers */
7673 }
7674
7675 /* now, we're gonna reexamine collected CEs */
7676 uint32_t *sCE;
7677 uint32_t *tCE;
7678
7679 /* This is the secondary level of comparison */
7680 if(checkSecTer) {
7681 if(!isFrenchSec) { /* normal */
7682 sCE = sCEs.buf;
7683 tCE = tCEs.buf;
7684 for(;;) {
7685 while (secS == 0) {
7686 secS = *(sCE++) & UCOL_SECONDARYMASK;
7687 }
7688
7689 while(secT == 0) {
7690 secT = *(tCE++) & UCOL_SECONDARYMASK;
7691 }
7692
7693 if(secS == secT) {
7694 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7695 break;
7696 } else {
7697 secS = 0; secT = 0;
7698 continue;
7699 }
7700 } else {
7701 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7702 goto commonReturn;
7703 }
7704 }
7705 } else { /* do the French */
7706 uint32_t *sCESave = NULL;
7707 uint32_t *tCESave = NULL;
7708 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi zed */
7709 tCE = tCEs.pos-2;
7710 for(;;) {
7711 while (secS == 0 && sCE >= sCEs.buf) {
7712 if(sCESave == NULL) {
7713 secS = *(sCE--);
7714 if(isContinuation(secS)) {
7715 while(isContinuation(secS = *(sCE--)))
7716 ;
7717 /* after this, secS has the start of continuation, a nd sCEs points before that */
7718 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7719 sCE+=2; /* need to point to the first continuation CP */
7720 /* However, now you can just continue doing stuff */
7721 }
7722 } else {
7723 secS = *(sCE++);
7724 if(!isContinuation(secS)) { /* This means we have finish ed with this cont */
7725 sCE = sCESave; /* reset the pointer to be fore continuation */
7726 sCESave = NULL;
7727 secS = 0; /* Fetch a fresh CE before the continuati on sequence. */
7728 continue;
7729 }
7730 }
7731 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit * /
7732 }
7733
7734 while(secT == 0 && tCE >= tCEs.buf) {
7735 if(tCESave == NULL) {
7736 secT = *(tCE--);
7737 if(isContinuation(secT)) {
7738 while(isContinuation(secT = *(tCE--)))
7739 ;
7740 /* after this, secS has the start of continuation, a nd sCEs points before that */
7741 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7742 tCE+=2; /* need to point to the first continuation CP */
7743 /* However, now you can just continue doing stuff */
7744 }
7745 } else {
7746 secT = *(tCE++);
7747 if(!isContinuation(secT)) { /* This means we have finish ed with this cont */
7748 tCE = tCESave; /* reset the pointer to befo re continuation */
7749 tCESave = NULL;
7750 secT = 0; /* Fetch a fresh CE before the continuati on sequence. */
7751 continue;
7752 }
7753 }
7754 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit * /
7755 }
7756
7757 if(secS == secT) {
7758 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7759 break;
7760 } else {
7761 secS = 0; secT = 0;
7762 continue;
7763 }
7764 } else {
7765 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7766 goto commonReturn;
7767 }
7768 }
7769 }
7770 }
7771
7772 /* doing the case bit */
7773 if(checkCase) {
7774 sCE = sCEs.buf;
7775 tCE = tCEs.buf;
7776 for(;;) {
7777 while((secS & UCOL_REMOVE_CASE) == 0) {
7778 if(!isContinuation(*sCE++)) {
7779 secS =*(sCE-1);
7780 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA RY) {
7781 // primary ignorables should not be considered on the ca se level when the strength is primary
7782 // otherwise, the CEs stop being well-formed
7783 secS &= UCOL_TERT_CASE_MASK;
7784 secS ^= caseSwitch;
7785 } else {
7786 secS = 0;
7787 }
7788 } else {
7789 secS = 0;
7790 }
7791 }
7792
7793 while((secT & UCOL_REMOVE_CASE) == 0) {
7794 if(!isContinuation(*tCE++)) {
7795 secT = *(tCE-1);
7796 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA RY) {
7797 // primary ignorables should not be considered on the ca se level when the strength is primary
7798 // otherwise, the CEs stop being well-formed
7799 secT &= UCOL_TERT_CASE_MASK;
7800 secT ^= caseSwitch;
7801 } else {
7802 secT = 0;
7803 }
7804 } else {
7805 secT = 0;
7806 }
7807 }
7808
7809 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7810 result = UCOL_LESS;
7811 goto commonReturn;
7812 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7813 result = UCOL_GREATER;
7814 goto commonReturn;
7815 }
7816
7817 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7818 break;
7819 } else {
7820 secS = 0;
7821 secT = 0;
7822 }
7823 }
7824 }
7825
7826 /* Tertiary level */
7827 if(checkTertiary) {
7828 secS = 0;
7829 secT = 0;
7830 sCE = sCEs.buf;
7831 tCE = tCEs.buf;
7832 for(;;) {
7833 while((secS & UCOL_REMOVE_CASE) == 0) {
7834 secS = *(sCE++) & tertiaryMask;
7835 if(!isContinuation(secS)) {
7836 secS ^= caseSwitch;
7837 } else {
7838 secS &= UCOL_REMOVE_CASE;
7839 }
7840 }
7841
7842 while((secT & UCOL_REMOVE_CASE) == 0) {
7843 secT = *(tCE++) & tertiaryMask;
7844 if(!isContinuation(secT)) {
7845 secT ^= caseSwitch;
7846 } else {
7847 secT &= UCOL_REMOVE_CASE;
7848 }
7849 }
7850
7851 if(secS == secT) {
7852 if((secS & UCOL_REMOVE_CASE) == 1) {
7853 break;
7854 } else {
7855 secS = 0; secT = 0;
7856 continue;
7857 }
7858 } else {
7859 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7860 goto commonReturn;
7861 }
7862 }
7863 }
7864
7865
7866 if(qShifted /*checkQuad*/) {
7867 UBool sInShifted = TRUE;
7868 UBool tInShifted = TRUE;
7869 secS = 0;
7870 secT = 0;
7871 sCE = sCEs.buf;
7872 tCE = tCEs.buf;
7873 for(;;) {
7874 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(sec S) && !sInShifted)) {
7875 secS = *(sCE++);
7876 if(isContinuation(secS)) {
7877 if(!sInShifted) {
7878 continue;
7879 }
7880 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7881 secS = UCOL_PRIMARYMASK;
7882 sInShifted = FALSE;
7883 } else {
7884 sInShifted = TRUE;
7885 }
7886 }
7887 secS &= UCOL_PRIMARYMASK;
7888
7889
7890 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(sec T) && !tInShifted)) {
7891 secT = *(tCE++);
7892 if(isContinuation(secT)) {
7893 if(!tInShifted) {
7894 continue;
7895 }
7896 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7897 secT = UCOL_PRIMARYMASK;
7898 tInShifted = FALSE;
7899 } else {
7900 tInShifted = TRUE;
7901 }
7902 }
7903 secT &= UCOL_PRIMARYMASK;
7904
7905 if(secS == secT) {
7906 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7907 break;
7908 } else {
7909 secS = 0; secT = 0;
7910 continue;
7911 }
7912 } else {
7913 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7914 goto commonReturn;
7915 }
7916 }
7917 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7918 // If we're fine on quaternaries, we might be different
7919 // on Hiragana. This, however, might fail us in shifted.
7920 result = hirResult;
7921 goto commonReturn;
7922 }
7923
7924 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7925 /* as a tiebreaker if all else is equal. */
7926 /* Getting here should be quite rare - strings are not identical - */
7927 /* that is checked first, but compared == through all other checks. */
7928 if(checkIdent)
7929 {
7930 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC OL_ON);
7931 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7932 }
7933
7934 commonReturn:
7935 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7936 if (sCEs.buf != sCEs.localArray ) {
7937 uprv_free(sCEs.buf);
7938 }
7939 if (tCEs.buf != tCEs.localArray ) {
7940 uprv_free(tCEs.buf);
7941 }
7942 }
7943
7944 return result;
7945 }
7946
7947 static UCollationResult
7948 ucol_strcollRegular(const UCollator *coll,
7949 const UChar *source, int32_t sourceLength,
7950 const UChar *target, int32_t targetLength,
7951 UErrorCode *status) {
7952 collIterate sColl, tColl;
7953 // Preparing the context objects for iterating over strings
7954 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7955 IInit_collIterate(coll, target, targetLength, &tColl, status);
7956 if(U_FAILURE(*status)) {
7957 return UCOL_LESS;
7958 }
7959 return ucol_strcollRegular(&sColl, &tColl, status);
7960 }
7961
7962 static inline uint32_t
7963 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7964 uint32_t CE, const UChar *s, int32_t *index, int32_t l en)
7965 {
7966 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7967 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7968 int32_t offset = 1;
7969 UChar schar = 0, tchar = 0;
7970
7971 for(;;) {
7972 if(len == -1) {
7973 if(s[*index] == 0) { // end of string
7974 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);
7975 } else {
7976 schar = s[*index];
7977 }
7978 } else {
7979 if(*index == len) {
7980 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);
7981 } else {
7982 schar = s[*index];
7983 }
7984 }
7985
7986 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio n codepoints should be ordered, we skip all that are smaller */
7987 offset++;
7988 }
7989
7990 if (schar == tchar) {
7991 (*index)++;
7992 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set+offset]);
7993 }
7994 else
7995 {
7996 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7997 return UCOL_BAIL_OUT_CE;
7998 }
7999 // skip completely ignorables
8000 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8001 if(isZeroCE == 0) { // we have to ignore completely ignorables
8002 (*index)++;
8003 continue;
8004 }
8005
8006 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);
8007 }
8008 }
8009 }
8010
8011
8012 /**
8013 * This is a fast strcoll, geared towards text in Latin-1.
8014 * It supports contractions of size two, French secondaries
8015 * and case switching. You can use it with strengths primary
8016 * to tertiary. It does not support shifted and case level.
8017 * It relies on the table build by setupLatin1Table. If it
8018 * doesn't understand something, it will go to the regular
8019 * strcoll.
8020 */
8021 static UCollationResult
8022 ucol_strcollUseLatin1( const UCollator *coll,
8023 const UChar *source,
8024 int32_t sLen,
8025 const UChar *target,
8026 int32_t tLen,
8027 UErrorCode *status)
8028 {
8029 U_ALIGN_CODE(16);
8030 int32_t strength = coll->strength;
8031
8032 int32_t sIndex = 0, tIndex = 0;
8033 UChar sChar = 0, tChar = 0;
8034 uint32_t sOrder=0, tOrder=0;
8035
8036 UBool endOfSource = FALSE;
8037
8038 uint32_t *elements = coll->latinOneCEs;
8039
8040 UBool haveContractions = FALSE; // if we have contractions in our string
8041 // we cannot do French secondary
8042
8043 // Do the primary level
8044 for(;;) {
8045 while(sOrder==0) { // this loop skips primary ignorables
8046 // sOrder=getNextlatinOneCE(source);
8047 if(sLen==-1) { // handling zero terminated strings
8048 sChar=source[sIndex++];
8049 if(sChar==0) {
8050 endOfSource = TRUE;
8051 break;
8052 }
8053 } else { // handling strings with known length
8054 if(sIndex==sLen) {
8055 endOfSource = TRUE;
8056 break;
8057 }
8058 sChar=source[sIndex++];
8059 }
8060 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha r > 0xFF, but this is faster on win32)
8061 //fprintf(stderr, "R");
8062 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
8063 }
8064 sOrder = elements[sChar];
8065 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8066 // specials can basically be either contractions or bail-out sig ns. If we get anything
8067 // else, we'll bail out anywasy
8068 if(getCETag(sOrder) == CONTRACTION_TAG) {
8069 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr der, source, &sIndex, sLen);
8070 haveContractions = TRUE; // if there are contractions, we ca nnot do French secondary
8071 // However, if there are contractions in the table, but we a lways use just one char,
8072 // we might be able to do French. This should be checked out .
8073 }
8074 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8075 //fprintf(stderr, "S");
8076 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8077 }
8078 }
8079 }
8080
8081 while(tOrder==0) { // this loop skips primary ignorables
8082 // tOrder=getNextlatinOneCE(target);
8083 if(tLen==-1) { // handling zero terminated strings
8084 tChar=target[tIndex++];
8085 if(tChar==0) {
8086 if(endOfSource) { // this is different than source loop,
8087 // as we already know that source loop is done here,
8088 // so we can either finish the primary loop if both
8089 // strings are done or anounce the result if only
8090 // target is done. Same below.
8091 goto endOfPrimLoop;
8092 } else {
8093 return UCOL_GREATER;
8094 }
8095 }
8096 } else { // handling strings with known length
8097 if(tIndex==tLen) {
8098 if(endOfSource) {
8099 goto endOfPrimLoop;
8100 } else {
8101 return UCOL_GREATER;
8102 }
8103 }
8104 tChar=target[tIndex++];
8105 }
8106 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha r > 0xFF, but this is faster on win32)
8107 //fprintf(stderr, "R");
8108 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
8109 }
8110 tOrder = elements[tChar];
8111 if(tOrder >= UCOL_NOT_FOUND) {
8112 // Handling specials, see the comments for source
8113 if(getCETag(tOrder) == CONTRACTION_TAG) {
8114 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr der, target, &tIndex, tLen);
8115 haveContractions = TRUE;
8116 }
8117 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8118 //fprintf(stderr, "S");
8119 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8120 }
8121 }
8122 }
8123 if(endOfSource) { // source is finished, but target is not, say the resu lt.
8124 return UCOL_LESS;
8125 }
8126
8127 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8128 sOrder = 0; tOrder = 0;
8129 continue;
8130 } else {
8131 // compare current top bytes
8132 if(((sOrder^tOrder)&0xFF000000)!=0) {
8133 // top bytes differ, return difference
8134 if(sOrder < tOrder) {
8135 return UCOL_LESS;
8136 } else if(sOrder > tOrder) {
8137 return UCOL_GREATER;
8138 }
8139 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24) ;
8140 // since we must return enum value
8141 }
8142
8143 // top bytes match, continue with following bytes
8144 sOrder<<=8;
8145 tOrder<<=8;
8146 }
8147 }
8148
8149 endOfPrimLoop:
8150 // after primary loop, we definitely know the sizes of strings,
8151 // so we set it and use simpler loop for secondaries and tertiaries
8152 sLen = sIndex; tLen = tIndex;
8153 if(strength >= UCOL_SECONDARY) {
8154 // adjust the table beggining
8155 elements += coll->latinOneTableLen;
8156 endOfSource = FALSE;
8157
8158 if(coll->frenchCollation == UCOL_OFF) { // non French
8159 // This loop is a simplified copy of primary loop
8160 // at this point we know that whole strings are latin-1, so we don't
8161 // check for that. We also know that we only have contractions as
8162 // specials.
8163 sIndex = 0; tIndex = 0;
8164 for(;;) {
8165 while(sOrder==0) {
8166 if(sIndex==sLen) {
8167 endOfSource = TRUE;
8168 break;
8169 }
8170 sChar=source[sIndex++];
8171 sOrder = elements[sChar];
8172 if(sOrder > UCOL_NOT_FOUND) {
8173 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, sOrder, source, &sIndex, sLen);
8174 }
8175 }
8176
8177 while(tOrder==0) {
8178 if(tIndex==tLen) {
8179 if(endOfSource) {
8180 goto endOfSecLoop;
8181 } else {
8182 return UCOL_GREATER;
8183 }
8184 }
8185 tChar=target[tIndex++];
8186 tOrder = elements[tChar];
8187 if(tOrder > UCOL_NOT_FOUND) {
8188 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, tOrder, target, &tIndex, tLen);
8189 }
8190 }
8191 if(endOfSource) {
8192 return UCOL_LESS;
8193 }
8194
8195 if(sOrder == tOrder) {
8196 sOrder = 0; tOrder = 0;
8197 continue;
8198 } else {
8199 // see primary loop for comments on this
8200 if(((sOrder^tOrder)&0xFF000000)!=0) {
8201 if(sOrder < tOrder) {
8202 return UCOL_LESS;
8203 } else if(sOrder > tOrder) {
8204 return UCOL_GREATER;
8205 }
8206 }
8207 sOrder<<=8;
8208 tOrder<<=8;
8209 }
8210 }
8211 } else { // French
8212 if(haveContractions) { // if we have contractions, we have to bail o ut
8213 // since we don't really know how to handle them here
8214 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
8215 }
8216 // For French, we go backwards
8217 sIndex = sLen; tIndex = tLen;
8218 for(;;) {
8219 while(sOrder==0) {
8220 if(sIndex==0) {
8221 endOfSource = TRUE;
8222 break;
8223 }
8224 sChar=source[--sIndex];
8225 sOrder = elements[sChar];
8226 // don't even look for contractions
8227 }
8228
8229 while(tOrder==0) {
8230 if(tIndex==0) {
8231 if(endOfSource) {
8232 goto endOfSecLoop;
8233 } else {
8234 return UCOL_GREATER;
8235 }
8236 }
8237 tChar=target[--tIndex];
8238 tOrder = elements[tChar];
8239 // don't even look for contractions
8240 }
8241 if(endOfSource) {
8242 return UCOL_LESS;
8243 }
8244
8245 if(sOrder == tOrder) {
8246 sOrder = 0; tOrder = 0;
8247 continue;
8248 } else {
8249 // see the primary loop for comments
8250 if(((sOrder^tOrder)&0xFF000000)!=0) {
8251 if(sOrder < tOrder) {
8252 return UCOL_LESS;
8253 } else if(sOrder > tOrder) {
8254 return UCOL_GREATER;
8255 }
8256 }
8257 sOrder<<=8;
8258 tOrder<<=8;
8259 }
8260 }
8261 }
8262 }
8263
8264 endOfSecLoop:
8265 if(strength >= UCOL_TERTIARY) {
8266 // tertiary loop is the same as secondary (except no French)
8267 elements += coll->latinOneTableLen;
8268 sIndex = 0; tIndex = 0;
8269 endOfSource = FALSE;
8270 for(;;) {
8271 while(sOrder==0) {
8272 if(sIndex==sLen) {
8273 endOfSource = TRUE;
8274 break;
8275 }
8276 sChar=source[sIndex++];
8277 sOrder = elements[sChar];
8278 if(sOrder > UCOL_NOT_FOUND) {
8279 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO rder, source, &sIndex, sLen);
8280 }
8281 }
8282 while(tOrder==0) {
8283 if(tIndex==tLen) {
8284 if(endOfSource) {
8285 return UCOL_EQUAL; // if both strings are at the end, th ey are equal
8286 } else {
8287 return UCOL_GREATER;
8288 }
8289 }
8290 tChar=target[tIndex++];
8291 tOrder = elements[tChar];
8292 if(tOrder > UCOL_NOT_FOUND) {
8293 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO rder, target, &tIndex, tLen);
8294 }
8295 }
8296 if(endOfSource) {
8297 return UCOL_LESS;
8298 }
8299 if(sOrder == tOrder) {
8300 sOrder = 0; tOrder = 0;
8301 continue;
8302 } else {
8303 if(((sOrder^tOrder)&0xff000000)!=0) {
8304 if(sOrder < tOrder) {
8305 return UCOL_LESS;
8306 } else if(sOrder > tOrder) {
8307 return UCOL_GREATER;
8308 }
8309 }
8310 sOrder<<=8;
8311 tOrder<<=8;
8312 }
8313 }
8314 }
8315 return UCOL_EQUAL;
8316 }
8317
8318
8319 U_CAPI UCollationResult U_EXPORT2
8320 ucol_strcollIter( const UCollator *coll,
8321 UCharIterator *sIter,
8322 UCharIterator *tIter,
8323 UErrorCode *status)
8324 {
8325 if(!status || U_FAILURE(*status)) {
8326 return UCOL_EQUAL;
8327 }
8328
8329 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8330 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt er);
8331
8332 if (sIter == tIter) {
8333 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8334 return UCOL_EQUAL;
8335 }
8336 if(sIter == NULL || tIter == NULL || coll == NULL) {
8337 *status = U_ILLEGAL_ARGUMENT_ERROR;
8338 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8339 return UCOL_EQUAL;
8340 }
8341
8342 UCollationResult result = UCOL_EQUAL;
8343
8344 // Preparing the context objects for iterating over strings
8345 collIterate sColl, tColl;
8346 IInit_collIterate(coll, NULL, -1, &sColl, status);
8347 IInit_collIterate(coll, NULL, -1, &tColl, status);
8348 if(U_FAILURE(*status)) {
8349 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8350 return UCOL_EQUAL;
8351 }
8352 // The division for the array length may truncate the array size to
8353 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8354 // for all platforms anyway.
8355 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8356 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8357 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8358
8359 sColl.iterator = sIter;
8360 sColl.flags |= UCOL_USE_ITERATOR;
8361 tColl.flags |= UCOL_USE_ITERATOR;
8362 tColl.iterator = tIter;
8363
8364 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8365 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu s);
8366 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8367 sColl.flags &= ~UCOL_ITER_NORM;
8368
8369 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu s);
8370 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8371 tColl.flags &= ~UCOL_ITER_NORM;
8372 }
8373
8374 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8375
8376 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8377 (tChar = tColl.iterator->next(tColl.iterator))) {
8378 if(sChar == U_SENTINEL) {
8379 result = UCOL_EQUAL;
8380 goto end_compare;
8381 }
8382 }
8383
8384 if(sChar == U_SENTINEL) {
8385 tChar = tColl.iterator->previous(tColl.iterator);
8386 }
8387
8388 if(tChar == U_SENTINEL) {
8389 sChar = sColl.iterator->previous(sColl.iterator);
8390 }
8391
8392 sChar = sColl.iterator->previous(sColl.iterator);
8393 tChar = tColl.iterator->previous(tColl.iterator);
8394
8395 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8396 {
8397 // We are stopped in the middle of a contraction.
8398 // Scan backwards through the == part of the string looking for the start of the contraction.
8399 // It doesn't matter which string we scan, since they are the same in this region.
8400 do
8401 {
8402 sChar = sColl.iterator->previous(sColl.iterator);
8403 tChar = tColl.iterator->previous(tColl.iterator);
8404 }
8405 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8406 }
8407
8408
8409 if(U_SUCCESS(*status)) {
8410 result = ucol_strcollRegular(&sColl, &tColl, status);
8411 }
8412
8413 end_compare:
8414 if(sNormIter || tNormIter) {
8415 unorm_closeIter(sNormIter);
8416 unorm_closeIter(tNormIter);
8417 }
8418
8419 UTRACE_EXIT_VALUE_STATUS(result, *status)
8420 return result;
8421 }
8422
8423
8424 /* */
8425 /* ucol_strcoll Main public API string comparison function */
8426 /* */
8427 U_CAPI UCollationResult U_EXPORT2
8428 ucol_strcoll( const UCollator *coll,
8429 const UChar *source,
8430 int32_t sourceLength,
8431 const UChar *target,
8432 int32_t targetLength)
8433 {
8434 U_ALIGN_CODE(16);
8435
8436 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8437 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8438 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);
8439 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt h);
8440 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt h);
8441 }
8442
8443 if(source == NULL || target == NULL) {
8444 // do not crash, but return. Should have
8445 // status argument to return error.
8446 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8447 return UCOL_EQUAL;
8448 }
8449
8450 /* Quick check if source and target are same strings. */
8451 /* They should either both be NULL terminated or the explicit length should be set on both. */
8452 if (source==target && sourceLength==targetLength) {
8453 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8454 return UCOL_EQUAL;
8455 }
8456
8457 /* Scan the strings. Find: */
8458 /* The length of any leading portion that is equal */
8459 /* Whether they are exactly equal. (in which case we just return) */
8460 const UChar *pSrc = source;
8461 const UChar *pTarg = target;
8462 int32_t equalLength;
8463
8464 if (sourceLength == -1 && targetLength == -1) {
8465 // Both strings are null terminated.
8466 // Scan through any leading equal portion.
8467 while (*pSrc == *pTarg && *pSrc != 0) {
8468 pSrc++;
8469 pTarg++;
8470 }
8471 if (*pSrc == 0 && *pTarg == 0) {
8472 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8473 return UCOL_EQUAL;
8474 }
8475 equalLength = (int32_t)(pSrc - source);
8476 }
8477 else
8478 {
8479 // One or both strings has an explicit length.
8480 const UChar *pSrcEnd = source + sourceLength;
8481 const UChar *pTargEnd = target + targetLength;
8482
8483 // Scan while the strings are bitwise ==, or until one is exhausted.
8484 for (;;) {
8485 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8486 break;
8487 }
8488 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng th == -1)) {
8489 break;
8490 }
8491 if (*pSrc != *pTarg) {
8492 break;
8493 }
8494 pSrc++;
8495 pTarg++;
8496 }
8497 equalLength = (int32_t)(pSrc - source);
8498
8499 // If we made it all the way through both strings, we are done. They ar e ==
8500 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8501 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8502 {
8503 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8504 return UCOL_EQUAL;
8505 }
8506 }
8507 if (equalLength > 0) {
8508 /* There is an identical portion at the beginning of the two strings. */
8509 /* If the identical portion ends within a contraction or a comibining */
8510 /* character sequence, back up to the start of that sequence. */
8511
8512 // These values should already be set by the code above.
8513 //pSrc = source + equalLength; /* point to the first differing c hars */
8514 //pTarg = target + equalLength;
8515 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8516 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8517 {
8518 // We are stopped in the middle of a contraction.
8519 // Scan backwards through the == part of the string looking for the start of the contraction.
8520 // It doesn't matter which string we scan, since they are the same in this region.
8521 do
8522 {
8523 equalLength--;
8524 pSrc--;
8525 }
8526 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8527 }
8528
8529 source += equalLength;
8530 target += equalLength;
8531 if (sourceLength > 0) {
8532 sourceLength -= equalLength;
8533 }
8534 if (targetLength > 0) {
8535 targetLength -= equalLength;
8536 }
8537 }
8538
8539 UErrorCode status = U_ZERO_ERROR;
8540 UCollationResult returnVal;
8541 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLeng th > 0 && *target&0xff00)) {
8542 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ etLength, &status);
8543 } else {
8544 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta rgetLength, &status);
8545 }
8546 UTRACE_EXIT_VALUE(returnVal);
8547 return returnVal;
8548 }
8549
8550 /* convenience function for comparing strings */
8551 U_CAPI UBool U_EXPORT2
8552 ucol_greater( const UCollator *coll,
8553 const UChar *source,
8554 int32_t sourceLength,
8555 const UChar *target,
8556 int32_t targetLength)
8557 {
8558 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8559 == UCOL_GREATER);
8560 }
8561
8562 /* convenience function for comparing strings */
8563 U_CAPI UBool U_EXPORT2
8564 ucol_greaterOrEqual( const UCollator *coll,
8565 const UChar *source,
8566 int32_t sourceLength,
8567 const UChar *target,
8568 int32_t targetLength)
8569 {
8570 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8571 != UCOL_LESS);
8572 }
8573
8574 /* convenience function for comparing strings */
8575 U_CAPI UBool U_EXPORT2
8576 ucol_equal( const UCollator *coll,
8577 const UChar *source,
8578 int32_t sourceLength,
8579 const UChar *target,
8580 int32_t targetLength)
8581 {
8582 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8583 == UCOL_EQUAL);
8584 }
8585
8586 U_CAPI void U_EXPORT2
8587 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8588 if(coll && coll->UCA) {
8589 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8590 }
8591 }
8592
8593 #endif /* #if !UCONFIG_NO_COLLATION */
OLDNEW
« no previous file with comments | « icu46/source/i18n/ucln_in.c ('k') | icu46/source/i18n/ucol_bld.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698