Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(103)

Side by Side Diff: source/i18n/ucol.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unusued directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/ucln_in.cpp ('k') | source/i18n/ucol_bld.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines 3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************* 5 *******************************************************************************
6 * file name: ucol.cpp 6 * file name: ucol.cpp
7 * encoding: US-ASCII 7 * encoding: US-ASCII
8 * tab size: 8 (not used) 8 * tab size: 8 (not used)
9 * indentation:4 9 * indentation:4
10 * 10 *
11 * Modification history 11 * Modification history
12 * Date Name Comments 12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework 13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality. 15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 * 2012-2014 markus Rewritten in C++ again.
17 */ 18 */
18 19
19 #include "unicode/utypes.h" 20 #include "unicode/utypes.h"
20 21
21 #if !UCONFIG_NO_COLLATION 22 #if !UCONFIG_NO_COLLATION
22 23
24 #include "unicode/coll.h"
25 #include "unicode/tblcoll.h"
23 #include "unicode/bytestream.h" 26 #include "unicode/bytestream.h"
24 #include "unicode/coleitr.h" 27 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h" 28 #include "unicode/ucoleitr.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h" 29 #include "unicode/ustring.h"
28 #include "unicode/utf8.h"
29
30 #include "ucol_imp.h"
31 #include "bocsu.h"
32
33 #include "normalizer2impl.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h" 30 #include "cmemory.h"
37 #include "ucln_in.h" 31 #include "collation.h"
38 #include "cstring.h" 32 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h" 33 #include "putilimp.h"
41 #include "uassert.h" 34 #include "uassert.h"
42 #include "unicode/coll.h" 35 #include "utracimp.h"
43
44 #ifdef UCOL_DEBUG
45 #include <stdio.h>
46 #endif
47 36
48 U_NAMESPACE_USE 37 U_NAMESPACE_USE
49 38
50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
51
52 #define LAST_BYTE_MASK_ 0xFF
53 #define SECOND_LAST_BYTE_SHIFT_ 8
54
55 #define ZERO_CC_LIMIT_ 0xC0
56
57 // These are static pointers to the NFC/NFD implementation instance.
58 // Each of them is always the same between calls to u_cleanup
59 // and therefore writing to it is not synchronized.
60 // They are cleaned in ucol_cleanup
61 static const Normalizer2 *g_nfd = NULL;
62 static const Normalizer2Impl *g_nfcImpl = NULL;
63
64 // These are values from UCA required for
65 // implicit generation and supressing sort key compression
66 // they should regularly be in the UCA, but if one
67 // is running without UCA, it could be a problem
68 static const int32_t maxRegularPrimary = 0x7A;
69 static const int32_t minImplicitPrimary = 0xE0;
70 static const int32_t maxImplicitPrimary = 0xE4;
71
72 U_CDECL_BEGIN
73 static UBool U_CALLCONV
74 ucol_cleanup(void)
75 {
76 g_nfd = NULL;
77 g_nfcImpl = NULL;
78 return TRUE;
79 }
80
81 static int32_t U_CALLCONV
82 _getFoldingOffset(uint32_t data) {
83 return (int32_t)(data&0xFFFFFF);
84 }
85
86 U_CDECL_END
87
88 static inline
89 UBool initializeNFD(UErrorCode *status) {
90 if (g_nfd != NULL) {
91 return TRUE;
92 } else {
93 // The result is constant, until the library is reloaded.
94 g_nfd = Normalizer2Factory::getNFDInstance(*status);
95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96 return U_SUCCESS(*status);
97 }
98 }
99
100 // init FCD data
101 static inline
102 UBool initializeFCD(UErrorCode *status) {
103 if (g_nfcImpl != NULL) {
104 return TRUE;
105 } else {
106 // The result is constant, until the library is reloaded.
107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
108 // Note: Alternatively, we could also store this pointer in each collIterate struct,
109 // same as Normalizer2Factory::getImpl(collIterate->nfd).
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
111 return U_SUCCESS(*status);
112 }
113 }
114
115 static
116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri ng,
117 int32_t sourceLen, collIterate *s,
118 UErrorCode *status)
119 {
120 (s)->string = (s)->pos = sourceString;
121 (s)->origFlags = 0;
122 (s)->flags = 0;
123 if (sourceLen >= 0) {
124 s->flags |= UCOL_ITER_HASLEN;
125 (s)->endp = (UChar *)sourceString+sourceLen;
126 }
127 else {
128 /* change to enable easier checking for end of string for fcdpositon */
129 (s)->endp = NULL;
130 }
131 (s)->extendCEs = NULL;
132 (s)->extendCEsSize = 0;
133 (s)->CEpos = (s)->toReturn = (s)->CEs;
134 (s)->offsetBuffer = NULL;
135 (s)->offsetBufferSize = 0;
136 (s)->offsetReturn = (s)->offsetStore = NULL;
137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
138 (s)->coll = (collator);
139 if (initializeNFD(status)) {
140 (s)->nfd = g_nfd;
141 } else {
142 return;
143 }
144 (s)->fcdPosition = 0;
145 if(collator->normalizationMode == UCOL_ON) {
146 (s)->flags |= UCOL_ITER_NORM;
147 }
148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
149 (s)->flags |= UCOL_HIRAGANA_Q;
150 }
151 (s)->iterator = NULL;
152 //(s)->iteratorIndex = 0;
153 }
154
155 U_CAPI void U_EXPORT2
156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
157 int32_t sourceLen, collIterate *s,
158 UErrorCode *status) {
159 /* Out-of-line version for use from other files. */
160 IInit_collIterate(collator, sourceString, sourceLen, s, status);
161 }
162
163 U_CAPI collIterate * U_EXPORT2
164 uprv_new_collIterate(UErrorCode *status) {
165 if(U_FAILURE(*status)) {
166 return NULL;
167 }
168 collIterate *s = new collIterate;
169 if(s == NULL) {
170 *status = U_MEMORY_ALLOCATION_ERROR;
171 return NULL;
172 }
173 return s;
174 }
175
176 U_CAPI void U_EXPORT2
177 uprv_delete_collIterate(collIterate *s) {
178 delete s;
179 }
180
181 U_CAPI UBool U_EXPORT2
182 uprv_collIterateAtEnd(collIterate *s) {
183 return s == NULL || s->pos == s->endp;
184 }
185
186 /**
187 * Backup the state of the collIterate struct data
188 * @param data collIterate to backup
189 * @param backup storage
190 */
191 static
192 inline void backupState(const collIterate *data, collIterateState *backup)
193 {
194 backup->fcdPosition = data->fcdPosition;
195 backup->flags = data->flags;
196 backup->origFlags = data->origFlags;
197 backup->pos = data->pos;
198 backup->bufferaddress = data->writableBuffer.getBuffer();
199 backup->buffersize = data->writableBuffer.length();
200 backup->iteratorMove = 0;
201 backup->iteratorIndex = 0;
202 if(data->iterator != NULL) {
203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
204 backup->iteratorIndex = data->iterator->getState(data->iterator);
205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
206 if(backup->iteratorIndex == UITER_NO_STATE) {
207 while((backup->iteratorIndex = data->iterator->getState(data->iterat or)) == UITER_NO_STATE) {
208 backup->iteratorMove++;
209 data->iterator->move(data->iterator, -1, UITER_CURRENT);
210 }
211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR RENT);
212 }
213 }
214 }
215
216 /**
217 * Loads the state into the collIterate struct data
218 * @param data collIterate to backup
219 * @param backup storage
220 * @param forwards boolean to indicate if forwards iteration is used,
221 * false indicates backwards iteration
222 */
223 static
224 inline void loadState(collIterate *data, const collIterateState *backup,
225 UBool forwards)
226 {
227 UErrorCode status = U_ZERO_ERROR;
228 data->flags = backup->flags;
229 data->origFlags = backup->origFlags;
230 if(data->iterator != NULL) {
231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO );
232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status) ;
233 if(backup->iteratorMove != 0) {
234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR RENT);
235 }
236 }
237 data->pos = backup->pos;
238
239 if ((data->flags & UCOL_ITER_INNORMBUF) &&
240 data->writableBuffer.getBuffer() != backup->bufferaddress) {
241 /*
242 this is when a new buffer has been reallocated and we'll have to
243 calculate the new position.
244 note the new buffer has to contain the contents of the old buffer.
245 */
246 if (forwards) {
247 data->pos = data->writableBuffer.getTerminatedBuffer() +
248 (data->pos - backup->bufferaddress);
249 }
250 else {
251 /* backwards direction */
252 int32_t temp = backup->buffersize -
253 (int32_t)(data->pos - backup->bufferaddress);
254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writ ableBuffer.length() - temp);
255 }
256 }
257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
258 /*
259 this is alittle tricky.
260 if we are initially not in the normalization buffer, even if we
261 normalize in the later stage, the data in the buffer will be
262 ignored, since we skip back up to the data string.
263 however if we are already in the normalization buffer, any
264 further normalization will pull data into the normalization
265 buffer and modify the fcdPosition.
266 since we are keeping the data in the buffer for use, the
267 fcdPosition can not be reverted back.
268 arrgghh....
269 */
270 data->fcdPosition = backup->fcdPosition;
271 }
272 }
273
274 static UBool
275 reallocCEs(collIterate *data, int32_t newCapacity) {
276 uint32_t *oldCEs = data->extendCEs;
277 if(oldCEs == NULL) {
278 oldCEs = data->CEs;
279 }
280 int32_t length = data->CEpos - oldCEs;
281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
282 if(newCEs == NULL) {
283 return FALSE;
284 }
285 uprv_memcpy(newCEs, oldCEs, length * 4);
286 uprv_free(data->extendCEs);
287 data->extendCEs = newCEs;
288 data->extendCEsSize = newCapacity;
289 data->CEpos = newCEs + length;
290 return TRUE;
291 }
292
293 static UBool
294 increaseCEsCapacity(collIterate *data) {
295 int32_t oldCapacity;
296 if(data->extendCEs != NULL) {
297 oldCapacity = data->extendCEsSize;
298 } else {
299 oldCapacity = LENGTHOF(data->CEs);
300 }
301 return reallocCEs(data, 2 * oldCapacity);
302 }
303
304 static UBool
305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
306 int32_t oldCapacity;
307 if(data->extendCEs != NULL) {
308 oldCapacity = data->extendCEsSize;
309 } else {
310 oldCapacity = LENGTHOF(data->CEs);
311 }
312 if(minCapacity <= oldCapacity) {
313 return TRUE;
314 }
315 oldCapacity *= 2;
316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacit y);
317 }
318
319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
320 if(U_FAILURE(errorCode)) {
321 return;
322 }
323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuf fer);
324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
325 if(length >= offsetBufferSize) {
326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4) );
328 if(newBuffer == NULL) {
329 errorCode = U_MEMORY_ALLOCATION_ERROR;
330 return;
331 }
332 if(length > 0) {
333 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
334 }
335 uprv_free(offsetBuffer);
336 offsetBuffer = newBuffer;
337 offsetStore = offsetBuffer + length;
338 offsetBufferSize = newCapacity;
339 }
340 *offsetStore++ = offset;
341 }
342
343 /*
344 * collIter_eos()
345 * Checks for a collIterate being positioned at the end of
346 * its source string.
347 *
348 */
349 static
350 inline UBool collIter_eos(collIterate *s) {
351 if(s->flags & UCOL_USE_ITERATOR) {
352 return !(s->iterator->hasNext(s->iterator));
353 }
354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
355 // Null terminated string, but not at null, so not at end.
356 // Whether in main or normalization buffer doesn't matter.
357 return FALSE;
358 }
359
360 // String with length. Can't be in normalization buffer, which is always
361 // null termintated.
362 if (s->flags & UCOL_ITER_HASLEN) {
363 return (s->pos == s->endp);
364 }
365
366 // We are at a null termination, could be either normalization buffer or main string.
367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
368 // At null at end of main string.
369 return TRUE;
370 }
371
372 // At null at end of normalization buffer. Need to check whether there are
373 // any characters left in the main buffer.
374 if(s->origFlags & UCOL_USE_ITERATOR) {
375 return !(s->iterator->hasNext(s->iterator));
376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
377 // Null terminated main string. fcdPosition is the 'return' position into main buf.
378 return (*s->fcdPosition == 0);
379 }
380 else {
381 // Main string with an end pointer.
382 return s->fcdPosition == s->endp;
383 }
384 }
385
386 /*
387 * collIter_bos()
388 * Checks for a collIterate being positioned at the start of
389 * its source string.
390 *
391 */
392 static
393 inline UBool collIter_bos(collIterate *source) {
394 // if we're going backwards, we need to know whether there is more in the
395 // iterator, even if we are in the side buffer
396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
397 return !source->iterator->hasPrevious(source->iterator);
398 }
399 if (source->pos <= source->string ||
400 ((source->flags & UCOL_ITER_INNORMBUF) &&
401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
402 return TRUE;
403 }
404 return FALSE;
405 }
406
407 /*static
408 inline UBool collIter_SimpleBos(collIterate *source) {
409 // if we're going backwards, we need to know whether there is more in the
410 // iterator, even if we are in the side buffer
411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
412 return !source->iterator->hasPrevious(source->iterator);
413 }
414 if (source->pos == source->string) {
415 return TRUE;
416 }
417 return FALSE;
418 }*/
419 //return (data->pos == data->string) ||
420
421
422 /****************************************************************************/
423 /* Following are the open/close functions */
424 /* */
425 /****************************************************************************/
426
427 static UCollator*
428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
429 const UCollator *base,
430 UCollator *fillIn,
431 UErrorCode *status)
432 {
433 UCollator *result = fillIn;
434 if(U_FAILURE(*status)) {
435 return NULL;
436 }
437 /*
438 if(base == NULL) {
439 // we don't support null base yet
440 *status = U_ILLEGAL_ARGUMENT_ERROR;
441 return NULL;
442 }
443 */
444 // We need these and we could be running without UCA
445 uprv_uca_initImplicitConstants(status);
446 UCATableHeader *colData = (UCATableHeader *)bin;
447 // do we want version check here? We're trying to figure out whether collators are compatible
448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeo f(UVersionInfo)) != 0 ||
449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersio nInfo)) != 0)) ||
450 colData->version[0] != UCOL_BUILDER_VERSION)
451 {
452 *status = U_COLLATOR_VERSION_MISMATCH;
453 return NULL;
454 }
455 else {
456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(s izeof(UColOptionSet)))) {
457 result = ucol_initCollator((const UCATableHeader *)bin, result, base , status);
458 if(U_FAILURE(*status)){
459 return NULL;
460 }
461 result->hasRealData = TRUE;
462 }
463 else {
464 if(base) {
465 result = ucol_initCollator(base->image, result, base, status);
466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
467 if(U_FAILURE(*status)){
468 return NULL;
469 }
470 result->hasRealData = FALSE;
471 }
472 else {
473 *status = U_USELESS_COLLATOR_ERROR;
474 return NULL;
475 }
476 }
477 result->freeImageOnClose = FALSE;
478 }
479 result->actualLocale = NULL;
480 result->validLocale = NULL;
481 result->requestedLocale = NULL;
482 result->rules = NULL;
483 result->rulesLength = 0;
484 result->freeRulesOnClose = FALSE;
485 result->ucaRules = NULL;
486 return result;
487 }
488
489 U_CAPI UCollator* U_EXPORT2 39 U_CAPI UCollator* U_EXPORT2
490 ucol_openBinary(const uint8_t *bin, int32_t length, 40 ucol_openBinary(const uint8_t *bin, int32_t length,
491 const UCollator *base, 41 const UCollator *base,
492 UErrorCode *status) 42 UErrorCode *status)
493 { 43 {
494 return ucol_initFromBinary(bin, length, base, NULL, status); 44 if(U_FAILURE(*status)) { return NULL; }
45 RuleBasedCollator *coll = new RuleBasedCollator(
46 bin, length,
47 RuleBasedCollator::rbcFromUCollator(base),
48 *status);
49 if(coll == NULL) {
50 *status = U_MEMORY_ALLOCATION_ERROR;
51 return NULL;
52 }
53 if(U_FAILURE(*status)) {
54 delete coll;
55 return NULL;
56 }
57 return coll->toUCollator();
495 } 58 }
496 59
497 U_CAPI int32_t U_EXPORT2 60 U_CAPI int32_t U_EXPORT2
498 ucol_cloneBinary(const UCollator *coll, 61 ucol_cloneBinary(const UCollator *coll,
499 uint8_t *buffer, int32_t capacity, 62 uint8_t *buffer, int32_t capacity,
500 UErrorCode *status) 63 UErrorCode *status)
501 { 64 {
502 int32_t length = 0;
503 if(U_FAILURE(*status)) { 65 if(U_FAILURE(*status)) {
504 return length; 66 return 0;
505 } 67 }
506 if(capacity < 0) { 68 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
507 *status = U_ILLEGAL_ARGUMENT_ERROR; 69 if(rbc == NULL && coll != NULL) {
508 return length; 70 *status = U_UNSUPPORTED_ERROR;
71 return 0;
509 } 72 }
510 if(coll->hasRealData == TRUE) { 73 return rbc->cloneBinary(buffer, capacity, *status);
511 length = coll->image->size;
512 if(length <= capacity) {
513 uprv_memcpy(buffer, coll->image, length);
514 } else {
515 *status = U_BUFFER_OVERFLOW_ERROR;
516 }
517 } else {
518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof( UColOptionSet)));
519 if(length <= capacity) {
520 /* build the UCATableHeader with minimal entries */
521 /* do not copy the header from the UCA file because its values are wrong! */
522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
523
524 /* reset everything */
525 uprv_memset(buffer, 0, length);
526
527 /* set the tailoring-specific values */
528 UCATableHeader *myData = (UCATableHeader *)buffer;
529 myData->size = length;
530
531 /* offset for the options, the only part of the data that is present after the header */
532 myData->options = sizeof(UCATableHeader);
533
534 /* need to always set the expansion value for an upper bound of the options */
535 myData->expansion = myData->options + sizeof(UColOptionSet);
536
537 myData->magic = UCOL_HEADER_MAGIC;
538 myData->isBigEndian = U_IS_BIG_ENDIAN;
539 myData->charSetFamily = U_CHARSET_FAMILY;
540
541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn fo));
543
544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer sionInfo));
545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer sionInfo));
546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo f(UVersionInfo));
547 myData->jamoSpecial = coll->image->jamoSpecial;
548
549 /* copy the collator options */
550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options , sizeof(UColOptionSet));
551 } else {
552 *status = U_BUFFER_OVERFLOW_ERROR;
553 }
554 }
555 return length;
556 } 74 }
557 75
558 U_CAPI UCollator* U_EXPORT2 76 U_CAPI UCollator* U_EXPORT2
559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS ize, UErrorCode *status) 77 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS ize, UErrorCode *status)
560 { 78 {
561 UCollator * localCollator;
562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
563 int32_t imageSize = 0;
564 int32_t rulesSize = 0;
565 int32_t rulesPadding = 0;
566 int32_t defaultReorderCodesSize = 0;
567 int32_t reorderCodesSize = 0;
568 uint8_t *image;
569 UChar *rules;
570 int32_t* defaultReorderCodes;
571 int32_t* reorderCodes;
572 uint8_t* leadBytePermutationTable;
573 UBool imageAllocated = FALSE;
574
575 if (status == NULL || U_FAILURE(*status)){ 79 if (status == NULL || U_FAILURE(*status)){
576 return NULL; 80 return NULL;
577 } 81 }
578 if (coll == NULL) { 82 if (coll == NULL) {
579 *status = U_ILLEGAL_ARGUMENT_ERROR; 83 *status = U_ILLEGAL_ARGUMENT_ERROR;
580 return NULL; 84 return NULL;
581 } 85 }
582
583 if (coll->rules && coll->freeRulesOnClose) {
584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
586 bufferSizeNeeded += rulesSize + rulesPadding;
587 }
588 // no padding for alignment needed from here since the next two are 4 byte quantities
589 if (coll->defaultReorderCodes) {
590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32 _t);
591 bufferSizeNeeded += defaultReorderCodesSize;
592 }
593 if (coll->reorderCodes) {
594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
595 bufferSizeNeeded += reorderCodesSize;
596 }
597 if (coll->leadBytePermutationTable) {
598 bufferSizeNeeded += 256 * sizeof(uint8_t);
599 }
600
601 if (pBufferSize != NULL) { 86 if (pBufferSize != NULL) {
602 int32_t inputSize = *pBufferSize; 87 int32_t inputSize = *pBufferSize;
603 *pBufferSize = 1; 88 *pBufferSize = 1;
604 if (inputSize == 0) { 89 if (inputSize == 0) {
605 return NULL; // preflighting for deprecated functionality 90 return NULL; // preflighting for deprecated functionality
606 } 91 }
607 } 92 }
608 93 Collator *newColl = Collator::fromUCollator(coll)->clone();
609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 94 if (newColl == NULL) {
610 // Null pointer check.
611 if (stackBufferChars == NULL) {
612 *status = U_MEMORY_ALLOCATION_ERROR; 95 *status = U_MEMORY_ALLOCATION_ERROR;
613 return NULL; 96 } else {
97 *status = U_SAFECLONE_ALLOCATED_WARNING;
614 } 98 }
615 *status = U_SAFECLONE_ALLOCATED_WARNING; 99 return newColl->toUCollator();
616
617 localCollator = (UCollator *)stackBufferChars;
618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCode sSize);
621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
622
623 {
624 UErrorCode tempStatus = U_ZERO_ERROR;
625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
626 }
627 if (coll->freeImageOnClose) {
628 image = (uint8_t *)uprv_malloc(imageSize);
629 // Null pointer check
630 if (image == NULL) {
631 *status = U_MEMORY_ALLOCATION_ERROR;
632 return NULL;
633 }
634 ucol_cloneBinary(coll, image, imageSize, status);
635 imageAllocated = TRUE;
636 }
637 else {
638 image = (uint8_t *)coll->image;
639 }
640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollat or, status);
641 if (U_FAILURE(*status)) {
642 return NULL;
643 }
644
645 if (coll->rules) {
646 if (coll->freeRulesOnClose) {
647 localCollator->rules = u_strcpy(rules, coll->rules);
648 //bufferEnd += rulesSize;
649 }
650 else {
651 localCollator->rules = coll->rules;
652 }
653 localCollator->freeRulesOnClose = FALSE;
654 localCollator->rulesLength = coll->rulesLength;
655 }
656
657 // collator reordering
658 if (coll->defaultReorderCodes) {
659 localCollator->defaultReorderCodes =
660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCode s, coll->defaultReorderCodesLength * sizeof(int32_t));
661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLeng th;
662 localCollator->freeDefaultReorderCodesOnClose = FALSE;
663 }
664 if (coll->reorderCodes) {
665 localCollator->reorderCodes =
666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorde rCodesLength * sizeof(int32_t));
667 localCollator->reorderCodesLength = coll->reorderCodesLength;
668 localCollator->freeReorderCodesOnClose = FALSE;
669 }
670 if (coll->leadBytePermutationTable) {
671 localCollator->leadBytePermutationTable =
672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermu tationTable, 256);
673 localCollator->freeLeadBytePermutationTableOnClose = FALSE;
674 }
675
676 int32_t i;
677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(col l, (UColAttribute)i, status), status);
679 }
680 // zero copies of pointers
681 localCollator->actualLocale = NULL;
682 localCollator->validLocale = NULL;
683 localCollator->requestedLocale = NULL;
684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy h ere.
685 localCollator->freeOnClose = TRUE;
686 localCollator->freeImageOnClose = imageAllocated;
687 return localCollator;
688 } 100 }
689 101
690 U_CAPI void U_EXPORT2 102 U_CAPI void U_EXPORT2
691 ucol_close(UCollator *coll) 103 ucol_close(UCollator *coll)
692 { 104 {
693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 105 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 106 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
695 if(coll != NULL) { 107 if(coll != NULL) {
696 // these are always owned by each UCollator struct, 108 delete Collator::fromUCollator(coll);
697 // so we always free them
698 if(coll->validLocale != NULL) {
699 uprv_free(coll->validLocale);
700 }
701 if(coll->actualLocale != NULL) {
702 uprv_free(coll->actualLocale);
703 }
704 if(coll->requestedLocale != NULL) {
705 uprv_free(coll->requestedLocale);
706 }
707 if(coll->latinOneCEs != NULL) {
708 uprv_free(coll->latinOneCEs);
709 }
710 if(coll->options != NULL && coll->freeOptionsOnClose) {
711 uprv_free(coll->options);
712 }
713 if(coll->rules != NULL && coll->freeRulesOnClose) {
714 uprv_free((UChar *)coll->rules);
715 }
716 if(coll->image != NULL && coll->freeImageOnClose) {
717 uprv_free((UCATableHeader *)coll->image);
718 }
719
720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutati onTableOnClose == TRUE) {
721 uprv_free(coll->leadBytePermutationTable);
722 }
723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnC lose == TRUE) {
724 uprv_free(coll->defaultReorderCodes);
725 }
726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
727 uprv_free(coll->reorderCodes);
728 }
729
730 if(coll->delegate != NULL) {
731 delete (Collator*)coll->delegate;
732 }
733
734 /* Here, it would be advisable to close: */
735 /* - UData for UCA (unless we stuff it in the root resb */
736 /* Again, do we need additional housekeeping... HMMM! */
737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
738 if(coll->freeOnClose){
739 /* for safeClone, if freeOnClose is FALSE,
740 don't free the other instance data */
741 uprv_free(coll);
742 }
743 } 109 }
744 UTRACE_EXIT(); 110 UTRACE_EXIT();
745 } 111 }
746 112
747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo de *status) {
748 if(U_FAILURE(*status)) {
749 return;
750 }
751 result->caseFirst = (UColAttributeValue)opts->caseFirst;
752 result->caseLevel = (UColAttributeValue)opts->caseLevel;
753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
756 return;
757 }
758 result->strength = (UColAttributeValue)opts->strength;
759 result->variableTopValue = opts->variableTopValue;
760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
762 result->numericCollation = (UColAttributeValue)opts->numericCollation;
763 result->caseFirstisDefault = TRUE;
764 result->caseLevelisDefault = TRUE;
765 result->frenchCollationisDefault = TRUE;
766 result->normalizationModeisDefault = TRUE;
767 result->strengthisDefault = TRUE;
768 result->variableTopValueisDefault = TRUE;
769 result->alternateHandlingisDefault = TRUE;
770 result->hiraganaQisDefault = TRUE;
771 result->numericCollationisDefault = TRUE;
772
773 ucol_updateInternalState(result, status);
774
775 result->options = opts;
776 }
777
778
779 /**
780 * Approximate determination if a character is at a contraction end.
781 * Guaranteed to be TRUE if a character is at the end of a contraction,
782 * otherwise it is not deterministic.
783 * @param c character to be determined
784 * @param coll collator
785 */
786 static
787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
788 if (c < coll->minContrEndCP) {
789 return FALSE;
790 }
791
792 int32_t hash = c;
793 uint8_t htbyte;
794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
795 if (U16_IS_TRAIL(c)) {
796 return TRUE;
797 }
798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
799 }
800 htbyte = coll->contrEndCP[hash>>3];
801 return (((htbyte >> (hash & 7)) & 1) == 1);
802 }
803
804
805
806 /*
807 * i_getCombiningClass()
808 * A fast, at least partly inline version of u_getCombiningClass()
809 * This is a candidate for further optimization. Used heavily
810 * in contraction processing.
811 */
812 static
813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
814 uint8_t sCC = 0;
815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
816 sCC = u_getCombiningClass(c);
817 }
818 return sCC;
819 }
820
821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con st UCollator *UCA, UErrorCode *status) {
822 UChar c;
823 UCollator *result = fillIn;
824 if(U_FAILURE(*status) || image == NULL) {
825 return NULL;
826 }
827
828 if(result == NULL) {
829 result = (UCollator *)uprv_malloc(sizeof(UCollator));
830 if(result == NULL) {
831 *status = U_MEMORY_ALLOCATION_ERROR;
832 return result;
833 }
834 result->freeOnClose = TRUE;
835 } else {
836 result->freeOnClose = FALSE;
837 }
838
839 result->delegate = NULL;
840
841 result->image = image;
842 result->mapping.getFoldingOffset = _getFoldingOffset;
843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosit ion;
844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
845 if(U_FAILURE(*status)) {
846 if(result->freeOnClose == TRUE) {
847 uprv_free(result);
848 result = NULL;
849 }
850 return result;
851 }
852
853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image-> contractionCEs);
855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->c ontractionIndex);
856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expan sion);
857 result->rules = NULL;
858 result->rulesLength = 0;
859 result->freeRulesOnClose = FALSE;
860 result->defaultReorderCodes = NULL;
861 result->defaultReorderCodesLength = 0;
862 result->freeDefaultReorderCodesOnClose = FALSE;
863 result->reorderCodes = NULL;
864 result->reorderCodesLength = 0;
865 result->freeReorderCodesOnClose = FALSE;
866 result->leadBytePermutationTable = NULL;
867 result->freeLeadBytePermutationTableOnClose = FALSE;
868
869 /* get the version info from UCATableHeader and populate the Collator struct */
870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v ersion*/
872 result->dataVersion[2] = 0;
873 result->dataVersion[3] = 0;
874
875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
876 result->minUnsafeCP = 0;
877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
878 if (ucol_unsafeCP(c, result)) break;
879 }
880 result->minUnsafeCP = c;
881
882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
883 result->minContrEndCP = 0;
884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
885 if (ucol_contractionEndCP(c, result)) break;
886 }
887 result->minContrEndCP = c;
888
889 /* max expansion tables */
890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
891 result->image->endExpansionCE);
892 result->lastEndExpansionCE = result->endExpansionCE +
893 result->image->endExpansionCECount - 1;
894 result->expansionCESize = (uint8_t*)result->image +
895 result->image->expansionCESize;
896
897
898 //result->errorCode = *status;
899
900 result->latinOneCEs = NULL;
901
902 result->latinOneRegenTable = FALSE;
903 result->latinOneFailed = FALSE;
904 result->UCA = UCA;
905
906 /* Normally these will be set correctly later. This is the default if you us e UCA or the default. */
907 result->ucaRules = NULL;
908 result->actualLocale = NULL;
909 result->validLocale = NULL;
910 result->requestedLocale = NULL;
911 result->hasRealData = FALSE; // real data lives in .dat file...
912 result->freeImageOnClose = FALSE;
913
914 /* set attributes */
915 ucol_setOptionsFromHeader(
916 result,
917 (UColOptionSet*)((uint8_t*)result->image+result->image->options),
918 status);
919 result->freeOptionsOnClose = FALSE;
920
921 return result;
922 }
923
924 /* new Mark's code */
925
926 /**
927 * For generation of Implicit CEs
928 * @author Davis
929 *
930 * Cleaned up so that changes can be made more easily.
931 * Old values:
932 # First Implicit: E26A792D
933 # Last Implicit: E3DC70C0
934 # First CJK: E0030300
935 # Last CJK: E0A9DD00
936 # First CJK_A: E0A9DF00
937 # Last CJK_A: E0DE3100
938 */
939 /* Following is a port of Mark's code for new treatment of implicits.
940 * It is positioned here, since ucol_initUCA need to initialize the
941 * variables below according to the data in the fractional UCA.
942 */
943
944 /**
945 * Function used to:
946 * a) collapse the 2 different Han ranges from UCA into one (in the right order) , and
947 * b) bump any non-CJK characters by 10FFFF.
948 * The relevant blocks are:
949 * A: 4E00..9FFF; CJK Unified Ideographs
950 * F900..FAFF; CJK Compatibility Ideographs
951 * B: 3400..4DBF; CJK Unified Ideographs Extension A
952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
953 * As long as
954 * no new B characters are allocated between 4E00 and FAFF, and
955 * no new A characters are outside of this range,
956 * (very high probability) this simple code will work.
957 * The reordered blocks are:
958 * Block1 is CJK
959 * Block2 is CJK_COMPAT_USED
960 * Block3 is CJK_A
961 * (all contiguous)
962 * Any other CJK gets its normal code point
963 * Any non-CJK gets +10FFFF
964 * When we reorder Block1, we make sure that it is at the very start,
965 * so that it will use a 3-byte form.
966 * Warning: we only pick up the compatibility characters that are
967 * NOT decomposed, so that block is smaller!
968 */
969
970 // CONSTANTS
971 static const UChar32
972 NON_CJK_OFFSET = 0x110000,
973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
974
/**
 * Parameters of the implicit-weight encoding.
 * All start out as 0 and are computed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
992
993 static const UChar32
994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
996 CJK_BASE = 0x4E00,
997 CJK_LIMIT = 0x9FCC+1,
998 // Unified CJK ideographs in the compatibility ideographs block.
999 CJK_COMPAT_USED_BASE = 0xFA0E,
1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1003 CJK_A_BASE = 0x3400,
1004 CJK_A_LIMIT = 0x4DB5+1,
1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1007 CJK_B_BASE = 0x20000,
1008 CJK_B_LIMIT = 0x2A6D6+1,
1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1011 CJK_C_BASE = 0x2A700,
1012 CJK_C_LIMIT = 0x2B734+1,
1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1015 CJK_D_BASE = 0x2B740,
1016 CJK_D_LIMIT = 0x2B81D+1;
1017 // when adding to this list, look for all occurrences (in project)
1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing !!!!
1019
1020 static UChar32 swapCJK(UChar32 i) {
1021 if (i < CJK_A_BASE) {
1022 // non-CJK
1023 } else if (i < CJK_A_LIMIT) {
1024 // Extension A has lower code points than the original Unihan+compat
1025 // but sorts higher.
1026 return i - CJK_A_BASE
1027 + (CJK_LIMIT - CJK_BASE)
1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1029 } else if (i < CJK_BASE) {
1030 // non-CJK
1031 } else if (i < CJK_LIMIT) {
1032 return i - CJK_BASE;
1033 } else if (i < CJK_COMPAT_USED_BASE) {
1034 // non-CJK
1035 } else if (i < CJK_COMPAT_USED_LIMIT) {
1036 return i - CJK_COMPAT_USED_BASE
1037 + (CJK_LIMIT - CJK_BASE);
1038 } else if (i < CJK_B_BASE) {
1039 // non-CJK
1040 } else if (i < CJK_B_LIMIT) {
1041 return i; // non-BMP-CJK
1042 } else if (i < CJK_C_BASE) {
1043 // non-CJK
1044 } else if (i < CJK_C_LIMIT) {
1045 return i; // non-BMP-CJK
1046 } else if (i < CJK_D_BASE) {
1047 // non-CJK
1048 } else if (i < CJK_D_LIMIT) {
1049 return i; // non-BMP-CJK
1050 }
1051 return i + NON_CJK_OFFSET; // non-CJK
1052 }
1053
1054 U_CAPI UChar32 U_EXPORT2
1055 uprv_uca_getRawFromCodePoint(UChar32 i) {
1056 return swapCJK(i)+1;
1057 }
1058
1059 U_CAPI UChar32 U_EXPORT2
1060 uprv_uca_getCodePointFromRaw(UChar32 i) {
1061 i--;
1062 UChar32 result = 0;
1063 if(i >= NON_CJK_OFFSET) {
1064 result = i - NON_CJK_OFFSET;
1065 } else if(i >= CJK_B_BASE) {
1066 result = i;
1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1068 if(i < CJK_LIMIT - CJK_BASE) {
1069 result = i + CJK_BASE;
1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP AT_USED_BASE)) {
1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1072 } else {
1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_ LIMIT - CJK_COMPAT_USED_BASE);
1074 }
1075 } else {
1076 result = -1;
1077 }
1078 return result;
1079 }
1080
1081 // GET IMPLICIT PRIMARY WEIGHTS
1082 // Return value is left justified primary key
1083 U_CAPI uint32_t U_EXPORT2
1084 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1085 /*
1086 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1087 throw new IllegalArgumentException("Code point out of range " + Utility. hex(cp));
1088 }
1089 */
1090 int32_t last0 = cp - min4Boundary;
1091 if (last0 < 0) {
1092 int32_t last1 = cp / final3Count;
1093 last0 = cp % final3Count;
1094
1095 int32_t last2 = last1 / medialCount;
1096 last1 %= medialCount;
1097
1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1099 last1 = minTrail + last1; // offset
1100 last2 = min3Primary + last2; // offset
1101 /*
1102 if (last2 >= min4Primary) {
1103 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last2));
1104 }
1105 */
1106 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1107 } else {
1108 int32_t last1 = last0 / final4Count;
1109 last0 %= final4Count;
1110
1111 int32_t last2 = last1 / medialCount;
1112 last1 %= medialCount;
1113
1114 int32_t last3 = last2 / medialCount;
1115 last2 %= medialCount;
1116
1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1118 last1 = minTrail + last1; // offset
1119 last2 = minTrail + last2; // offset
1120 last3 = min4Primary + last3; // offset
1121 /*
1122 if (last3 > max4Primary) {
1123 throw new IllegalArgumentException("4-byte out of range: " + Utility .hex(cp) + ", " + Utility.hex(last3));
1124 }
1125 */
1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1127 }
1128 }
1129
1130 static uint32_t U_EXPORT2
1131 uprv_uca_getImplicitPrimary(UChar32 cp) {
1132 //fprintf(stdout, "Incoming: %04x\n", cp);
1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1134
1135 cp = swapCJK(cp);
1136 cp++;
1137 // we now have a range of numbers from 0 to 21FFFF.
1138
1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1140 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1141
1142 return uprv_uca_getImplicitFromRaw(cp);
1143 }
1144
1145 /**
1146 * Converts implicit CE into raw integer ("code point")
1147 * @param implicit
1148 * @return -1 if illegal format
1149 */
1150 U_CAPI UChar32 U_EXPORT2
1151 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1152 UChar32 result;
1153 UChar32 b3 = implicit & 0xFF;
1154 UChar32 b2 = (implicit >> 8) & 0xFF;
1155 UChar32 b1 = (implicit >> 16) & 0xFF;
1156 UChar32 b0 = (implicit >> 24) & 0xFF;
1157
1158 // simple parameter checks
1159 if (b0 < min3Primary || b0 > max4Primary
1160 || b1 < minTrail || b1 > maxTrail)
1161 return -1;
1162 // normal offsets
1163 b1 -= minTrail;
1164
1165 // take care of the final values, and compose
1166 if (b0 < min4Primary) {
1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1168 return -1;
1169 b2 -= minTrail;
1170 UChar32 remainder = b2 % final3Multiplier;
1171 if (remainder != 0)
1172 return -1;
1173 b0 -= min3Primary;
1174 b2 /= final3Multiplier;
1175 result = ((b0 * medialCount) + b1) * final3Count + b2;
1176 } else {
1177 if (b2 < minTrail || b2 > maxTrail
1178 || b3 < minTrail || b3 > max4Trail)
1179 return -1;
1180 b2 -= minTrail;
1181 b3 -= minTrail;
1182 UChar32 remainder = b3 % final4Multiplier;
1183 if (remainder != 0)
1184 return -1;
1185 b3 /= final4Multiplier;
1186 b0 -= min4Primary;
1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1188 }
1189 // final check
1190 if (result < 0 || result > UCOL_MAX_INPUT)
1191 return -1;
1192 return result;
1193 }
1194
1195
/**
 * Computes 1 + (a-1)/b, which for positive a and b is a/b rounded up.
 * Note that a == 0 yields 1, not 0.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientOfPredecessor = (a - 1) / b;
    return quotientOfPredecessor + 1;
}
1199
1200 /* this function is either called from initUCA or from genUCA before
1201 * doing canonical closure for the UCA.
1202 */
1203
1204 /**
1205 * Set up to generate implicits.
1206 * Maintenance Note: this function may end up being called more than once, due
1207 * to threading races during initialization. Make sure that
1208 * none of the Constants is ever transiently assigned an
1209 * incorrect value.
1210 * @param minPrimary
1211 * @param maxPrimary
1212 * @param minTrail final byte
1213 * @param maxTrail final byte
1214 * @param gap3 the gap we leave for tailoring for 3-byte forms
1215 * @param gap4 the gap we leave for tailoring for 4-byte forms
1216 */
1217 static void initImplicitConstants(int minPrimary, int maxPrimary,
1218 int minTrailIn, int maxTrailIn,
1219 int gap3, int primaries3count,
1220 UErrorCode *status) {
1221 // some simple parameter checks
1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1224 || (primaries3count < 1))
1225 {
1226 *status = U_ILLEGAL_ARGUMENT_ERROR;
1227 return;
1228 };
1229
1230 minTrail = minTrailIn;
1231 maxTrail = maxTrailIn;
1232
1233 min3Primary = minPrimary;
1234 max4Primary = maxPrimary;
1235 // compute constants for use later.
1236 // number of values we can use in trailing bytes
1237 // leave room for empty values between AND above, e.g. if gap = 2
1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1241 final3Multiplier = gap3 + 1;
1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1244
1245 // medials can use full range
1246 medialCount = (maxTrail - minTrail + 1);
1247 // find out how many values fit in each form
1248 int32_t threeByteCount = medialCount * final3Count;
1249 // now determine where the 3/4 boundary is.
1250 // we use 3 bytes below the boundary, and 4 above
1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1252 int32_t primaries4count = primariesAvailable - primaries3count;
1253
1254
1255 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1256 min4Primary = minPrimary + primaries3count;
1257 min4Boundary = min3ByteCoverage;
1258 // Now expand out the multiplier for the 4 bytes, and redo.
1259
1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count );
1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo unt * medialCount);
1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1264 if (gap4 < 1) {
1265 *status = U_ILLEGAL_ARGUMENT_ERROR;
1266 return;
1267 }
1268 final4Multiplier = gap4 + 1;
1269 final4Count = neededPerFinalByte;
1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1271 }
1272
1273 /**
1274 * Supply parameters for generating implicit CEs
1275 */
1276 U_CAPI void U_EXPORT2
1277 uprv_uca_initImplicitConstants(UErrorCode *status) {
1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms .
1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1281 }
1282
1283
1284 /* collIterNormalize Incremental Normalization happens here. */
1285 /* pick up the range of chars identifed by FCD, */
1286 /* normalize it into the collIterate's writable buffer, */
1287 /* switch the collIterate's state to use the writable b uffer. */
1288 /* */
1289 static
1290 void collIterNormalize(collIterate *collationSource)
1291 {
1292 UErrorCode status = U_ZERO_ERROR;
1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to nor malize */
1294 const UChar *endP = collationSource->fcdPosition; /* End of region to norma lize+1 */
1295
1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1297 collationSource->writableBuffer,
1298 status);
1299 if (U_FAILURE(status)) {
1300 #ifdef UCOL_DEBUG
1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro rName(status));
1302 #endif
1303 return;
1304 }
1305
1306 collationSource->pos = collationSource->writableBuffer.getTerminatedB uffer();
1307 collationSource->origFlags = collationSource->flags;
1308 collationSource->flags |= UCOL_ITER_INNORMBUF;
1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE _ITERATOR);
1310 }
1311
1312
1313 // This function takes the iterator and extracts normalized stuff up to the next boundary
1314 // It is similar in the end results to the collIterNormalize, but for the cases when we
1315 // use an iterator
1316 /*static
1317 inline void normalizeIterator(collIterate *collationSource) {
1318 UErrorCode status = U_ZERO_ERROR;
1319 UBool wasNormalized = FALSE;
1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite rator, UITER_CURRENT);
1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter ator);
1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa bleBuffer,
1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);
1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->w ritableBufSize) {
1325 // reallocate and terminate
1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1327 &collationSource->writableBuffer,
1328 (int32_t *)&collationSource->writableBufSize, nor mLen + 1,
1329 0)
1330 ) {
1331 #ifdef UCOL_DEBUG
1332 fprintf(stderr, "normalizeIterator(), out of memory\n");
1333 #endif
1334 return;
1335 }
1336 status = U_ZERO_ERROR;
1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE R_ZERO);
1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s tatus);
1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf fer,
1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize d, &status);
1341 }
1342 // Terminate the buffer - we already checked that it is big enough
1343 collationSource->writableBuffer[normLen] = 0;
1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1345 collationSource->flags |= UCOL_ITER_ALLOCATED;
1346 }
1347 collationSource->pos = collationSource->writableBuffer;
1348 collationSource->origFlags = collationSource->flags;
1349 collationSource->flags |= UCOL_ITER_INNORMBUF;
1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_I TERATOR);
1351 }*/
1352
1353
1354 /* Incremental FCD check and normalize */
1355 /* Called from getNextCE when normalization state is suspect. */
1356 /* When entering, the state is known to be this: */
1357 /* o We are working in the main buffer of the collIterate, not the side */
1358 /* writable buffer. When in the side buffer, normalization mode is alw ays off, */
1359 /* so we won't get here. */
1360 /* o The leading combining class from the current character is 0 or */
1361 /* the trailing combining class of the previous char was zero. */
1362 /* True because the previous call to this function will have always exi ted */
1363 /* that way, and we get called for every char where cc might be non-zer o. */
1364 static
1365 inline UBool collIterFCD(collIterate *collationSource) {
1366 const UChar *srcP, *endP;
1367 uint8_t leadingCC;
1368 uint8_t prevTrailingCC = 0;
1369 uint16_t fcd;
1370 UBool needNormalize = FALSE;
1371
1372 srcP = collationSource->pos-1;
1373
1374 if (collationSource->flags & UCOL_ITER_HASLEN) {
1375 endP = collationSource->endp;
1376 } else {
1377 endP = NULL;
1378 }
1379
1380 // Get the trailing combining class of the current character. If it's zero, we are OK.
1381 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1382 if (fcd != 0) {
1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1384
1385 if (prevTrailingCC != 0) {
1386 // The current char has a non-zero trailing CC. Scan forward until we find
1387 // a char with a leading cc of zero.
1388 while (endP == NULL || srcP != endP)
1389 {
1390 const UChar *savedSrcP = srcP;
1391
1392 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1394 if (leadingCC == 0) {
1395 srcP = savedSrcP; // Hit char that is not part of combi ning sequence.
1396 // back up over it. (Could be surr ogate pair!)
1397 break;
1398 }
1399
1400 if (leadingCC < prevTrailingCC) {
1401 needNormalize = TRUE;
1402 }
1403
1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1405 }
1406 }
1407 }
1408
1409 collationSource->fcdPosition = (UChar *)srcP;
1410
1411 return needNormalize;
1412 }
1413
1414 /****************************************************************************/
1415 /* Following are the CE retrieval functions */
1416 /* */
1417 /****************************************************************************/
1418
1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1421
1422 /* there should be a macro version of this function in the header file */
1423 /* This is the first function that tries to fetch a collation element */
1424 /* If it's not successful or it encounters a more difficult situation */
1425 /* some more sophisticated and slower functions are invoked */
/*
 * ucol_IGetNextCE: returns the next collation element for collationSource.
 *
 * Order of work, as implemented below:
 *  1. If CEs from an earlier expansion are still buffered (CEpos > toReturn),
 *     the next one is returned; once the buffer is drained both pointers are
 *     reset to extendCEs (if set) or CEs.
 *  2. Otherwise the next code unit is fetched - from the null-terminated
 *     string, the length-delimited string, the character iterator, or the
 *     normalization side buffer, depending on the flags. UCOL_NO_MORE_CES is
 *     returned when the source is exhausted.
 *  3. With UCOL_HIRAGANA_Q set, UCOL_WAS_HIRAGANA is updated for the unit.
 *  4. With UCOL_ITER_NORM set, a quick FCD test is made; if it is not
 *     conclusive, collIterFCD() / collIterNormalize() run and, when text was
 *     normalized, the unit is re-read from the side buffer.
 *  5. The CE is looked up: latinOneMapping for units up to 0xFF, otherwise the
 *     collator's trie, then the UCA trie if the result is UCOL_NOT_FOUND and
 *     coll->UCA is set. Values above UCOL_NOT_FOUND are resolved through
 *     ucol_prv_getSpecialCE().
 *  6. If the result is still UCOL_NOT_FOUND, getImplicit() supplies it.
 *
 * coll            - collator whose tables are consulted
 * collationSource - iteration state; updated as text is consumed
 * status          - passed through to ucol_prv_getSpecialCE()
 */
1426 static
1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou rce, UErrorCode *status) {
1428 uint32_t order = 0;
1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1430 order = *(collationSource->toReturn++); /* if so , return them */
1431 if(collationSource->CEpos == collationSource->toReturn) {
1432 collationSource->CEpos = collationSource->toReturn = collationSource ->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1433 }
1434 return order;
1435 }
1436
1437 UChar ch = 0;
1438 collationSource->offsetReturn = NULL;
1439
/* The outer loop repeats only while the unit just processed is in the Hangul */
/* range (UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL) and produced UCOL_IGNORABLE.   */
1440 do {
1441 for (;;) /* Loop handles case when incremental normalize switches */
1442 { /* to or from the side buffer / ori ginal string, and we */
1443 /* need to start again to get the next character. */
1444
1445 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBU F | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1446 {
1447 // The source string is null terminated and we're not working fr om the side buffer,
1448 // and we're not normalizing. This is the fast path.
1449 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1450 ch = *collationSource->pos++;
1451 if (ch != 0) {
1452 break;
1453 }
1454 else {
1455 return UCOL_NO_MORE_CES;
1456 }
1457 }
1458
1459 if (collationSource->flags & UCOL_ITER_HASLEN) {
1460 // Normal path for strings when length is specified.
1461 // (We can't be in side buffer because it is always null termi nated.)
1462 if (collationSource->pos >= collationSource->endp) {
1463 // Ran off of the end of the main source string. We're done .
1464 return UCOL_NO_MORE_CES;
1465 }
1466 ch = *collationSource->pos++;
1467 }
1468 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1469 UChar32 iterCh = collationSource->iterator->next(collationSource ->iterator);
1470 if(iterCh == U_SENTINEL) {
1471 return UCOL_NO_MORE_CES;
1472 }
1473 ch = (UChar)iterCh;
1474 }
1475 else
1476 {
1477 // Null terminated string.
1478 ch = *collationSource->pos++;
1479 if (ch == 0) {
1480 // Ran off end of buffer.
1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1482 // Ran off end of main string. backing up one character.
1483 collationSource->pos--;
1484 return UCOL_NO_MORE_CES;
1485 }
1486 else
1487 {
1488 // Hit null in the normalize side buffer.
1489 // Usually this means the end of the normalized data,
1490 // except for one odd case: a null followed by combining chars,
1491 // which is the case if we are at the start of the buf fer.
1492 if (collationSource->pos == collationSource->writableBuf fer.getBuffer()+1) {
1493 break;
1494 }
1495
1496 // Null marked end of side buffer.
1497 // Revert to the main string and
1498 // loop back to top to try again to get a character.
1499 collationSource->pos = collationSource->fcdPosition;
1500 collationSource->flags = collationSource->origFlags;
1501 continue;
1502 }
1503 }
1504 }
1505
1506 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1508 * based on whether the previous codepoint was Hiragana or Katak ana.
1509 */
1510 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f) ) ||
1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1512 collationSource->flags |= UCOL_WAS_HIRAGANA;
1513 } else {
1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1515 }
1516 }
1517
1518 // We've got a character. See if there's any fcd and/or normalizati on stuff to do.
1519 // Note that UCOL_ITER_NORM flag is always zero when we are in th e side buffer.
1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1521 break;
1522 }
1523
1524 if (collationSource->fcdPosition >= collationSource->pos) {
1525 // An earlier FCD check has already covered the current characte r.
1526 // We can go ahead and process this char.
1527 break;
1528 }
1529
1530 if (ch < ZERO_CC_LIMIT_ ) {
1531 // Fast fcd safe path. Trailing combining class == 0. This cha r is OK.
1532 break;
1533 }
1534
1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1536 // We need to peek at the next character in order to tell if we are FCD
1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSour ce->pos >= collationSource->endp) {
1538 // We are at the last char of source string.
1539 // It is always OK for FCD check.
1540 break;
1541 }
1542
1543 // Not at last char of source string (or we'll check against ter minating null). Do the FCD fast test
1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1545 break;
1546 }
1547 }
1548
1549
1550 // Need a more complete FCD check and possible normalization.
1551 if (collIterFCD(collationSource)) {
1552 collIterNormalize(collationSource);
1553 }
1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1555 // No normalization was needed. Go ahead and process the char we already had.
1556 break;
1557 }
1558
1559 // Some normalization happened. Next loop iteration will pick up a char
1560 // from the normalization buffer.
1561
1562 } // end for (;;)
1563
1564
1565 if (ch <= 0xFF) {
1566 /* For latin-1 characters we never need to fall back to the UCA tab le */
1567 /* because all of the UCA data is replicated in the latinOneMappi ng array */
1568 order = coll->latinOneMapping[ch];
1569 if (order > UCOL_NOT_FOUND) {
1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1571 }
1572 }
1573 else
1574 {
1575 // Always use UCA for Han, Hangul
1576 // (Han extension A is before main Han block)
1577 // **** Han compatibility chars ?? ****
1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1581 // between the two target ranges; do normal lookup
1582 // **** this range is YI, Modifier tone letters, ****
1583 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1584 // **** Latin-D might be tailored, so we need to ****
1585 // **** do the normal lookup for these guys. ****
1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1587 } else {
1588 // in one of the target ranges; use UCA
1589 order = UCOL_NOT_FOUND;
1590 }
1591 } else {
1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1593 }
1594
1595 if(order > UCOL_NOT_FOUND) { / * if a CE is special */
1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1597 }
1598
1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a g ood CE in the tailoring */
1600 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1602
1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE * /
1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collatio nSource, status);
1605 }
1606 }
1607 }
1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_L AST_HANGUL );
1609
/* Neither the tailoring nor the UCA supplied a CE: getImplicit() provides one. */
1610 if(order == UCOL_NOT_FOUND) {
1611 order = getImplicit(ch, collationSource);
1612 }
1613 return order; /* return the CE */
1614 }
1615
1616 /* ucol_getNextCE, out-of-line version for use from other files. */
1617 U_CAPI uint32_t U_EXPORT2
1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode * status) {
1619 return ucol_IGetNextCE(coll, collationSource, status);
1620 }
1621
1622
1623 /**
1624 * Incremental previous normalization happens here. Pick up the range of chars
1625 * identifed by FCD, normalize it into the collIterate's writable buffer,
1626 * switch the collIterate's state to use the writable buffer.
1627 * @param data collation iterator data
1628 */
1629 static
1630 void collPrevIterNormalize(collIterate *data)
1631 {
1632 UErrorCode status = U_ZERO_ERROR;
1633 const UChar *pEnd = data->pos; /* End normalize + 1 */
1634 const UChar *pStart;
1635
1636 /* Start normalize */
1637 if (data->fcdPosition == NULL) {
1638 pStart = data->string;
1639 }
1640 else {
1641 pStart = data->fcdPosition + 1;
1642 }
1643
1644 int32_t normLen =
1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta rt) + 1)),
1646 data->writableBuffer,
1647 status).
1648 length();
1649 if(U_FAILURE(status)) {
1650 return;
1651 }
1652 /*
1653 this puts the null termination infront of the normalized string instead
1654 of the end
1655 */
1656 data->writableBuffer.insert(0, (UChar)0);
1657
1658 /*
1659 * The usual case at this point is that we've got a base
1660 * character followed by marks that were normalized. If
1661 * fcdPosition is NULL, that means that we backed up to
1662 * the beginning of the string and there's no base character.
1663 *
1664 * Forward processing will usually normalize when it sees
1665 * the first mark, so that mark will get it's natural offset
1666 * and the rest will get the offset of the character following
1667 * the marks. The base character will also get its natural offset.
1668 *
1669 * We write the offset of the base character, if there is one,
1670 * followed by the offset of the first mark and then the offsets
1671 * of the rest of the marks.
1672 */
1673 int32_t firstMarkOffset = 0;
1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
1675 int32_t trailCount = normLen - 1;
1676
1677 if (data->fcdPosition != NULL) {
1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1679 UChar baseChar = *data->fcdPosition;
1680
1681 firstMarkOffset = baseOffset + 1;
1682
1683 /*
1684 * If the base character is the start of a contraction, forward processi ng
1685 * will normalize the marks while checking for the contraction, which me ans
1686 * that the offset of the first mark will the same as the other marks.
1687 *
1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1689 */
1690 if (baseChar >= 0x100) {
1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas eChar);
1692
1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas eChar);
1695 }
1696
1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION _TAG) {
1698 firstMarkOffset = trailOffset;
1699 }
1700 }
1701
1702 data->appendOffset(baseOffset, status);
1703 }
1704
1705 data->appendOffset(firstMarkOffset, status);
1706
1707 for (int32_t i = 0; i < trailCount; i += 1) {
1708 data->appendOffset(trailOffset, status);
1709 }
1710
1711 data->offsetRepeatValue = trailOffset;
1712
1713 data->offsetReturn = data->offsetStore - 1;
1714 if (data->offsetReturn == data->offsetBuffer) {
1715 data->offsetStore = data->offsetBuffer;
1716 }
1717
1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1719 data->origFlags = data->flags;
1720 data->flags |= UCOL_ITER_INNORMBUF;
1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1722 }
1723
1724
1725 /**
1726 * Incremental FCD check for previous iteration and normalize. Called from
1727 * getPrevCE when normalization state is suspect.
1728 * When entering, the state is known to be this:
1729 * o We are working in the main buffer of the collIterate, not the side
1730 * writable buffer. When in the side buffer, normalization mode is always
1731 * off, so we won't get here.
1732 * o The leading combining class from the current character is 0 or the
1733 * trailing combining class of the previous char was zero.
1734 * True because the previous call to this function will have always exited
1735 * that way, and we get called for every char where cc might be non-zero.
1736 * @param data collation iterate struct
1737 * @return normalization status, TRUE for normalization to be done, FALSE
1738 * otherwise
1739 */
1740 static
1741 inline UBool collPrevIterFCD(collIterate *data)
1742 {
1743 const UChar *src, *start;
1744 uint8_t leadingCC;
1745 uint8_t trailingCC = 0;
1746 uint16_t fcd;
1747 UBool result = FALSE;
1748
1749 start = data->string;
1750 src = data->pos + 1;
1751
1752 /* Get the trailing combining class of the current character. */
1753 fcd = g_nfcImpl->previousFCD16(start, src);
1754
1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1756
1757 if (leadingCC != 0) {
1758 /*
1759 The current char has a non-zero leading combining class.
1760 Scan backward until we find a char with a trailing cc of zero.
1761 */
1762 for (;;)
1763 {
1764 if (start == src) {
1765 data->fcdPosition = NULL;
1766 return result;
1767 }
1768
1769 fcd = g_nfcImpl->previousFCD16(start, src);
1770
1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1772
1773 if (trailingCC == 0) {
1774 break;
1775 }
1776
1777 if (leadingCC < trailingCC) {
1778 result = TRUE;
1779 }
1780
1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1782 }
1783 }
1784
1785 data->fcdPosition = (UChar *)src;
1786
1787 return result;
1788 }
1789
1790 /** gets a code unit from the string at a given offset
1791 * Handles both normal and iterative cases.
1792 * No error checking - caller beware!
1793 */
1794 static inline
1795 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1796 if(source->pos != NULL) {
1797 return *(source->pos + offset);
1798 } else if(source->iterator != NULL) {
1799 UChar32 c;
1800 if(offset != 0) {
1801 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1802 c = source->iterator->next(source->iterator);
1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1804 } else {
1805 c = source->iterator->current(source->iterator);
1806 }
1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
1808 } else {
1809 return 0xfffd;
1810 }
1811 }
1812
1813 // Code point version. Treats the offset as a _code point_ delta.
1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for med UTF-16.
1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limi t of the buffer.
1816 static inline
1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1818 UChar32 c;
1819 if(source->pos != NULL) {
1820 const UChar *p = source->pos;
1821 if(offset >= 0) {
1822 // Skip forward over (offset-1) code points.
1823 while(--offset >= 0) {
1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1825 ++p;
1826 }
1827 }
1828 // Read the code point there.
1829 c = *p++;
1830 UChar trail;
1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1832 c = U16_GET_SUPPLEMENTARY(c, trail);
1833 }
1834 } else /* offset<0 */ {
1835 // Skip backward over (offset-1) code points.
1836 while(++offset < 0) {
1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1838 --p;
1839 }
1840 }
1841 // Read the code point before that.
1842 c = *--p;
1843 UChar lead;
1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1845 c = U16_GET_SUPPLEMENTARY(lead, c);
1846 }
1847 }
1848 } else if(source->iterator != NULL) {
1849 if(offset >= 0) {
1850 // Skip forward over (offset-1) code points.
1851 int32_t fwd = offset;
1852 while(fwd-- > 0) {
1853 uiter_next32(source->iterator);
1854 }
1855 // Read the code point there.
1856 c = uiter_current32(source->iterator);
1857 // Return to the starting point, skipping backward over (offset-1) c ode points.
1858 while(offset-- > 0) {
1859 uiter_previous32(source->iterator);
1860 }
1861 } else /* offset<0 */ {
1862 // Read backward, reading offset code points, remember only the last -read one.
1863 int32_t back = offset;
1864 do {
1865 c = uiter_previous32(source->iterator);
1866 } while(++back < 0);
1867 // Return to the starting position, skipping forward over offset cod e points.
1868 do {
1869 uiter_next32(source->iterator);
1870 } while(++offset < 0);
1871 }
1872 } else {
1873 c = U_SENTINEL;
1874 }
1875 return c;
1876 }
1877
1878 /**
1879 * Determines if we are at the start of the data string in the backwards
1880 * collation iterator
1881 * @param data collation iterator
1882 * @return TRUE if we are at the start
1883 */
1884 static
1885 inline UBool isAtStartPrevIterate(collIterate *data) {
1886 if(data->pos == NULL && data->iterator != NULL) {
1887 return !data->iterator->hasPrevious(data->iterator);
1888 }
1889 //return (collIter_bos(data)) ||
1890 return (data->pos == data->string) ||
1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1893 }
1894
1895 static
1896 inline void goBackOne(collIterate *data) {
1897 # if 0
1898 // somehow, it looks like we need to keep iterator synced up
1899 // at all times, as above.
1900 if(data->pos) {
1901 data->pos--;
1902 }
1903 if(data->iterator) {
1904 data->iterator->previous(data->iterator);
1905 }
1906 #endif
1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1908 data->iterator->previous(data->iterator);
1909 }
1910 if(data->pos) {
1911 data->pos --;
1912 }
1913 }
1914
1915 /**
1916 * Inline function that gets a simple CE.
1917 * So what it does is that it will first check the expansion buffer. If the
1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1919 * is different from the string pointer, we return the collation element at the
1920 * return pointer and decrement it.
1921 * For more complicated CEs it resorts to getComplicatedCE.
 * (In the code below the complicated cases are delegated to
 * ucol_prv_getSpecialPrevCE and, for implicit CEs, to getPrevImplicit.)
 * Note that the incoming value of *status is not tested here.
1922 * @param coll collator data
1923 * @param data collation iterator struct
1924 * @param status error status
 * @return the previous collation element, or UCOL_NO_MORE_CES when the start
 *         of the text has been reached
1925 */
1926 static
1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1928 UErrorCode *status)
1929 {
1930 uint32_t result = (uint32_t)UCOL_NULLORDER;
1931
/* Step the offset bookkeeping back by one entry: use up a pending repeat
   first; otherwise move offsetReturn back, and once it sits on the first
   entry of offsetBuffer clear it and reset offsetStore. */
1932 if (data->offsetReturn != NULL) {
1933 if (data->offsetRepeatCount > 0) {
1934 data->offsetRepeatCount -= 1;
1935 } else {
1936 if (data->offsetReturn == data->offsetBuffer) {
1937 data->offsetReturn = NULL;
1938 data->offsetStore = data->offsetBuffer;
1939 } else {
1940 data->offsetReturn -= 1;
1941 }
1942 }
1943 }
1944
/* If buffered CEs remain (toReturn is past the start of extendCEs, or past
   the start of CEs when extendCEs is not set), return the one before
   toReturn. */
1945 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1946 (!data->extendCEs && data->toReturn > data->CEs))
1947 {
1948 data->toReturn -= 1;
1949 result = *(data->toReturn);
/* Reached the first buffered CE: pull CEpos back to the same place. */
1950 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1951 data->CEpos = data->toReturn;
1952 }
1953 }
1954 else {
/* No buffered CE: fetch the previous code unit and look it up. */
1955 UChar ch = 0;
1956
1957 do {
1958 /*
1959 Loop handles case when incremental normalize switches to or from the
1960 side buffer / original string, and we need to start again to get the
1961 next character.
1962 */
1963 for (;;) {
1964 if (data->flags & UCOL_ITER_HASLEN) {
1965 /*
1966 Normal path for strings when length is specified.
1967 Not in side buffer because it is always null terminated.
1968 */
1969 if (data->pos <= data->string) {
1970 /* End of the main source string */
1971 return UCOL_NO_MORE_CES;
1972 }
1973 data->pos --;
1974 ch = *data->pos;
1975 }
1976 // we are using an iterator to go back. Pray for us!
1977 else if (data->flags & UCOL_USE_ITERATOR) {
1978 UChar32 iterCh = data->iterator->previous(data->iterator);
1979 if(iterCh == U_SENTINEL) {
1980 return UCOL_NO_MORE_CES;
1981 } else {
1982 ch = (UChar)iterCh;
1983 }
1984 }
1985 else {
1986 data->pos --;
1987 ch = *data->pos;
1988 /* we are in the side buffer. */
1989 if (ch == 0) {
1990 /*
1991 At the start of the normalize side buffer.
1992 Go back to string.
1993 Because pointer points to the last accessed character,
1994 hence we have to increment it by one here.
1995 */
1996 data->flags = data->origFlags;
1997 data->offsetRepeatValue = 0;
1998
/* fcdPosition == NULL means there is no original text left before the
   normalized segment, so iteration is finished. */
1999 if (data->fcdPosition == NULL) {
2000 data->pos = data->string;
2001 return UCOL_NO_MORE_CES;
2002 }
2003 else {
2004 data->pos = data->fcdPosition + 1;
2005 }
2006
2007 continue;
2008 }
2009 }
2010
/* When UCOL_HIRAGANA_Q is set, record whether this code unit lies in
   U+3040..U+309F via the UCOL_WAS_HIRAGANA flag. */
2011 if(data->flags&UCOL_HIRAGANA_Q) {
2012 if(ch>=0x3040 && ch<=0x309f) {
2013 data->flags |= UCOL_WAS_HIRAGANA;
2014 } else {
2015 data->flags &= ~UCOL_WAS_HIRAGANA;
2016 }
2017 }
2018
2019 /*
2020 * got a character to determine if there's fcd and/or normalization
2021 * stuff to do.
2022 * if the current character is not fcd.
2023 * if current character is at the start of the string
2024 * Trailing combining class == 0.
2025 * Note if pos is in the writablebuffer, norm is always 0
2026 */
2027 if (ch < ZERO_CC_LIMIT_ ||
2028 // this should propel us out of the loop in the iterator case
2029 (data->flags & UCOL_ITER_NORM) == 0 ||
2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos )
2031 || data->string == data->pos) {
2032 break;
2033 }
2034
2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2036 /* if next character is FCD */
2037 if (data->pos == data->string) {
2038 /* First char of string is always OK for FCD check */
2039 break;
2040 }
2041
2042 /* Not first char of string, do the FCD fast test */
2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2044 break;
2045 }
2046 }
2047
2048 /* Need a more complete FCD check and possible normalization. */
2049 if (collPrevIterFCD(data)) {
2050 collPrevIterNormalize(data);
2051 }
2052
2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2054 /* No normalization. Go ahead and process the char. */
2055 break;
2056 }
2057
2058 /*
2059 Some normalization happened.
2060 Next loop picks up a char from the normalization buffer.
2061 */
2062 }
2063
2064 /* attempt to handle contractions, after removal of the backwards
2065 contraction
2066 */
2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, d ata, status);
2069 } else {
/* Code units up to U+00FF are looked up in the latinOneMapping table;
   everything else goes through the tailoring trie. */
2070 if (ch <= 0xFF) {
2071 result = coll->latinOneMapping[ch];
2072 }
2073 else {
2074 // Always use UCA for [3400..9FFF], [AC00..D7AF]
2075 // **** [FA0E..FA2F] ?? ****
2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2077 (ch >= 0x3400 && ch <= 0xD7AF)) {
2078 if (ch > 0x9FFF && ch < 0xAC00) {
2079 // between the two target ranges; do normal lookup
2080 // **** this range is YI, Modifier tone letters, *** *
2081 // **** Latin-D, Syloti Nagari, Phagas-pa. *** *
2082 // **** Latin-D might be tailored, so we need to *** *
2083 // **** do the normal lookup for these guys. *** *
2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2085 } else {
2086 result = UCOL_NOT_FOUND;
2087 }
2088 } else {
2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2090 }
2091 }
/* Values above UCOL_NOT_FOUND are special CEs that need further processing. */
2092 if (result > UCOL_NOT_FOUND) {
2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, s tatus);
2094 }
2095 if (result == UCOL_NOT_FOUND) { // Not found in master list
/* NOTE(review): this test passes data->coll where the one above passes
   coll - confirm the two are always the same collator here. */
2096 if (!isAtStartPrevIterate(data) &&
2097 ucol_contractionEndCP(ch, data->coll))
2098 {
2099 result = UCOL_CONTRACTION;
2100 } else {
2101 if(coll->UCA) {
2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2103 }
2104 }
2105
/* Special result from the fallback: resolve it against coll->UCA when
   that collator is present. */
2106 if (result > UCOL_NOT_FOUND) {
2107 if(coll->UCA) {
2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, re sult, data, status);
2109 }
2110 }
2111 }
2112 }
/* Repeat with the next code unit back while the result is UCOL_IGNORABLE
   and ch lies in UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL. */
2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= U COL_LAST_HANGUL );
2114
/* Still not found after all lookups: produce an implicit CE. */
2115 if(result == UCOL_NOT_FOUND) {
2116 result = getPrevImplicit(ch, data);
2117 }
2118 }
2119
2120 return result;
2121 }
2122
2123
2124 /* ucol_getPrevCE, out-of-line version for use from other files. */
2125 U_CFUNC uint32_t U_EXPORT2
2126 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2127 UErrorCode *status) {
2128 return ucol_IGetPrevCE(coll, data, status);
2129 }
2130
2131
2132 /* this should be connected to special Jamo handling */
2133 U_CFUNC uint32_t U_EXPORT2
2134 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2135 collIterate colIt;
2136 IInit_collIterate(coll, &u, 1, &colIt, status);
2137 if(U_FAILURE(*status)) {
2138 return 0;
2139 }
2140 return ucol_IGetNextCE(coll, &colIt, status);
2141 }
2142
2143 /**
2144 * Inserts the argument character into the end of the buffer pushing back the
2145 * null terminator.
2146 * @param data collIterate struct data
2147 * @param ch character to be appended
2148 * @return the position of the new addition
2149 */
2150 static
2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2152 {
2153 int32_t oldLength = data->writableBuffer.length();
2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2155 }
2156
2157 /**
2158 * Inserts the argument string into the end of the buffer pushing back the
2159 * null terminator.
2160 * @param data collIterate struct data
2161 * @param string to be appended
2162 * @param length of the string to be appended
2163 * @return the position of the new addition
2164 */
2165 static
2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_ t length)
2167 {
2168 int32_t oldLength = data->writableBuffer.length();
2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL ength;
2170 }
2171
2172 /**
2173 * Special normalization function for contraction in the forwards iterator.
2174 * This normalization sequence will place the current character at source->pos
2175 * and its following normalized sequence into the buffer.
2176 * The fcd position, pos will be changed.
2177 * pos will now point to positions in the buffer.
2178 * Flags will be changed accordingly.
2179 * @param data collation iterator data
2180 */
2181 static
2182 inline void normalizeNextContraction(collIterate *data)
2183 {
2184 int32_t strsize;
2185 UErrorCode status = U_ZERO_ERROR;
2186 /* because the pointer points to the next character */
2187 const UChar *pStart = data->pos - 1;
2188 const UChar *pEnd;
2189
2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2191 data->writableBuffer.setTo(*(pStart - 1));
2192 strsize = 1;
2193 }
2194 else {
2195 strsize = data->writableBuffer.length();
2196 }
2197
2198 pEnd = data->fcdPosition;
2199
2200 data->writableBuffer.append(
2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)), status));
2202 if(U_FAILURE(status)) {
2203 return;
2204 }
2205
2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
2207 data->origFlags = data->flags;
2208 data->flags |= UCOL_ITER_INNORMBUF;
2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2210 }
2211
2212 /**
2213 * Contraction character management function that returns the next character
2214 * for the forwards iterator.
2215 * Does nothing if the next character is in buffer and not the first character
2216 * in it.
2217 * Else it checks next character in data string to see if it is normalizable.
2218 * If it is not, the character is simply copied into the buffer, else
2219 * the whole normalized substring is copied into the buffer, including the
2220 * current character.
2221 * @param data collation element iterator data
2222 * @return next character
2223 */
2224 static
2225 inline UChar getNextNormalizedChar(collIterate *data)
2226 {
2227 UChar nextch;
2228 UChar ch;
2229 // Here we need to add the iterator code. One problem is the way
2230 // end of string is handled. If we just return next char, it could
2231 // be the sentinel. Most of the cases already check for this, but we
2232 // need to be sure.
2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2234 /* if no normalization and not in buffer. */
2235 if(data->flags & UCOL_USE_ITERATOR) {
2236 return (UChar)data->iterator->next(data->iterator);
2237 } else {
2238 return *(data->pos ++);
2239 }
2240 }
2241
2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2243 //normalizeIterator(data);
2244 //}
2245
2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2247 if ((innormbuf && *data->pos != 0) ||
2248 (data->fcdPosition != NULL && !innormbuf &&
2249 data->pos < data->fcdPosition)) {
2250 /*
2251 if next character is in normalized buffer, no further normalization
2252 is required
2253 */
2254 return *(data->pos ++);
2255 }
2256
2257 if (data->flags & UCOL_ITER_HASLEN) {
2258 /* in data string */
2259 if (data->pos + 1 == data->endp) {
2260 return *(data->pos ++);
2261 }
2262 if (data->pos >= data->endp) {
2263 return (UChar) -1; // return U+FFFF (non-char) to indicate an error
2264 }
2265 }
2266 else {
2267 if (innormbuf) {
2268 // inside the normalization buffer, but at the end
2269 // (since we encountered zero). This means, in the
2270 // case we're using char iterator, that we need to
2271 // do another round of normalization.
2272 //if(data->origFlags & UCOL_USE_ITERATOR) {
2273 // we need to restore original flags,
2274 // otherwise, we'll lose them
2275 //data->flags = data->origFlags;
2276 //normalizeIterator(data);
2277 //return *(data->pos++);
2278 //} else {
2279 /*
2280 in writable buffer, at this point fcdPosition can not be
2281 pointing to the end of the data string. see contracting tag.
2282 */
2283 if(data->fcdPosition) {
2284 if (*(data->fcdPosition + 1) == 0 ||
2285 data->fcdPosition + 1 == data->endp) {
2286 /* at the end of the string, dump it into the normalizer */
2287 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2288 // Check if data->pos received a null pointer
2289 if (data->pos == NULL) {
2290 return (UChar)-1; // Return to indicate error.
2291 }
2292 return *(data->fcdPosition ++);
2293 }
2294 data->pos = data->fcdPosition;
2295 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2296 // if we are here, we're using a normalizing iterator.
2297 // we should just continue further.
2298 data->flags = data->origFlags;
2299 data->pos = NULL;
2300 return (UChar)data->iterator->next(data->iterator);
2301 }
2302 //}
2303 }
2304 else {
2305 if (*(data->pos + 1) == 0) {
2306 return *(data->pos ++);
2307 }
2308 }
2309 }
2310
2311 ch = *data->pos ++;
2312 nextch = *data->pos;
2313
2314 /*
2315 * if the current character is not fcd.
2316 * Trailing combining class == 0.
2317 */
2318 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2319 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2320 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2321 /*
2322 Need a more complete FCD check and possible normalization.
2323 normalize substring will be appended to buffer
2324 */
2325 if (collIterFCD(data)) {
2326 normalizeNextContraction(data);
2327 return *(data->pos ++);
2328 }
2329 else if (innormbuf) {
2330 /* fcdposition shifted even when there's no normalization, if we
2331 don't input the rest into this, we'll get the wrong position when
2332 we reach the end of the writableBuffer */
2333 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2334 data->pos = insertBufferEnd(data, data->pos - 1, length);
2335 // Check if data->pos received a null pointer
2336 if (data->pos == NULL) {
2337 return (UChar)-1; // Return to indicate error.
2338 }
2339 return *(data->pos ++);
2340 }
2341 }
2342
2343 if (innormbuf) {
2344 /*
2345 no normalization is to be done hence only one character will be
2346 appended to the buffer.
2347 */
2348 data->pos = insertBufferEnd(data, ch) + 1;
2349 // Check if data->pos received a null pointer
2350 if (data->pos == NULL) {
2351 return (UChar)-1; // Return to indicate error.
2352 }
2353 }
2354
2355 /* points back to the pos in string */
2356 return ch;
2357 }
2358
2359
2360
2361 /**
2362 * Function to copy the buffer into writableBuffer and sets the fcd position to
2363 * the correct position
2364 * @param source data string source
2365 * @param buffer character buffer
2366 */
2367 static
2368 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b uffer)
2369 {
2370 /* okay confusing part here. to ensure that the skipped characters are
2371 considered later, we need to place it in the appropriate position in the
2372 normalization buffer and reassign the pos pointer. simple case if pos
2373 reside in string, simply copy to normalization buffer and
2374 fcdposition = pos, pos = start of normalization buffer. if pos in
2375 normalization buffer, we'll insert the copy infront of pos and point pos
2376 to the start of the normalization buffer. why am i doing these copies?
2377 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial CE does
2378 not require any changes, which be really painful. */
2379 if (source->flags & UCOL_ITER_INNORMBUF) {
2380 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer() ;
2381 source->writableBuffer.replace(0, replaceLength, buffer);
2382 }
2383 else {
2384 source->fcdPosition = source->pos;
2385 source->origFlags = source->flags;
2386 source->flags |= UCOL_ITER_INNORMBUF;
2387 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_IT ERATOR);
2388 source->writableBuffer = buffer;
2389 }
2390
2391 source->pos = source->writableBuffer.getTerminatedBuffer();
2392 }
2393
2394 /**
2395 * Function to get the discontiguos collation element within the source.
2396 * Note this function will set the position to the appropriate places.
 * Characters that do not take part in the match are collected in a local
 * buffer and, on a successful match, handed to setDiscontiguosAttribute so
 * that they are processed afterwards. If no match is found the iterator state
 * saved on entry is restored and the CE stored for constart is returned.
2397 * @param coll current collator used
2398 * @param source data string source
2399 * @param constart index to the start character in the contraction table
2400 * @return discontiguos collation element offset
2401 */
2402 static
2403 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2404 const UChar *constart)
2405 {
2406 /* source->pos currently points to the second combining character after
2407 the start character */
2408 const UChar *temppos = source->pos;
2409 UnicodeString buffer;
2410 const UChar *tempconstart = constart;
2411 uint8_t tempflags = source->flags;
2412 UBool multicontraction = FALSE;
2413 collIterateState discState;
2414
/* Save the iterator state so it can be restored if nothing matches. */
2415 backupState(source, &discState);
2416
/* Seed the skipped-character buffer with the code point just before the
   current position. */
2417 buffer.setTo(peekCodePoint(source, -1));
2418 for (;;) {
2419 UChar *UCharOffset;
2420 UChar schar,
2421 tchar;
2422 uint32_t result;
2423
/* Stop scanning when the text is exhausted or the next character has
   combining class 0. */
2424 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2425 || (peekCodeUnit(source, 0) == 0 &&
2426 //|| (*source->pos == 0 &&
2427 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2428 source->fcdPosition == NULL ||
2429 source->fcdPosition == source->endp ||
2430 *(source->fcdPosition) == 0 ||
2431 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2432 /* end of string in null terminated string or stopped by a
2433 null character, note fcd does not always point to a base
2434 character after the discontiguos change */
2435 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2436 //u_getCombiningClass(*(source->pos)) == 0) {
2437 //constart = (UChar *)coll->image + getContractOffset(CE);
/* A shorter contraction was matched on the way: go back to the position
   recorded for it, queue the skipped characters, and return the CE stored
   for the current table start. */
2438 if (multicontraction) {
2439 source->pos = temppos - 1;
2440 setDiscontiguosAttribute(source, buffer);
2441 return *(coll->contractionCEs +
2442 (tempconstart - coll->contractionIndex));
2443 }
2444 constart = tempconstart;
2445 break;
2446 }
2447
2448 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2449 schar = getNextNormalizedChar(source);
2450
/* Advance through the table entries until one is >= schar.
   NOTE(review): assumes the entries are in ascending order and the list is
   terminated by a value no smaller than any schar - confirm. */
2451 while (schar > (tchar = *UCharOffset)) {
2452 UCharOffset++;
2453 }
2454
2455 if (schar != tchar) {
2456 /* not the correct codepoint. we stuff the current codepoint into
2457 the discontiguos buffer and try the next character */
2458 buffer.append(schar);
2459 continue;
2460 }
2461 else {
/* Table match, but schar has the same combining class as the code point
   returned by peekCodePoint(source, -2): treat it as skipped as well.
   NOTE(review): presumably the canonical "blocked" condition - confirm. */
2462 if (u_getCombiningClass(schar) ==
2463 u_getCombiningClass(peekCodePoint(source, -2))) {
2464 buffer.append(schar);
2465 continue;
2466 }
2467 result = *(coll->contractionCEs +
2468 (UCharOffset - coll->contractionIndex));
2469 }
2470
2471 if (result == UCOL_NOT_FOUND) {
2472 break;
2473 } else if (isContraction(result)) {
2474 /* this is a multi-contraction*/
2475 tempconstart = (UChar *)coll->image + getContractOffset(result);
/* Only remember this point as a fallback when the table at constart has a
   CE of its own (other than UCOL_NOT_FOUND). */
2476 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2477 != UCOL_NOT_FOUND) {
2478 multicontraction = TRUE;
2479 temppos = source->pos + 1;
2480 }
2481 } else {
/* Complete match: queue the skipped characters and return its CE. */
2482 setDiscontiguosAttribute(source, buffer);
2483 return result;
2484 }
2485 }
2486
2487 /* no problems simply reverting just like that,
2488 if we are in string before getting into this function, points back to
2489 string hence no problem.
2490 if we are in normalization buffer before getting into this function,
2491 since we'll never use another normalization within this function, we
2492 know that fcdposition points to a base character. the normalization buffer
2493 never change, hence this revert works. */
2494 loadState(source, &discState, TRUE);
2495 goBackOne(source);
2496
2497 //source->pos = temppos - 1;
2498 source->flags = tempflags;
2499 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2500 }
2501
2502 /* now uses Mark's getImplicitPrimary code */
2503 static
2504 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2505 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2506 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2507 collationSource->offsetRepeatCount += 1;
2508 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2509 }
2510
2511 /**
2512 * Inserts the argument character into the front of the buffer replacing the
2513 * front null terminator.
2514 * @param data collation element iterator data
2515 * @param ch character to be appended
2516 */
2517 static
2518 inline void insertBufferFront(collIterate *data, UChar ch)
2519 {
2520 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer minatedBuffer() + 2;
2521 }
2522
2523 /**
2524 * Special normalization function for contraction in the previous iterator.
2525 * This normalization sequence will place the current character at source->pos
2526 * and its following normalized sequence into the buffer.
2527 * The fcd position, pos will be changed.
2528 * pos will now point to positions in the buffer.
2529 * Flags will be changed accordingly.
2530 * @param data collation iterator data
2531 */
2532 static
2533 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2534 {
2535 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2536 const UChar *pStart;
2537
2538 UnicodeString endOfBuffer;
2539 if (data->flags & UCOL_ITER_HASLEN) {
2540 /*
2541 normalization buffer not used yet, we'll pull down the next
2542 character into the end of the buffer
2543 */
2544 endOfBuffer.setTo(*pEnd);
2545 }
2546 else {
2547 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
2548 }
2549
2550 if (data->fcdPosition == NULL) {
2551 pStart = data->string;
2552 }
2553 else {
2554 pStart = data->fcdPosition + 1;
2555 }
2556 int32_t normLen =
2557 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar t)),
2558 data->writableBuffer,
2559 *status).
2560 length();
2561 if(U_FAILURE(*status)) {
2562 return;
2563 }
2564 /*
2565 this puts the null termination infront of the normalized string instead
2566 of the end
2567 */
2568 data->pos =
2569 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat edBuffer() +
2570 1 + normLen;
2571 data->origFlags = data->flags;
2572 data->flags |= UCOL_ITER_INNORMBUF;
2573 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2574 }
2575
2576 /**
2577 * Contraction character management function that returns the previous character
2578 * for the backwards iterator.
2579 * Does nothing if the previous character is in buffer and not the first
2580 * character in it.
2581 * Else it checks previous character in data string to see if it is
2582 * normalizable.
2583 * If it is not, the character is simply copied into the buffer, else
2584 * the whole normalized substring is copied into the buffer, including the
2585 * current character.
2586 * @param data collation element iterator data
 * @param status error code, passed on to normalizePrevContraction
2587 * @return previous character
2588 */
2589 static
2590 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2591 {
2592 UChar prevch;
2593 UChar ch;
2594 const UChar *start;
2595 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2596 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2597 (innormbuf && *(data->pos - 1) != 0)) {
2598 /*
2599 if no normalization.
2600 if previous character is in normalized buffer, no further normalization
2601 is required
2602 */
/* Iterator case: step back one unit and read it again with next().
   NOTE(review): presumably this leaves the iterator where it started - confirm. */
2603 if(data->flags & UCOL_USE_ITERATOR) {
2604 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2605 return (UChar)data->iterator->next(data->iterator);
2606 } else {
2607 return *(data->pos - 1);
2608 }
2609 }
2610
2611 start = data->pos;
2612 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2613 /* in data string */
/* The previous unit is the very first one of the string: return it as is. */
2614 if ((start - 1) == data->string) {
2615 return *(start - 1);
2616 }
2617 start --;
2618 ch = *start;
2619 prevch = *(start - 1);
2620 }
2621 else {
2622 /*
2623 in writable buffer, at this point fcdPosition can not be NULL.
2624 see contracting tag.
2625 */
2626 if (data->fcdPosition == data->string) {
2627 /* at the start of the string, just dump it into the normalizer */
2628 insertBufferFront(data, *(data->fcdPosition));
2629 data->fcdPosition = NULL;
2630 return *(data->pos - 1);
2631 }
/* Continue in the original string at fcdPosition. */
2632 start = data->fcdPosition;
2633 ch = *start;
2634 prevch = *(start - 1);
2635 }
2636 /*
2637 * if the current character is not fcd.
2638 * Trailing combining class == 0.
2639 */
2640 if (data->fcdPosition > start &&
2641 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2642 {
2643 /*
2644 Need a more complete FCD check and possible normalization.
2645 normalize substring will be appended to buffer
2646 */
/* collPrevIterFCD works from data->pos, so point it at start for the check
   and restore the saved position if no normalization is needed. */
2647 const UChar *backuppos = data->pos;
2648 data->pos = start;
2649 if (collPrevIterFCD(data)) {
2650 normalizePrevContraction(data, status);
2651 return *(data->pos - 1);
2652 }
2653 data->pos = backuppos;
2654 data->fcdPosition ++;
2655 }
2656
2657 if (innormbuf) {
2658 /*
2659 no normalization is to be done hence only one character will be
2660 appended to the buffer.
2661 */
2662 insertBufferFront(data, ch);
2663 data->fcdPosition --;
2664 }
2665
2666 return ch;
2667 }
2668
2669 /* This function handles the special CEs like contractions, expansions, surrogat es, Thai */
2670 /* It is called by getNextCE */
2671
2672 /* The following should be even */
2673 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2674
2675 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col lIterate *source, UErrorCode *status) {
2676 collIterateState entryState;
2677 backupState(source, &entryState);
2678 UChar32 cp = ch;
2679
2680 for (;;) {
2681 // This loop will repeat only in the case of contractions, and only when a contraction
2682 // is found and the first CE resulting from that contraction is itself a special
2683 // (an expansion, for example.) All other special CE types are fully handled the
2684 // first time through, and the loop exits.
2685
2686 const uint32_t *CEOffset = NULL;
2687 switch(getCETag(CE)) {
2688 case NOT_FOUND_TAG:
2689 /* This one is not found, and we'll let somebody else bother about i t... no more games */
2690 return CE;
2691 case SPEC_PROC_TAG:
2692 {
2693 // Special processing is getting a CE that is preceded by a cert ain prefix
2694 // Currently this is only needed for optimizing Japanese length and iteration marks.
2695 // When we encouter a special processing tag, we go backwards an d try to see if
2696 // we have a match.
2697 // Contraction tables are used - so the whole process is not unl ike contraction.
2698 // prefix data is stored backwards in the table.
2699 const UChar *UCharOffset;
2700 UChar schar, tchar;
2701 collIterateState prefixState;
2702 backupState(source, &prefixState);
2703 loadState(source, &entryState, TRUE);
2704 goBackOne(source); // We want to look at the point where we ente red - actually one
2705 // before that...
2706
2707 for(;;) {
2708 // This loop will run once per source string character, for as long as we
2709 // are matching a potential contraction sequence
2710
2711 // First we position ourselves at the begining of contractio n sequence
2712 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
2713 if (collIter_bos(source)) {
2714 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
2715 break;
2716 }
2717 schar = getPrevNormalizedChar(source, status);
2718 goBackOne(source);
2719
2720 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
2721 UCharOffset++;
2722 }
2723
2724 if (schar == tchar) {
2725 // Found the source string char in the table.
2726 // Pick up the corresponding CE from the table.
2727 CE = *(coll->contractionCEs +
2728 (UCharOffset - coll->contractionIndex));
2729 }
2730 else
2731 {
2732 // Source string char was not in the table.
2733 // We have not found the prefix.
2734 CE = *(coll->contractionCEs +
2735 (ContractionStart - coll->contractionIndex));
2736 }
2737
2738 if(!isPrefix(CE)) {
2739 // The source string char was in the contraction table, and the corresponding
2740 // CE is not a prefix CE. We found the prefix, break
2741 // out of loop, this CE will end up being returned. T his is the normal
2742 // way out of prefix handling when the source actually contained
2743 // the prefix.
2744 break;
2745 }
2746 }
2747 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri lly continue
2748 loadState(source, &prefixState, TRUE);
2749 if(source->origFlags & UCOL_USE_ITERATOR) {
2750 source->flags = source->origFlags;
2751 }
2752 } else { // prefix search was a failure, we have to backup all t he way to the start
2753 loadState(source, &entryState, TRUE);
2754 }
2755 break;
2756 }
2757 case CONTRACTION_TAG:
2758 {
2759 /* This should handle contractions */
2760 collIterateState state;
2761 backupState(source, &state);
2762 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->imag e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2763 const UChar *UCharOffset;
2764 UChar schar, tchar;
2765
2766 for (;;) {
2767 /* This loop will run once per source string character, for as long as we */
2768 /* are matching a potential contraction sequence */
2769
2770 /* First we position ourselves at the begining of contractio n sequence */
2771 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
2772
2773 if (collIter_eos(source)) {
2774 // Ran off the end of the source string.
2775 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
2776 // So we'll pick whatever we have at the point...
2777 if (CE == UCOL_NOT_FOUND) {
2778 // back up the source over all the chars we scanned going into this contraction.
2779 CE = firstCE;
2780 loadState(source, &state, TRUE);
2781 if(source->origFlags & UCOL_USE_ITERATOR) {
2782 source->flags = source->origFlags;
2783 }
2784 }
2785 break;
2786 }
2787
2788 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the di scontiguos stuff */ /* skip the backward offset, see above */
2789 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2790
2791 schar = getNextNormalizedChar(source);
2792 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
2793 UCharOffset++;
2794 }
2795
2796 if (schar == tchar) {
2797 // Found the source string char in the contraction table .
2798 // Pick up the corresponding CE from the table.
2799 CE = *(coll->contractionCEs +
2800 (UCharOffset - coll->contractionIndex));
2801 }
2802 else
2803 {
2804 // Source string char was not in contraction table.
2805 // Unless we have a discontiguous contraction, we have finished
2806 // with this contraction.
2807 // in order to do the proper detection, we
2808 // need to see if we're dealing with a supplementary
2809 /* We test whether the next two char are surrogate pairs .
2810 * This test is done if the iterator is not NULL.
2811 * If there is no surrogate pair, the iterator
2812 * goes back one if needed. */
2813 UChar32 miss = schar;
2814 if (source->iterator) {
2815 UChar32 surrNextChar; /* the next char in the iterat ion to test */
2816 int32_t prevPos; /* holds the previous position befo re move forward of the source iterator */
2817 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s ource->iterator)) {
2818 prevPos = source->iterator->index;
2819 surrNextChar = getNextNormalizedChar(source);
2820 if (U16_IS_TRAIL(surrNextChar)) {
2821 miss = U16_GET_SUPPLEMENTARY(schar, surrNext Char);
2822 } else if (prevPos < source->iterator->index){
2823 goBackOne(source);
2824 }
2825 }
2826 } else if (U16_IS_LEAD(schar) && source->pos + 1 < sourc e->endp) {
2827 const UChar* prevPos = source->pos;
2828 UChar nextChar = getNextNormalizedChar(source);
2829 if (U16_IS_TRAIL(nextChar)) {
2830 miss = U16_GET_SUPPLEMENTARY(schar, nextChar);
2831 } else if (prevPos < source->pos) {
2832 goBackOne(source);
2833 }
2834 }
2835
2836 uint8_t sCC;
2837 if (miss < 0x300 ||
2838 maxCC == 0 ||
2839 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2840 sCC>maxCC ||
2841 (allSame != 0 && sCC == maxCC) ||
2842 collIter_eos(source))
2843 {
2844 // Contraction can not be discontiguous.
2845 goBackOne(source); // back up the source string by one,
2846 // because the character we just looked at was
2847 // not part of the contraction. */
2848 if(U_IS_SUPPLEMENTARY(miss)) {
2849 goBackOne(source);
2850 }
2851 CE = *(coll->contractionCEs +
2852 (ContractionStart - coll->contractionIndex));
2853 } else {
2854 //
2855 // Contraction is possibly discontiguous.
2856 // Scan more of source string looking for a match
2857 //
2858 UChar tempchar;
2859 /* find the next character if schar is not a base ch aracter
2860 and we are not yet at the end of the string */
2861 tempchar = getNextNormalizedChar(source);
2862 // probably need another supplementary thingie here
2863 goBackOne(source);
2864 if (i_getCombiningClass(tempchar, coll) == 0) {
2865 goBackOne(source);
2866 if(U_IS_SUPPLEMENTARY(miss)) {
2867 goBackOne(source);
2868 }
2869 /* Spit out the last char of the string, wasn't tasty enough */
2870 CE = *(coll->contractionCEs +
2871 (ContractionStart - coll->contractionIndex)) ;
2872 } else {
2873 CE = getDiscontiguous(coll, source, ContractionS tart);
2874 }
2875 }
2876 } // else after if(schar == tchar)
2877
2878 if(CE == UCOL_NOT_FOUND) {
2879 /* The Source string did not match the contraction that we were checking. */
2880 /* Back up the source position to undo the effects of h aving partially */
2881 /* scanned through what ultimately proved to not be a contraction. */
2882 loadState(source, &state, TRUE);
2883 CE = firstCE;
2884 break;
2885 }
2886
2887 if(!isContraction(CE)) {
2888 // The source string char was in the contraction table, and the corresponding
2889 // CE is not a contraction CE. We completed the contr action, break
2890 // out of loop, this CE will end up being returned. T his is the normal
2891 // way out of contraction handling when the source act ually contained
2892 // the contraction.
2893 break;
2894 }
2895
2896
2897 // The source string char was in the contraction table, and the corresponding
2898 // CE is IS a contraction CE. We will continue looping t o check the source
2899 // string for the remaining chars in the contraction.
2900 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2901 if(tempCE != UCOL_NOT_FOUND) {
2902 // We have scanned a a section of source string for whic h there is a
2903 // CE from the contraction table. Remember the CE and scan position, so
2904 // that we can return to this point if further scanning fails to
2905 // match a longer contraction sequence.
2906 firstCE = tempCE;
2907
2908 goBackOne(source);
2909 backupState(source, &state);
2910 getNextNormalizedChar(source);
2911
2912 // Another way to do this is:
2913 //collIterateState tempState;
2914 //backupState(source, &tempState);
2915 //goBackOne(source);
2916 //backupState(source, &state);
2917 //loadState(source, &tempState, TRUE);
2918
2919 // The problem is that for incomplete contractions we ha ve to remember the previous
2920 // position. Before, the only thing I needed to do was s tate.pos--;
2921 // After iterator introduction and especially after intr oduction of normalizing
2922 // iterators, it became much more difficult to decrease the saved state.
2923 // I'm not yet sure which of the two methods above is fa ster.
2924 }
2925 } // for(;;)
2926 break;
2927 } // case CONTRACTION_TAG:
2928 case LONG_PRIMARY_TAG:
2929 {
2930 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2931 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYT E_COMMON;
2932 source->offsetRepeatCount += 1;
2933 return CE;
2934 }
2935 case EXPANSION_TAG:
2936 {
2937 /* This should handle expansion. */
2938 /* NOTE: we can encounter both continuations and expansions in a n expansion! */
2939 /* I have to decide where continuations are going to be dealt wi th */
2940 uint32_t size;
2941 uint32_t i; /* general counter */
2942
2943 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* fi nd the offset to expansion table */
2944 size = getExpansionCount(CE);
2945 CE = *CEOffset++;
2946 //source->offsetRepeatCount = -1;
2947
2948 if(size != 0) { /* if there are less than 16 elements in expansi on, we don't terminate */
2949 for(i = 1; i<size; i++) {
2950 *(source->CEpos++) = *CEOffset++;
2951 source->offsetRepeatCount += 1;
2952 }
2953 } else { /* else, we do */
2954 while(*CEOffset != 0) {
2955 *(source->CEpos++) = *CEOffset++;
2956 source->offsetRepeatCount += 1;
2957 }
2958 }
2959
2960 return CE;
2961 }
2962 case DIGIT_TAG:
2963 {
2964 /*
2965 We do a check to see if we want to collate digits as numbers; if so we generate
2966 a custom collation key. Otherwise we pull out the value stored i n the expansion table.
2967 */
2968 //uint32_t size;
2969 uint32_t i; /* general counter */
2970
2971 if (source->coll->numericCollation == UCOL_ON){
2972 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2973 UChar32 char32 = 0;
2974 int32_t digVal = 0;
2975
2976 uint32_t digIndx = 0;
2977 uint32_t endIndex = 0;
2978 uint32_t trailingZeroIndex = 0;
2979
2980 uint8_t collateVal = 0;
2981
2982 UBool nonZeroValReached = FALSE;
2983
2984 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j ust need a temporary place to store my generated CEs.
2985 /*
2986 We parse the source string until we hit a char that's N OT a digit.
2987 Use this u_charDigitValue. This might be slow because we have to
2988 handle surrogates...
2989 */
2990 /*
2991 if (U16_IS_LEAD(ch)){
2992 if (!collIter_eos(source)) {
2993 backupState(source, &digitState);
2994 UChar trail = getNextNormalizedChar(source);
2995 if(U16_IS_TRAIL(trail)) {
2996 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2997 } else {
2998 loadState(source, &digitState, TRUE);
2999 char32 = ch;
3000 }
3001 } else {
3002 char32 = ch;
3003 }
3004 } else {
3005 char32 = ch;
3006 }
3007 digVal = u_charDigitValue(char32);
3008 */
3009 digVal = u_charDigitValue(cp); // if we have arrived here, w e have
3010 // already processed possible supplementaries that trigered the digit tag -
3011 // all supplementaries are marked in the UCA.
3012 /*
3013 We pad a zero in front of the first element anyways. Th is takes
3014 care of the (probably) most common case where people are sorting things followed
3015 by a single digit
3016 */
3017 digIndx++;
3018 for(;;){
3019 // Make sure we have enough space. No longer needed;
3020 // at this point digIndx now has a max value of UCOL_MAX _DIGITS_FOR_NUMBER
3021 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3022 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3023
3024 // Skipping over leading zeroes.
3025 if (digVal != 0) {
3026 nonZeroValReached = TRUE;
3027 }
3028 if (nonZeroValReached) {
3029 /*
3030 We parse the digit string into base 100 numbers (thi s fits into a byte).
3031 We only add to the buffer in twos, thus if we are pa rsing an odd character,
3032 that serves as the 'tens' digit while the if we are parsing an even one, that
3033 is the 'ones' digit. We dumped the parsed base 100 v alue (collateVal) into
3034 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3035 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3036 than all the other bytes.
3037 */
3038
3039 if (digIndx % 2 == 1){
3040 collateVal += (uint8_t)digVal;
3041
3042 // We don't enter the low-order-digit case unles s we've already seen
3043 // the high order, or for the first digit, which is always non-zero.
3044 if (collateVal != 0)
3045 trailingZeroIndex = 0;
3046
3047 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3048 collateVal = 0;
3049 }
3050 else{
3051 // We drop the collation value into the buffer s o if we need to do
3052 // a "front patch" we don't have to check to see if we're hitting the
3053 // last element.
3054 collateVal = (uint8_t)(digVal * 10);
3055
3056 // Check for trailing zeroes.
3057 if (collateVal == 0)
3058 {
3059 if (!trailingZeroIndex)
3060 trailingZeroIndex = (digIndx/2) + 2;
3061 }
3062 else
3063 trailingZeroIndex = 0;
3064
3065 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3066 }
3067 digIndx++;
3068 }
3069
3070 // Get next character.
3071 if (!collIter_eos(source)){
3072 ch = getNextNormalizedChar(source);
3073 if (U16_IS_LEAD(ch)){
3074 if (!collIter_eos(source)) {
3075 backupState(source, &digitState);
3076 UChar trail = getNextNormalizedChar(source);
3077 if(U16_IS_TRAIL(trail)) {
3078 char32 = U16_GET_SUPPLEMENTARY(ch, trail );
3079 } else {
3080 loadState(source, &digitState, TRUE);
3081 char32 = ch;
3082 }
3083 }
3084 } else {
3085 char32 = ch;
3086 }
3087
3088 if ((digVal = u_charDigitValue(char32)) == -1 || dig Indx > UCOL_MAX_DIGITS_FOR_NUMBER){
3089 // Resetting position to point to the next unpro cessed char. We
3090 // overshot it when doing our test/set for numbe rs.
3091 if (char32 > 0xFFFF) { // For surrogates.
3092 loadState(source, &digitState, TRUE);
3093 //goBackOne(source);
3094 }
3095 goBackOne(source);
3096 break;
3097 }
3098 } else {
3099 break;
3100 }
3101 }
3102
3103 if (nonZeroValReached == FALSE){
3104 digIndx = 2;
3105 numTempBuf[2] = 6;
3106 }
3107
3108 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx /2) + 2) ;
3109 if (digIndx % 2 != 0){
3110 /*
3111 We missed a value. Since digIndx isn't even, stuck too m any values into the buffer (this is what
3112 we get for padding the first byte with a zero). "Front-p atch" now by pushing all nybbles forward.
3113 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3114 single pass and optimizes for strings with single digits . I'm just assuming that's the more common case.
3115 */
3116
3117 for(i = 2; i < endIndex; i++){
3118 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3119 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3120 }
3121 --digIndx;
3122 }
3123
3124 // Subtract one off of the last byte.
3125 numTempBuf[endIndex-1] -= 1;
3126
3127 /*
3128 We want to skip over the first two slots in the buffer. The first slot
3129 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3130 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3131 */
3132 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3133 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3134
3135 // Now transfer the collation key to our collIterate struct.
3136 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3137 //size = ((endIndex+1) & ~1)/2;
3138 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARY ORDERSHIFT) | //Primary weight
3139 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco ndary weight
3140 UCOL_BYTE_COMMON; // Tertiary weight.
3141 i = 2; // Reset the index into the buffer.
3142 while(i < endIndex)
3143 {
3144 uint32_t primWeight = numTempBuf[i++] << 8;
3145 if ( i < endIndex)
3146 primWeight |= numTempBuf[i++];
3147 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) | UCOL_CONTINUATION_MARKER;
3148 }
3149
3150 } else {
3151 // no numeric mode, we'll just switch to whatever we stashed and continue
3152 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); / * find the offset to expansion table */
3153 CE = *CEOffset++;
3154 break;
3155 }
3156 return CE;
3157 }
3158 /* various implicits optimization */
3159 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3160 /* UCA is filled with these. Tailorings are NOT_FOUND */
3161 return getImplicit(cp, source);
3162 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */
3163 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl icit
3164 return getImplicit(cp, source);
3165 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3166 {
3167 static const uint32_t
3168 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;
3169 //const uint32_t LCount = 19;
3170 static const uint32_t VCount = 21;
3171 static const uint32_t TCount = 28;
3172 //const uint32_t NCount = VCount * TCount; // 588
3173 //const uint32_t SCount = LCount * NCount; // 11172
3174 uint32_t L = ch - SBase;
3175
3176 // divide into pieces
3177
3178 uint32_t T = L % TCount; // we do it in this order since some co mpilers can do % and / in one operation
3179 L /= TCount;
3180 uint32_t V = L % VCount;
3181 L /= VCount;
3182
3183 // offset them
3184
3185 L += LBase;
3186 V += VBase;
3187 T += TBase;
3188
3189 // return the first CE, but first put the rest into the expansio n buffer
3190 if (!source->coll->image->jamoSpecial) { // FAST PATH
3191
3192 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );
3193 if (T != TBase) {
3194 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);
3195 }
3196
3197 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3198
3199 } else { // Jamo is Special
3200 // Since Hanguls pass the FCD check, it is
3201 // guaranteed that we won't be in
3202 // the normalization buffer if something like this happens
3203
3204 // However, if we are using a uchar iterator and normalizati on
3205 // is ON, the Hangul that lead us here is going to be in tha t
3206 // normalization buffer. Here we want to restore the uchar
3207 // iterator state and pull out of the normalization buffer
3208 if(source->iterator != NULL && source->flags & UCOL_ITER_INN ORMBUF) {
3209 source->flags = source->origFlags; // restore the iterat or
3210 source->pos = NULL;
3211 }
3212
3213 // Move Jamos into normalization buffer
3214 UChar *buffer = source->writableBuffer.getBuffer(4);
3215 int32_t bufferLength;
3216 buffer[0] = (UChar)L;
3217 buffer[1] = (UChar)V;
3218 if (T != TBase) {
3219 buffer[2] = (UChar)T;
3220 bufferLength = 3;
3221 } else {
3222 bufferLength = 2;
3223 }
3224 source->writableBuffer.releaseBuffer(bufferLength);
3225
3226 // Indicate where to continue in main input string after exh austing the writableBuffer
3227 source->fcdPosition = source->pos;
3228
3229 source->pos = source->writableBuffer.getTerminatedBuffer() ;
3230 source->origFlags = source->flags;
3231 source->flags |= UCOL_ITER_INNORMBUF;
3232 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3233
3234 return(UCOL_IGNORABLE);
3235 }
3236 }
3237 case SURROGATE_TAG:
3238 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3239 /* two things can happen here: next code point can be a trailing sur rogate - we will use it */
3240 /* to retrieve the CE, or it is not a trailing surrogate (or the str ing is done). In that case */
3241 /* we treat it like an unassigned code point. */
3242 {
3243 UChar trail;
3244 collIterateState state;
3245 backupState(source, &state);
3246 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNorma lizedChar(source))))) {
3247 // we chould have stepped one char forward and it might have turned that it
3248 // was not a trail surrogate. In that case, we have to backu p.
3249 loadState(source, &state, TRUE);
3250 return UCOL_NOT_FOUND;
3251 } else {
3252 /* TODO: CE contain the data from the previous CE + the mask . It should at least be unmasked */
3253 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF FF, trail);
3254 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3255 // We need to backup
3256 loadState(source, &state, TRUE);
3257 return CE;
3258 }
3259 // calculate the supplementary code point value, if surrogat e was not tailored
3260 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U L)+0xdc00-0x10000));
3261 }
3262 }
3263 break;
3264 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3265 UChar nextChar;
3266 if( source->flags & UCOL_USE_ITERATOR) {
3267 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source ->iterator))) {
3268 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3269 source->iterator->next(source->iterator);
3270 return getImplicit(cp, source);
3271 }
3272 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->po s<source->endp)) &&
3273 U_IS_TRAIL((nextChar=*source->pos))) {
3274 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3275 source->pos++;
3276 return getImplicit(cp, source);
3277 }
3278 return UCOL_NOT_FOUND;
3279 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3280 return UCOL_NOT_FOUND; /* broken surrogate sequence */
3281 case CHARSET_TAG:
3282 /* not yet implemented */
3283 /* probably after 1.8 */
3284 return UCOL_NOT_FOUND;
3285 default:
3286 *status = U_INTERNAL_PROGRAM_ERROR;
3287 CE=0;
3288 break;
3289 }
3290 if (CE <= UCOL_NOT_FOUND) break;
3291 }
3292 return CE;
3293 }
3294
3295
3296 /* now uses Mark's getImplicitPrimary code */
3297 static
3298 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3299 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3300
3301 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3302 collationSource->toReturn = collationSource->CEpos;
3303
3304 // **** doesn't work if using iterator ****
3305 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3306 collationSource->offsetRepeatCount = 1;
3307 } else {
3308 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource-> string);
3309
3310 UErrorCode errorCode = U_ZERO_ERROR;
3311 collationSource->appendOffset(firstOffset, errorCode);
3312 collationSource->appendOffset(firstOffset + 1, errorCode);
3313
3314 collationSource->offsetReturn = collationSource->offsetStore - 1;
3315 *(collationSource->offsetBuffer) = firstOffset;
3316 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3317 collationSource->offsetStore = collationSource->offsetBuffer;
3318 }
3319 }
3320
3321 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3322 }
3323
3324 /**
3325 * This function handles the special CEs like contractions, expansions,
3326 * surrogates, Thai.
3327 * It is called by getPrevCE.
3328 */
3329 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3330 collIterate *source,
3331 UErrorCode *status)
3332 {
3333 const uint32_t *CEOffset = NULL;
3334 UChar *UCharOffset = NULL;
3335 UChar schar;
3336 const UChar *constart = NULL;
3337 uint32_t size;
3338 UChar buffer[UCOL_MAX_BUFFER];
3339 uint32_t *endCEBuffer;
3340 UChar *strbuffer;
3341 int32_t noChars = 0;
3342 int32_t CECount = 0;
3343
3344 for(;;)
3345 {
3346 /* the only ces that loops are thai and contractions */
3347 switch (getCETag(CE))
3348 {
3349 case NOT_FOUND_TAG: /* this tag always returns */
3350 return CE;
3351
3352 case SPEC_PROC_TAG:
3353 {
3354 // Special processing is getting a CE that is preceded by a cert ain prefix
3355 // Currently this is only needed for optimizing Japanese length and iteration marks.
3356 // When we encouter a special processing tag, we go backwards an d try to see if
3357 // we have a match.
3358 // Contraction tables are used - so the whole process is not unl ike contraction.
3359 // prefix data is stored backwards in the table.
3360 const UChar *UCharOffset;
3361 UChar schar, tchar;
3362 collIterateState prefixState;
3363 backupState(source, &prefixState);
3364 for(;;) {
3365 // This loop will run once per source string character, for as long as we
3366 // are matching a potential contraction sequence
3367
3368 // First we position ourselves at the begining of contractio n sequence
3369 const UChar *ContractionStart = UCharOffset = (UChar *)coll- >image+getContractOffset(CE);
3370
3371 if (collIter_bos(source)) {
3372 CE = *(coll->contractionCEs + (UCharOffset - coll->contr actionIndex));
3373 break;
3374 }
3375 schar = getPrevNormalizedChar(source, status);
3376 goBackOne(source);
3377
3378 while(schar > (tchar = *UCharOffset)) { /* since the contrac tion codepoints should be ordered, we skip all that are smaller */
3379 UCharOffset++;
3380 }
3381
3382 if (schar == tchar) {
3383 // Found the source string char in the table.
3384 // Pick up the corresponding CE from the table.
3385 CE = *(coll->contractionCEs +
3386 (UCharOffset - coll->contractionIndex));
3387 }
3388 else
3389 {
3390 // if there is a completely ignorable code point in the middle of
3391 // a prefix, we need to act as if it's not there
3392 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0- fdef are set to zero)
3393 // lone surrogates cannot be set to zero as it would bre ak other processing
3394 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping , schar);
3395 // it's easy for BMP code points
3396 if(isZeroCE == 0) {
3397 continue;
3398 } else if(U16_IS_SURROGATE(schar)) {
3399 // for supplementary code points, we have to check t he next one
3400 // situations where we are going to ignore
3401 // 1. beginning of the string: schar is a lone surro gate
3402 // 2. schar is a lone surrogate
3403 // 3. schar is a trail surrogate in a valid surrogat e sequence
3404 // that is explicitly set to zero.
3405 if (!collIter_bos(source)) {
3406 UChar lead;
3407 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD( lead = getPrevNormalizedChar(source, status))) {
3408 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp ing, lead);
3409 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3410 uint32_t finalCE = UTRIE_GET32_FROM_OFFS ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3411 if(finalCE == 0) {
3412 // this is a real, assigned complete ly ignorable code point
3413 goBackOne(source);
3414 continue;
3415 }
3416 }
3417 } else {
3418 // lone surrogate, treat like unassigned
3419 return UCOL_NOT_FOUND;
3420 }
3421 } else {
3422 // lone surrogate at the beggining, treat like u nassigned
3423 return UCOL_NOT_FOUND;
3424 }
3425 }
3426 // Source string char was not in the table.
3427 // We have not found the prefix.
3428 CE = *(coll->contractionCEs +
3429 (ContractionStart - coll->contractionIndex));
3430 }
3431
3432 if(!isPrefix(CE)) {
3433 // The source string char was in the contraction table, and the corresponding
3434 // CE is not a prefix CE. We found the prefix, break
3435 // out of loop, this CE will end up being returned. T his is the normal
3436 // way out of prefix handling when the source actually contained
3437 // the prefix.
3438 break;
3439 }
3440 }
3441 loadState(source, &prefixState, TRUE);
3442 break;
3443 }
3444
3445 case CONTRACTION_TAG: {
3446 /* to ensure that the backwards and forwards iteration matches, we
3447 take the current region of most possible match and pass it through
3448 the forward iteration. this will ensure that the obstinate problem o f
3449 overlapping contractions will not occur.
3450 */
3451 schar = peekCodeUnit(source, 0);
3452 constart = (UChar *)coll->image + getContractOffset(CE);
3453 if (isAtStartPrevIterate(source)
3454 /* commented away contraction end checks after adding the checks
3455 in getPrevCE */) {
3456 /* start of string or this is not the end of any contraction */
3457 CE = *(coll->contractionCEs +
3458 (constart - coll->contractionIndex));
3459 break;
3460 }
3461 strbuffer = buffer;
3462 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3463 *(UCharOffset --) = 0;
3464 noChars = 0;
3465 // have to swap thai characters
3466 while (ucol_unsafeCP(schar, coll)) {
3467 *(UCharOffset) = schar;
3468 noChars++;
3469 UCharOffset --;
3470 schar = getPrevNormalizedChar(source, status);
3471 goBackOne(source);
3472 // TODO: when we exhaust the contraction buffer,
3473 // it needs to get reallocated. The problem is
3474 // that the size depends on the string which is
3475 // not iterated over. However, since we're travelling
3476 // backwards, we already had to set the iterator at
3477 // the end - so we might as well know where we are?
3478 if (UCharOffset + 1 == buffer) {
3479 /* we have exhausted the buffer */
3480 int32_t newsize = 0;
3481 if(source->pos) { // actually dealing with a position
3482 newsize = (int32_t)(source->pos - source->string + 1);
3483 } else { // iterator
3484 newsize = 4 * UCOL_MAX_BUFFER;
3485 }
3486 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3487 (newsize + UCOL_MAX_BUFFER));
3488 /* test for NULL */
3489 if (strbuffer == NULL) {
3490 *status = U_MEMORY_ALLOCATION_ERROR;
3491 return UCOL_NO_MORE_CES;
3492 }
3493 UCharOffset = strbuffer + newsize;
3494 uprv_memcpy(UCharOffset, buffer,
3495 UCOL_MAX_BUFFER * sizeof(UChar));
3496 UCharOffset --;
3497 }
3498 if ((source->pos && (source->pos == source->string ||
3499 ((source->flags & UCOL_ITER_INNORMBUF) &&
3500 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3501 || (source->iterator && !source->iterator->hasPrevious(sourc e->iterator))) {
3502 break;
3503 }
3504 }
3505 /* adds the initial base character to the string */
3506 *(UCharOffset) = schar;
3507 noChars++;
3508
3509 int32_t offsetBias;
3510
3511 // **** doesn't work if using iterator ****
3512 if (source->flags & UCOL_ITER_INNORMBUF) {
3513 offsetBias = -1;
3514 } else {
3515 offsetBias = (int32_t)(source->pos - source->string);
3516 }
3517
3518 /* a new collIterate is used to simplify things, since using the cur rent
3519 collIterate will mean that the forward and backwards iteration will
3520 share and change the same buffers. we don't want to get into that. * /
3521 collIterate temp;
3522 int32_t rawOffset;
3523
3524 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3525 if(U_FAILURE(*status)) {
3526 return (uint32_t)UCOL_NULLORDER;
3527 }
3528 temp.flags &= ~UCOL_ITER_NORM;
3529 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3530
3531 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z ero?
3532 CE = ucol_IGetNextCE(coll, &temp, status);
3533
3534 if (source->extendCEs) {
3535 endCEBuffer = source->extendCEs + source->extendCEsSize;
3536 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u int32_t));
3537 } else {
3538 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3539 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_ t));
3540 }
3541
3542 while (CE != UCOL_NO_MORE_CES) {
3543 *(source->CEpos ++) = CE;
3544
3545 if (offsetBias >= 0) {
3546 source->appendOffset(rawOffset + offsetBias, *status);
3547 }
3548
3549 CECount++;
3550 if (source->CEpos == endCEBuffer) {
3551 /* ran out of CE space, reallocate to new buffer.
3552 If reallocation fails, reset pointers and bail out,
3553 there's no guarantee of the right character position after
3554 this bail*/
3555 if (!increaseCEsCapacity(source)) {
3556 *status = U_MEMORY_ALLOCATION_ERROR;
3557 break;
3558 }
3559
3560 endCEBuffer = source->extendCEs + source->extendCEsSize;
3561 }
3562
3563 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3564 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3565 } else {
3566 rawOffset = (int32_t)(temp.pos - temp.string);
3567 }
3568
3569 CE = ucol_IGetNextCE(coll, &temp, status);
3570 }
3571
3572 if (strbuffer != buffer) {
3573 uprv_free(strbuffer);
3574 }
3575 if (U_FAILURE(*status)) {
3576 return (uint32_t)UCOL_NULLORDER;
3577 }
3578
3579 if (source->offsetRepeatValue != 0) {
3580 if (CECount > noChars) {
3581 source->offsetRepeatCount += temp.offsetRepeatCount;
3582 } else {
3583 // **** does this really skip the right offsets? ****
3584 source->offsetReturn -= (noChars - CECount);
3585 }
3586 }
3587
3588 if (offsetBias >= 0) {
3589 source->offsetReturn = source->offsetStore - 1;
3590 if (source->offsetReturn == source->offsetBuffer) {
3591 source->offsetStore = source->offsetBuffer;
3592 }
3593 }
3594
3595 source->toReturn = source->CEpos - 1;
3596 if (source->toReturn == source->CEs) {
3597 source->CEpos = source->CEs;
3598 }
3599
3600 return *(source->toReturn);
3601 }
3602 case LONG_PRIMARY_TAG:
3603 {
3604 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3605 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3606 source->toReturn = source->CEpos - 1;
3607
3608 if (source->flags & UCOL_ITER_INNORMBUF) {
3609 source->offsetRepeatCount = 1;
3610 } else {
3611 int32_t firstOffset = (int32_t)(source->pos - source->string );
3612
3613 source->appendOffset(firstOffset, *status);
3614 source->appendOffset(firstOffset + 1, *status);
3615
3616 source->offsetReturn = source->offsetStore - 1;
3617 *(source->offsetBuffer) = firstOffset;
3618 if (source->offsetReturn == source->offsetBuffer) {
3619 source->offsetStore = source->offsetBuffer;
3620 }
3621 }
3622
3623
3624 return *(source->toReturn);
3625 }
3626
3627 case EXPANSION_TAG: /* this tag always returns */
3628 {
3629 /*
3630 This should handle expansion.
3631 NOTE: we can encounter both continuations and expansions in an expan sion!
3632 I have to decide where continuations are going to be dealt with
3633 */
3634 int32_t firstOffset = (int32_t)(source->pos - source->string);
3635
3636 // **** doesn't work if using iterator ****
3637 if (source->offsetReturn != NULL) {
3638 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet urn == source->offsetBuffer) {
3639 source->offsetStore = source->offsetBuffer;
3640 }else {
3641 firstOffset = -1;
3642 }
3643 }
3644
3645 /* find the offset to expansion table */
3646 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3647 size = getExpansionCount(CE);
3648 if (size != 0) {
3649 /*
3650 if there are less than 16 elements in expansion, we don't termin ate
3651 */
3652 uint32_t count;
3653
3654 for (count = 0; count < size; count++) {
3655 *(source->CEpos ++) = *CEOffset++;
3656
3657 if (firstOffset >= 0) {
3658 source->appendOffset(firstOffset + 1, *status);
3659 }
3660 }
3661 } else {
3662 /* else, we do */
3663 while (*CEOffset != 0) {
3664 *(source->CEpos ++) = *CEOffset ++;
3665
3666 if (firstOffset >= 0) {
3667 source->appendOffset(firstOffset + 1, *status);
3668 }
3669 }
3670 }
3671
3672 if (firstOffset >= 0) {
3673 source->offsetReturn = source->offsetStore - 1;
3674 *(source->offsetBuffer) = firstOffset;
3675 if (source->offsetReturn == source->offsetBuffer) {
3676 source->offsetStore = source->offsetBuffer;
3677 }
3678 } else {
3679 source->offsetRepeatCount += size - 1;
3680 }
3681
3682 source->toReturn = source->CEpos - 1;
3683 // in case of one element expansion, we
3684 // want to immediately return CEpos
3685 if(source->toReturn == source->CEs) {
3686 source->CEpos = source->CEs;
3687 }
3688
3689 return *(source->toReturn);
3690 }
3691
3692 case DIGIT_TAG:
3693 {
3694 /*
3695 We do a check to see if we want to collate digits as numbers; if so we generate
3696 a custom collation key. Otherwise we pull out the value stored i n the expansion table.
3697 */
3698 uint32_t i; /* general counter */
3699
3700 if (source->coll->numericCollation == UCOL_ON){
3701 uint32_t digIndx = 0;
3702 uint32_t endIndex = 0;
3703 uint32_t leadingZeroIndex = 0;
3704 uint32_t trailingZeroCount = 0;
3705
3706 uint8_t collateVal = 0;
3707
3708 UBool nonZeroValReached = FALSE;
3709
3710 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j ust need a temporary place to store my generated CEs.
3711 /*
3712 We parse the source string until we hit a char that's NOT a digit.
3713 Use this u_charDigitValue. This might be slow because we hav e to
3714 handle surrogates...
3715 */
3716 /*
3717 We need to break up the digit string into collection element s of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3718 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3719 element we process when going backward. To determine how lon g that chunk might be, we may need to make
3720 two passes through the loop that collects digits - one to se e how long the string is (and how much is
3721 leading zeros) to determine the length of that right-hand ch unk, and a second (if the whole string has
3722 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits ) to actually process that collation
3723 element chunk after resetting the state to the initialState at the right side of the digit string.
3724 */
3725 uint32_t ceLimit = 0;
3726 UChar initial_ch = ch;
3727 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3728 backupState(source, &initialState);
3729
3730 for(;;) {
3731 collIterateState state = {0,0,0,0,0,0,0,0,0};
3732 UChar32 char32 = 0;
3733 int32_t digVal = 0;
3734
3735 if (U16_IS_TRAIL (ch)) {
3736 if (!collIter_bos(source)){
3737 UChar lead = getPrevNormalizedChar(source, statu s);
3738 if(U16_IS_LEAD(lead)) {
3739 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3740 goBackOne(source);
3741 } else {
3742 char32 = ch;
3743 }
3744 } else {
3745 char32 = ch;
3746 }
3747 } else {
3748 char32 = ch;
3749 }
3750 digVal = u_charDigitValue(char32);
3751
3752 for(;;) {
3753 // Make sure we have enough space. No longer needed;
3754 // at this point the largest value of digIndx when w e need to save data in numTempBuf
3755 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post- incremented) so we just ensure
3756 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO R_NUMBER/2 + 2).
3757
3758 // Skip over trailing zeroes, and keep a count of th em.
3759 if (digVal != 0)
3760 nonZeroValReached = TRUE;
3761
3762 if (nonZeroValReached) {
3763 /*
3764 We parse the digit string into base 100 numbers (this fits into a byte).
3765 We only add to the buffer in twos, thus if we ar e parsing an odd character,
3766 that serves as the 'tens' digit while the if we are parsing an even one, that
3767 is the 'ones' digit. We dumped the parsed base 1 00 value (collateVal) into
3768 a buffer. We multiply each collateVal by 2 (to g ive us room) and add 5 (to avoid
3769 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3770 than all the other bytes.
3771
3772 Since we're doing in this reverse we want to put the first digit encountered into the
3773 ones place and the second digit encountered into the tens place.
3774 */
3775
3776 if ((digIndx + trailingZeroCount) % 2 == 1) {
3777 // High-order digit case (tens place)
3778 collateVal += (uint8_t)(digVal * 10);
3779
3780 // We cannot set leadingZeroIndex unless it has been set for the
3781 // low-order digit. Therefore, all we can do for the high-order
3782 // digit is turn it off, never on.
3783 // The only time we will have a high digit w ithout a low is for
3784 // the very first non-zero digit, so no zero check is necessary.
3785 if (collateVal != 0)
3786 leadingZeroIndex = 0;
3787
3788 // The first pass through, digIndx may excee d the limit, but in that case
3789 // we no longer care about numTempBuf conten ts since they will be discarded
3790 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3791 numTempBuf[(digIndx/2) + 2] = collateVal *2 + 6;
3792 }
3793 collateVal = 0;
3794 } else {
3795 // Low-order digit case (ones place)
3796 collateVal = (uint8_t)digVal;
3797
3798 // Check for leading zeroes.
3799 if (collateVal == 0) {
3800 if (!leadingZeroIndex)
3801 leadingZeroIndex = (digIndx/2) + 2;
3802 } else
3803 leadingZeroIndex = 0;
3804
3805 // No need to write to buffer; the case of a last odd digit
3806 // is handled below.
3807 }
3808 ++digIndx;
3809 } else
3810 ++trailingZeroCount;
3811
3812 if (!collIter_bos(source)) {
3813 ch = getPrevNormalizedChar(source, status);
3814 //goBackOne(source);
3815 if (U16_IS_TRAIL(ch)) {
3816 backupState(source, &state);
3817 if (!collIter_bos(source)) {
3818 goBackOne(source);
3819 UChar lead = getPrevNormalizedChar(sourc e, status);
3820
3821 if(U16_IS_LEAD(lead)) {
3822 char32 = U16_GET_SUPPLEMENTARY(lead, ch);
3823 } else {
3824 loadState(source, &state, FALSE);
3825 char32 = ch;
3826 }
3827 }
3828 } else
3829 char32 = ch;
3830
3831 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3832 if (char32 > 0xFFFF) {// For surrogates.
3833 loadState(source, &state, FALSE);
3834 }
3835 // Don't need to "reverse" the goBackOne cal l,
3836 // as this points to the next position to pr ocess..
3837 //if (char32 > 0xFFFF) // For surrogates.
3838 //getNextNormalizedChar(source);
3839 break;
3840 }
3841
3842 goBackOne(source);
3843 }else
3844 break;
3845 }
3846
3847 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N UMBER) {
3848 // our collation element is not too big, go ahead an d finish with it
3849 break;
3850 }
3851 // our digit string is too long for a collation element;
3852 // set the limit for it, reset the state and begin again
3853 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT S_FOR_NUMBER;
3854 if ( ceLimit == 0 ) {
3855 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3856 }
3857 ch = initial_ch;
3858 loadState(source, &initialState, FALSE);
3859 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun t = 0;
3860 collateVal = 0;
3861 nonZeroValReached = FALSE;
3862 }
3863
3864 if (! nonZeroValReached) {
3865 digIndx = 2;
3866 trailingZeroCount = 0;
3867 numTempBuf[2] = 6;
3868 }
3869
3870 if ((digIndx + trailingZeroCount) % 2 != 0) {
3871 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3872 digIndx += 1; // The implicit leading zero
3873 }
3874 if (trailingZeroCount % 2 != 0) {
3875 // We had to consume one trailing zero for the low digit
3876 // of the least significant byte
3877 digIndx += 1; // The trailing zero not in the expo nent
3878 trailingZeroCount -= 1;
3879 }
3880
3881 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2 ) + 2) ;
3882
3883 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3884 numTempBuf[2] -= 1;
3885
3886 /*
3887 We want to skip over the first two slots in the buffer. The first slot
3888 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3889 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3890 The exponent must be adjusted by the number of leading zeroe s, and the number of
3891 trailing zeroes.
3892 */
3893 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3894 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3895 if (leadingZeroIndex)
3896 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3897 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3898
3899 // Now transfer the collation key to our collIterate struct.
3900 // The total size for our collation key is half of endIndex, rounded up.
3901 int32_t size = (endIndex+1)/2;
3902 if(!ensureCEsCapacity(source, size)) {
3903 return (uint32_t)UCOL_NULLORDER;
3904 }
3905 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3906 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco ndary weight
3907 UCOL_BYTE_COMMON; // Tertiary weight.
3908 i = endIndex - 1; // Reset the index into the buffer.
3909 while(i >= 2) {
3910 uint32_t primWeight = numTempBuf[i--] << 8;
3911 if ( i >= 2)
3912 primWeight |= numTempBuf[i--];
3913 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI FT) | UCOL_CONTINUATION_MARKER;
3914 }
3915
3916 source->toReturn = source->CEpos -1;
3917 return *(source->toReturn);
3918 } else {
3919 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3920 CE = *(CEOffset++);
3921 break;
3922 }
3923 }
3924
3925 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3926 {
3927 static const uint32_t
3928 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11 A7;
3929 //const uint32_t LCount = 19;
3930 static const uint32_t VCount = 21;
3931 static const uint32_t TCount = 28;
3932 //const uint32_t NCount = VCount * TCount; /* 588 */
3933 //const uint32_t SCount = LCount * NCount; /* 11172 */
3934
3935 uint32_t L = ch - SBase;
3936 /*
3937 divide into pieces.
3938 we do it in this order since some compilers can do % and / in on e
3939 operation
3940 */
3941 uint32_t T = L % TCount;
3942 L /= TCount;
3943 uint32_t V = L % VCount;
3944 L /= VCount;
3945
3946 /* offset them */
3947 L += LBase;
3948 V += VBase;
3949 T += TBase;
3950
3951 int32_t firstOffset = (int32_t)(source->pos - source->string);
3952 source->appendOffset(firstOffset, *status);
3953
3954 /*
3955 * return the first CE, but first put the rest into the expansio n buffer
3956 */
3957 if (!source->coll->image->jamoSpecial) {
3958 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L );
3959 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V );
3960 source->appendOffset(firstOffset + 1, *status);
3961
3962 if (T != TBase) {
3963 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin g, T);
3964 source->appendOffset(firstOffset + 1, *status);
3965 }
3966
3967 source->toReturn = source->CEpos - 1;
3968
3969 source->offsetReturn = source->offsetStore - 1;
3970 if (source->offsetReturn == source->offsetBuffer) {
3971 source->offsetStore = source->offsetBuffer;
3972 }
3973
3974 return *(source->toReturn);
3975 } else {
3976 // Since Hanguls pass the FCD check, it is
3977 // guaranteed that we won't be in
3978 // the normalization buffer if something like this happens
3979
3980 // Move Jamos into normalization buffer
3981 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3982 int32_t tempbufferLength, jamoOffset;
3983 tempbuffer[0] = 0;
3984 tempbuffer[1] = (UChar)L;
3985 tempbuffer[2] = (UChar)V;
3986 if (T != TBase) {
3987 tempbuffer[3] = (UChar)T;
3988 tempbufferLength = 4;
3989 } else {
3990 tempbufferLength = 3;
3991 }
3992 source->writableBuffer.releaseBuffer(tempbufferLength);
3993
3994 // Indicate where to continue in main input string after exh austing the writableBuffer
3995 if (source->pos == source->string) {
3996 jamoOffset = 0;
3997 source->fcdPosition = NULL;
3998 } else {
3999 jamoOffset = source->pos - source->string;
4000 source->fcdPosition = source->pos-1;
4001 }
4002
4003 // Append offsets for the additional chars
4004 // (not the 0, and not the L whose offsets match the origina l Hangul)
4005 int32_t jamoRemaining = tempbufferLength - 2;
4006 jamoOffset++; // appended offsets should match end of origin al Hangul
4007 while (jamoRemaining-- > 0) {
4008 source->appendOffset(jamoOffset, *status);
4009 }
4010
4011 source->offsetRepeatValue = jamoOffset;
4012
4013 source->offsetReturn = source->offsetStore - 1;
4014 if (source->offsetReturn == source->offsetBuffer) {
4015 source->offsetStore = source->offsetBuffer;
4016 }
4017
4018 source->pos = source->writableBuffer.getTermin atedBuffer() + tempbufferLength;
4019 source->origFlags = source->flags;
4020 source->flags |= UCOL_ITER_INNORMBUF;
4021 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HAS LEN);
4022
4023 return(UCOL_IGNORABLE);
4024 }
4025 }
4026
4027 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4028 return getPrevImplicit(ch, source);
4029
4030 // TODO: Remove CJK implicits as they are handled by the getImplicit Primary function
4031 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */
4032 return getPrevImplicit(ch, source);
4033
4034 case SURROGATE_TAG: /* This is a surrogate pair */
4035 /* essentially an engaged lead surrogate. */
4036 /* if you have encountered it here, it means that a */
4037 /* broken sequence was encountered and this is an error */
4038 return UCOL_NOT_FOUND;
4039
4040 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4041 return UCOL_NOT_FOUND; /* broken surrogate sequence */
4042
4043 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4044 {
4045 UChar32 cp = 0;
4046 UChar prevChar;
4047 const UChar *prev;
4048 if (isAtStartPrevIterate(source)) {
4049 /* we are at the start of the string, wrong place to be at * /
4050 return UCOL_NOT_FOUND;
4051 }
4052 if (source->pos != source->writableBuffer.getBuffer()) {
4053 prev = source->pos - 1;
4054 } else {
4055 prev = source->fcdPosition;
4056 }
4057 prevChar = *prev;
4058
4059 /* Handles Han and Supplementary characters here.*/
4060 if (U16_IS_LEAD(prevChar)) {
4061 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<< 10UL)+0xdc00-0x10000));
4062 source->pos = prev;
4063 } else {
4064 return UCOL_NOT_FOUND; /* like unassigned */
4065 }
4066
4067 return getPrevImplicit(cp, source);
4068 }
4069
4070 /* UCA is filled with these. Tailorings are NOT_FOUND */
4071 /* not yet implemented */
4072 case CHARSET_TAG: /* this tag always returns */
4073 /* probably after 1.8 */
4074 return UCOL_NOT_FOUND;
4075
4076 default: /* this tag always returns */
4077 *status = U_INTERNAL_PROGRAM_ERROR;
4078 CE=0;
4079 break;
4080 }
4081
4082 if (CE <= UCOL_NOT_FOUND) {
4083 break;
4084 }
4085 }
4086
4087 return CE;
4088 }
4089
4090 /* This should really be a macro */
4091 /* This function is used to reverse parts of a buffer. We need this operation wh en doing continuation */
4092 /* secondaries in French */
4093 /*
4094 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4095 uint8_t temp;
4096 while(start<end) {
4097 temp = *start;
4098 *start++ = *end;
4099 *end-- = temp;
4100 }
4101 }
4102 */
4103
/*
 * Reverses in place the elements from *(start) through *(end), both inclusive.
 * TYPE is the element type used for the swap temporary.
 * The macro expands to a brace-enclosed block and moves both arguments:
 * (start) is advanced and (end) is retreated until they meet or cross, so
 * both must be modifiable pointer lvalues.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    for (; (start) < (end); ++(start), --(end)) { \
        TYPE swapTmp = *(start); \
        *(start) = *(end); \
        *(end) = swapTmp; \
    } \
}
4112
4113 /****************************************************************************/
4114 /* Following are the sortkey generation functions */
4115 /* */
4116 /****************************************************************************/
4117
4118 U_CAPI int32_t U_EXPORT2 113 U_CAPI int32_t U_EXPORT2
4119 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 114 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4120 const uint8_t *src2, int32_t src2Length, 115 const uint8_t *src2, int32_t src2Length,
4121 uint8_t *dest, int32_t destCapacity) { 116 uint8_t *dest, int32_t destCapacity) {
4122 /* check arguments */ 117 /* check arguments */
4123 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr c1Length-1]!=0) || 118 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr c1Length-1]!=0) ||
4124 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr c2Length-1]!=0) || 119 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr c2Length-1]!=0) ||
4125 destCapacity<0 || (destCapacity>0 && dest==NULL) 120 destCapacity<0 || (destCapacity>0 && dest==NULL)
4126 ) { 121 ) {
4127 /* error, attempt to write a zero byte and return 0 */ 122 /* error, attempt to write a zero byte and return 0 */
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
4183 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 178 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4184 src2=src1; 179 src2=src1;
4185 } 180 }
4186 /* append src2, "the other, unfinished sort key" */ 181 /* append src2, "the other, unfinished sort key" */
4187 while((*p++=*src2++)!=0) {} 182 while((*p++=*src2++)!=0) {}
4188 183
4189 /* the actual length might be less than destLength if either sort key contai ned illegally embedded zero bytes */ 184 /* the actual length might be less than destLength if either sort key contai ned illegally embedded zero bytes */
4190 return (int32_t)(p-dest); 185 return (int32_t)(p-dest);
4191 } 186 }
4192 187
4193 U_NAMESPACE_BEGIN
4194
4195 class SortKeyByteSink : public ByteSink {
4196 public:
4197 SortKeyByteSink(char *dest, int32_t destCapacity)
4198 : buffer_(dest), capacity_(destCapacity),
4199 appended_(0) {
4200 if (buffer_ == NULL) {
4201 capacity_ = 0;
4202 } else if(capacity_ < 0) {
4203 buffer_ = NULL;
4204 capacity_ = 0;
4205 }
4206 }
4207 virtual ~SortKeyByteSink();
4208
4209 virtual void Append(const char *bytes, int32_t n);
4210 void Append(uint32_t b) {
4211 if (appended_ < capacity_ || Resize(1, appended_)) {
4212 buffer_[appended_] = (char)b;
4213 }
4214 ++appended_;
4215 }
4216 void Append(uint32_t b1, uint32_t b2) {
4217 int32_t a2 = appended_ + 2;
4218 if (a2 <= capacity_ || Resize(2, appended_)) {
4219 buffer_[appended_] = (char)b1;
4220 buffer_[appended_ + 1] = (char)b2;
4221 } else if(appended_ < capacity_) {
4222 buffer_[appended_] = (char)b1;
4223 }
4224 appended_ = a2;
4225 }
4226 virtual char *GetAppendBuffer(int32_t min_capacity,
4227 int32_t desired_capacity_hint,
4228 char *scratch, int32_t scratch_capacity,
4229 int32_t *result_capacity);
4230 int32_t NumberOfBytesAppended() const { return appended_; }
4231 /** @return FALSE if memory allocation failed */
4232 UBool IsOk() const { return buffer_ != NULL; }
4233
4234 protected:
4235 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th) = 0;
4236 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4237
4238 void SetNotOk() {
4239 buffer_ = NULL;
4240 capacity_ = 0;
4241 }
4242
4243 char *buffer_;
4244 int32_t capacity_;
4245 int32_t appended_;
4246
4247 private:
4248 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemente d
4249 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4250 };
4251
4252 SortKeyByteSink::~SortKeyByteSink() {}
4253
4254 void
4255 SortKeyByteSink::Append(const char *bytes, int32_t n) {
4256 if (n <= 0 || bytes == NULL) {
4257 return;
4258 }
4259 int32_t length = appended_;
4260 appended_ += n;
4261 if ((buffer_ + length) == bytes) {
4262 return; // the caller used GetAppendBuffer() and wrote the bytes alread y
4263 }
4264 int32_t available = capacity_ - length;
4265 if (n <= available) {
4266 uprv_memcpy(buffer_ + length, bytes, n);
4267 } else {
4268 AppendBeyondCapacity(bytes, n, length);
4269 }
4270 }
4271
4272 char *
4273 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4274 int32_t desired_capacity_hint,
4275 char *scratch,
4276 int32_t scratch_capacity,
4277 int32_t *result_capacity) {
4278 if (min_capacity < 1 || scratch_capacity < min_capacity) {
4279 *result_capacity = 0;
4280 return NULL;
4281 }
4282 int32_t available = capacity_ - appended_;
4283 if (available >= min_capacity) {
4284 *result_capacity = available;
4285 return buffer_ + appended_;
4286 } else if (Resize(desired_capacity_hint, appended_)) {
4287 *result_capacity = capacity_ - appended_;
4288 return buffer_ + appended_;
4289 } else {
4290 *result_capacity = scratch_capacity;
4291 return scratch;
4292 }
4293 }
4294
4295 class FixedSortKeyByteSink : public SortKeyByteSink {
4296 public:
4297 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4298 : SortKeyByteSink(dest, destCapacity) {}
4299 virtual ~FixedSortKeyByteSink();
4300
4301 private:
4302 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);
4303 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4304 };
4305
4306 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4307
4308 void
4309 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int 32_t length) {
4310 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4311 // Fill the buffer completely.
4312 int32_t available = capacity_ - length;
4313 if (available > 0) {
4314 uprv_memcpy(buffer_ + length, bytes, available);
4315 }
4316 }
4317
4318 UBool
4319 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4320 return FALSE;
4321 }
4322
4323 class CollationKeyByteSink : public SortKeyByteSink {
4324 public:
4325 CollationKeyByteSink(CollationKey &key)
4326 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getC apacity()),
4327 key_(key) {}
4328 virtual ~CollationKeyByteSink();
4329
4330 private:
4331 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);
4332 virtual UBool Resize(int32_t appendCapacity, int32_t length);
4333
4334 CollationKey &key_;
4335 };
4336
4337 CollationKeyByteSink::~CollationKeyByteSink() {}
4338
4339 void
4340 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4341 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4342 if (Resize(n, length)) {
4343 uprv_memcpy(buffer_ + length, bytes, n);
4344 }
4345 }
4346
4347 UBool
4348 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4349 if (buffer_ == NULL) {
4350 return FALSE; // allocation failed before already
4351 }
4352 int32_t newCapacity = 2 * capacity_;
4353 int32_t altCapacity = length + 2 * appendCapacity;
4354 if (newCapacity < altCapacity) {
4355 newCapacity = altCapacity;
4356 }
4357 if (newCapacity < 200) {
4358 newCapacity = 200;
4359 }
4360 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4361 if (newBuffer == NULL) {
4362 SetNotOk();
4363 return FALSE;
4364 }
4365 buffer_ = reinterpret_cast<char *>(newBuffer);
4366 capacity_ = newCapacity;
4367 return TRUE;
4368 }
4369
4370 /**
4371 * uint8_t byte buffer, similar to CharString but simpler.
4372 */
4373 class SortKeyLevel : public UMemory {
4374 public:
4375 SortKeyLevel() : len(0), ok(TRUE) {}
4376 ~SortKeyLevel() {}
4377
4378 /** @return FALSE if memory allocation failed */
4379 UBool isOk() const { return ok; }
4380 UBool isEmpty() const { return len == 0; }
4381 int32_t length() const { return len; }
4382 const uint8_t *data() const { return buffer.getAlias(); }
4383 uint8_t operator[](int32_t index) const { return buffer[index]; }
4384
4385 void appendByte(uint32_t b);
4386
4387 void appendTo(ByteSink &sink) const {
4388 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4389 }
4390
4391 uint8_t &lastByte() {
4392 U_ASSERT(len > 0);
4393 return buffer[len - 1];
4394 }
4395
4396 uint8_t *getLastFewBytes(int32_t n) {
4397 if (ok && len >= n) {
4398 return buffer.getAlias() + len - n;
4399 } else {
4400 return NULL;
4401 }
4402 }
4403
4404 private:
4405 MaybeStackArray<uint8_t, 40> buffer;
4406 int32_t len;
4407 UBool ok;
4408
4409 UBool ensureCapacity(int32_t appendCapacity);
4410
4411 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4412 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of thi s class
4413 };
4414
4415 void SortKeyLevel::appendByte(uint32_t b) {
4416 if(len < buffer.getCapacity() || ensureCapacity(1)) {
4417 buffer[len++] = (uint8_t)b;
4418 }
4419 }
4420
4421 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4422 if(!ok) {
4423 return FALSE;
4424 }
4425 int32_t newCapacity = 2 * buffer.getCapacity();
4426 int32_t altCapacity = len + 2 * appendCapacity;
4427 if (newCapacity < altCapacity) {
4428 newCapacity = altCapacity;
4429 }
4430 if (newCapacity < 200) {
4431 newCapacity = 200;
4432 }
4433 if(buffer.resize(newCapacity, len)==NULL) {
4434 return ok = FALSE;
4435 }
4436 return TRUE;
4437 }
4438
4439 U_NAMESPACE_END
4440
4441 /* sortkey API */
/*
 * ucol_getSortKey -- shown here as a side-by-side diff (old column numbered
 * 4442-4487, new column numbered 188-207).
 *
 * Old column: if coll->delegate is set, the call is forwarded to that
 * Collator's getSortKey(). Otherwise, when source is not NULL, the key is
 * generated through coll->sortKeyGen into a FixedSortKeyByteSink over
 * result/resultLength; a NULL result is replaced by a one-byte dummy buffer
 * with capacity 0 so that pure preflighting is not mistaken for an
 * allocation error. keySize is the number of bytes appended on success and
 * stays 0 otherwise.
 *
 * New column: keySize comes directly from
 * Collator::fromUCollator(coll)->getSortKey(...), and the trace exit reports
 * keySize (UTRACE_EXIT_VALUE) instead of a status.
 */
4442 U_CAPI int32_t U_EXPORT2 188 U_CAPI int32_t U_EXPORT2
4443 ucol_getSortKey(const UCollator *coll, 189 ucol_getSortKey(const UCollator *coll,
4444 const UChar *source, 190 const UChar *source,
4445 int32_t sourceLength, 191 int32_t sourceLength,
4446 uint8_t *result, 192 uint8_t *result,
4447 int32_t resultLength) 193 int32_t resultLength)
4448 { 194 {
4449 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 195 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4450 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 196 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4451 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour ce, 197 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour ce,
4452 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt h)); 198 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt h));
4453 } 199 }
4454 200
4455 if(coll->delegate != NULL) { 201 int32_t keySize = Collator::fromUCollator(coll)->
4456 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength); 202 getSortKey(source, sourceLength, result, resultLength);
4457 } 203
4458
4459 UErrorCode status = U_ZERO_ERROR;
4460 int32_t keySize = 0;
4461
4462 if(source != NULL) {
4463 // source == NULL is actually an error situation, but we would need to
4464 // have an error code to return it. Until we introduce a new
4465 // API, it stays like this
4466
4467 /* this uses the function pointer that is set in updateinternalstate */
4468 /* currently, there are two funcs: */
4469 /*ucol_calcSortKey(...);*/
4470 /*ucol_calcSortKeySimpleTertiary(...);*/
4471
4472 uint8_t noDest[1] = { 0 };
4473 if(result == NULL) {
4474 // Distinguish pure preflighting from an allocation error.
4475 result = noDest;
4476 resultLength = 0;
4477 }
4478 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength );
4479 coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4480 if(U_SUCCESS(status)) {
4481 keySize = sink.NumberOfBytesAppended();
4482 }
4483 }
4484 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 204 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4485 UTRACE_EXIT_STATUS(status); 205 UTRACE_EXIT_VALUE(keySize);
4486 return keySize; 206 return keySize;
4487 } 207 }
4488 208
4489 U_CFUNC int32_t
4490 ucol_getCollationKey(const UCollator *coll,
4491 const UChar *source, int32_t sourceLength,
4492 CollationKey &key,
4493 UErrorCode &errorCode) {
4494 CollationKeyByteSink sink(key);
4495 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4496 return sink.NumberOfBytesAppended();
4497 }
4498
4499 // Is this primary weight compressible?
4500 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4501 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4502 static inline UBool
4503 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4504 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegul arPrimary;
4505 }
4506
4507 static
4508 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4509 if (caseShift == 0) {
4510 cases.appendByte(UCOL_CASE_BYTE_START);
4511 caseShift = UCOL_CASE_SHIFT_START;
4512 }
4513 }
4514
4515 // Packs the secondary buffer when processing French locale.
4516 static void
4517 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4518 secondaries += secsize; // We read the secondary-level bytes back to front.
4519 uint8_t secondary;
4520 int32_t count2 = 0;
4521 int32_t i = 0;
4522 // we use i here since the key size already accounts for terminators, so we' ll discard the increment
4523 for(i = 0; i<secsize; i++) {
4524 secondary = *(secondaries-i-1);
4525 /* This is compression code. */
4526 if (secondary == UCOL_COMMON2) {
4527 ++count2;
4528 } else {
4529 if (count2 > 0) {
4530 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4531 while (count2 > UCOL_TOP_COUNT2) {
4532 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4533 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4534 }
4535 result.Append(UCOL_COMMON_TOP2 - (count2-1));
4536 } else {
4537 while (count2 > UCOL_BOT_COUNT2) {
4538 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4539 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4540 }
4541 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4542 }
4543 count2 = 0;
4544 }
4545 result.Append(secondary);
4546 }
4547 }
4548 if (count2 > 0) {
4549 while (count2 > UCOL_BOT_COUNT2) {
4550 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4551 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4552 }
4553 result.Append(UCOL_COMMON_BOT2 + (count2-1));
4554 }
4555 }
4556
/* NOTE(review): no use of this macro is visible in the sort key functions that
 * follow it - confirm it is still referenced elsewhere before relying on it. */
#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4558
/* This is the sortkey work horse function */
/**
 * Builds a complete sort key for source and writes it to result.
 *
 * The function iterates over the collation elements (CEs) of the string.
 * Primary bytes are appended to result directly; secondary, case, tertiary and
 * quaternary bytes are collected in separate SortKeyLevel buffers and appended
 * after the loop, each level preceded by UCOL_LEVELTERMINATOR. For identical
 * strength the output of u_writeIdenticalLevelRun() follows, and the key is
 * closed with a 0 byte.
 *
 * @param coll         collator whose attributes (strength, caseLevel,
 *                     frenchCollation, alternateHandling, hiraganaQ,
 *                     normalizationMode, tertiary settings, variableTopValue,
 *                     leadBytePermutationTable) steer the key generation
 * @param source       text to build the key for
 * @param sourceLength number of UChars in source, or -1 to use u_strlen(source)
 * @param result       sink that receives the sort key bytes
 * @param status       in/out error code. Nothing is done if it already indicates
 *                     a failure. Set to U_MEMORY_ALLOCATION_ERROR if one of the
 *                     level buffers or result reports !isOk() while status
 *                     still indicated success.
 */
U_CFUNC void U_CALLCONV
ucol_calcSortKey(const    UCollator    *coll,
        const    UChar        *source,
        int32_t        sourceLength,
        SortKeyByteSink &result,
        UErrorCode *status)
{
    if(U_FAILURE(*status)) {
        return;
    }

    // Primary bytes are not buffered: "primaries" is just another name for the result sink.
    SortKeyByteSink &primaries = result;
    SortKeyLevel secondaries;
    SortKeyLevel tertiaries;
    SortKeyLevel cases;
    SortKeyLevel quads;

    UnicodeString normSource;

    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

    UColAttributeValue strength = coll->strength;

    // compareXxx is 0 when the level is part of the key and 0xFF otherwise.
    // The values are used as thresholds ("weight > compareXxx") below, so 0xFF
    // switches a level off because no byte is greater than 0xFF.
    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
    UBool  compareIdent = (strength == UCOL_IDENTICAL);
    UBool  doCase = (coll->caseLevel == UCOL_ON);
    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
    //UBool  qShifted = shifted && (compareQuad == 0);
    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);

    uint32_t variableTopValue = coll->variableTopValue;
    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
    // Note: despite the upper-case names these three are local variables, not macros.
    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
    uint8_t UCOL_HIRAGANA_QUAD = 0;
    if(doHiragana) {
        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
        /* allocate one more space for hiragana, value for hiragana */
    }
    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);

    /* support for special features like caselevel and funky secondaries */
    // lastSecondaryLength: number of secondary bytes appended for the current
    // CE and its continuations (French secondaries only).
    int32_t lastSecondaryLength = 0;
    // caseShift: bookkeeping for the free bit positions in the last case-level byte (see doCaseShift).
    uint32_t caseShift = 0;

    /* If we need to normalize, we'll do it all at once at the beginning! */
    // Identical strength uses the NFD instance, otherwise the FCD instance is
    // used when normalization is on.
    const Normalizer2 *norm2;
    if(compareIdent) {
        norm2 = Normalizer2Factory::getNFDInstance(*status);
    } else if(coll->normalizationMode != UCOL_OFF) {
        norm2 = Normalizer2Factory::getFCDInstance(*status);
    } else {
        norm2 = NULL;
    }
    if(norm2 != NULL) {
        normSource.setTo(FALSE, source, len);
        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
        if(qcYesLength != len) {
            // Only the part after the quick-check prefix is normalized and
            // appended; from here on source/len refer to normSource.
            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
            normSource.truncate(qcYesLength);
            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
            source = normSource.getBuffer();
            len = normSource.length();
        }
    }
    collIterate s;
    IInit_collIterate(coll, source, len, &s, status);
    if(U_FAILURE(*status)) {
        return;
    }
    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.

    uint32_t order = 0;

    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;
    int8_t tertiaryAddition = coll->tertiaryAddition;
    uint8_t tertiaryTop = coll->tertiaryTop;
    uint8_t tertiaryBottom = coll->tertiaryBottom;
    uint8_t tertiaryCommon = coll->tertiaryCommon;
    uint8_t caseBits = 0;

    UBool wasShifted = FALSE;
    UBool notIsContinuation = FALSE;

    // countN: length of the pending run of "common" weights on level N that
    // has not been written to its buffer yet.
    uint32_t count2 = 0, count3 = 0, count4 = 0;
    // leadPrimary: lead byte currently being compressed on the primary level, 0 if none.
    uint8_t leadPrimary = 0;

    for(;;) {
        order = ucol_IGetNextCE(coll, &s, status);
        if(order == UCOL_NO_MORE_CES) {
            break;
        }

        // completely ignorable CE: contributes nothing to any level
        if(order == 0) {
            continue;
        }

        notIsContinuation = !isContinuation(order);

        if(notIsContinuation) {
            tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
        } else {
            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
        }

        // order is shifted in place here: after these statements it holds only
        // the two primary bytes, which is what gets compared with
        // variableTopValue below.
        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary1 = (uint8_t)(order >> 8);

        // Keep the unpermuted lead byte: isCompressible() is asked about the
        // original byte, while the permuted one is written to the key.
        uint8_t originalPrimary1 = primary1;
        if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
            primary1 = coll->leadBytePermutationTable[primary1];
        }

        if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
            || (!notIsContinuation && wasShifted)))
            || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
        {
            /* and other ignorables should be removed if following a shifted code point */
            if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
                /* we should just completely ignore it */
                continue;
            }
            if(compareQuad == 0) {
                // flush the pending run of common quaternaries first
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                    count4 = 0;
                }
                /* We are dealing with a variable and we're treating them as shifted */
                /* This is a shifted ignorable */
                if(primary1 != 0) { /* we need to check this since we could be in continuation */
                    quads.appendByte(primary1);
                }
                if(primary2 != 0) {
                    quads.appendByte(primary2);
                }
            }
            wasShifted = TRUE;
        } else {
            wasShifted = FALSE;
            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
            /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.          */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
                    if(leadPrimary == primary1) {
                        // same lead byte as the previous primary: write only the second byte
                        primaries.Append(primary2);
                    } else {
                        if(leadPrimary != 0) {
                            // close the compressed sequence with a byte that tells
                            // whether the next lead byte is higher or lower
                            primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
                        }
                        if(primary2 == UCOL_IGNORABLE) {
                            /* one byter, not compressed */
                            primaries.Append(primary1);
                            leadPrimary = 0;
                        } else if(isCompressible(coll, originalPrimary1)) {
                            /* compress */
                            primaries.Append(leadPrimary = primary1, primary2);
                        } else {
                            leadPrimary = 0;
                            primaries.Append(primary1, primary2);
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
                    if(primary2 == UCOL_IGNORABLE) {
                        primaries.Append(primary1);
                    } else {
                        primaries.Append(primary1, primary2);
                    }
                }
            }

            if(secondary > compareSec) {
                if(!isFrenchSec) {
                    /* This is compression code. */
                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
                        ++count2;
                    } else {
                        if (count2 > 0) {
                            // The run is encoded from the top or from the bottom
                            // depending on how the byte that ends it compares
                            // with UCOL_COMMON2.
                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
                                while (count2 > UCOL_TOP_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
                                    count2 -= (uint32_t)UCOL_TOP_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
                            } else {
                                while (count2 > UCOL_BOT_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
                            }
                            count2 = 0;
                        }
                        secondaries.appendByte(secondary);
                    }
                } else {
                    /* Do the special handling for French secondaries */
                    /* We need to get continuation elements and do intermediate restore */
                    /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                    // French secondaries are stored uncompressed here; packFrench()
                    // reverses and compresses the whole level after the loop.
                    if(notIsContinuation) {
                        if (lastSecondaryLength > 1) {
                            uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                            if (frenchStartPtr != NULL) {
                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                                uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                            }
                        }
                        lastSecondaryLength = 1;
                    } else {
                        ++lastSecondaryLength;
                    }
                    secondaries.appendByte(secondary);
                }
            }

            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                doCaseShift(cases, caseShift);
                if(notIsContinuation) {
                    // the two top bits of the tertiary byte carry the case information
                    caseBits = (uint8_t)(tertiary & 0xC0);

                    if(tertiary != 0) {
                        // One bit is written when both case bits are 0, two bits
                        // otherwise; the bit values are mirrored between the two
                        // caseFirst branches.
                        if(coll->caseFirst == UCOL_UPPER_FIRST) {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 1 << (--caseShift);
                            } else {
                                cases.lastByte() |= 0 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
                            }
                        } else {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 0 << (--caseShift);
                            } else {
                                cases.lastByte() |= 1 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
                            }
                        }
                    }
                }
            } else {
                if(notIsContinuation) {
                    tertiary ^= caseSwitch;
                }
            }

            tertiary &= tertiaryMask;
            if(tertiary > compareTer) {
                /* This is compression code. */
                /* sequence size check is included in the if clause */
                if (tertiary == tertiaryCommon && notIsContinuation) {
                    ++count3;
                } else {
                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
                        tertiary += tertiaryAddition;
                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
                        tertiary -= tertiaryAddition;
                    }
                    if (count3 > 0) {
                        if ((tertiary > tertiaryCommon)) {
                            while (count3 > coll->tertiaryTopCount) {
                                tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                                count3 -= (uint32_t)coll->tertiaryTopCount;
                            }
                            tertiaries.appendByte(tertiaryTop - (count3-1));
                        } else {
                            while (count3 > coll->tertiaryBottomCount) {
                                tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                                count3 -= (uint32_t)coll->tertiaryBottomCount;
                            }
                            tertiaries.appendByte(tertiaryBottom + (count3-1));
                        }
                        count3 = 0;
                    }
                    tertiaries.appendByte(tertiary);
                }
            }

            if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
                if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
                    if(count4>0) { // Close this part
                        while (count4 > UCOL_BOT_COUNT4) {
                            quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                            count4 -= UCOL_BOT_COUNT4;
                        }
                        quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                        count4 = 0;
                    }
                    quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
                } else { // This wasn't Hiragana, so we can continue adding stuff
                    count4++;
                }
            }
        }
    }

    /* Here, we are generally done with processing */
    /* bailing out would not be too productive */

    // ok collects the isOk() state of the level buffers; it is turned into
    // U_MEMORY_ALLOCATION_ERROR at the very end.
    UBool ok = TRUE;
    if(U_SUCCESS(*status)) {
        /* we have done all the CE's, now let's put them together to form a key */
        if(compareSec == 0) {
            // a run of common secondaries that lasts until the end is encoded from the bottom
            if (count2 > 0) {
                while (count2 > UCOL_BOT_COUNT2) {
                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                }
                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
            }
            result.Append(UCOL_LEVELTERMINATOR);
            if(!secondaries.isOk()) {
                ok = FALSE;
            } else if(!isFrenchSec) {
                secondaries.appendTo(result);
            } else {
                // If there are any unresolved continuation secondaries,
                // reverse them here so that we can reverse the whole secondary thing.
                if (lastSecondaryLength > 1) {
                    uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                    if (frenchStartPtr != NULL) {
                        /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                        uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                    }
                }
                packFrench(secondaries.data(), secondaries.length(), result);
            }
        }

        if(doCase) {
            ok &= cases.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            cases.appendTo(result);
        }

        if(compareTer == 0) {
            if (count3 > 0) {
                // The trailing run of common tertiaries is counted down from
                // tertiaryTop when tertiaryCommon differs from UCOL_COMMON_BOT3,
                // otherwise up from tertiaryBottom. Note that the top branch
                // uses ">=" and "- count3", unlike the in-loop flush above.
                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
                    while (count3 >= coll->tertiaryTopCount) {
                        tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                        count3 -= (uint32_t)coll->tertiaryTopCount;
                    }
                    tertiaries.appendByte(tertiaryTop - count3);
                } else {
                    while (count3 > coll->tertiaryBottomCount) {
                        tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                        count3 -= (uint32_t)coll->tertiaryBottomCount;
                    }
                    tertiaries.appendByte(tertiaryBottom + (count3-1));
                }
            }
            ok &= tertiaries.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            tertiaries.appendTo(result);

            if(compareQuad == 0/*qShifted == TRUE*/) {
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                }
                ok &= quads.isOk();
                result.Append(UCOL_LEVELTERMINATOR);
                quads.appendTo(result);
            }

            if(compareIdent) {
                result.Append(UCOL_LEVELTERMINATOR);
                u_writeIdenticalLevelRun(s.string, len, result);
            }
        }
        // the sort key itself is terminated by a 0 byte
        result.Append(0);
    }

    /* To avoid memory leak, free the offset buffer if necessary. */
    ucol_freeOffsetBuffer(&s);

    ok &= result.IsOk();
    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
}
4962
4963
4964 U_CFUNC void U_CALLCONV
4965 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
4966 const UChar *source,
4967 int32_t sourceLength,
4968 SortKeyByteSink &result,
4969 UErrorCode *status)
4970 {
4971 U_ALIGN_CODE(16);
4972
4973 if(U_FAILURE(*status)) {
4974 return;
4975 }
4976
4977 SortKeyByteSink &primaries = result;
4978 SortKeyLevel secondaries;
4979 SortKeyLevel tertiaries;
4980
4981 UnicodeString normSource;
4982
4983 int32_t len = sourceLength;
4984
4985 /* If we need to normalize, we'll do it all at once at the beginning! */
4986 if(coll->normalizationMode != UCOL_OFF) {
4987 normSource.setTo(len < 0, source, len);
4988 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
4989 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4990 if(qcYesLength != normSource.length()) {
4991 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4992 normSource.truncate(qcYesLength);
4993 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4994 source = normSource.getBuffer();
4995 len = normSource.length();
4996 }
4997 }
4998 collIterate s;
4999 IInit_collIterate(coll, (UChar *)source, len, &s, status);
5000 if(U_FAILURE(*status)) {
5001 return;
5002 }
5003 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma lized.
5004
5005 uint32_t order = 0;
5006
5007 uint8_t primary1 = 0;
5008 uint8_t primary2 = 0;
5009 uint8_t secondary = 0;
5010 uint8_t tertiary = 0;
5011 uint8_t caseSwitch = coll->caseSwitch;
5012 uint8_t tertiaryMask = coll->tertiaryMask;
5013 int8_t tertiaryAddition = coll->tertiaryAddition;
5014 uint8_t tertiaryTop = coll->tertiaryTop;
5015 uint8_t tertiaryBottom = coll->tertiaryBottom;
5016 uint8_t tertiaryCommon = coll->tertiaryCommon;
5017
5018 UBool notIsContinuation = FALSE;
5019
5020 uint32_t count2 = 0, count3 = 0;
5021 uint8_t leadPrimary = 0;
5022
5023 for(;;) {
5024 order = ucol_IGetNextCE(coll, &s, status);
5025
5026 if(order == 0) {
5027 continue;
5028 }
5029
5030 if(order == UCOL_NO_MORE_CES) {
5031 break;
5032 }
5033
5034 notIsContinuation = !isContinuation(order);
5035
5036 if(notIsContinuation) {
5037 tertiary = (uint8_t)((order & tertiaryMask));
5038 } else {
5039 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5040 }
5041
5042 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5043 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5044 primary1 = (uint8_t)(order >> 8);
5045
5046 uint8_t originalPrimary1 = primary1;
5047 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5048 primary1 = coll->leadBytePermutationTable[primary1];
5049 }
5050
5051 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5052 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5053 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5054 /* regular and simple sortkey calc */
5055 if(primary1 != UCOL_IGNORABLE) {
5056 if(notIsContinuation) {
5057 if(leadPrimary == primary1) {
5058 primaries.Append(primary2);
5059 } else {
5060 if(leadPrimary != 0) {
5061 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UN SHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5062 }
5063 if(primary2 == UCOL_IGNORABLE) {
5064 /* one byter, not compressed */
5065 primaries.Append(primary1);
5066 leadPrimary = 0;
5067 } else if(isCompressible(coll, originalPrimary1)) {
5068 /* compress */
5069 primaries.Append(leadPrimary = primary1, primary2);
5070 } else {
5071 leadPrimary = 0;
5072 primaries.Append(primary1, primary2);
5073 }
5074 }
5075 } else { /* we are in continuation, so we're gonna add primary to th e key don't care about compression */
5076 if(primary2 == UCOL_IGNORABLE) {
5077 primaries.Append(primary1);
5078 } else {
5079 primaries.Append(primary1, primary2);
5080 }
5081 }
5082 }
5083
5084 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5085 /* This is compression code. */
5086 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5087 ++count2;
5088 } else {
5089 if (count2 > 0) {
5090 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev el.
5091 while (count2 > UCOL_TOP_COUNT2) {
5092 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_C OUNT2);
5093 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5094 }
5095 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5096 } else {
5097 while (count2 > UCOL_BOT_COUNT2) {
5098 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_C OUNT2);
5099 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5100 }
5101 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5102 }
5103 count2 = 0;
5104 }
5105 secondaries.appendByte(secondary);
5106 }
5107 }
5108
5109 if(notIsContinuation) {
5110 tertiary ^= caseSwitch;
5111 }
5112
5113 if(tertiary > 0) {
5114 /* This is compression code. */
5115 /* sequence size check is included in the if clause */
5116 if (tertiary == tertiaryCommon && notIsContinuation) {
5117 ++count3;
5118 } else {
5119 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_N ORMAL) {
5120 tertiary += tertiaryAddition;
5121 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_ COMMON3_UPPERFIRST) {
5122 tertiary -= tertiaryAddition;
5123 }
5124 if (count3 > 0) {
5125 if ((tertiary > tertiaryCommon)) {
5126 while (count3 > coll->tertiaryTopCount) {
5127 tertiaries.appendByte(tertiaryTop - coll->tertiaryTo pCount);
5128 count3 -= (uint32_t)coll->tertiaryTopCount;
5129 }
5130 tertiaries.appendByte(tertiaryTop - (count3-1));
5131 } else {
5132 while (count3 > coll->tertiaryBottomCount) {
5133 tertiaries.appendByte(tertiaryBottom + coll->tertiar yBottomCount);
5134 count3 -= (uint32_t)coll->tertiaryBottomCount;
5135 }
5136 tertiaries.appendByte(tertiaryBottom + (count3-1));
5137 }
5138 count3 = 0;
5139 }
5140 tertiaries.appendByte(tertiary);
5141 }
5142 }
5143 }
5144
5145 UBool ok = TRUE;
5146 if(U_SUCCESS(*status)) {
5147 /* we have done all the CE's, now let's put them together to form a key */
5148 if (count2 > 0) {
5149 while (count2 > UCOL_BOT_COUNT2) {
5150 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5151 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5152 }
5153 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5154 }
5155 ok &= secondaries.isOk();
5156 result.Append(UCOL_LEVELTERMINATOR);
5157 secondaries.appendTo(result);
5158
5159 if (count3 > 0) {
5160 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5161 while (count3 >= coll->tertiaryTopCount) {
5162 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5163 count3 -= (uint32_t)coll->tertiaryTopCount;
5164 }
5165 tertiaries.appendByte(tertiaryTop - count3);
5166 } else {
5167 while (count3 > coll->tertiaryBottomCount) {
5168 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomC ount);
5169 count3 -= (uint32_t)coll->tertiaryBottomCount;
5170 }
5171 tertiaries.appendByte(tertiaryBottom + (count3-1));
5172 }
5173 }
5174 ok &= tertiaries.isOk();
5175 result.Append(UCOL_LEVELTERMINATOR);
5176 tertiaries.appendTo(result);
5177
5178 result.Append(0);
5179 }
5180
5181 /* To avoid memory leak, free the offset buffer if necessary. */
5182 ucol_freeOffsetBuffer(&s);
5183
5184 ok &= result.IsOk();
5185 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5186 }
5187
5188 static inline
5189 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5190 UBool notIsContinuation = !isContinuation(CE);
5191 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5192 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5193 || (!notIsContinuation && *wasShifted)))
5194 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that pri mary ignorables */
5195 {
5196 // The stuff below should probably be in the sortkey code... maybe not.. .
5197 if(primary1 != 0) { /* if we were shifted and we got an ignorable code p oint */
5198 /* we should just completely ignore it */
5199 *wasShifted = TRUE;
5200 //continue;
5201 }
5202 //*wasShifted = TRUE;
5203 return TRUE;
5204 } else {
5205 *wasShifted = FALSE;
5206 return FALSE;
5207 }
5208 }
5209 static inline
5210 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *des t) {
5211 if(level < maxLevel) {
5212 dest[i++] = UCOL_LEVELTERMINATOR;
5213 } else {
5214 dest[i++] = 0;
5215 }
5216 }
5217
/**
 * Level identifiers used by the partial sort key generation.
 * Every value fits into three bits.
 */
enum {
    /** primary level */
    UCOL_PSK_PRIMARY    = 0,
    /** secondary level */
    UCOL_PSK_SECONDARY  = 1,
    /** case level */
    UCOL_PSK_CASE       = 2,
    /** tertiary level */
    UCOL_PSK_TERTIARY   = 3,
    /** quaternary level */
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT
};
5230
/**
 * Layout of the collation state word that is kept in state[1].
 * A field is read as (state[1] >> X_SHIFT) & X_MASK: the *_SHIFT value moves
 * the field down to bit 0 and the *_MASK value is ANDed with the shifted word.
 */
enum {
    /** level identificator. stores one of the UCOL_PSK_* level values */
    UCOL_PSK_LEVEL_SHIFT = 0,
    /** three bits */
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * number of bytes of primary or quaternary already written.
     * Can be only 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    /** can be 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * how many French bytes have we already written (two bits).
     * When we do French we need to reverse secondary values. However, continuations
     * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** number of bytes of a bocsu sequence already written (two bits) */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /**
     * number of CEs consumed since the last saved iterator state.
     * NOTE(review): this mask is 19 bits wide (state bits 9..27), while the
     * state description on ucol_nextSortKeyPart lists bits 9..31 - confirm
     * which one is intended.
     */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5256
// macro calculating the number of expansion CEs available:
// the distance between the CEpos and toReturn members of s.
// The whole expansion is parenthesized so that the macro can be used inside
// larger expressions (e.g. "n - uprv_numAvailableExpCEs(s)") without the
// surrounding operators regrouping the subtraction.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5259
5260
5261 /** main sortkey part procedure. On the first call,
5262 * you should pass in a collator, an iterator, empty state
5263 * state[0] == state[1] == 0, a buffer to hold results
5264 * number of bytes you need and an error code pointer.
5265 * Make sure your buffer is big enough to hold the wanted
5266 * number of sortkey bytes. I don't check.
5267 * The only meaningful status you can get back is
5268 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5269 * have been dealt a raw deal and that you probably won't
5270 * be able to use partial sortkey generation for this
5271 * particular combination of string and collator. This
5272 * is highly unlikely, but you should still check the error code.
5273 * Any other status means that you're not in a sane situation
5274 * anymore. After the first call, preserve state values and
5275 * use them on subsequent calls to obtain more bytes of a sortkey.
5276 * Use until the number of bytes written is smaller than the requested
5277 * number of bytes. Generated sortkey is not compatible with the
5278 * one generated by ucol_getSortKey, as we don't do any compression.
5279 * However, levels are still terminated by a 1 (one) and the sortkey
5280 * is terminated by a 0 (zero). Identical level is the same as in the
5281 * regular sortkey - internal bocu-1 implementation is used.
5282 * For curious, although you cannot do much about this, here is
5283 * the structure of state words.
5284 * state[0] - iterator state. Depends on the iterator implementation,
5285 * but allows the iterator to continue where it stopped in
5286 * the last iteration.
5287 * state[1] - collation processing state. Here is the distribution
5288 * of the bits:
5289 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5290 * quaternary, quin (we don't use this one), identical and
5291 * null (producing only zeroes - first one to terminate the
5292 * sortkey and subsequent to fill the buffer).
5293 * 3 - byte count. Number of bytes written on the primary level.
5294 * 4 - was shifted. Whether the previous iteration finished in the
5295 * shifted state.
5296 * 5, 6 - French continuation bytes written. See the comment in the enum
5297 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5298 * the identical level.
5299 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5300 * since thes last successful update of the iterator state.
5301 */
5302 U_CAPI int32_t U_EXPORT2 209 U_CAPI int32_t U_EXPORT2
5303 ucol_nextSortKeyPart(const UCollator *coll, 210 ucol_nextSortKeyPart(const UCollator *coll,
5304 UCharIterator *iter, 211 UCharIterator *iter,
5305 uint32_t state[2], 212 uint32_t state[2],
5306 uint8_t *dest, int32_t count, 213 uint8_t *dest, int32_t count,
5307 UErrorCode *status) 214 UErrorCode *status)
5308 { 215 {
5309 /* error checking */ 216 /* error checking */
5310 if(status==NULL || U_FAILURE(*status)) { 217 if(status==NULL || U_FAILURE(*status)) {
5311 return 0; 218 return 0;
5312 } 219 }
5313 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 220 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5314 if( coll==NULL || iter==NULL ||
5315 state==NULL ||
5316 count<0 || (count>0 && dest==NULL)
5317 ) {
5318 *status=U_ILLEGAL_ARGUMENT_ERROR;
5319 UTRACE_EXIT_STATUS(status);
5320 return 0;
5321 }
5322
5323 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count= %d", 221 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count= %d",
5324 coll, iter, state[0], state[1], dest, count); 222 coll, iter, state[0], state[1], dest, count);
5325 223
5326 if(count==0) { 224 int32_t i = Collator::fromUCollator(coll)->
5327 /* nothing to do */ 225 internalNextSortKeyPart(iter, state, dest, count, *status);
5328 UTRACE_EXIT_VALUE(0); 226
5329 return 0;
5330 }
5331 /** Setting up situation according to the state we got from the previous ite ration */
5332 // The state of the iterator from the previous invocation
5333 uint32_t iterState = state[0];
5334 // Has the last iteration ended in the shifted state
5335 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_ SHIFTED_MASK)?TRUE:FALSE;
5336 // What is the current level of the sortkey?
5337 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5338 // Have we written only one byte from a two byte primary in the previous ite ration?
5339 // Also on secondary level - have we finished with the French secondary?
5340 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5341 // number of bytes in the continuation buffer for French
5342 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE D_FRENCH_MASK;
5343 // Number of bytes already written from a bocsu sequence. Since
5344 // the longes bocsu sequence is 4 long, this can be up to 3.
5345 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK _BOCSU_BYTES_MASK;
5346 // Number of elements that need to be consumed in this iteration because
5347 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5348 // so we had to save the last valid state.
5349 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED _CES_MASK;
5350
5351 /** values that depend on the collator attributes */
5352 // strength of the collator.
5353 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5354 // maximal level of the partial sortkey. Need to take whether case level is done
5355 int32_t maxLevel = 0;
5356 if(strength < UCOL_TERTIARY) {
5357 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5358 maxLevel = UCOL_PSK_CASE;
5359 } else {
5360 maxLevel = strength;
5361 }
5362 } else {
5363 if(strength == UCOL_TERTIARY) {
5364 maxLevel = UCOL_PSK_TERTIARY;
5365 } else if(strength == UCOL_QUATERNARY) {
5366 maxLevel = UCOL_PSK_QUATERNARY;
5367 } else { // identical
5368 maxLevel = UCOL_IDENTICAL;
5369 }
5370 }
5371 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5372 uint8_t UCOL_HIRAGANA_QUAD =
5373 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON )?0xFE:0xFF;
5374 // Boundary value that decides whether a CE is shifted or not
5375 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV alue<<16):0;
5376 // Are we doing French collation?
5377 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5378
5379 /** initializing the collation state */
5380 UBool notIsContinuation = FALSE;
5381 uint32_t CE = UCOL_NO_MORE_CES;
5382
5383 collIterate s;
5384 IInit_collIterate(coll, NULL, -1, &s, status);
5385 if(U_FAILURE(*status)) {
5386 UTRACE_EXIT_STATUS(*status);
5387 return 0;
5388 }
5389 s.iterator = iter;
5390 s.flags |= UCOL_USE_ITERATOR;
5391 // This variable tells us whether we have produced some other levels in this iteration
5392 // before we moved to the identical level. In that case, we need to switch t he
5393 // type of the iterator.
5394 UBool doingIdenticalFromStart = FALSE;
5395 // Normalizing iterator
5396 // The division for the array length may truncate the array size to
5397 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5398 // for all platforms anyway.
5399 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5400 UNormIterator *normIter = NULL;
5401 // If the normalization is turned on for the collator and we are below ident ical level
5402 // we will use a FCD normalizing iterator
5403 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le vel < UCOL_PSK_IDENTICAL) {
5404 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5405 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5406 s.flags &= ~UCOL_ITER_NORM;
5407 if(U_FAILURE(*status)) {
5408 UTRACE_EXIT_STATUS(*status);
5409 return 0;
5410 }
5411 } else if(level == UCOL_PSK_IDENTICAL) {
5412 // for identical level, we need a NFD iterator. We need to instantiate i t here, since we
5413 // will be updating the state - and this cannot be done on an ordinary i terator.
5414 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5415 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5416 s.flags &= ~UCOL_ITER_NORM;
5417 if(U_FAILURE(*status)) {
5418 UTRACE_EXIT_STATUS(*status);
5419 return 0;
5420 }
5421 doingIdenticalFromStart = TRUE;
5422 }
5423
5424 // This is the tentative new state of the iterator. The problem
5425 // is that the iterator might return an undefined state, in
5426 // which case we should save the last valid state and increase
5427 // the iterator skip value.
5428 uint32_t newState = 0;
5429
5430 // First, we set the iterator to the last valid position
5431 // from the last iteration. This was saved in state[0].
5432 if(iterState == 0) {
5433 /* initial state */
5434 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5435 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5436 } else {
5437 s.iterator->move(s.iterator, 0, UITER_START);
5438 }
5439 } else {
5440 /* reset to previous state */
5441 s.iterator->setState(s.iterator, iterState, status);
5442 if(U_FAILURE(*status)) {
5443 UTRACE_EXIT_STATUS(*status);
5444 return 0;
5445 }
5446 }
5447
5448
5449
5450 // This variable tells us whether we can attempt to update the state
5451 // of iterator. Situations where we don't want to update iterator state
5452 // are the existence of expansion CEs that are not yet processed, and
5453 // finishing the case level without enough space in the buffer to insert
5454 // a level terminator.
5455 UBool canUpdateState = TRUE;
5456
5457 // Consume all the CEs that were consumed at the end of the previous
5458 // iteration without updating the iterator state. On identical level,
5459 // consume the code points.
5460 int32_t counter = cces;
5461 if(level < UCOL_PSK_IDENTICAL) {
5462 while(counter-->0) {
5463 // If we're doing French and we are on the secondary level,
5464 // we go backwards.
5465 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5466 CE = ucol_IGetPrevCE(coll, &s, status);
5467 } else {
5468 CE = ucol_IGetNextCE(coll, &s, status);
5469 }
5470 if(CE==UCOL_NO_MORE_CES) {
5471 /* should not happen */
5472 *status=U_INTERNAL_PROGRAM_ERROR;
5473 UTRACE_EXIT_STATUS(*status);
5474 return 0;
5475 }
5476 if(uprv_numAvailableExpCEs(s)) {
5477 canUpdateState = FALSE;
5478 }
5479 }
5480 } else {
5481 while(counter-->0) {
5482 uiter_next32(s.iterator);
5483 }
5484 }
5485
5486 // French secondary needs to know whether the iterator state of zero came fr om previous level OR
5487 // from a new invocation...
5488 UBool wasDoingPrimary = FALSE;
5489 // destination buffer byte counter. When this guy
5490 // gets to count, we're done with the iteration
5491 int32_t i = 0;
5492 // used to count the zero bytes written after we
5493 // have finished with the sort key
5494 int32_t j = 0;
5495
5496
5497 // Hm.... I think we're ready to plunge in. Basic story is as following:
5498 // we have a fall through case based on level. This is used for initial
5499 // positioning on iteration start. Every level processor contains a
5500 // for(;;) which will be broken when we exhaust all the CEs. Other
5501 // way to exit is a goto saveState, which happens when we have filled
5502 // out our buffer.
5503 switch(level) {
5504 case UCOL_PSK_PRIMARY:
5505 wasDoingPrimary = TRUE;
5506 for(;;) {
5507 if(i==count) {
5508 goto saveState;
5509 }
5510 // We should save the state only if we
5511 // are sure that we are done with the
5512 // previous iterator state
5513 if(canUpdateState && byteCountOrFrenchDone == 0) {
5514 newState = s.iterator->getState(s.iterator);
5515 if(newState != UITER_NO_STATE) {
5516 iterState = newState;
5517 cces = 0;
5518 }
5519 }
5520 CE = ucol_IGetNextCE(coll, &s, status);
5521 cces++;
5522 if(CE==UCOL_NO_MORE_CES) {
5523 // Add the level separator
5524 terminatePSKLevel(level, maxLevel, i, dest);
5525 byteCountOrFrenchDone=0;
5526 // Restart the iteration an move to the
5527 // second level
5528 s.iterator->move(s.iterator, 0, UITER_START);
5529 cces = 0;
5530 level = UCOL_PSK_SECONDARY;
5531 break;
5532 }
5533 if(!isContinuation(CE)){
5534 if(coll->leadBytePermutationTable != NULL){
5535 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5536 }
5537 }
5538 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5539 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5540 if(CE != 0) {
5541 if(byteCountOrFrenchDone == 0) {
5542 // get the second byte of primary
5543 dest[i++]=(uint8_t)(CE >> 8);
5544 } else {
5545 byteCountOrFrenchDone = 0;
5546 }
5547 if((CE &=0xff)!=0) {
5548 if(i==count) {
5549 /* overflow */
5550 byteCountOrFrenchDone = 1;
5551 cces--;
5552 goto saveState;
5553 }
5554 dest[i++]=(uint8_t)CE;
5555 }
5556 }
5557 }
5558 if(uprv_numAvailableExpCEs(s)) {
5559 canUpdateState = FALSE;
5560 } else {
5561 canUpdateState = TRUE;
5562 }
5563 }
5564 /* fall through to next level */
5565 case UCOL_PSK_SECONDARY:
5566 if(strength >= UCOL_SECONDARY) {
5567 if(!doingFrench) {
5568 for(;;) {
5569 if(i == count) {
5570 goto saveState;
5571 }
5572 // We should save the state only if we
5573 // are sure that we are done with the
5574 // previous iterator state
5575 if(canUpdateState) {
5576 newState = s.iterator->getState(s.iterator);
5577 if(newState != UITER_NO_STATE) {
5578 iterState = newState;
5579 cces = 0;
5580 }
5581 }
5582 CE = ucol_IGetNextCE(coll, &s, status);
5583 cces++;
5584 if(CE==UCOL_NO_MORE_CES) {
5585 // Add the level separator
5586 terminatePSKLevel(level, maxLevel, i, dest);
5587 byteCountOrFrenchDone = 0;
5588 // Restart the iteration an move to the
5589 // second level
5590 s.iterator->move(s.iterator, 0, UITER_START);
5591 cces = 0;
5592 level = UCOL_PSK_CASE;
5593 break;
5594 }
5595 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5596 CE >>= 8; /* get secondary */
5597 if(CE != 0) {
5598 dest[i++]=(uint8_t)CE;
5599 }
5600 }
5601 if(uprv_numAvailableExpCEs(s)) {
5602 canUpdateState = FALSE;
5603 } else {
5604 canUpdateState = TRUE;
5605 }
5606 }
5607 } else { // French secondary processing
5608 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5609 int32_t frenchIndex = 0;
5610 // Here we are going backwards.
5611 // If the iterator is at the beggining, it should be
5612 // moved to end.
5613 if(wasDoingPrimary) {
5614 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5615 cces = 0;
5616 }
5617 for(;;) {
5618 if(i == count) {
5619 goto saveState;
5620 }
5621 if(canUpdateState) {
5622 newState = s.iterator->getState(s.iterator);
5623 if(newState != UITER_NO_STATE) {
5624 iterState = newState;
5625 cces = 0;
5626 }
5627 }
5628 CE = ucol_IGetPrevCE(coll, &s, status);
5629 cces++;
5630 if(CE==UCOL_NO_MORE_CES) {
5631 // Add the level separator
5632 terminatePSKLevel(level, maxLevel, i, dest);
5633 byteCountOrFrenchDone = 0;
5634 // Restart the iteration an move to the next level
5635 s.iterator->move(s.iterator, 0, UITER_START);
5636 level = UCOL_PSK_CASE;
5637 break;
5638 }
5639 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5640 // reverse when we get a first non-continuation CE.
5641 CE >>= 8;
5642 frenchBuff[frenchIndex++] = (uint8_t)CE;
5643 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5644 CE >>= 8; /* get secondary */
5645 if(!frenchIndex) {
5646 if(CE != 0) {
5647 dest[i++]=(uint8_t)CE;
5648 }
5649 } else {
5650 frenchBuff[frenchIndex++] = (uint8_t)CE;
5651 frenchIndex -= usedFrench;
5652 usedFrench = 0;
5653 while(i < count && frenchIndex) {
5654 dest[i++] = frenchBuff[--frenchIndex];
5655 usedFrench++;
5656 }
5657 }
5658 }
5659 if(uprv_numAvailableExpCEs(s)) {
5660 canUpdateState = FALSE;
5661 } else {
5662 canUpdateState = TRUE;
5663 }
5664 }
5665 }
5666 } else {
5667 level = UCOL_PSK_CASE;
5668 }
5669 /* fall through to next level */
5670 case UCOL_PSK_CASE:
5671 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5672 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5673 uint8_t caseByte = UCOL_CASE_BYTE_START;
5674 uint8_t caseBits = 0;
5675
5676 for(;;) {
5677 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5678 if(i == count) {
5679 goto saveState;
5680 }
5681 // We should save the state only if we
5682 // are sure that we are done with the
5683 // previous iterator state
5684 if(canUpdateState) {
5685 newState = s.iterator->getState(s.iterator);
5686 if(newState != UITER_NO_STATE) {
5687 iterState = newState;
5688 cces = 0;
5689 }
5690 }
5691 CE = ucol_IGetNextCE(coll, &s, status);
5692 cces++;
5693 if(CE==UCOL_NO_MORE_CES) {
5694 // On the case level we might have an unfinished
5695 // case byte. Add one if it's started.
5696 if(caseShift != UCOL_CASE_SHIFT_START) {
5697 dest[i++] = caseByte;
5698 }
5699 cces = 0;
5700 // We have finished processing CEs on this level.
5701 // However, we don't know if we have enough space
5702 // to add a case level terminator.
5703 if(i < count) {
5704 // Add the level separator
5705 terminatePSKLevel(level, maxLevel, i, dest);
5706 // Restart the iteration and move to the
5707 // next level
5708 s.iterator->move(s.iterator, 0, UITER_START);
5709 level = UCOL_PSK_TERTIARY;
5710 } else {
5711 canUpdateState = FALSE;
5712 }
5713 break;
5714 }
5715
5716 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5717 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || s trength > UCOL_PRIMARY)) {
5718 // do the case level if we need to do it. We don't want to calculate
5719 // case level for primary ignorables if we have only pri mary strength and case level
5720 // otherwise we would break well formedness of CEs
5721 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5722 caseBits = (uint8_t)(CE & 0xC0);
5723 // this copies the case level logic from the
5724 // sort key generation code
5725 if(CE != 0) {
5726 if (caseShift == 0) {
5727 dest[i++] = caseByte;
5728 caseShift = UCOL_CASE_SHIFT_START;
5729 caseByte = UCOL_CASE_BYTE_START;
5730 }
5731 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5732 if((caseBits & 0xC0) == 0) {
5733 caseByte |= 1 << (--caseShift);
5734 } else {
5735 caseByte |= 0 << (--caseShift);
5736 /* second bit */
5737 if(caseShift == 0) {
5738 dest[i++] = caseByte;
5739 caseShift = UCOL_CASE_SHIFT_START;
5740 caseByte = UCOL_CASE_BYTE_START;
5741 }
5742 caseByte |= ((caseBits>>6)&1) << (--caseShif t);
5743 }
5744 } else {
5745 if((caseBits & 0xC0) == 0) {
5746 caseByte |= 0 << (--caseShift);
5747 } else {
5748 caseByte |= 1 << (--caseShift);
5749 /* second bit */
5750 if(caseShift == 0) {
5751 dest[i++] = caseByte;
5752 caseShift = UCOL_CASE_SHIFT_START;
5753 caseByte = UCOL_CASE_BYTE_START;
5754 }
5755 caseByte |= ((caseBits>>7)&1) << (--caseShif t);
5756 }
5757 }
5758 }
5759
5760 }
5761 }
5762 // Not sure this is correct for the case level - revisit
5763 if(uprv_numAvailableExpCEs(s)) {
5764 canUpdateState = FALSE;
5765 } else {
5766 canUpdateState = TRUE;
5767 }
5768 }
5769 } else {
5770 level = UCOL_PSK_TERTIARY;
5771 }
5772 /* fall through to next level */
5773 case UCOL_PSK_TERTIARY:
5774 if(strength >= UCOL_TERTIARY) {
5775 for(;;) {
5776 if(i == count) {
5777 goto saveState;
5778 }
5779 // We should save the state only if we
5780 // are sure that we are done with the
5781 // previous iterator state
5782 if(canUpdateState) {
5783 newState = s.iterator->getState(s.iterator);
5784 if(newState != UITER_NO_STATE) {
5785 iterState = newState;
5786 cces = 0;
5787 }
5788 }
5789 CE = ucol_IGetNextCE(coll, &s, status);
5790 cces++;
5791 if(CE==UCOL_NO_MORE_CES) {
5792 // Add the level separator
5793 terminatePSKLevel(level, maxLevel, i, dest);
5794 byteCountOrFrenchDone = 0;
5795 // Restart the iteration an move to the
5796 // second level
5797 s.iterator->move(s.iterator, 0, UITER_START);
5798 cces = 0;
5799 level = UCOL_PSK_QUATERNARY;
5800 break;
5801 }
5802 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5803 notIsContinuation = !isContinuation(CE);
5804
5805 if(notIsContinuation) {
5806 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5807 CE ^= coll->caseSwitch;
5808 CE &= coll->tertiaryMask;
5809 } else {
5810 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5811 }
5812
5813 if(CE != 0) {
5814 dest[i++]=(uint8_t)CE;
5815 }
5816 }
5817 if(uprv_numAvailableExpCEs(s)) {
5818 canUpdateState = FALSE;
5819 } else {
5820 canUpdateState = TRUE;
5821 }
5822 }
5823 } else {
5824 // if we're not doing tertiary
5825 // skip to the end
5826 level = UCOL_PSK_NULL;
5827 }
5828 /* fall through to next level */
5829 case UCOL_PSK_QUATERNARY:
5830 if(strength >= UCOL_QUATERNARY) {
5831 for(;;) {
5832 if(i == count) {
5833 goto saveState;
5834 }
5835 // We should save the state only if we
5836 // are sure that we are done with the
5837 // previous iterator state
5838 if(canUpdateState) {
5839 newState = s.iterator->getState(s.iterator);
5840 if(newState != UITER_NO_STATE) {
5841 iterState = newState;
5842 cces = 0;
5843 }
5844 }
5845 CE = ucol_IGetNextCE(coll, &s, status);
5846 cces++;
5847 if(CE==UCOL_NO_MORE_CES) {
5848 // Add the level separator
5849 terminatePSKLevel(level, maxLevel, i, dest);
5850 //dest[i++] = UCOL_LEVELTERMINATOR;
5851 byteCountOrFrenchDone = 0;
5852 // Restart the iteration an move to the
5853 // second level
5854 s.iterator->move(s.iterator, 0, UITER_START);
5855 cces = 0;
5856 level = UCOL_PSK_QUIN;
5857 break;
5858 }
5859 if(CE==0)
5860 continue;
5861 if(isShiftedCE(CE, LVT, &wasShifted)) {
5862 CE >>= 16; /* get primary */
5863 if(CE != 0) {
5864 if(byteCountOrFrenchDone == 0) {
5865 dest[i++]=(uint8_t)(CE >> 8);
5866 } else {
5867 byteCountOrFrenchDone = 0;
5868 }
5869 if((CE &=0xff)!=0) {
5870 if(i==count) {
5871 /* overflow */
5872 byteCountOrFrenchDone = 1;
5873 goto saveState;
5874 }
5875 dest[i++]=(uint8_t)CE;
5876 }
5877 }
5878 } else {
5879 notIsContinuation = !isContinuation(CE);
5880 if(notIsContinuation) {
5881 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a nd we need to note it
5882 dest[i++] = UCOL_HIRAGANA_QUAD;
5883 } else {
5884 dest[i++] = 0xFF;
5885 }
5886 }
5887 }
5888 if(uprv_numAvailableExpCEs(s)) {
5889 canUpdateState = FALSE;
5890 } else {
5891 canUpdateState = TRUE;
5892 }
5893 }
5894 } else {
5895 // if we're not doing quaternary
5896 // skip to the end
5897 level = UCOL_PSK_NULL;
5898 }
5899 /* fall through to next level */
5900 case UCOL_PSK_QUIN:
5901 level = UCOL_PSK_IDENTICAL;
5902 /* fall through to next level */
5903 case UCOL_PSK_IDENTICAL:
5904 if(strength >= UCOL_IDENTICAL) {
5905 UChar32 first, second;
5906 int32_t bocsuBytesWritten = 0;
5907 // We always need to do identical on
5908 // the NFD form of the string.
5909 if(normIter == NULL) {
5910 // we arrived from the level below and
5911 // normalization was not turned on.
5912 // therefore, we need to make a fresh NFD iterator
5913 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5914 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5915 } else if(!doingIdenticalFromStart) {
5916 // there is an iterator, but we did some other levels.
5917 // therefore, we have a FCD iterator - need to make
5918 // a NFD one.
5919 // normIter being at the beginning does not guarantee
5920 // that the underlying iterator is at the beginning
5921 iter->move(iter, 0, UITER_START);
5922 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5923 }
5924 // At this point we have a NFD iterator that is positioned
5925 // in the right place
5926 if(U_FAILURE(*status)) {
5927 UTRACE_EXIT_STATUS(*status);
5928 return 0;
5929 }
5930 first = uiter_previous32(s.iterator);
5931 // maybe we're at the start of the string
5932 if(first == U_SENTINEL) {
5933 first = 0;
5934 } else {
5935 uiter_next32(s.iterator);
5936 }
5937
5938 j = 0;
5939 for(;;) {
5940 if(i == count) {
5941 if(j+1 < bocsuBytesWritten) {
5942 bocsuBytesUsed = j+1;
5943 }
5944 goto saveState;
5945 }
5946
5947 // On identical level, we will always save
5948 // the state if we reach this point, since
5949 // we don't depend on getNextCE for content
5950 // all the content is in our buffer and we
5951 // already either stored the full buffer OR
5952 // otherwise we won't arrive here.
5953 newState = s.iterator->getState(s.iterator);
5954 if(newState != UITER_NO_STATE) {
5955 iterState = newState;
5956 cces = 0;
5957 }
5958
5959 uint8_t buff[4];
5960 second = uiter_next32(s.iterator);
5961 cces++;
5962
5963 // end condition for identical level
5964 if(second == U_SENTINEL) {
5965 terminatePSKLevel(level, maxLevel, i, dest);
5966 level = UCOL_PSK_NULL;
5967 break;
5968 }
5969 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco nd, buff);
5970 first = second;
5971
5972 j = 0;
5973 if(bocsuBytesUsed != 0) {
5974 while(bocsuBytesUsed-->0) {
5975 j++;
5976 }
5977 }
5978
5979 while(i < count && j < bocsuBytesWritten) {
5980 dest[i++] = buff[j++];
5981 }
5982 }
5983
5984 } else {
5985 level = UCOL_PSK_NULL;
5986 }
5987 /* fall through to next level */
5988 case UCOL_PSK_NULL:
5989 j = i;
5990 while(j<count) {
5991 dest[j++]=0;
5992 }
5993 break;
5994 default:
5995 *status = U_INTERNAL_PROGRAM_ERROR;
5996 UTRACE_EXIT_STATUS(*status);
5997 return 0;
5998 }
5999
6000 saveState:
6001 // Now we need to return stuff. First we want to see whether we have
6002 // done everything for the current state of iterator.
6003 if(byteCountOrFrenchDone
6004 || canUpdateState == FALSE
6005 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6006 {
6007 // Any of above mean that the previous transaction
6008 // wasn't finished and that we should store the
6009 // previous iterator state.
6010 state[0] = iterState;
6011 } else {
6012 // The transaction is complete. We will continue in the next iteration.
6013 state[0] = s.iterator->getState(s.iterator);
6014 cces = 0;
6015 }
6016 // Store the number of bocsu bytes written.
6017 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6018 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6019 }
6020 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY TES_SHIFT;
6021
6022 // Next we put in the level of comparison
6023 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6024
6025 // If we are doing French, we need to store whether we have just finished th e French level
6026 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6027 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_D ONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6028 } else {
6029 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE _MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6030 }
6031
6032 // Was the latest CE shifted
6033 if(wasShifted) {
6034 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6035 }
6036 // Check for cces overflow
6037 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6038 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6039 }
6040 // Store cces
6041 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH IFT);
6042
6043 // Check for French overflow
6044 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6045 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6046 }
6047 // Store number of bytes written in the French secondary continuation sequen ce
6048 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC H_SHIFT);
6049
6050
6051 // If we have used normalizing iterator, get rid of it
6052 if(normIter != NULL) {
6053 unorm_closeIter(normIter);
6054 }
6055
6056 /* To avoid memory leak, free the offset buffer if necessary. */
6057 ucol_freeOffsetBuffer(&s);
6058
6059 // Return number of meaningful sortkey bytes. 227 // Return number of meaningful sortkey bytes.
6060 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 228 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6061 dest,i, state[0], state[1]); 229 dest,i, state[0], state[1]);
6062 UTRACE_EXIT_VALUE(i); 230 UTRACE_EXIT_VALUE_STATUS(i, *status);
6063 return i; 231 return i;
6064 } 232 }
6065 233
6066 /** 234 /**
6067 * Produce a bound for a given sortkey and a number of levels. 235 * Produce a bound for a given sortkey and a number of levels.
6068 */ 236 */
6069 U_CAPI int32_t U_EXPORT2 237 U_CAPI int32_t U_EXPORT2
6070 ucol_getBound(const uint8_t *source, 238 ucol_getBound(const uint8_t *source,
6071 int32_t sourceLength, 239 int32_t sourceLength,
6072 UColBoundMode boundType, 240 UColBoundMode boundType,
6073 uint32_t noOfLevels, 241 uint32_t noOfLevels,
6074 uint8_t *result, 242 uint8_t *result,
6075 int32_t resultLength, 243 int32_t resultLength,
6076 UErrorCode *status) 244 UErrorCode *status)
6077 { 245 {
6078 // consistency checks 246 // consistency checks
6079 if(status == NULL || U_FAILURE(*status)) { 247 if(status == NULL || U_FAILURE(*status)) {
6080 return 0; 248 return 0;
6081 } 249 }
6082 if(source == NULL) { 250 if(source == NULL) {
6083 *status = U_ILLEGAL_ARGUMENT_ERROR; 251 *status = U_ILLEGAL_ARGUMENT_ERROR;
6084 return 0; 252 return 0;
6085 } 253 }
6086 254
6087 int32_t sourceIndex = 0; 255 int32_t sourceIndex = 0;
6088 // Scan the string until we skip enough of the key OR reach the end of the k ey 256 // Scan the string until we skip enough of the key OR reach the end of the k ey
6089 do { 257 do {
6090 sourceIndex++; 258 sourceIndex++;
6091 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 259 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) {
6092 noOfLevels--; 260 noOfLevels--;
6093 } 261 }
6094 } while (noOfLevels > 0 262 } while (noOfLevels > 0
6095 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 263 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6096 264
6097 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 265 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6098 && noOfLevels > 0) { 266 && noOfLevels > 0) {
6099 *status = U_SORT_KEY_TOO_SHORT_WARNING; 267 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6100 } 268 }
6101 269
(...skipping 22 matching lines...) Expand all
6124 return 0; 292 return 0;
6125 } 293 }
6126 result[sourceIndex++] = 0; 294 result[sourceIndex++] = 0;
6127 295
6128 return sourceIndex; 296 return sourceIndex;
6129 } else { 297 } else {
6130 return sourceIndex+boundType+1; 298 return sourceIndex+boundType+1;
6131 } 299 }
6132 } 300 }
6133 301
6134 /****************************************************************************/ 302 U_CAPI void U_EXPORT2
6135 /* Following are the functions that deal with the properties of a collator */ 303 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCo de) {
6136 /* there are new APIs and some compatibility APIs */ 304 if(U_FAILURE(*pErrorCode)) { return; }
6137 /****************************************************************************/ 305 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode);
6138 306 }
6139 static inline void 307
6140 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 308 U_CAPI UColReorderCode U_EXPORT2
6141 int32_t *primShift, int32_t *secShift, int32_t *terShift) 309 ucol_getMaxVariable(const UCollator *coll) {
6142 { 310 return Collator::fromUCollator(coll)->getMaxVariable();
6143 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6144 UBool reverseSecondary = FALSE;
6145 UBool continuation = isContinuation(CE);
6146 if(!continuation) {
6147 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6148 tertiary ^= coll->caseSwitch;
6149 reverseSecondary = TRUE;
6150 } else {
6151 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6152 tertiary &= UCOL_REMOVE_CASE;
6153 reverseSecondary = FALSE;
6154 }
6155
6156 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6157 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6158 primary1 = (uint8_t)(CE >> 8);
6159
6160 if(primary1 != 0) {
6161 if (coll->leadBytePermutationTable != NULL && !continuation) {
6162 primary1 = coll->leadBytePermutationTable[primary1];
6163 }
6164
6165 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6166 *primShift -= 8;
6167 }
6168 if(primary2 != 0) {
6169 if(*primShift < 0) {
6170 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6171 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6172 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6173 return;
6174 }
6175 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6176 *primShift -= 8;
6177 }
6178 if(secondary != 0) {
6179 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se condary
6180 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo r secondary
6181 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6182 } else { // normal case
6183 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secSh ift);
6184 }
6185 *secShift -= 8;
6186 }
6187 if(tertiary != 0) {
6188 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift );
6189 *terShift -= 8;
6190 }
6191 }
6192
6193 static inline UBool
6194 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6195 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6196 if(newTable == NULL) {
6197 *status = U_MEMORY_ALLOCATION_ERROR;
6198 coll->latinOneFailed = TRUE;
6199 return FALSE;
6200 }
6201 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable Len)*sizeof(uint32_t);
6202 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6203 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6204 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC opy);
6205 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz eToCopy);
6206 coll->latinOneTableLen = size;
6207 uprv_free(coll->latinOneCEs);
6208 coll->latinOneCEs = newTable;
6209 return TRUE;
6210 }
6211
6212 static UBool
6213 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6214 UBool result = TRUE;
6215 if(coll->latinOneCEs == NULL) {
6216 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINO NETABLELEN*3);
6217 if(coll->latinOneCEs == NULL) {
6218 *status = U_MEMORY_ALLOCATION_ERROR;
6219 return FALSE;
6220 }
6221 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6222 }
6223 UChar ch = 0;
6224 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6225 // Check for null pointer
6226 if (U_FAILURE(*status)) {
6227 ucol_closeElements(it);
6228 return FALSE;
6229 }
6230 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3) ;
6231
6232 int32_t primShift = 24, secShift = 24, terShift = 24;
6233 uint32_t CE = 0;
6234 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6235
6236 // TODO: make safe if you get more than you wanted...
6237 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6238 primShift = 24; secShift = 24; terShift = 24;
6239 if(ch < 0x100) {
6240 CE = coll->latinOneMapping[ch];
6241 } else {
6242 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6243 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6244 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6245 }
6246 }
6247 if(CE < UCOL_NOT_FOUND) {
6248 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift );
6249 } else {
6250 switch (getCETag(CE)) {
6251 case EXPANSION_TAG:
6252 case DIGIT_TAG:
6253 ucol_setText(it, &ch, 1, status);
6254 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6255 if(primShift < 0 || secShift < 0 || terShift < 0) {
6256 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6257 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL _OUT_CE;
6258 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA IL_OUT_CE;
6259 break;
6260 }
6261 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, & terShift);
6262 }
6263 break;
6264 case CONTRACTION_TAG:
6265 // here is the trick
6266 // F2 is contraction. We do something very similar to contractio ns
6267 // but have two indices, one in the real contraction table and t he
6268 // other to where we stuffed things. This hopes that we don't ha ve
6269 // many contractions (this should work for latin-1 tables).
6270 {
6271 if((CE & 0x00FFF000) != 0) {
6272 *status = U_UNSUPPORTED_ERROR;
6273 goto cleanup_after_failure;
6274 }
6275
6276 const UChar *UCharOffset = (UChar *)coll->image+getContractO ffset(CE);
6277
6278 CE |= (contractionOffset & 0xFFF) << 12; // insert the offse t in latin-1 table
6279
6280 coll->latinOneCEs[ch] = CE;
6281 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6282 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6283
6284 // We're going to jump into contraction table, pick the elem ents
6285 // and use them
6286 do {
6287 CE = *(coll->contractionCEs +
6288 (UCharOffset - coll->contractionIndex));
6289 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6290 uint32_t size;
6291 uint32_t i; /* general counter */
6292 uint32_t *CEOffset = (uint32_t *)coll->image+getExpa nsionOffset(CE); /* find the offset to expansion table */
6293 size = getExpansionCount(CE);
6294 //CE = *CEOffset++;
6295 if(size != 0) { /* if there are less than 16 element s in expansion, we don't terminate */
6296 for(i = 0; i<size; i++) {
6297 if(primShift < 0 || secShift < 0 || terShift < 0) {
6298 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;
6299 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6300 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6301 break;
6302 }
6303 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);
6304 }
6305 } else { /* else, we do */
6306 while(*CEOffset != 0) {
6307 if(primShift < 0 || secShift < 0 || terShift < 0) {
6308 coll->latinOneCEs[(UChar)contractionOffs et] = UCOL_BAIL_OUT_CE;
6309 coll->latinOneCEs[coll->latinOneTableLen +(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6310 coll->latinOneCEs[2*coll->latinOneTableL en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6311 break;
6312 }
6313 ucol_addLatinOneEntry(coll, (UChar)contracti onOffset, *CEOffset++, &primShift, &secShift, &terShift);
6314 }
6315 }
6316 contractionOffset++;
6317 } else if(CE < UCOL_NOT_FOUND) {
6318 ucol_addLatinOneEntry(coll, (UChar)contractionOffset ++, CE, &primShift, &secShift, &terShift);
6319 } else {
6320 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B AIL_OUT_CE;
6321 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont ractionOffset] = UCOL_BAIL_OUT_CE;
6322 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co ntractionOffset] = UCOL_BAIL_OUT_CE;
6323 contractionOffset++;
6324 }
6325 UCharOffset++;
6326 primShift = 24; secShift = 24; terShift = 24;
6327 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6328 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT ableLen, status)) {
6329 goto cleanup_after_failure;
6330 }
6331 }
6332 } while(*UCharOffset != 0xFFFF);
6333 }
6334 break;;
6335 case SPEC_PROC_TAG:
6336 {
6337 // 0xB7 is a precontext character defined in UCA5.1, a speci al
6338 // handle is implemeted in order to save LatinOne table for
6339 // most locales.
6340 if (ch==0xb7) {
6341 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif t, &terShift);
6342 }
6343 else {
6344 goto cleanup_after_failure;
6345 }
6346 }
6347 break;
6348 default:
6349 goto cleanup_after_failure;
6350 }
6351 }
6352 }
6353 // compact table
6354 if(contractionOffset < coll->latinOneTableLen) {
6355 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6356 goto cleanup_after_failure;
6357 }
6358 }
6359 ucol_closeElements(it);
6360 return result;
6361
6362 cleanup_after_failure:
6363 // status should already be set before arriving here.
6364 coll->latinOneFailed = TRUE;
6365 ucol_closeElements(it);
6366 return FALSE;
6367 }
6368
6369 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6370 if(U_SUCCESS(*status)) {
6371 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6372 coll->caseSwitch = UCOL_CASE_SWITCH;
6373 } else {
6374 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6375 }
6376
6377 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6378 coll->tertiaryMask = UCOL_REMOVE_CASE;
6379 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6380 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6381 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6382 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6383 } else {
6384 coll->tertiaryMask = UCOL_KEEP_CASE;
6385 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6386 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6387 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6388 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6389 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6390 } else {
6391 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6392 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6393 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6394 }
6395 }
6396
6397 /* Set the compression values */
6398 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBott om - 1);
6399 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* w e multilply double with int, but need only int */
6400 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC ount);
6401
6402 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6403 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U COL_NON_IGNORABLE)
6404 {
6405 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6406 } else {
6407 coll->sortKeyGen = ucol_calcSortKey;
6408 }
6409 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col l->numericCollation == UCOL_OFF
6410 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF ailed)
6411 {
6412 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6413 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build ing latin1 table, we'll use it
6414 //fprintf(stderr, "F");
6415 coll->latinOneUse = TRUE;
6416 } else {
6417 coll->latinOneUse = FALSE;
6418 }
6419 if(*status == U_UNSUPPORTED_ERROR) {
6420 *status = U_ZERO_ERROR;
6421 }
6422 } else { // latin1Table exists and it doesn't need to be regenerated , just use it
6423 coll->latinOneUse = TRUE;
6424 }
6425 } else {
6426 coll->latinOneUse = FALSE;
6427 }
6428 }
6429 } 311 }
6430 312
6431 U_CAPI uint32_t U_EXPORT2 313 U_CAPI uint32_t U_EXPORT2
6432 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod e *status) { 314 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod e *status) {
6433 if(U_FAILURE(*status) || coll == NULL) { 315 if(U_FAILURE(*status) || coll == NULL) {
6434 return 0; 316 return 0;
6435 } 317 }
6436 if(len == -1) { 318 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status);
6437 len = u_strlen(varTop);
6438 }
6439 if(len == 0) {
6440 *status = U_ILLEGAL_ARGUMENT_ERROR;
6441 return 0;
6442 }
6443
6444 if(coll->delegate!=NULL) {
6445 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6446 }
6447
6448
6449 collIterate s;
6450 IInit_collIterate(coll, varTop, len, &s, status);
6451 if(U_FAILURE(*status)) {
6452 return 0;
6453 }
6454
6455 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6456
6457 /* here we check if we have consumed all characters */
6458 /* you can put in either one character or a contraction */
6459 /* you shouldn't put more... */
6460 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6461 *status = U_CE_NOT_FOUND_ERROR;
6462 return 0;
6463 }
6464
6465 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6466
6467 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6468 *status = U_PRIMARY_TOO_LONG_ERROR;
6469 return 0;
6470 }
6471 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6472 coll->variableTopValueisDefault = FALSE;
6473 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6474 }
6475
6476 /* To avoid memory leak, free the offset buffer if necessary. */
6477 ucol_freeOffsetBuffer(&s);
6478
6479 return CE & UCOL_PRIMARYMASK;
6480 } 319 }
6481 320
6482 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 321 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6483 if(U_FAILURE(*status) || coll == NULL) { 322 if(U_FAILURE(*status) || coll == NULL) {
6484 return 0; 323 return 0;
6485 } 324 }
6486 if(coll->delegate!=NULL) { 325 return Collator::fromUCollator(coll)->getVariableTop(*status);
6487 return ((const Collator*)coll->delegate)->getVariableTop(*status);
6488 }
6489 return coll->variableTopValue<<16;
6490 } 326 }
6491 327
6492 U_CAPI void U_EXPORT2 328 U_CAPI void U_EXPORT2
6493 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat us) { 329 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat us) {
6494 if(U_FAILURE(*status) || coll == NULL) { 330 if(U_FAILURE(*status) || coll == NULL) {
6495 return; 331 return;
6496 } 332 }
6497 333 Collator::fromUCollator(coll)->setVariableTop(varTop, *status);
6498 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 334 }
6499 coll->variableTopValueisDefault = FALSE; 335
6500 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6501 }
6502 }
6503 /* Attribute setter API */
6504 U_CAPI void U_EXPORT2 336 U_CAPI void U_EXPORT2
6505 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 337 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6506 if(U_FAILURE(*status) || coll == NULL) { 338 if(U_FAILURE(*status) || coll == NULL) {
6507 return; 339 return;
6508 } 340 }
6509 341
6510 if(coll->delegate != NULL) { 342 Collator::fromUCollator(coll)->setAttribute(attr, value, *status);
6511 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6512 return;
6513 }
6514
6515 UColAttributeValue oldFrench = coll->frenchCollation;
6516 UColAttributeValue oldCaseFirst = coll->caseFirst;
6517 switch(attr) {
6518 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6519 if(value == UCOL_ON) {
6520 coll->numericCollation = UCOL_ON;
6521 coll->numericCollationisDefault = FALSE;
6522 } else if (value == UCOL_OFF) {
6523 coll->numericCollation = UCOL_OFF;
6524 coll->numericCollationisDefault = FALSE;
6525 } else if (value == UCOL_DEFAULT) {
6526 coll->numericCollationisDefault = TRUE;
6527 coll->numericCollation = (UColAttributeValue)coll->options->numericC ollation;
6528 } else {
6529 *status = U_ILLEGAL_ARGUMENT_ERROR;
6530 }
6531 break;
6532 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan a */
6533 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6534 // This attribute is an implementation detail of the CLDR Japanese t ailoring.
6535 // The implementation might change to use a different mechanism
6536 // to achieve the same Japanese sort order.
6537 // Since ICU 50, this attribute is not settable any more via API fun ctions.
6538 } else {
6539 *status = U_ILLEGAL_ARGUMENT_ERROR;
6540 }
6541 break;
6542 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /
6543 if(value == UCOL_ON) {
6544 coll->frenchCollation = UCOL_ON;
6545 coll->frenchCollationisDefault = FALSE;
6546 } else if (value == UCOL_OFF) {
6547 coll->frenchCollation = UCOL_OFF;
6548 coll->frenchCollationisDefault = FALSE;
6549 } else if (value == UCOL_DEFAULT) {
6550 coll->frenchCollationisDefault = TRUE;
6551 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol lation;
6552 } else {
6553 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6554 }
6555 break;
6556 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6557 if(value == UCOL_SHIFTED) {
6558 coll->alternateHandling = UCOL_SHIFTED;
6559 coll->alternateHandlingisDefault = FALSE;
6560 } else if (value == UCOL_NON_IGNORABLE) {
6561 coll->alternateHandling = UCOL_NON_IGNORABLE;
6562 coll->alternateHandlingisDefault = FALSE;
6563 } else if (value == UCOL_DEFAULT) {
6564 coll->alternateHandlingisDefault = TRUE;
6565 coll->alternateHandling = (UColAttributeValue)coll->options->alterna teHandling ;
6566 } else {
6567 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6568 }
6569 break;
6570 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6571 if(value == UCOL_LOWER_FIRST) {
6572 coll->caseFirst = UCOL_LOWER_FIRST;
6573 coll->caseFirstisDefault = FALSE;
6574 } else if (value == UCOL_UPPER_FIRST) {
6575 coll->caseFirst = UCOL_UPPER_FIRST;
6576 coll->caseFirstisDefault = FALSE;
6577 } else if (value == UCOL_OFF) {
6578 coll->caseFirst = UCOL_OFF;
6579 coll->caseFirstisDefault = FALSE;
6580 } else if (value == UCOL_DEFAULT) {
6581 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6582 coll->caseFirstisDefault = TRUE;
6583 } else {
6584 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6585 }
6586 break;
6587 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6588 if(value == UCOL_ON) {
6589 coll->caseLevel = UCOL_ON;
6590 coll->caseLevelisDefault = FALSE;
6591 } else if (value == UCOL_OFF) {
6592 coll->caseLevel = UCOL_OFF;
6593 coll->caseLevelisDefault = FALSE;
6594 } else if (value == UCOL_DEFAULT) {
6595 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6596 coll->caseLevelisDefault = TRUE;
6597 } else {
6598 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6599 }
6600 break;
6601 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6602 if(value == UCOL_ON) {
6603 coll->normalizationMode = UCOL_ON;
6604 coll->normalizationModeisDefault = FALSE;
6605 initializeFCD(status);
6606 } else if (value == UCOL_OFF) {
6607 coll->normalizationMode = UCOL_OFF;
6608 coll->normalizationModeisDefault = FALSE;
6609 } else if (value == UCOL_DEFAULT) {
6610 coll->normalizationModeisDefault = TRUE;
6611 coll->normalizationMode = (UColAttributeValue)coll->options->normali zationMode;
6612 if(coll->normalizationMode == UCOL_ON) {
6613 initializeFCD(status);
6614 }
6615 } else {
6616 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6617 }
6618 break;
6619 case UCOL_STRENGTH: /* attribute for strength */
6620 if (value == UCOL_DEFAULT) {
6621 coll->strengthisDefault = TRUE;
6622 coll->strength = (UColAttributeValue)coll->options->strength;
6623 } else if (value <= UCOL_IDENTICAL) {
6624 coll->strengthisDefault = FALSE;
6625 coll->strength = value;
6626 } else {
6627 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6628 }
6629 break;
6630 case UCOL_ATTRIBUTE_COUNT:
6631 default:
6632 *status = U_ILLEGAL_ARGUMENT_ERROR;
6633 break;
6634 }
6635 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6636 coll->latinOneRegenTable = TRUE;
6637 } else {
6638 coll->latinOneRegenTable = FALSE;
6639 }
6640 ucol_updateInternalState(coll, status);
6641 } 343 }
6642 344
6643 U_CAPI UColAttributeValue U_EXPORT2 345 U_CAPI UColAttributeValue U_EXPORT2
6644 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 346 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6645 if(U_FAILURE(*status) || coll == NULL) { 347 if(U_FAILURE(*status) || coll == NULL) {
6646 return UCOL_DEFAULT; 348 return UCOL_DEFAULT;
6647 } 349 }
6648 350
6649 if(coll->delegate != NULL) { 351 return Collator::fromUCollator(coll)->getAttribute(attr, *status);
6650 return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6651 }
6652
6653 switch(attr) {
6654 case UCOL_NUMERIC_COLLATION:
6655 return coll->numericCollation;
6656 case UCOL_HIRAGANA_QUATERNARY_MODE:
6657 return coll->hiraganaQ;
6658 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights* /
6659 return coll->frenchCollation;
6660 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6661 return coll->alternateHandling;
6662 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6663 return coll->caseFirst;
6664 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6665 return coll->caseLevel;
6666 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6667 return coll->normalizationMode;
6668 case UCOL_STRENGTH: /* attribute for strength */
6669 return coll->strength;
6670 case UCOL_ATTRIBUTE_COUNT:
6671 default:
6672 *status = U_ILLEGAL_ARGUMENT_ERROR;
6673 break;
6674 }
6675 return UCOL_DEFAULT;
6676 } 352 }
6677 353
6678 U_CAPI void U_EXPORT2 354 U_CAPI void U_EXPORT2
6679 ucol_setStrength( UCollator *coll, 355 ucol_setStrength( UCollator *coll,
6680 UCollationStrength strength) 356 UCollationStrength strength)
6681 { 357 {
6682 UErrorCode status = U_ZERO_ERROR; 358 UErrorCode status = U_ZERO_ERROR;
6683 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 359 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6684 } 360 }
6685 361
6686 U_CAPI UCollationStrength U_EXPORT2 362 U_CAPI UCollationStrength U_EXPORT2
6687 ucol_getStrength(const UCollator *coll) 363 ucol_getStrength(const UCollator *coll)
6688 { 364 {
6689 UErrorCode status = U_ZERO_ERROR; 365 UErrorCode status = U_ZERO_ERROR;
6690 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 366 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6691 } 367 }
6692 368
6693 U_CAPI int32_t U_EXPORT2 369 U_CAPI int32_t U_EXPORT2
6694 ucol_getReorderCodes(const UCollator *coll, 370 ucol_getReorderCodes(const UCollator *coll,
6695 int32_t *dest, 371 int32_t *dest,
6696 int32_t destCapacity, 372 int32_t destCapacity,
6697 UErrorCode *status) { 373 UErrorCode *status) {
6698 if (U_FAILURE(*status)) { 374 if (U_FAILURE(*status)) {
6699 return 0; 375 return 0;
6700 } 376 }
6701 377
6702 if(coll->delegate!=NULL) { 378 return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *s tatus);
6703 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapaci ty, *status);
6704 }
6705
6706 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6707 *status = U_ILLEGAL_ARGUMENT_ERROR;
6708 return 0;
6709 }
6710
6711 #ifdef UCOL_DEBUG
6712 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6713 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLe ngth);
6714 #endif
6715
6716 if (coll->reorderCodesLength > destCapacity) {
6717 *status = U_BUFFER_OVERFLOW_ERROR;
6718 return coll->reorderCodesLength;
6719 }
6720 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6721 dest[i] = coll->reorderCodes[i];
6722 }
6723 return coll->reorderCodesLength;
6724 } 379 }
6725 380
6726 U_CAPI void U_EXPORT2 381 U_CAPI void U_EXPORT2
6727 ucol_setReorderCodes(UCollator* coll, 382 ucol_setReorderCodes(UCollator* coll,
6728 const int32_t* reorderCodes, 383 const int32_t* reorderCodes,
6729 int32_t reorderCodesLength, 384 int32_t reorderCodesLength,
6730 UErrorCode *status) { 385 UErrorCode *status) {
6731 if (U_FAILURE(*status)) { 386 if (U_FAILURE(*status)) {
6732 return; 387 return;
6733 } 388 }
6734 389
6735 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NUL L)) { 390 Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLen gth, *status);
6736 *status = U_ILLEGAL_ARGUMENT_ERROR;
6737 return;
6738 }
6739
6740 if(coll->delegate!=NULL) {
6741 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLen gth, *status);
6742 return;
6743 }
6744
6745 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6746 uprv_free(coll->reorderCodes);
6747 }
6748 coll->reorderCodes = NULL;
6749 coll->freeReorderCodesOnClose = FALSE;
6750 coll->reorderCodesLength = 0;
6751 if (reorderCodesLength == 0) {
6752 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutat ionTableOnClose == TRUE) {
6753 uprv_free(coll->leadBytePermutationTable);
6754 }
6755 coll->leadBytePermutationTable = NULL;
6756 coll->freeLeadBytePermutationTableOnClose = FALSE;
6757 return;
6758 }
6759 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int3 2_t));
6760 if (coll->reorderCodes == NULL) {
6761 *status = U_MEMORY_ALLOCATION_ERROR;
6762 return;
6763 }
6764 coll->freeReorderCodesOnClose = TRUE;
6765 for (int32_t i = 0; i < reorderCodesLength; i++) {
6766 coll->reorderCodes[i] = reorderCodes[i];
6767 }
6768 coll->reorderCodesLength = reorderCodesLength;
6769 ucol_buildPermutationTable(coll, status);
6770 } 391 }
6771 392
6772 U_CAPI int32_t U_EXPORT2 393 U_CAPI int32_t U_EXPORT2
6773 ucol_getEquivalentReorderCodes(int32_t reorderCode, 394 ucol_getEquivalentReorderCodes(int32_t reorderCode,
6774 int32_t* dest, 395 int32_t* dest,
6775 int32_t destCapacity, 396 int32_t destCapacity,
6776 UErrorCode *pErrorCode) { 397 UErrorCode *pErrorCode) {
6777 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; 398 return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);
6778 uint16_t leadBytes[256]; 399 }
6779 int leadBytesCount;
6780 int leadByteIndex;
6781 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6782 int reorderCodesForLeadByteCount;
6783 int reorderCodeIndex;
6784
6785 int32_t equivalentCodesCount = 0;
6786 int setIndex;
6787
6788 if (U_FAILURE(*pErrorCode)) {
6789 return 0;
6790 }
6791
6792 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6793 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6794 return 0;
6795 }
6796
6797 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6798
6799 const UCollator* uca = ucol_initUCA(pErrorCode);
6800 if (U_FAILURE(*pErrorCode)) {
6801 » return 0;
6802 }
6803 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes , 256);
6804 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6805 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6806 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE _LIMIT);
6807 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCou nt; reorderCodeIndex++) {
6808 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true ;
6809 }
6810 }
6811
6812 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6813 if (equivalentCodesSet[setIndex] == true) {
6814 equivalentCodesCount++;
6815 }
6816 }
6817
6818 if (destCapacity == 0) {
6819 return equivalentCodesCount;
6820 }
6821
6822 equivalentCodesCount = 0;
6823 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6824 if (equivalentCodesSet[setIndex] == true) {
6825 dest[equivalentCodesCount++] = setIndex;
6826 if (equivalentCodesCount >= destCapacity) {
6827 break;
6828 }
6829 }
6830 }
6831 return equivalentCodesCount;
6832 }
6833
6834
6835 /****************************************************************************/
6836 /* Following are misc functions */
6837 /* there are new APIs and some compatibility APIs */
6838 /****************************************************************************/
6839 400
6840 U_CAPI void U_EXPORT2 401 U_CAPI void U_EXPORT2
6841 ucol_getVersion(const UCollator* coll, 402 ucol_getVersion(const UCollator* coll,
6842 UVersionInfo versionInfo) 403 UVersionInfo versionInfo)
6843 { 404 {
6844 if(coll->delegate!=NULL) { 405 Collator::fromUCollator(coll)->getVersion(versionInfo);
6845 ((const Collator*)coll->delegate)->getVersion(versionInfo);
6846 return;
6847 }
6848 /* RunTime version */
6849 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6850 /* Builder version*/
6851 uint8_t bdVersion = coll->image->version[0];
6852
6853 /* Charset Version. Need to get the version from cnv files
6854 * makeconv should populate cnv files with version and
6855 * an api has to be provided in ucnv.h to obtain this version
6856 */
6857 uint8_t csVersion = 0;
6858
6859 /* combine the version info */
6860 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersi on));
6861
6862 /* Tailoring rules */
6863 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6864 versionInfo[1] = (uint8_t)cmbVersion;
6865 versionInfo[2] = coll->image->version[1];
6866 if(coll->UCA) {
6867 /* Include the minor number when getting the UCA version. (major & 1f) < < 3 | (minor & 7) */
6868 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll-> UCA->image->UCAVersion[1] & 0x07);
6869 } else {
6870 versionInfo[3] = 0;
6871 }
6872 }
6873
6874
6875 /* This internal API checks whether a character is tailored or not */
6876 U_CAPI UBool U_EXPORT2
6877 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6878 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6879 return FALSE;
6880 }
6881
6882 uint32_t CE = UCOL_NOT_FOUND;
6883 const UChar *ContractionStart = NULL;
6884 if(u < 0x100) { /* latin-1 */
6885 CE = coll->latinOneMapping[u];
6886 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6887 return FALSE;
6888 }
6889 } else { /* regular */
6890 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6891 }
6892
6893 if(isContraction(CE)) {
6894 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6895 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex) );
6896 }
6897
6898 return (UBool)(CE != UCOL_NOT_FOUND);
6899 }
6900
6901
6902 /****************************************************************************/
6903 /* Following are the string compare functions */
6904 /* */
6905 /****************************************************************************/
6906
6907
6908 /* ucol_checkIdent internal function. Does byte level string compare. */
6909 /* Used by strcoll if strength == identical and strings */
6910 /* are otherwise equal. */
6911 /* */
6912 /* Comparison must be done on NFD normalized strings. */
6913 /* FCD is not good enough. */
6914
6915 static
6916 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo l normalize, UErrorCode *status)
6917 {
6918 // When we arrive here, we can have normal strings or UCharIterators. Curren tly they are both
6919 // of same type, but that doesn't really mean that it will stay that way.
6920 int32_t comparison;
6921
6922 if (sColl->flags & UCOL_USE_ITERATOR) {
6923 // The division for the array length may truncate the array size to
6924 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6925 // for all platforms anyway.
6926 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6927 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6928 UNormIterator *sNIt = NULL, *tNIt = NULL;
6929 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6930 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6931 sColl->iterator->move(sColl->iterator, 0, UITER_START);
6932 tColl->iterator->move(tColl->iterator, 0, UITER_START);
6933 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta tus);
6934 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta tus);
6935 comparison = u_strCompareIter(sIt, tIt, TRUE);
6936 unorm_closeIter(sNIt);
6937 unorm_closeIter(tNIt);
6938 } else {
6939 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl- >endp - sColl->string) : -1;
6940 const UChar *sBuf = sColl->string;
6941 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl- >endp - tColl->string) : -1;
6942 const UChar *tBuf = tColl->string;
6943
6944 if (normalize) {
6945 *status = U_ZERO_ERROR;
6946 // Note: We could use Normalizer::compare() or similar, but for shor t strings
6947 // which may not be in FCD it might be faster to just NFD them.
6948 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha n
6949 // NFD'ing immediately might be faster for long strings,
6950 // but string comparison is usually done on relatively short strings .
6951 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN ) == 0, sBuf, sLen),
6952 sColl->writableBuffer,
6953 *status);
6954 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN ) == 0, tBuf, tLen),
6955 tColl->writableBuffer,
6956 *status);
6957 if(U_FAILURE(*status)) {
6958 return UCOL_LESS;
6959 }
6960 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ ableBuffer);
6961 } else {
6962 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
6963 }
6964 }
6965
6966 if (comparison < 0) {
6967 return UCOL_LESS;
6968 } else if (comparison == 0) {
6969 return UCOL_EQUAL;
6970 } else /* comparison > 0 */ {
6971 return UCOL_GREATER;
6972 }
6973 }
6974
6975 /* CEBuf - A struct and some inline functions to handle the saving */
6976 /* of CEs in a buffer within ucol_strcoll */
6977
6978 #define UCOL_CEBUF_SIZE 512
6979 typedef struct ucol_CEBuf {
6980 uint32_t *buf;
6981 uint32_t *endp;
6982 uint32_t *pos;
6983 uint32_t localArray[UCOL_CEBUF_SIZE];
6984 } ucol_CEBuf;
6985
6986
6987 static
6988 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
6989 (b)->buf = (b)->pos = (b)->localArray;
6990 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
6991 }
6992
6993 static
6994 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
6995 uint32_t oldSize;
6996 uint32_t newSize;
6997 uint32_t *newBuf;
6998
6999 ci->flags |= UCOL_ITER_ALLOCATED;
7000 oldSize = (uint32_t)(b->pos - b->buf);
7001 newSize = oldSize * 2;
7002 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7003 if(newBuf == NULL) {
7004 *status = U_MEMORY_ALLOCATION_ERROR;
7005 }
7006 else {
7007 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7008 if (b->buf != b->localArray) {
7009 uprv_free(b->buf);
7010 }
7011 b->buf = newBuf;
7012 b->endp = b->buf + newSize;
7013 b->pos = b->buf + oldSize;
7014 }
7015 }
7016
7017 static
7018 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCo de *status) {
7019 if (b->pos == b->endp) {
7020 ucol_CEBuf_Expand(b, ci, status);
7021 }
7022 if (U_SUCCESS(*status)) {
7023 *(b)->pos++ = ce;
7024 }
7025 }
7026
7027 /* This is a trick string compare function that goes in and uses sortkeys to com pare */
7028 /* It is used when compare gets in trouble and needs to bail out */
7029 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7030 collIterate *tColl,
7031 UErrorCode *status)
7032 {
7033 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7034 uint8_t *sourceKeyP = sourceKey;
7035 uint8_t *targetKeyP = targetKey;
7036 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7037 const UCollator *coll = sColl->coll;
7038 const UChar *source = NULL;
7039 const UChar *target = NULL;
7040 int32_t result = UCOL_EQUAL;
7041 UnicodeString sourceString, targetString;
7042 int32_t sourceLength;
7043 int32_t targetLength;
7044
7045 if(sColl->flags & UCOL_USE_ITERATOR) {
7046 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7047 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7048 UChar32 c;
7049 while((c=sColl->iterator->next(sColl->iterator))>=0) {
7050 sourceString.append((UChar)c);
7051 }
7052 while((c=tColl->iterator->next(tColl->iterator))>=0) {
7053 targetString.append((UChar)c);
7054 }
7055 source = sourceString.getBuffer();
7056 sourceLength = sourceString.length();
7057 target = targetString.getBuffer();
7058 targetLength = targetString.length();
7059 } else { // no iterators
7060 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo ll->string):-1;
7061 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo ll->string):-1;
7062 source = sColl->string;
7063 target = tColl->string;
7064 }
7065
7066
7067
7068 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc eKeyLen);
7069 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7070 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7071 if(sourceKeyP == NULL) {
7072 *status = U_MEMORY_ALLOCATION_ERROR;
7073 goto cleanup_and_do_compare;
7074 }
7075 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s ourceKeyLen);
7076 }
7077
7078 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe tKeyLen);
7079 if(targetKeyLen > UCOL_MAX_BUFFER) {
7080 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7081 if(targetKeyP == NULL) {
7082 *status = U_MEMORY_ALLOCATION_ERROR;
7083 goto cleanup_and_do_compare;
7084 }
7085 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t argetKeyLen);
7086 }
7087
7088 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7089
7090 cleanup_and_do_compare:
7091 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7092 uprv_free(sourceKeyP);
7093 }
7094
7095 if(targetKeyP != NULL && targetKeyP != targetKey) {
7096 uprv_free(targetKeyP);
7097 }
7098
7099 if(result<0) {
7100 return UCOL_LESS;
7101 } else if(result>0) {
7102 return UCOL_GREATER;
7103 } else {
7104 return UCOL_EQUAL;
7105 }
7106 }
7107
7108
7109 static UCollationResult
7110 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7111 {
7112 U_ALIGN_CODE(16);
7113
7114 const UCollator *coll = sColl->coll;
7115
7116
7117 // setting up the collator parameters
7118 UColAttributeValue strength = coll->strength;
7119 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7120
7121 UBool checkSecTer = initialCheckSecTer;
7122 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7123 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7124 UBool checkIdent = (strength == UCOL_IDENTICAL);
7125 UBool checkCase = (coll->caseLevel == UCOL_ON);
7126 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7127 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7128 UBool qShifted = shifted && checkQuad;
7129 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7130
7131 if(doHiragana && shifted) {
7132 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7133 }
7134 uint8_t caseSwitch = coll->caseSwitch;
7135 uint8_t tertiaryMask = coll->tertiaryMask;
7136
7137 // This is the lowest primary value that will not be ignored if shifted
7138 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7139
7140 UCollationResult result = UCOL_EQUAL;
7141 UCollationResult hirResult = UCOL_EQUAL;
7142
7143 // Preparing the CE buffers. They will be filled during the primary phase
7144 ucol_CEBuf sCEs;
7145 ucol_CEBuf tCEs;
7146 UCOL_INIT_CEBUF(&sCEs);
7147 UCOL_INIT_CEBUF(&tCEs);
7148
7149 uint32_t secS = 0, secT = 0;
7150 uint32_t sOrder=0, tOrder=0;
7151
7152 // Non shifted primary processing is quite simple
7153 if(!shifted) {
7154 for(;;) {
7155 // We fetch CEs until we hit a non ignorable primary or end.
7156 uint32_t sPrimary;
7157 do {
7158 // We get the next CE
7159 sOrder = ucol_IGetNextCE(coll, sColl, status);
7160 // Stuff it in the buffer
7161 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7162 // And keep just the primary part.
7163 sPrimary = sOrder & UCOL_PRIMARYMASK;
7164 } while(sPrimary == 0);
7165
7166 // see the comments on the above block
7167 uint32_t tPrimary;
7168 do {
7169 tOrder = ucol_IGetNextCE(coll, tColl, status);
7170 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7171 tPrimary = tOrder & UCOL_PRIMARYMASK;
7172 } while(tPrimary == 0);
7173
7174 // if both primaries are the same
7175 if(sPrimary == tPrimary) {
7176 // and there are no more CEs, we advance to the next level
7177 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
7178 break;
7179 }
7180 if(doHiragana && hirResult == UCOL_EQUAL) {
7181 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO L_WAS_HIRAGANA)) {
7182 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl ->flags & UCOL_WAS_HIRAGANA))
7183 ? UCOL_LESS:UCOL_GREATER;
7184 }
7185 }
7186 } else {
7187 // only need to check one for continuation
7188 // if one is then the other must be or the preceding CE would be a prefix of the other
7189 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO rder)) {
7190 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
7191 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
7192 }
7193 // if two primaries are different, we are done
7194 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER;
7195 goto commonReturn;
7196 }
7197 } // no primary difference... do the rest from the buffers
7198 } else { // shifted - do a slightly more complicated processing :)
7199 for(;;) {
7200 UBool sInShifted = FALSE;
7201 UBool tInShifted = FALSE;
7202 // This version of code can be refactored. However, it seems easier to understand this way.
7203 // Source loop. Same as the target loop.
7204 for(;;) {
7205 sOrder = ucol_IGetNextCE(coll, sColl, status);
7206 if(sOrder == UCOL_NO_MORE_CES) {
7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7208 break;
7209 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMA SK) == 0)) {
7210 /* UCA amendment - ignore ignorables that follow shifted cod e points */
7211 continue;
7212 } else if(isContinuation(sOrder)) {
7213 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */
7214 if(sInShifted) {
7215 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres erve interesting continuation */
7216 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7217 continue;
7218 } else {
7219 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7220 break;
7221 }
7222 } else { /* Just lower level values */
7223 if(sInShifted) {
7224 continue;
7225 } else {
7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7227 continue;
7228 }
7229 }
7230 } else { /* regular */
7231 if(coll->leadBytePermutationTable != NULL){
7232 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7233 }
7234 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7236 break;
7237 } else {
7238 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7239 sInShifted = TRUE;
7240 sOrder &= UCOL_PRIMARYMASK;
7241 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7242 continue;
7243 } else {
7244 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7245 sInShifted = FALSE;
7246 continue;
7247 }
7248 }
7249 }
7250 }
7251 sOrder &= UCOL_PRIMARYMASK;
7252 sInShifted = FALSE;
7253
7254 for(;;) {
7255 tOrder = ucol_IGetNextCE(coll, tColl, status);
7256 if(tOrder == UCOL_NO_MORE_CES) {
7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7258 break;
7259 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMA SK) == 0)) {
7260 /* UCA amendment - ignore ignorables that follow shifted cod e points */
7261 continue;
7262 } else if(isContinuation(tOrder)) {
7263 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va lue */
7264 if(tInShifted) {
7265 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres erve interesting continuation */
7266 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7267 continue;
7268 } else {
7269 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7270 break;
7271 }
7272 } else { /* Just lower level values */
7273 if(tInShifted) {
7274 continue;
7275 } else {
7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7277 continue;
7278 }
7279 }
7280 } else { /* regular */
7281 if(coll->leadBytePermutationTable != NULL){
7282 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7283 }
7284 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7286 break;
7287 } else {
7288 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7289 tInShifted = TRUE;
7290 tOrder &= UCOL_PRIMARYMASK;
7291 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7292 continue;
7293 } else {
7294 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7295 tInShifted = FALSE;
7296 continue;
7297 }
7298 }
7299 }
7300 }
7301 tOrder &= UCOL_PRIMARYMASK;
7302 tInShifted = FALSE;
7303
7304 if(sOrder == tOrder) {
7305 /*
7306 if(doHiragana && hirResult == UCOL_EQUAL) {
7307 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_ HIRAGANA)) {
7308 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7309 ? UCOL_LESS:UCOL_GREATER;
7310 }
7311 }
7312 */
7313 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7314 break;
7315 } else {
7316 sOrder = 0;
7317 tOrder = 0;
7318 continue;
7319 }
7320 } else {
7321 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7322 goto commonReturn;
7323 }
7324 } /* no primary difference... do the rest from the buffers */
7325 }
7326
7327 /* now, we're gonna reexamine collected CEs */
7328 uint32_t *sCE;
7329 uint32_t *tCE;
7330
7331 /* This is the secondary level of comparison */
7332 if(checkSecTer) {
7333 if(!isFrenchSec) { /* normal */
7334 sCE = sCEs.buf;
7335 tCE = tCEs.buf;
7336 for(;;) {
7337 while (secS == 0) {
7338 secS = *(sCE++) & UCOL_SECONDARYMASK;
7339 }
7340
7341 while(secT == 0) {
7342 secT = *(tCE++) & UCOL_SECONDARYMASK;
7343 }
7344
7345 if(secS == secT) {
7346 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7347 break;
7348 } else {
7349 secS = 0; secT = 0;
7350 continue;
7351 }
7352 } else {
7353 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7354 goto commonReturn;
7355 }
7356 }
7357 } else { /* do the French */
7358 uint32_t *sCESave = NULL;
7359 uint32_t *tCESave = NULL;
7360 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi zed */
7361 tCE = tCEs.pos-2;
7362 for(;;) {
7363 while (secS == 0 && sCE >= sCEs.buf) {
7364 if(sCESave == NULL) {
7365 secS = *(sCE--);
7366 if(isContinuation(secS)) {
7367 while(isContinuation(secS = *(sCE--)))
7368 ;
7369 /* after this, secS has the start of continuation, a nd sCEs points before that */
7370 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7371 sCE+=2; /* need to point to the first continuation CP */
7372 /* However, now you can just continue doing stuff */
7373 }
7374 } else {
7375 secS = *(sCE++);
7376 if(!isContinuation(secS)) { /* This means we have finish ed with this cont */
7377 sCE = sCESave; /* reset the pointer to be fore continuation */
7378 sCESave = NULL;
7379 secS = 0; /* Fetch a fresh CE before the continuati on sequence. */
7380 continue;
7381 }
7382 }
7383 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit * /
7384 }
7385
7386 while(secT == 0 && tCE >= tCEs.buf) {
7387 if(tCESave == NULL) {
7388 secT = *(tCE--);
7389 if(isContinuation(secT)) {
7390 while(isContinuation(secT = *(tCE--)))
7391 ;
7392 /* after this, secS has the start of continuation, a nd sCEs points before that */
7393 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7394 tCE+=2; /* need to point to the first continuation CP */
7395 /* However, now you can just continue doing stuff */
7396 }
7397 } else {
7398 secT = *(tCE++);
7399 if(!isContinuation(secT)) { /* This means we have finish ed with this cont */
7400 tCE = tCESave; /* reset the pointer to befo re continuation */
7401 tCESave = NULL;
7402 secT = 0; /* Fetch a fresh CE before the continuati on sequence. */
7403 continue;
7404 }
7405 }
7406 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit * /
7407 }
7408
7409 if(secS == secT) {
7410 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7411 break;
7412 } else {
7413 secS = 0; secT = 0;
7414 continue;
7415 }
7416 } else {
7417 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7418 goto commonReturn;
7419 }
7420 }
7421 }
7422 }
7423
7424 /* doing the case bit */
7425 if(checkCase) {
7426 sCE = sCEs.buf;
7427 tCE = tCEs.buf;
7428 for(;;) {
7429 while((secS & UCOL_REMOVE_CASE) == 0) {
7430 if(!isContinuation(*sCE++)) {
7431 secS =*(sCE-1);
7432 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA RY) {
7433 // primary ignorables should not be considered on the ca se level when the strength is primary
7434 // otherwise, the CEs stop being well-formed
7435 secS &= UCOL_TERT_CASE_MASK;
7436 secS ^= caseSwitch;
7437 } else {
7438 secS = 0;
7439 }
7440 } else {
7441 secS = 0;
7442 }
7443 }
7444
7445 while((secT & UCOL_REMOVE_CASE) == 0) {
7446 if(!isContinuation(*tCE++)) {
7447 secT = *(tCE-1);
7448 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA RY) {
7449 // primary ignorables should not be considered on the ca se level when the strength is primary
7450 // otherwise, the CEs stop being well-formed
7451 secT &= UCOL_TERT_CASE_MASK;
7452 secT ^= caseSwitch;
7453 } else {
7454 secT = 0;
7455 }
7456 } else {
7457 secT = 0;
7458 }
7459 }
7460
7461 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7462 result = UCOL_LESS;
7463 goto commonReturn;
7464 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7465 result = UCOL_GREATER;
7466 goto commonReturn;
7467 }
7468
7469 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7470 break;
7471 } else {
7472 secS = 0;
7473 secT = 0;
7474 }
7475 }
7476 }
7477
7478 /* Tertiary level */
7479 if(checkTertiary) {
7480 secS = 0;
7481 secT = 0;
7482 sCE = sCEs.buf;
7483 tCE = tCEs.buf;
7484 for(;;) {
7485 while((secS & UCOL_REMOVE_CASE) == 0) {
7486 sOrder = *sCE++;
7487 secS = sOrder & tertiaryMask;
7488 if(!isContinuation(sOrder)) {
7489 secS ^= caseSwitch;
7490 } else {
7491 secS &= UCOL_REMOVE_CASE;
7492 }
7493 }
7494
7495 while((secT & UCOL_REMOVE_CASE) == 0) {
7496 tOrder = *tCE++;
7497 secT = tOrder & tertiaryMask;
7498 if(!isContinuation(tOrder)) {
7499 secT ^= caseSwitch;
7500 } else {
7501 secT &= UCOL_REMOVE_CASE;
7502 }
7503 }
7504
7505 if(secS == secT) {
7506 if((secS & UCOL_REMOVE_CASE) == 1) {
7507 break;
7508 } else {
7509 secS = 0; secT = 0;
7510 continue;
7511 }
7512 } else {
7513 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7514 goto commonReturn;
7515 }
7516 }
7517 }
7518
7519
7520 if(qShifted /*checkQuad*/) {
7521 UBool sInShifted = TRUE;
7522 UBool tInShifted = TRUE;
7523 secS = 0;
7524 secT = 0;
7525 sCE = sCEs.buf;
7526 tCE = tCEs.buf;
7527 for(;;) {
7528 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(sec S) && !sInShifted)) {
7529 secS = *(sCE++);
7530 if(isContinuation(secS)) {
7531 if(!sInShifted) {
7532 continue;
7533 }
7534 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7535 secS = UCOL_PRIMARYMASK;
7536 sInShifted = FALSE;
7537 } else {
7538 sInShifted = TRUE;
7539 }
7540 }
7541 secS &= UCOL_PRIMARYMASK;
7542
7543
7544 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(sec T) && !tInShifted)) {
7545 secT = *(tCE++);
7546 if(isContinuation(secT)) {
7547 if(!tInShifted) {
7548 continue;
7549 }
7550 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7551 secT = UCOL_PRIMARYMASK;
7552 tInShifted = FALSE;
7553 } else {
7554 tInShifted = TRUE;
7555 }
7556 }
7557 secT &= UCOL_PRIMARYMASK;
7558
7559 if(secS == secT) {
7560 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7561 break;
7562 } else {
7563 secS = 0; secT = 0;
7564 continue;
7565 }
7566 } else {
7567 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7568 goto commonReturn;
7569 }
7570 }
7571 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7572 // If we're fine on quaternaries, we might be different
7573 // on Hiragana. This, however, might fail us in shifted.
7574 result = hirResult;
7575 goto commonReturn;
7576 }
7577
7578 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7579 /* as a tiebreaker if all else is equal. */
7580 /* Getting here should be quite rare - strings are not identical - */
7581 /* that is checked first, but compared == through all other checks. */
7582 if(checkIdent)
7583 {
7584 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC OL_ON);
7585 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7586 }
7587
7588 commonReturn:
7589 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7590 if (sCEs.buf != sCEs.localArray ) {
7591 uprv_free(sCEs.buf);
7592 }
7593 if (tCEs.buf != tCEs.localArray ) {
7594 uprv_free(tCEs.buf);
7595 }
7596 }
7597
7598 return result;
7599 }
7600
7601 static UCollationResult
7602 ucol_strcollRegular(const UCollator *coll,
7603 const UChar *source, int32_t sourceLength,
7604 const UChar *target, int32_t targetLength,
7605 UErrorCode *status) {
7606 collIterate sColl, tColl;
7607 // Preparing the context objects for iterating over strings
7608 IInit_collIterate(coll, source, sourceLength, &sColl, status);
7609 IInit_collIterate(coll, target, targetLength, &tColl, status);
7610 if(U_FAILURE(*status)) {
7611 return UCOL_LESS;
7612 }
7613 return ucol_strcollRegular(&sColl, &tColl, status);
7614 }
7615
7616 static inline uint32_t
7617 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7618 uint32_t CE, const UChar *s, int32_t *index, int32_t l en)
7619 {
7620 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7621 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7622 int32_t offset = 1;
7623 UChar schar = 0, tchar = 0;
7624
7625 for(;;) {
7626 if(len == -1) {
7627 if(s[*index] == 0) { // end of string
7628 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);
7629 } else {
7630 schar = s[*index];
7631 }
7632 } else {
7633 if(*index == len) {
7634 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn eOffset]);
7635 } else {
7636 schar = s[*index];
7637 }
7638 }
7639
7640 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio n codepoints should be ordered, we skip all that are smaller */
7641 offset++;
7642 }
7643
7644 if (schar == tchar) {
7645 (*index)++;
7646 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set+offset]);
7647 }
7648 else
7649 {
7650 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7651 return UCOL_BAIL_OUT_CE;
7652 }
7653 // skip completely ignorables
7654 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7655 if(isZeroCE == 0) { // we have to ignore completely ignorables
7656 (*index)++;
7657 continue;
7658 }
7659
7660 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);
7661 }
7662 }
7663 }
7664
7665
7666 /**
7667 * This is a fast strcoll, geared towards text in Latin-1.
7668 * It supports contractions of size two, French secondaries
7669 * and case switching. You can use it with strengths primary
7670 * to tertiary. It does not support shifted and case level.
7671 * It relies on the table built by setupLatin1Table. If it
7672 * doesn't understand something, it will go to the regular
7673 * strcoll.
 *
 * The comparison runs one level at a time over coll->latinOneCEs, which is
 * read as consecutive slices of coll->latinOneTableLen entries (the table
 * pointer is advanced by latinOneTableLen before the secondary and again
 * before the tertiary pass). Within a level, two differing CEs are compared
 * on their top byte; when the top bytes agree both values are shifted left
 * by 8 and compared again, and a value that becomes 0 makes the next
 * character be fetched.
 *
 * @param coll    collator; strength, latinOneCEs, latinOneTableLen and
 *                frenchCollation are read here
 * @param source  source text
 * @param sLen    source length, or -1 for NUL-terminated text
 * @param target  target text
 * @param tLen    target length, or -1 for NUL-terminated text
 * @param status  only forwarded to ucol_strcollRegular on the fallback paths
 * @return UCOL_LESS, UCOL_GREATER or UCOL_EQUAL, or the result of
 *         ucol_strcollRegular when the fast path gives up
7674 */
7675 static UCollationResult
7676 ucol_strcollUseLatin1( const UCollator *coll,
7677 const UChar *source,
7678 int32_t sLen,
7679 const UChar *target,
7680 int32_t tLen,
7681 UErrorCode *status)
7682 {
7683 U_ALIGN_CODE(16);
7684 int32_t strength = coll->strength;
7685
7686 int32_t sIndex = 0, tIndex = 0;
7687 UChar sChar = 0, tChar = 0;
7688 uint32_t sOrder=0, tOrder=0;
7689
7690 UBool endOfSource = FALSE;
7691
// elements points at the slice of the table used for the current level
7692 uint32_t *elements = coll->latinOneCEs;
7693
7694 UBool haveContractions = FALSE; // if we have contractions in our string
7695 // we cannot do French secondary
7696
7697 // Do the primary level
7698 for(;;) {
7699 while(sOrder==0) { // this loop skips primary ignorables
7700 // sOrder=getNextlatinOneCE(source);
7701 if(sLen==-1) { // handling zero terminated strings
7702 sChar=source[sIndex++];
7703 if(sChar==0) {
7704 endOfSource = TRUE;
7705 break;
7706 }
7707 } else { // handling strings with known length
7708 if(sIndex==sLen) {
7709 endOfSource = TRUE;
7710 break;
7711 }
7712 sChar=source[sIndex++];
7713 }
7714 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7715 //fprintf(stderr, "R");
7716 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
7717 }
7718 sOrder = elements[sChar];
7719 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7720 // specials can basically be either contractions or bail-out signs. If we get anything
7721 // else, we'll bail out anyway
7722 if(getCETag(sOrder) == CONTRACTION_TAG) {
7723 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr der, source, &sIndex, sLen);
7724 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7725 // However, if there are contractions in the table, but we always use just one char,
7726 // we might be able to do French. This should be checked out.
7727 }
7728 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7729 //fprintf(stderr, "S");
7730 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7731 }
7732 }
7733 }
7734
7735 while(tOrder==0) { // this loop skips primary ignorables
7736 // tOrder=getNextlatinOneCE(target);
7737 if(tLen==-1) { // handling zero terminated strings
7738 tChar=target[tIndex++];
7739 if(tChar==0) {
7740 if(endOfSource) { // this is different than source loop,
7741 // as we already know that source loop is done here,
7742 // so we can either finish the primary loop if both
7743 // strings are done or announce the result if only
7744 // target is done. Same below.
7745 goto endOfPrimLoop;
7746 } else {
7747 return UCOL_GREATER;
7748 }
7749 }
7750 } else { // handling strings with known length
7751 if(tIndex==tLen) {
7752 if(endOfSource) {
7753 goto endOfPrimLoop;
7754 } else {
7755 return UCOL_GREATER;
7756 }
7757 }
7758 tChar=target[tIndex++];
7759 }
7760 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32)
7761 //fprintf(stderr, "R");
7762 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
7763 }
7764 tOrder = elements[tChar];
7765 if(tOrder >= UCOL_NOT_FOUND) {
7766 // Handling specials, see the comments for source
7767 if(getCETag(tOrder) == CONTRACTION_TAG) {
7768 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr der, target, &tIndex, tLen);
7769 haveContractions = TRUE;
7770 }
7771 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7772 //fprintf(stderr, "S");
7773 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7774 }
7775 }
7776 }
7777 if(endOfSource) { // source is finished, but target is not, say the result.
7778 return UCOL_LESS;
7779 }
7780
7781 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7782 sOrder = 0; tOrder = 0;
7783 continue;
7784 } else {
7785 // compare current top bytes
7786 if(((sOrder^tOrder)&0xFF000000)!=0) {
7787 // top bytes differ, return difference
7788 if(sOrder < tOrder) {
7789 return UCOL_LESS;
7790 } else if(sOrder > tOrder) {
7791 return UCOL_GREATER;
7792 }
7793 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24) ;
7794 // since we must return enum value
7795 }
7796
7797 // top bytes match, continue with following bytes
// (a value that shifts down to 0 makes its fetch loop above run again)
7798 sOrder<<=8;
7799 tOrder<<=8;
7800 }
7801 }
7802
7803 endOfPrimLoop:
7804 // after primary loop, we definitely know the sizes of strings,
7805 // so we set it and use simpler loop for secondaries and tertiaries
// NOTE(review): for NUL-terminated input the index was post-incremented when
// the terminator was read, so the recorded length appears to include the NUL;
// the later passes then look it up in `elements` - presumably it maps to 0 and
// is skipped. TODO confirm.
7806 sLen = sIndex; tLen = tIndex;
7807 if(strength >= UCOL_SECONDARY) {
7808 // adjust the table beginning
7809 elements += coll->latinOneTableLen;
7810 endOfSource = FALSE;
7811
7812 if(coll->frenchCollation == UCOL_OFF) { // non French
7813 // This loop is a simplified copy of primary loop
7814 // at this point we know that whole strings are latin-1, so we don't
7815 // check for that. We also know that we only have contractions as
7816 // specials.
7817 sIndex = 0; tIndex = 0;
7818 for(;;) {
7819 while(sOrder==0) {
7820 if(sIndex==sLen) {
7821 endOfSource = TRUE;
7822 break;
7823 }
7824 sChar=source[sIndex++];
7825 sOrder = elements[sChar];
7826 if(sOrder > UCOL_NOT_FOUND) {
7827 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, sOrder, source, &sIndex, sLen);
7828 }
7829 }
7830
7831 while(tOrder==0) {
7832 if(tIndex==tLen) {
7833 if(endOfSource) {
7834 goto endOfSecLoop;
7835 } else {
7836 return UCOL_GREATER;
7837 }
7838 }
7839 tChar=target[tIndex++];
7840 tOrder = elements[tChar];
7841 if(tOrder > UCOL_NOT_FOUND) {
7842 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR Y, tOrder, target, &tIndex, tLen);
7843 }
7844 }
7845 if(endOfSource) {
7846 return UCOL_LESS;
7847 }
7848
7849 if(sOrder == tOrder) {
7850 sOrder = 0; tOrder = 0;
7851 continue;
7852 } else {
7853 // see primary loop for comments on this
7854 if(((sOrder^tOrder)&0xFF000000)!=0) {
7855 if(sOrder < tOrder) {
7856 return UCOL_LESS;
7857 } else if(sOrder > tOrder) {
7858 return UCOL_GREATER;
7859 }
7860 }
7861 sOrder<<=8;
7862 tOrder<<=8;
7863 }
7864 }
7865 } else { // French
7866 if(haveContractions) { // if we have contractions, we have to bail out
7867 // since we don't really know how to handle them here
// (sLen and tLen are the lengths recorded after the primary loop by now)
7868 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta tus);
7869 }
7870 // For French, we go backwards
7871 sIndex = sLen; tIndex = tLen;
7872 for(;;) {
7873 while(sOrder==0) {
7874 if(sIndex==0) {
7875 endOfSource = TRUE;
7876 break;
7877 }
7878 sChar=source[--sIndex];
7879 sOrder = elements[sChar];
7880 // don't even look for contractions
7881 }
7882
7883 while(tOrder==0) {
7884 if(tIndex==0) {
7885 if(endOfSource) {
7886 goto endOfSecLoop;
7887 } else {
7888 return UCOL_GREATER;
7889 }
7890 }
7891 tChar=target[--tIndex];
7892 tOrder = elements[tChar];
7893 // don't even look for contractions
7894 }
7895 if(endOfSource) {
7896 return UCOL_LESS;
7897 }
7898
7899 if(sOrder == tOrder) {
7900 sOrder = 0; tOrder = 0;
7901 continue;
7902 } else {
7903 // see the primary loop for comments
7904 if(((sOrder^tOrder)&0xFF000000)!=0) {
7905 if(sOrder < tOrder) {
7906 return UCOL_LESS;
7907 } else if(sOrder > tOrder) {
7908 return UCOL_GREATER;
7909 }
7910 }
7911 sOrder<<=8;
7912 tOrder<<=8;
7913 }
7914 }
7915 }
7916 }
7917
7918 endOfSecLoop:
7919 if(strength >= UCOL_TERTIARY) {
7920 // tertiary loop is the same as secondary (except no French)
7921 elements += coll->latinOneTableLen;
7922 sIndex = 0; tIndex = 0;
7923 endOfSource = FALSE;
7924 for(;;) {
7925 while(sOrder==0) {
7926 if(sIndex==sLen) {
7927 endOfSource = TRUE;
7928 break;
7929 }
7930 sChar=source[sIndex++];
7931 sOrder = elements[sChar];
7932 if(sOrder > UCOL_NOT_FOUND) {
7933 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO rder, source, &sIndex, sLen);
7934 }
7935 }
7936 while(tOrder==0) {
7937 if(tIndex==tLen) {
7938 if(endOfSource) {
7939 return UCOL_EQUAL; // if both strings are at the end, th ey are equal
7940 } else {
7941 return UCOL_GREATER;
7942 }
7943 }
7944 tChar=target[tIndex++];
7945 tOrder = elements[tChar];
7946 if(tOrder > UCOL_NOT_FOUND) {
7947 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO rder, target, &tIndex, tLen);
7948 }
7949 }
7950 if(endOfSource) {
7951 return UCOL_LESS;
7952 }
7953 if(sOrder == tOrder) {
7954 sOrder = 0; tOrder = 0;
7955 continue;
7956 } else {
7957 if(((sOrder^tOrder)&0xff000000)!=0) {
7958 if(sOrder < tOrder) {
7959 return UCOL_LESS;
7960 } else if(sOrder > tOrder) {
7961 return UCOL_GREATER;
7962 }
7963 }
7964 sOrder<<=8;
7965 tOrder<<=8;
7966 }
7967 }
7968 }
// reached when the strength stops before the tertiary level
7969 return UCOL_EQUAL;
7970 }
7971
7972 /*
7973 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
7974 null terminated input string takes extra amount of CPU cycles.
7975 */
7976 static UCollationResult
7977 ucol_strcollRegularUTF8(
7978 const UCollator *coll,
7979 const char *source,
7980 int32_t sourceLength,
7981 const char *target,
7982 int32_t targetLength,
7983 UErrorCode *status)
7984 {
7985 UCharIterator src;
7986 UCharIterator tgt;
7987
7988 uiter_setUTF8(&src, source, sourceLength);
7989 uiter_setUTF8(&tgt, target, targetLength);
7990
7991 // Preparing the context objects for iterating over strings
7992 collIterate sColl, tColl;
7993 IInit_collIterate(coll, NULL, -1, &sColl, status);
7994 IInit_collIterate(coll, NULL, -1, &tColl, status);
7995 if(U_FAILURE(*status)) {
7996 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7997 return UCOL_EQUAL;
7998 }
7999 // The division for the array length may truncate the array size to
8000 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8001 // for all platforms anyway.
8002 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8003 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8004 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8005
8006 sColl.iterator = &src;
8007 sColl.flags |= UCOL_USE_ITERATOR;
8008 tColl.flags |= UCOL_USE_ITERATOR;
8009 tColl.iterator = &tgt;
8010
8011 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8012 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu s);
8013 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8014 sColl.flags &= ~UCOL_ITER_NORM;
8015
8016 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu s);
8017 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8018 tColl.flags &= ~UCOL_ITER_NORM;
8019 }
8020
8021 return ucol_strcollRegular(&sColl, &tColl, status);
8022 }
8023
8024 static inline uint32_t
8025 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8026 uint32_t CE, const char *s, int32_t *index, int32_t le n)
8027 {
8028 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8029 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8030 int32_t offset = 1;
8031 UChar32 schar = 0, tchar = 0;
8032
8033 for(;;) {
8034 if (*index == len) {
8035 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);
8036 }
8037 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
8038 if (len < 0 && schar == 0) {
8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);
8040 }
8041
8042 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio n codepoints should be ordered, we skip all that are smaller */
8043 offset++;
8044 }
8045
8046 if (schar == tchar) {
8047 U8_FWD_1(s, *index, len);
8048 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set+offset]);
8049 }
8050 else
8051 {
8052 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8053 return UCOL_BAIL_OUT_CE;
8054 }
8055 // skip completely ignorables
8056 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8057 if(isZeroCE == 0) { // we have to ignore completely ignorables
8058 U8_FWD_1(s, *index, len);
8059 continue;
8060 }
8061
8062 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff set]);
8063 }
8064 }
8065 }
8066
8067 static inline UCollationResult
8068 ucol_strcollUseLatin1UTF8(
8069 const UCollator *coll,
8070 const char *source,
8071 int32_t sLen,
8072 const char *target,
8073 int32_t tLen,
8074 UErrorCode *status)
8075 {
8076 U_ALIGN_CODE(16);
8077 int32_t strength = coll->strength;
8078
8079 int32_t sIndex = 0, tIndex = 0;
8080 UChar32 sChar = 0, tChar = 0;
8081 uint32_t sOrder=0, tOrder=0;
8082
8083 UBool endOfSource = FALSE;
8084
8085 uint32_t *elements = coll->latinOneCEs;
8086
8087 UBool haveContractions = FALSE; // if we have contractions in our string
8088 // we cannot do French secondary
8089
8090 // Do the primary level
8091 for(;;) {
8092 while(sOrder==0) { // this loop skips primary ignorables
8093 // sOrder=getNextlatinOneCE(source);
8094 if (sIndex == sLen) {
8095 endOfSource = TRUE;
8096 break;
8097 }
8098 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
8099 if (sLen < 0 && sChar == 0) {
8100 endOfSource = TRUE;
8101 sLen = sIndex;
8102 break;
8103 }
8104 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out ( sChar > 0xFF, but this is faster on win32)
8105 //fprintf(stderr, "R");
8106 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8107 }
8108 sOrder = elements[sChar];
8109 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8110 // specials can basically be either contractions or bail-out sig ns. If we get anything
8111 // else, we'll bail out anywasy
8112 if(getCETag(sOrder) == CONTRACTION_TAG) {
8113 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8114 haveContractions = TRUE; // if there are contractions, we ca nnot do French secondary
8115 // However, if there are contractions in the table, but we a lways use just one char,
8116 // we might be able to do French. This should be checked out .
8117 }
8118 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8119 //fprintf(stderr, "S");
8120 return ucol_strcollRegularUTF8(coll, source, sLen, target, t Len, status);
8121 }
8122 }
8123 }
8124
8125 while(tOrder==0) { // this loop skips primary ignorables
8126 // tOrder=getNextlatinOneCE(target);
8127 if (tIndex == tLen) {
8128 if(endOfSource) {
8129 goto endOfPrimLoopU8;
8130 } else {
8131 return UCOL_GREATER;
8132 }
8133 }
8134 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8135 if (tLen < 0 && tChar == 0) {
8136 if(endOfSource) {
8137 tLen = tIndex;
8138 goto endOfPrimLoopU8;
8139 } else {
8140 return UCOL_GREATER;
8141 }
8142 }
8143 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out ( sChar > 0xFF, but this is faster on win32)
8144 //fprintf(stderr, "R");
8145 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8146 }
8147 tOrder = elements[tChar];
8148 if(tOrder >= UCOL_NOT_FOUND) {
8149 // Handling specials, see the comments for source
8150 if(getCETag(tOrder) == CONTRACTION_TAG) {
8151 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8152 haveContractions = TRUE;
8153 }
8154 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8155 //fprintf(stderr, "S");
8156 return ucol_strcollRegularUTF8(coll, source, sLen, target, t Len, status);
8157 }
8158 }
8159 }
8160 if(endOfSource) { // source is finished, but target is not, say the resu lt.
8161 return UCOL_LESS;
8162 }
8163
8164 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8165 sOrder = 0; tOrder = 0;
8166 continue;
8167 } else {
8168 // compare current top bytes
8169 if(((sOrder^tOrder)&0xFF000000)!=0) {
8170 // top bytes differ, return difference
8171 if(sOrder < tOrder) {
8172 return UCOL_LESS;
8173 } else if(sOrder > tOrder) {
8174 return UCOL_GREATER;
8175 }
8176 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24) ;
8177 // since we must return enum value
8178 }
8179
8180 // top bytes match, continue with following bytes
8181 sOrder<<=8;
8182 tOrder<<=8;
8183 }
8184 }
8185
8186 endOfPrimLoopU8:
8187 // after primary loop, we definitely know the sizes of strings,
8188 // so we set it and use simpler loop for secondaries and tertiaries
8189 sLen = sIndex; tLen = tIndex;
8190 if(strength >= UCOL_SECONDARY) {
8191 // adjust the table beggining
8192 elements += coll->latinOneTableLen;
8193 endOfSource = FALSE;
8194
8195 if(coll->frenchCollation == UCOL_OFF) { // non French
8196 // This loop is a simplified copy of primary loop
8197 // at this point we know that whole strings are latin-1, so we don't
8198 // check for that. We also know that we only have contractions as
8199 // specials.
8200 sIndex = 0; tIndex = 0;
8201 for(;;) {
8202 while(sOrder==0) {
8203 if(sIndex==sLen) {
8204 endOfSource = TRUE;
8205 break;
8206 }
8207 U_ASSERT(sLen >= 0);
8208 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8209 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8210 sOrder = elements[sChar];
8211 if(sOrder > UCOL_NOT_FOUND) {
8212 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO NDARY, sOrder, source, &sIndex, sLen);
8213 }
8214 }
8215
8216 while(tOrder==0) {
8217 if(tIndex==tLen) {
8218 if(endOfSource) {
8219 goto endOfSecLoopU8;
8220 } else {
8221 return UCOL_GREATER;
8222 }
8223 }
8224 U_ASSERT(tLen >= 0);
8225 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8226 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8227 tOrder = elements[tChar];
8228 if(tOrder > UCOL_NOT_FOUND) {
8229 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO NDARY, tOrder, target, &tIndex, tLen);
8230 }
8231 }
8232 if(endOfSource) {
8233 return UCOL_LESS;
8234 }
8235
8236 if(sOrder == tOrder) {
8237 sOrder = 0; tOrder = 0;
8238 continue;
8239 } else {
8240 // see primary loop for comments on this
8241 if(((sOrder^tOrder)&0xFF000000)!=0) {
8242 if(sOrder < tOrder) {
8243 return UCOL_LESS;
8244 } else if(sOrder > tOrder) {
8245 return UCOL_GREATER;
8246 }
8247 }
8248 sOrder<<=8;
8249 tOrder<<=8;
8250 }
8251 }
8252 } else { // French
8253 if(haveContractions) { // if we have contractions, we have to bail o ut
8254 // since we don't really know how to handle them here
8255 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8256 }
8257 // For French, we go backwards
8258 sIndex = sLen; tIndex = tLen;
8259 for(;;) {
8260 while(sOrder==0) {
8261 if(sIndex==0) {
8262 endOfSource = TRUE;
8263 break;
8264 }
8265 U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
8266 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8267 sOrder = elements[sChar];
8268 // don't even look for contractions
8269 }
8270
8271 while(tOrder==0) {
8272 if(tIndex==0) {
8273 if(endOfSource) {
8274 goto endOfSecLoopU8;
8275 } else {
8276 return UCOL_GREATER;
8277 }
8278 }
8279 U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
8280 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8281 tOrder = elements[tChar];
8282 // don't even look for contractions
8283 }
8284 if(endOfSource) {
8285 return UCOL_LESS;
8286 }
8287
8288 if(sOrder == tOrder) {
8289 sOrder = 0; tOrder = 0;
8290 continue;
8291 } else {
8292 // see the primary loop for comments
8293 if(((sOrder^tOrder)&0xFF000000)!=0) {
8294 if(sOrder < tOrder) {
8295 return UCOL_LESS;
8296 } else if(sOrder > tOrder) {
8297 return UCOL_GREATER;
8298 }
8299 }
8300 sOrder<<=8;
8301 tOrder<<=8;
8302 }
8303 }
8304 }
8305 }
8306
8307 endOfSecLoopU8:
8308 if(strength >= UCOL_TERTIARY) {
8309 // tertiary loop is the same as secondary (except no French)
8310 elements += coll->latinOneTableLen;
8311 sIndex = 0; tIndex = 0;
8312 endOfSource = FALSE;
8313 for(;;) {
8314 while(sOrder==0) {
8315 if(sIndex==sLen) {
8316 endOfSource = TRUE;
8317 break;
8318 }
8319 U_ASSERT(sLen >= 0);
8320 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8321 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8322 sOrder = elements[sChar];
8323 if(sOrder > UCOL_NOT_FOUND) {
8324 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY , sOrder, source, &sIndex, sLen);
8325 }
8326 }
8327 while(tOrder==0) {
8328 if(tIndex==tLen) {
8329 if(endOfSource) {
8330 return UCOL_EQUAL; // if both strings are at the end, th ey are equal
8331 } else {
8332 return UCOL_GREATER;
8333 }
8334 }
8335 U_ASSERT(tLen >= 0);
8336 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8337 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8338 tOrder = elements[tChar];
8339 if(tOrder > UCOL_NOT_FOUND) {
8340 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY , tOrder, target, &tIndex, tLen);
8341 }
8342 }
8343 if(endOfSource) {
8344 return UCOL_LESS;
8345 }
8346 if(sOrder == tOrder) {
8347 sOrder = 0; tOrder = 0;
8348 continue;
8349 } else {
8350 if(((sOrder^tOrder)&0xff000000)!=0) {
8351 if(sOrder < tOrder) {
8352 return UCOL_LESS;
8353 } else if(sOrder > tOrder) {
8354 return UCOL_GREATER;
8355 }
8356 }
8357 sOrder<<=8;
8358 tOrder<<=8;
8359 }
8360 }
8361 }
8362 return UCOL_EQUAL;
8363 } 406 }
8364 407
8365 U_CAPI UCollationResult U_EXPORT2 408 U_CAPI UCollationResult U_EXPORT2
8366 ucol_strcollIter( const UCollator *coll, 409 ucol_strcollIter( const UCollator *coll,
8367 UCharIterator *sIter, 410 UCharIterator *sIter,
8368 UCharIterator *tIter, 411 UCharIterator *tIter,
8369 UErrorCode *status) 412 UErrorCode *status)
8370 { 413 {
8371 if(!status || U_FAILURE(*status)) { 414 if(!status || U_FAILURE(*status)) {
8372 return UCOL_EQUAL; 415 return UCOL_EQUAL;
8373 } 416 }
8374 417
8375 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 418 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8376 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt er); 419 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt er);
8377 420
8378 if (sIter == tIter) {
8379 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8380 return UCOL_EQUAL;
8381 }
8382 if(sIter == NULL || tIter == NULL || coll == NULL) { 421 if(sIter == NULL || tIter == NULL || coll == NULL) {
8383 *status = U_ILLEGAL_ARGUMENT_ERROR; 422 *status = U_ILLEGAL_ARGUMENT_ERROR;
8384 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 423 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8385 return UCOL_EQUAL; 424 return UCOL_EQUAL;
8386 } 425 }
8387 426
8388 UCollationResult result = UCOL_EQUAL; 427 UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tI ter, *status);
8389 428
8390 // Preparing the context objects for iterating over strings 429 UTRACE_EXIT_VALUE_STATUS(result, *status);
8391 collIterate sColl, tColl;
8392 IInit_collIterate(coll, NULL, -1, &sColl, status);
8393 IInit_collIterate(coll, NULL, -1, &tColl, status);
8394 if(U_FAILURE(*status)) {
8395 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8396 return UCOL_EQUAL;
8397 }
8398 // The division for the array length may truncate the array size to
8399 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8400 // for all platforms anyway.
8401 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8402 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8403 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8404
8405 sColl.iterator = sIter;
8406 sColl.flags |= UCOL_USE_ITERATOR;
8407 tColl.flags |= UCOL_USE_ITERATOR;
8408 tColl.iterator = tIter;
8409
8410 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8411 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu s);
8412 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8413 sColl.flags &= ~UCOL_ITER_NORM;
8414
8415 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu s);
8416 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8417 tColl.flags &= ~UCOL_ITER_NORM;
8418 }
8419
8420 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8421
8422 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8423 (tChar = tColl.iterator->next(tColl.iterator))) {
8424 if(sChar == U_SENTINEL) {
8425 result = UCOL_EQUAL;
8426 goto end_compare;
8427 }
8428 }
8429
8430 if(sChar == U_SENTINEL) {
8431 tChar = tColl.iterator->previous(tColl.iterator);
8432 }
8433
8434 if(tChar == U_SENTINEL) {
8435 sChar = sColl.iterator->previous(sColl.iterator);
8436 }
8437
8438 sChar = sColl.iterator->previous(sColl.iterator);
8439 tChar = tColl.iterator->previous(tColl.iterator);
8440
8441 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8442 {
8443 // We are stopped in the middle of a contraction.
8444 // Scan backwards through the == part of the string looking for the star t of the contraction.
8445 // It doesn't matter which string we scan, since they are the same in this region.
8446 do
8447 {
8448 sChar = sColl.iterator->previous(sColl.iterator);
8449 tChar = tColl.iterator->previous(tColl.iterator);
8450 }
8451 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8452 }
8453
8454
8455 if(U_SUCCESS(*status)) {
8456 result = ucol_strcollRegular(&sColl, &tColl, status);
8457 }
8458
8459 end_compare:
8460 if(sNormIter || tNormIter) {
8461 unorm_closeIter(sNormIter);
8462 unorm_closeIter(tNormIter);
8463 }
8464
8465 UTRACE_EXIT_VALUE_STATUS(result, *status)
8466 return result; 430 return result;
8467 } 431 }
8468 432
8469 433
8470 /* */ 434 /* */
8471 /* ucol_strcoll Main public API string comparison function */ 435 /* ucol_strcoll Main public API string comparison function */
8472 /* */ 436 /* */
8473 U_CAPI UCollationResult U_EXPORT2 437 U_CAPI UCollationResult U_EXPORT2
8474 ucol_strcoll( const UCollator *coll, 438 ucol_strcoll( const UCollator *coll,
8475 const UChar *source, 439 const UChar *source,
8476 int32_t sourceLength, 440 int32_t sourceLength,
8477 const UChar *target, 441 const UChar *target,
8478 int32_t targetLength) 442 int32_t targetLength)
8479 { 443 {
8480 U_ALIGN_CODE(16); 444 U_ALIGN_CODE(16);
8481 445
8482 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 446 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8483 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 447 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8484 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target); 448 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);
8485 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt h); 449 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt h);
8486 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt h); 450 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt h);
8487 } 451 }
8488 452
8489 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
8490 // do not crash, but return. Should have
8491 // status argument to return error.
8492 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8493 return UCOL_EQUAL;
8494 }
8495
8496 /* Quick check if source and target are same strings. */
8497 /* They should either both be NULL terminated or the explicit length should be set on both. */
8498 if (source==target && sourceLength==targetLength) {
8499 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8500 return UCOL_EQUAL;
8501 }
8502
8503 if(coll->delegate != NULL) {
8504 UErrorCode status = U_ZERO_ERROR;
8505 return ((const Collator*)coll->delegate)->compare(source,sourceLength,targ et,targetLength, status);
8506 }
8507
8508 /* Scan the strings. Find: */
8509 /* The length of any leading portion that is equal */
8510 /* Whether they are exactly equal. (in which case we just return) */
8511 const UChar *pSrc = source;
8512 const UChar *pTarg = target;
8513 int32_t equalLength;
8514
8515 if (sourceLength == -1 && targetLength == -1) {
8516 // Both strings are null terminated.
8517 // Scan through any leading equal portion.
8518 while (*pSrc == *pTarg && *pSrc != 0) {
8519 pSrc++;
8520 pTarg++;
8521 }
8522 if (*pSrc == 0 && *pTarg == 0) {
8523 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8524 return UCOL_EQUAL;
8525 }
8526 equalLength = (int32_t)(pSrc - source);
8527 }
8528 else
8529 {
8530 // One or both strings has an explicit length.
8531 const UChar *pSrcEnd = source + sourceLength;
8532 const UChar *pTargEnd = target + targetLength;
8533
8534 // Scan while the strings are bitwise ==, or until one is exhausted.
8535 for (;;) {
8536 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8537 break;
8538 }
8539 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng th == -1)) {
8540 break;
8541 }
8542 if (*pSrc != *pTarg) {
8543 break;
8544 }
8545 pSrc++;
8546 pTarg++;
8547 }
8548 equalLength = (int32_t)(pSrc - source);
8549
8550 // If we made it all the way through both strings, we are done. They ar e ==
8551 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8552 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8553 {
8554 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8555 return UCOL_EQUAL;
8556 }
8557 }
8558 if (equalLength > 0) {
8559 /* There is an identical portion at the beginning of the two strings. */
8560 /* If the identical portion ends within a contraction or a comibining */
8561 /* character sequence, back up to the start of that sequence. */
8562
8563 // These values should already be set by the code above.
8564 //pSrc = source + equalLength; /* point to the first differing c hars */
8565 //pTarg = target + equalLength;
8566 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8567 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8568 {
8569 // We are stopped in the middle of a contraction.
8570 // Scan backwards through the == part of the string looking for the start of the contraction.
8571 // It doesn't matter which string we scan, since they are the same in this region.
8572 do
8573 {
8574 equalLength--;
8575 pSrc--;
8576 }
8577 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8578 }
8579
8580 source += equalLength;
8581 target += equalLength;
8582 if (sourceLength > 0) {
8583 sourceLength -= equalLength;
8584 }
8585 if (targetLength > 0) {
8586 targetLength -= equalLength;
8587 }
8588 }
8589
8590 UErrorCode status = U_ZERO_ERROR; 453 UErrorCode status = U_ZERO_ERROR;
8591 UCollationResult returnVal; 454 UCollationResult returnVal = Collator::fromUCollator(coll)->
8592 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLeng th > 0 && *target&0xff00)) { 455 compare(source, sourceLength, target, targetLength, status);
8593 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ etLength, &status); 456 UTRACE_EXIT_VALUE_STATUS(returnVal, status);
8594 } else {
8595 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta rgetLength, &status);
8596 }
8597 UTRACE_EXIT_VALUE(returnVal);
8598 return returnVal; 457 return returnVal;
8599 } 458 }
8600 459
8601 U_CAPI UCollationResult U_EXPORT2 460 U_CAPI UCollationResult U_EXPORT2
8602 ucol_strcollUTF8( 461 ucol_strcollUTF8(
8603 const UCollator *coll, 462 const UCollator *coll,
8604 const char *source, 463 const char *source,
8605 int32_t sourceLength, 464 int32_t sourceLength,
8606 const char *target, 465 const char *target,
8607 int32_t targetLength, 466 int32_t targetLength,
8608 UErrorCode *status) 467 UErrorCode *status)
8609 { 468 {
8610 U_ALIGN_CODE(16); 469 U_ALIGN_CODE(16);
8611 470
8612 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); 471 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8613 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 472 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8614 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target); 473 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour ce, target);
8615 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt h); 474 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt h);
8616 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt h); 475 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt h);
8617 } 476 }
8618 477
8619 if (U_FAILURE(*status)) { 478 if (U_FAILURE(*status)) {
8620 /* do nothing */ 479 /* do nothing */
8621 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 480 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8622 return UCOL_EQUAL; 481 return UCOL_EQUAL;
8623 } 482 }
8624 483
8625 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { 484 UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareU TF8(
8626 *status = U_ILLEGAL_ARGUMENT_ERROR; 485 source, sourceLength, target, targetLength, *status);
8627 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8628 return UCOL_EQUAL;
8629 }
8630
8631 /* Quick check if source and target are same strings. */
8632 /* They should either both be NULL terminated or the explicit length should be set on both. */
8633 if (source==target && sourceLength==targetLength) {
8634 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8635 return UCOL_EQUAL;
8636 }
8637
8638 if(coll->delegate != NULL) {
8639 return ((const Collator*)coll->delegate)->compareUTF8(
8640 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourc eLength),
8641 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targe tLength),
8642 *status);
8643 }
8644
8645 /* Scan the strings. Find: */
8646 /* The length of any leading portion that is equal */
8647 /* Whether they are exactly equal. (in which case we just return) */
8648 const char *pSrc = source;
8649 const char *pTarg = target;
8650 UBool bSrcLimit = FALSE;
8651 UBool bTargLimit = FALSE;
8652
8653 if (sourceLength == -1 && targetLength == -1) {
8654 // Both strings are null terminated.
8655 // Scan through any leading equal portion.
8656 while (*pSrc == *pTarg && *pSrc != 0) {
8657 pSrc++;
8658 pTarg++;
8659 }
8660 if (*pSrc == 0 && *pTarg == 0) {
8661 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8662 return UCOL_EQUAL;
8663 }
8664 bSrcLimit = (*pSrc == 0);
8665 bTargLimit = (*pTarg == 0);
8666 }
8667 else
8668 {
8669 // One or both strings has an explicit length.
8670 const char *pSrcEnd = source + sourceLength;
8671 const char *pTargEnd = target + targetLength;
8672
8673 // Scan while the strings are bitwise ==, or until one is exhausted.
8674 for (;;) {
8675 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8676 break;
8677 }
8678 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng th == -1)) {
8679 break;
8680 }
8681 if (*pSrc != *pTarg) {
8682 break;
8683 }
8684 pSrc++;
8685 pTarg++;
8686 }
8687 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0));
8688 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8689
8690 // If we made it all the way through both strings, we are done. They ar e ==
8691 if (bSrcLimit && /* At end of src string, however it was specified. * /
8692 bTargLimit) /* and also at end of dest string * /
8693 {
8694 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8695 return UCOL_EQUAL;
8696 }
8697 }
8698
8699 U_ASSERT(!(bSrcLimit && bTargLimit));
8700
8701 int32_t equalLength = pSrc - source;
8702 UBool bSawNonLatin1 = FALSE;
8703
8704 if (equalLength > 0) {
8705 // Align position to the start of UTF-8 code point.
8706 if (bTargLimit) {
8707 U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8708 } else {
8709 U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8710 }
8711 pSrc = source + equalLength;
8712 pTarg = target + equalLength;
8713 }
8714
8715 if (equalLength > 0) {
8716 /* There is an identical portion at the beginning of the two strings. */
8717 /* If the identical portion ends within a contraction or a comibining */
8718 /* character sequence, back up to the start of that sequence. */
8719 UBool bUnsafeCP = FALSE;
8720 UChar32 uc32 = -1;
8721
8722 if (!bSrcLimit) {
8723 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
8724 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8725 bUnsafeCP = TRUE;
8726 }
8727 bSawNonLatin1 |= (uc32 > 0xff);
8728 }
8729 if (!bTargLimit) {
8730 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
8731 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8732 bUnsafeCP = TRUE;
8733 }
8734 bSawNonLatin1 |= (uc32 > 0xff);
8735 }
8736
8737 if (bUnsafeCP) {
8738 while (equalLength > 0) {
8739 // We are stopped in the middle of a contraction.
8740 // Scan backwards through the == part of the string looking for the start of the contraction.
8741 // It doesn't matter which string we scan, since they are the same in this region.
8742 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
8743 bSawNonLatin1 |= (uc32 > 0xff);
8744 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
8745 break;
8746 }
8747 }
8748 }
8749 source += equalLength;
8750 target += equalLength;
8751 if (sourceLength > 0) {
8752 sourceLength -= equalLength;
8753 }
8754 if (targetLength > 0) {
8755 targetLength -= equalLength;
8756 }
8757 } else {
8758 // Lead byte of Latin 1 character is 0x00 - 0xC3
8759 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc 3);
8760 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0x c3);
8761 }
8762
8763 UCollationResult returnVal;
8764
8765 if(!coll->latinOneUse || bSawNonLatin1) {
8766 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
8767 } else {
8768 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target , targetLength, status);
8769 }
8770 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); 486 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
8771 return returnVal; 487 return returnVal;
8772 } 488 }
8773 489
8774 490
8775 /* convenience function for comparing strings */ 491 /* convenience function for comparing strings */
8776 U_CAPI UBool U_EXPORT2 492 U_CAPI UBool U_EXPORT2
8777 ucol_greater( const UCollator *coll, 493 ucol_greater( const UCollator *coll,
8778 const UChar *source, 494 const UChar *source,
8779 int32_t sourceLength, 495 int32_t sourceLength,
(...skipping 23 matching lines...) Expand all
8803 int32_t sourceLength, 519 int32_t sourceLength,
8804 const UChar *target, 520 const UChar *target,
8805 int32_t targetLength) 521 int32_t targetLength)
8806 { 522 {
8807 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 523 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8808 == UCOL_EQUAL); 524 == UCOL_EQUAL);
8809 } 525 }
8810 526
8811 U_CAPI void U_EXPORT2 527 U_CAPI void U_EXPORT2
8812 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 528 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8813 if(coll && coll->UCA) { 529 const Collator *c = Collator::fromUCollator(coll);
8814 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 530 if(c != NULL) {
531 UVersionInfo v;
532 c->getVersion(v);
533 // Note: This is tied to how the current implementation encodes the UCA version
534 // in the overall getVersion().
535 // Alternatively, we could load the root collator and get at lower-level data from there.
536 // Either way, it will reflect the input collator's UCA version only
537 // if it is a known implementation.
538 // It would be cleaner to make this a virtual Collator method.
539 info[0] = v[1] >> 3;
540 info[1] = v[1] & 7;
541 info[2] = v[2] >> 6;
542 info[3] = 0;
8815 } 543 }
8816 } 544 }
8817 545
546 U_CAPI const UChar * U_EXPORT2
547 ucol_getRules(const UCollator *coll, int32_t *length) {
548 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
549 // OK to crash if coll==NULL: We do not want to check "this" pointers.
550 if(rbc != NULL || coll == NULL) {
551 const UnicodeString &rules = rbc->getRules();
552 U_ASSERT(rules.getBuffer()[rules.length()] == 0);
553 *length = rules.length();
554 return rules.getBuffer();
555 }
556 static const UChar _NUL = 0;
557 *length = 0;
558 return &_NUL;
559 }
560
561 U_CAPI int32_t U_EXPORT2
562 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int3 2_t bufferLen) {
563 UnicodeString rules;
564 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
565 if(rbc != NULL || coll == NULL) {
566 rbc->getRules(delta, rules);
567 }
568 if(buffer != NULL && bufferLen > 0) {
569 UErrorCode errorCode = U_ZERO_ERROR;
570 return rules.extract(buffer, bufferLen, errorCode);
571 } else {
572 return rules.length();
573 }
574 }
575
576 U_CAPI const char * U_EXPORT2
577 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *statu s) {
578 return ucol_getLocaleByType(coll, type, status);
579 }
580
581 U_CAPI const char * U_EXPORT2
582 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
583 if(U_FAILURE(*status)) {
584 return NULL;
585 }
586 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
587 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);
588
589 const char *result;
590 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
591 if(rbc == NULL && coll != NULL) {
592 *status = U_UNSUPPORTED_ERROR;
593 result = NULL;
594 } else {
595 result = rbc->internalGetLocaleID(type, *status);
596 }
597
598 UTRACE_DATA1(UTRACE_INFO, "result = %s", result);
599 UTRACE_EXIT_STATUS(*status);
600 return result;
601 }
602
603 U_CAPI USet * U_EXPORT2
604 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) {
605 if(U_FAILURE(*status)) {
606 return NULL;
607 }
608 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status);
609 if(U_FAILURE(*status)) {
610 delete set;
611 return NULL;
612 }
613 return set->toUSet();
614 }
615
616 U_CAPI UBool U_EXPORT2
617 ucol_equals(const UCollator *source, const UCollator *target) {
618 return source == target ||
619 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target)) ;
620 }
621
8818 #endif /* #if !UCONFIG_NO_COLLATION */ 622 #endif /* #if !UCONFIG_NO_COLLATION */
OLDNEW
« no previous file with comments | « source/i18n/ucln_in.cpp ('k') | source/i18n/ucol_bld.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698