OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 1996-2013, International Business Machines | 3 * Copyright (C) 1996-2014, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * file name: ucol.cpp | 6 * file name: ucol.cpp |
7 * encoding: US-ASCII | 7 * encoding: US-ASCII |
8 * tab size: 8 (not used) | 8 * tab size: 8 (not used) |
9 * indentation:4 | 9 * indentation:4 |
10 * | 10 * |
11 * Modification history | 11 * Modification history |
12 * Date Name Comments | 12 * Date Name Comments |
13 * 1996-1999 various members of ICU team maintained C API for collation framewo
rk | 13 * 1996-1999 various members of ICU team maintained C API for collation framewo
rk |
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE | 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE |
15 * 03/01/2001 synwee Added maxexpansion functionality. | 15 * 03/01/2001 synwee Added maxexpansion functionality. |
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl
iant | 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compl
iant |
| 17 * 2012-2014 markus Rewritten in C++ again. |
17 */ | 18 */ |
18 | 19 |
19 #include "unicode/utypes.h" | 20 #include "unicode/utypes.h" |
20 | 21 |
21 #if !UCONFIG_NO_COLLATION | 22 #if !UCONFIG_NO_COLLATION |
22 | 23 |
| 24 #include "unicode/coll.h" |
| 25 #include "unicode/tblcoll.h" |
23 #include "unicode/bytestream.h" | 26 #include "unicode/bytestream.h" |
24 #include "unicode/coleitr.h" | 27 #include "unicode/coleitr.h" |
25 #include "unicode/unorm.h" | 28 #include "unicode/ucoleitr.h" |
26 #include "unicode/udata.h" | |
27 #include "unicode/ustring.h" | 29 #include "unicode/ustring.h" |
28 #include "unicode/utf8.h" | |
29 | |
30 #include "ucol_imp.h" | |
31 #include "bocsu.h" | |
32 | |
33 #include "normalizer2impl.h" | |
34 #include "unorm_it.h" | |
35 #include "umutex.h" | |
36 #include "cmemory.h" | 30 #include "cmemory.h" |
37 #include "ucln_in.h" | 31 #include "collation.h" |
38 #include "cstring.h" | 32 #include "cstring.h" |
39 #include "utracimp.h" | |
40 #include "putilimp.h" | 33 #include "putilimp.h" |
41 #include "uassert.h" | 34 #include "uassert.h" |
42 #include "unicode/coll.h" | 35 #include "utracimp.h" |
43 | |
44 #ifdef UCOL_DEBUG | |
45 #include <stdio.h> | |
46 #endif | |
47 | 36 |
48 U_NAMESPACE_USE | 37 U_NAMESPACE_USE |
49 | 38 |
50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
51 | |
52 #define LAST_BYTE_MASK_ 0xFF | |
53 #define SECOND_LAST_BYTE_SHIFT_ 8 | |
54 | |
55 #define ZERO_CC_LIMIT_ 0xC0 | |
56 | |
57 // These are static pointers to the NFC/NFD implementation instance. | |
58 // Each of them is always the same between calls to u_cleanup | |
59 // and therefore writing to it is not synchronized. | |
60 // They are cleaned in ucol_cleanup | |
61 static const Normalizer2 *g_nfd = NULL; | |
62 static const Normalizer2Impl *g_nfcImpl = NULL; | |
63 | |
64 // These are values from UCA required for | |
65 // implicit generation and supressing sort key compression | |
66 // they should regularly be in the UCA, but if one | |
67 // is running without UCA, it could be a problem | |
68 static const int32_t maxRegularPrimary = 0x7A; | |
69 static const int32_t minImplicitPrimary = 0xE0; | |
70 static const int32_t maxImplicitPrimary = 0xE4; | |
71 | |
72 U_CDECL_BEGIN | |
73 static UBool U_CALLCONV | |
74 ucol_cleanup(void) | |
75 { | |
76 g_nfd = NULL; | |
77 g_nfcImpl = NULL; | |
78 return TRUE; | |
79 } | |
80 | |
81 static int32_t U_CALLCONV | |
82 _getFoldingOffset(uint32_t data) { | |
83 return (int32_t)(data&0xFFFFFF); | |
84 } | |
85 | |
86 U_CDECL_END | |
87 | |
88 static inline | |
89 UBool initializeNFD(UErrorCode *status) { | |
90 if (g_nfd != NULL) { | |
91 return TRUE; | |
92 } else { | |
93 // The result is constant, until the library is reloaded. | |
94 g_nfd = Normalizer2Factory::getNFDInstance(*status); | |
95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); | |
96 return U_SUCCESS(*status); | |
97 } | |
98 } | |
99 | |
100 // init FCD data | |
101 static inline | |
102 UBool initializeFCD(UErrorCode *status) { | |
103 if (g_nfcImpl != NULL) { | |
104 return TRUE; | |
105 } else { | |
106 // The result is constant, until the library is reloaded. | |
107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); | |
108 // Note: Alternatively, we could also store this pointer in each collIte
rate struct, | |
109 // same as Normalizer2Factory::getImpl(collIterate->nfd). | |
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); | |
111 return U_SUCCESS(*status); | |
112 } | |
113 } | |
114 | |
115 static | |
116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
ng, | |
117 int32_t sourceLen, collIterate *s, | |
118 UErrorCode *status) | |
119 { | |
120 (s)->string = (s)->pos = sourceString; | |
121 (s)->origFlags = 0; | |
122 (s)->flags = 0; | |
123 if (sourceLen >= 0) { | |
124 s->flags |= UCOL_ITER_HASLEN; | |
125 (s)->endp = (UChar *)sourceString+sourceLen; | |
126 } | |
127 else { | |
128 /* change to enable easier checking for end of string for fcdpositon */ | |
129 (s)->endp = NULL; | |
130 } | |
131 (s)->extendCEs = NULL; | |
132 (s)->extendCEsSize = 0; | |
133 (s)->CEpos = (s)->toReturn = (s)->CEs; | |
134 (s)->offsetBuffer = NULL; | |
135 (s)->offsetBufferSize = 0; | |
136 (s)->offsetReturn = (s)->offsetStore = NULL; | |
137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; | |
138 (s)->coll = (collator); | |
139 if (initializeNFD(status)) { | |
140 (s)->nfd = g_nfd; | |
141 } else { | |
142 return; | |
143 } | |
144 (s)->fcdPosition = 0; | |
145 if(collator->normalizationMode == UCOL_ON) { | |
146 (s)->flags |= UCOL_ITER_NORM; | |
147 } | |
148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY)
{ | |
149 (s)->flags |= UCOL_HIRAGANA_Q; | |
150 } | |
151 (s)->iterator = NULL; | |
152 //(s)->iteratorIndex = 0; | |
153 } | |
154 | |
155 U_CAPI void U_EXPORT2 | |
156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, | |
157 int32_t sourceLen, collIterate *s, | |
158 UErrorCode *status) { | |
159 /* Out-of-line version for use from other files. */ | |
160 IInit_collIterate(collator, sourceString, sourceLen, s, status); | |
161 } | |
162 | |
163 U_CAPI collIterate * U_EXPORT2 | |
164 uprv_new_collIterate(UErrorCode *status) { | |
165 if(U_FAILURE(*status)) { | |
166 return NULL; | |
167 } | |
168 collIterate *s = new collIterate; | |
169 if(s == NULL) { | |
170 *status = U_MEMORY_ALLOCATION_ERROR; | |
171 return NULL; | |
172 } | |
173 return s; | |
174 } | |
175 | |
176 U_CAPI void U_EXPORT2 | |
177 uprv_delete_collIterate(collIterate *s) { | |
178 delete s; | |
179 } | |
180 | |
181 U_CAPI UBool U_EXPORT2 | |
182 uprv_collIterateAtEnd(collIterate *s) { | |
183 return s == NULL || s->pos == s->endp; | |
184 } | |
185 | |
186 /** | |
187 * Backup the state of the collIterate struct data | |
188 * @param data collIterate to backup | |
189 * @param backup storage | |
190 */ | |
191 static | |
192 inline void backupState(const collIterate *data, collIterateState *backup) | |
193 { | |
194 backup->fcdPosition = data->fcdPosition; | |
195 backup->flags = data->flags; | |
196 backup->origFlags = data->origFlags; | |
197 backup->pos = data->pos; | |
198 backup->bufferaddress = data->writableBuffer.getBuffer(); | |
199 backup->buffersize = data->writableBuffer.length(); | |
200 backup->iteratorMove = 0; | |
201 backup->iteratorIndex = 0; | |
202 if(data->iterator != NULL) { | |
203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER
_CURRENT); | |
204 backup->iteratorIndex = data->iterator->getState(data->iterator); | |
205 // no we try to fixup if we're using a normalizing iterator and we get U
ITER_NO_STATE | |
206 if(backup->iteratorIndex == UITER_NO_STATE) { | |
207 while((backup->iteratorIndex = data->iterator->getState(data->iterat
or)) == UITER_NO_STATE) { | |
208 backup->iteratorMove++; | |
209 data->iterator->move(data->iterator, -1, UITER_CURRENT); | |
210 } | |
211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); | |
212 } | |
213 } | |
214 } | |
215 | |
216 /** | |
217 * Loads the state into the collIterate struct data | |
218 * @param data collIterate to backup | |
219 * @param backup storage | |
220 * @param forwards boolean to indicate if forwards iteration is used, | |
221 * false indicates backwards iteration | |
222 */ | |
223 static | |
224 inline void loadState(collIterate *data, const collIterateState *backup, | |
225 UBool forwards) | |
226 { | |
227 UErrorCode status = U_ZERO_ERROR; | |
228 data->flags = backup->flags; | |
229 data->origFlags = backup->origFlags; | |
230 if(data->iterator != NULL) { | |
231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO
); | |
232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status)
; | |
233 if(backup->iteratorMove != 0) { | |
234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CUR
RENT); | |
235 } | |
236 } | |
237 data->pos = backup->pos; | |
238 | |
239 if ((data->flags & UCOL_ITER_INNORMBUF) && | |
240 data->writableBuffer.getBuffer() != backup->bufferaddress) { | |
241 /* | |
242 this is when a new buffer has been reallocated and we'll have to | |
243 calculate the new position. | |
244 note the new buffer has to contain the contents of the old buffer. | |
245 */ | |
246 if (forwards) { | |
247 data->pos = data->writableBuffer.getTerminatedBuffer() + | |
248 (data->pos - backup->bufferaddress); | |
249 } | |
250 else { | |
251 /* backwards direction */ | |
252 int32_t temp = backup->buffersize - | |
253 (int32_t)(data->pos - backup->bufferaddress); | |
254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writ
ableBuffer.length() - temp); | |
255 } | |
256 } | |
257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
258 /* | |
259 this is alittle tricky. | |
260 if we are initially not in the normalization buffer, even if we | |
261 normalize in the later stage, the data in the buffer will be | |
262 ignored, since we skip back up to the data string. | |
263 however if we are already in the normalization buffer, any | |
264 further normalization will pull data into the normalization | |
265 buffer and modify the fcdPosition. | |
266 since we are keeping the data in the buffer for use, the | |
267 fcdPosition can not be reverted back. | |
268 arrgghh.... | |
269 */ | |
270 data->fcdPosition = backup->fcdPosition; | |
271 } | |
272 } | |
273 | |
274 static UBool | |
275 reallocCEs(collIterate *data, int32_t newCapacity) { | |
276 uint32_t *oldCEs = data->extendCEs; | |
277 if(oldCEs == NULL) { | |
278 oldCEs = data->CEs; | |
279 } | |
280 int32_t length = data->CEpos - oldCEs; | |
281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); | |
282 if(newCEs == NULL) { | |
283 return FALSE; | |
284 } | |
285 uprv_memcpy(newCEs, oldCEs, length * 4); | |
286 uprv_free(data->extendCEs); | |
287 data->extendCEs = newCEs; | |
288 data->extendCEsSize = newCapacity; | |
289 data->CEpos = newCEs + length; | |
290 return TRUE; | |
291 } | |
292 | |
293 static UBool | |
294 increaseCEsCapacity(collIterate *data) { | |
295 int32_t oldCapacity; | |
296 if(data->extendCEs != NULL) { | |
297 oldCapacity = data->extendCEsSize; | |
298 } else { | |
299 oldCapacity = LENGTHOF(data->CEs); | |
300 } | |
301 return reallocCEs(data, 2 * oldCapacity); | |
302 } | |
303 | |
304 static UBool | |
305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { | |
306 int32_t oldCapacity; | |
307 if(data->extendCEs != NULL) { | |
308 oldCapacity = data->extendCEsSize; | |
309 } else { | |
310 oldCapacity = LENGTHOF(data->CEs); | |
311 } | |
312 if(minCapacity <= oldCapacity) { | |
313 return TRUE; | |
314 } | |
315 oldCapacity *= 2; | |
316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacit
y); | |
317 } | |
318 | |
319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { | |
320 if(U_FAILURE(errorCode)) { | |
321 return; | |
322 } | |
323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuf
fer); | |
324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); | |
325 if(length >= offsetBufferSize) { | |
326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; | |
327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)
); | |
328 if(newBuffer == NULL) { | |
329 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
330 return; | |
331 } | |
332 if(length > 0) { | |
333 uprv_memcpy(newBuffer, offsetBuffer, length * 4); | |
334 } | |
335 uprv_free(offsetBuffer); | |
336 offsetBuffer = newBuffer; | |
337 offsetStore = offsetBuffer + length; | |
338 offsetBufferSize = newCapacity; | |
339 } | |
340 *offsetStore++ = offset; | |
341 } | |
342 | |
343 /* | |
344 * collIter_eos() | |
345 * Checks for a collIterate being positioned at the end of | |
346 * its source string. | |
347 * | |
348 */ | |
349 static | |
350 inline UBool collIter_eos(collIterate *s) { | |
351 if(s->flags & UCOL_USE_ITERATOR) { | |
352 return !(s->iterator->hasNext(s->iterator)); | |
353 } | |
354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { | |
355 // Null terminated string, but not at null, so not at end. | |
356 // Whether in main or normalization buffer doesn't matter. | |
357 return FALSE; | |
358 } | |
359 | |
360 // String with length. Can't be in normalization buffer, which is always | |
361 // null termintated. | |
362 if (s->flags & UCOL_ITER_HASLEN) { | |
363 return (s->pos == s->endp); | |
364 } | |
365 | |
366 // We are at a null termination, could be either normalization buffer or mai
n string. | |
367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { | |
368 // At null at end of main string. | |
369 return TRUE; | |
370 } | |
371 | |
372 // At null at end of normalization buffer. Need to check whether there ther
e are | |
373 // any characters left in the main buffer. | |
374 if(s->origFlags & UCOL_USE_ITERATOR) { | |
375 return !(s->iterator->hasNext(s->iterator)); | |
376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { | |
377 // Null terminated main string. fcdPosition is the 'return' position in
to main buf. | |
378 return (*s->fcdPosition == 0); | |
379 } | |
380 else { | |
381 // Main string with an end pointer. | |
382 return s->fcdPosition == s->endp; | |
383 } | |
384 } | |
385 | |
386 /* | |
387 * collIter_bos() | |
388 * Checks for a collIterate being positioned at the start of | |
389 * its source string. | |
390 * | |
391 */ | |
392 static | |
393 inline UBool collIter_bos(collIterate *source) { | |
394 // if we're going backwards, we need to know whether there is more in the | |
395 // iterator, even if we are in the side buffer | |
396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ | |
397 return !source->iterator->hasPrevious(source->iterator); | |
398 } | |
399 if (source->pos <= source->string || | |
400 ((source->flags & UCOL_ITER_INNORMBUF) && | |
401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { | |
402 return TRUE; | |
403 } | |
404 return FALSE; | |
405 } | |
406 | |
407 /*static | |
408 inline UBool collIter_SimpleBos(collIterate *source) { | |
409 // if we're going backwards, we need to know whether there is more in the | |
410 // iterator, even if we are in the side buffer | |
411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR)
{ | |
412 return !source->iterator->hasPrevious(source->iterator); | |
413 } | |
414 if (source->pos == source->string) { | |
415 return TRUE; | |
416 } | |
417 return FALSE; | |
418 }*/ | |
419 //return (data->pos == data->string) || | |
420 | |
421 | |
422 /****************************************************************************/ | |
423 /* Following are the open/close functions */ | |
424 /* */ | |
425 /****************************************************************************/ | |
426 | |
427 static UCollator* | |
428 ucol_initFromBinary(const uint8_t *bin, int32_t length, | |
429 const UCollator *base, | |
430 UCollator *fillIn, | |
431 UErrorCode *status) | |
432 { | |
433 UCollator *result = fillIn; | |
434 if(U_FAILURE(*status)) { | |
435 return NULL; | |
436 } | |
437 /* | |
438 if(base == NULL) { | |
439 // we don't support null base yet | |
440 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
441 return NULL; | |
442 } | |
443 */ | |
444 // We need these and we could be running without UCA | |
445 uprv_uca_initImplicitConstants(status); | |
446 UCATableHeader *colData = (UCATableHeader *)bin; | |
447 // do we want version check here? We're trying to figure out whether collato
rs are compatible | |
448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeo
f(UVersionInfo)) != 0 || | |
449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersio
nInfo)) != 0)) || | |
450 colData->version[0] != UCOL_BUILDER_VERSION) | |
451 { | |
452 *status = U_COLLATOR_VERSION_MISMATCH; | |
453 return NULL; | |
454 } | |
455 else { | |
456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(s
izeof(UColOptionSet)))) { | |
457 result = ucol_initCollator((const UCATableHeader *)bin, result, base
, status); | |
458 if(U_FAILURE(*status)){ | |
459 return NULL; | |
460 } | |
461 result->hasRealData = TRUE; | |
462 } | |
463 else { | |
464 if(base) { | |
465 result = ucol_initCollator(base->image, result, base, status); | |
466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const
UCATableHeader *)bin)->options), status); | |
467 if(U_FAILURE(*status)){ | |
468 return NULL; | |
469 } | |
470 result->hasRealData = FALSE; | |
471 } | |
472 else { | |
473 *status = U_USELESS_COLLATOR_ERROR; | |
474 return NULL; | |
475 } | |
476 } | |
477 result->freeImageOnClose = FALSE; | |
478 } | |
479 result->actualLocale = NULL; | |
480 result->validLocale = NULL; | |
481 result->requestedLocale = NULL; | |
482 result->rules = NULL; | |
483 result->rulesLength = 0; | |
484 result->freeRulesOnClose = FALSE; | |
485 result->ucaRules = NULL; | |
486 return result; | |
487 } | |
488 | |
489 U_CAPI UCollator* U_EXPORT2 | 39 U_CAPI UCollator* U_EXPORT2 |
490 ucol_openBinary(const uint8_t *bin, int32_t length, | 40 ucol_openBinary(const uint8_t *bin, int32_t length, |
491 const UCollator *base, | 41 const UCollator *base, |
492 UErrorCode *status) | 42 UErrorCode *status) |
493 { | 43 { |
494 return ucol_initFromBinary(bin, length, base, NULL, status); | 44 if(U_FAILURE(*status)) { return NULL; } |
| 45 RuleBasedCollator *coll = new RuleBasedCollator( |
| 46 bin, length, |
| 47 RuleBasedCollator::rbcFromUCollator(base), |
| 48 *status); |
| 49 if(coll == NULL) { |
| 50 *status = U_MEMORY_ALLOCATION_ERROR; |
| 51 return NULL; |
| 52 } |
| 53 if(U_FAILURE(*status)) { |
| 54 delete coll; |
| 55 return NULL; |
| 56 } |
| 57 return coll->toUCollator(); |
495 } | 58 } |
496 | 59 |
497 U_CAPI int32_t U_EXPORT2 | 60 U_CAPI int32_t U_EXPORT2 |
498 ucol_cloneBinary(const UCollator *coll, | 61 ucol_cloneBinary(const UCollator *coll, |
499 uint8_t *buffer, int32_t capacity, | 62 uint8_t *buffer, int32_t capacity, |
500 UErrorCode *status) | 63 UErrorCode *status) |
501 { | 64 { |
502 int32_t length = 0; | |
503 if(U_FAILURE(*status)) { | 65 if(U_FAILURE(*status)) { |
504 return length; | 66 return 0; |
505 } | 67 } |
506 if(capacity < 0) { | 68 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
507 *status = U_ILLEGAL_ARGUMENT_ERROR; | 69 if(rbc == NULL && coll != NULL) { |
508 return length; | 70 *status = U_UNSUPPORTED_ERROR; |
| 71 return 0; |
509 } | 72 } |
510 if(coll->hasRealData == TRUE) { | 73 return rbc->cloneBinary(buffer, capacity, *status); |
511 length = coll->image->size; | |
512 if(length <= capacity) { | |
513 uprv_memcpy(buffer, coll->image, length); | |
514 } else { | |
515 *status = U_BUFFER_OVERFLOW_ERROR; | |
516 } | |
517 } else { | |
518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(
UColOptionSet))); | |
519 if(length <= capacity) { | |
520 /* build the UCATableHeader with minimal entries */ | |
521 /* do not copy the header from the UCA file because its values are w
rong! */ | |
522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ | |
523 | |
524 /* reset everything */ | |
525 uprv_memset(buffer, 0, length); | |
526 | |
527 /* set the tailoring-specific values */ | |
528 UCATableHeader *myData = (UCATableHeader *)buffer; | |
529 myData->size = length; | |
530 | |
531 /* offset for the options, the only part of the data that is present
after the header */ | |
532 myData->options = sizeof(UCATableHeader); | |
533 | |
534 /* need to always set the expansion value for an upper bound of the
options */ | |
535 myData->expansion = myData->options + sizeof(UColOptionSet); | |
536 | |
537 myData->magic = UCOL_HEADER_MAGIC; | |
538 myData->isBigEndian = U_IS_BIG_ENDIAN; | |
539 myData->charSetFamily = U_CHARSET_FAMILY; | |
540 | |
541 /* copy UCA's version; genrb will override all but the builder versi
on with tailoring data */ | |
542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionIn
fo)); | |
543 | |
544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVer
sionInfo)); | |
545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVer
sionInfo)); | |
546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeo
f(UVersionInfo)); | |
547 myData->jamoSpecial = coll->image->jamoSpecial; | |
548 | |
549 /* copy the collator options */ | |
550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options
, sizeof(UColOptionSet)); | |
551 } else { | |
552 *status = U_BUFFER_OVERFLOW_ERROR; | |
553 } | |
554 } | |
555 return length; | |
556 } | 74 } |
557 | 75 |
558 U_CAPI UCollator* U_EXPORT2 | 76 U_CAPI UCollator* U_EXPORT2 |
559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS
ize, UErrorCode *status) | 77 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferS
ize, UErrorCode *status) |
560 { | 78 { |
561 UCollator * localCollator; | |
562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); | |
563 int32_t imageSize = 0; | |
564 int32_t rulesSize = 0; | |
565 int32_t rulesPadding = 0; | |
566 int32_t defaultReorderCodesSize = 0; | |
567 int32_t reorderCodesSize = 0; | |
568 uint8_t *image; | |
569 UChar *rules; | |
570 int32_t* defaultReorderCodes; | |
571 int32_t* reorderCodes; | |
572 uint8_t* leadBytePermutationTable; | |
573 UBool imageAllocated = FALSE; | |
574 | |
575 if (status == NULL || U_FAILURE(*status)){ | 79 if (status == NULL || U_FAILURE(*status)){ |
576 return NULL; | 80 return NULL; |
577 } | 81 } |
578 if (coll == NULL) { | 82 if (coll == NULL) { |
579 *status = U_ILLEGAL_ARGUMENT_ERROR; | 83 *status = U_ILLEGAL_ARGUMENT_ERROR; |
580 return NULL; | 84 return NULL; |
581 } | 85 } |
582 | |
583 if (coll->rules && coll->freeRulesOnClose) { | |
584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); | |
585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); | |
586 bufferSizeNeeded += rulesSize + rulesPadding; | |
587 } | |
588 // no padding for alignment needed from here since the next two are 4 byte q
uantities | |
589 if (coll->defaultReorderCodes) { | |
590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32
_t); | |
591 bufferSizeNeeded += defaultReorderCodesSize; | |
592 } | |
593 if (coll->reorderCodes) { | |
594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); | |
595 bufferSizeNeeded += reorderCodesSize; | |
596 } | |
597 if (coll->leadBytePermutationTable) { | |
598 bufferSizeNeeded += 256 * sizeof(uint8_t); | |
599 } | |
600 | |
601 if (pBufferSize != NULL) { | 86 if (pBufferSize != NULL) { |
602 int32_t inputSize = *pBufferSize; | 87 int32_t inputSize = *pBufferSize; |
603 *pBufferSize = 1; | 88 *pBufferSize = 1; |
604 if (inputSize == 0) { | 89 if (inputSize == 0) { |
605 return NULL; // preflighting for deprecated functionality | 90 return NULL; // preflighting for deprecated functionality |
606 } | 91 } |
607 } | 92 } |
608 | 93 Collator *newColl = Collator::fromUCollator(coll)->clone(); |
609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); | 94 if (newColl == NULL) { |
610 // Null pointer check. | |
611 if (stackBufferChars == NULL) { | |
612 *status = U_MEMORY_ALLOCATION_ERROR; | 95 *status = U_MEMORY_ALLOCATION_ERROR; |
613 return NULL; | 96 } else { |
| 97 *status = U_SAFECLONE_ALLOCATED_WARNING; |
614 } | 98 } |
615 *status = U_SAFECLONE_ALLOCATED_WARNING; | 99 return newColl->toUCollator(); |
616 | |
617 localCollator = (UCollator *)stackBufferChars; | |
618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); | |
619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); | |
620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCode
sSize); | |
621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; | |
622 | |
623 { | |
624 UErrorCode tempStatus = U_ZERO_ERROR; | |
625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); | |
626 } | |
627 if (coll->freeImageOnClose) { | |
628 image = (uint8_t *)uprv_malloc(imageSize); | |
629 // Null pointer check | |
630 if (image == NULL) { | |
631 *status = U_MEMORY_ALLOCATION_ERROR; | |
632 return NULL; | |
633 } | |
634 ucol_cloneBinary(coll, image, imageSize, status); | |
635 imageAllocated = TRUE; | |
636 } | |
637 else { | |
638 image = (uint8_t *)coll->image; | |
639 } | |
640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollat
or, status); | |
641 if (U_FAILURE(*status)) { | |
642 return NULL; | |
643 } | |
644 | |
645 if (coll->rules) { | |
646 if (coll->freeRulesOnClose) { | |
647 localCollator->rules = u_strcpy(rules, coll->rules); | |
648 //bufferEnd += rulesSize; | |
649 } | |
650 else { | |
651 localCollator->rules = coll->rules; | |
652 } | |
653 localCollator->freeRulesOnClose = FALSE; | |
654 localCollator->rulesLength = coll->rulesLength; | |
655 } | |
656 | |
657 // collator reordering | |
658 if (coll->defaultReorderCodes) { | |
659 localCollator->defaultReorderCodes = | |
660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCode
s, coll->defaultReorderCodesLength * sizeof(int32_t)); | |
661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLeng
th; | |
662 localCollator->freeDefaultReorderCodesOnClose = FALSE; | |
663 } | |
664 if (coll->reorderCodes) { | |
665 localCollator->reorderCodes = | |
666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorde
rCodesLength * sizeof(int32_t)); | |
667 localCollator->reorderCodesLength = coll->reorderCodesLength; | |
668 localCollator->freeReorderCodesOnClose = FALSE; | |
669 } | |
670 if (coll->leadBytePermutationTable) { | |
671 localCollator->leadBytePermutationTable = | |
672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermu
tationTable, 256); | |
673 localCollator->freeLeadBytePermutationTableOnClose = FALSE; | |
674 } | |
675 | |
676 int32_t i; | |
677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { | |
678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(col
l, (UColAttribute)i, status), status); | |
679 } | |
680 // zero copies of pointers | |
681 localCollator->actualLocale = NULL; | |
682 localCollator->validLocale = NULL; | |
683 localCollator->requestedLocale = NULL; | |
684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy h
ere. | |
685 localCollator->freeOnClose = TRUE; | |
686 localCollator->freeImageOnClose = imageAllocated; | |
687 return localCollator; | |
688 } | 100 } |
689 | 101 |
690 U_CAPI void U_EXPORT2 | 102 U_CAPI void U_EXPORT2 |
691 ucol_close(UCollator *coll) | 103 ucol_close(UCollator *coll) |
692 { | 104 { |
693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); | 105 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); | 106 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
695 if(coll != NULL) { | 107 if(coll != NULL) { |
696 // these are always owned by each UCollator struct, | 108 delete Collator::fromUCollator(coll); |
697 // so we always free them | |
698 if(coll->validLocale != NULL) { | |
699 uprv_free(coll->validLocale); | |
700 } | |
701 if(coll->actualLocale != NULL) { | |
702 uprv_free(coll->actualLocale); | |
703 } | |
704 if(coll->requestedLocale != NULL) { | |
705 uprv_free(coll->requestedLocale); | |
706 } | |
707 if(coll->latinOneCEs != NULL) { | |
708 uprv_free(coll->latinOneCEs); | |
709 } | |
710 if(coll->options != NULL && coll->freeOptionsOnClose) { | |
711 uprv_free(coll->options); | |
712 } | |
713 if(coll->rules != NULL && coll->freeRulesOnClose) { | |
714 uprv_free((UChar *)coll->rules); | |
715 } | |
716 if(coll->image != NULL && coll->freeImageOnClose) { | |
717 uprv_free((UCATableHeader *)coll->image); | |
718 } | |
719 | |
720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutati
onTableOnClose == TRUE) { | |
721 uprv_free(coll->leadBytePermutationTable); | |
722 } | |
723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnC
lose == TRUE) { | |
724 uprv_free(coll->defaultReorderCodes); | |
725 } | |
726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE)
{ | |
727 uprv_free(coll->reorderCodes); | |
728 } | |
729 | |
730 if(coll->delegate != NULL) { | |
731 delete (Collator*)coll->delegate; | |
732 } | |
733 | |
734 /* Here, it would be advisable to close: */ | |
735 /* - UData for UCA (unless we stuff it in the root resb */ | |
736 /* Again, do we need additional housekeeping... HMMM! */ | |
737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); | |
738 if(coll->freeOnClose){ | |
739 /* for safeClone, if freeOnClose is FALSE, | |
740 don't free the other instance data */ | |
741 uprv_free(coll); | |
742 } | |
743 } | 109 } |
744 UTRACE_EXIT(); | 110 UTRACE_EXIT(); |
745 } | 111 } |
746 | 112 |
747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo
de *status) { | |
748 if(U_FAILURE(*status)) { | |
749 return; | |
750 } | |
751 result->caseFirst = (UColAttributeValue)opts->caseFirst; | |
752 result->caseLevel = (UColAttributeValue)opts->caseLevel; | |
753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; | |
754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; | |
755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { | |
756 return; | |
757 } | |
758 result->strength = (UColAttributeValue)opts->strength; | |
759 result->variableTopValue = opts->variableTopValue; | |
760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; | |
761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; | |
762 result->numericCollation = (UColAttributeValue)opts->numericCollation; | |
763 result->caseFirstisDefault = TRUE; | |
764 result->caseLevelisDefault = TRUE; | |
765 result->frenchCollationisDefault = TRUE; | |
766 result->normalizationModeisDefault = TRUE; | |
767 result->strengthisDefault = TRUE; | |
768 result->variableTopValueisDefault = TRUE; | |
769 result->alternateHandlingisDefault = TRUE; | |
770 result->hiraganaQisDefault = TRUE; | |
771 result->numericCollationisDefault = TRUE; | |
772 | |
773 ucol_updateInternalState(result, status); | |
774 | |
775 result->options = opts; | |
776 } | |
777 | |
778 | |
779 /** | |
780 * Approximate determination if a character is at a contraction end. | |
781 * Guaranteed to be TRUE if a character is at the end of a contraction, | |
782 * otherwise it is not deterministic. | |
783 * @param c character to be determined | |
784 * @param coll collator | |
785 */ | |
786 static | |
787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { | |
788 if (c < coll->minContrEndCP) { | |
789 return FALSE; | |
790 } | |
791 | |
792 int32_t hash = c; | |
793 uint8_t htbyte; | |
794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { | |
795 if (U16_IS_TRAIL(c)) { | |
796 return TRUE; | |
797 } | |
798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; | |
799 } | |
800 htbyte = coll->contrEndCP[hash>>3]; | |
801 return (((htbyte >> (hash & 7)) & 1) == 1); | |
802 } | |
803 | |
804 | |
805 | |
806 /* | |
807 * i_getCombiningClass() | |
808 * A fast, at least partly inline version of u_getCombiningClass() | |
809 * This is a candidate for further optimization. Used heavily | |
810 * in contraction processing. | |
811 */ | |
812 static | |
813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { | |
814 uint8_t sCC = 0; | |
815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { | |
816 sCC = u_getCombiningClass(c); | |
817 } | |
818 return sCC; | |
819 } | |
820 | |
821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
st UCollator *UCA, UErrorCode *status) { | |
822 UChar c; | |
823 UCollator *result = fillIn; | |
824 if(U_FAILURE(*status) || image == NULL) { | |
825 return NULL; | |
826 } | |
827 | |
828 if(result == NULL) { | |
829 result = (UCollator *)uprv_malloc(sizeof(UCollator)); | |
830 if(result == NULL) { | |
831 *status = U_MEMORY_ALLOCATION_ERROR; | |
832 return result; | |
833 } | |
834 result->freeOnClose = TRUE; | |
835 } else { | |
836 result->freeOnClose = FALSE; | |
837 } | |
838 | |
839 result->delegate = NULL; | |
840 | |
841 result->image = image; | |
842 result->mapping.getFoldingOffset = _getFoldingOffset; | |
843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosit
ion; | |
844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE -
result->image->mappingPosition, status); | |
845 if(U_FAILURE(*status)) { | |
846 if(result->freeOnClose == TRUE) { | |
847 uprv_free(result); | |
848 result = NULL; | |
849 } | |
850 return result; | |
851 } | |
852 | |
853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); | |
854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->
contractionCEs); | |
855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->c
ontractionIndex); | |
856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expan
sion); | |
857 result->rules = NULL; | |
858 result->rulesLength = 0; | |
859 result->freeRulesOnClose = FALSE; | |
860 result->defaultReorderCodes = NULL; | |
861 result->defaultReorderCodesLength = 0; | |
862 result->freeDefaultReorderCodesOnClose = FALSE; | |
863 result->reorderCodes = NULL; | |
864 result->reorderCodesLength = 0; | |
865 result->freeReorderCodesOnClose = FALSE; | |
866 result->leadBytePermutationTable = NULL; | |
867 result->freeLeadBytePermutationTableOnClose = FALSE; | |
868 | |
869 /* get the version info from UCATableHeader and populate the Collator struct
*/ | |
870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ | |
871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules v
ersion*/ | |
872 result->dataVersion[2] = 0; | |
873 result->dataVersion[3] = 0; | |
874 | |
875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; | |
876 result->minUnsafeCP = 0; | |
877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. | |
878 if (ucol_unsafeCP(c, result)) break; | |
879 } | |
880 result->minUnsafeCP = c; | |
881 | |
882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; | |
883 result->minContrEndCP = 0; | |
884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. | |
885 if (ucol_contractionEndCP(c, result)) break; | |
886 } | |
887 result->minContrEndCP = c; | |
888 | |
889 /* max expansion tables */ | |
890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + | |
891 result->image->endExpansionCE); | |
892 result->lastEndExpansionCE = result->endExpansionCE + | |
893 result->image->endExpansionCECount - 1; | |
894 result->expansionCESize = (uint8_t*)result->image + | |
895 result->image->expansionCESize; | |
896 | |
897 | |
898 //result->errorCode = *status; | |
899 | |
900 result->latinOneCEs = NULL; | |
901 | |
902 result->latinOneRegenTable = FALSE; | |
903 result->latinOneFailed = FALSE; | |
904 result->UCA = UCA; | |
905 | |
906 /* Normally these will be set correctly later. This is the default if you us
e UCA or the default. */ | |
907 result->ucaRules = NULL; | |
908 result->actualLocale = NULL; | |
909 result->validLocale = NULL; | |
910 result->requestedLocale = NULL; | |
911 result->hasRealData = FALSE; // real data lives in .dat file... | |
912 result->freeImageOnClose = FALSE; | |
913 | |
914 /* set attributes */ | |
915 ucol_setOptionsFromHeader( | |
916 result, | |
917 (UColOptionSet*)((uint8_t*)result->image+result->image->options), | |
918 status); | |
919 result->freeOptionsOnClose = FALSE; | |
920 | |
921 return result; | |
922 } | |
923 | |
924 /* new Mark's code */ | |
925 | |
926 /** | |
927 * For generation of Implicit CEs | |
928 * @author Davis | |
929 * | |
930 * Cleaned up so that changes can be made more easily. | |
931 * Old values: | |
932 # First Implicit: E26A792D | |
933 # Last Implicit: E3DC70C0 | |
934 # First CJK: E0030300 | |
935 # Last CJK: E0A9DD00 | |
936 # First CJK_A: E0A9DF00 | |
937 # Last CJK_A: E0DE3100 | |
938 */ | |
939 /* Following is a port of Mark's code for new treatment of implicits. | |
940 * It is positioned here, since ucol_initUCA need to initialize the | |
941 * variables below according to the data in the fractional UCA. | |
942 */ | |
943 | |
944 /** | |
945 * Function used to: | |
946 * a) collapse the 2 different Han ranges from UCA into one (in the right order)
, and | |
947 * b) bump any non-CJK characters by 10FFFF. | |
948 * The relevant blocks are: | |
949 * A: 4E00..9FFF; CJK Unified Ideographs | |
950 * F900..FAFF; CJK Compatibility Ideographs | |
951 * B: 3400..4DBF; CJK Unified Ideographs Extension A | |
952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) | |
953 * As long as | |
954 * no new B characters are allocated between 4E00 and FAFF, and | |
955 * no new A characters are outside of this range, | |
956 * (very high probability) this simple code will work. | |
957 * The reordered blocks are: | |
958 * Block1 is CJK | |
959 * Block2 is CJK_COMPAT_USED | |
960 * Block3 is CJK_A | |
961 * (all contiguous) | |
962 * Any other CJK gets its normal code point | |
963 * Any non-CJK gets +10FFFF | |
964 * When we reorder Block1, we make sure that it is at the very start, | |
965 * so that it will use a 3-byte form. | |
966 * Warning: we only pick up the compatibility characters that are | |
967 * NOT decomposed, so that block is smaller! | |
968 */ | |
969 | |
970 // CONSTANTS | |
971 static const UChar32 | |
972 NON_CJK_OFFSET = 0x110000, | |
973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 | |
974 | |
/**
 * Parameters of the implicit-weight byte layout.
 * All of them start out as 0 and are precomputed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
992 | |
993 static const UChar32 | |
994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; | |
995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) | |
996 CJK_BASE = 0x4E00, | |
997 CJK_LIMIT = 0x9FCC+1, | |
998 // Unified CJK ideographs in the compatibility ideographs block. | |
999 CJK_COMPAT_USED_BASE = 0xFA0E, | |
1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, | |
1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; | |
1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; | |
1003 CJK_A_BASE = 0x3400, | |
1004 CJK_A_LIMIT = 0x4DB5+1, | |
1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; | |
1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; | |
1007 CJK_B_BASE = 0x20000, | |
1008 CJK_B_LIMIT = 0x2A6D6+1, | |
1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; | |
1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; | |
1011 CJK_C_BASE = 0x2A700, | |
1012 CJK_C_LIMIT = 0x2B734+1, | |
1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; | |
1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; | |
1015 CJK_D_BASE = 0x2B740, | |
1016 CJK_D_LIMIT = 0x2B81D+1; | |
1017 // when adding to this list, look for all occurrences (in project) | |
1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing
!!!! | |
1019 | |
1020 static UChar32 swapCJK(UChar32 i) { | |
1021 if (i < CJK_A_BASE) { | |
1022 // non-CJK | |
1023 } else if (i < CJK_A_LIMIT) { | |
1024 // Extension A has lower code points than the original Unihan+compat | |
1025 // but sorts higher. | |
1026 return i - CJK_A_BASE | |
1027 + (CJK_LIMIT - CJK_BASE) | |
1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); | |
1029 } else if (i < CJK_BASE) { | |
1030 // non-CJK | |
1031 } else if (i < CJK_LIMIT) { | |
1032 return i - CJK_BASE; | |
1033 } else if (i < CJK_COMPAT_USED_BASE) { | |
1034 // non-CJK | |
1035 } else if (i < CJK_COMPAT_USED_LIMIT) { | |
1036 return i - CJK_COMPAT_USED_BASE | |
1037 + (CJK_LIMIT - CJK_BASE); | |
1038 } else if (i < CJK_B_BASE) { | |
1039 // non-CJK | |
1040 } else if (i < CJK_B_LIMIT) { | |
1041 return i; // non-BMP-CJK | |
1042 } else if (i < CJK_C_BASE) { | |
1043 // non-CJK | |
1044 } else if (i < CJK_C_LIMIT) { | |
1045 return i; // non-BMP-CJK | |
1046 } else if (i < CJK_D_BASE) { | |
1047 // non-CJK | |
1048 } else if (i < CJK_D_LIMIT) { | |
1049 return i; // non-BMP-CJK | |
1050 } | |
1051 return i + NON_CJK_OFFSET; // non-CJK | |
1052 } | |
1053 | |
1054 U_CAPI UChar32 U_EXPORT2 | |
1055 uprv_uca_getRawFromCodePoint(UChar32 i) { | |
1056 return swapCJK(i)+1; | |
1057 } | |
1058 | |
1059 U_CAPI UChar32 U_EXPORT2 | |
1060 uprv_uca_getCodePointFromRaw(UChar32 i) { | |
1061 i--; | |
1062 UChar32 result = 0; | |
1063 if(i >= NON_CJK_OFFSET) { | |
1064 result = i - NON_CJK_OFFSET; | |
1065 } else if(i >= CJK_B_BASE) { | |
1066 result = i; | |
1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted | |
1068 if(i < CJK_LIMIT - CJK_BASE) { | |
1069 result = i + CJK_BASE; | |
1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMP
AT_USED_BASE)) { | |
1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); | |
1072 } else { | |
1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_
LIMIT - CJK_COMPAT_USED_BASE); | |
1074 } | |
1075 } else { | |
1076 result = -1; | |
1077 } | |
1078 return result; | |
1079 } | |
1080 | |
1081 // GET IMPLICIT PRIMARY WEIGHTS | |
1082 // Return value is left justified primary key | |
1083 U_CAPI uint32_t U_EXPORT2 | |
1084 uprv_uca_getImplicitFromRaw(UChar32 cp) { | |
1085 /* | |
1086 if (cp < 0 || cp > UCOL_MAX_INPUT) { | |
1087 throw new IllegalArgumentException("Code point out of range " + Utility.
hex(cp)); | |
1088 } | |
1089 */ | |
1090 int32_t last0 = cp - min4Boundary; | |
1091 if (last0 < 0) { | |
1092 int32_t last1 = cp / final3Count; | |
1093 last0 = cp % final3Count; | |
1094 | |
1095 int32_t last2 = last1 / medialCount; | |
1096 last1 %= medialCount; | |
1097 | |
1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at
start | |
1099 last1 = minTrail + last1; // offset | |
1100 last2 = min3Primary + last2; // offset | |
1101 /* | |
1102 if (last2 >= min4Primary) { | |
1103 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last2)); | |
1104 } | |
1105 */ | |
1106 return (last2 << 24) + (last1 << 16) + (last0 << 8); | |
1107 } else { | |
1108 int32_t last1 = last0 / final4Count; | |
1109 last0 %= final4Count; | |
1110 | |
1111 int32_t last2 = last1 / medialCount; | |
1112 last1 %= medialCount; | |
1113 | |
1114 int32_t last3 = last2 / medialCount; | |
1115 last2 %= medialCount; | |
1116 | |
1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at
start | |
1118 last1 = minTrail + last1; // offset | |
1119 last2 = minTrail + last2; // offset | |
1120 last3 = min4Primary + last3; // offset | |
1121 /* | |
1122 if (last3 > max4Primary) { | |
1123 throw new IllegalArgumentException("4-byte out of range: " + Utility
.hex(cp) + ", " + Utility.hex(last3)); | |
1124 } | |
1125 */ | |
1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; | |
1127 } | |
1128 } | |
1129 | |
1130 static uint32_t U_EXPORT2 | |
1131 uprv_uca_getImplicitPrimary(UChar32 cp) { | |
1132 //fprintf(stdout, "Incoming: %04x\n", cp); | |
1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); | |
1134 | |
1135 cp = swapCJK(cp); | |
1136 cp++; | |
1137 // we now have a range of numbers from 0 to 21FFFF. | |
1138 | |
1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); | |
1140 //fprintf(stdout, "CJK swapped: %04x\n", cp); | |
1141 | |
1142 return uprv_uca_getImplicitFromRaw(cp); | |
1143 } | |
1144 | |
1145 /** | |
1146 * Converts implicit CE into raw integer ("code point") | |
1147 * @param implicit | |
1148 * @return -1 if illegal format | |
1149 */ | |
1150 U_CAPI UChar32 U_EXPORT2 | |
1151 uprv_uca_getRawFromImplicit(uint32_t implicit) { | |
1152 UChar32 result; | |
1153 UChar32 b3 = implicit & 0xFF; | |
1154 UChar32 b2 = (implicit >> 8) & 0xFF; | |
1155 UChar32 b1 = (implicit >> 16) & 0xFF; | |
1156 UChar32 b0 = (implicit >> 24) & 0xFF; | |
1157 | |
1158 // simple parameter checks | |
1159 if (b0 < min3Primary || b0 > max4Primary | |
1160 || b1 < minTrail || b1 > maxTrail) | |
1161 return -1; | |
1162 // normal offsets | |
1163 b1 -= minTrail; | |
1164 | |
1165 // take care of the final values, and compose | |
1166 if (b0 < min4Primary) { | |
1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0) | |
1168 return -1; | |
1169 b2 -= minTrail; | |
1170 UChar32 remainder = b2 % final3Multiplier; | |
1171 if (remainder != 0) | |
1172 return -1; | |
1173 b0 -= min3Primary; | |
1174 b2 /= final3Multiplier; | |
1175 result = ((b0 * medialCount) + b1) * final3Count + b2; | |
1176 } else { | |
1177 if (b2 < minTrail || b2 > maxTrail | |
1178 || b3 < minTrail || b3 > max4Trail) | |
1179 return -1; | |
1180 b2 -= minTrail; | |
1181 b3 -= minTrail; | |
1182 UChar32 remainder = b3 % final4Multiplier; | |
1183 if (remainder != 0) | |
1184 return -1; | |
1185 b3 /= final4Multiplier; | |
1186 b0 -= min4Primary; | |
1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count +
b3 + min4Boundary; | |
1188 } | |
1189 // final check | |
1190 if (result < 0 || result > UCOL_MAX_INPUT) | |
1191 return -1; | |
1192 return result; | |
1193 } | |
1194 | |
1195 | |
/* Quotient a/b rounded up, computed as 1 + (a-1)/b; intended for positive a and b. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int below = (a - 1) / b;
    return below + 1;
}
1199 | |
1200 /* this function is either called from initUCA or from genUCA before | |
1201 * doing canonical closure for the UCA. | |
1202 */ | |
1203 | |
1204 /** | |
1205 * Set up to generate implicits. | |
1206 * Maintenance Note: this function may end up being called more than once, due | |
1207 * to threading races during initialization. Make sure that | |
1208 * none of the Constants is ever transiently assigned an | |
1209 * incorrect value. | |
1210 * @param minPrimary | |
1211 * @param maxPrimary | |
1212 * @param minTrail final byte | |
1213 * @param maxTrail final byte | |
1214 * @param gap3 the gap we leave for tailoring for 3-byte forms | |
1215 * @param gap4 the gap we leave for tailoring for 4-byte forms | |
1216 */ | |
1217 static void initImplicitConstants(int minPrimary, int maxPrimary, | |
1218 int minTrailIn, int maxTrailIn, | |
1219 int gap3, int primaries3count, | |
1220 UErrorCode *status) { | |
1221 // some simple parameter checks | |
1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) | |
1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) | |
1224 || (primaries3count < 1)) | |
1225 { | |
1226 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1227 return; | |
1228 }; | |
1229 | |
1230 minTrail = minTrailIn; | |
1231 maxTrail = maxTrailIn; | |
1232 | |
1233 min3Primary = minPrimary; | |
1234 max4Primary = maxPrimary; | |
1235 // compute constants for use later. | |
1236 // number of values we can use in trailing bytes | |
1237 // leave room for empty values between AND above, e.g. if gap = 2 | |
1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value | |
1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values | |
1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values | |
1241 final3Multiplier = gap3 + 1; | |
1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; | |
1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; | |
1244 | |
1245 // medials can use full range | |
1246 medialCount = (maxTrail - minTrail + 1); | |
1247 // find out how many values fit in each form | |
1248 int32_t threeByteCount = medialCount * final3Count; | |
1249 // now determine where the 3/4 boundary is. | |
1250 // we use 3 bytes below the boundary, and 4 above | |
1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1; | |
1252 int32_t primaries4count = primariesAvailable - primaries3count; | |
1253 | |
1254 | |
1255 int32_t min3ByteCoverage = primaries3count * threeByteCount; | |
1256 min4Primary = minPrimary + primaries3count; | |
1257 min4Boundary = min3ByteCoverage; | |
1258 // Now expand out the multiplier for the 4 bytes, and redo. | |
1259 | |
1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; | |
1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count
); | |
1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCo
unt * medialCount); | |
1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; | |
1264 if (gap4 < 1) { | |
1265 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1266 return; | |
1267 } | |
1268 final4Multiplier = gap4 + 1; | |
1269 final4Count = neededPerFinalByte; | |
1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; | |
1271 } | |
1272 | |
1273 /** | |
1274 * Supply parameters for generating implicit CEs | |
1275 */ | |
1276 U_CAPI void U_EXPORT2 | |
1277 uprv_uca_initImplicitConstants(UErrorCode *status) { | |
1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms
. | |
1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); | |
1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1,
1, status); | |
1281 } | |
1282 | |
1283 | |
1284 /* collIterNormalize Incremental Normalization happens here.
*/ | |
1285 /* pick up the range of chars identifed by FCD,
*/ | |
1286 /* normalize it into the collIterate's writable buffer,
*/ | |
1287 /* switch the collIterate's state to use the writable b
uffer. */ | |
1288 /*
*/ | |
1289 static | |
1290 void collIterNormalize(collIterate *collationSource) | |
1291 { | |
1292 UErrorCode status = U_ZERO_ERROR; | |
1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to nor
malize */ | |
1294 const UChar *endP = collationSource->fcdPosition; /* End of region to norma
lize+1 */ | |
1295 | |
1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP -
srcP)), | |
1297 collationSource->writableBuffer, | |
1298 status); | |
1299 if (U_FAILURE(status)) { | |
1300 #ifdef UCOL_DEBUG | |
1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_erro
rName(status)); | |
1302 #endif | |
1303 return; | |
1304 } | |
1305 | |
1306 collationSource->pos = collationSource->writableBuffer.getTerminatedB
uffer(); | |
1307 collationSource->origFlags = collationSource->flags; | |
1308 collationSource->flags |= UCOL_ITER_INNORMBUF; | |
1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE
_ITERATOR); | |
1310 } | |
1311 | |
1312 | |
1313 // This function takes the iterator and extracts normalized stuff up to the next
boundary | |
1314 // It is similar in the end results to the collIterNormalize, but for the cases
when we | |
1315 // use an iterator | |
1316 /*static | |
1317 inline void normalizeIterator(collIterate *collationSource) { | |
1318 UErrorCode status = U_ZERO_ERROR; | |
1319 UBool wasNormalized = FALSE; | |
1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->ite
rator, UITER_CURRENT); | |
1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iter
ator); | |
1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writa
bleBuffer, | |
1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); | |
1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->w
ritableBufSize) { | |
1325 // reallocate and terminate | |
1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, | |
1327 &collationSource->writableBuffer, | |
1328 (int32_t *)&collationSource->writableBufSize, nor
mLen + 1, | |
1329 0) | |
1330 ) { | |
1331 #ifdef UCOL_DEBUG | |
1332 fprintf(stderr, "normalizeIterator(), out of memory\n"); | |
1333 #endif | |
1334 return; | |
1335 } | |
1336 status = U_ZERO_ERROR; | |
1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITE
R_ZERO); | |
1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &s
tatus); | |
1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuf
fer, | |
1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalize
d, &status); | |
1341 } | |
1342 // Terminate the buffer - we already checked that it is big enough | |
1343 collationSource->writableBuffer[normLen] = 0; | |
1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { | |
1345 collationSource->flags |= UCOL_ITER_ALLOCATED; | |
1346 } | |
1347 collationSource->pos = collationSource->writableBuffer; | |
1348 collationSource->origFlags = collationSource->flags; | |
1349 collationSource->flags |= UCOL_ITER_INNORMBUF; | |
1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_I
TERATOR); | |
1351 }*/ | |
1352 | |
1353 | |
1354 /* Incremental FCD check and normalize
*/ | |
1355 /* Called from getNextCE when normalization state is suspect.
*/ | |
1356 /* When entering, the state is known to be this:
*/ | |
1357 /* o We are working in the main buffer of the collIterate, not the side
*/ | |
1358 /* writable buffer. When in the side buffer, normalization mode is alw
ays off, */ | |
1359 /* so we won't get here.
*/ | |
1360 /* o The leading combining class from the current character is 0 or
*/ | |
1361 /* the trailing combining class of the previous char was zero.
*/ | |
1362 /* True because the previous call to this function will have always exi
ted */ | |
1363 /* that way, and we get called for every char where cc might be non-zer
o. */ | |
1364 static | |
1365 inline UBool collIterFCD(collIterate *collationSource) { | |
1366 const UChar *srcP, *endP; | |
1367 uint8_t leadingCC; | |
1368 uint8_t prevTrailingCC = 0; | |
1369 uint16_t fcd; | |
1370 UBool needNormalize = FALSE; | |
1371 | |
1372 srcP = collationSource->pos-1; | |
1373 | |
1374 if (collationSource->flags & UCOL_ITER_HASLEN) { | |
1375 endP = collationSource->endp; | |
1376 } else { | |
1377 endP = NULL; | |
1378 } | |
1379 | |
1380 // Get the trailing combining class of the current character. If it's zero,
we are OK. | |
1381 fcd = g_nfcImpl->nextFCD16(srcP, endP); | |
1382 if (fcd != 0) { | |
1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1384 | |
1385 if (prevTrailingCC != 0) { | |
1386 // The current char has a non-zero trailing CC. Scan forward until
we find | |
1387 // a char with a leading cc of zero. | |
1388 while (endP == NULL || srcP != endP) | |
1389 { | |
1390 const UChar *savedSrcP = srcP; | |
1391 | |
1392 fcd = g_nfcImpl->nextFCD16(srcP, endP); | |
1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1394 if (leadingCC == 0) { | |
1395 srcP = savedSrcP; // Hit char that is not part of combi
ning sequence. | |
1396 // back up over it. (Could be surr
ogate pair!) | |
1397 break; | |
1398 } | |
1399 | |
1400 if (leadingCC < prevTrailingCC) { | |
1401 needNormalize = TRUE; | |
1402 } | |
1403 | |
1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1405 } | |
1406 } | |
1407 } | |
1408 | |
1409 collationSource->fcdPosition = (UChar *)srcP; | |
1410 | |
1411 return needNormalize; | |
1412 } | |
1413 | |
1414 /****************************************************************************/ | |
1415 /* Following are the CE retrieval functions */ | |
1416 /* */ | |
1417 /****************************************************************************/ | |
1418 | |
1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); | |
1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); | |
1421 | |
1422 /* there should be a macro version of this function in the header file */ | |
1423 /* This is the first function that tries to fetch a collation element */ | |
1424 /* If it's not successful or it encounters a more difficult situation */ | |
1425 /* some more sophisticated and slower functions are invoked */ | |
1426 static | |
1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
rce, UErrorCode *status) { | |
1428 uint32_t order = 0; | |
1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there
any CEs from previous expansions? */ | |
1430 order = *(collationSource->toReturn++); /* if so
, return them */ | |
1431 if(collationSource->CEpos == collationSource->toReturn) { | |
1432 collationSource->CEpos = collationSource->toReturn = collationSource
->extendCEs ? collationSource->extendCEs : collationSource->CEs; | |
1433 } | |
1434 return order; | |
1435 } | |
1436 | |
1437 UChar ch = 0; | |
1438 collationSource->offsetReturn = NULL; | |
1439 | |
1440 do { | |
1441 for (;;) /* Loop handles case when incremental
normalize switches */ | |
1442 { /* to or from the side buffer / ori
ginal string, and we */ | |
1443 /* need to start again to get the next character. */ | |
1444 | |
1445 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBU
F | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) | |
1446 { | |
1447 // The source string is null terminated and we're not working fr
om the side buffer, | |
1448 // and we're not normalizing. This is the fast path. | |
1449 // (We can be in the side buffer for Thai pre-vowel reordering
even when not normalizing.) | |
1450 ch = *collationSource->pos++; | |
1451 if (ch != 0) { | |
1452 break; | |
1453 } | |
1454 else { | |
1455 return UCOL_NO_MORE_CES; | |
1456 } | |
1457 } | |
1458 | |
1459 if (collationSource->flags & UCOL_ITER_HASLEN) { | |
1460 // Normal path for strings when length is specified. | |
1461 // (We can't be in side buffer because it is always null termi
nated.) | |
1462 if (collationSource->pos >= collationSource->endp) { | |
1463 // Ran off of the end of the main source string. We're done
. | |
1464 return UCOL_NO_MORE_CES; | |
1465 } | |
1466 ch = *collationSource->pos++; | |
1467 } | |
1468 else if(collationSource->flags & UCOL_USE_ITERATOR) { | |
1469 UChar32 iterCh = collationSource->iterator->next(collationSource
->iterator); | |
1470 if(iterCh == U_SENTINEL) { | |
1471 return UCOL_NO_MORE_CES; | |
1472 } | |
1473 ch = (UChar)iterCh; | |
1474 } | |
1475 else | |
1476 { | |
1477 // Null terminated string. | |
1478 ch = *collationSource->pos++; | |
1479 if (ch == 0) { | |
1480 // Ran off end of buffer. | |
1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
1482 // Ran off end of main string. backing up one character. | |
1483 collationSource->pos--; | |
1484 return UCOL_NO_MORE_CES; | |
1485 } | |
1486 else | |
1487 { | |
1488 // Hit null in the normalize side buffer. | |
1489 // Usually this means the end of the normalized data, | |
1490 // except for one odd case: a null followed by combining
chars, | |
1491 // which is the case if we are at the start of the buf
fer. | |
1492 if (collationSource->pos == collationSource->writableBuf
fer.getBuffer()+1) { | |
1493 break; | |
1494 } | |
1495 | |
1496 // Null marked end of side buffer. | |
1497 // Revert to the main string and | |
1498 // loop back to top to try again to get a character. | |
1499 collationSource->pos = collationSource->fcdPosition; | |
1500 collationSource->flags = collationSource->origFlags; | |
1501 continue; | |
1502 } | |
1503 } | |
1504 } | |
1505 | |
1506 if(collationSource->flags&UCOL_HIRAGANA_Q) { | |
1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set
the flag | |
1508 * based on whether the previous codepoint was Hiragana or Katak
ana. | |
1509 */ | |
1510 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)
) || | |
1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >=
0x3099 && ch <= 0x309C))) { | |
1512 collationSource->flags |= UCOL_WAS_HIRAGANA; | |
1513 } else { | |
1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA; | |
1515 } | |
1516 } | |
1517 | |
1518 // We've got a character. See if there's any fcd and/or normalizati
on stuff to do. | |
1519 // Note that UCOL_ITER_NORM flag is always zero when we are in th
e side buffer. | |
1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) { | |
1521 break; | |
1522 } | |
1523 | |
1524 if (collationSource->fcdPosition >= collationSource->pos) { | |
1525 // An earlier FCD check has already covered the current characte
r. | |
1526 // We can go ahead and process this char. | |
1527 break; | |
1528 } | |
1529 | |
1530 if (ch < ZERO_CC_LIMIT_ ) { | |
1531 // Fast fcd safe path. Trailing combining class == 0. This cha
r is OK. | |
1532 break; | |
1533 } | |
1534 | |
1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
1536 // We need to peek at the next character in order to tell if we
are FCD | |
1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSour
ce->pos >= collationSource->endp) { | |
1538 // We are at the last char of source string. | |
1539 // It is always OK for FCD check. | |
1540 break; | |
1541 } | |
1542 | |
1543 // Not at last char of source string (or we'll check against ter
minating null). Do the FCD fast test | |
1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
1545 break; | |
1546 } | |
1547 } | |
1548 | |
1549 | |
1550 // Need a more complete FCD check and possible normalization. | |
1551 if (collIterFCD(collationSource)) { | |
1552 collIterNormalize(collationSource); | |
1553 } | |
1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
1555 // No normalization was needed. Go ahead and process the char
we already had. | |
1556 break; | |
1557 } | |
1558 | |
1559 // Some normalization happened. Next loop iteration will pick up a
char | |
1560 // from the normalization buffer. | |
1561 | |
1562 } // end for (;;) | |
1563 | |
1564 | |
1565 if (ch <= 0xFF) { | |
1566 /* For latin-1 characters we never need to fall back to the UCA tab
le */ | |
1567 /* because all of the UCA data is replicated in the latinOneMappi
ng array */ | |
1568 order = coll->latinOneMapping[ch]; | |
1569 if (order > UCOL_NOT_FOUND) { | |
1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); | |
1571 } | |
1572 } | |
1573 else | |
1574 { | |
1575 // Always use UCA for Han, Hangul | |
1576 // (Han extension A is before main Han block) | |
1577 // **** Han compatibility chars ?? **** | |
1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { | |
1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { | |
1581 // between the two target ranges; do normal lookup | |
1582 // **** this range is YI, Modifier tone letters, **** | |
1583 // **** Latin-D, Syloti Nagari, Phagas-pa. **** | |
1584 // **** Latin-D might be tailored, so we need to **** | |
1585 // **** do the normal lookup for these guys. **** | |
1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
1587 } else { | |
1588 // in one of the target ranges; use UCA | |
1589 order = UCOL_NOT_FOUND; | |
1590 } | |
1591 } else { | |
1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
1593 } | |
1594 | |
1595 if(order > UCOL_NOT_FOUND) { /
* if a CE is special */ | |
1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); /* and try to get the special CE */ | |
1597 } | |
1598 | |
1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a g
ood CE in the tailoring */ | |
1600 /* if we got here, the codepoint MUST be over 0xFF - so we look
directly in the trie */ | |
1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
1602 | |
1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE *
/ | |
1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collatio
nSource, status); | |
1605 } | |
1606 } | |
1607 } | |
1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_L
AST_HANGUL ); | |
1609 | |
1610 if(order == UCOL_NOT_FOUND) { | |
1611 order = getImplicit(ch, collationSource); | |
1612 } | |
1613 return order; /* return the CE */ | |
1614 } | |
1615 | |
1616 /* ucol_getNextCE, out-of-line version for use from other files. */ | |
1617 U_CAPI uint32_t U_EXPORT2 | |
1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *
status) { | |
1619 return ucol_IGetNextCE(coll, collationSource, status); | |
1620 } | |
1621 | |
1622 | |
1623 /** | |
1624 * Incremental previous normalization happens here. Pick up the range of chars | |
1625 * identifed by FCD, normalize it into the collIterate's writable buffer, | |
1626 * switch the collIterate's state to use the writable buffer. | |
1627 * @param data collation iterator data | |
1628 */ | |
1629 static | |
1630 void collPrevIterNormalize(collIterate *data) | |
1631 { | |
1632 UErrorCode status = U_ZERO_ERROR; | |
1633 const UChar *pEnd = data->pos; /* End normalize + 1 */ | |
1634 const UChar *pStart; | |
1635 | |
1636 /* Start normalize */ | |
1637 if (data->fcdPosition == NULL) { | |
1638 pStart = data->string; | |
1639 } | |
1640 else { | |
1641 pStart = data->fcdPosition + 1; | |
1642 } | |
1643 | |
1644 int32_t normLen = | |
1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pSta
rt) + 1)), | |
1646 data->writableBuffer, | |
1647 status). | |
1648 length(); | |
1649 if(U_FAILURE(status)) { | |
1650 return; | |
1651 } | |
1652 /* | |
1653 this puts the null termination infront of the normalized string instead | |
1654 of the end | |
1655 */ | |
1656 data->writableBuffer.insert(0, (UChar)0); | |
1657 | |
1658 /* | |
1659 * The usual case at this point is that we've got a base | |
1660 * character followed by marks that were normalized. If | |
1661 * fcdPosition is NULL, that means that we backed up to | |
1662 * the beginning of the string and there's no base character. | |
1663 * | |
1664 * Forward processing will usually normalize when it sees | |
1665 * the first mark, so that mark will get it's natural offset | |
1666 * and the rest will get the offset of the character following | |
1667 * the marks. The base character will also get its natural offset. | |
1668 * | |
1669 * We write the offset of the base character, if there is one, | |
1670 * followed by the offset of the first mark and then the offsets | |
1671 * of the rest of the marks. | |
1672 */ | |
1673 int32_t firstMarkOffset = 0; | |
1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); | |
1675 int32_t trailCount = normLen - 1; | |
1676 | |
1677 if (data->fcdPosition != NULL) { | |
1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); | |
1679 UChar baseChar = *data->fcdPosition; | |
1680 | |
1681 firstMarkOffset = baseOffset + 1; | |
1682 | |
1683 /* | |
1684 * If the base character is the start of a contraction, forward processi
ng | |
1685 * will normalize the marks while checking for the contraction, which me
ans | |
1686 * that the offset of the first mark will the same as the other marks. | |
1687 * | |
1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** | |
1689 */ | |
1690 if (baseChar >= 0x100) { | |
1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, bas
eChar); | |
1692 | |
1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { | |
1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, bas
eChar); | |
1695 } | |
1696 | |
1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION
_TAG) { | |
1698 firstMarkOffset = trailOffset; | |
1699 } | |
1700 } | |
1701 | |
1702 data->appendOffset(baseOffset, status); | |
1703 } | |
1704 | |
1705 data->appendOffset(firstMarkOffset, status); | |
1706 | |
1707 for (int32_t i = 0; i < trailCount; i += 1) { | |
1708 data->appendOffset(trailOffset, status); | |
1709 } | |
1710 | |
1711 data->offsetRepeatValue = trailOffset; | |
1712 | |
1713 data->offsetReturn = data->offsetStore - 1; | |
1714 if (data->offsetReturn == data->offsetBuffer) { | |
1715 data->offsetStore = data->offsetBuffer; | |
1716 } | |
1717 | |
1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; | |
1719 data->origFlags = data->flags; | |
1720 data->flags |= UCOL_ITER_INNORMBUF; | |
1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
1722 } | |
1723 | |
1724 | |
1725 /** | |
1726 * Incremental FCD check for previous iteration and normalize. Called from | |
1727 * getPrevCE when normalization state is suspect. | |
1728 * When entering, the state is known to be this: | |
1729 * o We are working in the main buffer of the collIterate, not the side | |
1730 * writable buffer. When in the side buffer, normalization mode is always | |
1731 * off, so we won't get here. | |
1732 * o The leading combining class from the current character is 0 or the | |
1733 * trailing combining class of the previous char was zero. | |
1734 * True because the previous call to this function will have always exited | |
1735 * that way, and we get called for every char where cc might be non-zero. | |
1736 * @param data collation iterate struct | |
1737 * @return normalization status, TRUE for normalization to be done, FALSE | |
1738 * otherwise | |
1739 */ | |
1740 static | |
1741 inline UBool collPrevIterFCD(collIterate *data) | |
1742 { | |
1743 const UChar *src, *start; | |
1744 uint8_t leadingCC; | |
1745 uint8_t trailingCC = 0; | |
1746 uint16_t fcd; | |
1747 UBool result = FALSE; | |
1748 | |
1749 start = data->string; | |
1750 src = data->pos + 1; | |
1751 | |
1752 /* Get the trailing combining class of the current character. */ | |
1753 fcd = g_nfcImpl->previousFCD16(start, src); | |
1754 | |
1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1756 | |
1757 if (leadingCC != 0) { | |
1758 /* | |
1759 The current char has a non-zero leading combining class. | |
1760 Scan backward until we find a char with a trailing cc of zero. | |
1761 */ | |
1762 for (;;) | |
1763 { | |
1764 if (start == src) { | |
1765 data->fcdPosition = NULL; | |
1766 return result; | |
1767 } | |
1768 | |
1769 fcd = g_nfcImpl->previousFCD16(start, src); | |
1770 | |
1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); | |
1772 | |
1773 if (trailingCC == 0) { | |
1774 break; | |
1775 } | |
1776 | |
1777 if (leadingCC < trailingCC) { | |
1778 result = TRUE; | |
1779 } | |
1780 | |
1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); | |
1782 } | |
1783 } | |
1784 | |
1785 data->fcdPosition = (UChar *)src; | |
1786 | |
1787 return result; | |
1788 } | |
1789 | |
1790 /** gets a code unit from the string at a given offset | |
1791 * Handles both normal and iterative cases. | |
1792 * No error checking - caller beware! | |
1793 */ | |
1794 static inline | |
1795 UChar peekCodeUnit(collIterate *source, int32_t offset) { | |
1796 if(source->pos != NULL) { | |
1797 return *(source->pos + offset); | |
1798 } else if(source->iterator != NULL) { | |
1799 UChar32 c; | |
1800 if(offset != 0) { | |
1801 source->iterator->move(source->iterator, offset, UITER_CURRENT); | |
1802 c = source->iterator->next(source->iterator); | |
1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); | |
1804 } else { | |
1805 c = source->iterator->current(source->iterator); | |
1806 } | |
1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we
should never see c<0. | |
1808 } else { | |
1809 return 0xfffd; | |
1810 } | |
1811 } | |
1812 | |
1813 // Code point version. Treats the offset as a _code point_ delta. | |
1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-for
med UTF-16. | |
1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limi
t of the buffer. | |
1816 static inline | |
1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) { | |
1818 UChar32 c; | |
1819 if(source->pos != NULL) { | |
1820 const UChar *p = source->pos; | |
1821 if(offset >= 0) { | |
1822 // Skip forward over (offset-1) code points. | |
1823 while(--offset >= 0) { | |
1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { | |
1825 ++p; | |
1826 } | |
1827 } | |
1828 // Read the code point there. | |
1829 c = *p++; | |
1830 UChar trail; | |
1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { | |
1832 c = U16_GET_SUPPLEMENTARY(c, trail); | |
1833 } | |
1834 } else /* offset<0 */ { | |
1835 // Skip backward over (offset-1) code points. | |
1836 while(++offset < 0) { | |
1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { | |
1838 --p; | |
1839 } | |
1840 } | |
1841 // Read the code point before that. | |
1842 c = *--p; | |
1843 UChar lead; | |
1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { | |
1845 c = U16_GET_SUPPLEMENTARY(lead, c); | |
1846 } | |
1847 } | |
1848 } else if(source->iterator != NULL) { | |
1849 if(offset >= 0) { | |
1850 // Skip forward over (offset-1) code points. | |
1851 int32_t fwd = offset; | |
1852 while(fwd-- > 0) { | |
1853 uiter_next32(source->iterator); | |
1854 } | |
1855 // Read the code point there. | |
1856 c = uiter_current32(source->iterator); | |
1857 // Return to the starting point, skipping backward over (offset-1) c
ode points. | |
1858 while(offset-- > 0) { | |
1859 uiter_previous32(source->iterator); | |
1860 } | |
1861 } else /* offset<0 */ { | |
1862 // Read backward, reading offset code points, remember only the last
-read one. | |
1863 int32_t back = offset; | |
1864 do { | |
1865 c = uiter_previous32(source->iterator); | |
1866 } while(++back < 0); | |
1867 // Return to the starting position, skipping forward over offset cod
e points. | |
1868 do { | |
1869 uiter_next32(source->iterator); | |
1870 } while(++offset < 0); | |
1871 } | |
1872 } else { | |
1873 c = U_SENTINEL; | |
1874 } | |
1875 return c; | |
1876 } | |
1877 | |
1878 /** | |
1879 * Determines if we are at the start of the data string in the backwards | |
1880 * collation iterator | |
1881 * @param data collation iterator | |
1882 * @return TRUE if we are at the start | |
1883 */ | |
1884 static | |
1885 inline UBool isAtStartPrevIterate(collIterate *data) { | |
1886 if(data->pos == NULL && data->iterator != NULL) { | |
1887 return !data->iterator->hasPrevious(data->iterator); | |
1888 } | |
1889 //return (collIter_bos(data)) || | |
1890 return (data->pos == data->string) || | |
1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && | |
1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL); | |
1893 } | |
1894 | |
1895 static | |
1896 inline void goBackOne(collIterate *data) { | |
1897 # if 0 | |
1898 // somehow, it looks like we need to keep iterator synced up | |
1899 // at all times, as above. | |
1900 if(data->pos) { | |
1901 data->pos--; | |
1902 } | |
1903 if(data->iterator) { | |
1904 data->iterator->previous(data->iterator); | |
1905 } | |
1906 #endif | |
1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { | |
1908 data->iterator->previous(data->iterator); | |
1909 } | |
1910 if(data->pos) { | |
1911 data->pos --; | |
1912 } | |
1913 } | |
1914 | |
1915 /** | |
1916 * Inline function that gets a simple CE. | |
1917 * So what it does is that it will first check the expansion buffer. If the | |
1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer | |
1919 * is different from the string pointer, we return the collation element at the | |
1920 * return pointer and decrement it. | |
1921 * For more complicated CEs it resorts to getComplicatedCE. | |
1922 * @param coll collator data | |
1923 * @param data collation iterator struct | |
1924 * @param status error status | |
1925 */ | |
/*
 * Summary of one backward step as implemented below:
 *  - the offset bookkeeping (offsetReturn / offsetRepeatCount) is advanced first;
 *  - if CEs are still buffered (toReturn is past the start of CEs, or of
 *    extendCEs when that is set), the previous buffered CE is returned;
 *  - otherwise the previous code unit is read from the string, from the
 *    UCharIterator, or from the normalization side buffer; when UCOL_ITER_NORM
 *    is set an incremental FCD check / normalization is done; then the CE is
 *    looked up in coll's own mapping and, if not found there, in coll->UCA.
 * Returns UCOL_NO_MORE_CES when the start of the text is reached, and the
 * result of getPrevImplicit() when no mapping was found at all.
 */
1926 static | |
1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, | |
1928 UErrorCode *status) | |
1929 { | |
1930 uint32_t result = (uint32_t)UCOL_NULLORDER; | |
1931 | |
/* Step the offset bookkeeping back by one entry (or consume one repeat). */
1932 if (data->offsetReturn != NULL) { | |
1933 if (data->offsetRepeatCount > 0) { | |
1934 data->offsetRepeatCount -= 1; | |
1935 } else { | |
1936 if (data->offsetReturn == data->offsetBuffer) { | |
1937 data->offsetReturn = NULL; | |
1938 data->offsetStore = data->offsetBuffer; | |
1939 } else { | |
1940 data->offsetReturn -= 1; | |
1941 } | |
1942 } | |
1943 } | |
1944 | |
/* Buffered CEs are still pending: hand them out from last to first. */
1945 if ((data->extendCEs && data->toReturn > data->extendCEs) || | |
1946 (!data->extendCEs && data->toReturn > data->CEs)) | |
1947 { | |
1948 data->toReturn -= 1; | |
1949 result = *(data->toReturn); | |
/* The last buffered CE was just taken: reset CEpos to the start of the buffer. */
1950 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { | |
1951 data->CEpos = data->toReturn; | |
1952 } | |
1953 } | |
1954 else { | |
1955 UChar ch = 0; | |
1956 | |
1957 do { | |
1958 /* | |
1959 Loop handles case when incremental normalize switches to or from the | |
1960 side buffer / original string, and we need to start again to get the | |
1961 next character. | |
1962 */ | |
1963 for (;;) { | |
1964 if (data->flags & UCOL_ITER_HASLEN) { | |
1965 /* | |
1966 Normal path for strings when length is specified. | |
1967 Not in side buffer because it is always null terminated. | |
1968 */ | |
1969 if (data->pos <= data->string) { | |
1970 /* End of the main source string */ | |
1971 return UCOL_NO_MORE_CES; | |
1972 } | |
1973 data->pos --; | |
1974 ch = *data->pos; | |
1975 } | |
1976 // we are using an iterator to go back. Pray for us! | |
1977 else if (data->flags & UCOL_USE_ITERATOR) { | |
1978 UChar32 iterCh = data->iterator->previous(data->iterator); | |
1979 if(iterCh == U_SENTINEL) { | |
1980 return UCOL_NO_MORE_CES; | |
1981 } else { | |
1982 ch = (UChar)iterCh; | |
1983 } | |
1984 } | |
1985 else { | |
1986 data->pos --; | |
1987 ch = *data->pos; | |
1988 /* we are in the side buffer. */ | |
1989 if (ch == 0) { | |
1990 /* | |
1991 At the start of the normalize side buffer. | |
1992 Go back to string. | |
1993 Because pointer points to the last accessed character, | |
1994 hence we have to increment it by one here. | |
1995 */ | |
1996 data->flags = data->origFlags; | |
1997 data->offsetRepeatValue = 0; | |
1998 | |
1999 if (data->fcdPosition == NULL) { | |
2000 data->pos = data->string; | |
2001 return UCOL_NO_MORE_CES; | |
2002 } | |
2003 else { | |
2004 data->pos = data->fcdPosition + 1; | |
2005 } | |
2006 | |
2007 continue; | |
2008 } | |
2009 } | |
2010 | |
/* Track whether the code unit just read lies in U+3040..U+309F. */
2011 if(data->flags&UCOL_HIRAGANA_Q) { | |
2012 if(ch>=0x3040 && ch<=0x309f) { | |
2013 data->flags |= UCOL_WAS_HIRAGANA; | |
2014 } else { | |
2015 data->flags &= ~UCOL_WAS_HIRAGANA; | |
2016 } | |
2017 } | |
2018 | |
2019 /* | |
2020 * got a character to determine if there's fcd and/or normalizati
on | |
2021 * stuff to do. | |
2022 * if the current character is not fcd. | |
2023 * if current character is at the start of the string | |
2024 * Trailing combining class == 0. | |
2025 * Note if pos is in the writablebuffer, norm is always 0 | |
2026 */ | |
2027 if (ch < ZERO_CC_LIMIT_ || | |
2028 // this should propel us out of the loop in the iterator case | |
2029 (data->flags & UCOL_ITER_NORM) == 0 || | |
2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos
) | |
2031 || data->string == data->pos) { | |
2032 break; | |
2033 } | |
2034 | |
2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
2036 /* if next character is FCD */ | |
/* NOTE(review): data->string == data->pos already breaks out in the condition above, so the next test looks unreachable - confirm. */
2037 if (data->pos == data->string) { | |
2038 /* First char of string is always OK for FCD check */ | |
2039 break; | |
2040 } | |
2041 | |
2042 /* Not first char of string, do the FCD fast test */ | |
2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
2044 break; | |
2045 } | |
2046 } | |
2047 | |
2048 /* Need a more complete FCD check and possible normalization. */ | |
2049 if (collPrevIterFCD(data)) { | |
2050 collPrevIterNormalize(data); | |
2051 } | |
2052 | |
2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
2054 /* No normalization. Go ahead and process the char. */ | |
2055 break; | |
2056 } | |
2057 | |
2058 /* | |
2059 Some normalization happened. | |
2060 Next loop picks up a char from the normalization buffer. | |
2061 */ | |
2062 } | |
2063 | |
2064 /* attempt to handle contractions, after removal of the backwards | |
2065 contraction | |
2066 */ | |
2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data))
{ | |
2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, d
ata, status); | |
2069 } else { | |
/* Code units up to 0xFF are looked up in latinOneMapping instead of the trie. */
2070 if (ch <= 0xFF) { | |
2071 result = coll->latinOneMapping[ch]; | |
2072 } | |
2073 else { | |
2074 // Always use UCA for [3400..9FFF], [AC00..D7AF] | |
2075 // **** [FA0E..FA2F] ?? **** | |
/* NOTE(review): the literals below presumably equal UCOL_FIRST_HAN_A, UCOL_LAST_HAN, UCOL_FIRST_HANGUL and UCOL_LAST_HANGUL - confirm against their definitions. */
2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
2077 (ch >= 0x3400 && ch <= 0xD7AF)) { | |
2078 if (ch > 0x9FFF && ch < 0xAC00) { | |
2079 // between the two target ranges; do normal lookup | |
2080 // **** this range is YI, Modifier tone letters, ***
* | |
2081 // **** Latin-D, Syloti Nagari, Phagas-pa. ***
* | |
2082 // **** Latin-D might be tailored, so we need to ***
* | |
2083 // **** do the normal lookup for these guys. ***
* | |
2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
2085 } else { | |
2086 result = UCOL_NOT_FOUND; | |
2087 } | |
2088 } else { | |
2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
2090 } | |
2091 } | |
/* Values above UCOL_NOT_FOUND are special CEs that need further resolution. */
2092 if (result > UCOL_NOT_FOUND) { | |
2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, s
tatus); | |
2094 } | |
2095 if (result == UCOL_NOT_FOUND) { // Not found in master list | |
2096 if (!isAtStartPrevIterate(data) && | |
2097 ucol_contractionEndCP(ch, data->coll)) | |
2098 { | |
2099 result = UCOL_CONTRACTION; | |
2100 } else { | |
2101 if(coll->UCA) { | |
2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping,
ch); | |
2103 } | |
2104 } | |
2105 | |
2106 if (result > UCOL_NOT_FOUND) { | |
2107 if(coll->UCA) { | |
2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, re
sult, data, status); | |
2109 } | |
2110 } | |
2111 } | |
2112 } | |
/* Loop again when a code unit in the Hangul range produced an ignorable CE. */
2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= U
COL_LAST_HANGUL ); | |
2114 | |
/* No mapping found anywhere: fall back to getPrevImplicit(). */
2115 if(result == UCOL_NOT_FOUND) { | |
2116 result = getPrevImplicit(ch, data); | |
2117 } | |
2118 } | |
2119 | |
2120 return result; | |
2121 } | |
2122 | |
2123 | |
2124 /* ucol_getPrevCE, out-of-line version for use from other files. */ | |
2125 U_CFUNC uint32_t U_EXPORT2 | |
2126 ucol_getPrevCE(const UCollator *coll, collIterate *data, | |
2127 UErrorCode *status) { | |
2128 return ucol_IGetPrevCE(coll, data, status); | |
2129 } | |
2130 | |
2131 | |
2132 /* this should be connected to special Jamo handling */ | |
2133 U_CFUNC uint32_t U_EXPORT2 | |
2134 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { | |
2135 collIterate colIt; | |
2136 IInit_collIterate(coll, &u, 1, &colIt, status); | |
2137 if(U_FAILURE(*status)) { | |
2138 return 0; | |
2139 } | |
2140 return ucol_IGetNextCE(coll, &colIt, status); | |
2141 } | |
2142 | |
2143 /** | |
2144 * Inserts the argument character into the end of the buffer pushing back the | |
2145 * null terminator. | |
2146 * @param data collIterate struct data | |
2147 * @param ch character to be appended | |
2148 * @return the position of the new addition | |
2149 */ | |
2150 static | |
2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) | |
2152 { | |
2153 int32_t oldLength = data->writableBuffer.length(); | |
2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; | |
2155 } | |
2156 | |
2157 /** | |
2158 * Inserts the argument string into the end of the buffer pushing back the | |
2159 * null terminator. | |
2160 * @param data collIterate struct data | |
2161 * @param string to be appended | |
2162 * @param length of the string to be appended | |
2163 * @return the position of the new addition | |
2164 */ | |
2165 static | |
2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_
t length) | |
2167 { | |
2168 int32_t oldLength = data->writableBuffer.length(); | |
2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldL
ength; | |
2170 } | |
2171 | |
2172 /** | |
2173 * Special normalization function for contraction in the forwards iterator. | |
2174 * This normalization sequence will place the current character at source->pos | |
2175 * and its following normalized sequence into the buffer. | |
2176 * The fcd position, pos will be changed. | |
2177 * pos will now point to positions in the buffer. | |
2178 * Flags will be changed accordingly. | |
2179 * @param data collation iterator data | |
2180 */ | |
2181 static | |
2182 inline void normalizeNextContraction(collIterate *data) | |
2183 { | |
2184 int32_t strsize; | |
2185 UErrorCode status = U_ZERO_ERROR; | |
2186 /* because the pointer points to the next character */ | |
2187 const UChar *pStart = data->pos - 1; | |
2188 const UChar *pEnd; | |
2189 | |
2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
2191 data->writableBuffer.setTo(*(pStart - 1)); | |
2192 strsize = 1; | |
2193 } | |
2194 else { | |
2195 strsize = data->writableBuffer.length(); | |
2196 } | |
2197 | |
2198 pEnd = data->fcdPosition; | |
2199 | |
2200 data->writableBuffer.append( | |
2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), status)); | |
2202 if(U_FAILURE(status)) { | |
2203 return; | |
2204 } | |
2205 | |
2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; | |
2207 data->origFlags = data->flags; | |
2208 data->flags |= UCOL_ITER_INNORMBUF; | |
2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
2210 } | |
2211 | |
2212 /** | |
2213 * Contraction character management function that returns the next character | |
2214 * for the forwards iterator. | |
2215 * Does nothing if the next character is in buffer and not the first character | |
2216 * in it. | |
2217 * Else it checks next character in data string to see if it is normalizable. | |
2218 * If it is not, the character is simply copied into the buffer, else | |
2219 * the whole normalized substring is copied into the buffer, including the | |
2220 * current character. | |
2221 * @param data collation element iterator data | |
2222 * @return next character | |
2223 */ | |
2224 static | |
2225 inline UChar getNextNormalizedChar(collIterate *data) | |
2226 { | |
2227 UChar nextch; | |
2228 UChar ch; | |
2229 // Here we need to add the iterator code. One problem is the way | |
2230 // end of string is handled. If we just return next char, it could | |
2231 // be the sentinel. Most of the cases already check for this, but we | |
2232 // need to be sure. | |
2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { | |
2234 /* if no normalization and not in buffer. */ | |
2235 if(data->flags & UCOL_USE_ITERATOR) { | |
2236 return (UChar)data->iterator->next(data->iterator); | |
2237 } else { | |
2238 return *(data->pos ++); | |
2239 } | |
2240 } | |
2241 | |
2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { | |
2243 //normalizeIterator(data); | |
2244 //} | |
2245 | |
2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); | |
2247 if ((innormbuf && *data->pos != 0) || | |
2248 (data->fcdPosition != NULL && !innormbuf && | |
2249 data->pos < data->fcdPosition)) { | |
2250 /* | |
2251 if next character is in normalized buffer, no further normalization | |
2252 is required | |
2253 */ | |
2254 return *(data->pos ++); | |
2255 } | |
2256 | |
2257 if (data->flags & UCOL_ITER_HASLEN) { | |
2258 /* in data string */ | |
2259 if (data->pos + 1 == data->endp) { | |
2260 return *(data->pos ++); | |
2261 } | |
2262 if (data->pos >= data->endp) { | |
2263 return (UChar) -1; // return U+FFFF (non-char) to indicate an error | |
2264 } | |
2265 } | |
2266 else { | |
2267 if (innormbuf) { | |
2268 // inside the normalization buffer, but at the end | |
2269 // (since we encountered zero). This means, in the | |
2270 // case we're using char iterator, that we need to | |
2271 // do another round of normalization. | |
2272 //if(data->origFlags & UCOL_USE_ITERATOR) { | |
2273 // we need to restore original flags, | |
2274 // otherwise, we'll lose them | |
2275 //data->flags = data->origFlags; | |
2276 //normalizeIterator(data); | |
2277 //return *(data->pos++); | |
2278 //} else { | |
2279 /* | |
2280 in writable buffer, at this point fcdPosition can not be | |
2281 pointing to the end of the data string. see contracting tag. | |
2282 */ | |
2283 if(data->fcdPosition) { | |
2284 if (*(data->fcdPosition + 1) == 0 || | |
2285 data->fcdPosition + 1 == data->endp) { | |
2286 /* at the end of the string, dump it into the normalizer */ | |
2287 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; | |
2288 // Check if data->pos received a null pointer | |
2289 if (data->pos == NULL) { | |
2290 return (UChar)-1; // Return to indicate error. | |
2291 } | |
2292 return *(data->fcdPosition ++); | |
2293 } | |
2294 data->pos = data->fcdPosition; | |
2295 } else if(data->origFlags & UCOL_USE_ITERATOR) { | |
2296 // if we are here, we're using a normalizing iterator. | |
2297 // we should just continue further. | |
2298 data->flags = data->origFlags; | |
2299 data->pos = NULL; | |
2300 return (UChar)data->iterator->next(data->iterator); | |
2301 } | |
2302 //} | |
2303 } | |
2304 else { | |
2305 if (*(data->pos + 1) == 0) { | |
2306 return *(data->pos ++); | |
2307 } | |
2308 } | |
2309 } | |
2310 | |
2311 ch = *data->pos ++; | |
2312 nextch = *data->pos; | |
2313 | |
2314 /* | |
2315 * if the current character is not fcd. | |
2316 * Trailing combining class == 0. | |
2317 */ | |
2318 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && | |
2319 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || | |
2320 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { | |
2321 /* | |
2322 Need a more complete FCD check and possible normalization. | |
2323 normalize substring will be appended to buffer | |
2324 */ | |
2325 if (collIterFCD(data)) { | |
2326 normalizeNextContraction(data); | |
2327 return *(data->pos ++); | |
2328 } | |
2329 else if (innormbuf) { | |
2330 /* fcdposition shifted even when there's no normalization, if we | |
2331 don't input the rest into this, we'll get the wrong position when | |
2332 we reach the end of the writableBuffer */ | |
2333 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); | |
2334 data->pos = insertBufferEnd(data, data->pos - 1, length); | |
2335 // Check if data->pos received a null pointer | |
2336 if (data->pos == NULL) { | |
2337 return (UChar)-1; // Return to indicate error. | |
2338 } | |
2339 return *(data->pos ++); | |
2340 } | |
2341 } | |
2342 | |
2343 if (innormbuf) { | |
2344 /* | |
2345 no normalization is to be done hence only one character will be | |
2346 appended to the buffer. | |
2347 */ | |
2348 data->pos = insertBufferEnd(data, ch) + 1; | |
2349 // Check if data->pos received a null pointer | |
2350 if (data->pos == NULL) { | |
2351 return (UChar)-1; // Return to indicate error. | |
2352 } | |
2353 } | |
2354 | |
2355 /* points back to the pos in string */ | |
2356 return ch; | |
2357 } | |
2358 | |
2359 | |
2360 | |
2361 /** | |
2362 * Function to copy the buffer into writableBuffer and sets the fcd position to | |
2363 * the correct position | |
2364 * @param source data string source | |
2365 * @param buffer character buffer | |
2366 */ | |
2367 static | |
2368 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &b
uffer) | |
2369 { | |
2370 /* okay confusing part here. to ensure that the skipped characters are | |
2371 considered later, we need to place it in the appropriate position in the | |
2372 normalization buffer and reassign the pos pointer. simple case if pos | |
2373 reside in string, simply copy to normalization buffer and | |
2374 fcdposition = pos, pos = start of normalization buffer. if pos in | |
2375 normalization buffer, we'll insert the copy infront of pos and point pos | |
2376 to the start of the normalization buffer. why am i doing these copies? | |
2377 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecial
CE does | |
2378 not require any changes, which be really painful. */ | |
2379 if (source->flags & UCOL_ITER_INNORMBUF) { | |
2380 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer()
; | |
2381 source->writableBuffer.replace(0, replaceLength, buffer); | |
2382 } | |
2383 else { | |
2384 source->fcdPosition = source->pos; | |
2385 source->origFlags = source->flags; | |
2386 source->flags |= UCOL_ITER_INNORMBUF; | |
2387 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_IT
ERATOR); | |
2388 source->writableBuffer = buffer; | |
2389 } | |
2390 | |
2391 source->pos = source->writableBuffer.getTerminatedBuffer(); | |
2392 } | |
2393 | |
2394 /** | |
2395 * Function to get the discontiguos collation element within the source. | |
2396 * Note this function will set the position to the appropriate places. | |
2397 * @param coll current collator used | |
2398 * @param source data string source | |
2399 * @param constart index to the start character in the contraction table | |
2400 * @return discontiguos collation element offset | |
2401 */ | |
2402 static | |
2403 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, | |
2404 const UChar *constart) | |
2405 { | |
2406 /* source->pos currently points to the second combining character after | |
2407 the start character */ | |
2408 const UChar *temppos = source->pos; | |
2409 UnicodeString buffer; | |
2410 const UChar *tempconstart = constart; | |
2411 uint8_t tempflags = source->flags; | |
2412 UBool multicontraction = FALSE; | |
2413 collIterateState discState; | |
2414 | |
2415 backupState(source, &discState); | |
2416 | |
2417 buffer.setTo(peekCodePoint(source, -1)); | |
2418 for (;;) { | |
2419 UChar *UCharOffset; | |
2420 UChar schar, | |
2421 tchar; | |
2422 uint32_t result; | |
2423 | |
2424 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) | |
2425 || (peekCodeUnit(source, 0) == 0 && | |
2426 //|| (*source->pos == 0 && | |
2427 ((source->flags & UCOL_ITER_INNORMBUF) == 0 || | |
2428 source->fcdPosition == NULL || | |
2429 source->fcdPosition == source->endp || | |
2430 *(source->fcdPosition) == 0 || | |
2431 u_getCombiningClass(*(source->fcdPosition)) == 0)) || | |
2432 /* end of string in null terminated string or stopped by a | |
2433 null character, note fcd does not always point to a base | |
2434 character after the discontiguos change */ | |
2435 u_getCombiningClass(peekCodePoint(source, 0)) == 0) { | |
2436 //u_getCombiningClass(*(source->pos)) == 0) { | |
2437 //constart = (UChar *)coll->image + getContractOffset(CE); | |
2438 if (multicontraction) { | |
2439 source->pos = temppos - 1; | |
2440 setDiscontiguosAttribute(source, buffer); | |
2441 return *(coll->contractionCEs + | |
2442 (tempconstart - coll->contractionIndex)); | |
2443 } | |
2444 constart = tempconstart; | |
2445 break; | |
2446 } | |
2447 | |
2448 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/ | |
2449 schar = getNextNormalizedChar(source); | |
2450 | |
2451 while (schar > (tchar = *UCharOffset)) { | |
2452 UCharOffset++; | |
2453 } | |
2454 | |
2455 if (schar != tchar) { | |
2456 /* not the correct codepoint. we stuff the current codepoint into | |
2457 the discontiguos buffer and try the next character */ | |
2458 buffer.append(schar); | |
2459 continue; | |
2460 } | |
2461 else { | |
2462 if (u_getCombiningClass(schar) == | |
2463 u_getCombiningClass(peekCodePoint(source, -2))) { | |
2464 buffer.append(schar); | |
2465 continue; | |
2466 } | |
2467 result = *(coll->contractionCEs + | |
2468 (UCharOffset - coll->contractionIndex)); | |
2469 } | |
2470 | |
2471 if (result == UCOL_NOT_FOUND) { | |
2472 break; | |
2473 } else if (isContraction(result)) { | |
2474 /* this is a multi-contraction*/ | |
2475 tempconstart = (UChar *)coll->image + getContractOffset(result); | |
2476 if (*(coll->contractionCEs + (constart - coll->contractionIndex)) | |
2477 != UCOL_NOT_FOUND) { | |
2478 multicontraction = TRUE; | |
2479 temppos = source->pos + 1; | |
2480 } | |
2481 } else { | |
2482 setDiscontiguosAttribute(source, buffer); | |
2483 return result; | |
2484 } | |
2485 } | |
2486 | |
2487 /* no problems simply reverting just like that, | |
2488 if we are in string before getting into this function, points back to | |
2489 string hence no problem. | |
2490 if we are in normalization buffer before getting into this function, | |
2491 since we'll never use another normalization within this function, we | |
2492 know that fcdposition points to a base character. the normalization buffer | |
2493 never change, hence this revert works. */ | |
2494 loadState(source, &discState, TRUE); | |
2495 goBackOne(source); | |
2496 | |
2497 //source->pos = temppos - 1; | |
2498 source->flags = tempflags; | |
2499 return *(coll->contractionCEs + (constart - coll->contractionIndex)); | |
2500 } | |
2501 | |
2502 /* now uses Mark's getImplicitPrimary code */ | |
2503 static | |
2504 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { | |
2505 uint32_t r = uprv_uca_getImplicitPrimary(cp); | |
2506 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; | |
2507 collationSource->offsetRepeatCount += 1; | |
2508 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' | |
2509 } | |
2510 | |
2511 /** | |
2512 * Inserts the argument character into the front of the buffer replacing the | |
2513 * front null terminator. | |
2514 * @param data collation element iterator data | |
2515 * @param ch character to be appended | |
2516 */ | |
2517 static | |
2518 inline void insertBufferFront(collIterate *data, UChar ch) | |
2519 { | |
2520 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTer
minatedBuffer() + 2; | |
2521 } | |
2522 | |
2523 /** | |
2524 * Special normalization function for contraction in the previous iterator. | |
2525 * This normalization sequence will place the current character at source->pos | |
2526 * and its following normalized sequence into the buffer. | |
2527 * The fcd position, pos will be changed. | |
2528 * pos will now point to positions in the buffer. | |
2529 * Flags will be changed accordingly. | |
2530 * @param data collation iterator data | |
2531 */ | |
2532 static | |
2533 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) | |
2534 { | |
2535 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ | |
2536 const UChar *pStart; | |
2537 | |
2538 UnicodeString endOfBuffer; | |
2539 if (data->flags & UCOL_ITER_HASLEN) { | |
2540 /* | |
2541 normalization buffer not used yet, we'll pull down the next | |
2542 character into the end of the buffer | |
2543 */ | |
2544 endOfBuffer.setTo(*pEnd); | |
2545 } | |
2546 else { | |
2547 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL | |
2548 } | |
2549 | |
2550 if (data->fcdPosition == NULL) { | |
2551 pStart = data->string; | |
2552 } | |
2553 else { | |
2554 pStart = data->fcdPosition + 1; | |
2555 } | |
2556 int32_t normLen = | |
2557 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStar
t)), | |
2558 data->writableBuffer, | |
2559 *status). | |
2560 length(); | |
2561 if(U_FAILURE(*status)) { | |
2562 return; | |
2563 } | |
2564 /* | |
2565 this puts the null termination infront of the normalized string instead | |
2566 of the end | |
2567 */ | |
2568 data->pos = | |
2569 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminat
edBuffer() + | |
2570 1 + normLen; | |
2571 data->origFlags = data->flags; | |
2572 data->flags |= UCOL_ITER_INNORMBUF; | |
2573 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
2574 } | |
2575 | |
2576 /** | |
2577 * Contraction character management function that returns the previous character | |
2578 * for the backwards iterator. | |
2579 * Does nothing if the previous character is in buffer and not the first | |
2580 * character in it. | |
2581 * Else it checks previous character in data string to see if it is | |
2582 * normalizable. | |
2583 * If it is not, the character is simply copied into the buffer, else | |
2584 * the whole normalized substring is copied into the buffer, including the | |
2585 * current character. | |
2586 * @param data collation element iterator data | |
2587 * @return previous character | |
2588 */ | |
2589 static | |
2590 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) | |
2591 { | |
2592 UChar prevch; | |
2593 UChar ch; | |
2594 const UChar *start; | |
2595 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); | |
2596 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || | |
2597 (innormbuf && *(data->pos - 1) != 0)) { | |
2598 /* | |
2599 if no normalization. | |
2600 if previous character is in normalized buffer, no further normalization | |
2601 is required | |
2602 */ | |
2603 if(data->flags & UCOL_USE_ITERATOR) { | |
2604 data->iterator->move(data->iterator, -1, UITER_CURRENT); | |
2605 return (UChar)data->iterator->next(data->iterator); | |
2606 } else { | |
2607 return *(data->pos - 1); | |
2608 } | |
2609 } | |
2610 | |
2611 start = data->pos; | |
2612 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { | |
2613 /* in data string */ | |
2614 if ((start - 1) == data->string) { | |
2615 return *(start - 1); | |
2616 } | |
2617 start --; | |
2618 ch = *start; | |
2619 prevch = *(start - 1); | |
2620 } | |
2621 else { | |
2622 /* | |
2623 in writable buffer, at this point fcdPosition can not be NULL. | |
2624 see contracting tag. | |
2625 */ | |
2626 if (data->fcdPosition == data->string) { | |
2627 /* at the start of the string, just dump it into the normalizer */ | |
2628 insertBufferFront(data, *(data->fcdPosition)); | |
2629 data->fcdPosition = NULL; | |
2630 return *(data->pos - 1); | |
2631 } | |
2632 start = data->fcdPosition; | |
2633 ch = *start; | |
2634 prevch = *(start - 1); | |
2635 } | |
2636 /* | |
2637 * if the current character is not fcd. | |
2638 * Trailing combining class == 0. | |
2639 */ | |
2640 if (data->fcdPosition > start && | |
2641 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) | |
2642 { | |
2643 /* | |
2644 Need a more complete FCD check and possible normalization. | |
2645 normalize substring will be appended to buffer | |
2646 */ | |
2647 const UChar *backuppos = data->pos; | |
2648 data->pos = start; | |
2649 if (collPrevIterFCD(data)) { | |
2650 normalizePrevContraction(data, status); | |
2651 return *(data->pos - 1); | |
2652 } | |
2653 data->pos = backuppos; | |
2654 data->fcdPosition ++; | |
2655 } | |
2656 | |
2657 if (innormbuf) { | |
2658 /* | |
2659 no normalization is to be done hence only one character will be | |
2660 appended to the buffer. | |
2661 */ | |
2662 insertBufferFront(data, ch); | |
2663 data->fcdPosition --; | |
2664 } | |
2665 | |
2666 return ch; | |
2667 } | |
2668 | |
2669 /* This function handles the special CEs like contractions, expansions, surrogat
es, Thai */ | |
2670 /* It is called by getNextCE */ | |
2671 | |
2672 /* UCOL_MAX_DIGITS_FOR_NUMBER must be even: numeric collation packs digits two per byte */ | |
2673 #define UCOL_MAX_DIGITS_FOR_NUMBER 254 | |
2674 | |
2675 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
lIterate *source, UErrorCode *status) { | |
2676 collIterateState entryState; | |
2677 backupState(source, &entryState); | |
2678 UChar32 cp = ch; | |
2679 | |
2680 for (;;) { | |
2681 // This loop will repeat only in the case of contractions, and only when
a contraction | |
2682 // is found and the first CE resulting from that contraction is itself
a special | |
2683 // (an expansion, for example.) All other special CE types are fully
handled the | |
2684 // first time through, and the loop exits. | |
2685 | |
2686 const uint32_t *CEOffset = NULL; | |
2687 switch(getCETag(CE)) { | |
2688 case NOT_FOUND_TAG: | |
2689 /* This one is not found, and we'll let somebody else bother about i
t... no more games */ | |
2690 return CE; | |
2691 case SPEC_PROC_TAG: | |
2692 { | |
2693 // Special processing is getting a CE that is preceded by a cert
ain prefix | |
2694 // Currently this is only needed for optimizing Japanese length
and iteration marks. | |
2695 // When we encouter a special processing tag, we go backwards an
d try to see if | |
2696 // we have a match. | |
2697 // Contraction tables are used - so the whole process is not unl
ike contraction. | |
2698 // prefix data is stored backwards in the table. | |
2699 const UChar *UCharOffset; | |
2700 UChar schar, tchar; | |
2701 collIterateState prefixState; | |
2702 backupState(source, &prefixState); | |
2703 loadState(source, &entryState, TRUE); | |
2704 goBackOne(source); // We want to look at the point where we ente
red - actually one | |
2705 // before that... | |
2706 | |
2707 for(;;) { | |
2708 // This loop will run once per source string character, for
as long as we | |
2709 // are matching a potential contraction sequence | |
2710 | |
2711 // First we position ourselves at the begining of contractio
n sequence | |
2712 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
2713 if (collIter_bos(source)) { | |
2714 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
2715 break; | |
2716 } | |
2717 schar = getPrevNormalizedChar(source, status); | |
2718 goBackOne(source); | |
2719 | |
2720 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
2721 UCharOffset++; | |
2722 } | |
2723 | |
2724 if (schar == tchar) { | |
2725 // Found the source string char in the table. | |
2726 // Pick up the corresponding CE from the table. | |
2727 CE = *(coll->contractionCEs + | |
2728 (UCharOffset - coll->contractionIndex)); | |
2729 } | |
2730 else | |
2731 { | |
2732 // Source string char was not in the table. | |
2733 // We have not found the prefix. | |
2734 CE = *(coll->contractionCEs + | |
2735 (ContractionStart - coll->contractionIndex)); | |
2736 } | |
2737 | |
2738 if(!isPrefix(CE)) { | |
2739 // The source string char was in the contraction table,
and the corresponding | |
2740 // CE is not a prefix CE. We found the prefix, break | |
2741 // out of loop, this CE will end up being returned. T
his is the normal | |
2742 // way out of prefix handling when the source actually
contained | |
2743 // the prefix. | |
2744 break; | |
2745 } | |
2746 } | |
2747 if(CE != UCOL_NOT_FOUND) { // we found something and we can meri
lly continue | |
2748 loadState(source, &prefixState, TRUE); | |
2749 if(source->origFlags & UCOL_USE_ITERATOR) { | |
2750 source->flags = source->origFlags; | |
2751 } | |
2752 } else { // prefix search was a failure, we have to backup all t
he way to the start | |
2753 loadState(source, &entryState, TRUE); | |
2754 } | |
2755 break; | |
2756 } | |
2757 case CONTRACTION_TAG: | |
2758 { | |
2759 /* This should handle contractions */ | |
2760 collIterateState state; | |
2761 backupState(source, &state); | |
2762 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->imag
e+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; | |
2763 const UChar *UCharOffset; | |
2764 UChar schar, tchar; | |
2765 | |
2766 for (;;) { | |
2767 /* This loop will run once per source string character, for
as long as we */ | |
2768 /* are matching a potential contraction sequence
*/ | |
2769 | |
2770 /* First we position ourselves at the begining of contractio
n sequence */ | |
2771 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
2772 | |
2773 if (collIter_eos(source)) { | |
2774 // Ran off the end of the source string. | |
2775 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
2776 // So we'll pick whatever we have at the point... | |
2777 if (CE == UCOL_NOT_FOUND) { | |
2778 // back up the source over all the chars we scanned
going into this contraction. | |
2779 CE = firstCE; | |
2780 loadState(source, &state, TRUE); | |
2781 if(source->origFlags & UCOL_USE_ITERATOR) { | |
2782 source->flags = source->origFlags; | |
2783 } | |
2784 } | |
2785 break; | |
2786 } | |
2787 | |
2788 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the di
scontiguos stuff */ /* skip the backward offset, see above */ | |
2789 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); | |
2790 | |
2791 schar = getNextNormalizedChar(source); | |
2792 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
2793 UCharOffset++; | |
2794 } | |
2795 | |
2796 if (schar == tchar) { | |
2797 // Found the source string char in the contraction table
. | |
2798 // Pick up the corresponding CE from the table. | |
2799 CE = *(coll->contractionCEs + | |
2800 (UCharOffset - coll->contractionIndex)); | |
2801 } | |
2802 else | |
2803 { | |
2804 // Source string char was not in contraction table. | |
2805 // Unless we have a discontiguous contraction, we have
finished | |
2806 // with this contraction. | |
2807 // in order to do the proper detection, we | |
2808 // need to see if we're dealing with a supplementary | |
2809 /* We test whether the next two char are surrogate pairs
. | |
2810 * This test is done if the iterator is not NULL. | |
2811 * If there is no surrogate pair, the iterator | |
2812 * goes back one if needed. */ | |
2813 UChar32 miss = schar; | |
2814 if (source->iterator) { | |
2815 UChar32 surrNextChar; /* the next char in the iterat
ion to test */ | |
2816 int32_t prevPos; /* holds the previous position befo
re move forward of the source iterator */ | |
2817 if(U16_IS_LEAD(schar) && source->iterator->hasNext(s
ource->iterator)) { | |
2818 prevPos = source->iterator->index; | |
2819 surrNextChar = getNextNormalizedChar(source); | |
2820 if (U16_IS_TRAIL(surrNextChar)) { | |
2821 miss = U16_GET_SUPPLEMENTARY(schar, surrNext
Char); | |
2822 } else if (prevPos < source->iterator->index){ | |
2823 goBackOne(source); | |
2824 } | |
2825 } | |
2826 } else if (U16_IS_LEAD(schar) && source->pos + 1 < sourc
e->endp) { | |
2827 const UChar* prevPos = source->pos; | |
2828 UChar nextChar = getNextNormalizedChar(source); | |
2829 if (U16_IS_TRAIL(nextChar)) { | |
2830 miss = U16_GET_SUPPLEMENTARY(schar, nextChar); | |
2831 } else if (prevPos < source->pos) { | |
2832 goBackOne(source); | |
2833 } | |
2834 } | |
2835 | |
2836 uint8_t sCC; | |
2837 if (miss < 0x300 || | |
2838 maxCC == 0 || | |
2839 (sCC = i_getCombiningClass(miss, coll)) == 0 || | |
2840 sCC>maxCC || | |
2841 (allSame != 0 && sCC == maxCC) || | |
2842 collIter_eos(source)) | |
2843 { | |
2844 // Contraction can not be discontiguous. | |
2845 goBackOne(source); // back up the source string by
one, | |
2846 // because the character we just looked at was | |
2847 // not part of the contraction. */ | |
2848 if(U_IS_SUPPLEMENTARY(miss)) { | |
2849 goBackOne(source); | |
2850 } | |
2851 CE = *(coll->contractionCEs + | |
2852 (ContractionStart - coll->contractionIndex)); | |
2853 } else { | |
2854 // | |
2855 // Contraction is possibly discontiguous. | |
2856 // Scan more of source string looking for a match | |
2857 // | |
2858 UChar tempchar; | |
2859 /* find the next character if schar is not a base ch
aracter | |
2860 and we are not yet at the end of the string */ | |
2861 tempchar = getNextNormalizedChar(source); | |
2862 // probably need another supplementary thingie here | |
2863 goBackOne(source); | |
2864 if (i_getCombiningClass(tempchar, coll) == 0) { | |
2865 goBackOne(source); | |
2866 if(U_IS_SUPPLEMENTARY(miss)) { | |
2867 goBackOne(source); | |
2868 } | |
2869 /* Spit out the last char of the string, wasn't
tasty enough */ | |
2870 CE = *(coll->contractionCEs + | |
2871 (ContractionStart - coll->contractionIndex))
; | |
2872 } else { | |
2873 CE = getDiscontiguous(coll, source, ContractionS
tart); | |
2874 } | |
2875 } | |
2876 } // else after if(schar == tchar) | |
2877 | |
2878 if(CE == UCOL_NOT_FOUND) { | |
2879 /* The Source string did not match the contraction that
we were checking. */ | |
2880 /* Back up the source position to undo the effects of h
aving partially */ | |
2881 /* scanned through what ultimately proved to not be a
contraction. */ | |
2882 loadState(source, &state, TRUE); | |
2883 CE = firstCE; | |
2884 break; | |
2885 } | |
2886 | |
2887 if(!isContraction(CE)) { | |
2888 // The source string char was in the contraction table,
and the corresponding | |
2889 // CE is not a contraction CE. We completed the contr
action, break | |
2890 // out of loop, this CE will end up being returned. T
his is the normal | |
2891 // way out of contraction handling when the source act
ually contained | |
2892 // the contraction. | |
2893 break; | |
2894 } | |
2895 | |
2896 | |
2897 // The source string char was in the contraction table, and
the corresponding | |
2898 // CE is IS a contraction CE. We will continue looping t
o check the source | |
2899 // string for the remaining chars in the contraction. | |
2900 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart
- coll->contractionIndex)); | |
2901 if(tempCE != UCOL_NOT_FOUND) { | |
2902 // We have scanned a a section of source string for whic
h there is a | |
2903 // CE from the contraction table. Remember the CE and
scan position, so | |
2904 // that we can return to this point if further scanning
fails to | |
2905 // match a longer contraction sequence. | |
2906 firstCE = tempCE; | |
2907 | |
2908 goBackOne(source); | |
2909 backupState(source, &state); | |
2910 getNextNormalizedChar(source); | |
2911 | |
2912 // Another way to do this is: | |
2913 //collIterateState tempState; | |
2914 //backupState(source, &tempState); | |
2915 //goBackOne(source); | |
2916 //backupState(source, &state); | |
2917 //loadState(source, &tempState, TRUE); | |
2918 | |
2919 // The problem is that for incomplete contractions we ha
ve to remember the previous | |
2920 // position. Before, the only thing I needed to do was s
tate.pos--; | |
2921 // After iterator introduction and especially after intr
oduction of normalizing | |
2922 // iterators, it became much more difficult to decrease
the saved state. | |
2923 // I'm not yet sure which of the two methods above is fa
ster. | |
2924 } | |
2925 } // for(;;) | |
2926 break; | |
2927 } // case CONTRACTION_TAG: | |
2928 case LONG_PRIMARY_TAG: | |
2929 { | |
2930 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
2931 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYT
E_COMMON; | |
2932 source->offsetRepeatCount += 1; | |
2933 return CE; | |
2934 } | |
2935 case EXPANSION_TAG: | |
2936 { | |
2937 /* This should handle expansion. */ | |
2938 /* NOTE: we can encounter both continuations and expansions in a
n expansion! */ | |
2939 /* I have to decide where continuations are going to be dealt wi
th */ | |
2940 uint32_t size; | |
2941 uint32_t i; /* general counter */ | |
2942 | |
2943 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* fi
nd the offset to expansion table */ | |
2944 size = getExpansionCount(CE); | |
2945 CE = *CEOffset++; | |
2946 //source->offsetRepeatCount = -1; | |
2947 | |
2948 if(size != 0) { /* if there are less than 16 elements in expansi
on, we don't terminate */ | |
2949 for(i = 1; i<size; i++) { | |
2950 *(source->CEpos++) = *CEOffset++; | |
2951 source->offsetRepeatCount += 1; | |
2952 } | |
2953 } else { /* else, we do */ | |
2954 while(*CEOffset != 0) { | |
2955 *(source->CEpos++) = *CEOffset++; | |
2956 source->offsetRepeatCount += 1; | |
2957 } | |
2958 } | |
2959 | |
2960 return CE; | |
2961 } | |
2962 case DIGIT_TAG: | |
2963 { | |
2964 /* | |
2965 We do a check to see if we want to collate digits as numbers; if
so we generate | |
2966 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. | |
2967 */ | |
2968 //uint32_t size; | |
2969 uint32_t i; /* general counter */ | |
2970 | |
2971 if (source->coll->numericCollation == UCOL_ON){ | |
2972 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; | |
2973 UChar32 char32 = 0; | |
2974 int32_t digVal = 0; | |
2975 | |
2976 uint32_t digIndx = 0; | |
2977 uint32_t endIndex = 0; | |
2978 uint32_t trailingZeroIndex = 0; | |
2979 | |
2980 uint8_t collateVal = 0; | |
2981 | |
2982 UBool nonZeroValReached = FALSE; | |
2983 | |
2984 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I j
ust need a temporary place to store my generated CEs. | |
2985 /* | |
2986 We parse the source string until we hit a char that's N
OT a digit. | |
2987 Use this u_charDigitValue. This might be slow because we
have to | |
2988 handle surrogates... | |
2989 */ | |
2990 /* | |
2991 if (U16_IS_LEAD(ch)){ | |
2992 if (!collIter_eos(source)) { | |
2993 backupState(source, &digitState); | |
2994 UChar trail = getNextNormalizedChar(source); | |
2995 if(U16_IS_TRAIL(trail)) { | |
2996 char32 = U16_GET_SUPPLEMENTARY(ch, trail); | |
2997 } else { | |
2998 loadState(source, &digitState, TRUE); | |
2999 char32 = ch; | |
3000 } | |
3001 } else { | |
3002 char32 = ch; | |
3003 } | |
3004 } else { | |
3005 char32 = ch; | |
3006 } | |
3007 digVal = u_charDigitValue(char32); | |
3008 */ | |
3009 digVal = u_charDigitValue(cp); // if we have arrived here, w
e have | |
3010 // already processed possible supplementaries that trigered
the digit tag - | |
3011 // all supplementaries are marked in the UCA. | |
3012 /* | |
3013 We pad a zero in front of the first element anyways. Th
is takes | |
3014 care of the (probably) most common case where people are
sorting things followed | |
3015 by a single digit | |
3016 */ | |
3017 digIndx++; | |
3018 for(;;){ | |
3019 // Make sure we have enough space. No longer needed; | |
3020 // at this point digIndx now has a max value of UCOL_MAX
_DIGITS_FOR_NUMBER | |
3021 // (it has been pre-incremented) so we just ensure that
numTempBuf is big enough | |
3022 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). | |
3023 | |
3024 // Skipping over leading zeroes. | |
3025 if (digVal != 0) { | |
3026 nonZeroValReached = TRUE; | |
3027 } | |
3028 if (nonZeroValReached) { | |
3029 /* | |
3030 We parse the digit string into base 100 numbers (thi
s fits into a byte). | |
3031 We only add to the buffer in twos, thus if we are pa
rsing an odd character, | |
3032 that serves as the 'tens' digit while the if we are
parsing an even one, that | |
3033 is the 'ones' digit. We dumped the parsed base 100 v
alue (collateVal) into | |
3034 a buffer. We multiply each collateVal by 2 (to give
us room) and add 5 (to avoid | |
3035 overlapping magic CE byte values). The last byte we
subtract 1 to ensure it is less | |
3036 than all the other bytes. | |
3037 */ | |
3038 | |
3039 if (digIndx % 2 == 1){ | |
3040 collateVal += (uint8_t)digVal; | |
3041 | |
3042 // We don't enter the low-order-digit case unles
s we've already seen | |
3043 // the high order, or for the first digit, which
is always non-zero. | |
3044 if (collateVal != 0) | |
3045 trailingZeroIndex = 0; | |
3046 | |
3047 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
3048 collateVal = 0; | |
3049 } | |
3050 else{ | |
3051 // We drop the collation value into the buffer s
o if we need to do | |
3052 // a "front patch" we don't have to check to see
if we're hitting the | |
3053 // last element. | |
3054 collateVal = (uint8_t)(digVal * 10); | |
3055 | |
3056 // Check for trailing zeroes. | |
3057 if (collateVal == 0) | |
3058 { | |
3059 if (!trailingZeroIndex) | |
3060 trailingZeroIndex = (digIndx/2) + 2; | |
3061 } | |
3062 else | |
3063 trailingZeroIndex = 0; | |
3064 | |
3065 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; | |
3066 } | |
3067 digIndx++; | |
3068 } | |
3069 | |
3070 // Get next character. | |
3071 if (!collIter_eos(source)){ | |
3072 ch = getNextNormalizedChar(source); | |
3073 if (U16_IS_LEAD(ch)){ | |
3074 if (!collIter_eos(source)) { | |
3075 backupState(source, &digitState); | |
3076 UChar trail = getNextNormalizedChar(source); | |
3077 if(U16_IS_TRAIL(trail)) { | |
3078 char32 = U16_GET_SUPPLEMENTARY(ch, trail
); | |
3079 } else { | |
3080 loadState(source, &digitState, TRUE); | |
3081 char32 = ch; | |
3082 } | |
3083 } | |
3084 } else { | |
3085 char32 = ch; | |
3086 } | |
3087 | |
3088 if ((digVal = u_charDigitValue(char32)) == -1 || dig
Indx > UCOL_MAX_DIGITS_FOR_NUMBER){ | |
3089 // Resetting position to point to the next unpro
cessed char. We | |
3090 // overshot it when doing our test/set for numbe
rs. | |
3091 if (char32 > 0xFFFF) { // For surrogates. | |
3092 loadState(source, &digitState, TRUE); | |
3093 //goBackOne(source); | |
3094 } | |
3095 goBackOne(source); | |
3096 break; | |
3097 } | |
3098 } else { | |
3099 break; | |
3100 } | |
3101 } | |
3102 | |
3103 if (nonZeroValReached == FALSE){ | |
3104 digIndx = 2; | |
3105 numTempBuf[2] = 6; | |
3106 } | |
3107 | |
3108 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx
/2) + 2) ; | |
3109 if (digIndx % 2 != 0){ | |
3110 /* | |
3111 We missed a value. Since digIndx isn't even, stuck too m
any values into the buffer (this is what | |
3112 we get for padding the first byte with a zero). "Front-p
atch" now by pushing all nybbles forward. | |
3113 Doing it this way ensures that at least 50% of the time
(statistically speaking) we'll only be doing a | |
3114 single pass and optimizes for strings with single digits
. I'm just assuming that's the more common case. | |
3115 */ | |
3116 | |
3117 for(i = 2; i < endIndex; i++){ | |
3118 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10)
* 10) + | |
3119 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; | |
3120 } | |
3121 --digIndx; | |
3122 } | |
3123 | |
3124 // Subtract one off of the last byte. | |
3125 numTempBuf[endIndex-1] -= 1; | |
3126 | |
3127 /* | |
3128 We want to skip over the first two slots in the buffer. The
first slot | |
3129 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the | |
3130 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
3131 */ | |
3132 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
3133 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); | |
3134 | |
3135 // Now transfer the collation key to our collIterate struct. | |
3136 // The total size for our collation key is endIndx bumped up
to the next largest even value divided by two. | |
3137 //size = ((endIndex+1) & ~1)/2; | |
3138 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARY
ORDERSHIFT) | //Primary weight | |
3139 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight | |
3140 UCOL_BYTE_COMMON; // Tertiary weight. | |
3141 i = 2; // Reset the index into the buffer. | |
3142 while(i < endIndex) | |
3143 { | |
3144 uint32_t primWeight = numTempBuf[i++] << 8; | |
3145 if ( i < endIndex) | |
3146 primWeight |= numTempBuf[i++]; | |
3147 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; | |
3148 } | |
3149 | |
3150 } else { | |
3151 // no numeric mode, we'll just switch to whatever we stashed
and continue | |
3152 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /
* find the offset to expansion table */ | |
3153 CE = *CEOffset++; | |
3154 break; | |
3155 } | |
3156 return CE; | |
3157 } | |
3158 /* various implicits optimization */ | |
3159 case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
3160 /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
3161 return getImplicit(cp, source); | |
3162 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ | |
3163 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImpl
icit | |
3164 return getImplicit(cp, source); | |
3165 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ | |
3166 { | |
3167 static const uint32_t | |
3168 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; | |
3169 //const uint32_t LCount = 19; | |
3170 static const uint32_t VCount = 21; | |
3171 static const uint32_t TCount = 28; | |
3172 //const uint32_t NCount = VCount * TCount; // 588 | |
3173 //const uint32_t SCount = LCount * NCount; // 11172 | |
3174 uint32_t L = ch - SBase; | |
3175 | |
3176 // divide into pieces | |
3177 | |
3178 uint32_t T = L % TCount; // we do it in this order since some co
mpilers can do % and / in one operation | |
3179 L /= TCount; | |
3180 uint32_t V = L % VCount; | |
3181 L /= VCount; | |
3182 | |
3183 // offset them | |
3184 | |
3185 L += LBase; | |
3186 V += VBase; | |
3187 T += TBase; | |
3188 | |
3189 // return the first CE, but first put the rest into the expansio
n buffer | |
3190 if (!source->coll->image->jamoSpecial) { // FAST PATH | |
3191 | |
3192 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); | |
3193 if (T != TBase) { | |
3194 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); | |
3195 } | |
3196 | |
3197 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); | |
3198 | |
3199 } else { // Jamo is Special | |
3200 // Since Hanguls pass the FCD check, it is | |
3201 // guaranteed that we won't be in | |
3202 // the normalization buffer if something like this happens | |
3203 | |
3204 // However, if we are using a uchar iterator and normalizati
on | |
3205 // is ON, the Hangul that lead us here is going to be in tha
t | |
3206 // normalization buffer. Here we want to restore the uchar | |
3207 // iterator state and pull out of the normalization buffer | |
3208 if(source->iterator != NULL && source->flags & UCOL_ITER_INN
ORMBUF) { | |
3209 source->flags = source->origFlags; // restore the iterat
or | |
3210 source->pos = NULL; | |
3211 } | |
3212 | |
3213 // Move Jamos into normalization buffer | |
3214 UChar *buffer = source->writableBuffer.getBuffer(4); | |
3215 int32_t bufferLength; | |
3216 buffer[0] = (UChar)L; | |
3217 buffer[1] = (UChar)V; | |
3218 if (T != TBase) { | |
3219 buffer[2] = (UChar)T; | |
3220 bufferLength = 3; | |
3221 } else { | |
3222 bufferLength = 2; | |
3223 } | |
3224 source->writableBuffer.releaseBuffer(bufferLength); | |
3225 | |
3226 // Indicate where to continue in main input string after exh
austing the writableBuffer | |
3227 source->fcdPosition = source->pos; | |
3228 | |
3229 source->pos = source->writableBuffer.getTerminatedBuffer()
; | |
3230 source->origFlags = source->flags; | |
3231 source->flags |= UCOL_ITER_INNORMBUF; | |
3232 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); | |
3233 | |
3234 return(UCOL_IGNORABLE); | |
3235 } | |
3236 } | |
3237 case SURROGATE_TAG: | |
3238 /* we encountered a leading surrogate. We shall get the CE by using
the following code unit */ | |
3239 /* two things can happen here: next code point can be a trailing sur
rogate - we will use it */ | |
3240 /* to retrieve the CE, or it is not a trailing surrogate (or the str
ing is done). In that case */ | |
3241 /* we treat it like an unassigned code point. */ | |
3242 { | |
3243 UChar trail; | |
3244 collIterateState state; | |
3245 backupState(source, &state); | |
3246 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNorma
lizedChar(source))))) { | |
3247 // we chould have stepped one char forward and it might have
turned that it | |
3248 // was not a trail surrogate. In that case, we have to backu
p. | |
3249 loadState(source, &state, TRUE); | |
3250 return UCOL_NOT_FOUND; | |
3251 } else { | |
3252 /* TODO: CE contain the data from the previous CE + the mask
. It should at least be unmasked */ | |
3253 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFF
FF, trail); | |
3254 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates
in this block, but not this one. | |
3255 // We need to backup | |
3256 loadState(source, &state, TRUE); | |
3257 return CE; | |
3258 } | |
3259 // calculate the supplementary code point value, if surrogat
e was not tailored | |
3260 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10U
L)+0xdc00-0x10000)); | |
3261 } | |
3262 } | |
3263 break; | |
3264 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
3265 UChar nextChar; | |
3266 if( source->flags & UCOL_USE_ITERATOR) { | |
3267 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source
->iterator))) { | |
3268 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
3269 source->iterator->next(source->iterator); | |
3270 return getImplicit(cp, source); | |
3271 } | |
3272 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->po
s<source->endp)) && | |
3273 U_IS_TRAIL((nextChar=*source->pos))) { | |
3274 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); | |
3275 source->pos++; | |
3276 return getImplicit(cp, source); | |
3277 } | |
3278 return UCOL_NOT_FOUND; | |
3279 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
3280 return UCOL_NOT_FOUND; /* broken surrogate sequence */ | |
3281 case CHARSET_TAG: | |
3282 /* not yet implemented */ | |
3283 /* probably after 1.8 */ | |
3284 return UCOL_NOT_FOUND; | |
3285 default: | |
3286 *status = U_INTERNAL_PROGRAM_ERROR; | |
3287 CE=0; | |
3288 break; | |
3289 } | |
3290 if (CE <= UCOL_NOT_FOUND) break; | |
3291 } | |
3292 return CE; | |
3293 } | |
3294 | |
3295 | |
3296 /* now uses Mark's getImplicitPrimary code */ | |
3297 static | |
3298 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { | |
3299 uint32_t r = uprv_uca_getImplicitPrimary(cp); | |
3300 | |
3301 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; | |
3302 collationSource->toReturn = collationSource->CEpos; | |
3303 | |
3304 // **** doesn't work if using iterator **** | |
3305 if (collationSource->flags & UCOL_ITER_INNORMBUF) { | |
3306 collationSource->offsetRepeatCount = 1; | |
3307 } else { | |
3308 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->
string); | |
3309 | |
3310 UErrorCode errorCode = U_ZERO_ERROR; | |
3311 collationSource->appendOffset(firstOffset, errorCode); | |
3312 collationSource->appendOffset(firstOffset + 1, errorCode); | |
3313 | |
3314 collationSource->offsetReturn = collationSource->offsetStore - 1; | |
3315 *(collationSource->offsetBuffer) = firstOffset; | |
3316 if (collationSource->offsetReturn == collationSource->offsetBuffer) { | |
3317 collationSource->offsetStore = collationSource->offsetBuffer; | |
3318 } | |
3319 } | |
3320 | |
3321 return ((r & 0x0000FFFF)<<16) | 0x000000C0; | |
3322 } | |
3323 | |
3324 /** | |
3325 * This function handles the special CEs like contractions, expansions, | |
3326 * surrogates, Thai. | |
3327 * It is called by both getPrevCE | |
3328 */ | |
3329 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, | |
3330 collIterate *source, | |
3331 UErrorCode *status) | |
3332 { | |
3333 const uint32_t *CEOffset = NULL; | |
3334 UChar *UCharOffset = NULL; | |
3335 UChar schar; | |
3336 const UChar *constart = NULL; | |
3337 uint32_t size; | |
3338 UChar buffer[UCOL_MAX_BUFFER]; | |
3339 uint32_t *endCEBuffer; | |
3340 UChar *strbuffer; | |
3341 int32_t noChars = 0; | |
3342 int32_t CECount = 0; | |
3343 | |
3344 for(;;) | |
3345 { | |
3346 /* the only ces that loops are thai and contractions */ | |
3347 switch (getCETag(CE)) | |
3348 { | |
3349 case NOT_FOUND_TAG: /* this tag always returns */ | |
3350 return CE; | |
3351 | |
3352 case SPEC_PROC_TAG: | |
3353 { | |
3354 // Special processing is getting a CE that is preceded by a cert
ain prefix | |
3355 // Currently this is only needed for optimizing Japanese length
and iteration marks. | |
3356 // When we encouter a special processing tag, we go backwards an
d try to see if | |
3357 // we have a match. | |
3358 // Contraction tables are used - so the whole process is not unl
ike contraction. | |
3359 // prefix data is stored backwards in the table. | |
3360 const UChar *UCharOffset; | |
3361 UChar schar, tchar; | |
3362 collIterateState prefixState; | |
3363 backupState(source, &prefixState); | |
3364 for(;;) { | |
3365 // This loop will run once per source string character, for
as long as we | |
3366 // are matching a potential contraction sequence | |
3367 | |
3368 // First we position ourselves at the begining of contractio
n sequence | |
3369 const UChar *ContractionStart = UCharOffset = (UChar *)coll-
>image+getContractOffset(CE); | |
3370 | |
3371 if (collIter_bos(source)) { | |
3372 CE = *(coll->contractionCEs + (UCharOffset - coll->contr
actionIndex)); | |
3373 break; | |
3374 } | |
3375 schar = getPrevNormalizedChar(source, status); | |
3376 goBackOne(source); | |
3377 | |
3378 while(schar > (tchar = *UCharOffset)) { /* since the contrac
tion codepoints should be ordered, we skip all that are smaller */ | |
3379 UCharOffset++; | |
3380 } | |
3381 | |
3382 if (schar == tchar) { | |
3383 // Found the source string char in the table. | |
3384 // Pick up the corresponding CE from the table. | |
3385 CE = *(coll->contractionCEs + | |
3386 (UCharOffset - coll->contractionIndex)); | |
3387 } | |
3388 else | |
3389 { | |
3390 // if there is a completely ignorable code point in the
middle of | |
3391 // a prefix, we need to act as if it's not there | |
3392 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-
fdef are set to zero) | |
3393 // lone surrogates cannot be set to zero as it would bre
ak other processing | |
3394 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping
, schar); | |
3395 // it's easy for BMP code points | |
3396 if(isZeroCE == 0) { | |
3397 continue; | |
3398 } else if(U16_IS_SURROGATE(schar)) { | |
3399 // for supplementary code points, we have to check t
he next one | |
3400 // situations where we are going to ignore | |
3401 // 1. beginning of the string: schar is a lone surro
gate | |
3402 // 2. schar is a lone surrogate | |
3403 // 3. schar is a trail surrogate in a valid surrogat
e sequence | |
3404 // that is explicitly set to zero. | |
3405 if (!collIter_bos(source)) { | |
3406 UChar lead; | |
3407 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(
lead = getPrevNormalizedChar(source, status))) { | |
3408 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapp
ing, lead); | |
3409 if(isSpecial(isZeroCE) && getCETag(isZeroCE)
== SURROGATE_TAG) { | |
3410 uint32_t finalCE = UTRIE_GET32_FROM_OFFS
ET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); | |
3411 if(finalCE == 0) { | |
3412 // this is a real, assigned complete
ly ignorable code point | |
3413 goBackOne(source); | |
3414 continue; | |
3415 } | |
3416 } | |
3417 } else { | |
3418 // lone surrogate, treat like unassigned | |
3419 return UCOL_NOT_FOUND; | |
3420 } | |
3421 } else { | |
3422 // lone surrogate at the beggining, treat like u
nassigned | |
3423 return UCOL_NOT_FOUND; | |
3424 } | |
3425 } | |
3426 // Source string char was not in the table. | |
3427 // We have not found the prefix. | |
3428 CE = *(coll->contractionCEs + | |
3429 (ContractionStart - coll->contractionIndex)); | |
3430 } | |
3431 | |
3432 if(!isPrefix(CE)) { | |
3433 // The source string char was in the contraction table,
and the corresponding | |
3434 // CE is not a prefix CE. We found the prefix, break | |
3435 // out of loop, this CE will end up being returned. T
his is the normal | |
3436 // way out of prefix handling when the source actually
contained | |
3437 // the prefix. | |
3438 break; | |
3439 } | |
3440 } | |
3441 loadState(source, &prefixState, TRUE); | |
3442 break; | |
3443 } | |
3444 | |
3445 case CONTRACTION_TAG: { | |
3446 /* to ensure that the backwards and forwards iteration matches, we | |
3447 take the current region of most possible match and pass it through | |
3448 the forward iteration. this will ensure that the obstinate problem o
f | |
3449 overlapping contractions will not occur. | |
3450 */ | |
3451 schar = peekCodeUnit(source, 0); | |
3452 constart = (UChar *)coll->image + getContractOffset(CE); | |
3453 if (isAtStartPrevIterate(source) | |
3454 /* commented away contraction end checks after adding the checks | |
3455 in getPrevCE */) { | |
3456 /* start of string or this is not the end of any contraction
*/ | |
3457 CE = *(coll->contractionCEs + | |
3458 (constart - coll->contractionIndex)); | |
3459 break; | |
3460 } | |
3461 strbuffer = buffer; | |
3462 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); | |
3463 *(UCharOffset --) = 0; | |
3464 noChars = 0; | |
3465 // have to swap thai characters | |
3466 while (ucol_unsafeCP(schar, coll)) { | |
3467 *(UCharOffset) = schar; | |
3468 noChars++; | |
3469 UCharOffset --; | |
3470 schar = getPrevNormalizedChar(source, status); | |
3471 goBackOne(source); | |
3472 // TODO: when we exhaust the contraction buffer, | |
3473 // it needs to get reallocated. The problem is | |
3474 // that the size depends on the string which is | |
3475 // not iterated over. However, since we're travelling | |
3476 // backwards, we already had to set the iterator at | |
3477 // the end - so we might as well know where we are? | |
3478 if (UCharOffset + 1 == buffer) { | |
3479 /* we have exhausted the buffer */ | |
3480 int32_t newsize = 0; | |
3481 if(source->pos) { // actually dealing with a position | |
3482 newsize = (int32_t)(source->pos - source->string + 1); | |
3483 } else { // iterator | |
3484 newsize = 4 * UCOL_MAX_BUFFER; | |
3485 } | |
3486 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * | |
3487 (newsize + UCOL_MAX_BUFFER)); | |
3488 /* test for NULL */ | |
3489 if (strbuffer == NULL) { | |
3490 *status = U_MEMORY_ALLOCATION_ERROR; | |
3491 return UCOL_NO_MORE_CES; | |
3492 } | |
3493 UCharOffset = strbuffer + newsize; | |
3494 uprv_memcpy(UCharOffset, buffer, | |
3495 UCOL_MAX_BUFFER * sizeof(UChar)); | |
3496 UCharOffset --; | |
3497 } | |
3498 if ((source->pos && (source->pos == source->string || | |
3499 ((source->flags & UCOL_ITER_INNORMBUF) && | |
3500 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) | |
3501 || (source->iterator && !source->iterator->hasPrevious(sourc
e->iterator))) { | |
3502 break; | |
3503 } | |
3504 } | |
3505 /* adds the initial base character to the string */ | |
3506 *(UCharOffset) = schar; | |
3507 noChars++; | |
3508 | |
3509 int32_t offsetBias; | |
3510 | |
3511 // **** doesn't work if using iterator **** | |
3512 if (source->flags & UCOL_ITER_INNORMBUF) { | |
3513 offsetBias = -1; | |
3514 } else { | |
3515 offsetBias = (int32_t)(source->pos - source->string); | |
3516 } | |
3517 | |
3518 /* a new collIterate is used to simplify things, since using the cur
rent | |
3519 collIterate will mean that the forward and backwards iteration will | |
3520 share and change the same buffers. we don't want to get into that. *
/ | |
3521 collIterate temp; | |
3522 int32_t rawOffset; | |
3523 | |
3524 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); | |
3525 if(U_FAILURE(*status)) { | |
3526 return (uint32_t)UCOL_NULLORDER; | |
3527 } | |
3528 temp.flags &= ~UCOL_ITER_NORM; | |
3529 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; | |
3530 | |
3531 rawOffset = (int32_t)(temp.pos - temp.string); // should always be z
ero? | |
3532 CE = ucol_IGetNextCE(coll, &temp, status); | |
3533 | |
3534 if (source->extendCEs) { | |
3535 endCEBuffer = source->extendCEs + source->extendCEsSize; | |
3536 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(u
int32_t)); | |
3537 } else { | |
3538 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; | |
3539 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_
t)); | |
3540 } | |
3541 | |
3542 while (CE != UCOL_NO_MORE_CES) { | |
3543 *(source->CEpos ++) = CE; | |
3544 | |
3545 if (offsetBias >= 0) { | |
3546 source->appendOffset(rawOffset + offsetBias, *status); | |
3547 } | |
3548 | |
3549 CECount++; | |
3550 if (source->CEpos == endCEBuffer) { | |
3551 /* ran out of CE space, reallocate to new buffer. | |
3552 If reallocation fails, reset pointers and bail out, | |
3553 there's no guarantee of the right character position after | |
3554 this bail*/ | |
3555 if (!increaseCEsCapacity(source)) { | |
3556 *status = U_MEMORY_ALLOCATION_ERROR; | |
3557 break; | |
3558 } | |
3559 | |
3560 endCEBuffer = source->extendCEs + source->extendCEsSize; | |
3561 } | |
3562 | |
3563 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { | |
3564 rawOffset = (int32_t)(temp.fcdPosition - temp.string); | |
3565 } else { | |
3566 rawOffset = (int32_t)(temp.pos - temp.string); | |
3567 } | |
3568 | |
3569 CE = ucol_IGetNextCE(coll, &temp, status); | |
3570 } | |
3571 | |
3572 if (strbuffer != buffer) { | |
3573 uprv_free(strbuffer); | |
3574 } | |
3575 if (U_FAILURE(*status)) { | |
3576 return (uint32_t)UCOL_NULLORDER; | |
3577 } | |
3578 | |
3579 if (source->offsetRepeatValue != 0) { | |
3580 if (CECount > noChars) { | |
3581 source->offsetRepeatCount += temp.offsetRepeatCount; | |
3582 } else { | |
3583 // **** does this really skip the right offsets? **** | |
3584 source->offsetReturn -= (noChars - CECount); | |
3585 } | |
3586 } | |
3587 | |
3588 if (offsetBias >= 0) { | |
3589 source->offsetReturn = source->offsetStore - 1; | |
3590 if (source->offsetReturn == source->offsetBuffer) { | |
3591 source->offsetStore = source->offsetBuffer; | |
3592 } | |
3593 } | |
3594 | |
3595 source->toReturn = source->CEpos - 1; | |
3596 if (source->toReturn == source->CEs) { | |
3597 source->CEpos = source->CEs; | |
3598 } | |
3599 | |
3600 return *(source->toReturn); | |
3601 } | |
3602 case LONG_PRIMARY_TAG: | |
3603 { | |
3604 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON; | |
3605 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; | |
3606 source->toReturn = source->CEpos - 1; | |
3607 | |
3608 if (source->flags & UCOL_ITER_INNORMBUF) { | |
3609 source->offsetRepeatCount = 1; | |
3610 } else { | |
3611 int32_t firstOffset = (int32_t)(source->pos - source->string
); | |
3612 | |
3613 source->appendOffset(firstOffset, *status); | |
3614 source->appendOffset(firstOffset + 1, *status); | |
3615 | |
3616 source->offsetReturn = source->offsetStore - 1; | |
3617 *(source->offsetBuffer) = firstOffset; | |
3618 if (source->offsetReturn == source->offsetBuffer) { | |
3619 source->offsetStore = source->offsetBuffer; | |
3620 } | |
3621 } | |
3622 | |
3623 | |
3624 return *(source->toReturn); | |
3625 } | |
3626 | |
3627 case EXPANSION_TAG: /* this tag always returns */ | |
3628 { | |
3629 /* | |
3630 This should handle expansion. | |
3631 NOTE: we can encounter both continuations and expansions in an expan
sion! | |
3632 I have to decide where continuations are going to be dealt with | |
3633 */ | |
3634 int32_t firstOffset = (int32_t)(source->pos - source->string); | |
3635 | |
3636 // **** doesn't work if using iterator **** | |
3637 if (source->offsetReturn != NULL) { | |
3638 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetRet
urn == source->offsetBuffer) { | |
3639 source->offsetStore = source->offsetBuffer; | |
3640 }else { | |
3641 firstOffset = -1; | |
3642 } | |
3643 } | |
3644 | |
3645 /* find the offset to expansion table */ | |
3646 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
3647 size = getExpansionCount(CE); | |
3648 if (size != 0) { | |
3649 /* | |
3650 if there are less than 16 elements in expansion, we don't termin
ate | |
3651 */ | |
3652 uint32_t count; | |
3653 | |
3654 for (count = 0; count < size; count++) { | |
3655 *(source->CEpos ++) = *CEOffset++; | |
3656 | |
3657 if (firstOffset >= 0) { | |
3658 source->appendOffset(firstOffset + 1, *status); | |
3659 } | |
3660 } | |
3661 } else { | |
3662 /* else, we do */ | |
3663 while (*CEOffset != 0) { | |
3664 *(source->CEpos ++) = *CEOffset ++; | |
3665 | |
3666 if (firstOffset >= 0) { | |
3667 source->appendOffset(firstOffset + 1, *status); | |
3668 } | |
3669 } | |
3670 } | |
3671 | |
3672 if (firstOffset >= 0) { | |
3673 source->offsetReturn = source->offsetStore - 1; | |
3674 *(source->offsetBuffer) = firstOffset; | |
3675 if (source->offsetReturn == source->offsetBuffer) { | |
3676 source->offsetStore = source->offsetBuffer; | |
3677 } | |
3678 } else { | |
3679 source->offsetRepeatCount += size - 1; | |
3680 } | |
3681 | |
3682 source->toReturn = source->CEpos - 1; | |
3683 // in case of one element expansion, we | |
3684 // want to immediately return CEpos | |
3685 if(source->toReturn == source->CEs) { | |
3686 source->CEpos = source->CEs; | |
3687 } | |
3688 | |
3689 return *(source->toReturn); | |
3690 } | |
3691 | |
3692 case DIGIT_TAG: | |
3693 { | |
3694 /* | |
3695 We do a check to see if we want to collate digits as numbers; if
so we generate | |
3696 a custom collation key. Otherwise we pull out the value stored i
n the expansion table. | |
3697 */ | |
3698 uint32_t i; /* general counter */ | |
3699 | |
3700 if (source->coll->numericCollation == UCOL_ON){ | |
3701 uint32_t digIndx = 0; | |
3702 uint32_t endIndex = 0; | |
3703 uint32_t leadingZeroIndex = 0; | |
3704 uint32_t trailingZeroCount = 0; | |
3705 | |
3706 uint8_t collateVal = 0; | |
3707 | |
3708 UBool nonZeroValReached = FALSE; | |
3709 | |
3710 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I j
ust need a temporary place to store my generated CEs. | |
3711 /* | |
3712 We parse the source string until we hit a char that's NOT a
digit. | |
3713 Use this u_charDigitValue. This might be slow because we hav
e to | |
3714 handle surrogates... | |
3715 */ | |
3716 /* | |
3717 We need to break up the digit string into collection element
s of UCOL_MAX_DIGITS_FOR_NUMBER or less, | |
3718 with any chunks smaller than that being on the right end of
the digit string - i.e. the first collation | |
3719 element we process when going backward. To determine how lon
g that chunk might be, we may need to make | |
3720 two passes through the loop that collects digits - one to se
e how long the string is (and how much is | |
3721 leading zeros) to determine the length of that right-hand ch
unk, and a second (if the whole string has | |
3722 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits
) to actually process that collation | |
3723 element chunk after resetting the state to the initialState
at the right side of the digit string. | |
3724 */ | |
3725 uint32_t ceLimit = 0; | |
3726 UChar initial_ch = ch; | |
3727 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; | |
3728 backupState(source, &initialState); | |
3729 | |
3730 for(;;) { | |
3731 collIterateState state = {0,0,0,0,0,0,0,0,0}; | |
3732 UChar32 char32 = 0; | |
3733 int32_t digVal = 0; | |
3734 | |
3735 if (U16_IS_TRAIL (ch)) { | |
3736 if (!collIter_bos(source)){ | |
3737 UChar lead = getPrevNormalizedChar(source, statu
s); | |
3738 if(U16_IS_LEAD(lead)) { | |
3739 char32 = U16_GET_SUPPLEMENTARY(lead,ch); | |
3740 goBackOne(source); | |
3741 } else { | |
3742 char32 = ch; | |
3743 } | |
3744 } else { | |
3745 char32 = ch; | |
3746 } | |
3747 } else { | |
3748 char32 = ch; | |
3749 } | |
3750 digVal = u_charDigitValue(char32); | |
3751 | |
3752 for(;;) { | |
3753 // Make sure we have enough space. No longer needed; | |
3754 // at this point the largest value of digIndx when w
e need to save data in numTempBuf | |
3755 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-
incremented) so we just ensure | |
3756 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FO
R_NUMBER/2 + 2). | |
3757 | |
3758 // Skip over trailing zeroes, and keep a count of th
em. | |
3759 if (digVal != 0) | |
3760 nonZeroValReached = TRUE; | |
3761 | |
3762 if (nonZeroValReached) { | |
3763 /* | |
3764 We parse the digit string into base 100 numbers
(this fits into a byte). | |
3765 We only add to the buffer in twos, thus if we ar
e parsing an odd character, | |
3766 that serves as the 'tens' digit while the if we
are parsing an even one, that | |
3767 is the 'ones' digit. We dumped the parsed base 1
00 value (collateVal) into | |
3768 a buffer. We multiply each collateVal by 2 (to g
ive us room) and add 5 (to avoid | |
3769 overlapping magic CE byte values). The last byte
we subtract 1 to ensure it is less | |
3770 than all the other bytes. | |
3771 | |
3772 Since we're doing in this reverse we want to put
the first digit encountered into the | |
3773 ones place and the second digit encountered into
the tens place. | |
3774 */ | |
3775 | |
3776 if ((digIndx + trailingZeroCount) % 2 == 1) { | |
3777 // High-order digit case (tens place) | |
3778 collateVal += (uint8_t)(digVal * 10); | |
3779 | |
3780 // We cannot set leadingZeroIndex unless it
has been set for the | |
3781 // low-order digit. Therefore, all we can do
for the high-order | |
3782 // digit is turn it off, never on. | |
3783 // The only time we will have a high digit w
ithout a low is for | |
3784 // the very first non-zero digit, so no zero
check is necessary. | |
3785 if (collateVal != 0) | |
3786 leadingZeroIndex = 0; | |
3787 | |
3788 // The first pass through, digIndx may excee
d the limit, but in that case | |
3789 // we no longer care about numTempBuf conten
ts since they will be discarded | |
3790 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER )
{ | |
3791 numTempBuf[(digIndx/2) + 2] = collateVal
*2 + 6; | |
3792 } | |
3793 collateVal = 0; | |
3794 } else { | |
3795 // Low-order digit case (ones place) | |
3796 collateVal = (uint8_t)digVal; | |
3797 | |
3798 // Check for leading zeroes. | |
3799 if (collateVal == 0) { | |
3800 if (!leadingZeroIndex) | |
3801 leadingZeroIndex = (digIndx/2) + 2; | |
3802 } else | |
3803 leadingZeroIndex = 0; | |
3804 | |
3805 // No need to write to buffer; the case of a
last odd digit | |
3806 // is handled below. | |
3807 } | |
3808 ++digIndx; | |
3809 } else | |
3810 ++trailingZeroCount; | |
3811 | |
3812 if (!collIter_bos(source)) { | |
3813 ch = getPrevNormalizedChar(source, status); | |
3814 //goBackOne(source); | |
3815 if (U16_IS_TRAIL(ch)) { | |
3816 backupState(source, &state); | |
3817 if (!collIter_bos(source)) { | |
3818 goBackOne(source); | |
3819 UChar lead = getPrevNormalizedChar(sourc
e, status); | |
3820 | |
3821 if(U16_IS_LEAD(lead)) { | |
3822 char32 = U16_GET_SUPPLEMENTARY(lead,
ch); | |
3823 } else { | |
3824 loadState(source, &state, FALSE); | |
3825 char32 = ch; | |
3826 } | |
3827 } | |
3828 } else | |
3829 char32 = ch; | |
3830 | |
3831 if ((digVal = u_charDigitValue(char32)) == -1 ||
(ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { | |
3832 if (char32 > 0xFFFF) {// For surrogates. | |
3833 loadState(source, &state, FALSE); | |
3834 } | |
3835 // Don't need to "reverse" the goBackOne cal
l, | |
3836 // as this points to the next position to pr
ocess.. | |
3837 //if (char32 > 0xFFFF) // For surrogates. | |
3838 //getNextNormalizedChar(source); | |
3839 break; | |
3840 } | |
3841 | |
3842 goBackOne(source); | |
3843 }else | |
3844 break; | |
3845 } | |
3846 | |
3847 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_N
UMBER) { | |
3848 // our collation element is not too big, go ahead an
d finish with it | |
3849 break; | |
3850 } | |
3851 // our digit string is too long for a collation element; | |
3852 // set the limit for it, reset the state and begin again | |
3853 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGIT
S_FOR_NUMBER; | |
3854 if ( ceLimit == 0 ) { | |
3855 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; | |
3856 } | |
3857 ch = initial_ch; | |
3858 loadState(source, &initialState, FALSE); | |
3859 digIndx = endIndex = leadingZeroIndex = trailingZeroCoun
t = 0; | |
3860 collateVal = 0; | |
3861 nonZeroValReached = FALSE; | |
3862 } | |
3863 | |
3864 if (! nonZeroValReached) { | |
3865 digIndx = 2; | |
3866 trailingZeroCount = 0; | |
3867 numTempBuf[2] = 6; | |
3868 } | |
3869 | |
3870 if ((digIndx + trailingZeroCount) % 2 != 0) { | |
3871 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; | |
3872 digIndx += 1; // The implicit leading zero | |
3873 } | |
3874 if (trailingZeroCount % 2 != 0) { | |
3875 // We had to consume one trailing zero for the low digit | |
3876 // of the least significant byte | |
3877 digIndx += 1; // The trailing zero not in the expo
nent | |
3878 trailingZeroCount -= 1; | |
3879 } | |
3880 | |
3881 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2
) + 2) ; | |
3882 | |
3883 // Subtract one off of the last byte. Really the first byte
here, but it's reversed... | |
3884 numTempBuf[2] -= 1; | |
3885 | |
3886 /* | |
3887 We want to skip over the first two slots in the buffer. The
first slot | |
3888 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The
second slot is for the | |
3889 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. | |
3890 The exponent must be adjusted by the number of leading zeroe
s, and the number of | |
3891 trailing zeroes. | |
3892 */ | |
3893 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; | |
3894 uint32_t exponent = (digIndx+trailingZeroCount)/2; | |
3895 if (leadingZeroIndex) | |
3896 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); | |
3897 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); | |
3898 | |
3899 // Now transfer the collation key to our collIterate struct. | |
3900 // The total size for our collation key is half of endIndex,
rounded up. | |
3901 int32_t size = (endIndex+1)/2; | |
3902 if(!ensureCEsCapacity(source, size)) { | |
3903 return (uint32_t)UCOL_NULLORDER; | |
3904 } | |
3905 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1])
<< UCOL_PRIMARYORDERSHIFT) | //Primary weight | |
3906 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Seco
ndary weight | |
3907 UCOL_BYTE_COMMON; // Tertiary weight. | |
3908 i = endIndex - 1; // Reset the index into the buffer. | |
3909 while(i >= 2) { | |
3910 uint32_t primWeight = numTempBuf[i--] << 8; | |
3911 if ( i >= 2) | |
3912 primWeight |= numTempBuf[i--]; | |
3913 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHI
FT) | UCOL_CONTINUATION_MARKER; | |
3914 } | |
3915 | |
3916 source->toReturn = source->CEpos -1; | |
3917 return *(source->toReturn); | |
3918 } else { | |
3919 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); | |
3920 CE = *(CEOffset++); | |
3921 break; | |
3922 } | |
3923 } | |
3924 | |
3925 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ | |
3926 { | |
3927 static const uint32_t | |
3928 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11
A7; | |
3929 //const uint32_t LCount = 19; | |
3930 static const uint32_t VCount = 21; | |
3931 static const uint32_t TCount = 28; | |
3932 //const uint32_t NCount = VCount * TCount; /* 588 */ | |
3933 //const uint32_t SCount = LCount * NCount; /* 11172 */ | |
3934 | |
3935 uint32_t L = ch - SBase; | |
3936 /* | |
3937 divide into pieces. | |
3938 we do it in this order since some compilers can do % and / in on
e | |
3939 operation | |
3940 */ | |
3941 uint32_t T = L % TCount; | |
3942 L /= TCount; | |
3943 uint32_t V = L % VCount; | |
3944 L /= VCount; | |
3945 | |
3946 /* offset them */ | |
3947 L += LBase; | |
3948 V += VBase; | |
3949 T += TBase; | |
3950 | |
3951 int32_t firstOffset = (int32_t)(source->pos - source->string); | |
3952 source->appendOffset(firstOffset, *status); | |
3953 | |
3954 /* | |
3955 * return the first CE, but first put the rest into the expansio
n buffer | |
3956 */ | |
3957 if (!source->coll->image->jamoSpecial) { | |
3958 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L
); | |
3959 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V
); | |
3960 source->appendOffset(firstOffset + 1, *status); | |
3961 | |
3962 if (T != TBase) { | |
3963 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mappin
g, T); | |
3964 source->appendOffset(firstOffset + 1, *status); | |
3965 } | |
3966 | |
3967 source->toReturn = source->CEpos - 1; | |
3968 | |
3969 source->offsetReturn = source->offsetStore - 1; | |
3970 if (source->offsetReturn == source->offsetBuffer) { | |
3971 source->offsetStore = source->offsetBuffer; | |
3972 } | |
3973 | |
3974 return *(source->toReturn); | |
3975 } else { | |
3976 // Since Hanguls pass the FCD check, it is | |
3977 // guaranteed that we won't be in | |
3978 // the normalization buffer if something like this happens | |
3979 | |
3980 // Move Jamos into normalization buffer | |
3981 UChar *tempbuffer = source->writableBuffer.getBuffer(5); | |
3982 int32_t tempbufferLength, jamoOffset; | |
3983 tempbuffer[0] = 0; | |
3984 tempbuffer[1] = (UChar)L; | |
3985 tempbuffer[2] = (UChar)V; | |
3986 if (T != TBase) { | |
3987 tempbuffer[3] = (UChar)T; | |
3988 tempbufferLength = 4; | |
3989 } else { | |
3990 tempbufferLength = 3; | |
3991 } | |
3992 source->writableBuffer.releaseBuffer(tempbufferLength); | |
3993 | |
3994 // Indicate where to continue in main input string after exh
austing the writableBuffer | |
3995 if (source->pos == source->string) { | |
3996 jamoOffset = 0; | |
3997 source->fcdPosition = NULL; | |
3998 } else { | |
3999 jamoOffset = source->pos - source->string; | |
4000 source->fcdPosition = source->pos-1; | |
4001 } | |
4002 | |
4003 // Append offsets for the additional chars | |
4004 // (not the 0, and not the L whose offsets match the origina
l Hangul) | |
4005 int32_t jamoRemaining = tempbufferLength - 2; | |
4006 jamoOffset++; // appended offsets should match end of origin
al Hangul | |
4007 while (jamoRemaining-- > 0) { | |
4008 source->appendOffset(jamoOffset, *status); | |
4009 } | |
4010 | |
4011 source->offsetRepeatValue = jamoOffset; | |
4012 | |
4013 source->offsetReturn = source->offsetStore - 1; | |
4014 if (source->offsetReturn == source->offsetBuffer) { | |
4015 source->offsetStore = source->offsetBuffer; | |
4016 } | |
4017 | |
4018 source->pos = source->writableBuffer.getTermin
atedBuffer() + tempbufferLength; | |
4019 source->origFlags = source->flags; | |
4020 source->flags |= UCOL_ITER_INNORMBUF; | |
4021 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HAS
LEN); | |
4022 | |
4023 return(UCOL_IGNORABLE); | |
4024 } | |
4025 } | |
4026 | |
4027 case IMPLICIT_TAG: /* everything that is not defined otherwise */ | |
4028 return getPrevImplicit(ch, source); | |
4029 | |
4030 // TODO: Remove CJK implicits as they are handled by the getImplicit
Primary function | |
4031 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/ | |
4032 return getPrevImplicit(ch, source); | |
4033 | |
4034 case SURROGATE_TAG: /* This is a surrogate pair */ | |
4035 /* essentially an engaged lead surrogate. */ | |
4036 /* if you have encountered it here, it means that a */ | |
4037 /* broken sequence was encountered and this is an error */ | |
4038 return UCOL_NOT_FOUND; | |
4039 | |
4040 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ | |
4041 return UCOL_NOT_FOUND; /* broken surrogate sequence */ | |
4042 | |
4043 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ | |
4044 { | |
4045 UChar32 cp = 0; | |
4046 UChar prevChar; | |
4047 const UChar *prev; | |
4048 if (isAtStartPrevIterate(source)) { | |
4049 /* we are at the start of the string, wrong place to be at *
/ | |
4050 return UCOL_NOT_FOUND; | |
4051 } | |
4052 if (source->pos != source->writableBuffer.getBuffer()) { | |
4053 prev = source->pos - 1; | |
4054 } else { | |
4055 prev = source->fcdPosition; | |
4056 } | |
4057 prevChar = *prev; | |
4058 | |
4059 /* Handles Han and Supplementary characters here.*/ | |
4060 if (U16_IS_LEAD(prevChar)) { | |
4061 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<
10UL)+0xdc00-0x10000)); | |
4062 source->pos = prev; | |
4063 } else { | |
4064 return UCOL_NOT_FOUND; /* like unassigned */ | |
4065 } | |
4066 | |
4067 return getPrevImplicit(cp, source); | |
4068 } | |
4069 | |
4070 /* UCA is filled with these. Tailorings are NOT_FOUND */ | |
4071 /* not yet implemented */ | |
4072 case CHARSET_TAG: /* this tag always returns */ | |
4073 /* probably after 1.8 */ | |
4074 return UCOL_NOT_FOUND; | |
4075 | |
4076 default: /* this tag always returns */ | |
4077 *status = U_INTERNAL_PROGRAM_ERROR; | |
4078 CE=0; | |
4079 break; | |
4080 } | |
4081 | |
4082 if (CE <= UCOL_NOT_FOUND) { | |
4083 break; | |
4084 } | |
4085 } | |
4086 | |
4087 return CE; | |
4088 } | |
4089 | |
4090 /* This should really be a macro
*/ | |
4091 /* This function is used to reverse parts of a buffer. We need this operation wh
en doing continuation */ | |
4092 /* secondaries in French
*/ | |
4093 /* | |
4094 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { | |
4095 uint8_t temp; | |
4096 while(start<end) { | |
4097 temp = *start; | |
4098 *start++ = *end; | |
4099 *end-- = temp; | |
4100 } | |
4101 } | |
4102 */ | |
4103 | |
/*
 * Reverses, in place, the elements of type TYPE in the inclusive range
 * [start, end].
 *
 * start and end must be modifiable pointer lvalues: the macro advances start
 * and retreats end while swapping, so both have changed when it finishes.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is exactly one statement and can be used safely as the
 * unbraced branch of an if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while (0)
4112 | |
4113 /****************************************************************************/ | |
4114 /* Following are the sortkey generation functions */ | |
4115 /* */ | |
4116 /****************************************************************************/ | |
4117 | |
4118 U_CAPI int32_t U_EXPORT2 | 113 U_CAPI int32_t U_EXPORT2 |
4119 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, | 114 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, |
4120 const uint8_t *src2, int32_t src2Length, | 115 const uint8_t *src2, int32_t src2Length, |
4121 uint8_t *dest, int32_t destCapacity) { | 116 uint8_t *dest, int32_t destCapacity) { |
4122 /* check arguments */ | 117 /* check arguments */ |
4123 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr
c1Length-1]!=0) || | 118 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[sr
c1Length-1]!=0) || |
4124 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr
c2Length-1]!=0) || | 119 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[sr
c2Length-1]!=0) || |
4125 destCapacity<0 || (destCapacity>0 && dest==NULL) | 120 destCapacity<0 || (destCapacity>0 && dest==NULL) |
4126 ) { | 121 ) { |
4127 /* error, attempt to write a zero byte and return 0 */ | 122 /* error, attempt to write a zero byte and return 0 */ |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4183 /* src1 is not finished, therefore *src2==0, and src1 is appended */ | 178 /* src1 is not finished, therefore *src2==0, and src1 is appended */ |
4184 src2=src1; | 179 src2=src1; |
4185 } | 180 } |
4186 /* append src2, "the other, unfinished sort key" */ | 181 /* append src2, "the other, unfinished sort key" */ |
4187 while((*p++=*src2++)!=0) {} | 182 while((*p++=*src2++)!=0) {} |
4188 | 183 |
4189 /* the actual length might be less than destLength if either sort key contai
ned illegally embedded zero bytes */ | 184 /* the actual length might be less than destLength if either sort key contai
ned illegally embedded zero bytes */ |
4190 return (int32_t)(p-dest); | 185 return (int32_t)(p-dest); |
4191 } | 186 } |
4192 | 187 |
4193 U_NAMESPACE_BEGIN | |
4194 | |
4195 class SortKeyByteSink : public ByteSink { | |
4196 public: | |
4197 SortKeyByteSink(char *dest, int32_t destCapacity) | |
4198 : buffer_(dest), capacity_(destCapacity), | |
4199 appended_(0) { | |
4200 if (buffer_ == NULL) { | |
4201 capacity_ = 0; | |
4202 } else if(capacity_ < 0) { | |
4203 buffer_ = NULL; | |
4204 capacity_ = 0; | |
4205 } | |
4206 } | |
4207 virtual ~SortKeyByteSink(); | |
4208 | |
4209 virtual void Append(const char *bytes, int32_t n); | |
4210 void Append(uint32_t b) { | |
4211 if (appended_ < capacity_ || Resize(1, appended_)) { | |
4212 buffer_[appended_] = (char)b; | |
4213 } | |
4214 ++appended_; | |
4215 } | |
4216 void Append(uint32_t b1, uint32_t b2) { | |
4217 int32_t a2 = appended_ + 2; | |
4218 if (a2 <= capacity_ || Resize(2, appended_)) { | |
4219 buffer_[appended_] = (char)b1; | |
4220 buffer_[appended_ + 1] = (char)b2; | |
4221 } else if(appended_ < capacity_) { | |
4222 buffer_[appended_] = (char)b1; | |
4223 } | |
4224 appended_ = a2; | |
4225 } | |
4226 virtual char *GetAppendBuffer(int32_t min_capacity, | |
4227 int32_t desired_capacity_hint, | |
4228 char *scratch, int32_t scratch_capacity, | |
4229 int32_t *result_capacity); | |
4230 int32_t NumberOfBytesAppended() const { return appended_; } | |
4231 /** @return FALSE if memory allocation failed */ | |
4232 UBool IsOk() const { return buffer_ != NULL; } | |
4233 | |
4234 protected: | |
4235 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th) = 0; | |
4236 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; | |
4237 | |
4238 void SetNotOk() { | |
4239 buffer_ = NULL; | |
4240 capacity_ = 0; | |
4241 } | |
4242 | |
4243 char *buffer_; | |
4244 int32_t capacity_; | |
4245 int32_t appended_; | |
4246 | |
4247 private: | |
4248 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemente
d | |
4249 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator
not implemented | |
4250 }; | |
4251 | |
4252 SortKeyByteSink::~SortKeyByteSink() {} | |
4253 | |
4254 void | |
4255 SortKeyByteSink::Append(const char *bytes, int32_t n) { | |
4256 if (n <= 0 || bytes == NULL) { | |
4257 return; | |
4258 } | |
4259 int32_t length = appended_; | |
4260 appended_ += n; | |
4261 if ((buffer_ + length) == bytes) { | |
4262 return; // the caller used GetAppendBuffer() and wrote the bytes alread
y | |
4263 } | |
4264 int32_t available = capacity_ - length; | |
4265 if (n <= available) { | |
4266 uprv_memcpy(buffer_ + length, bytes, n); | |
4267 } else { | |
4268 AppendBeyondCapacity(bytes, n, length); | |
4269 } | |
4270 } | |
4271 | |
4272 char * | |
4273 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, | |
4274 int32_t desired_capacity_hint, | |
4275 char *scratch, | |
4276 int32_t scratch_capacity, | |
4277 int32_t *result_capacity) { | |
4278 if (min_capacity < 1 || scratch_capacity < min_capacity) { | |
4279 *result_capacity = 0; | |
4280 return NULL; | |
4281 } | |
4282 int32_t available = capacity_ - appended_; | |
4283 if (available >= min_capacity) { | |
4284 *result_capacity = available; | |
4285 return buffer_ + appended_; | |
4286 } else if (Resize(desired_capacity_hint, appended_)) { | |
4287 *result_capacity = capacity_ - appended_; | |
4288 return buffer_ + appended_; | |
4289 } else { | |
4290 *result_capacity = scratch_capacity; | |
4291 return scratch; | |
4292 } | |
4293 } | |
4294 | |
4295 class FixedSortKeyByteSink : public SortKeyByteSink { | |
4296 public: | |
4297 FixedSortKeyByteSink(char *dest, int32_t destCapacity) | |
4298 : SortKeyByteSink(dest, destCapacity) {} | |
4299 virtual ~FixedSortKeyByteSink(); | |
4300 | |
4301 private: | |
4302 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th); | |
4303 virtual UBool Resize(int32_t appendCapacity, int32_t length); | |
4304 }; | |
4305 | |
4306 FixedSortKeyByteSink::~FixedSortKeyByteSink() {} | |
4307 | |
4308 void | |
4309 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int
32_t length) { | |
4310 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ | |
4311 // Fill the buffer completely. | |
4312 int32_t available = capacity_ - length; | |
4313 if (available > 0) { | |
4314 uprv_memcpy(buffer_ + length, bytes, available); | |
4315 } | |
4316 } | |
4317 | |
4318 UBool | |
4319 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { | |
4320 return FALSE; | |
4321 } | |
4322 | |
4323 class CollationKeyByteSink : public SortKeyByteSink { | |
4324 public: | |
4325 CollationKeyByteSink(CollationKey &key) | |
4326 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getC
apacity()), | |
4327 key_(key) {} | |
4328 virtual ~CollationKeyByteSink(); | |
4329 | |
4330 private: | |
4331 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng
th); | |
4332 virtual UBool Resize(int32_t appendCapacity, int32_t length); | |
4333 | |
4334 CollationKey &key_; | |
4335 }; | |
4336 | |
4337 CollationKeyByteSink::~CollationKeyByteSink() {} | |
4338 | |
4339 void | |
4340 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t
length) { | |
4341 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ | |
4342 if (Resize(n, length)) { | |
4343 uprv_memcpy(buffer_ + length, bytes, n); | |
4344 } | |
4345 } | |
4346 | |
4347 UBool | |
4348 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { | |
4349 if (buffer_ == NULL) { | |
4350 return FALSE; // allocation failed before already | |
4351 } | |
4352 int32_t newCapacity = 2 * capacity_; | |
4353 int32_t altCapacity = length + 2 * appendCapacity; | |
4354 if (newCapacity < altCapacity) { | |
4355 newCapacity = altCapacity; | |
4356 } | |
4357 if (newCapacity < 200) { | |
4358 newCapacity = 200; | |
4359 } | |
4360 uint8_t *newBuffer = key_.reallocate(newCapacity, length); | |
4361 if (newBuffer == NULL) { | |
4362 SetNotOk(); | |
4363 return FALSE; | |
4364 } | |
4365 buffer_ = reinterpret_cast<char *>(newBuffer); | |
4366 capacity_ = newCapacity; | |
4367 return TRUE; | |
4368 } | |
4369 | |
4370 /** | |
4371 * uint8_t byte buffer, similar to CharString but simpler. | |
4372 */ | |
4373 class SortKeyLevel : public UMemory { | |
4374 public: | |
4375 SortKeyLevel() : len(0), ok(TRUE) {} | |
4376 ~SortKeyLevel() {} | |
4377 | |
4378 /** @return FALSE if memory allocation failed */ | |
4379 UBool isOk() const { return ok; } | |
4380 UBool isEmpty() const { return len == 0; } | |
4381 int32_t length() const { return len; } | |
4382 const uint8_t *data() const { return buffer.getAlias(); } | |
4383 uint8_t operator[](int32_t index) const { return buffer[index]; } | |
4384 | |
4385 void appendByte(uint32_t b); | |
4386 | |
4387 void appendTo(ByteSink &sink) const { | |
4388 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); | |
4389 } | |
4390 | |
4391 uint8_t &lastByte() { | |
4392 U_ASSERT(len > 0); | |
4393 return buffer[len - 1]; | |
4394 } | |
4395 | |
4396 uint8_t *getLastFewBytes(int32_t n) { | |
4397 if (ok && len >= n) { | |
4398 return buffer.getAlias() + len - n; | |
4399 } else { | |
4400 return NULL; | |
4401 } | |
4402 } | |
4403 | |
4404 private: | |
4405 MaybeStackArray<uint8_t, 40> buffer; | |
4406 int32_t len; | |
4407 UBool ok; | |
4408 | |
4409 UBool ensureCapacity(int32_t appendCapacity); | |
4410 | |
4411 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class | |
4412 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of thi
s class | |
4413 }; | |
4414 | |
4415 void SortKeyLevel::appendByte(uint32_t b) { | |
4416 if(len < buffer.getCapacity() || ensureCapacity(1)) { | |
4417 buffer[len++] = (uint8_t)b; | |
4418 } | |
4419 } | |
4420 | |
4421 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { | |
4422 if(!ok) { | |
4423 return FALSE; | |
4424 } | |
4425 int32_t newCapacity = 2 * buffer.getCapacity(); | |
4426 int32_t altCapacity = len + 2 * appendCapacity; | |
4427 if (newCapacity < altCapacity) { | |
4428 newCapacity = altCapacity; | |
4429 } | |
4430 if (newCapacity < 200) { | |
4431 newCapacity = 200; | |
4432 } | |
4433 if(buffer.resize(newCapacity, len)==NULL) { | |
4434 return ok = FALSE; | |
4435 } | |
4436 return TRUE; | |
4437 } | |
4438 | |
4439 U_NAMESPACE_END | |
4440 | |
4441 /* sortkey API */ | |
/*
 * ucol_getSortKey: writes the sort key for source into result and returns the key length.
 *
 * This hunk shows two implementations side by side (old column, new column).
 *
 * Old column: if coll->delegate is set, the call is forwarded to that Collator.
 * Otherwise, for a non-NULL source, coll->sortKeyGen is run into a
 * FixedSortKeyByteSink over result, and the number of bytes appended is returned
 * if the status is a success. A NULL result is replaced by a dummy array with
 * capacity 0 so that pure preflighting is not mistaken for an allocation error.
 * The return value stays 0 for a NULL source or on failure.
 *
 * New column: the call is forwarded to Collator::fromUCollator(coll)->getSortKey()
 * and the returned size is traced with UTRACE_EXIT_VALUE.
 */
4442 U_CAPI int32_t U_EXPORT2 | 188 U_CAPI int32_t U_EXPORT2 |
4443 ucol_getSortKey(const UCollator *coll, | 189 ucol_getSortKey(const UCollator *coll, |
4444 const UChar *source, | 190 const UChar *source, |
4445 int32_t sourceLength, | 191 int32_t sourceLength, |
4446 uint8_t *result, | 192 uint8_t *result, |
4447 int32_t resultLength) | 193 int32_t resultLength) |
4448 { | 194 { |
4449 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); | 195 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); |
4450 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 196 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
4451 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour
ce, | 197 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, sour
ce, |
4452 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt
h)); | 198 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLengt
h)); |
4453 } | 199 } |
4454 | 200 |
4455 if(coll->delegate != NULL) { | 201 int32_t keySize = Collator::fromUCollator(coll)-> |
4456 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength,
result, resultLength); | 202 getSortKey(source, sourceLength, result, resultLength); |
4457 } | 203 |
4458 | |
4459 UErrorCode status = U_ZERO_ERROR; | |
4460 int32_t keySize = 0; | |
4461 | |
4462 if(source != NULL) { | |
4463 // source == NULL is actually an error situation, but we would need to | |
4464 // have an error code to return it. Until we introduce a new | |
4465 // API, it stays like this | |
4466 | |
4467 /* this uses the function pointer that is set in updateinternalstate */ | |
4468 /* currently, there are two funcs: */ | |
4469 /*ucol_calcSortKey(...);*/ | |
4470 /*ucol_calcSortKeySimpleTertiary(...);*/ | |
4471 | |
4472 uint8_t noDest[1] = { 0 }; | |
4473 if(result == NULL) { | |
4474 // Distinguish pure preflighting from an allocation error. | |
4475 result = noDest; | |
4476 resultLength = 0; | |
4477 } | |
4478 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength
); | |
4479 coll->sortKeyGen(coll, source, sourceLength, sink, &status); | |
4480 if(U_SUCCESS(status)) { | |
4481 keySize = sink.NumberOfBytesAppended(); | |
4482 } | |
4483 } | |
4484 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); | 204 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); |
4485 UTRACE_EXIT_STATUS(status); | 205 UTRACE_EXIT_VALUE(keySize); |
4486 return keySize; | 206 return keySize; |
4487 } | 207 } |
4488 | 208 |
4489 U_CFUNC int32_t | |
4490 ucol_getCollationKey(const UCollator *coll, | |
4491 const UChar *source, int32_t sourceLength, | |
4492 CollationKey &key, | |
4493 UErrorCode &errorCode) { | |
4494 CollationKeyByteSink sink(key); | |
4495 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); | |
4496 return sink.NumberOfBytesAppended(); | |
4497 } | |
4498 | |
4499 // Is this primary weight compressible? | |
4500 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). | |
4501 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. | |
4502 static inline UBool | |
4503 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { | |
4504 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegul
arPrimary; | |
4505 } | |
4506 | |
4507 static | |
4508 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { | |
4509 if (caseShift == 0) { | |
4510 cases.appendByte(UCOL_CASE_BYTE_START); | |
4511 caseShift = UCOL_CASE_SHIFT_START; | |
4512 } | |
4513 } | |
4514 | |
4515 // Packs the secondary buffer when processing French locale. | |
4516 static void | |
4517 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result)
{ | |
4518 secondaries += secsize; // We read the secondary-level bytes back to front. | |
4519 uint8_t secondary; | |
4520 int32_t count2 = 0; | |
4521 int32_t i = 0; | |
4522 // we use i here since the key size already accounts for terminators, so we'
ll discard the increment | |
4523 for(i = 0; i<secsize; i++) { | |
4524 secondary = *(secondaries-i-1); | |
4525 /* This is compression code. */ | |
4526 if (secondary == UCOL_COMMON2) { | |
4527 ++count2; | |
4528 } else { | |
4529 if (count2 > 0) { | |
4530 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. | |
4531 while (count2 > UCOL_TOP_COUNT2) { | |
4532 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); | |
4533 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
4534 } | |
4535 result.Append(UCOL_COMMON_TOP2 - (count2-1)); | |
4536 } else { | |
4537 while (count2 > UCOL_BOT_COUNT2) { | |
4538 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
4539 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4540 } | |
4541 result.Append(UCOL_COMMON_BOT2 + (count2-1)); | |
4542 } | |
4543 count2 = 0; | |
4544 } | |
4545 result.Append(secondary); | |
4546 } | |
4547 } | |
4548 if (count2 > 0) { | |
4549 while (count2 > UCOL_BOT_COUNT2) { | |
4550 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
4551 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4552 } | |
4553 result.Append(UCOL_COMMON_BOT2 + (count2-1)); | |
4554 } | |
4555 } | |
4556 | |
4557 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 | |
4558 | |
4559 /* This is the sortkey work horse function */ | |
/*
 * Builds the sort key for source with the collator coll and appends it to result.
 *
 * coll          collator whose attributes (strength, caseLevel, frenchCollation,
 *               alternateHandling, hiraganaQ, normalizationMode, ...) are read here
 * source        input text
 * sourceLength  length of source, or -1 to have the length taken with u_strlen()
 * result        sink that receives the key bytes
 * status        in/out error code; nothing is done if it already indicates a failure.
 *               Set to U_MEMORY_ALLOCATION_ERROR at the end if a level buffer or
 *               the sink is not ok and no other error has been recorded.
 */
4560 U_CFUNC void U_CALLCONV | |
4561 ucol_calcSortKey(const UCollator *coll, | |
4562 const UChar *source, | |
4563 int32_t sourceLength, | |
4564 SortKeyByteSink &result, | |
4565 UErrorCode *status) | |
4566 { | |
4567 if(U_FAILURE(*status)) { | |
4568 return; | |
4569 } | |
4570 | |
/* Primary bytes go straight to the output sink; the other levels are collected */
/* in their own buffers and appended to the sink after the main loop. */
4571 SortKeyByteSink &primaries = result; | |
4572 SortKeyLevel secondaries; | |
4573 SortKeyLevel tertiaries; | |
4574 SortKeyLevel cases; | |
4575 SortKeyLevel quads; | |
4576 | |
4577 UnicodeString normSource; | |
4578 | |
4579 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); | |
4580 | |
4581 UColAttributeValue strength = coll->strength; | |
4582 | |
/* Each compareXxx is 0 if that level is part of the key and 0xFF if not; */
/* a test such as "secondary > compareSec" can then never succeed for a level that is off. */
4583 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); | |
4584 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); | |
4585 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); | |
4586 UBool compareIdent = (strength == UCOL_IDENTICAL); | |
4587 UBool doCase = (coll->caseLevel == UCOL_ON); | |
4588 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0)
; | |
4589 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
4590 //UBool qShifted = shifted && (compareQuad == 0); | |
4591 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); | |
4592 | |
4593 uint32_t variableTopValue = coll->variableTopValue; | |
4594 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no | |
4595 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. | |
4596 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); | |
4597 uint8_t UCOL_HIRAGANA_QUAD = 0; | |
4598 if(doHiragana) { | |
4599 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; | |
4600 /* allocate one more space for hiragana, value for hiragana */ | |
4601 } | |
4602 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); | |
4603 | |
4604 /* support for special features like caselevel and funky secondaries */ | |
4605 int32_t lastSecondaryLength = 0; | |
4606 uint32_t caseShift = 0; | |
4607 | |
4608 /* If we need to normalize, we'll do it all at once at the beginning! */ | |
4609 const Normalizer2 *norm2; | |
4610 if(compareIdent) { | |
4611 norm2 = Normalizer2Factory::getNFDInstance(*status); | |
4612 } else if(coll->normalizationMode != UCOL_OFF) { | |
4613 norm2 = Normalizer2Factory::getFCDInstance(*status); | |
4614 } else { | |
4615 norm2 = NULL; | |
4616 } | |
4617 if(norm2 != NULL) { | |
4618 normSource.setTo(FALSE, source, len); | |
4619 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); | |
4620 if(qcYesLength != len) { | |
4621 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); | |
4622 normSource.truncate(qcYesLength); | |
4623 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); | |
4624 source = normSource.getBuffer(); | |
4625 len = normSource.length(); | |
4626 } | |
4627 } | |
4628 collIterate s; | |
4629 IInit_collIterate(coll, source, len, &s, status); | |
4630 if(U_FAILURE(*status)) { | |
4631 return; | |
4632 } | |
4633 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. | |
4634 | |
4635 uint32_t order = 0; | |
4636 | |
4637 uint8_t primary1 = 0; | |
4638 uint8_t primary2 = 0; | |
4639 uint8_t secondary = 0; | |
4640 uint8_t tertiary = 0; | |
4641 uint8_t caseSwitch = coll->caseSwitch; | |
4642 uint8_t tertiaryMask = coll->tertiaryMask; | |
4643 int8_t tertiaryAddition = coll->tertiaryAddition; | |
4644 uint8_t tertiaryTop = coll->tertiaryTop; | |
4645 uint8_t tertiaryBottom = coll->tertiaryBottom; | |
4646 uint8_t tertiaryCommon = coll->tertiaryCommon; | |
4647 uint8_t caseBits = 0; | |
4648 | |
4649 UBool wasShifted = FALSE; | |
4650 UBool notIsContinuation = FALSE; | |
4651 | |
4652 uint32_t count2 = 0, count3 = 0, count4 = 0; | |
4653 uint8_t leadPrimary = 0; | |
4654 | |
/* Main loop: fetch collation elements until UCOL_NO_MORE_CES and distribute their */
/* weights to the levels; collation elements that are 0 are skipped. */
4655 for(;;) { | |
4656 order = ucol_IGetNextCE(coll, &s, status); | |
4657 if(order == UCOL_NO_MORE_CES) { | |
4658 break; | |
4659 } | |
4660 | |
4661 if(order == 0) { | |
4662 continue; | |
4663 } | |
4664 | |
4665 notIsContinuation = !isContinuation(order); | |
4666 | |
4667 if(notIsContinuation) { | |
4668 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); | |
4669 } else { | |
4670 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
4671 } | |
4672 | |
/* Note: order is shifted in place by the next two statements. Afterwards it holds */
/* only the upper 16 bits of the collation element (the two primary bytes), and that */
/* shifted value is what is compared with variableTopValue below. */
4673 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
4674 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
4675 primary1 = (uint8_t)(order >> 8); | |
4676 | |
4677 uint8_t originalPrimary1 = primary1; | |
4678 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { | |
4679 primary1 = coll->leadBytePermutationTable[primary1]; | |
4680 } | |
4681 | |
4682 if((shifted && ((notIsContinuation && order <= variableTopValue && prima
ry1 > 0) | |
4683 || (!notIsContinuation && wasShifted))) | |
4684 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that
primary ignorables */ | |
4685 { | |
4686 /* and other ignorables should be removed if following a shifted cod
e point */ | |
4687 if(primary1 == 0) { /* if we were shifted and we got an ignorable co
de point */ | |
4688 /* we should just completely ignore it */ | |
4689 continue; | |
4690 } | |
4691 if(compareQuad == 0) { | |
4692 if(count4 > 0) { | |
4693 while (count4 > UCOL_BOT_COUNT4) { | |
4694 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
4695 count4 -= UCOL_BOT_COUNT4; | |
4696 } | |
4697 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
4698 count4 = 0; | |
4699 } | |
4700 /* We are dealing with a variable and we're treating them as shi
fted */ | |
4701 /* This is a shifted ignorable */ | |
4702 if(primary1 != 0) { /* we need to check this since we could be i
n continuation */ | |
4703 quads.appendByte(primary1); | |
4704 } | |
4705 if(primary2 != 0) { | |
4706 quads.appendByte(primary2); | |
4707 } | |
4708 } | |
4709 wasShifted = TRUE; | |
4710 } else { | |
4711 wasShifted = FALSE; | |
4712 /* Note: This code assumes that the table is well built i.e. not hav
ing 0 bytes where they are not supposed to be. */ | |
4713 /* Usually, we'll have non-zero primary1 & primary2, except in cases
of a-z and friends, when primary2 will */ | |
4714 /* regular and simple sortkey calc */ | |
4715 if(primary1 != UCOL_IGNORABLE) { | |
4716 if(notIsContinuation) { | |
4717 if(leadPrimary == primary1) { | |
4718 primaries.Append(primary2); | |
4719 } else { | |
4720 if(leadPrimary != 0) { | |
4721 primaries.Append((primary1 > leadPrimary) ? UCOL_BYT
E_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
4722 } | |
4723 if(primary2 == UCOL_IGNORABLE) { | |
4724 /* one byter, not compressed */ | |
4725 primaries.Append(primary1); | |
4726 leadPrimary = 0; | |
4727 } else if(isCompressible(coll, originalPrimary1)) { | |
4728 /* compress */ | |
4729 primaries.Append(leadPrimary = primary1, primary2); | |
4730 } else { | |
4731 leadPrimary = 0; | |
4732 primaries.Append(primary1, primary2); | |
4733 } | |
4734 } | |
4735 } else { /* we are in continuation, so we're gonna add primary t
o the key don't care about compression */ | |
4736 if(primary2 == UCOL_IGNORABLE) { | |
4737 primaries.Append(primary1); | |
4738 } else { | |
4739 primaries.Append(primary1, primary2); | |
4740 } | |
4741 } | |
4742 } | |
4743 | |
4744 if(secondary > compareSec) { | |
4745 if(!isFrenchSec) { | |
4746 /* This is compression code. */ | |
4747 if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
4748 ++count2; | |
4749 } else { | |
4750 if (count2 > 0) { | |
4751 if (secondary > UCOL_COMMON2) { // not necessary for
4th level. | |
4752 while (count2 > UCOL_TOP_COUNT2) { | |
4753 secondaries.appendByte(UCOL_COMMON_TOP2 - UC
OL_TOP_COUNT2); | |
4754 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
4755 } | |
4756 secondaries.appendByte(UCOL_COMMON_TOP2 - (count
2-1)); | |
4757 } else { | |
4758 while (count2 > UCOL_BOT_COUNT2) { | |
4759 secondaries.appendByte(UCOL_COMMON_BOT2 + UC
OL_BOT_COUNT2); | |
4760 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4761 } | |
4762 secondaries.appendByte(UCOL_COMMON_BOT2 + (count
2-1)); | |
4763 } | |
4764 count2 = 0; | |
4765 } | |
4766 secondaries.appendByte(secondary); | |
4767 } | |
4768 } else { | |
4769 /* Do the special handling for French secondaries */ | |
4770 /* We need to get continuation elements and do intermediate
restore */ | |
4771 /* abc1c2c3de with french secondaries need to be edc1c2c3ba
NOT edc3c2c1ba */ | |
4772 if(notIsContinuation) { | |
4773 if (lastSecondaryLength > 1) { | |
4774 uint8_t *frenchStartPtr = secondaries.getLastFewByte
s(lastSecondaryLength); | |
4775 if (frenchStartPtr != NULL) { | |
4776 /* reverse secondaries from frenchStartPtr up to
frenchEndPtr */ | |
4777 uint8_t *frenchEndPtr = frenchStartPtr + lastSec
ondaryLength - 1; | |
4778 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr); | |
4779 } | |
4780 } | |
4781 lastSecondaryLength = 1; | |
4782 } else { | |
4783 ++lastSecondaryLength; | |
4784 } | |
4785 secondaries.appendByte(secondary); | |
4786 } | |
4787 } | |
4788 | |
4789 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { | |
4790 // do the case level if we need to do it. We don't want to calcu
late | |
4791 // case level for primary ignorables if we have only primary str
ength and case level | |
4792 // otherwise we would break well formedness of CEs | |
4793 doCaseShift(cases, caseShift); | |
4794 if(notIsContinuation) { | |
4795 caseBits = (uint8_t)(tertiary & 0xC0); | |
4796 | |
4797 if(tertiary != 0) { | |
4798 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
4799 if((caseBits & 0xC0) == 0) { | |
4800 cases.lastByte() |= 1 << (--caseShift); | |
4801 } else { | |
4802 cases.lastByte() |= 0 << (--caseShift); | |
4803 /* second bit */ | |
4804 doCaseShift(cases, caseShift); | |
4805 cases.lastByte() |= ((caseBits>>6)&1) << (--case
Shift); | |
4806 } | |
4807 } else { | |
4808 if((caseBits & 0xC0) == 0) { | |
4809 cases.lastByte() |= 0 << (--caseShift); | |
4810 } else { | |
4811 cases.lastByte() |= 1 << (--caseShift); | |
4812 /* second bit */ | |
4813 doCaseShift(cases, caseShift); | |
4814 cases.lastByte() |= ((caseBits>>7)&1) << (--case
Shift); | |
4815 } | |
4816 } | |
4817 } | |
4818 } | |
4819 } else { | |
4820 if(notIsContinuation) { | |
4821 tertiary ^= caseSwitch; | |
4822 } | |
4823 } | |
4824 | |
4825 tertiary &= tertiaryMask; | |
4826 if(tertiary > compareTer) { | |
4827 /* This is compression code. */ | |
4828 /* sequence size check is included in the if clause */ | |
4829 if (tertiary == tertiaryCommon && notIsContinuation) { | |
4830 ++count3; | |
4831 } else { | |
4832 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMO
N3_NORMAL) { | |
4833 tertiary += tertiaryAddition; | |
4834 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UC
OL_COMMON3_UPPERFIRST) { | |
4835 tertiary -= tertiaryAddition; | |
4836 } | |
4837 if (count3 > 0) { | |
4838 if ((tertiary > tertiaryCommon)) { | |
4839 while (count3 > coll->tertiaryTopCount) { | |
4840 tertiaries.appendByte(tertiaryTop - coll->tertia
ryTopCount); | |
4841 count3 -= (uint32_t)coll->tertiaryTopCount; | |
4842 } | |
4843 tertiaries.appendByte(tertiaryTop - (count3-1)); | |
4844 } else { | |
4845 while (count3 > coll->tertiaryBottomCount) { | |
4846 tertiaries.appendByte(tertiaryBottom + coll->ter
tiaryBottomCount); | |
4847 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
4848 } | |
4849 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
4850 } | |
4851 count3 = 0; | |
4852 } | |
4853 tertiaries.appendByte(tertiary); | |
4854 } | |
4855 } | |
4856 | |
4857 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { | |
4858 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we ne
ed to note it | |
4859 if(count4>0) { // Close this part | |
4860 while (count4 > UCOL_BOT_COUNT4) { | |
4861 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)
; | |
4862 count4 -= UCOL_BOT_COUNT4; | |
4863 } | |
4864 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
4865 count4 = 0; | |
4866 } | |
4867 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana | |
4868 } else { // This wasn't Hiragana, so we can continue adding stuf
f | |
4869 count4++; | |
4870 } | |
4871 } | |
4872 } | |
4873 } | |
4874 | |
4875 /* Here, we are generally done with processing */ | |
4876 /* bailing out would not be too productive */ | |
4877 | |
/* Assemble the key: after the primary bytes that are already in the sink, each */
/* enabled level is written after a UCOL_LEVELTERMINATOR, in the order secondary, */
/* case, tertiary, quaternary, identical. The quaternary and identical levels are */
/* written only inside the tertiary branch. A final 0 byte ends the key. */
4878 UBool ok = TRUE; | |
4879 if(U_SUCCESS(*status)) { | |
4880 /* we have done all the CE's, now let's put them together to form a key
*/ | |
4881 if(compareSec == 0) { | |
4882 if (count2 > 0) { | |
4883 while (count2 > UCOL_BOT_COUNT2) { | |
4884 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
4885 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
4886 } | |
4887 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
4888 } | |
4889 result.Append(UCOL_LEVELTERMINATOR); | |
4890 if(!secondaries.isOk()) { | |
4891 ok = FALSE; | |
4892 } else if(!isFrenchSec) { | |
4893 secondaries.appendTo(result); | |
4894 } else { | |
4895 // If there are any unresolved continuation secondaries, | |
4896 // reverse them here so that we can reverse the whole secondary
thing. | |
4897 if (lastSecondaryLength > 1) { | |
4898 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSe
condaryLength); | |
4899 if (frenchStartPtr != NULL) { | |
4900 /* reverse secondaries from frenchStartPtr up to frenchE
ndPtr */ | |
4901 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLe
ngth - 1; | |
4902 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, french
EndPtr); | |
4903 } | |
4904 } | |
4905 packFrench(secondaries.data(), secondaries.length(), result); | |
4906 } | |
4907 } | |
4908 | |
4909 if(doCase) { | |
4910 ok &= cases.isOk(); | |
4911 result.Append(UCOL_LEVELTERMINATOR); | |
4912 cases.appendTo(result); | |
4913 } | |
4914 | |
4915 if(compareTer == 0) { | |
4916 if (count3 > 0) { | |
4917 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { | |
/* NOTE(review): this branch uses ">=" and "tertiaryTop - count3", unlike the other */
/* run-length flushes in this function, which use ">" and "(count - 1)". */
/* Presumably intentional; confirm before changing. */
4918 while (count3 >= coll->tertiaryTopCount) { | |
4919 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCou
nt); | |
4920 count3 -= (uint32_t)coll->tertiaryTopCount; | |
4921 } | |
4922 tertiaries.appendByte(tertiaryTop - count3); | |
4923 } else { | |
4924 while (count3 > coll->tertiaryBottomCount) { | |
4925 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBot
tomCount); | |
4926 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
4927 } | |
4928 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
4929 } | |
4930 } | |
4931 ok &= tertiaries.isOk(); | |
4932 result.Append(UCOL_LEVELTERMINATOR); | |
4933 tertiaries.appendTo(result); | |
4934 | |
4935 if(compareQuad == 0/*qShifted == TRUE*/) { | |
4936 if(count4 > 0) { | |
4937 while (count4 > UCOL_BOT_COUNT4) { | |
4938 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); | |
4939 count4 -= UCOL_BOT_COUNT4; | |
4940 } | |
4941 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); | |
4942 } | |
4943 ok &= quads.isOk(); | |
4944 result.Append(UCOL_LEVELTERMINATOR); | |
4945 quads.appendTo(result); | |
4946 } | |
4947 | |
4948 if(compareIdent) { | |
4949 result.Append(UCOL_LEVELTERMINATOR); | |
4950 u_writeIdenticalLevelRun(s.string, len, result); | |
4951 } | |
4952 } | |
4953 result.Append(0); | |
4954 } | |
4955 | |
4956 /* To avoid memory leak, free the offset buffer if necessary. */ | |
4957 ucol_freeOffsetBuffer(&s); | |
4958 | |
4959 ok &= result.IsOk(); | |
4960 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } | |
4961 } | |
4962 | |
4963 | |
4964 U_CFUNC void U_CALLCONV | |
4965 ucol_calcSortKeySimpleTertiary(const UCollator *coll, | |
4966 const UChar *source, | |
4967 int32_t sourceLength, | |
4968 SortKeyByteSink &result, | |
4969 UErrorCode *status) | |
4970 { | |
4971 U_ALIGN_CODE(16); | |
4972 | |
4973 if(U_FAILURE(*status)) { | |
4974 return; | |
4975 } | |
4976 | |
4977 SortKeyByteSink &primaries = result; | |
4978 SortKeyLevel secondaries; | |
4979 SortKeyLevel tertiaries; | |
4980 | |
4981 UnicodeString normSource; | |
4982 | |
4983 int32_t len = sourceLength; | |
4984 | |
4985 /* If we need to normalize, we'll do it all at once at the beginning! */ | |
4986 if(coll->normalizationMode != UCOL_OFF) { | |
4987 normSource.setTo(len < 0, source, len); | |
4988 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); | |
4989 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); | |
4990 if(qcYesLength != normSource.length()) { | |
4991 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); | |
4992 normSource.truncate(qcYesLength); | |
4993 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); | |
4994 source = normSource.getBuffer(); | |
4995 len = normSource.length(); | |
4996 } | |
4997 } | |
4998 collIterate s; | |
4999 IInit_collIterate(coll, (UChar *)source, len, &s, status); | |
5000 if(U_FAILURE(*status)) { | |
5001 return; | |
5002 } | |
5003 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was norma
lized. | |
5004 | |
5005 uint32_t order = 0; | |
5006 | |
5007 uint8_t primary1 = 0; | |
5008 uint8_t primary2 = 0; | |
5009 uint8_t secondary = 0; | |
5010 uint8_t tertiary = 0; | |
5011 uint8_t caseSwitch = coll->caseSwitch; | |
5012 uint8_t tertiaryMask = coll->tertiaryMask; | |
5013 int8_t tertiaryAddition = coll->tertiaryAddition; | |
5014 uint8_t tertiaryTop = coll->tertiaryTop; | |
5015 uint8_t tertiaryBottom = coll->tertiaryBottom; | |
5016 uint8_t tertiaryCommon = coll->tertiaryCommon; | |
5017 | |
5018 UBool notIsContinuation = FALSE; | |
5019 | |
5020 uint32_t count2 = 0, count3 = 0; | |
5021 uint8_t leadPrimary = 0; | |
5022 | |
5023 for(;;) { | |
5024 order = ucol_IGetNextCE(coll, &s, status); | |
5025 | |
5026 if(order == 0) { | |
5027 continue; | |
5028 } | |
5029 | |
5030 if(order == UCOL_NO_MORE_CES) { | |
5031 break; | |
5032 } | |
5033 | |
5034 notIsContinuation = !isContinuation(order); | |
5035 | |
5036 if(notIsContinuation) { | |
5037 tertiary = (uint8_t)((order & tertiaryMask)); | |
5038 } else { | |
5039 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); | |
5040 } | |
5041 | |
5042 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5043 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); | |
5044 primary1 = (uint8_t)(order >> 8); | |
5045 | |
5046 uint8_t originalPrimary1 = primary1; | |
5047 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { | |
5048 primary1 = coll->leadBytePermutationTable[primary1]; | |
5049 } | |
5050 | |
5051 /* Note: This code assumes that the table is well built i.e. not having
0 bytes where they are not supposed to be. */ | |
5052 /* Usually, we'll have non-zero primary1 & primary2, except in cases of
a-z and friends, when primary2 will */ | |
5053 /* be zero with non zero primary1. primary3 is different than 0 only for
long primaries - see above. */ | |
5054 /* regular and simple sortkey calc */ | |
5055 if(primary1 != UCOL_IGNORABLE) { | |
5056 if(notIsContinuation) { | |
5057 if(leadPrimary == primary1) { | |
5058 primaries.Append(primary2); | |
5059 } else { | |
5060 if(leadPrimary != 0) { | |
5061 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UN
SHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); | |
5062 } | |
5063 if(primary2 == UCOL_IGNORABLE) { | |
5064 /* one byter, not compressed */ | |
5065 primaries.Append(primary1); | |
5066 leadPrimary = 0; | |
5067 } else if(isCompressible(coll, originalPrimary1)) { | |
5068 /* compress */ | |
5069 primaries.Append(leadPrimary = primary1, primary2); | |
5070 } else { | |
5071 leadPrimary = 0; | |
5072 primaries.Append(primary1, primary2); | |
5073 } | |
5074 } | |
5075 } else { /* we are in continuation, so we're gonna add primary to th
e key don't care about compression */ | |
5076 if(primary2 == UCOL_IGNORABLE) { | |
5077 primaries.Append(primary1); | |
5078 } else { | |
5079 primaries.Append(primary1, primary2); | |
5080 } | |
5081 } | |
5082 } | |
5083 | |
5084 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ | |
5085 /* This is compression code. */ | |
5086 if (secondary == UCOL_COMMON2 && notIsContinuation) { | |
5087 ++count2; | |
5088 } else { | |
5089 if (count2 > 0) { | |
5090 if (secondary > UCOL_COMMON2) { // not necessary for 4th lev
el. | |
5091 while (count2 > UCOL_TOP_COUNT2) { | |
5092 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_C
OUNT2); | |
5093 count2 -= (uint32_t)UCOL_TOP_COUNT2; | |
5094 } | |
5095 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); | |
5096 } else { | |
5097 while (count2 > UCOL_BOT_COUNT2) { | |
5098 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_C
OUNT2); | |
5099 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5100 } | |
5101 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
5102 } | |
5103 count2 = 0; | |
5104 } | |
5105 secondaries.appendByte(secondary); | |
5106 } | |
5107 } | |
5108 | |
5109 if(notIsContinuation) { | |
5110 tertiary ^= caseSwitch; | |
5111 } | |
5112 | |
5113 if(tertiary > 0) { | |
5114 /* This is compression code. */ | |
5115 /* sequence size check is included in the if clause */ | |
5116 if (tertiary == tertiaryCommon && notIsContinuation) { | |
5117 ++count3; | |
5118 } else { | |
5119 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_N
ORMAL) { | |
5120 tertiary += tertiaryAddition; | |
5121 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_
COMMON3_UPPERFIRST) { | |
5122 tertiary -= tertiaryAddition; | |
5123 } | |
5124 if (count3 > 0) { | |
5125 if ((tertiary > tertiaryCommon)) { | |
5126 while (count3 > coll->tertiaryTopCount) { | |
5127 tertiaries.appendByte(tertiaryTop - coll->tertiaryTo
pCount); | |
5128 count3 -= (uint32_t)coll->tertiaryTopCount; | |
5129 } | |
5130 tertiaries.appendByte(tertiaryTop - (count3-1)); | |
5131 } else { | |
5132 while (count3 > coll->tertiaryBottomCount) { | |
5133 tertiaries.appendByte(tertiaryBottom + coll->tertiar
yBottomCount); | |
5134 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5135 } | |
5136 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
5137 } | |
5138 count3 = 0; | |
5139 } | |
5140 tertiaries.appendByte(tertiary); | |
5141 } | |
5142 } | |
5143 } | |
5144 | |
5145 UBool ok = TRUE; | |
5146 if(U_SUCCESS(*status)) { | |
5147 /* we have done all the CE's, now let's put them together to form a key
*/ | |
5148 if (count2 > 0) { | |
5149 while (count2 > UCOL_BOT_COUNT2) { | |
5150 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); | |
5151 count2 -= (uint32_t)UCOL_BOT_COUNT2; | |
5152 } | |
5153 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); | |
5154 } | |
5155 ok &= secondaries.isOk(); | |
5156 result.Append(UCOL_LEVELTERMINATOR); | |
5157 secondaries.appendTo(result); | |
5158 | |
5159 if (count3 > 0) { | |
5160 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { | |
5161 while (count3 >= coll->tertiaryTopCount) { | |
5162 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); | |
5163 count3 -= (uint32_t)coll->tertiaryTopCount; | |
5164 } | |
5165 tertiaries.appendByte(tertiaryTop - count3); | |
5166 } else { | |
5167 while (count3 > coll->tertiaryBottomCount) { | |
5168 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomC
ount); | |
5169 count3 -= (uint32_t)coll->tertiaryBottomCount; | |
5170 } | |
5171 tertiaries.appendByte(tertiaryBottom + (count3-1)); | |
5172 } | |
5173 } | |
5174 ok &= tertiaries.isOk(); | |
5175 result.Append(UCOL_LEVELTERMINATOR); | |
5176 tertiaries.appendTo(result); | |
5177 | |
5178 result.Append(0); | |
5179 } | |
5180 | |
5181 /* To avoid memory leak, free the offset buffer if necessary. */ | |
5182 ucol_freeOffsetBuffer(&s); | |
5183 | |
5184 ok &= result.IsOk(); | |
5185 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } | |
5186 } | |
5187 | |
5188 static inline | |
5189 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { | |
5190 UBool notIsContinuation = !isContinuation(CE); | |
5191 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); | |
5192 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) | |
5193 || (!notIsContinuation && *wasShifted))) | |
5194 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that pri
mary ignorables */ | |
5195 { | |
5196 // The stuff below should probably be in the sortkey code... maybe not..
. | |
5197 if(primary1 != 0) { /* if we were shifted and we got an ignorable code p
oint */ | |
5198 /* we should just completely ignore it */ | |
5199 *wasShifted = TRUE; | |
5200 //continue; | |
5201 } | |
5202 //*wasShifted = TRUE; | |
5203 return TRUE; | |
5204 } else { | |
5205 *wasShifted = FALSE; | |
5206 return FALSE; | |
5207 } | |
5208 } | |
5209 static inline | |
5210 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *des
t) { | |
5211 if(level < maxLevel) { | |
5212 dest[i++] = UCOL_LEVELTERMINATOR; | |
5213 } else { | |
5214 dest[i++] = 0; | |
5215 } | |
5216 } | |
5217 | |
/**
 * Level identifiers used while generating a partial sort key.
 * The values fit into three bits.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** Extra level, not used - but there are three bits to spend. */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** Level for the end of the sort key. Will just produce zeros. */
    UCOL_PSK_NULL       = 7,
    UCOL_PSK_LIMIT      = 8
};
5230 | |
/**
 * Layout of the collation state word, which is stored in the state[1] field.
 * For every piece of state there is a pair of constants: shift the state
 * right by the *_SHIFT value, then AND the result with the *_MASK value.
 */
enum {
    /** Level identifier; holds one of the UCOL_PSK level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. Can be
     * only 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level
     * is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** Was the last value shifted? Can be 0 or 1 (Boolean). */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many French bytes have already been written (two bits).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** Two-bit field for the bocsu byte count. */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /** Field for the count of consumed CEs; takes the remaining upper bits. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5256 | |
// Macro calculating the number of expansion CEs available in the collation
// iterator (s): the distance between its CEpos and toReturn members.
// The expansion is fully parenthesized so that the macro acts as a single
// operand inside larger expressions (e.g. after '!' or next to '*').
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5259 | |
5260 | |
5261 /** main sortkey part procedure. On the first call, | |
5262 * you should pass in a collator, an iterator, empty state | |
5263 * state[0] == state[1] == 0, a buffer to hold results | |
5264 * number of bytes you need and an error code pointer. | |
5265 * Make sure your buffer is big enough to hold the wanted | |
5266 * number of sortkey bytes. I don't check. | |
5267 * The only meaningful status you can get back is | |
5268 * U_BUFFER_OVERFLOW_ERROR, which basically means that you | |
5269 * have been dealt a raw deal and that you probably won't | |
5270 * be able to use partial sortkey generation for this | |
5271 * particular combination of string and collator. This | |
5272 * is highly unlikely, but you should still check the error code. | |
5273 * Any other status means that you're not in a sane situation | |
5274 * anymore. After the first call, preserve state values and | |
5275 * use them on subsequent calls to obtain more bytes of a sortkey. | |
5276 * Use until the number of bytes written is smaller than the requested | |
5277 * number of bytes. Generated sortkey is not compatible with the | |
5278 * one generated by ucol_getSortKey, as we don't do any compression. | |
5279 * However, levels are still terminated by a 1 (one) and the sortkey | |
5280 * is terminated by a 0 (zero). Identical level is the same as in the | |
5281 * regular sortkey - internal bocu-1 implementation is used. | |
5282 * For curious, although you cannot do much about this, here is | |
5283 * the structure of state words. | |
5284 * state[0] - iterator state. Depends on the iterator implementation, | |
5285 * but allows the iterator to continue where it stopped in | |
5286 * the last iteration. | |
5287 * state[1] - collation processing state. Here is the distribution | |
5288 * of the bits: | |
5289 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary | |
5290 * quaternary, quin (we don't use this one), identical and | |
5291 * null (producing only zeroes - first one to terminate the | |
5292 * sortkey and subsequent to fill the buffer). | |
5293 * 3 - byte count. Number of bytes written on the primary level. | |
5294 * 4 - was shifted. Whether the previous iteration finished in the | |
5295 * shifted state. | |
5296 * 5, 6 - French continuation bytes written. See the comment in the enum | |
5297 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on | |
5298 * the identical level. | |
5299 * 9..31 - CEs consumed. Number of getCE or next32 operations performed | |
5300 * since the last successful update of the iterator state. | |
5301 */ | |
5302 U_CAPI int32_t U_EXPORT2 | 209 U_CAPI int32_t U_EXPORT2 |
5303 ucol_nextSortKeyPart(const UCollator *coll, | 210 ucol_nextSortKeyPart(const UCollator *coll, |
5304 UCharIterator *iter, | 211 UCharIterator *iter, |
5305 uint32_t state[2], | 212 uint32_t state[2], |
5306 uint8_t *dest, int32_t count, | 213 uint8_t *dest, int32_t count, |
5307 UErrorCode *status) | 214 UErrorCode *status) |
5308 { | 215 { |
5309 /* error checking */ | 216 /* error checking */ |
5310 if(status==NULL || U_FAILURE(*status)) { | 217 if(status==NULL || U_FAILURE(*status)) { |
5311 return 0; | 218 return 0; |
5312 } | 219 } |
5313 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); | 220 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); |
5314 if( coll==NULL || iter==NULL || | |
5315 state==NULL || | |
5316 count<0 || (count>0 && dest==NULL) | |
5317 ) { | |
5318 *status=U_ILLEGAL_ARGUMENT_ERROR; | |
5319 UTRACE_EXIT_STATUS(status); | |
5320 return 0; | |
5321 } | |
5322 | |
5323 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=
%d", | 221 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=
%d", |
5324 coll, iter, state[0], state[1], dest, count); | 222 coll, iter, state[0], state[1], dest, count); |
5325 | 223 |
5326 if(count==0) { | 224 int32_t i = Collator::fromUCollator(coll)-> |
5327 /* nothing to do */ | 225 internalNextSortKeyPart(iter, state, dest, count, *status); |
5328 UTRACE_EXIT_VALUE(0); | 226 |
5329 return 0; | |
5330 } | |
5331 /** Setting up situation according to the state we got from the previous ite
ration */ | |
5332 // The state of the iterator from the previous invocation | |
5333 uint32_t iterState = state[0]; | |
5334 // Has the last iteration ended in the shifted state | |
5335 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_
SHIFTED_MASK)?TRUE:FALSE; | |
5336 // What is the current level of the sortkey? | |
5337 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; | |
5338 // Have we written only one byte from a two byte primary in the previous ite
ration? | |
5339 // Also on secondary level - have we finished with the French secondary? | |
5340 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_D
ONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; | |
5341 // number of bytes in the continuation buffer for French | |
5342 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USE
D_FRENCH_MASK; | |
5343 // Number of bytes already written from a bocsu sequence. Since | |
5344 // the longes bocsu sequence is 4 long, this can be up to 3. | |
5345 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK
_BOCSU_BYTES_MASK; | |
5346 // Number of elements that need to be consumed in this iteration because | |
5347 // the iterator returned UITER_NO_STATE at the end of the last iteration, | |
5348 // so we had to save the last valid state. | |
5349 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED
_CES_MASK; | |
5350 | |
5351 /** values that depend on the collator attributes */ | |
5352 // strength of the collator. | |
5353 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); | |
5354 // maximal level of the partial sortkey. Need to take whether case level is
done | |
5355 int32_t maxLevel = 0; | |
5356 if(strength < UCOL_TERTIARY) { | |
5357 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
5358 maxLevel = UCOL_PSK_CASE; | |
5359 } else { | |
5360 maxLevel = strength; | |
5361 } | |
5362 } else { | |
5363 if(strength == UCOL_TERTIARY) { | |
5364 maxLevel = UCOL_PSK_TERTIARY; | |
5365 } else if(strength == UCOL_QUATERNARY) { | |
5366 maxLevel = UCOL_PSK_QUATERNARY; | |
5367 } else { // identical | |
5368 maxLevel = UCOL_IDENTICAL; | |
5369 } | |
5370 } | |
5371 // value for the quaternary level if Hiragana is encountered. Used for JIS X
4061 collation | |
5372 uint8_t UCOL_HIRAGANA_QUAD = | |
5373 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON
)?0xFE:0xFF; | |
5374 // Boundary value that decides whether a CE is shifted or not | |
5375 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopV
alue<<16):0; | |
5376 // Are we doing French collation? | |
5377 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status)
== UCOL_ON); | |
5378 | |
5379 /** initializing the collation state */ | |
5380 UBool notIsContinuation = FALSE; | |
5381 uint32_t CE = UCOL_NO_MORE_CES; | |
5382 | |
5383 collIterate s; | |
5384 IInit_collIterate(coll, NULL, -1, &s, status); | |
5385 if(U_FAILURE(*status)) { | |
5386 UTRACE_EXIT_STATUS(*status); | |
5387 return 0; | |
5388 } | |
5389 s.iterator = iter; | |
5390 s.flags |= UCOL_USE_ITERATOR; | |
5391 // This variable tells us whether we have produced some other levels in this
iteration | |
5392 // before we moved to the identical level. In that case, we need to switch t
he | |
5393 // type of the iterator. | |
5394 UBool doingIdenticalFromStart = FALSE; | |
5395 // Normalizing iterator | |
5396 // The division for the array length may truncate the array size to | |
5397 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
5398 // for all platforms anyway. | |
5399 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
5400 UNormIterator *normIter = NULL; | |
5401 // If the normalization is turned on for the collator and we are below ident
ical level | |
5402 // we will use a FCD normalizing iterator | |
5403 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && le
vel < UCOL_PSK_IDENTICAL) { | |
5404 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
5405 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); | |
5406 s.flags &= ~UCOL_ITER_NORM; | |
5407 if(U_FAILURE(*status)) { | |
5408 UTRACE_EXIT_STATUS(*status); | |
5409 return 0; | |
5410 } | |
5411 } else if(level == UCOL_PSK_IDENTICAL) { | |
5412 // for identical level, we need a NFD iterator. We need to instantiate i
t here, since we | |
5413 // will be updating the state - and this cannot be done on an ordinary i
terator. | |
5414 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); | |
5415 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
5416 s.flags &= ~UCOL_ITER_NORM; | |
5417 if(U_FAILURE(*status)) { | |
5418 UTRACE_EXIT_STATUS(*status); | |
5419 return 0; | |
5420 } | |
5421 doingIdenticalFromStart = TRUE; | |
5422 } | |
5423 | |
5424 // This is the tentative new state of the iterator. The problem | |
5425 // is that the iterator might return an undefined state, in | |
5426 // which case we should save the last valid state and increase | |
5427 // the iterator skip value. | |
5428 uint32_t newState = 0; | |
5429 | |
5430 // First, we set the iterator to the last valid position | |
5431 // from the last iteration. This was saved in state[0]. | |
5432 if(iterState == 0) { | |
5433 /* initial state */ | |
5434 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone)
{ | |
5435 s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
5436 } else { | |
5437 s.iterator->move(s.iterator, 0, UITER_START); | |
5438 } | |
5439 } else { | |
5440 /* reset to previous state */ | |
5441 s.iterator->setState(s.iterator, iterState, status); | |
5442 if(U_FAILURE(*status)) { | |
5443 UTRACE_EXIT_STATUS(*status); | |
5444 return 0; | |
5445 } | |
5446 } | |
5447 | |
5448 | |
5449 | |
5450 // This variable tells us whether we can attempt to update the state | |
5451 // of iterator. Situations where we don't want to update iterator state | |
5452 // are the existence of expansion CEs that are not yet processed, and | |
5453 // finishing the case level without enough space in the buffer to insert | |
5454 // a level terminator. | |
5455 UBool canUpdateState = TRUE; | |
5456 | |
5457 // Consume all the CEs that were consumed at the end of the previous | |
5458 // iteration without updating the iterator state. On identical level, | |
5459 // consume the code points. | |
5460 int32_t counter = cces; | |
5461 if(level < UCOL_PSK_IDENTICAL) { | |
5462 while(counter-->0) { | |
5463 // If we're doing French and we are on the secondary level, | |
5464 // we go backwards. | |
5465 if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
5466 CE = ucol_IGetPrevCE(coll, &s, status); | |
5467 } else { | |
5468 CE = ucol_IGetNextCE(coll, &s, status); | |
5469 } | |
5470 if(CE==UCOL_NO_MORE_CES) { | |
5471 /* should not happen */ | |
5472 *status=U_INTERNAL_PROGRAM_ERROR; | |
5473 UTRACE_EXIT_STATUS(*status); | |
5474 return 0; | |
5475 } | |
5476 if(uprv_numAvailableExpCEs(s)) { | |
5477 canUpdateState = FALSE; | |
5478 } | |
5479 } | |
5480 } else { | |
5481 while(counter-->0) { | |
5482 uiter_next32(s.iterator); | |
5483 } | |
5484 } | |
5485 | |
5486 // French secondary needs to know whether the iterator state of zero came fr
om previous level OR | |
5487 // from a new invocation... | |
5488 UBool wasDoingPrimary = FALSE; | |
5489 // destination buffer byte counter. When this guy | |
5490 // gets to count, we're done with the iteration | |
5491 int32_t i = 0; | |
5492 // used to count the zero bytes written after we | |
5493 // have finished with the sort key | |
5494 int32_t j = 0; | |
5495 | |
5496 | |
5497 // Hm.... I think we're ready to plunge in. Basic story is as following: | |
5498 // we have a fall through case based on level. This is used for initial | |
5499 // positioning on iteration start. Every level processor contains a | |
5500 // for(;;) which will be broken when we exhaust all the CEs. Other | |
5501 // way to exit is a goto saveState, which happens when we have filled | |
5502 // out our buffer. | |
5503 switch(level) { | |
5504 case UCOL_PSK_PRIMARY: | |
5505 wasDoingPrimary = TRUE; | |
5506 for(;;) { | |
5507 if(i==count) { | |
5508 goto saveState; | |
5509 } | |
5510 // We should save the state only if we | |
5511 // are sure that we are done with the | |
5512 // previous iterator state | |
5513 if(canUpdateState && byteCountOrFrenchDone == 0) { | |
5514 newState = s.iterator->getState(s.iterator); | |
5515 if(newState != UITER_NO_STATE) { | |
5516 iterState = newState; | |
5517 cces = 0; | |
5518 } | |
5519 } | |
5520 CE = ucol_IGetNextCE(coll, &s, status); | |
5521 cces++; | |
5522 if(CE==UCOL_NO_MORE_CES) { | |
5523 // Add the level separator | |
5524 terminatePSKLevel(level, maxLevel, i, dest); | |
5525 byteCountOrFrenchDone=0; | |
5526 // Restart the iteration an move to the | |
5527 // second level | |
5528 s.iterator->move(s.iterator, 0, UITER_START); | |
5529 cces = 0; | |
5530 level = UCOL_PSK_SECONDARY; | |
5531 break; | |
5532 } | |
5533 if(!isContinuation(CE)){ | |
5534 if(coll->leadBytePermutationTable != NULL){ | |
5535 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE &
0x00FFFFFF); | |
5536 } | |
5537 } | |
5538 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
5539 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ | |
5540 if(CE != 0) { | |
5541 if(byteCountOrFrenchDone == 0) { | |
5542 // get the second byte of primary | |
5543 dest[i++]=(uint8_t)(CE >> 8); | |
5544 } else { | |
5545 byteCountOrFrenchDone = 0; | |
5546 } | |
5547 if((CE &=0xff)!=0) { | |
5548 if(i==count) { | |
5549 /* overflow */ | |
5550 byteCountOrFrenchDone = 1; | |
5551 cces--; | |
5552 goto saveState; | |
5553 } | |
5554 dest[i++]=(uint8_t)CE; | |
5555 } | |
5556 } | |
5557 } | |
5558 if(uprv_numAvailableExpCEs(s)) { | |
5559 canUpdateState = FALSE; | |
5560 } else { | |
5561 canUpdateState = TRUE; | |
5562 } | |
5563 } | |
5564 /* fall through to next level */ | |
5565 case UCOL_PSK_SECONDARY: | |
5566 if(strength >= UCOL_SECONDARY) { | |
5567 if(!doingFrench) { | |
5568 for(;;) { | |
5569 if(i == count) { | |
5570 goto saveState; | |
5571 } | |
5572 // We should save the state only if we | |
5573 // are sure that we are done with the | |
5574 // previous iterator state | |
5575 if(canUpdateState) { | |
5576 newState = s.iterator->getState(s.iterator); | |
5577 if(newState != UITER_NO_STATE) { | |
5578 iterState = newState; | |
5579 cces = 0; | |
5580 } | |
5581 } | |
5582 CE = ucol_IGetNextCE(coll, &s, status); | |
5583 cces++; | |
5584 if(CE==UCOL_NO_MORE_CES) { | |
5585 // Add the level separator | |
5586 terminatePSKLevel(level, maxLevel, i, dest); | |
5587 byteCountOrFrenchDone = 0; | |
5588 // Restart the iteration an move to the | |
5589 // second level | |
5590 s.iterator->move(s.iterator, 0, UITER_START); | |
5591 cces = 0; | |
5592 level = UCOL_PSK_CASE; | |
5593 break; | |
5594 } | |
5595 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
5596 CE >>= 8; /* get secondary */ | |
5597 if(CE != 0) { | |
5598 dest[i++]=(uint8_t)CE; | |
5599 } | |
5600 } | |
5601 if(uprv_numAvailableExpCEs(s)) { | |
5602 canUpdateState = FALSE; | |
5603 } else { | |
5604 canUpdateState = TRUE; | |
5605 } | |
5606 } | |
5607 } else { // French secondary processing | |
5608 uint8_t frenchBuff[UCOL_MAX_BUFFER]; | |
5609 int32_t frenchIndex = 0; | |
5610 // Here we are going backwards. | |
5611 // If the iterator is at the beggining, it should be | |
5612 // moved to end. | |
5613 if(wasDoingPrimary) { | |
5614 s.iterator->move(s.iterator, 0, UITER_LIMIT); | |
5615 cces = 0; | |
5616 } | |
5617 for(;;) { | |
5618 if(i == count) { | |
5619 goto saveState; | |
5620 } | |
5621 if(canUpdateState) { | |
5622 newState = s.iterator->getState(s.iterator); | |
5623 if(newState != UITER_NO_STATE) { | |
5624 iterState = newState; | |
5625 cces = 0; | |
5626 } | |
5627 } | |
5628 CE = ucol_IGetPrevCE(coll, &s, status); | |
5629 cces++; | |
5630 if(CE==UCOL_NO_MORE_CES) { | |
5631 // Add the level separator | |
5632 terminatePSKLevel(level, maxLevel, i, dest); | |
5633 byteCountOrFrenchDone = 0; | |
5634 // Restart the iteration an move to the next level | |
5635 s.iterator->move(s.iterator, 0, UITER_START); | |
5636 level = UCOL_PSK_CASE; | |
5637 break; | |
5638 } | |
5639 if(isContinuation(CE)) { // if it's a continuation, we want
to save it and | |
5640 // reverse when we get a first non-continuation CE. | |
5641 CE >>= 8; | |
5642 frenchBuff[frenchIndex++] = (uint8_t)CE; | |
5643 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
5644 CE >>= 8; /* get secondary */ | |
5645 if(!frenchIndex) { | |
5646 if(CE != 0) { | |
5647 dest[i++]=(uint8_t)CE; | |
5648 } | |
5649 } else { | |
5650 frenchBuff[frenchIndex++] = (uint8_t)CE; | |
5651 frenchIndex -= usedFrench; | |
5652 usedFrench = 0; | |
5653 while(i < count && frenchIndex) { | |
5654 dest[i++] = frenchBuff[--frenchIndex]; | |
5655 usedFrench++; | |
5656 } | |
5657 } | |
5658 } | |
5659 if(uprv_numAvailableExpCEs(s)) { | |
5660 canUpdateState = FALSE; | |
5661 } else { | |
5662 canUpdateState = TRUE; | |
5663 } | |
5664 } | |
5665 } | |
5666 } else { | |
5667 level = UCOL_PSK_CASE; | |
5668 } | |
5669 /* fall through to next level */ | |
5670 case UCOL_PSK_CASE: | |
5671 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { | |
5672 uint32_t caseShift = UCOL_CASE_SHIFT_START; | |
5673 uint8_t caseByte = UCOL_CASE_BYTE_START; | |
5674 uint8_t caseBits = 0; | |
5675 | |
5676 for(;;) { | |
5677 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); | |
5678 if(i == count) { | |
5679 goto saveState; | |
5680 } | |
5681 // We should save the state only if we | |
5682 // are sure that we are done with the | |
5683 // previous iterator state | |
5684 if(canUpdateState) { | |
5685 newState = s.iterator->getState(s.iterator); | |
5686 if(newState != UITER_NO_STATE) { | |
5687 iterState = newState; | |
5688 cces = 0; | |
5689 } | |
5690 } | |
5691 CE = ucol_IGetNextCE(coll, &s, status); | |
5692 cces++; | |
5693 if(CE==UCOL_NO_MORE_CES) { | |
5694 // On the case level we might have an unfinished | |
5695 // case byte. Add one if it's started. | |
5696 if(caseShift != UCOL_CASE_SHIFT_START) { | |
5697 dest[i++] = caseByte; | |
5698 } | |
5699 cces = 0; | |
5700 // We have finished processing CEs on this level. | |
5701 // However, we don't know if we have enough space | |
5702 // to add a case level terminator. | |
5703 if(i < count) { | |
5704 // Add the level separator | |
5705 terminatePSKLevel(level, maxLevel, i, dest); | |
5706 // Restart the iteration and move to the | |
5707 // next level | |
5708 s.iterator->move(s.iterator, 0, UITER_START); | |
5709 level = UCOL_PSK_TERTIARY; | |
5710 } else { | |
5711 canUpdateState = FALSE; | |
5712 } | |
5713 break; | |
5714 } | |
5715 | |
5716 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
5717 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || s
trength > UCOL_PRIMARY)) { | |
5718 // do the case level if we need to do it. We don't want
to calculate | |
5719 // case level for primary ignorables if we have only pri
mary strength and case level | |
5720 // otherwise we would break well formedness of CEs | |
5721 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
5722 caseBits = (uint8_t)(CE & 0xC0); | |
5723 // this copies the case level logic from the | |
5724 // sort key generation code | |
5725 if(CE != 0) { | |
5726 if (caseShift == 0) { | |
5727 dest[i++] = caseByte; | |
5728 caseShift = UCOL_CASE_SHIFT_START; | |
5729 caseByte = UCOL_CASE_BYTE_START; | |
5730 } | |
5731 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
5732 if((caseBits & 0xC0) == 0) { | |
5733 caseByte |= 1 << (--caseShift); | |
5734 } else { | |
5735 caseByte |= 0 << (--caseShift); | |
5736 /* second bit */ | |
5737 if(caseShift == 0) { | |
5738 dest[i++] = caseByte; | |
5739 caseShift = UCOL_CASE_SHIFT_START; | |
5740 caseByte = UCOL_CASE_BYTE_START; | |
5741 } | |
5742 caseByte |= ((caseBits>>6)&1) << (--caseShif
t); | |
5743 } | |
5744 } else { | |
5745 if((caseBits & 0xC0) == 0) { | |
5746 caseByte |= 0 << (--caseShift); | |
5747 } else { | |
5748 caseByte |= 1 << (--caseShift); | |
5749 /* second bit */ | |
5750 if(caseShift == 0) { | |
5751 dest[i++] = caseByte; | |
5752 caseShift = UCOL_CASE_SHIFT_START; | |
5753 caseByte = UCOL_CASE_BYTE_START; | |
5754 } | |
5755 caseByte |= ((caseBits>>7)&1) << (--caseShif
t); | |
5756 } | |
5757 } | |
5758 } | |
5759 | |
5760 } | |
5761 } | |
5762 // Not sure this is correct for the case level - revisit | |
5763 if(uprv_numAvailableExpCEs(s)) { | |
5764 canUpdateState = FALSE; | |
5765 } else { | |
5766 canUpdateState = TRUE; | |
5767 } | |
5768 } | |
5769 } else { | |
5770 level = UCOL_PSK_TERTIARY; | |
5771 } | |
5772 /* fall through to next level */ | |
5773 case UCOL_PSK_TERTIARY: | |
5774 if(strength >= UCOL_TERTIARY) { | |
5775 for(;;) { | |
5776 if(i == count) { | |
5777 goto saveState; | |
5778 } | |
5779 // We should save the state only if we | |
5780 // are sure that we are done with the | |
5781 // previous iterator state | |
5782 if(canUpdateState) { | |
5783 newState = s.iterator->getState(s.iterator); | |
5784 if(newState != UITER_NO_STATE) { | |
5785 iterState = newState; | |
5786 cces = 0; | |
5787 } | |
5788 } | |
5789 CE = ucol_IGetNextCE(coll, &s, status); | |
5790 cces++; | |
5791 if(CE==UCOL_NO_MORE_CES) { | |
5792 // Add the level separator | |
5793 terminatePSKLevel(level, maxLevel, i, dest); | |
5794 byteCountOrFrenchDone = 0; | |
5795 // Restart the iteration an move to the | |
5796 // second level | |
5797 s.iterator->move(s.iterator, 0, UITER_START); | |
5798 cces = 0; | |
5799 level = UCOL_PSK_QUATERNARY; | |
5800 break; | |
5801 } | |
5802 if(!isShiftedCE(CE, LVT, &wasShifted)) { | |
5803 notIsContinuation = !isContinuation(CE); | |
5804 | |
5805 if(notIsContinuation) { | |
5806 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); | |
5807 CE ^= coll->caseSwitch; | |
5808 CE &= coll->tertiaryMask; | |
5809 } else { | |
5810 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
5811 } | |
5812 | |
5813 if(CE != 0) { | |
5814 dest[i++]=(uint8_t)CE; | |
5815 } | |
5816 } | |
5817 if(uprv_numAvailableExpCEs(s)) { | |
5818 canUpdateState = FALSE; | |
5819 } else { | |
5820 canUpdateState = TRUE; | |
5821 } | |
5822 } | |
5823 } else { | |
5824 // if we're not doing tertiary | |
5825 // skip to the end | |
5826 level = UCOL_PSK_NULL; | |
5827 } | |
5828 /* fall through to next level */ | |
5829 case UCOL_PSK_QUATERNARY: | |
5830 if(strength >= UCOL_QUATERNARY) { | |
5831 for(;;) { | |
5832 if(i == count) { | |
5833 goto saveState; | |
5834 } | |
5835 // We should save the state only if we | |
5836 // are sure that we are done with the | |
5837 // previous iterator state | |
5838 if(canUpdateState) { | |
5839 newState = s.iterator->getState(s.iterator); | |
5840 if(newState != UITER_NO_STATE) { | |
5841 iterState = newState; | |
5842 cces = 0; | |
5843 } | |
5844 } | |
5845 CE = ucol_IGetNextCE(coll, &s, status); | |
5846 cces++; | |
5847 if(CE==UCOL_NO_MORE_CES) { | |
5848 // Add the level separator | |
5849 terminatePSKLevel(level, maxLevel, i, dest); | |
5850 //dest[i++] = UCOL_LEVELTERMINATOR; | |
5851 byteCountOrFrenchDone = 0; | |
5852 // Restart the iteration an move to the | |
5853 // second level | |
5854 s.iterator->move(s.iterator, 0, UITER_START); | |
5855 cces = 0; | |
5856 level = UCOL_PSK_QUIN; | |
5857 break; | |
5858 } | |
5859 if(CE==0) | |
5860 continue; | |
5861 if(isShiftedCE(CE, LVT, &wasShifted)) { | |
5862 CE >>= 16; /* get primary */ | |
5863 if(CE != 0) { | |
5864 if(byteCountOrFrenchDone == 0) { | |
5865 dest[i++]=(uint8_t)(CE >> 8); | |
5866 } else { | |
5867 byteCountOrFrenchDone = 0; | |
5868 } | |
5869 if((CE &=0xff)!=0) { | |
5870 if(i==count) { | |
5871 /* overflow */ | |
5872 byteCountOrFrenchDone = 1; | |
5873 goto saveState; | |
5874 } | |
5875 dest[i++]=(uint8_t)CE; | |
5876 } | |
5877 } | |
5878 } else { | |
5879 notIsContinuation = !isContinuation(CE); | |
5880 if(notIsContinuation) { | |
5881 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana a
nd we need to note it | |
5882 dest[i++] = UCOL_HIRAGANA_QUAD; | |
5883 } else { | |
5884 dest[i++] = 0xFF; | |
5885 } | |
5886 } | |
5887 } | |
5888 if(uprv_numAvailableExpCEs(s)) { | |
5889 canUpdateState = FALSE; | |
5890 } else { | |
5891 canUpdateState = TRUE; | |
5892 } | |
5893 } | |
5894 } else { | |
5895 // if we're not doing quaternary | |
5896 // skip to the end | |
5897 level = UCOL_PSK_NULL; | |
5898 } | |
5899 /* fall through to next level */ | |
5900 case UCOL_PSK_QUIN: | |
5901 level = UCOL_PSK_IDENTICAL; | |
5902 /* fall through to next level */ | |
5903 case UCOL_PSK_IDENTICAL: | |
5904 if(strength >= UCOL_IDENTICAL) { | |
5905 UChar32 first, second; | |
5906 int32_t bocsuBytesWritten = 0; | |
5907 // We always need to do identical on | |
5908 // the NFD form of the string. | |
5909 if(normIter == NULL) { | |
5910 // we arrived from the level below and | |
5911 // normalization was not turned on. | |
5912 // therefore, we need to make a fresh NFD iterator | |
5913 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter),
status); | |
5914 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
5915 } else if(!doingIdenticalFromStart) { | |
5916 // there is an iterator, but we did some other levels. | |
5917 // therefore, we have a FCD iterator - need to make | |
5918 // a NFD one. | |
5919 // normIter being at the beginning does not guarantee | |
5920 // that the underlying iterator is at the beginning | |
5921 iter->move(iter, 0, UITER_START); | |
5922 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); | |
5923 } | |
5924 // At this point we have a NFD iterator that is positioned | |
5925 // in the right place | |
5926 if(U_FAILURE(*status)) { | |
5927 UTRACE_EXIT_STATUS(*status); | |
5928 return 0; | |
5929 } | |
5930 first = uiter_previous32(s.iterator); | |
5931 // maybe we're at the start of the string | |
5932 if(first == U_SENTINEL) { | |
5933 first = 0; | |
5934 } else { | |
5935 uiter_next32(s.iterator); | |
5936 } | |
5937 | |
5938 j = 0; | |
5939 for(;;) { | |
5940 if(i == count) { | |
5941 if(j+1 < bocsuBytesWritten) { | |
5942 bocsuBytesUsed = j+1; | |
5943 } | |
5944 goto saveState; | |
5945 } | |
5946 | |
5947 // On identical level, we will always save | |
5948 // the state if we reach this point, since | |
5949 // we don't depend on getNextCE for content | |
5950 // all the content is in our buffer and we | |
5951 // already either stored the full buffer OR | |
5952 // otherwise we won't arrive here. | |
5953 newState = s.iterator->getState(s.iterator); | |
5954 if(newState != UITER_NO_STATE) { | |
5955 iterState = newState; | |
5956 cces = 0; | |
5957 } | |
5958 | |
5959 uint8_t buff[4]; | |
5960 second = uiter_next32(s.iterator); | |
5961 cces++; | |
5962 | |
5963 // end condition for identical level | |
5964 if(second == U_SENTINEL) { | |
5965 terminatePSKLevel(level, maxLevel, i, dest); | |
5966 level = UCOL_PSK_NULL; | |
5967 break; | |
5968 } | |
5969 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, seco
nd, buff); | |
5970 first = second; | |
5971 | |
5972 j = 0; | |
5973 if(bocsuBytesUsed != 0) { | |
5974 while(bocsuBytesUsed-->0) { | |
5975 j++; | |
5976 } | |
5977 } | |
5978 | |
5979 while(i < count && j < bocsuBytesWritten) { | |
5980 dest[i++] = buff[j++]; | |
5981 } | |
5982 } | |
5983 | |
5984 } else { | |
5985 level = UCOL_PSK_NULL; | |
5986 } | |
5987 /* fall through to next level */ | |
5988 case UCOL_PSK_NULL: | |
5989 j = i; | |
5990 while(j<count) { | |
5991 dest[j++]=0; | |
5992 } | |
5993 break; | |
5994 default: | |
5995 *status = U_INTERNAL_PROGRAM_ERROR; | |
5996 UTRACE_EXIT_STATUS(*status); | |
5997 return 0; | |
5998 } | |
5999 | |
6000 saveState: | |
6001 // Now we need to return stuff. First we want to see whether we have | |
6002 // done everything for the current state of iterator. | |
6003 if(byteCountOrFrenchDone | |
6004 || canUpdateState == FALSE | |
6005 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) | |
6006 { | |
6007 // Any of above mean that the previous transaction | |
6008 // wasn't finished and that we should store the | |
6009 // previous iterator state. | |
6010 state[0] = iterState; | |
6011 } else { | |
6012 // The transaction is complete. We will continue in the next iteration. | |
6013 state[0] = s.iterator->getState(s.iterator); | |
6014 cces = 0; | |
6015 } | |
6016 // Store the number of bocsu bytes written. | |
6017 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { | |
6018 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6019 } | |
6020 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BY
TES_SHIFT; | |
6021 | |
6022 // Next we put in the level of comparison | |
6023 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); | |
6024 | |
6025 // If we are doing French, we need to store whether we have just finished th
e French level | |
6026 if(level == UCOL_PSK_SECONDARY && doingFrench) { | |
6027 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_D
ONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
6028 } else { | |
6029 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE
_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); | |
6030 } | |
6031 | |
6032 // Was the latest CE shifted | |
6033 if(wasShifted) { | |
6034 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; | |
6035 } | |
6036 // Check for cces overflow | |
6037 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { | |
6038 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6039 } | |
6040 // Store cces | |
6041 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SH
IFT); | |
6042 | |
6043 // Check for French overflow | |
6044 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { | |
6045 *status = U_INDEX_OUTOFBOUNDS_ERROR; | |
6046 } | |
6047 // Store number of bytes written in the French secondary continuation sequen
ce | |
6048 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENC
H_SHIFT); | |
6049 | |
6050 | |
6051 // If we have used normalizing iterator, get rid of it | |
6052 if(normIter != NULL) { | |
6053 unorm_closeIter(normIter); | |
6054 } | |
6055 | |
6056 /* To avoid memory leak, free the offset buffer if necessary. */ | |
6057 ucol_freeOffsetBuffer(&s); | |
6058 | |
6059 // Return number of meaningful sortkey bytes. | 227 // Return number of meaningful sortkey bytes. |
6060 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", | 228 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", |
6061 dest,i, state[0], state[1]); | 229 dest,i, state[0], state[1]); |
6062 UTRACE_EXIT_VALUE(i); | 230 UTRACE_EXIT_VALUE_STATUS(i, *status); |
6063 return i; | 231 return i; |
6064 } | 232 } |
6065 | 233 |
6066 /** | 234 /** |
6067 * Produce a bound for a given sortkey and a number of levels. | 235 * Produce a bound for a given sortkey and a number of levels. |
6068 */ | 236 */ |
6069 U_CAPI int32_t U_EXPORT2 | 237 U_CAPI int32_t U_EXPORT2 |
6070 ucol_getBound(const uint8_t *source, | 238 ucol_getBound(const uint8_t *source, |
6071 int32_t sourceLength, | 239 int32_t sourceLength, |
6072 UColBoundMode boundType, | 240 UColBoundMode boundType, |
6073 uint32_t noOfLevels, | 241 uint32_t noOfLevels, |
6074 uint8_t *result, | 242 uint8_t *result, |
6075 int32_t resultLength, | 243 int32_t resultLength, |
6076 UErrorCode *status) | 244 UErrorCode *status) |
6077 { | 245 { |
6078 // consistency checks | 246 // consistency checks |
6079 if(status == NULL || U_FAILURE(*status)) { | 247 if(status == NULL || U_FAILURE(*status)) { |
6080 return 0; | 248 return 0; |
6081 } | 249 } |
6082 if(source == NULL) { | 250 if(source == NULL) { |
6083 *status = U_ILLEGAL_ARGUMENT_ERROR; | 251 *status = U_ILLEGAL_ARGUMENT_ERROR; |
6084 return 0; | 252 return 0; |
6085 } | 253 } |
6086 | 254 |
6087 int32_t sourceIndex = 0; | 255 int32_t sourceIndex = 0; |
6088 // Scan the string until we skip enough of the key OR reach the end of the k
ey | 256 // Scan the string until we skip enough of the key OR reach the end of the k
ey |
6089 do { | 257 do { |
6090 sourceIndex++; | 258 sourceIndex++; |
6091 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { | 259 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) { |
6092 noOfLevels--; | 260 noOfLevels--; |
6093 } | 261 } |
6094 } while (noOfLevels > 0 | 262 } while (noOfLevels > 0 |
6095 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); | 263 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); |
6096 | 264 |
6097 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) | 265 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) |
6098 && noOfLevels > 0) { | 266 && noOfLevels > 0) { |
6099 *status = U_SORT_KEY_TOO_SHORT_WARNING; | 267 *status = U_SORT_KEY_TOO_SHORT_WARNING; |
6100 } | 268 } |
6101 | 269 |
(...skipping 22 matching lines...) Expand all Loading... |
6124 return 0; | 292 return 0; |
6125 } | 293 } |
6126 result[sourceIndex++] = 0; | 294 result[sourceIndex++] = 0; |
6127 | 295 |
6128 return sourceIndex; | 296 return sourceIndex; |
6129 } else { | 297 } else { |
6130 return sourceIndex+boundType+1; | 298 return sourceIndex+boundType+1; |
6131 } | 299 } |
6132 } | 300 } |
6133 | 301 |
6134 /****************************************************************************/ | 302 U_CAPI void U_EXPORT2 |
6135 /* Following are the functions that deal with the properties of a collator */ | 303 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCo
de) { |
6136 /* there are new APIs and some compatibility APIs */ | 304 if(U_FAILURE(*pErrorCode)) { return; } |
6137 /****************************************************************************/ | 305 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode); |
6138 | 306 } |
6139 static inline void | 307 |
6140 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, | 308 U_CAPI UColReorderCode U_EXPORT2 |
6141 int32_t *primShift, int32_t *secShift, int32_t *terShift) | 309 ucol_getMaxVariable(const UCollator *coll) { |
6142 { | 310 return Collator::fromUCollator(coll)->getMaxVariable(); |
6143 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; | |
6144 UBool reverseSecondary = FALSE; | |
6145 UBool continuation = isContinuation(CE); | |
6146 if(!continuation) { | |
6147 tertiary = (uint8_t)((CE & coll->tertiaryMask)); | |
6148 tertiary ^= coll->caseSwitch; | |
6149 reverseSecondary = TRUE; | |
6150 } else { | |
6151 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); | |
6152 tertiary &= UCOL_REMOVE_CASE; | |
6153 reverseSecondary = FALSE; | |
6154 } | |
6155 | |
6156 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
6157 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); | |
6158 primary1 = (uint8_t)(CE >> 8); | |
6159 | |
6160 if(primary1 != 0) { | |
6161 if (coll->leadBytePermutationTable != NULL && !continuation) { | |
6162 primary1 = coll->leadBytePermutationTable[primary1]; | |
6163 } | |
6164 | |
6165 coll->latinOneCEs[ch] |= (primary1 << *primShift); | |
6166 *primShift -= 8; | |
6167 } | |
6168 if(primary2 != 0) { | |
6169 if(*primShift < 0) { | |
6170 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
6171 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
6172 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; | |
6173 return; | |
6174 } | |
6175 coll->latinOneCEs[ch] |= (primary2 << *primShift); | |
6176 *primShift -= 8; | |
6177 } | |
6178 if(secondary != 0) { | |
6179 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse se
condary | |
6180 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space fo
r secondary | |
6181 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); | |
6182 } else { // normal case | |
6183 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secSh
ift); | |
6184 } | |
6185 *secShift -= 8; | |
6186 } | |
6187 if(tertiary != 0) { | |
6188 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift
); | |
6189 *terShift -= 8; | |
6190 } | |
6191 } | |
6192 | |
6193 static inline UBool | |
6194 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { | |
6195 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); | |
6196 if(newTable == NULL) { | |
6197 *status = U_MEMORY_ALLOCATION_ERROR; | |
6198 coll->latinOneFailed = TRUE; | |
6199 return FALSE; | |
6200 } | |
6201 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTable
Len)*sizeof(uint32_t); | |
6202 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); | |
6203 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); | |
6204 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToC
opy); | |
6205 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, siz
eToCopy); | |
6206 coll->latinOneTableLen = size; | |
6207 uprv_free(coll->latinOneCEs); | |
6208 coll->latinOneCEs = newTable; | |
6209 return TRUE; | |
6210 } | |
6211 | |
6212 static UBool | |
6213 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { | |
6214 UBool result = TRUE; | |
6215 if(coll->latinOneCEs == NULL) { | |
6216 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINO
NETABLELEN*3); | |
6217 if(coll->latinOneCEs == NULL) { | |
6218 *status = U_MEMORY_ALLOCATION_ERROR; | |
6219 return FALSE; | |
6220 } | |
6221 coll->latinOneTableLen = UCOL_LATINONETABLELEN; | |
6222 } | |
6223 UChar ch = 0; | |
6224 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); | |
6225 // Check for null pointer | |
6226 if (U_FAILURE(*status)) { | |
6227 ucol_closeElements(it); | |
6228 return FALSE; | |
6229 } | |
6230 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3)
; | |
6231 | |
6232 int32_t primShift = 24, secShift = 24, terShift = 24; | |
6233 uint32_t CE = 0; | |
6234 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; | |
6235 | |
6236 // TODO: make safe if you get more than you wanted... | |
6237 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { | |
6238 primShift = 24; secShift = 24; terShift = 24; | |
6239 if(ch < 0x100) { | |
6240 CE = coll->latinOneMapping[ch]; | |
6241 } else { | |
6242 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
6243 if(CE == UCOL_NOT_FOUND && coll->UCA) { | |
6244 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
6245 } | |
6246 } | |
6247 if(CE < UCOL_NOT_FOUND) { | |
6248 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift
); | |
6249 } else { | |
6250 switch (getCETag(CE)) { | |
6251 case EXPANSION_TAG: | |
6252 case DIGIT_TAG: | |
6253 ucol_setText(it, &ch, 1, status); | |
6254 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { | |
6255 if(primShift < 0 || secShift < 0 || terShift < 0) { | |
6256 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; | |
6257 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL
_OUT_CE; | |
6258 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BA
IL_OUT_CE; | |
6259 break; | |
6260 } | |
6261 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &
terShift); | |
6262 } | |
6263 break; | |
6264 case CONTRACTION_TAG: | |
6265 // here is the trick | |
6266 // F2 is contraction. We do something very similar to contractio
ns | |
6267 // but have two indices, one in the real contraction table and t
he | |
6268 // other to where we stuffed things. This hopes that we don't ha
ve | |
6269 // many contractions (this should work for latin-1 tables). | |
6270 { | |
6271 if((CE & 0x00FFF000) != 0) { | |
6272 *status = U_UNSUPPORTED_ERROR; | |
6273 goto cleanup_after_failure; | |
6274 } | |
6275 | |
6276 const UChar *UCharOffset = (UChar *)coll->image+getContractO
ffset(CE); | |
6277 | |
6278 CE |= (contractionOffset & 0xFFF) << 12; // insert the offse
t in latin-1 table | |
6279 | |
6280 coll->latinOneCEs[ch] = CE; | |
6281 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; | |
6282 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; | |
6283 | |
6284 // We're going to jump into contraction table, pick the elem
ents | |
6285 // and use them | |
6286 do { | |
6287 CE = *(coll->contractionCEs + | |
6288 (UCharOffset - coll->contractionIndex)); | |
6289 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG)
{ | |
6290 uint32_t size; | |
6291 uint32_t i; /* general counter */ | |
6292 uint32_t *CEOffset = (uint32_t *)coll->image+getExpa
nsionOffset(CE); /* find the offset to expansion table */ | |
6293 size = getExpansionCount(CE); | |
6294 //CE = *CEOffset++; | |
6295 if(size != 0) { /* if there are less than 16 element
s in expansion, we don't terminate */ | |
6296 for(i = 0; i<size; i++) { | |
6297 if(primShift < 0 || secShift < 0 || terShift
< 0) { | |
6298 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; | |
6299 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
6300 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
6301 break; | |
6302 } | |
6303 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
6304 } | |
6305 } else { /* else, we do */ | |
6306 while(*CEOffset != 0) { | |
6307 if(primShift < 0 || secShift < 0 || terShift
< 0) { | |
6308 coll->latinOneCEs[(UChar)contractionOffs
et] = UCOL_BAIL_OUT_CE; | |
6309 coll->latinOneCEs[coll->latinOneTableLen
+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
6310 coll->latinOneCEs[2*coll->latinOneTableL
en+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; | |
6311 break; | |
6312 } | |
6313 ucol_addLatinOneEntry(coll, (UChar)contracti
onOffset, *CEOffset++, &primShift, &secShift, &terShift); | |
6314 } | |
6315 } | |
6316 contractionOffset++; | |
6317 } else if(CE < UCOL_NOT_FOUND) { | |
6318 ucol_addLatinOneEntry(coll, (UChar)contractionOffset
++, CE, &primShift, &secShift, &terShift); | |
6319 } else { | |
6320 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_B
AIL_OUT_CE; | |
6321 coll->latinOneCEs[coll->latinOneTableLen+(UChar)cont
ractionOffset] = UCOL_BAIL_OUT_CE; | |
6322 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)co
ntractionOffset] = UCOL_BAIL_OUT_CE; | |
6323 contractionOffset++; | |
6324 } | |
6325 UCharOffset++; | |
6326 primShift = 24; secShift = 24; terShift = 24; | |
6327 if(contractionOffset == coll->latinOneTableLen) { // we
need to reallocate | |
6328 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneT
ableLen, status)) { | |
6329 goto cleanup_after_failure; | |
6330 } | |
6331 } | |
6332 } while(*UCharOffset != 0xFFFF); | |
6333 } | |
6334 break;; | |
6335 case SPEC_PROC_TAG: | |
6336 { | |
6337 // 0xB7 is a precontext character defined in UCA5.1, a speci
al | |
6338 // handle is implemeted in order to save LatinOne table for | |
6339 // most locales. | |
6340 if (ch==0xb7) { | |
6341 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShif
t, &terShift); | |
6342 } | |
6343 else { | |
6344 goto cleanup_after_failure; | |
6345 } | |
6346 } | |
6347 break; | |
6348 default: | |
6349 goto cleanup_after_failure; | |
6350 } | |
6351 } | |
6352 } | |
6353 // compact table | |
6354 if(contractionOffset < coll->latinOneTableLen) { | |
6355 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { | |
6356 goto cleanup_after_failure; | |
6357 } | |
6358 } | |
6359 ucol_closeElements(it); | |
6360 return result; | |
6361 | |
6362 cleanup_after_failure: | |
6363 // status should already be set before arriving here. | |
6364 coll->latinOneFailed = TRUE; | |
6365 ucol_closeElements(it); | |
6366 return FALSE; | |
6367 } | |
6368 | |
6369 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { | |
6370 if(U_SUCCESS(*status)) { | |
6371 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
6372 coll->caseSwitch = UCOL_CASE_SWITCH; | |
6373 } else { | |
6374 coll->caseSwitch = UCOL_NO_CASE_SWITCH; | |
6375 } | |
6376 | |
6377 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { | |
6378 coll->tertiaryMask = UCOL_REMOVE_CASE; | |
6379 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
6380 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /*
Should be 0x80 */ | |
6381 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; | |
6382 coll->tertiaryBottom = UCOL_COMMON_BOT3; | |
6383 } else { | |
6384 coll->tertiaryMask = UCOL_KEEP_CASE; | |
6385 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; | |
6386 if(coll->caseFirst == UCOL_UPPER_FIRST) { | |
6387 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; | |
6388 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; | |
6389 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; | |
6390 } else { | |
6391 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; | |
6392 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; | |
6393 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; | |
6394 } | |
6395 } | |
6396 | |
6397 /* Set the compression values */ | |
6398 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBott
om - 1); | |
6399 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* w
e multilply double with int, but need only int */ | |
6400 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopC
ount); | |
6401 | |
6402 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY | |
6403 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == U
COL_NON_IGNORABLE) | |
6404 { | |
6405 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; | |
6406 } else { | |
6407 coll->sortKeyGen = ucol_calcSortKey; | |
6408 } | |
6409 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && col
l->numericCollation == UCOL_OFF | |
6410 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneF
ailed) | |
6411 { | |
6412 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { | |
6413 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in build
ing latin1 table, we'll use it | |
6414 //fprintf(stderr, "F"); | |
6415 coll->latinOneUse = TRUE; | |
6416 } else { | |
6417 coll->latinOneUse = FALSE; | |
6418 } | |
6419 if(*status == U_UNSUPPORTED_ERROR) { | |
6420 *status = U_ZERO_ERROR; | |
6421 } | |
6422 } else { // latin1Table exists and it doesn't need to be regenerated
, just use it | |
6423 coll->latinOneUse = TRUE; | |
6424 } | |
6425 } else { | |
6426 coll->latinOneUse = FALSE; | |
6427 } | |
6428 } | |
6429 } | 311 } |
6430 | 312 |
6431 U_CAPI uint32_t U_EXPORT2 | 313 U_CAPI uint32_t U_EXPORT2 |
6432 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
e *status) { | 314 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
e *status) { |
6433 if(U_FAILURE(*status) || coll == NULL) { | 315 if(U_FAILURE(*status) || coll == NULL) { |
6434 return 0; | 316 return 0; |
6435 } | 317 } |
6436 if(len == -1) { | 318 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status); |
6437 len = u_strlen(varTop); | |
6438 } | |
6439 if(len == 0) { | |
6440 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6441 return 0; | |
6442 } | |
6443 | |
6444 if(coll->delegate!=NULL) { | |
6445 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); | |
6446 } | |
6447 | |
6448 | |
6449 collIterate s; | |
6450 IInit_collIterate(coll, varTop, len, &s, status); | |
6451 if(U_FAILURE(*status)) { | |
6452 return 0; | |
6453 } | |
6454 | |
6455 uint32_t CE = ucol_IGetNextCE(coll, &s, status); | |
6456 | |
6457 /* here we check if we have consumed all characters */ | |
6458 /* you can put in either one character or a contraction */ | |
6459 /* you shouldn't put more... */ | |
6460 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { | |
6461 *status = U_CE_NOT_FOUND_ERROR; | |
6462 return 0; | |
6463 } | |
6464 | |
6465 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); | |
6466 | |
6467 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { | |
6468 *status = U_PRIMARY_TOO_LONG_ERROR; | |
6469 return 0; | |
6470 } | |
6471 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { | |
6472 coll->variableTopValueisDefault = FALSE; | |
6473 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; | |
6474 } | |
6475 | |
6476 /* To avoid memory leak, free the offset buffer if necessary. */ | |
6477 ucol_freeOffsetBuffer(&s); | |
6478 | |
6479 return CE & UCOL_PRIMARYMASK; | |
6480 } | 319 } |
6481 | 320 |
6482 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode
*status) { | 321 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode
*status) { |
6483 if(U_FAILURE(*status) || coll == NULL) { | 322 if(U_FAILURE(*status) || coll == NULL) { |
6484 return 0; | 323 return 0; |
6485 } | 324 } |
6486 if(coll->delegate!=NULL) { | 325 return Collator::fromUCollator(coll)->getVariableTop(*status); |
6487 return ((const Collator*)coll->delegate)->getVariableTop(*status); | |
6488 } | |
6489 return coll->variableTopValue<<16; | |
6490 } | 326 } |
6491 | 327 |
6492 U_CAPI void U_EXPORT2 | 328 U_CAPI void U_EXPORT2 |
6493 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat
us) { | 329 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *stat
us) { |
6494 if(U_FAILURE(*status) || coll == NULL) { | 330 if(U_FAILURE(*status) || coll == NULL) { |
6495 return; | 331 return; |
6496 } | 332 } |
6497 | 333 Collator::fromUCollator(coll)->setVariableTop(varTop, *status); |
6498 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { | 334 } |
6499 coll->variableTopValueisDefault = FALSE; | 335 |
6500 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; | |
6501 } | |
6502 } | |
6503 /* Attribute setter API */ | |
6504 U_CAPI void U_EXPORT2 | 336 U_CAPI void U_EXPORT2 |
6505 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UErrorCode *status) { | 337 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UErrorCode *status) { |
6506 if(U_FAILURE(*status) || coll == NULL) { | 338 if(U_FAILURE(*status) || coll == NULL) { |
6507 return; | 339 return; |
6508 } | 340 } |
6509 | 341 |
6510 if(coll->delegate != NULL) { | 342 Collator::fromUCollator(coll)->setAttribute(attr, value, *status); |
6511 ((Collator*)coll->delegate)->setAttribute(attr,value,*status); | |
6512 return; | |
6513 } | |
6514 | |
6515 UColAttributeValue oldFrench = coll->frenchCollation; | |
6516 UColAttributeValue oldCaseFirst = coll->caseFirst; | |
6517 switch(attr) { | |
6518 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ | |
6519 if(value == UCOL_ON) { | |
6520 coll->numericCollation = UCOL_ON; | |
6521 coll->numericCollationisDefault = FALSE; | |
6522 } else if (value == UCOL_OFF) { | |
6523 coll->numericCollation = UCOL_OFF; | |
6524 coll->numericCollationisDefault = FALSE; | |
6525 } else if (value == UCOL_DEFAULT) { | |
6526 coll->numericCollationisDefault = TRUE; | |
6527 coll->numericCollation = (UColAttributeValue)coll->options->numericC
ollation; | |
6528 } else { | |
6529 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6530 } | |
6531 break; | |
6532 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragan
a */ | |
6533 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { | |
6534 // This attribute is an implementation detail of the CLDR Japanese t
ailoring. | |
6535 // The implementation might change to use a different mechanism | |
6536 // to achieve the same Japanese sort order. | |
6537 // Since ICU 50, this attribute is not settable any more via API fun
ctions. | |
6538 } else { | |
6539 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6540 } | |
6541 break; | |
6542 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ | |
6543 if(value == UCOL_ON) { | |
6544 coll->frenchCollation = UCOL_ON; | |
6545 coll->frenchCollationisDefault = FALSE; | |
6546 } else if (value == UCOL_OFF) { | |
6547 coll->frenchCollation = UCOL_OFF; | |
6548 coll->frenchCollationisDefault = FALSE; | |
6549 } else if (value == UCOL_DEFAULT) { | |
6550 coll->frenchCollationisDefault = TRUE; | |
6551 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCol
lation; | |
6552 } else { | |
6553 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6554 } | |
6555 break; | |
6556 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
6557 if(value == UCOL_SHIFTED) { | |
6558 coll->alternateHandling = UCOL_SHIFTED; | |
6559 coll->alternateHandlingisDefault = FALSE; | |
6560 } else if (value == UCOL_NON_IGNORABLE) { | |
6561 coll->alternateHandling = UCOL_NON_IGNORABLE; | |
6562 coll->alternateHandlingisDefault = FALSE; | |
6563 } else if (value == UCOL_DEFAULT) { | |
6564 coll->alternateHandlingisDefault = TRUE; | |
6565 coll->alternateHandling = (UColAttributeValue)coll->options->alterna
teHandling ; | |
6566 } else { | |
6567 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6568 } | |
6569 break; | |
6570 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
6571 if(value == UCOL_LOWER_FIRST) { | |
6572 coll->caseFirst = UCOL_LOWER_FIRST; | |
6573 coll->caseFirstisDefault = FALSE; | |
6574 } else if (value == UCOL_UPPER_FIRST) { | |
6575 coll->caseFirst = UCOL_UPPER_FIRST; | |
6576 coll->caseFirstisDefault = FALSE; | |
6577 } else if (value == UCOL_OFF) { | |
6578 coll->caseFirst = UCOL_OFF; | |
6579 coll->caseFirstisDefault = FALSE; | |
6580 } else if (value == UCOL_DEFAULT) { | |
6581 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; | |
6582 coll->caseFirstisDefault = TRUE; | |
6583 } else { | |
6584 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6585 } | |
6586 break; | |
6587 case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
6588 if(value == UCOL_ON) { | |
6589 coll->caseLevel = UCOL_ON; | |
6590 coll->caseLevelisDefault = FALSE; | |
6591 } else if (value == UCOL_OFF) { | |
6592 coll->caseLevel = UCOL_OFF; | |
6593 coll->caseLevelisDefault = FALSE; | |
6594 } else if (value == UCOL_DEFAULT) { | |
6595 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; | |
6596 coll->caseLevelisDefault = TRUE; | |
6597 } else { | |
6598 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6599 } | |
6600 break; | |
6601 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
6602 if(value == UCOL_ON) { | |
6603 coll->normalizationMode = UCOL_ON; | |
6604 coll->normalizationModeisDefault = FALSE; | |
6605 initializeFCD(status); | |
6606 } else if (value == UCOL_OFF) { | |
6607 coll->normalizationMode = UCOL_OFF; | |
6608 coll->normalizationModeisDefault = FALSE; | |
6609 } else if (value == UCOL_DEFAULT) { | |
6610 coll->normalizationModeisDefault = TRUE; | |
6611 coll->normalizationMode = (UColAttributeValue)coll->options->normali
zationMode; | |
6612 if(coll->normalizationMode == UCOL_ON) { | |
6613 initializeFCD(status); | |
6614 } | |
6615 } else { | |
6616 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6617 } | |
6618 break; | |
6619 case UCOL_STRENGTH: /* attribute for strength */ | |
6620 if (value == UCOL_DEFAULT) { | |
6621 coll->strengthisDefault = TRUE; | |
6622 coll->strength = (UColAttributeValue)coll->options->strength; | |
6623 } else if (value <= UCOL_IDENTICAL) { | |
6624 coll->strengthisDefault = FALSE; | |
6625 coll->strength = value; | |
6626 } else { | |
6627 *status = U_ILLEGAL_ARGUMENT_ERROR ; | |
6628 } | |
6629 break; | |
6630 case UCOL_ATTRIBUTE_COUNT: | |
6631 default: | |
6632 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6633 break; | |
6634 } | |
6635 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { | |
6636 coll->latinOneRegenTable = TRUE; | |
6637 } else { | |
6638 coll->latinOneRegenTable = FALSE; | |
6639 } | |
6640 ucol_updateInternalState(coll, status); | |
6641 } | 343 } |
6642 | 344 |
6643 U_CAPI UColAttributeValue U_EXPORT2 | 345 U_CAPI UColAttributeValue U_EXPORT2 |
6644 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
{ | 346 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
{ |
6645 if(U_FAILURE(*status) || coll == NULL) { | 347 if(U_FAILURE(*status) || coll == NULL) { |
6646 return UCOL_DEFAULT; | 348 return UCOL_DEFAULT; |
6647 } | 349 } |
6648 | 350 |
6649 if(coll->delegate != NULL) { | 351 return Collator::fromUCollator(coll)->getAttribute(attr, *status); |
6650 return ((Collator*)coll->delegate)->getAttribute(attr,*status); | |
6651 } | |
6652 | |
6653 switch(attr) { | |
6654 case UCOL_NUMERIC_COLLATION: | |
6655 return coll->numericCollation; | |
6656 case UCOL_HIRAGANA_QUATERNARY_MODE: | |
6657 return coll->hiraganaQ; | |
6658 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*
/ | |
6659 return coll->frenchCollation; | |
6660 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ | |
6661 return coll->alternateHandling; | |
6662 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ | |
6663 return coll->caseFirst; | |
6664 case UCOL_CASE_LEVEL: /* do we have an extra case level */ | |
6665 return coll->caseLevel; | |
6666 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ | |
6667 return coll->normalizationMode; | |
6668 case UCOL_STRENGTH: /* attribute for strength */ | |
6669 return coll->strength; | |
6670 case UCOL_ATTRIBUTE_COUNT: | |
6671 default: | |
6672 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6673 break; | |
6674 } | |
6675 return UCOL_DEFAULT; | |
6676 } | 352 } |
6677 | 353 |
6678 U_CAPI void U_EXPORT2 | 354 U_CAPI void U_EXPORT2 |
6679 ucol_setStrength( UCollator *coll, | 355 ucol_setStrength( UCollator *coll, |
6680 UCollationStrength strength) | 356 UCollationStrength strength) |
6681 { | 357 { |
6682 UErrorCode status = U_ZERO_ERROR; | 358 UErrorCode status = U_ZERO_ERROR; |
6683 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); | 359 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); |
6684 } | 360 } |
6685 | 361 |
6686 U_CAPI UCollationStrength U_EXPORT2 | 362 U_CAPI UCollationStrength U_EXPORT2 |
6687 ucol_getStrength(const UCollator *coll) | 363 ucol_getStrength(const UCollator *coll) |
6688 { | 364 { |
6689 UErrorCode status = U_ZERO_ERROR; | 365 UErrorCode status = U_ZERO_ERROR; |
6690 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); | 366 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); |
6691 } | 367 } |
6692 | 368 |
6693 U_CAPI int32_t U_EXPORT2 | 369 U_CAPI int32_t U_EXPORT2 |
6694 ucol_getReorderCodes(const UCollator *coll, | 370 ucol_getReorderCodes(const UCollator *coll, |
6695 int32_t *dest, | 371 int32_t *dest, |
6696 int32_t destCapacity, | 372 int32_t destCapacity, |
6697 UErrorCode *status) { | 373 UErrorCode *status) { |
6698 if (U_FAILURE(*status)) { | 374 if (U_FAILURE(*status)) { |
6699 return 0; | 375 return 0; |
6700 } | 376 } |
6701 | 377 |
6702 if(coll->delegate!=NULL) { | 378 return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *s
tatus); |
6703 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapaci
ty, *status); | |
6704 } | |
6705 | |
6706 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
6707 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6708 return 0; | |
6709 } | |
6710 | |
6711 #ifdef UCOL_DEBUG | |
6712 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); | |
6713 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLe
ngth); | |
6714 #endif | |
6715 | |
6716 if (coll->reorderCodesLength > destCapacity) { | |
6717 *status = U_BUFFER_OVERFLOW_ERROR; | |
6718 return coll->reorderCodesLength; | |
6719 } | |
6720 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { | |
6721 dest[i] = coll->reorderCodes[i]; | |
6722 } | |
6723 return coll->reorderCodesLength; | |
6724 } | 379 } |
6725 | 380 |
6726 U_CAPI void U_EXPORT2 | 381 U_CAPI void U_EXPORT2 |
6727 ucol_setReorderCodes(UCollator* coll, | 382 ucol_setReorderCodes(UCollator* coll, |
6728 const int32_t* reorderCodes, | 383 const int32_t* reorderCodes, |
6729 int32_t reorderCodesLength, | 384 int32_t reorderCodesLength, |
6730 UErrorCode *status) { | 385 UErrorCode *status) { |
6731 if (U_FAILURE(*status)) { | 386 if (U_FAILURE(*status)) { |
6732 return; | 387 return; |
6733 } | 388 } |
6734 | 389 |
6735 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NUL
L)) { | 390 Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLen
gth, *status); |
6736 *status = U_ILLEGAL_ARGUMENT_ERROR; | |
6737 return; | |
6738 } | |
6739 | |
6740 if(coll->delegate!=NULL) { | |
6741 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLen
gth, *status); | |
6742 return; | |
6743 } | |
6744 | |
6745 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { | |
6746 uprv_free(coll->reorderCodes); | |
6747 } | |
6748 coll->reorderCodes = NULL; | |
6749 coll->freeReorderCodesOnClose = FALSE; | |
6750 coll->reorderCodesLength = 0; | |
6751 if (reorderCodesLength == 0) { | |
6752 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutat
ionTableOnClose == TRUE) { | |
6753 uprv_free(coll->leadBytePermutationTable); | |
6754 } | |
6755 coll->leadBytePermutationTable = NULL; | |
6756 coll->freeLeadBytePermutationTableOnClose = FALSE; | |
6757 return; | |
6758 } | |
6759 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int3
2_t)); | |
6760 if (coll->reorderCodes == NULL) { | |
6761 *status = U_MEMORY_ALLOCATION_ERROR; | |
6762 return; | |
6763 } | |
6764 coll->freeReorderCodesOnClose = TRUE; | |
6765 for (int32_t i = 0; i < reorderCodesLength; i++) { | |
6766 coll->reorderCodes[i] = reorderCodes[i]; | |
6767 } | |
6768 coll->reorderCodesLength = reorderCodesLength; | |
6769 ucol_buildPermutationTable(coll, status); | |
6770 } | 391 } |
6771 | 392 |
6772 U_CAPI int32_t U_EXPORT2 | 393 U_CAPI int32_t U_EXPORT2 |
6773 ucol_getEquivalentReorderCodes(int32_t reorderCode, | 394 ucol_getEquivalentReorderCodes(int32_t reorderCode, |
6774 int32_t* dest, | 395 int32_t* dest, |
6775 int32_t destCapacity, | 396 int32_t destCapacity, |
6776 UErrorCode *pErrorCode) { | 397 UErrorCode *pErrorCode) { |
6777 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; | 398 return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity,
*pErrorCode); |
6778 uint16_t leadBytes[256]; | 399 } |
6779 int leadBytesCount; | |
6780 int leadByteIndex; | |
6781 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; | |
6782 int reorderCodesForLeadByteCount; | |
6783 int reorderCodeIndex; | |
6784 | |
6785 int32_t equivalentCodesCount = 0; | |
6786 int setIndex; | |
6787 | |
6788 if (U_FAILURE(*pErrorCode)) { | |
6789 return 0; | |
6790 } | |
6791 | |
6792 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
6793 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
6794 return 0; | |
6795 } | |
6796 | |
6797 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); | |
6798 | |
6799 const UCollator* uca = ucol_initUCA(pErrorCode); | |
6800 if (U_FAILURE(*pErrorCode)) { | |
6801 » return 0; | |
6802 } | |
6803 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes
, 256); | |
6804 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { | |
6805 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( | |
6806 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE
_LIMIT); | |
6807 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCou
nt; reorderCodeIndex++) { | |
6808 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true
; | |
6809 } | |
6810 } | |
6811 | |
6812 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { | |
6813 if (equivalentCodesSet[setIndex] == true) { | |
6814 equivalentCodesCount++; | |
6815 } | |
6816 } | |
6817 | |
6818 if (destCapacity == 0) { | |
6819 return equivalentCodesCount; | |
6820 } | |
6821 | |
6822 equivalentCodesCount = 0; | |
6823 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { | |
6824 if (equivalentCodesSet[setIndex] == true) { | |
6825 dest[equivalentCodesCount++] = setIndex; | |
6826 if (equivalentCodesCount >= destCapacity) { | |
6827 break; | |
6828 } | |
6829 } | |
6830 } | |
6831 return equivalentCodesCount; | |
6832 } | |
6833 | |
6834 | |
6835 /****************************************************************************/ | |
6836 /* Following are misc functions */ | |
6837 /* there are new APIs and some compatibility APIs */ | |
6838 /****************************************************************************/ | |
6839 | 400 |
6840 U_CAPI void U_EXPORT2 | 401 U_CAPI void U_EXPORT2 |
6841 ucol_getVersion(const UCollator* coll, | 402 ucol_getVersion(const UCollator* coll, |
6842 UVersionInfo versionInfo) | 403 UVersionInfo versionInfo) |
6843 { | 404 { |
6844 if(coll->delegate!=NULL) { | 405 Collator::fromUCollator(coll)->getVersion(versionInfo); |
6845 ((const Collator*)coll->delegate)->getVersion(versionInfo); | |
6846 return; | |
6847 } | |
6848 /* RunTime version */ | |
6849 uint8_t rtVersion = UCOL_RUNTIME_VERSION; | |
6850 /* Builder version*/ | |
6851 uint8_t bdVersion = coll->image->version[0]; | |
6852 | |
6853 /* Charset Version. Need to get the version from cnv files | |
6854 * makeconv should populate cnv files with version and | |
6855 * an api has to be provided in ucnv.h to obtain this version | |
6856 */ | |
6857 uint8_t csVersion = 0; | |
6858 | |
6859 /* combine the version info */ | |
6860 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersi
on)); | |
6861 | |
6862 /* Tailoring rules */ | |
6863 versionInfo[0] = (uint8_t)(cmbVersion>>8); | |
6864 versionInfo[1] = (uint8_t)cmbVersion; | |
6865 versionInfo[2] = coll->image->version[1]; | |
6866 if(coll->UCA) { | |
6867 /* Include the minor number when getting the UCA version. (major & 1f) <
< 3 | (minor & 7) */ | |
6868 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->
UCA->image->UCAVersion[1] & 0x07); | |
6869 } else { | |
6870 versionInfo[3] = 0; | |
6871 } | |
6872 } | |
6873 | |
6874 | |
6875 /* This internal API checks whether a character is tailored or not */ | |
6876 U_CAPI UBool U_EXPORT2 | |
6877 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { | |
6878 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { | |
6879 return FALSE; | |
6880 } | |
6881 | |
6882 uint32_t CE = UCOL_NOT_FOUND; | |
6883 const UChar *ContractionStart = NULL; | |
6884 if(u < 0x100) { /* latin-1 */ | |
6885 CE = coll->latinOneMapping[u]; | |
6886 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { | |
6887 return FALSE; | |
6888 } | |
6889 } else { /* regular */ | |
6890 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); | |
6891 } | |
6892 | |
6893 if(isContraction(CE)) { | |
6894 ContractionStart = (UChar *)coll->image+getContractOffset(CE); | |
6895 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)
); | |
6896 } | |
6897 | |
6898 return (UBool)(CE != UCOL_NOT_FOUND); | |
6899 } | |
6900 | |
6901 | |
6902 /****************************************************************************/ | |
6903 /* Following are the string compare functions */ | |
6904 /* */ | |
6905 /****************************************************************************/ | |
6906 | |
6907 | |
6908 /* ucol_checkIdent internal function. Does byte level string compare. */ | |
6909 /* Used by strcoll if strength == identical and strings */ | |
6910 /* are otherwise equal. */ | |
6911 /* */ | |
6912 /* Comparison must be done on NFD normalized strings. */ | |
6913 /* FCD is not good enough. */ | |
6914 | |
6915 static | |
6916 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo
l normalize, UErrorCode *status) | |
6917 { | |
6918 // When we arrive here, we can have normal strings or UCharIterators. Curren
tly they are both | |
6919 // of same type, but that doesn't really mean that it will stay that way. | |
6920 int32_t comparison; | |
6921 | |
6922 if (sColl->flags & UCOL_USE_ITERATOR) { | |
6923 // The division for the array length may truncate the array size to | |
6924 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too
high | |
6925 // for all platforms anyway. | |
6926 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
6927 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
6928 UNormIterator *sNIt = NULL, *tNIt = NULL; | |
6929 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); | |
6930 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); | |
6931 sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
6932 tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
6933 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, sta
tus); | |
6934 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, sta
tus); | |
6935 comparison = u_strCompareIter(sIt, tIt, TRUE); | |
6936 unorm_closeIter(sNIt); | |
6937 unorm_closeIter(tNIt); | |
6938 } else { | |
6939 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl-
>endp - sColl->string) : -1; | |
6940 const UChar *sBuf = sColl->string; | |
6941 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl-
>endp - tColl->string) : -1; | |
6942 const UChar *tBuf = tColl->string; | |
6943 | |
6944 if (normalize) { | |
6945 *status = U_ZERO_ERROR; | |
6946 // Note: We could use Normalizer::compare() or similar, but for shor
t strings | |
6947 // which may not be in FCD it might be faster to just NFD them. | |
6948 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather tha
n | |
6949 // NFD'ing immediately might be faster for long strings, | |
6950 // but string comparison is usually done on relatively short strings
. | |
6951 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN
) == 0, sBuf, sLen), | |
6952 sColl->writableBuffer, | |
6953 *status); | |
6954 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN
) == 0, tBuf, tLen), | |
6955 tColl->writableBuffer, | |
6956 *status); | |
6957 if(U_FAILURE(*status)) { | |
6958 return UCOL_LESS; | |
6959 } | |
6960 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writ
ableBuffer); | |
6961 } else { | |
6962 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); | |
6963 } | |
6964 } | |
6965 | |
6966 if (comparison < 0) { | |
6967 return UCOL_LESS; | |
6968 } else if (comparison == 0) { | |
6969 return UCOL_EQUAL; | |
6970 } else /* comparison > 0 */ { | |
6971 return UCOL_GREATER; | |
6972 } | |
6973 } | |
6974 | |
6975 /* CEBuf - A struct and some inline functions to handle the saving */ | |
6976 /* of CEs in a buffer within ucol_strcoll */ | |
6977 | |
6978 #define UCOL_CEBUF_SIZE 512 | |
6979 typedef struct ucol_CEBuf { | |
6980 uint32_t *buf; | |
6981 uint32_t *endp; | |
6982 uint32_t *pos; | |
6983 uint32_t localArray[UCOL_CEBUF_SIZE]; | |
6984 } ucol_CEBuf; | |
6985 | |
6986 | |
6987 static | |
6988 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { | |
6989 (b)->buf = (b)->pos = (b)->localArray; | |
6990 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; | |
6991 } | |
6992 | |
6993 static | |
6994 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { | |
6995 uint32_t oldSize; | |
6996 uint32_t newSize; | |
6997 uint32_t *newBuf; | |
6998 | |
6999 ci->flags |= UCOL_ITER_ALLOCATED; | |
7000 oldSize = (uint32_t)(b->pos - b->buf); | |
7001 newSize = oldSize * 2; | |
7002 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); | |
7003 if(newBuf == NULL) { | |
7004 *status = U_MEMORY_ALLOCATION_ERROR; | |
7005 } | |
7006 else { | |
7007 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); | |
7008 if (b->buf != b->localArray) { | |
7009 uprv_free(b->buf); | |
7010 } | |
7011 b->buf = newBuf; | |
7012 b->endp = b->buf + newSize; | |
7013 b->pos = b->buf + oldSize; | |
7014 } | |
7015 } | |
7016 | |
7017 static | |
7018 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCo
de *status) { | |
7019 if (b->pos == b->endp) { | |
7020 ucol_CEBuf_Expand(b, ci, status); | |
7021 } | |
7022 if (U_SUCCESS(*status)) { | |
7023 *(b)->pos++ = ce; | |
7024 } | |
7025 } | |
7026 | |
7027 /* This is a trick string compare function that goes in and uses sortkeys to com
pare */ | |
7028 /* It is used when compare gets in trouble and needs to bail out
*/ | |
7029 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, | |
7030 collIterate *tColl, | |
7031 UErrorCode *status) | |
7032 { | |
7033 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; | |
7034 uint8_t *sourceKeyP = sourceKey; | |
7035 uint8_t *targetKeyP = targetKey; | |
7036 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; | |
7037 const UCollator *coll = sColl->coll; | |
7038 const UChar *source = NULL; | |
7039 const UChar *target = NULL; | |
7040 int32_t result = UCOL_EQUAL; | |
7041 UnicodeString sourceString, targetString; | |
7042 int32_t sourceLength; | |
7043 int32_t targetLength; | |
7044 | |
7045 if(sColl->flags & UCOL_USE_ITERATOR) { | |
7046 sColl->iterator->move(sColl->iterator, 0, UITER_START); | |
7047 tColl->iterator->move(tColl->iterator, 0, UITER_START); | |
7048 UChar32 c; | |
7049 while((c=sColl->iterator->next(sColl->iterator))>=0) { | |
7050 sourceString.append((UChar)c); | |
7051 } | |
7052 while((c=tColl->iterator->next(tColl->iterator))>=0) { | |
7053 targetString.append((UChar)c); | |
7054 } | |
7055 source = sourceString.getBuffer(); | |
7056 sourceLength = sourceString.length(); | |
7057 target = targetString.getBuffer(); | |
7058 targetLength = targetString.length(); | |
7059 } else { // no iterators | |
7060 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sCo
ll->string):-1; | |
7061 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tCo
ll->string):-1; | |
7062 source = sColl->string; | |
7063 target = tColl->string; | |
7064 } | |
7065 | |
7066 | |
7067 | |
7068 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourc
eKeyLen); | |
7069 if(sourceKeyLen > UCOL_MAX_BUFFER) { | |
7070 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); | |
7071 if(sourceKeyP == NULL) { | |
7072 *status = U_MEMORY_ALLOCATION_ERROR; | |
7073 goto cleanup_and_do_compare; | |
7074 } | |
7075 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, s
ourceKeyLen); | |
7076 } | |
7077 | |
7078 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targe
tKeyLen); | |
7079 if(targetKeyLen > UCOL_MAX_BUFFER) { | |
7080 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); | |
7081 if(targetKeyP == NULL) { | |
7082 *status = U_MEMORY_ALLOCATION_ERROR; | |
7083 goto cleanup_and_do_compare; | |
7084 } | |
7085 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, t
argetKeyLen); | |
7086 } | |
7087 | |
7088 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); | |
7089 | |
7090 cleanup_and_do_compare: | |
7091 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { | |
7092 uprv_free(sourceKeyP); | |
7093 } | |
7094 | |
7095 if(targetKeyP != NULL && targetKeyP != targetKey) { | |
7096 uprv_free(targetKeyP); | |
7097 } | |
7098 | |
7099 if(result<0) { | |
7100 return UCOL_LESS; | |
7101 } else if(result>0) { | |
7102 return UCOL_GREATER; | |
7103 } else { | |
7104 return UCOL_EQUAL; | |
7105 } | |
7106 } | |
7107 | |
7108 | |
7109 static UCollationResult | |
7110 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) | |
7111 { | |
7112 U_ALIGN_CODE(16); | |
7113 | |
7114 const UCollator *coll = sColl->coll; | |
7115 | |
7116 | |
7117 // setting up the collator parameters | |
7118 UColAttributeValue strength = coll->strength; | |
7119 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); | |
7120 | |
7121 UBool checkSecTer = initialCheckSecTer; | |
7122 UBool checkTertiary = (strength >= UCOL_TERTIARY); | |
7123 UBool checkQuad = (strength >= UCOL_QUATERNARY); | |
7124 UBool checkIdent = (strength == UCOL_IDENTICAL); | |
7125 UBool checkCase = (coll->caseLevel == UCOL_ON); | |
7126 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; | |
7127 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); | |
7128 UBool qShifted = shifted && checkQuad; | |
7129 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; | |
7130 | |
7131 if(doHiragana && shifted) { | |
7132 return (ucol_compareUsingSortKeys(sColl, tColl, status)); | |
7133 } | |
7134 uint8_t caseSwitch = coll->caseSwitch; | |
7135 uint8_t tertiaryMask = coll->tertiaryMask; | |
7136 | |
7137 // This is the lowest primary value that will not be ignored if shifted | |
7138 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; | |
7139 | |
7140 UCollationResult result = UCOL_EQUAL; | |
7141 UCollationResult hirResult = UCOL_EQUAL; | |
7142 | |
7143 // Preparing the CE buffers. They will be filled during the primary phase | |
7144 ucol_CEBuf sCEs; | |
7145 ucol_CEBuf tCEs; | |
7146 UCOL_INIT_CEBUF(&sCEs); | |
7147 UCOL_INIT_CEBUF(&tCEs); | |
7148 | |
7149 uint32_t secS = 0, secT = 0; | |
7150 uint32_t sOrder=0, tOrder=0; | |
7151 | |
7152 // Non shifted primary processing is quite simple | |
7153 if(!shifted) { | |
7154 for(;;) { | |
7155 // We fetch CEs until we hit a non ignorable primary or end. | |
7156 uint32_t sPrimary; | |
7157 do { | |
7158 // We get the next CE | |
7159 sOrder = ucol_IGetNextCE(coll, sColl, status); | |
7160 // Stuff it in the buffer | |
7161 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7162 // And keep just the primary part. | |
7163 sPrimary = sOrder & UCOL_PRIMARYMASK; | |
7164 } while(sPrimary == 0); | |
7165 | |
7166 // see the comments on the above block | |
7167 uint32_t tPrimary; | |
7168 do { | |
7169 tOrder = ucol_IGetNextCE(coll, tColl, status); | |
7170 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7171 tPrimary = tOrder & UCOL_PRIMARYMASK; | |
7172 } while(tPrimary == 0); | |
7173 | |
7174 // if both primaries are the same | |
7175 if(sPrimary == tPrimary) { | |
7176 // and there are no more CEs, we advance to the next level | |
7177 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) { | |
7178 break; | |
7179 } | |
7180 if(doHiragana && hirResult == UCOL_EQUAL) { | |
7181 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCO
L_WAS_HIRAGANA)) { | |
7182 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl
->flags & UCOL_WAS_HIRAGANA)) | |
7183 ? UCOL_LESS:UCOL_GREATER; | |
7184 } | |
7185 } | |
7186 } else { | |
7187 // only need to check one for continuation | |
7188 // if one is then the other must be or the preceding CE would be
a prefix of the other | |
7189 if (coll->leadBytePermutationTable != NULL && !isContinuation(sO
rder)) { | |
7190 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] <<
24) | (sPrimary & 0x00FFFFFF); | |
7191 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] <<
24) | (tPrimary & 0x00FFFFFF); | |
7192 } | |
7193 // if two primaries are different, we are done | |
7194 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER; | |
7195 goto commonReturn; | |
7196 } | |
7197 } // no primary difference... do the rest from the buffers | |
7198 } else { // shifted - do a slightly more complicated processing :) | |
7199 for(;;) { | |
7200 UBool sInShifted = FALSE; | |
7201 UBool tInShifted = FALSE; | |
7202 // This version of code can be refactored. However, it seems easier
to understand this way. | |
7203 // Source loop. Same as the target loop. | |
7204 for(;;) { | |
7205 sOrder = ucol_IGetNextCE(coll, sColl, status); | |
7206 if(sOrder == UCOL_NO_MORE_CES) { | |
7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7208 break; | |
7209 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMA
SK) == 0)) { | |
7210 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ | |
7211 continue; | |
7212 } else if(isContinuation(sOrder)) { | |
7213 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ | |
7214 if(sInShifted) { | |
7215 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ | |
7216 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7217 continue; | |
7218 } else { | |
7219 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7220 break; | |
7221 } | |
7222 } else { /* Just lower level values */ | |
7223 if(sInShifted) { | |
7224 continue; | |
7225 } else { | |
7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7227 continue; | |
7228 } | |
7229 } | |
7230 } else { /* regular */ | |
7231 if(coll->leadBytePermutationTable != NULL){ | |
7232 sOrder = (coll->leadBytePermutationTable[sOrder>>24] <<
24) | (sOrder & 0x00FFFFFF); | |
7233 } | |
7234 if((sOrder & UCOL_PRIMARYMASK) > LVT) { | |
7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7236 break; | |
7237 } else { | |
7238 if((sOrder & UCOL_PRIMARYMASK) > 0) { | |
7239 sInShifted = TRUE; | |
7240 sOrder &= UCOL_PRIMARYMASK; | |
7241 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7242 continue; | |
7243 } else { | |
7244 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); | |
7245 sInShifted = FALSE; | |
7246 continue; | |
7247 } | |
7248 } | |
7249 } | |
7250 } | |
7251 sOrder &= UCOL_PRIMARYMASK; | |
7252 sInShifted = FALSE; | |
7253 | |
7254 for(;;) { | |
7255 tOrder = ucol_IGetNextCE(coll, tColl, status); | |
7256 if(tOrder == UCOL_NO_MORE_CES) { | |
7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7258 break; | |
7259 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMA
SK) == 0)) { | |
7260 /* UCA amendment - ignore ignorables that follow shifted cod
e points */ | |
7261 continue; | |
7262 } else if(isContinuation(tOrder)) { | |
7263 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary va
lue */ | |
7264 if(tInShifted) { | |
7265 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* pres
erve interesting continuation */ | |
7266 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7267 continue; | |
7268 } else { | |
7269 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7270 break; | |
7271 } | |
7272 } else { /* Just lower level values */ | |
7273 if(tInShifted) { | |
7274 continue; | |
7275 } else { | |
7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7277 continue; | |
7278 } | |
7279 } | |
7280 } else { /* regular */ | |
7281 if(coll->leadBytePermutationTable != NULL){ | |
7282 tOrder = (coll->leadBytePermutationTable[tOrder>>24] <<
24) | (tOrder & 0x00FFFFFF); | |
7283 } | |
7284 if((tOrder & UCOL_PRIMARYMASK) > LVT) { | |
7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7286 break; | |
7287 } else { | |
7288 if((tOrder & UCOL_PRIMARYMASK) > 0) { | |
7289 tInShifted = TRUE; | |
7290 tOrder &= UCOL_PRIMARYMASK; | |
7291 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7292 continue; | |
7293 } else { | |
7294 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); | |
7295 tInShifted = FALSE; | |
7296 continue; | |
7297 } | |
7298 } | |
7299 } | |
7300 } | |
7301 tOrder &= UCOL_PRIMARYMASK; | |
7302 tInShifted = FALSE; | |
7303 | |
7304 if(sOrder == tOrder) { | |
7305 /* | |
7306 if(doHiragana && hirResult == UCOL_EQUAL) { | |
7307 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_
HIRAGANA)) { | |
7308 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags &
UCOL_WAS_HIRAGANA)) | |
7309 ? UCOL_LESS:UCOL_GREATER; | |
7310 } | |
7311 } | |
7312 */ | |
7313 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { | |
7314 break; | |
7315 } else { | |
7316 sOrder = 0; | |
7317 tOrder = 0; | |
7318 continue; | |
7319 } | |
7320 } else { | |
7321 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; | |
7322 goto commonReturn; | |
7323 } | |
7324 } /* no primary difference... do the rest from the buffers */ | |
7325 } | |
7326 | |
7327 /* now, we're gonna reexamine collected CEs */ | |
7328 uint32_t *sCE; | |
7329 uint32_t *tCE; | |
7330 | |
7331 /* This is the secondary level of comparison */ | |
7332 if(checkSecTer) { | |
7333 if(!isFrenchSec) { /* normal */ | |
7334 sCE = sCEs.buf; | |
7335 tCE = tCEs.buf; | |
7336 for(;;) { | |
7337 while (secS == 0) { | |
7338 secS = *(sCE++) & UCOL_SECONDARYMASK; | |
7339 } | |
7340 | |
7341 while(secT == 0) { | |
7342 secT = *(tCE++) & UCOL_SECONDARYMASK; | |
7343 } | |
7344 | |
7345 if(secS == secT) { | |
7346 if(secS == UCOL_NO_MORE_CES_SECONDARY) { | |
7347 break; | |
7348 } else { | |
7349 secS = 0; secT = 0; | |
7350 continue; | |
7351 } | |
7352 } else { | |
7353 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
7354 goto commonReturn; | |
7355 } | |
7356 } | |
7357 } else { /* do the French */ | |
7358 uint32_t *sCESave = NULL; | |
7359 uint32_t *tCESave = NULL; | |
7360 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimi
zed */ | |
7361 tCE = tCEs.pos-2; | |
7362 for(;;) { | |
7363 while (secS == 0 && sCE >= sCEs.buf) { | |
7364 if(sCESave == NULL) { | |
7365 secS = *(sCE--); | |
7366 if(isContinuation(secS)) { | |
7367 while(isContinuation(secS = *(sCE--))) | |
7368 ; | |
7369 /* after this, secS has the start of continuation, a
nd sCEs points before that */ | |
7370 sCESave = sCE; /* we save it, so that we know where
to come back AND that we need to go forward */ | |
7371 sCE+=2; /* need to point to the first continuation
CP */ | |
7372 /* However, now you can just continue doing stuff */ | |
7373 } | |
7374 } else { | |
7375 secS = *(sCE++); | |
7376 if(!isContinuation(secS)) { /* This means we have finish
ed with this cont */ | |
7377 sCE = sCESave; /* reset the pointer to be
fore continuation */ | |
7378 sCESave = NULL; | |
7379 secS = 0; /* Fetch a fresh CE before the continuati
on sequence. */ | |
7380 continue; | |
7381 } | |
7382 } | |
7383 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ | |
7384 } | |
7385 | |
7386 while(secT == 0 && tCE >= tCEs.buf) { | |
7387 if(tCESave == NULL) { | |
7388 secT = *(tCE--); | |
7389 if(isContinuation(secT)) { | |
7390 while(isContinuation(secT = *(tCE--))) | |
7391 ; | |
7392 /* after this, secS has the start of continuation, a
nd sCEs points before that */ | |
7393 tCESave = tCE; /* we save it, so that we know where
to come back AND that we need to go forward */ | |
7394 tCE+=2; /* need to point to the first continuation
CP */ | |
7395 /* However, now you can just continue doing stuff */ | |
7396 } | |
7397 } else { | |
7398 secT = *(tCE++); | |
7399 if(!isContinuation(secT)) { /* This means we have finish
ed with this cont */ | |
7400 tCE = tCESave; /* reset the pointer to befo
re continuation */ | |
7401 tCESave = NULL; | |
7402 secT = 0; /* Fetch a fresh CE before the continuati
on sequence. */ | |
7403 continue; | |
7404 } | |
7405 } | |
7406 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit *
/ | |
7407 } | |
7408 | |
7409 if(secS == secT) { | |
7410 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf &&
tCE < tCEs.buf)) { | |
7411 break; | |
7412 } else { | |
7413 secS = 0; secT = 0; | |
7414 continue; | |
7415 } | |
7416 } else { | |
7417 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
7418 goto commonReturn; | |
7419 } | |
7420 } | |
7421 } | |
7422 } | |
7423 | |
7424 /* doing the case bit */ | |
7425 if(checkCase) { | |
7426 sCE = sCEs.buf; | |
7427 tCE = tCEs.buf; | |
7428 for(;;) { | |
7429 while((secS & UCOL_REMOVE_CASE) == 0) { | |
7430 if(!isContinuation(*sCE++)) { | |
7431 secS =*(sCE-1); | |
7432 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { | |
7433 // primary ignorables should not be considered on the ca
se level when the strength is primary | |
7434 // otherwise, the CEs stop being well-formed | |
7435 secS &= UCOL_TERT_CASE_MASK; | |
7436 secS ^= caseSwitch; | |
7437 } else { | |
7438 secS = 0; | |
7439 } | |
7440 } else { | |
7441 secS = 0; | |
7442 } | |
7443 } | |
7444 | |
7445 while((secT & UCOL_REMOVE_CASE) == 0) { | |
7446 if(!isContinuation(*tCE++)) { | |
7447 secT = *(tCE-1); | |
7448 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMA
RY) { | |
7449 // primary ignorables should not be considered on the ca
se level when the strength is primary | |
7450 // otherwise, the CEs stop being well-formed | |
7451 secT &= UCOL_TERT_CASE_MASK; | |
7452 secT ^= caseSwitch; | |
7453 } else { | |
7454 secT = 0; | |
7455 } | |
7456 } else { | |
7457 secT = 0; | |
7458 } | |
7459 } | |
7460 | |
7461 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { | |
7462 result = UCOL_LESS; | |
7463 goto commonReturn; | |
7464 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK))
{ | |
7465 result = UCOL_GREATER; | |
7466 goto commonReturn; | |
7467 } | |
7468 | |
7469 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT &
UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { | |
7470 break; | |
7471 } else { | |
7472 secS = 0; | |
7473 secT = 0; | |
7474 } | |
7475 } | |
7476 } | |
7477 | |
7478 /* Tertiary level */ | |
7479 if(checkTertiary) { | |
7480 secS = 0; | |
7481 secT = 0; | |
7482 sCE = sCEs.buf; | |
7483 tCE = tCEs.buf; | |
7484 for(;;) { | |
7485 while((secS & UCOL_REMOVE_CASE) == 0) { | |
7486 sOrder = *sCE++; | |
7487 secS = sOrder & tertiaryMask; | |
7488 if(!isContinuation(sOrder)) { | |
7489 secS ^= caseSwitch; | |
7490 } else { | |
7491 secS &= UCOL_REMOVE_CASE; | |
7492 } | |
7493 } | |
7494 | |
7495 while((secT & UCOL_REMOVE_CASE) == 0) { | |
7496 tOrder = *tCE++; | |
7497 secT = tOrder & tertiaryMask; | |
7498 if(!isContinuation(tOrder)) { | |
7499 secT ^= caseSwitch; | |
7500 } else { | |
7501 secT &= UCOL_REMOVE_CASE; | |
7502 } | |
7503 } | |
7504 | |
7505 if(secS == secT) { | |
7506 if((secS & UCOL_REMOVE_CASE) == 1) { | |
7507 break; | |
7508 } else { | |
7509 secS = 0; secT = 0; | |
7510 continue; | |
7511 } | |
7512 } else { | |
7513 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
7514 goto commonReturn; | |
7515 } | |
7516 } | |
7517 } | |
7518 | |
7519 | |
7520 if(qShifted /*checkQuad*/) { | |
7521 UBool sInShifted = TRUE; | |
7522 UBool tInShifted = TRUE; | |
7523 secS = 0; | |
7524 secT = 0; | |
7525 sCE = sCEs.buf; | |
7526 tCE = tCEs.buf; | |
7527 for(;;) { | |
7528 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(sec
S) && !sInShifted)) { | |
7529 secS = *(sCE++); | |
7530 if(isContinuation(secS)) { | |
7531 if(!sInShifted) { | |
7532 continue; | |
7533 } | |
7534 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non
continuation */ | |
7535 secS = UCOL_PRIMARYMASK; | |
7536 sInShifted = FALSE; | |
7537 } else { | |
7538 sInShifted = TRUE; | |
7539 } | |
7540 } | |
7541 secS &= UCOL_PRIMARYMASK; | |
7542 | |
7543 | |
7544 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(sec
T) && !tInShifted)) { | |
7545 secT = *(tCE++); | |
7546 if(isContinuation(secT)) { | |
7547 if(!tInShifted) { | |
7548 continue; | |
7549 } | |
7550 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { | |
7551 secT = UCOL_PRIMARYMASK; | |
7552 tInShifted = FALSE; | |
7553 } else { | |
7554 tInShifted = TRUE; | |
7555 } | |
7556 } | |
7557 secT &= UCOL_PRIMARYMASK; | |
7558 | |
7559 if(secS == secT) { | |
7560 if(secS == UCOL_NO_MORE_CES_PRIMARY) { | |
7561 break; | |
7562 } else { | |
7563 secS = 0; secT = 0; | |
7564 continue; | |
7565 } | |
7566 } else { | |
7567 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; | |
7568 goto commonReturn; | |
7569 } | |
7570 } | |
7571 } else if(doHiragana && hirResult != UCOL_EQUAL) { | |
7572 // If we're fine on quaternaries, we might be different | |
7573 // on Hiragana. This, however, might fail us in shifted. | |
7574 result = hirResult; | |
7575 goto commonReturn; | |
7576 } | |
7577 | |
7578 /* For IDENTICAL comparisons, we use a bitwise character comparison */ | |
7579 /* as a tiebreaker if all else is equal. */ | |
7580 /* Getting here should be quite rare - strings are not identical - */ | |
7581 /* that is checked first, but compared == through all other checks. */ | |
7582 if(checkIdent) | |
7583 { | |
7584 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UC
OL_ON); | |
7585 result = ucol_checkIdent(sColl, tColl, TRUE, status); | |
7586 } | |
7587 | |
7588 commonReturn: | |
7589 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { | |
7590 if (sCEs.buf != sCEs.localArray ) { | |
7591 uprv_free(sCEs.buf); | |
7592 } | |
7593 if (tCEs.buf != tCEs.localArray ) { | |
7594 uprv_free(tCEs.buf); | |
7595 } | |
7596 } | |
7597 | |
7598 return result; | |
7599 } | |
7600 | |
7601 static UCollationResult | |
7602 ucol_strcollRegular(const UCollator *coll, | |
7603 const UChar *source, int32_t sourceLength, | |
7604 const UChar *target, int32_t targetLength, | |
7605 UErrorCode *status) { | |
7606 collIterate sColl, tColl; | |
7607 // Preparing the context objects for iterating over strings | |
7608 IInit_collIterate(coll, source, sourceLength, &sColl, status); | |
7609 IInit_collIterate(coll, target, targetLength, &tColl, status); | |
7610 if(U_FAILURE(*status)) { | |
7611 return UCOL_LESS; | |
7612 } | |
7613 return ucol_strcollRegular(&sColl, &tColl, status); | |
7614 } | |
7615 | |
7616 static inline uint32_t | |
7617 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, | |
7618 uint32_t CE, const UChar *s, int32_t *index, int32_t l
en) | |
7619 { | |
7620 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); | |
7621 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; | |
7622 int32_t offset = 1; | |
7623 UChar schar = 0, tchar = 0; | |
7624 | |
7625 for(;;) { | |
7626 if(len == -1) { | |
7627 if(s[*index] == 0) { // end of string | |
7628 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); | |
7629 } else { | |
7630 schar = s[*index]; | |
7631 } | |
7632 } else { | |
7633 if(*index == len) { | |
7634 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOn
eOffset]); | |
7635 } else { | |
7636 schar = s[*index]; | |
7637 } | |
7638 } | |
7639 | |
7640 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio
n codepoints should be ordered, we skip all that are smaller */ | |
7641 offset++; | |
7642 } | |
7643 | |
7644 if (schar == tchar) { | |
7645 (*index)++; | |
7646 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set+offset]); | |
7647 } | |
7648 else | |
7649 { | |
7650 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { | |
7651 return UCOL_BAIL_OUT_CE; | |
7652 } | |
7653 // skip completely ignorables | |
7654 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); | |
7655 if(isZeroCE == 0) { // we have to ignore completely ignorables | |
7656 (*index)++; | |
7657 continue; | |
7658 } | |
7659 | |
7660 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
7661 } | |
7662 } | |
7663 } | |
7664 | |
7665 | |
7666 /** | |
7667 * This is a fast strcoll, geared towards text in Latin-1. | |
7668 * It supports contractions of size two, French secondaries | |
7669 * and case switching. You can use it with strengths primary | |
7670 * to tertiary. It does not support shifted and case level. | |
7671 * It relies on the table build by setupLatin1Table. If it | |
7672 * doesn't understand something, it will go to the regular | |
7673 * strcoll. | |
7674 */ | |
/*
 * Fast-path comparison of two UTF-16 strings driven by coll->latinOneCEs.
 *
 * sLen / tLen may be -1 for NUL-terminated input. The primary level is always
 * compared; the secondary level is compared when coll->strength is at least
 * UCOL_SECONDARY (walking backwards when coll->frenchCollation is not
 * UCOL_OFF); the tertiary level is compared when coll->strength is at least
 * UCOL_TERTIARY. Each level reads its own latinOneTableLen-sized slice of the
 * table.
 *
 * The whole comparison is handed to ucol_strcollRegular when a code unit with
 * bits in 0xFF00 is seen, when a table entry is a special CE that
 * ucol_getLatinOneContraction does not resolve to a regular CE, or when
 * French secondary ordering is requested and a contraction was encountered.
 * status is not examined here; it is only forwarded to ucol_strcollRegular.
 *
 * Returns UCOL_LESS, UCOL_GREATER or UCOL_EQUAL.
 */
7675 static UCollationResult | |
7676 ucol_strcollUseLatin1( const UCollator *coll, | |
7677 const UChar *source, | |
7678 int32_t sLen, | |
7679 const UChar *target, | |
7680 int32_t tLen, | |
7681 UErrorCode *status) | |
7682 { | |
7683 U_ALIGN_CODE(16); | |
7684 int32_t strength = coll->strength; | |
7685 | |
7686 int32_t sIndex = 0, tIndex = 0; | |
7687 UChar sChar = 0, tChar = 0; | |
7688 uint32_t sOrder=0, tOrder=0; | |
7689 | |
7690 UBool endOfSource = FALSE; | |
7691 | |
7692 uint32_t *elements = coll->latinOneCEs; | |
7693 | |
7694 UBool haveContractions = FALSE; // if we have contractions in our string | |
7695 // we cannot do French secondary | |
7696 | |
7697 // Do the primary level | |
/* sOrder / tOrder hold the not-yet-compared weight bytes of the current CEs. */
/* A value of 0 means "fetch the next CE" in the two inner while loops. */
7698 for(;;) { | |
7699 while(sOrder==0) { // this loop skips primary ignorables | |
7700 // sOrder=getNextlatinOneCE(source); | |
7701 if(sLen==-1) { // handling zero terminated strings | |
7702 sChar=source[sIndex++]; | |
7703 if(sChar==0) { | |
7704 endOfSource = TRUE; | |
7705 break; | |
7706 } | |
7707 } else { // handling strings with known length | |
7708 if(sIndex==sLen) { | |
7709 endOfSource = TRUE; | |
7710 break; | |
7711 } | |
7712 sChar=source[sIndex++]; | |
7713 } | |
7714 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) | |
7715 //fprintf(stderr, "R"); | |
7716 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
7717 } | |
7718 sOrder = elements[sChar]; | |
7719 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special | |
7720 // specials can basically be either contractions or bail-out sig
ns. If we get anything | |
7721 // else, we'll bail out anyway | |
7722 if(getCETag(sOrder) == CONTRACTION_TAG) { | |
7723 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOr
der, source, &sIndex, sLen); | |
7724 haveContractions = TRUE; // if there are contractions, we ca
nnot do French secondary | |
7725 // However, if there are contractions in the table, but we a
lways use just one char, | |
7726 // we might be able to do French. This should be checked out
. | |
7727 } | |
7728 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
7729 //fprintf(stderr, "S"); | |
7730 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); | |
7731 } | |
7732 } | |
7733 } | |
7734 | |
7735 while(tOrder==0) { // this loop skips primary ignorables | |
7736 // tOrder=getNextlatinOneCE(target); | |
7737 if(tLen==-1) { // handling zero terminated strings | |
7738 tChar=target[tIndex++]; | |
7739 if(tChar==0) { | |
7740 if(endOfSource) { // this is different than source loop, | |
7741 // as we already know that source loop is done here, | |
7742 // so we can either finish the primary loop if both | |
7743 // strings are done or announce the result if only | |
7744 // target is done. Same below. | |
7745 goto endOfPrimLoop; | |
7746 } else { | |
7747 return UCOL_GREATER; | |
7748 } | |
7749 } | |
7750 } else { // handling strings with known length | |
7751 if(tIndex==tLen) { | |
7752 if(endOfSource) { | |
7753 goto endOfPrimLoop; | |
7754 } else { | |
7755 return UCOL_GREATER; | |
7756 } | |
7757 } | |
7758 tChar=target[tIndex++]; | |
7759 } | |
7760 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sCha
r > 0xFF, but this is faster on win32) | |
7761 //fprintf(stderr, "R"); | |
7762 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
7763 } | |
7764 tOrder = elements[tChar]; | |
7765 if(tOrder >= UCOL_NOT_FOUND) { | |
7766 // Handling specials, see the comments for source | |
7767 if(getCETag(tOrder) == CONTRACTION_TAG) { | |
7768 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOr
der, target, &tIndex, tLen); | |
7769 haveContractions = TRUE; | |
7770 } | |
7771 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
7772 //fprintf(stderr, "S"); | |
7773 return ucol_strcollRegular(coll, source, sLen, target, tLen,
status); | |
7774 } | |
7775 } | |
7776 } | |
7777 if(endOfSource) { // source is finished, but target is not, say the resu
lt. | |
7778 return UCOL_LESS; | |
7779 } | |
7780 | |
7781 if(sOrder == tOrder) { // if we have same CEs, we continue the loop | |
7782 sOrder = 0; tOrder = 0; | |
7783 continue; | |
7784 } else { | |
7785 // compare current top bytes | |
7786 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
7787 // top bytes differ, return difference | |
7788 if(sOrder < tOrder) { | |
7789 return UCOL_LESS; | |
7790 } else if(sOrder > tOrder) { | |
7791 return UCOL_GREATER; | |
7792 } | |
7793 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24)
; | |
7794 // since we must return enum value | |
7795 } | |
7796 | |
7797 // top bytes match, continue with following bytes | |
/* Once the shifted value reaches 0, the next pass fetches a fresh CE for that side. */
7798 sOrder<<=8; | |
7799 tOrder<<=8; | |
7800 } | |
7801 } | |
7802 | |
7803 endOfPrimLoop: | |
7804 // after primary loop, we definitely know the sizes of strings, | |
7805 // so we set it and use simpler loop for secondaries and tertiaries | |
/* For NUL-terminated input the index was advanced past the terminator before */
/* it was detected, so these lengths count the terminator as well. */
7806 sLen = sIndex; tLen = tIndex; | |
7807 if(strength >= UCOL_SECONDARY) { | |
7808 // adjust the table beginning | |
7809 elements += coll->latinOneTableLen; | |
7810 endOfSource = FALSE; | |
7811 | |
7812 if(coll->frenchCollation == UCOL_OFF) { // non French | |
7813 // This loop is a simplified copy of primary loop | |
7814 // at this point we know that whole strings are latin-1, so we don't | |
7815 // check for that. We also know that we only have contractions as | |
7816 // specials. | |
7817 sIndex = 0; tIndex = 0; | |
7818 for(;;) { | |
7819 while(sOrder==0) { | |
7820 if(sIndex==sLen) { | |
7821 endOfSource = TRUE; | |
7822 break; | |
7823 } | |
7824 sChar=source[sIndex++]; | |
7825 sOrder = elements[sChar]; | |
7826 if(sOrder > UCOL_NOT_FOUND) { | |
7827 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, sOrder, source, &sIndex, sLen); | |
7828 } | |
7829 } | |
7830 | |
7831 while(tOrder==0) { | |
7832 if(tIndex==tLen) { | |
7833 if(endOfSource) { | |
7834 goto endOfSecLoop; | |
7835 } else { | |
7836 return UCOL_GREATER; | |
7837 } | |
7838 } | |
7839 tChar=target[tIndex++]; | |
7840 tOrder = elements[tChar]; | |
7841 if(tOrder > UCOL_NOT_FOUND) { | |
7842 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDAR
Y, tOrder, target, &tIndex, tLen); | |
7843 } | |
7844 } | |
7845 if(endOfSource) { | |
7846 return UCOL_LESS; | |
7847 } | |
7848 | |
7849 if(sOrder == tOrder) { | |
7850 sOrder = 0; tOrder = 0; | |
7851 continue; | |
7852 } else { | |
7853 // see primary loop for comments on this | |
7854 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
7855 if(sOrder < tOrder) { | |
7856 return UCOL_LESS; | |
7857 } else if(sOrder > tOrder) { | |
7858 return UCOL_GREATER; | |
7859 } | |
7860 } | |
7861 sOrder<<=8; | |
7862 tOrder<<=8; | |
7863 } | |
7864 } | |
7865 } else { // French | |
7866 if(haveContractions) { // if we have contractions, we have to bail o
ut | |
7867 // since we don't really know how to handle them here | |
/* sLen / tLen were replaced by the scanned lengths above, so explicit lengths are passed here. */
7868 return ucol_strcollRegular(coll, source, sLen, target, tLen, sta
tus); | |
7869 } | |
7870 // For French, we go backwards | |
7871 sIndex = sLen; tIndex = tLen; | |
7872 for(;;) { | |
7873 while(sOrder==0) { | |
7874 if(sIndex==0) { | |
7875 endOfSource = TRUE; | |
7876 break; | |
7877 } | |
7878 sChar=source[--sIndex]; | |
7879 sOrder = elements[sChar]; | |
7880 // don't even look for contractions | |
7881 } | |
7882 | |
7883 while(tOrder==0) { | |
7884 if(tIndex==0) { | |
7885 if(endOfSource) { | |
7886 goto endOfSecLoop; | |
7887 } else { | |
7888 return UCOL_GREATER; | |
7889 } | |
7890 } | |
7891 tChar=target[--tIndex]; | |
7892 tOrder = elements[tChar]; | |
7893 // don't even look for contractions | |
7894 } | |
7895 if(endOfSource) { | |
7896 return UCOL_LESS; | |
7897 } | |
7898 | |
7899 if(sOrder == tOrder) { | |
7900 sOrder = 0; tOrder = 0; | |
7901 continue; | |
7902 } else { | |
7903 // see the primary loop for comments | |
7904 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
7905 if(sOrder < tOrder) { | |
7906 return UCOL_LESS; | |
7907 } else if(sOrder > tOrder) { | |
7908 return UCOL_GREATER; | |
7909 } | |
7910 } | |
7911 sOrder<<=8; | |
7912 tOrder<<=8; | |
7913 } | |
7914 } | |
7915 } | |
7916 } | |
7917 | |
7918 endOfSecLoop: | |
7919 if(strength >= UCOL_TERTIARY) { | |
7920 // tertiary loop is the same as secondary (except no French) | |
7921 elements += coll->latinOneTableLen; | |
7922 sIndex = 0; tIndex = 0; | |
7923 endOfSource = FALSE; | |
7924 for(;;) { | |
7925 while(sOrder==0) { | |
7926 if(sIndex==sLen) { | |
7927 endOfSource = TRUE; | |
7928 break; | |
7929 } | |
7930 sChar=source[sIndex++]; | |
7931 sOrder = elements[sChar]; | |
7932 if(sOrder > UCOL_NOT_FOUND) { | |
7933 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sO
rder, source, &sIndex, sLen); | |
7934 } | |
7935 } | |
7936 while(tOrder==0) { | |
7937 if(tIndex==tLen) { | |
7938 if(endOfSource) { | |
7939 return UCOL_EQUAL; // if both strings are at the end, th
ey are equal | |
7940 } else { | |
7941 return UCOL_GREATER; | |
7942 } | |
7943 } | |
7944 tChar=target[tIndex++]; | |
7945 tOrder = elements[tChar]; | |
7946 if(tOrder > UCOL_NOT_FOUND) { | |
7947 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tO
rder, target, &tIndex, tLen); | |
7948 } | |
7949 } | |
7950 if(endOfSource) { | |
7951 return UCOL_LESS; | |
7952 } | |
7953 if(sOrder == tOrder) { | |
7954 sOrder = 0; tOrder = 0; | |
7955 continue; | |
7956 } else { | |
7957 if(((sOrder^tOrder)&0xff000000)!=0) { | |
7958 if(sOrder < tOrder) { | |
7959 return UCOL_LESS; | |
7960 } else if(sOrder > tOrder) { | |
7961 return UCOL_GREATER; | |
7962 } | |
7963 } | |
7964 sOrder<<=8; | |
7965 tOrder<<=8; | |
7966 } | |
7967 } | |
7968 } | |
/* Reached only when the tertiary level was not requested and every compared level was equal. */
7969 return UCOL_EQUAL; | |
7970 } | |
7971 | |
7972 /* | |
7973 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of | |
7974 null terminated input string takes extra amount of CPU cycles. | |
7975 */ | |
7976 static UCollationResult | |
7977 ucol_strcollRegularUTF8( | |
7978 const UCollator *coll, | |
7979 const char *source, | |
7980 int32_t sourceLength, | |
7981 const char *target, | |
7982 int32_t targetLength, | |
7983 UErrorCode *status) | |
7984 { | |
7985 UCharIterator src; | |
7986 UCharIterator tgt; | |
7987 | |
7988 uiter_setUTF8(&src, source, sourceLength); | |
7989 uiter_setUTF8(&tgt, target, targetLength); | |
7990 | |
7991 // Preparing the context objects for iterating over strings | |
7992 collIterate sColl, tColl; | |
7993 IInit_collIterate(coll, NULL, -1, &sColl, status); | |
7994 IInit_collIterate(coll, NULL, -1, &tColl, status); | |
7995 if(U_FAILURE(*status)) { | |
7996 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
7997 return UCOL_EQUAL; | |
7998 } | |
7999 // The division for the array length may truncate the array size to | |
8000 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
8001 // for all platforms anyway. | |
8002 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8003 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8004 UNormIterator *sNormIter = NULL, *tNormIter = NULL; | |
8005 | |
8006 sColl.iterator = &src; | |
8007 sColl.flags |= UCOL_USE_ITERATOR; | |
8008 tColl.flags |= UCOL_USE_ITERATOR; | |
8009 tColl.iterator = &tgt; | |
8010 | |
8011 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { | |
8012 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu
s); | |
8013 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); | |
8014 sColl.flags &= ~UCOL_ITER_NORM; | |
8015 | |
8016 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu
s); | |
8017 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); | |
8018 tColl.flags &= ~UCOL_ITER_NORM; | |
8019 } | |
8020 | |
8021 return ucol_strcollRegular(&sColl, &tColl, status); | |
8022 } | |
8023 | |
8024 static inline uint32_t | |
8025 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, | |
8026 uint32_t CE, const char *s, int32_t *index, int32_t le
n) | |
8027 { | |
8028 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); | |
8029 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; | |
8030 int32_t offset = 1; | |
8031 UChar32 schar = 0, tchar = 0; | |
8032 | |
8033 for(;;) { | |
8034 if (*index == len) { | |
8035 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
8036 } | |
8037 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); | |
8038 if (len < 0 && schar == 0) { | |
8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
8040 } | |
8041 | |
8042 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contractio
n codepoints should be ordered, we skip all that are smaller */ | |
8043 offset++; | |
8044 } | |
8045 | |
8046 if (schar == tchar) { | |
8047 U8_FWD_1(s, *index, len); | |
8048 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set+offset]); | |
8049 } | |
8050 else | |
8051 { | |
8052 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { | |
8053 return UCOL_BAIL_OUT_CE; | |
8054 } | |
8055 // skip completely ignorables | |
8056 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); | |
8057 if(isZeroCE == 0) { // we have to ignore completely ignorables | |
8058 U8_FWD_1(s, *index, len); | |
8059 continue; | |
8060 } | |
8061 | |
8062 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOff
set]); | |
8063 } | |
8064 } | |
8065 } | |
8066 | |
8067 static inline UCollationResult | |
8068 ucol_strcollUseLatin1UTF8( | |
8069 const UCollator *coll, | |
8070 const char *source, | |
8071 int32_t sLen, | |
8072 const char *target, | |
8073 int32_t tLen, | |
8074 UErrorCode *status) | |
8075 { | |
8076 U_ALIGN_CODE(16); | |
8077 int32_t strength = coll->strength; | |
8078 | |
8079 int32_t sIndex = 0, tIndex = 0; | |
8080 UChar32 sChar = 0, tChar = 0; | |
8081 uint32_t sOrder=0, tOrder=0; | |
8082 | |
8083 UBool endOfSource = FALSE; | |
8084 | |
8085 uint32_t *elements = coll->latinOneCEs; | |
8086 | |
8087 UBool haveContractions = FALSE; // if we have contractions in our string | |
8088 // we cannot do French secondary | |
8089 | |
8090 // Do the primary level | |
8091 for(;;) { | |
8092 while(sOrder==0) { // this loop skips primary ignorables | |
8093 // sOrder=getNextlatinOneCE(source); | |
8094 if (sIndex == sLen) { | |
8095 endOfSource = TRUE; | |
8096 break; | |
8097 } | |
8098 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); | |
8099 if (sLen < 0 && sChar == 0) { | |
8100 endOfSource = TRUE; | |
8101 sLen = sIndex; | |
8102 break; | |
8103 } | |
8104 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (
sChar > 0xFF, but this is faster on win32) | |
8105 //fprintf(stderr, "R"); | |
8106 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
8107 } | |
8108 sOrder = elements[sChar]; | |
8109 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special | |
8110 // specials can basically be either contractions or bail-out sig
ns. If we get anything | |
8111 // else, we'll bail out anywasy | |
8112 if(getCETag(sOrder) == CONTRACTION_TAG) { | |
8113 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY,
sOrder, source, &sIndex, sLen); | |
8114 haveContractions = TRUE; // if there are contractions, we ca
nnot do French secondary | |
8115 // However, if there are contractions in the table, but we a
lways use just one char, | |
8116 // we might be able to do French. This should be checked out
. | |
8117 } | |
8118 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
8119 //fprintf(stderr, "S"); | |
8120 return ucol_strcollRegularUTF8(coll, source, sLen, target, t
Len, status); | |
8121 } | |
8122 } | |
8123 } | |
8124 | |
8125 while(tOrder==0) { // this loop skips primary ignorables | |
8126 // tOrder=getNextlatinOneCE(target); | |
8127 if (tIndex == tLen) { | |
8128 if(endOfSource) { | |
8129 goto endOfPrimLoopU8; | |
8130 } else { | |
8131 return UCOL_GREATER; | |
8132 } | |
8133 } | |
8134 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
8135 if (tLen < 0 && tChar == 0) { | |
8136 if(endOfSource) { | |
8137 tLen = tIndex; | |
8138 goto endOfPrimLoopU8; | |
8139 } else { | |
8140 return UCOL_GREATER; | |
8141 } | |
8142 } | |
8143 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (
sChar > 0xFF, but this is faster on win32) | |
8144 //fprintf(stderr, "R"); | |
8145 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
8146 } | |
8147 tOrder = elements[tChar]; | |
8148 if(tOrder >= UCOL_NOT_FOUND) { | |
8149 // Handling specials, see the comments for source | |
8150 if(getCETag(tOrder) == CONTRACTION_TAG) { | |
8151 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY,
tOrder, target, &tIndex, tLen); | |
8152 haveContractions = TRUE; | |
8153 } | |
8154 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { | |
8155 //fprintf(stderr, "S"); | |
8156 return ucol_strcollRegularUTF8(coll, source, sLen, target, t
Len, status); | |
8157 } | |
8158 } | |
8159 } | |
8160 if(endOfSource) { // source is finished, but target is not, say the resu
lt. | |
8161 return UCOL_LESS; | |
8162 } | |
8163 | |
8164 if(sOrder == tOrder) { // if we have same CEs, we continue the loop | |
8165 sOrder = 0; tOrder = 0; | |
8166 continue; | |
8167 } else { | |
8168 // compare current top bytes | |
8169 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8170 // top bytes differ, return difference | |
8171 if(sOrder < tOrder) { | |
8172 return UCOL_LESS; | |
8173 } else if(sOrder > tOrder) { | |
8174 return UCOL_GREATER; | |
8175 } | |
8176 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24)
; | |
8177 // since we must return enum value | |
8178 } | |
8179 | |
8180 // top bytes match, continue with following bytes | |
8181 sOrder<<=8; | |
8182 tOrder<<=8; | |
8183 } | |
8184 } | |
8185 | |
8186 endOfPrimLoopU8: | |
8187 // after primary loop, we definitely know the sizes of strings, | |
8188 // so we set it and use simpler loop for secondaries and tertiaries | |
8189 sLen = sIndex; tLen = tIndex; | |
8190 if(strength >= UCOL_SECONDARY) { | |
8191 // adjust the table beggining | |
8192 elements += coll->latinOneTableLen; | |
8193 endOfSource = FALSE; | |
8194 | |
8195 if(coll->frenchCollation == UCOL_OFF) { // non French | |
8196 // This loop is a simplified copy of primary loop | |
8197 // at this point we know that whole strings are latin-1, so we don't | |
8198 // check for that. We also know that we only have contractions as | |
8199 // specials. | |
8200 sIndex = 0; tIndex = 0; | |
8201 for(;;) { | |
8202 while(sOrder==0) { | |
8203 if(sIndex==sLen) { | |
8204 endOfSource = TRUE; | |
8205 break; | |
8206 } | |
8207 U_ASSERT(sLen >= 0); | |
8208 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); | |
8209 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
8210 sOrder = elements[sChar]; | |
8211 if(sOrder > UCOL_NOT_FOUND) { | |
8212 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO
NDARY, sOrder, source, &sIndex, sLen); | |
8213 } | |
8214 } | |
8215 | |
8216 while(tOrder==0) { | |
8217 if(tIndex==tLen) { | |
8218 if(endOfSource) { | |
8219 goto endOfSecLoopU8; | |
8220 } else { | |
8221 return UCOL_GREATER; | |
8222 } | |
8223 } | |
8224 U_ASSERT(tLen >= 0); | |
8225 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
8226 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
8227 tOrder = elements[tChar]; | |
8228 if(tOrder > UCOL_NOT_FOUND) { | |
8229 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECO
NDARY, tOrder, target, &tIndex, tLen); | |
8230 } | |
8231 } | |
8232 if(endOfSource) { | |
8233 return UCOL_LESS; | |
8234 } | |
8235 | |
8236 if(sOrder == tOrder) { | |
8237 sOrder = 0; tOrder = 0; | |
8238 continue; | |
8239 } else { | |
8240 // see primary loop for comments on this | |
8241 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8242 if(sOrder < tOrder) { | |
8243 return UCOL_LESS; | |
8244 } else if(sOrder > tOrder) { | |
8245 return UCOL_GREATER; | |
8246 } | |
8247 } | |
8248 sOrder<<=8; | |
8249 tOrder<<=8; | |
8250 } | |
8251 } | |
8252 } else { // French | |
8253 if(haveContractions) { // if we have contractions, we have to bail o
ut | |
8254 // since we don't really know how to handle them here | |
8255 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen,
status); | |
8256 } | |
8257 // For French, we go backwards | |
8258 sIndex = sLen; tIndex = tLen; | |
8259 for(;;) { | |
8260 while(sOrder==0) { | |
8261 if(sIndex==0) { | |
8262 endOfSource = TRUE; | |
8263 break; | |
8264 } | |
8265 U8_PREV_OR_FFFD(source, 0, sIndex, sChar); | |
8266 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
8267 sOrder = elements[sChar]; | |
8268 // don't even look for contractions | |
8269 } | |
8270 | |
8271 while(tOrder==0) { | |
8272 if(tIndex==0) { | |
8273 if(endOfSource) { | |
8274 goto endOfSecLoopU8; | |
8275 } else { | |
8276 return UCOL_GREATER; | |
8277 } | |
8278 } | |
8279 U8_PREV_OR_FFFD(target, 0, tIndex, tChar); | |
8280 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
8281 tOrder = elements[tChar]; | |
8282 // don't even look for contractions | |
8283 } | |
8284 if(endOfSource) { | |
8285 return UCOL_LESS; | |
8286 } | |
8287 | |
8288 if(sOrder == tOrder) { | |
8289 sOrder = 0; tOrder = 0; | |
8290 continue; | |
8291 } else { | |
8292 // see the primary loop for comments | |
8293 if(((sOrder^tOrder)&0xFF000000)!=0) { | |
8294 if(sOrder < tOrder) { | |
8295 return UCOL_LESS; | |
8296 } else if(sOrder > tOrder) { | |
8297 return UCOL_GREATER; | |
8298 } | |
8299 } | |
8300 sOrder<<=8; | |
8301 tOrder<<=8; | |
8302 } | |
8303 } | |
8304 } | |
8305 } | |
8306 | |
8307 endOfSecLoopU8: | |
8308 if(strength >= UCOL_TERTIARY) { | |
8309 // tertiary loop is the same as secondary (except no French) | |
8310 elements += coll->latinOneTableLen; | |
8311 sIndex = 0; tIndex = 0; | |
8312 endOfSource = FALSE; | |
8313 for(;;) { | |
8314 while(sOrder==0) { | |
8315 if(sIndex==sLen) { | |
8316 endOfSource = TRUE; | |
8317 break; | |
8318 } | |
8319 U_ASSERT(sLen >= 0); | |
8320 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); | |
8321 U_ASSERT(sChar >= 0 && sChar <= 0xFF); | |
8322 sOrder = elements[sChar]; | |
8323 if(sOrder > UCOL_NOT_FOUND) { | |
8324 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY
, sOrder, source, &sIndex, sLen); | |
8325 } | |
8326 } | |
8327 while(tOrder==0) { | |
8328 if(tIndex==tLen) { | |
8329 if(endOfSource) { | |
8330 return UCOL_EQUAL; // if both strings are at the end, th
ey are equal | |
8331 } else { | |
8332 return UCOL_GREATER; | |
8333 } | |
8334 } | |
8335 U_ASSERT(tLen >= 0); | |
8336 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); | |
8337 U_ASSERT(tChar >= 0 && tChar <= 0xFF); | |
8338 tOrder = elements[tChar]; | |
8339 if(tOrder > UCOL_NOT_FOUND) { | |
8340 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY
, tOrder, target, &tIndex, tLen); | |
8341 } | |
8342 } | |
8343 if(endOfSource) { | |
8344 return UCOL_LESS; | |
8345 } | |
8346 if(sOrder == tOrder) { | |
8347 sOrder = 0; tOrder = 0; | |
8348 continue; | |
8349 } else { | |
8350 if(((sOrder^tOrder)&0xff000000)!=0) { | |
8351 if(sOrder < tOrder) { | |
8352 return UCOL_LESS; | |
8353 } else if(sOrder > tOrder) { | |
8354 return UCOL_GREATER; | |
8355 } | |
8356 } | |
8357 sOrder<<=8; | |
8358 tOrder<<=8; | |
8359 } | |
8360 } | |
8361 } | |
8362 return UCOL_EQUAL; | |
8363 } | 406 } |
8364 | 407 |
8365 U_CAPI UCollationResult U_EXPORT2 | 408 U_CAPI UCollationResult U_EXPORT2 |
8366 ucol_strcollIter( const UCollator *coll, | 409 ucol_strcollIter( const UCollator *coll, |
8367 UCharIterator *sIter, | 410 UCharIterator *sIter, |
8368 UCharIterator *tIter, | 411 UCharIterator *tIter, |
8369 UErrorCode *status) | 412 UErrorCode *status) |
8370 { | 413 { |
8371 if(!status || U_FAILURE(*status)) { | 414 if(!status || U_FAILURE(*status)) { |
8372 return UCOL_EQUAL; | 415 return UCOL_EQUAL; |
8373 } | 416 } |
8374 | 417 |
8375 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); | 418 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); |
8376 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt
er); | 419 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIt
er); |
8377 | 420 |
8378 if (sIter == tIter) { | |
8379 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
8380 return UCOL_EQUAL; | |
8381 } | |
8382 if(sIter == NULL || tIter == NULL || coll == NULL) { | 421 if(sIter == NULL || tIter == NULL || coll == NULL) { |
8383 *status = U_ILLEGAL_ARGUMENT_ERROR; | 422 *status = U_ILLEGAL_ARGUMENT_ERROR; |
8384 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | 423 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
8385 return UCOL_EQUAL; | 424 return UCOL_EQUAL; |
8386 } | 425 } |
8387 | 426 |
8388 UCollationResult result = UCOL_EQUAL; | 427 UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tI
ter, *status); |
8389 | 428 |
8390 // Preparing the context objects for iterating over strings | 429 UTRACE_EXIT_VALUE_STATUS(result, *status); |
8391 collIterate sColl, tColl; | |
8392 IInit_collIterate(coll, NULL, -1, &sColl, status); | |
8393 IInit_collIterate(coll, NULL, -1, &tColl, status); | |
8394 if(U_FAILURE(*status)) { | |
8395 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) | |
8396 return UCOL_EQUAL; | |
8397 } | |
8398 // The division for the array length may truncate the array size to | |
8399 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high | |
8400 // for all platforms anyway. | |
8401 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8402 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; | |
8403 UNormIterator *sNormIter = NULL, *tNormIter = NULL; | |
8404 | |
8405 sColl.iterator = sIter; | |
8406 sColl.flags |= UCOL_USE_ITERATOR; | |
8407 tColl.flags |= UCOL_USE_ITERATOR; | |
8408 tColl.iterator = tIter; | |
8409 | |
8410 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { | |
8411 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), statu
s); | |
8412 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); | |
8413 sColl.flags &= ~UCOL_ITER_NORM; | |
8414 | |
8415 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), statu
s); | |
8416 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); | |
8417 tColl.flags &= ~UCOL_ITER_NORM; | |
8418 } | |
8419 | |
8420 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; | |
8421 | |
8422 while((sChar = sColl.iterator->next(sColl.iterator)) == | |
8423 (tChar = tColl.iterator->next(tColl.iterator))) { | |
8424 if(sChar == U_SENTINEL) { | |
8425 result = UCOL_EQUAL; | |
8426 goto end_compare; | |
8427 } | |
8428 } | |
8429 | |
8430 if(sChar == U_SENTINEL) { | |
8431 tChar = tColl.iterator->previous(tColl.iterator); | |
8432 } | |
8433 | |
8434 if(tChar == U_SENTINEL) { | |
8435 sChar = sColl.iterator->previous(sColl.iterator); | |
8436 } | |
8437 | |
8438 sChar = sColl.iterator->previous(sColl.iterator); | |
8439 tChar = tColl.iterator->previous(tColl.iterator); | |
8440 | |
8441 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) | |
8442 { | |
8443 // We are stopped in the middle of a contraction. | |
8444 // Scan backwards through the == part of the string looking for the star
t of the contraction. | |
8445 // It doesn't matter which string we scan, since they are the same in
this region. | |
8446 do | |
8447 { | |
8448 sChar = sColl.iterator->previous(sColl.iterator); | |
8449 tChar = tColl.iterator->previous(tColl.iterator); | |
8450 } | |
8451 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); | |
8452 } | |
8453 | |
8454 | |
8455 if(U_SUCCESS(*status)) { | |
8456 result = ucol_strcollRegular(&sColl, &tColl, status); | |
8457 } | |
8458 | |
8459 end_compare: | |
8460 if(sNormIter || tNormIter) { | |
8461 unorm_closeIter(sNormIter); | |
8462 unorm_closeIter(tNormIter); | |
8463 } | |
8464 | |
8465 UTRACE_EXIT_VALUE_STATUS(result, *status) | |
8466 return result; | 430 return result; |
8467 } | 431 } |
8468 | 432 |
8469 | 433 |
8470 /* */ | 434 /* */ |
8471 /* ucol_strcoll Main public API string comparison function */ | 435 /* ucol_strcoll Main public API string comparison function */ |
8472 /* */ | 436 /* */ |
8473 U_CAPI UCollationResult U_EXPORT2 | 437 U_CAPI UCollationResult U_EXPORT2 |
8474 ucol_strcoll( const UCollator *coll, | 438 ucol_strcoll( const UCollator *coll, |
8475 const UChar *source, | 439 const UChar *source, |
8476 int32_t sourceLength, | 440 int32_t sourceLength, |
8477 const UChar *target, | 441 const UChar *target, |
8478 int32_t targetLength) | 442 int32_t targetLength) |
8479 { | 443 { |
8480 U_ALIGN_CODE(16); | 444 U_ALIGN_CODE(16); |
8481 | 445 |
8482 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); | 446 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); |
8483 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 447 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
8484 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); | 448 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); |
8485 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt
h); | 449 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLengt
h); |
8486 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt
h); | 450 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLengt
h); |
8487 } | 451 } |
8488 | 452 |
8489 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength
!= 0)) { | |
8490 // do not crash, but return. Should have | |
8491 // status argument to return error. | |
8492 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
8493 return UCOL_EQUAL; | |
8494 } | |
8495 | |
8496 /* Quick check if source and target are same strings. */ | |
8497 /* They should either both be NULL terminated or the explicit length should
be set on both. */ | |
8498 if (source==target && sourceLength==targetLength) { | |
8499 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
8500 return UCOL_EQUAL; | |
8501 } | |
8502 | |
8503 if(coll->delegate != NULL) { | |
8504 UErrorCode status = U_ZERO_ERROR; | |
8505 return ((const Collator*)coll->delegate)->compare(source,sourceLength,targ
et,targetLength, status); | |
8506 } | |
8507 | |
8508 /* Scan the strings. Find:
*/ | |
8509 /* The length of any leading portion that is equal
*/ | |
8510 /* Whether they are exactly equal. (in which case we just return)
*/ | |
8511 const UChar *pSrc = source; | |
8512 const UChar *pTarg = target; | |
8513 int32_t equalLength; | |
8514 | |
8515 if (sourceLength == -1 && targetLength == -1) { | |
8516 // Both strings are null terminated. | |
8517 // Scan through any leading equal portion. | |
8518 while (*pSrc == *pTarg && *pSrc != 0) { | |
8519 pSrc++; | |
8520 pTarg++; | |
8521 } | |
8522 if (*pSrc == 0 && *pTarg == 0) { | |
8523 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
8524 return UCOL_EQUAL; | |
8525 } | |
8526 equalLength = (int32_t)(pSrc - source); | |
8527 } | |
8528 else | |
8529 { | |
8530 // One or both strings has an explicit length. | |
8531 const UChar *pSrcEnd = source + sourceLength; | |
8532 const UChar *pTargEnd = target + targetLength; | |
8533 | |
8534 // Scan while the strings are bitwise ==, or until one is exhausted. | |
8535 for (;;) { | |
8536 if (pSrc == pSrcEnd || pTarg == pTargEnd) { | |
8537 break; | |
8538 } | |
8539 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng
th == -1)) { | |
8540 break; | |
8541 } | |
8542 if (*pSrc != *pTarg) { | |
8543 break; | |
8544 } | |
8545 pSrc++; | |
8546 pTarg++; | |
8547 } | |
8548 equalLength = (int32_t)(pSrc - source); | |
8549 | |
8550 // If we made it all the way through both strings, we are done. They ar
e == | |
8551 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of
src string, however it was specified. */ | |
8552 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also
at end of dest string */ | |
8553 { | |
8554 UTRACE_EXIT_VALUE(UCOL_EQUAL); | |
8555 return UCOL_EQUAL; | |
8556 } | |
8557 } | |
8558 if (equalLength > 0) { | |
8559 /* There is an identical portion at the beginning of the two strings.
*/ | |
8560 /* If the identical portion ends within a contraction or a comibining
*/ | |
8561 /* character sequence, back up to the start of that sequence.
*/ | |
8562 | |
8563 // These values should already be set by the code above. | |
8564 //pSrc = source + equalLength; /* point to the first differing c
hars */ | |
8565 //pTarg = target + equalLength; | |
8566 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || | |
8567 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) | |
8568 { | |
8569 // We are stopped in the middle of a contraction. | |
8570 // Scan backwards through the == part of the string looking for the
start of the contraction. | |
8571 // It doesn't matter which string we scan, since they are the same
in this region. | |
8572 do | |
8573 { | |
8574 equalLength--; | |
8575 pSrc--; | |
8576 } | |
8577 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); | |
8578 } | |
8579 | |
8580 source += equalLength; | |
8581 target += equalLength; | |
8582 if (sourceLength > 0) { | |
8583 sourceLength -= equalLength; | |
8584 } | |
8585 if (targetLength > 0) { | |
8586 targetLength -= equalLength; | |
8587 } | |
8588 } | |
8589 | |
8590 UErrorCode status = U_ZERO_ERROR; | 453 UErrorCode status = U_ZERO_ERROR; |
8591 UCollationResult returnVal; | 454 UCollationResult returnVal = Collator::fromUCollator(coll)-> |
8592 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLeng
th > 0 && *target&0xff00)) { | 455 compare(source, sourceLength, target, targetLength, status); |
8593 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targ
etLength, &status); | 456 UTRACE_EXIT_VALUE_STATUS(returnVal, status); |
8594 } else { | |
8595 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, ta
rgetLength, &status); | |
8596 } | |
8597 UTRACE_EXIT_VALUE(returnVal); | |
8598 return returnVal; | 457 return returnVal; |
8599 } | 458 } |
8600 | 459 |
8601 U_CAPI UCollationResult U_EXPORT2 | 460 U_CAPI UCollationResult U_EXPORT2 |
8602 ucol_strcollUTF8( | 461 ucol_strcollUTF8( |
8603 const UCollator *coll, | 462 const UCollator *coll, |
8604 const char *source, | 463 const char *source, |
8605 int32_t sourceLength, | 464 int32_t sourceLength, |
8606 const char *target, | 465 const char *target, |
8607 int32_t targetLength, | 466 int32_t targetLength, |
8608 UErrorCode *status) | 467 UErrorCode *status) |
8609 { | 468 { |
8610 U_ALIGN_CODE(16); | 469 U_ALIGN_CODE(16); |
8611 | 470 |
8612 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); | 471 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); |
8613 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { | 472 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
8614 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); | 473 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, sour
ce, target); |
8615 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt
h); | 474 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLengt
h); |
8616 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt
h); | 475 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLengt
h); |
8617 } | 476 } |
8618 | 477 |
8619 if (U_FAILURE(*status)) { | 478 if (U_FAILURE(*status)) { |
8620 /* do nothing */ | 479 /* do nothing */ |
8621 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | 480 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
8622 return UCOL_EQUAL; | 481 return UCOL_EQUAL; |
8623 } | 482 } |
8624 | 483 |
8625 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength
!= 0)) { | 484 UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareU
TF8( |
8626 *status = U_ILLEGAL_ARGUMENT_ERROR; | 485 source, sourceLength, target, targetLength, *status); |
8627 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
8628 return UCOL_EQUAL; | |
8629 } | |
8630 | |
8631 /* Quick check if source and target are same strings. */ | |
8632 /* They should either both be NULL terminated or the explicit length should
be set on both. */ | |
8633 if (source==target && sourceLength==targetLength) { | |
8634 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
8635 return UCOL_EQUAL; | |
8636 } | |
8637 | |
8638 if(coll->delegate != NULL) { | |
8639 return ((const Collator*)coll->delegate)->compareUTF8( | |
8640 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourc
eLength), | |
8641 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targe
tLength), | |
8642 *status); | |
8643 } | |
8644 | |
8645 /* Scan the strings. Find:
*/ | |
8646 /* The length of any leading portion that is equal
*/ | |
8647 /* Whether they are exactly equal. (in which case we just return)
*/ | |
8648 const char *pSrc = source; | |
8649 const char *pTarg = target; | |
8650 UBool bSrcLimit = FALSE; | |
8651 UBool bTargLimit = FALSE; | |
8652 | |
8653 if (sourceLength == -1 && targetLength == -1) { | |
8654 // Both strings are null terminated. | |
8655 // Scan through any leading equal portion. | |
8656 while (*pSrc == *pTarg && *pSrc != 0) { | |
8657 pSrc++; | |
8658 pTarg++; | |
8659 } | |
8660 if (*pSrc == 0 && *pTarg == 0) { | |
8661 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
8662 return UCOL_EQUAL; | |
8663 } | |
8664 bSrcLimit = (*pSrc == 0); | |
8665 bTargLimit = (*pTarg == 0); | |
8666 } | |
8667 else | |
8668 { | |
8669 // One or both strings has an explicit length. | |
8670 const char *pSrcEnd = source + sourceLength; | |
8671 const char *pTargEnd = target + targetLength; | |
8672 | |
8673 // Scan while the strings are bitwise ==, or until one is exhausted. | |
8674 for (;;) { | |
8675 if (pSrc == pSrcEnd || pTarg == pTargEnd) { | |
8676 break; | |
8677 } | |
8678 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLeng
th == -1)) { | |
8679 break; | |
8680 } | |
8681 if (*pSrc != *pTarg) { | |
8682 break; | |
8683 } | |
8684 pSrc++; | |
8685 pTarg++; | |
8686 } | |
8687 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); | |
8688 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); | |
8689 | |
8690 // If we made it all the way through both strings, we are done. They ar
e == | |
8691 if (bSrcLimit && /* At end of src string, however it was specified. *
/ | |
8692 bTargLimit) /* and also at end of dest string *
/ | |
8693 { | |
8694 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); | |
8695 return UCOL_EQUAL; | |
8696 } | |
8697 } | |
8698 | |
8699 U_ASSERT(!(bSrcLimit && bTargLimit)); | |
8700 | |
8701 int32_t equalLength = pSrc - source; | |
8702 UBool bSawNonLatin1 = FALSE; | |
8703 | |
8704 if (equalLength > 0) { | |
8705 // Align position to the start of UTF-8 code point. | |
8706 if (bTargLimit) { | |
8707 U8_SET_CP_START((const uint8_t*)source, 0, equalLength); | |
8708 } else { | |
8709 U8_SET_CP_START((const uint8_t*)target, 0, equalLength); | |
8710 } | |
8711 pSrc = source + equalLength; | |
8712 pTarg = target + equalLength; | |
8713 } | |
8714 | |
8715 if (equalLength > 0) { | |
8716 /* There is an identical portion at the beginning of the two strings.
*/ | |
8717 /* If the identical portion ends within a contraction or a comibining
*/ | |
8718 /* character sequence, back up to the start of that sequence.
*/ | |
8719 UBool bUnsafeCP = FALSE; | |
8720 UChar32 uc32 = -1; | |
8721 | |
8722 if (!bSrcLimit) { | |
8723 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength,
uc32); | |
8724 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { | |
8725 bUnsafeCP = TRUE; | |
8726 } | |
8727 bSawNonLatin1 |= (uc32 > 0xff); | |
8728 } | |
8729 if (!bTargLimit) { | |
8730 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength,
uc32); | |
8731 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { | |
8732 bUnsafeCP = TRUE; | |
8733 } | |
8734 bSawNonLatin1 |= (uc32 > 0xff); | |
8735 } | |
8736 | |
8737 if (bUnsafeCP) { | |
8738 while (equalLength > 0) { | |
8739 // We are stopped in the middle of a contraction. | |
8740 // Scan backwards through the == part of the string looking for
the start of the contraction. | |
8741 // It doesn't matter which string we scan, since they are the
same in this region. | |
8742 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); | |
8743 bSawNonLatin1 |= (uc32 > 0xff); | |
8744 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { | |
8745 break; | |
8746 } | |
8747 } | |
8748 } | |
8749 source += equalLength; | |
8750 target += equalLength; | |
8751 if (sourceLength > 0) { | |
8752 sourceLength -= equalLength; | |
8753 } | |
8754 if (targetLength > 0) { | |
8755 targetLength -= equalLength; | |
8756 } | |
8757 } else { | |
8758 // Lead byte of Latin 1 character is 0x00 - 0xC3 | |
8759 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc
3); | |
8760 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0x
c3); | |
8761 } | |
8762 | |
8763 UCollationResult returnVal; | |
8764 | |
8765 if(!coll->latinOneUse || bSawNonLatin1) { | |
8766 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target,
targetLength, status); | |
8767 } else { | |
8768 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target
, targetLength, status); | |
8769 } | |
8770 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); | 486 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); |
8771 return returnVal; | 487 return returnVal; |
8772 } | 488 } |
8773 | 489 |
8774 | 490 |
8775 /* convenience function for comparing strings */ | 491 /* convenience function for comparing strings */ |
8776 U_CAPI UBool U_EXPORT2 | 492 U_CAPI UBool U_EXPORT2 |
8777 ucol_greater( const UCollator *coll, | 493 ucol_greater( const UCollator *coll, |
8778 const UChar *source, | 494 const UChar *source, |
8779 int32_t sourceLength, | 495 int32_t sourceLength, |
(...skipping 23 matching lines...) Expand all Loading... |
8803 int32_t sourceLength, | 519 int32_t sourceLength, |
8804 const UChar *target, | 520 const UChar *target, |
8805 int32_t targetLength) | 521 int32_t targetLength) |
8806 { | 522 { |
8807 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) | 523 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
8808 == UCOL_EQUAL); | 524 == UCOL_EQUAL); |
8809 } | 525 } |
8810 | 526 |
8811 U_CAPI void U_EXPORT2 | 527 U_CAPI void U_EXPORT2 |
8812 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { | 528 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { |
8813 if(coll && coll->UCA) { | 529 const Collator *c = Collator::fromUCollator(coll); |
8814 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); | 530 if(c != NULL) { |
| 531 UVersionInfo v; |
| 532 c->getVersion(v); |
| 533 // Note: This is tied to how the current implementation encodes the UCA
version |
| 534 // in the overall getVersion(). |
| 535 // Alternatively, we could load the root collator and get at lower-level
data from there. |
| 536 // Either way, it will reflect the input collator's UCA version only |
| 537 // if it is a known implementation. |
| 538 // It would be cleaner to make this a virtual Collator method. |
| 539 info[0] = v[1] >> 3; |
| 540 info[1] = v[1] & 7; |
| 541 info[2] = v[2] >> 6; |
| 542 info[3] = 0; |
8815 } | 543 } |
8816 } | 544 } |
8817 | 545 |
| 546 U_CAPI const UChar * U_EXPORT2 |
| 547 ucol_getRules(const UCollator *coll, int32_t *length) { |
| 548 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 549 // OK to crash if coll==NULL: We do not want to check "this" pointers. |
| 550 if(rbc != NULL || coll == NULL) { |
| 551 const UnicodeString &rules = rbc->getRules(); |
| 552 U_ASSERT(rules.getBuffer()[rules.length()] == 0); |
| 553 *length = rules.length(); |
| 554 return rules.getBuffer(); |
| 555 } |
| 556 static const UChar _NUL = 0; |
| 557 *length = 0; |
| 558 return &_NUL; |
| 559 } |
| 560 |
| 561 U_CAPI int32_t U_EXPORT2 |
| 562 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int3
2_t bufferLen) { |
| 563 UnicodeString rules; |
| 564 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 565 if(rbc != NULL || coll == NULL) { |
| 566 rbc->getRules(delta, rules); |
| 567 } |
| 568 if(buffer != NULL && bufferLen > 0) { |
| 569 UErrorCode errorCode = U_ZERO_ERROR; |
| 570 return rules.extract(buffer, bufferLen, errorCode); |
| 571 } else { |
| 572 return rules.length(); |
| 573 } |
| 574 } |
| 575 |
| 576 U_CAPI const char * U_EXPORT2 |
| 577 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *statu
s) { |
| 578 return ucol_getLocaleByType(coll, type, status); |
| 579 } |
| 580 |
| 581 U_CAPI const char * U_EXPORT2 |
| 582 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode
*status) { |
| 583 if(U_FAILURE(*status)) { |
| 584 return NULL; |
| 585 } |
| 586 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE); |
| 587 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll); |
| 588 |
| 589 const char *result; |
| 590 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); |
| 591 if(rbc == NULL && coll != NULL) { |
| 592 *status = U_UNSUPPORTED_ERROR; |
| 593 result = NULL; |
| 594 } else { |
| 595 result = rbc->internalGetLocaleID(type, *status); |
| 596 } |
| 597 |
| 598 UTRACE_DATA1(UTRACE_INFO, "result = %s", result); |
| 599 UTRACE_EXIT_STATUS(*status); |
| 600 return result; |
| 601 } |
| 602 |
| 603 U_CAPI USet * U_EXPORT2 |
| 604 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) { |
| 605 if(U_FAILURE(*status)) { |
| 606 return NULL; |
| 607 } |
| 608 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status); |
| 609 if(U_FAILURE(*status)) { |
| 610 delete set; |
| 611 return NULL; |
| 612 } |
| 613 return set->toUSet(); |
| 614 } |
| 615 |
| 616 U_CAPI UBool U_EXPORT2 |
| 617 ucol_equals(const UCollator *source, const UCollator *target) { |
| 618 return source == target || |
| 619 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target))
; |
| 620 } |
| 621 |
8818 #endif /* #if !UCONFIG_NO_COLLATION */ | 622 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |