| OLD | NEW |
| (Empty) |
| 1 Index: source/test/cintltst/usrchtst.c | |
| 2 =================================================================== | |
| 3 --- source/test/cintltst/usrchtst.c (revision 75773) | |
| 4 +++ source/test/cintltst/usrchtst.c (working copy) | |
| 5 @@ -1,5 +1,5 @@ | |
| 6 /******************************************************************** | |
| 7 - * Copyright (c) 2001-2010 International Business Machines | |
| 8 + * Copyright (c) 2001-2011 International Business Machines | |
| 9 * Corporation and others. All Rights Reserved. | |
| 10 ******************************************************************** | |
| 11 * File usrchtst.c | |
| 12 @@ -2553,7 +2553,173 @@ | |
| 13 ucol_close(coll); | |
| 14 } | |
| 15 | |
| 16 +/** | |
| 17 +* TestUsingSearchCollator checks usearch_next/usearch_previous match offsets | |
| 18 +*/ | |
| 19 | |
| 20 +#define ARRAY_LENGTH(array) (sizeof(array)/sizeof(array[0])) | |
| 21 + | |
| 22 +typedef struct { | |
| 23 + const UChar * pattern; | |
| 24 + const int32_t * offsets; | |
| 25 + int32_t offsetsLen; | |
| 26 +} PatternAndOffsets; | |
| 27 + | |
| 28 +static const UChar scKoText[] = { | |
| 29 + 0x0020, | |
| 30 +/*01*/ 0xAC00, 0x0020, /* simple LV Hangul */ | |
| 31 +/*03*/ 0xAC01, 0x0020, /* simple LVT Hangul */ | |
| 32 +/*05*/ 0xAC0F, 0x0020, /* LVTT, last jamo expands for s
earch */ | |
| 33 +/*07*/ 0xAFFF, 0x0020, /* LLVVVTT, every jamo expands f
or search */ | |
| 34 +/*09*/ 0x1100, 0x1161, 0x11A8, 0x0020, /* 0xAC01 as conjoining jamo */ | |
| 35 +/*13*/ 0x1100, 0x1161, 0x1100, 0x0020, /* 0xAC01 as basic conjoining ja
mo (per search rules) */ | |
| 36 +/*17*/ 0x3131, 0x314F, 0x3131, 0x0020, /* 0xAC01 as compatibility jamo
*/ | |
| 37 +/*21*/ 0x1100, 0x1161, 0x11B6, 0x0020, /* 0xAC0F as conjoining jamo; la
st expands for search */ | |
| 38 +/*25*/ 0x1100, 0x1161, 0x1105, 0x1112, 0x0020, /* 0xAC0F as basic conjoining ja
mo; last expands for search */ | |
| 39 +/*30*/ 0x1101, 0x1170, 0x11B6, 0x0020, /* 0xAFFF as conjoining jamo; al
l expand for search */ | |
| 40 +/*34*/ 0x00E6, 0x0020, /* small letter ae, expands */ | |
| 41 +/*36*/ 0x1E4D, 0x0020, /* small letter o with tilde and
acute, decomposes */ | |
| 42 + 0 | |
| 43 +}; | |
| 44 + | |
| 45 +static const UChar scKoPat0[] = { 0xAC01, 0 }; | |
| 46 +static const UChar scKoPat1[] = { 0x1100, 0x1161, 0x11A8, 0 }; /* 0xAC01 as con
joining jamo */ | |
| 47 +static const UChar scKoPat2[] = { 0xAC0F, 0 }; | |
| 48 +static const UChar scKoPat3[] = { 0x1100, 0x1161, 0x1105, 0x1112, 0 }; /* 0xAC0
F as basic conjoining jamo */ | |
| 49 +static const UChar scKoPat4[] = { 0xAFFF, 0 }; | |
| 50 +static const UChar scKoPat5[] = { 0x1101, 0x1170, 0x11B6, 0 }; /* 0xAFFF as con
joining jamo */ | |
| 51 + | |
| 52 +static const int32_t scKoSrchOff01[] = { 3, 9, 13 }; | |
| 53 +static const int32_t scKoSrchOff23[] = { 5, 21, 25 }; | |
| 54 +static const int32_t scKoSrchOff45[] = { 7, 30 }; | |
| 55 + | |
| 56 +static const PatternAndOffsets scKoSrchPatternsOffsets[] = { | |
| 57 + { scKoPat0, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) }, | |
| 58 + { scKoPat1, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) }, | |
| 59 + { scKoPat2, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) }, | |
| 60 + { scKoPat3, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) }, | |
| 61 + { scKoPat4, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) }, | |
| 62 + { scKoPat5, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) }, | |
| 63 + { NULL, NULL, 0 } | |
| 64 +}; | |
| 65 + | |
| 66 +static const int32_t scKoStndOff01[] = { 3, 9 }; | |
| 67 +static const int32_t scKoStndOff2[] = { 5, 21 }; | |
| 68 +static const int32_t scKoStndOff3[] = { 25 }; | |
| 69 +static const int32_t scKoStndOff45[] = { 7, 30 }; | |
| 70 + | |
| 71 +static const PatternAndOffsets scKoStndPatternsOffsets[] = { | |
| 72 + { scKoPat0, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) }, | |
| 73 + { scKoPat1, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) }, | |
| 74 + { scKoPat2, scKoStndOff2, ARRAY_LENGTH(scKoStndOff2) }, | |
| 75 + { scKoPat3, scKoStndOff3, ARRAY_LENGTH(scKoStndOff3) }, | |
| 76 + { scKoPat4, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) }, | |
| 77 + { scKoPat5, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) }, | |
| 78 + { NULL, NULL, 0 } | |
| 79 +}; | |
| 80 + | |
| 81 +typedef struct { | |
| 82 + const char * locale; | |
| 83 + const UChar * text; | |
| 84 + const PatternAndOffsets * patternsAndOffsets; | |
| 85 +} TUSCItem; | |
| 86 + | |
| 87 +static const TUSCItem tuscItems[] = { | |
| 88 + { "root", scKoText, scKoStndPatternsOffsets }, | |
| 89 + { "root@collation=search", scKoText, scKoSrchPatternsOffsets }, | |
| 90 + { "ko@collation=search", scKoText, scKoSrchPatternsOffsets }, | |
| 91 + { NULL, NULL, NULL } | |
| 92 +}; | |
| 93 + | |
| 94 +static const UChar dummyPat[] = { 0x0061, 0 }; | |
| 95 + | |
| 96 +static void TestUsingSearchCollator(void) | |
| 97 +{ | |
| 98 + const TUSCItem * tuscItemPtr; | |
| 99 + for (tuscItemPtr = tuscItems; tuscItemPtr->locale != NULL; tuscItemPtr++) { | |
| 100 + UErrorCode status = U_ZERO_ERROR; | |
| 101 + UCollator* ucol = ucol_open(tuscItemPtr->locale, &status); | |
| 102 + if ( U_SUCCESS(status) ) { | |
| 103 + UStringSearch* usrch = usearch_openFromCollator(dummyPat, -1, tuscI
temPtr->text, -1, ucol, NULL, &status); | |
| 104 + if ( U_SUCCESS(status) ) { | |
| 105 + const PatternAndOffsets * patternsOffsetsPtr; | |
| 106 + for ( patternsOffsetsPtr = tuscItemPtr->patternsAndOffsets; pat
ternsOffsetsPtr->pattern != NULL; patternsOffsetsPtr++) { | |
| 107 + usearch_setPattern(usrch, patternsOffsetsPtr->pattern, -1,
&status); | |
| 108 + if ( U_SUCCESS(status) ) { | |
| 109 + int32_t offset; | |
| 110 + const int32_t * nextOffsetPtr; | |
| 111 + const int32_t * limitOffsetPtr; | |
| 112 + | |
| 113 + usearch_reset(usrch); | |
| 114 + nextOffsetPtr = patternsOffsetsPtr->offsets; | |
| 115 + limitOffsetPtr = patternsOffsetsPtr->offsets + patterns
OffsetsPtr->offsetsLen; | |
| 116 + while (TRUE) { | |
| 117 + offset = usearch_next(usrch, &status); | |
| 118 + if ( U_FAILURE(status) || offset == USEARCH_DONE )
{ | |
| 119 + break; | |
| 120 + } | |
| 121 + if ( nextOffsetPtr < limitOffsetPtr ) { | |
| 122 + if (offset != *nextOffsetPtr) { | |
| 123 + log_err("error, locale %s, expected usearc
h_next %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset); | |
| 124 + nextOffsetPtr = limitOffsetPtr; | |
| 125 + break; | |
| 126 + } | |
| 127 + nextOffsetPtr++; | |
| 128 + } else { | |
| 129 + log_err("error, locale %s, usearch_next returne
d more matches than expected\n", tuscItemPtr->locale ); | |
| 130 + } | |
| 131 + } | |
| 132 + if ( U_FAILURE(status) ) { | |
| 133 + log_err("error, locale %s, usearch_next failed: %s\
n", tuscItemPtr->locale, u_errorName(status) ); | |
| 134 + } else if ( nextOffsetPtr < limitOffsetPtr ) { | |
| 135 + log_err("error, locale %s, usearch_next returned fe
wer matches than expected\n", tuscItemPtr->locale ); | |
| 136 + } | |
| 137 + | |
| 138 + status = U_ZERO_ERROR; | |
| 139 + usearch_reset(usrch); | |
| 140 + nextOffsetPtr = patternsOffsetsPtr->offsets + patternsO
ffsetsPtr->offsetsLen; | |
| 141 + limitOffsetPtr = patternsOffsetsPtr->offsets; | |
| 142 + while (TRUE) { | |
| 143 + offset = usearch_previous(usrch, &status); | |
| 144 + if ( U_FAILURE(status) || offset == USEARCH_DONE )
{ | |
| 145 + break; | |
| 146 + } | |
| 147 + if ( nextOffsetPtr > limitOffsetPtr ) { | |
| 148 + nextOffsetPtr--; | |
| 149 + if (offset != *nextOffsetPtr) { | |
| 150 + log_err("error, locale %s, expected usearc
h_previous %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset); | |
| 151 + nextOffsetPtr = limitOffsetPtr; | |
| 152 + break; | |
| 153 + } | |
| 154 + } else { | |
| 155 + log_err("error, locale %s, usearch_previous ret
urned more matches than expected\n", tuscItemPtr->locale ); | |
| 156 + } | |
| 157 + } | |
| 158 + if ( U_FAILURE(status) ) { | |
| 159 + log_err("error, locale %s, usearch_previous failed:
%s\n", tuscItemPtr->locale, u_errorName(status) ); | |
| 160 + } else if ( nextOffsetPtr > limitOffsetPtr ) { | |
| 161 + log_err("error, locale %s, usearch_previous returne
d fewer matches than expected\n", tuscItemPtr->locale ); | |
| 162 + } | |
| 163 + | |
| 164 + } else { | |
| 165 + log_err("error, locale %s, usearch_setPattern failed: %
s\n", tuscItemPtr->locale, u_errorName(status) ); | |
| 166 + } | |
| 167 + } | |
| 168 + usearch_close(usrch); | |
| 169 + } else { | |
| 170 + log_err("error, locale %s, usearch_openFromCollator failed: %s\
n", tuscItemPtr->locale, u_errorName(status) ); | |
| 171 + } | |
| 172 + ucol_close(ucol); | |
| 173 + } else { | |
| 174 + log_err("error, locale %s, ucol_open failed: %s\n", tuscItemPtr->lo
cale, u_errorName(status) ); | |
| 175 + } | |
| 176 + } | |
| 177 +} | |
| 178 + | |
| 179 +/** | |
| 180 +* addSearchTest registers the usrchtst tests under tscoll/usrchtst | |
| 181 +*/ | |
| 182 + | |
| 183 void addSearchTest(TestNode** root) | |
| 184 { | |
| 185 addTest(root, &TestStart, "tscoll/usrchtst/TestStart"); | |
| 186 @@ -2608,6 +2774,7 @@ | |
| 187 addTest(root, &TestForwardBackward, "tscoll/usrchtst/TestForwardBackward"); | |
| 188 addTest(root, &TestSearchForNull, "tscoll/usrchtst/TestSearchForNull"); | |
| 189 addTest(root, &TestStrengthIdentical, "tscoll/usrchtst/TestStrengthIdentica
l"); | |
| 190 + addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCol
lator"); | |
| 191 } | |
| 192 | |
| 193 #endif /* #if !UCONFIG_NO_COLLATION */ | |
| 194 Index: source/test/cintltst/citertst.c | |
| 195 =================================================================== | |
| 196 --- source/test/cintltst/citertst.c (revision 75773) | |
| 197 +++ source/test/cintltst/citertst.c (working copy) | |
| 198 @@ -1,6 +1,6 @@ | |
| 199 /******************************************************************** | |
| 200 * COPYRIGHT: | |
| 201 - * Copyright (c) 1997-2010, International Business Machines Corporation and | |
| 202 + * Copyright (c) 1997-2011, International Business Machines Corporation and | |
| 203 * others. All Rights Reserved. | |
| 204 ********************************************************************/ | |
| 205 /******************************************************************************
** | |
| 206 @@ -22,6 +22,7 @@ | |
| 207 #if !UCONFIG_NO_COLLATION | |
| 208 | |
| 209 #include "unicode/ucol.h" | |
| 210 +#include "unicode/ucoleitr.h" | |
| 211 #include "unicode/uloc.h" | |
| 212 #include "unicode/uchar.h" | |
| 213 #include "unicode/ustring.h" | |
| 214 @@ -58,6 +59,7 @@ | |
| 215 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"
); | |
| 216 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); | |
| 217 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); | |
| 218 + addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchColla
torElements"); | |
| 219 } | |
| 220 | |
| 221 /* The locales we support */ | |
| 222 @@ -2017,4 +2019,141 @@ | |
| 223 T_FileStream_close(file); | |
| 224 } | |
| 225 | |
| 226 +/** | |
| 227 +* TestSearchCollatorElements tests iterator behavior (forwards and backwards) w
ith | |
| 228 +* normalization on AND jamo tailoring, among other things. | |
| 229 +*/ | |
| 230 +static const UChar tsceText[] = { /* Nothing in here should be ignorable */ | |
| 231 + 0x0020, 0xAC00, /* simple LV Hangul */ | |
| 232 + 0x0020, 0xAC01, /* simple LVT Hangul */ | |
| 233 + 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ | |
| 234 + 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search *
/ | |
| 235 + 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ | |
| 236 + 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ | |
| 237 + 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands
for search */ | |
| 238 + 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand fo
r search */ | |
| 239 + 0x0020, 0x00E6, /* small letter ae, expands */ | |
| 240 + 0x0020, 0x1E4D, /* small letter o with tilde and acute, dec
omposes */ | |
| 241 + 0x0020 | |
| 242 +}; | |
| 243 +enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; | |
| 244 + | |
| 245 +static const int32_t rootStandardOffsets[] = { | |
| 246 + 0, 1,2, | |
| 247 + 2, 3,4,4, | |
| 248 + 4, 5,6,6, | |
| 249 + 6, 7,8,8, | |
| 250 + 8, 9,10,11, | |
| 251 + 12, 13,14,15, | |
| 252 + 16, 17,18,19, | |
| 253 + 20, 21,22,23, | |
| 254 + 24, 25,26,26,26, | |
| 255 + 26, 27,28,28, | |
| 256 + 28, | |
| 257 + 29 | |
| 258 +}; | |
| 259 +enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStanda
rdOffsets[0]) }; | |
| 260 + | |
| 261 +static const int32_t rootSearchOffsets[] = { | |
| 262 + 0, 1,2, | |
| 263 + 2, 3,4,4, | |
| 264 + 4, 5,6,6,6, | |
| 265 + 6, 7,8,8,8,8,8,8, | |
| 266 + 8, 9,10,11, | |
| 267 + 12, 13,14,15, | |
| 268 + 16, 17,18,19,20, | |
| 269 + 20, 21,22,22,23,23,23,24, | |
| 270 + 24, 25,26,26,26, | |
| 271 + 26, 27,28,28, | |
| 272 + 28, | |
| 273 + 29 | |
| 274 +}; | |
| 275 +enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffs
ets[0]) }; | |
| 276 + | |
| 277 +typedef struct { | |
| 278 + const char * locale; | |
| 279 + const int32_t * offsets; | |
| 280 + int32_t offsetsLen; | |
| 281 +} TSCEItem; | |
| 282 + | |
| 283 +static const TSCEItem tsceItems[] = { | |
| 284 + { "root", rootStandardOffsets, kLen_rootStandardOffsets }, | |
| 285 + { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets }, | |
| 286 + { NULL, NULL, 0 } | |
| 287 +}; | |
| 288 + | |
| 289 +static void TestSearchCollatorElements(void) | |
| 290 +{ | |
| 291 + const TSCEItem * tsceItemPtr; | |
| 292 + for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) { | |
| 293 + UErrorCode status = U_ZERO_ERROR; | |
| 294 + UCollator* ucol = ucol_open(tsceItemPtr->locale, &status); | |
| 295 + if ( U_SUCCESS(status) ) { | |
| 296 + UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_t
sceText, &status); | |
| 297 + if ( U_SUCCESS(status) ) { | |
| 298 + int32_t offset, element; | |
| 299 + const int32_t * nextOffsetPtr; | |
| 300 + const int32_t * limitOffsetPtr; | |
| 301 + | |
| 302 + nextOffsetPtr = tsceItemPtr->offsets; | |
| 303 + limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen
; | |
| 304 + do { | |
| 305 + offset = ucol_getOffset(uce); | |
| 306 + element = ucol_next(uce, &status); | |
| 307 + if ( element == 0 ) { | |
| 308 + log_err("error, locale %s, ucol_next returned element 0
\n", tsceItemPtr->locale ); | |
| 309 + } | |
| 310 + if ( nextOffsetPtr < limitOffsetPtr ) { | |
| 311 + if (offset != *nextOffsetPtr) { | |
| 312 + log_err("error, locale %s, expected ucol_next -> uc
ol_getOffset %d, got %d\n", | |
| 313 + tsceItemPtr->locale
, *nextOffsetPtr, offset ); | |
| 314 + nextOffsetPtr = limitOffsetPtr; | |
| 315 + break; | |
| 316 + } | |
| 317 + nextOffsetPtr++; | |
| 318 + } else { | |
| 319 + log_err("error, locale %s, ucol_next returned more elem
ents than expected\n", tsceItemPtr->locale ); | |
| 320 + } | |
| 321 + } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); | |
| 322 + if ( nextOffsetPtr < limitOffsetPtr ) { | |
| 323 + log_err("error, locale %s, ucol_next returned fewer element
s than expected\n", tsceItemPtr->locale ); | |
| 324 + } | |
| 325 + | |
| 326 + status = U_ZERO_ERROR; /* reset first so ucol_setOffset is not skipped */ | |
| 327 + ucol_setOffset(uce, kLen_tsceText, &status); | |
| 328 + nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; | |
| 329 + limitOffsetPtr = tsceItemPtr->offsets; | |
| 330 + do { | |
| 331 + offset = ucol_getOffset(uce); | |
| 332 + element = ucol_previous(uce, &status); | |
| 333 + if ( element == 0 ) { | |
| 334 + log_err("error, locale %s, ucol_previous returned eleme
nt 0\n", tsceItemPtr->locale ); | |
| 335 + } | |
| 336 + if ( nextOffsetPtr > limitOffsetPtr ) { | |
| 337 + nextOffsetPtr--; | |
| 338 + if (offset != *nextOffsetPtr) { | |
| 339 + log_err("error, locale %s, expected ucol_previous -
> ucol_getOffset %d, got %d\n", | |
| 340 + tsceItemPtr->lo
cale, *nextOffsetPtr, offset ); | |
| 341 + nextOffsetPtr = limitOffsetPtr; | |
| 342 + break; | |
| 343 + } | |
| 344 + } else { | |
| 345 + log_err("error, locale %s, ucol_previous returned more
elements than expected\n", tsceItemPtr->locale ); | |
| 346 + } | |
| 347 + } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); | |
| 348 + if ( nextOffsetPtr > limitOffsetPtr ) { | |
| 349 + log_err("error, locale %s, ucol_previous returned fewer ele
ments than expected\n", tsceItemPtr->locale ); | |
| 350 + } | |
| 351 + | |
| 352 + ucol_closeElements(uce); | |
| 353 + } else { | |
| 354 + log_err("error, locale %s, ucol_openElements failed: %s\n", tsc
eItemPtr->locale, u_errorName(status) ); | |
| 355 + } | |
| 356 + ucol_close(ucol); | |
| 357 + } else { | |
| 358 + log_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->lo
cale, u_errorName(status) ); | |
| 359 + } | |
| 360 + } | |
| 361 +} | |
| 362 + | |
| 363 #endif /* #if !UCONFIG_NO_COLLATION */ | |
| 364 Index: source/test/cintltst/citertst.h | |
| 365 =================================================================== | |
| 366 --- source/test/cintltst/citertst.h (revision 75773) | |
| 367 +++ source/test/cintltst/citertst.h (working copy) | |
| 368 @@ -1,6 +1,6 @@ | |
| 369 /******************************************************************** | |
| 370 * COPYRIGHT: | |
| 371 - * Copyright (c) 1997-2008, International Business Machines Corporation and | |
| 372 + * Copyright (c) 1997-2008,2011, International Business Machines Corporation an
d | |
| 373 * others. All Rights Reserved. | |
| 374 ********************************************************************/ | |
| 375 /******************************************************************************
** | |
| 376 @@ -101,6 +101,11 @@ | |
| 377 * Bound checkings. | |
| 378 */ | |
| 379 static void TestSortKeyValidity(void); | |
| 380 +/** | |
| 381 +* TestSearchCollatorElements tests iterator behavior (forwards and backwards) w
ith | |
| 382 +* normalization on AND jamo tailoring, among other things. | |
| 383 +*/ | |
| 384 +static void TestSearchCollatorElements(void); | |
| 385 | |
| 386 /*------------------------------------------------------------------------ | |
| 387 Internal utilities | |
| 388 Index: source/i18n/ucol.cpp | |
| 389 =================================================================== | |
| 390 --- source/i18n/ucol.cpp (revision 75773) | |
| 391 +++ source/i18n/ucol.cpp (working copy) | |
| 392 @@ -1,6 +1,6 @@ | |
| 393 /* | |
| 394 ******************************************************************************* | |
| 395 -* Copyright (C) 1996-2010, International Business Machines | |
| 396 +* Copyright (C) 1996-2011, International Business Machines | |
| 397 * Corporation and others. All Rights Reserved. | |
| 398 ******************************************************************************* | |
| 399 * file name: ucol.cpp | |
| 400 @@ -1444,173 +1444,176 @@ | |
| 401 UChar ch = 0; | |
| 402 collationSource->offsetReturn = NULL; | |
| 403 | |
| 404 - for (;;) /* Loop handles case when incremental no
rmalize switches */ | |
| 405 - { /* to or from the side buffer / origin
al string, and we */ | |
| 406 - /* need to start again to get the next character. */ | |
| 407 + do { | |
| 408 + for (;;) /* Loop handles case when incrementa
l normalize switches */ | |
| 409 + { /* to or from the side buffer / or
iginal string, and we */ | |
| 410 + /* need to start again to get the next character. */ | |
| 411 | |
| 412 - if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF |
UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) | |
| 413 - { | |
| 414 - // The source string is null terminated and we're not working from
the side buffer, | |
| 415 - // and we're not normalizing. This is the fast path. | |
| 416 - // (We can be in the side buffer for Thai pre-vowel reordering ev
en when not normalizing.) | |
| 417 - ch = *collationSource->pos++; | |
| 418 - if (ch != 0) { | |
| 419 - break; | |
| 420 + if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMB
UF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) | |
| 421 + { | |
| 422 + // The source string is null terminated and we're not working f
rom the side buffer, | |
| 423 + // and we're not normalizing. This is the fast path. | |
| 424 + // (We can be in the side buffer for Thai pre-vowel reorderin
g even when not normalizing.) | |
| 425 + ch = *collationSource->pos++; | |
| 426 + if (ch != 0) { | |
| 427 + break; | |
| 428 + } | |
| 429 + else { | |
| 430 + return UCOL_NO_MORE_CES; | |
| 431 + } | |
| 432 } | |
| 433 - else { | |
| 434 - return UCOL_NO_MORE_CES; | |
| 435 - } | |
| 436 - } | |
| 437 | |
| 438 - if (collationSource->flags & UCOL_ITER_HASLEN) { | |
| 439 - // Normal path for strings when length is specified. | |
| 440 - // (We can't be in side buffer because it is always null terminat
ed.) | |
| 441 - if (collationSource->pos >= collationSource->endp) { | |
| 442 - // Ran off of the end of the main source string. We're done. | |
| 443 - return UCOL_NO_MORE_CES; | |
| 444 + if (collationSource->flags & UCOL_ITER_HASLEN) { | |
| 445 + // Normal path for strings when length is specified. | |
| 446 + // (We can't be in side buffer because it is always null term
inated.) | |
| 447 + if (collationSource->pos >= collationSource->endp) { | |
| 448 + // Ran off of the end of the main source string. We're don
e. | |
| 449 + return UCOL_NO_MORE_CES; | |
| 450 + } | |
| 451 + ch = *collationSource->pos++; | |
| 452 } | |
| 453 - ch = *collationSource->pos++; | |
| 454 - } | |
| 455 - else if(collationSource->flags & UCOL_USE_ITERATOR) { | |
| 456 - UChar32 iterCh = collationSource->iterator->next(collationSource->i
terator); | |
| 457 - if(iterCh == U_SENTINEL) { | |
| 458 - return UCOL_NO_MORE_CES; | |
| 459 - } | |
| 460 - ch = (UChar)iterCh; | |
| 461 - } | |
| 462 - else | |
| 463 - { | |
| 464 - // Null terminated string. | |
| 465 - ch = *collationSource->pos++; | |
| 466 - if (ch == 0) { | |
| 467 - // Ran off end of buffer. | |
| 468 - if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 469 - // Ran off end of main string. backing up one character. | |
| 470 - collationSource->pos--; | |
| 471 + else if(collationSource->flags & UCOL_USE_ITERATOR) { | |
| 472 + UChar32 iterCh = collationSource->iterator->next(collationSourc
e->iterator); | |
| 473 + if(iterCh == U_SENTINEL) { | |
| 474 return UCOL_NO_MORE_CES; | |
| 475 } | |
| 476 - else | |
| 477 - { | |
| 478 - // Hit null in the normalize side buffer. | |
| 479 - // Usually this means the end of the normalized data, | |
| 480 - // except for one odd case: a null followed by combining ch
ars, | |
| 481 - // which is the case if we are at the start of the buffer
. | |
| 482 - if (collationSource->pos == collationSource->writableBuffer
.getBuffer()+1) { | |
| 483 - break; | |
| 484 + ch = (UChar)iterCh; | |
| 485 + } | |
| 486 + else | |
| 487 + { | |
| 488 + // Null terminated string. | |
| 489 + ch = *collationSource->pos++; | |
| 490 + if (ch == 0) { | |
| 491 + // Ran off end of buffer. | |
| 492 + if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 493 + // Ran off end of main string. backing up one character
. | |
| 494 + collationSource->pos--; | |
| 495 + return UCOL_NO_MORE_CES; | |
| 496 } | |
| 497 + else | |
| 498 + { | |
| 499 + // Hit null in the normalize side buffer. | |
| 500 + // Usually this means the end of the normalized data, | |
| 501 + // except for one odd case: a null followed by combinin
g chars, | |
| 502 + // which is the case if we are at the start of the bu
ffer. | |
| 503 + if (collationSource->pos == collationSource->writableBu
ffer.getBuffer()+1) { | |
| 504 + break; | |
| 505 + } | |
| 506 | |
| 507 - // Null marked end of side buffer. | |
| 508 - // Revert to the main string and | |
| 509 - // loop back to top to try again to get a character. | |
| 510 - collationSource->pos = collationSource->fcdPosition; | |
| 511 - collationSource->flags = collationSource->origFlags; | |
| 512 - continue; | |
| 513 + // Null marked end of side buffer. | |
| 514 + // Revert to the main string and | |
| 515 + // loop back to top to try again to get a character. | |
| 516 + collationSource->pos = collationSource->fcdPosition; | |
| 517 + collationSource->flags = collationSource->origFlags; | |
| 518 + continue; | |
| 519 + } | |
| 520 } | |
| 521 } | |
| 522 - } | |
| 523 | |
| 524 - if(collationSource->flags&UCOL_HIRAGANA_Q) { | |
| 525 - /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the
flag | |
| 526 - * based on whether the previous codepoint was Hiragana or Katakana
. | |
| 527 - */ | |
| 528 - if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) |
| | |
| 529 - ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3
099 && ch <= 0x309C))) { | |
| 530 - collationSource->flags |= UCOL_WAS_HIRAGANA; | |
| 531 - } else { | |
| 532 - collationSource->flags &= ~UCOL_WAS_HIRAGANA; | |
| 533 + if(collationSource->flags&UCOL_HIRAGANA_Q) { | |
| 534 + /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set
the flag | |
| 535 + * based on whether the previous codepoint was Hiragana or Kata
kana. | |
| 536 + */ | |
| 537 + if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f
)) || | |
| 538 + ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >=
0x3099 && ch <= 0x309C))) { | |
| 539 + collationSource->flags |= UCOL_WAS_HIRAGANA; | |
| 540 + } else { | |
| 541 + collationSource->flags &= ~UCOL_WAS_HIRAGANA; | |
| 542 + } | |
| 543 } | |
| 544 - } | |
| 545 | |
| 546 - // We've got a character. See if there's any fcd and/or normalization
stuff to do. | |
| 547 - // Note that UCOL_ITER_NORM flag is always zero when we are in the s
ide buffer. | |
| 548 - if ((collationSource->flags & UCOL_ITER_NORM) == 0) { | |
| 549 - break; | |
| 550 - } | |
| 551 + // We've got a character. See if there's any fcd and/or normalizat
ion stuff to do. | |
| 552 + // Note that UCOL_ITER_NORM flag is always zero when we are in t
he side buffer. | |
| 553 + if ((collationSource->flags & UCOL_ITER_NORM) == 0) { | |
| 554 + break; | |
| 555 + } | |
| 556 | |
| 557 - if (collationSource->fcdPosition >= collationSource->pos) { | |
| 558 - // An earlier FCD check has already covered the current character. | |
| 559 - // We can go ahead and process this char. | |
| 560 - break; | |
| 561 - } | |
| 562 - | |
| 563 - if (ch < ZERO_CC_LIMIT_ ) { | |
| 564 - // Fast fcd safe path. Trailing combining class == 0. This char i
s OK. | |
| 565 - break; | |
| 566 - } | |
| 567 - | |
| 568 - if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 569 - // We need to peek at the next character in order to tell if we are
FCD | |
| 570 - if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource-
>pos >= collationSource->endp) { | |
| 571 - // We are at the last char of source string. | |
| 572 - // It is always OK for FCD check. | |
| 573 + if (collationSource->fcdPosition >= collationSource->pos) { | |
| 574 + // An earlier FCD check has already covered the current charact
er. | |
| 575 + // We can go ahead and process this char. | |
| 576 break; | |
| 577 } | |
| 578 | |
| 579 - // Not at last char of source string (or we'll check against termin
ating null). Do the FCD fast test | |
| 580 - if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 581 + if (ch < ZERO_CC_LIMIT_ ) { | |
| 582 + // Fast fcd safe path. Trailing combining class == 0. This ch
ar is OK. | |
| 583 break; | |
| 584 } | |
| 585 - } | |
| 586 | |
| 587 + if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 588 + // We need to peek at the next character in order to tell if we
are FCD | |
| 589 + if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSou
rce->pos >= collationSource->endp) { | |
| 590 + // We are at the last char of source string. | |
| 591 + // It is always OK for FCD check. | |
| 592 + break; | |
| 593 + } | |
| 594 | |
| 595 - // Need a more complete FCD check and possible normalization. | |
| 596 - if (collIterFCD(collationSource)) { | |
| 597 - collIterNormalize(collationSource); | |
| 598 - } | |
| 599 - if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 600 - // No normalization was needed. Go ahead and process the char we
already had. | |
| 601 - break; | |
| 602 - } | |
| 603 + // Not at last char of source string (or we'll check against te
rminating null). Do the FCD fast test | |
| 604 + if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 605 + break; | |
| 606 + } | |
| 607 + } | |
| 608 | |
| 609 - // Some normalization happened. Next loop iteration will pick up a cha
r | |
| 610 - // from the normalization buffer. | |
| 611 | |
| 612 - } // end for (;;) | |
| 613 + // Need a more complete FCD check and possible normalization. | |
| 614 + if (collIterFCD(collationSource)) { | |
| 615 + collIterNormalize(collationSource); | |
| 616 + } | |
| 617 + if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 618 + // No normalization was needed. Go ahead and process the char
we already had. | |
| 619 + break; | |
| 620 + } | |
| 621 | |
| 622 + // Some normalization happened. Next loop iteration will pick up a
char | |
| 623 + // from the normalization buffer. | |
| 624 | |
| 625 - if (ch <= 0xFF) { | |
| 626 - /* For latin-1 characters we never need to fall back to the UCA table
*/ | |
| 627 - /* because all of the UCA data is replicated in the latinOneMapping
array */ | |
| 628 - order = coll->latinOneMapping[ch]; | |
| 629 - if (order > UCOL_NOT_FOUND) { | |
| 630 - order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, sta
tus); | |
| 631 + } // end for (;;) | |
| 632 + | |
| 633 + | |
| 634 + if (ch <= 0xFF) { | |
| 635 + /* For latin-1 characters we never need to fall back to the UCA ta
ble */ | |
| 636 + /* because all of the UCA data is replicated in the latinOneMapp
ing array */ | |
| 637 + order = coll->latinOneMapping[ch]; | |
| 638 + if (order > UCOL_NOT_FOUND) { | |
| 639 + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); | |
| 640 + } | |
| 641 } | |
| 642 - } | |
| 643 - else | |
| 644 - { | |
| 645 - // Always use UCA for Han, Hangul | |
| 646 - // (Han extension A is before main Han block) | |
| 647 - // **** Han compatibility chars ?? **** | |
| 648 - if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 649 - (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { | |
| 650 - if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { | |
| 651 - // between the two target ranges; do normal lookup | |
| 652 - // **** this range is YI, Modifier tone letters, **** | |
| 653 - // **** Latin-D, Syloti Nagari, Phagas-pa. **** | |
| 654 - // **** Latin-D might be tailored, so we need to **** | |
| 655 - // **** do the normal lookup for these guys. **** | |
| 656 + else | |
| 657 + { | |
| 658 + // Always use UCA for Han, Hangul | |
| 659 + // (Han extension A is before main Han block) | |
| 660 + // **** Han compatibility chars ?? **** | |
| 661 + if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 662 + (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { | |
| 663 + if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { | |
| 664 + // between the two target ranges; do normal lookup | |
| 665 + // **** this range is YI, Modifier tone letters, **** | |
| 666 + // **** Latin-D, Syloti Nagari, Phagas-pa. **** | |
| 667 + // **** Latin-D might be tailored, so we need to **** | |
| 668 + // **** do the normal lookup for these guys. **** | |
| 669 + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 670 + } else { | |
| 671 + // in one of the target ranges; use UCA | |
| 672 + order = UCOL_NOT_FOUND; | |
| 673 + } | |
| 674 + } else { | |
| 675 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 676 - } else { | |
| 677 - // in one of the target ranges; use UCA | |
| 678 - order = UCOL_NOT_FOUND; | |
| 679 } | |
| 680 - } else { | |
| 681 - order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 682 - } | |
| 683 | |
| 684 - if(order > UCOL_NOT_FOUND) { /* i
f a CE is special */ | |
| 685 - order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, sta
tus); /* and try to get the special CE */ | |
| 686 - } | |
| 687 + if(order > UCOL_NOT_FOUND) {
/* if a CE is special */ | |
| 688 + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource,
status); /* and try to get the special CE */ | |
| 689 + } | |
| 690 | |
| 691 - if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good
CE in the tailoring */ | |
| 692 - /* if we got here, the codepoint MUST be over 0xFF - so we look dir
ectly in the trie */ | |
| 693 - order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
| 694 + if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a
good CE in the tailoring */ | |
| 695 + /* if we got here, the codepoint MUST be over 0xFF - so we look
directly in the trie */ | |
| 696 + order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); | |
| 697 | |
| 698 - if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ | |
| 699 - order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSo
urce, status); | |
| 700 + if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE
*/ | |
| 701 + order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collati
onSource, status); | |
| 702 + } | |
| 703 } | |
| 704 } | |
| 705 - } | |
| 706 + } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_
LAST_HANGUL ); | |
| 707 + | |
| 708 if(order == UCOL_NOT_FOUND) { | |
| 709 order = getImplicit(ch, collationSource); | |
| 710 } | |
| 711 @@ -1958,161 +1961,163 @@ | |
| 712 else { | |
| 713 UChar ch = 0; | |
| 714 | |
| 715 - /* | |
| 716 - Loop handles case when incremental normalize switches to or from the | |
| 717 - side buffer / original string, and we need to start again to get the | |
| 718 - next character. | |
| 719 - */ | |
| 720 - for (;;) { | |
| 721 - if (data->flags & UCOL_ITER_HASLEN) { | |
| 722 - /* | |
| 723 - Normal path for strings when length is specified. | |
| 724 - Not in side buffer because it is always null terminated. | |
| 725 - */ | |
| 726 - if (data->pos <= data->string) { | |
| 727 - /* End of the main source string */ | |
| 728 - return UCOL_NO_MORE_CES; | |
| 729 - } | |
| 730 - data->pos --; | |
| 731 - ch = *data->pos; | |
| 732 - } | |
| 733 - // we are using an iterator to go back. Pray for us! | |
| 734 - else if (data->flags & UCOL_USE_ITERATOR) { | |
| 735 - UChar32 iterCh = data->iterator->previous(data->iterator); | |
| 736 - if(iterCh == U_SENTINEL) { | |
| 737 - return UCOL_NO_MORE_CES; | |
| 738 - } else { | |
| 739 - ch = (UChar)iterCh; | |
| 740 - } | |
| 741 - } | |
| 742 - else { | |
| 743 - data->pos --; | |
| 744 - ch = *data->pos; | |
| 745 - /* we are in the side buffer. */ | |
| 746 - if (ch == 0) { | |
| 747 + do { | |
| 748 + /* | |
| 749 + Loop handles case when incremental normalize switches to or from th
e | |
| 750 + side buffer / original string, and we need to start again to get th
e | |
| 751 + next character. | |
| 752 + */ | |
| 753 + for (;;) { | |
| 754 + if (data->flags & UCOL_ITER_HASLEN) { | |
| 755 /* | |
| 756 - At the start of the normalize side buffer. | |
| 757 - Go back to string. | |
| 758 - Because pointer points to the last accessed character, | |
| 759 - hence we have to increment it by one here. | |
| 760 + Normal path for strings when length is specified. | |
| 761 + Not in side buffer because it is always null terminated. | |
| 762 */ | |
| 763 - data->flags = data->origFlags; | |
| 764 - data->offsetRepeatValue = 0; | |
| 765 - | |
| 766 - if (data->fcdPosition == NULL) { | |
| 767 - data->pos = data->string; | |
| 768 + if (data->pos <= data->string) { | |
| 769 + /* End of the main source string */ | |
| 770 return UCOL_NO_MORE_CES; | |
| 771 } | |
| 772 - else { | |
| 773 - data->pos = data->fcdPosition + 1; | |
| 774 + data->pos --; | |
| 775 + ch = *data->pos; | |
| 776 + } | |
| 777 + // we are using an iterator to go back. Pray for us! | |
| 778 + else if (data->flags & UCOL_USE_ITERATOR) { | |
| 779 + UChar32 iterCh = data->iterator->previous(data->iterator); | |
| 780 + if(iterCh == U_SENTINEL) { | |
| 781 + return UCOL_NO_MORE_CES; | |
| 782 + } else { | |
| 783 + ch = (UChar)iterCh; | |
| 784 + } | |
| 785 + } | |
| 786 + else { | |
| 787 + data->pos --; | |
| 788 + ch = *data->pos; | |
| 789 + /* we are in the side buffer. */ | |
| 790 + if (ch == 0) { | |
| 791 + /* | |
| 792 + At the start of the normalize side buffer. | |
| 793 + Go back to string. | |
| 794 + Because pointer points to the last accessed character, | |
| 795 + hence we have to increment it by one here. | |
| 796 + */ | |
| 797 + data->flags = data->origFlags; | |
| 798 + data->offsetRepeatValue = 0; | |
| 799 + | |
| 800 + if (data->fcdPosition == NULL) { | |
| 801 + data->pos = data->string; | |
| 802 + return UCOL_NO_MORE_CES; | |
| 803 + } | |
| 804 + else { | |
| 805 + data->pos = data->fcdPosition + 1; | |
| 806 + } | |
| 807 + | |
| 808 + continue; | |
| 809 } | |
| 810 - | |
| 811 - continue; | |
| 812 } | |
| 813 - } | |
| 814 | |
| 815 - if(data->flags&UCOL_HIRAGANA_Q) { | |
| 816 - if(ch>=0x3040 && ch<=0x309f) { | |
| 817 - data->flags |= UCOL_WAS_HIRAGANA; | |
| 818 - } else { | |
| 819 - data->flags &= ~UCOL_WAS_HIRAGANA; | |
| 820 - } | |
| 821 - } | |
| 822 + if(data->flags&UCOL_HIRAGANA_Q) { | |
| 823 + if(ch>=0x3040 && ch<=0x309f) { | |
| 824 + data->flags |= UCOL_WAS_HIRAGANA; | |
| 825 + } else { | |
| 826 + data->flags &= ~UCOL_WAS_HIRAGANA; | |
| 827 + } | |
| 828 + } | |
| 829 | |
| 830 - /* | |
| 831 - * got a character to determine if there's fcd and/or normalization | |
| 832 - * stuff to do. | |
| 833 - * if the current character is not fcd. | |
| 834 - * if current character is at the start of the string | |
| 835 - * Trailing combining class == 0. | |
| 836 - * Note if pos is in the writablebuffer, norm is always 0 | |
| 837 - */ | |
| 838 - if (ch < ZERO_CC_LIMIT_ || | |
| 839 - // this should propel us out of the loop in the iterator case | |
| 840 - (data->flags & UCOL_ITER_NORM) == 0 || | |
| 841 - (data->fcdPosition != NULL && data->fcdPosition <= data->pos) | |
| 842 - || data->string == data->pos) { | |
| 843 - break; | |
| 844 - } | |
| 845 - | |
| 846 - if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 847 - /* if next character is FCD */ | |
| 848 - if (data->pos == data->string) { | |
| 849 - /* First char of string is always OK for FCD check */ | |
| 850 + /* | |
| 851 + * got a character to determine if there's fcd and/or normalizat
ion | |
| 852 + * stuff to do. | |
| 853 + * if the current character is not fcd. | |
| 854 + * if current character is at the start of the string | |
| 855 + * Trailing combining class == 0. | |
| 856 + * Note if pos is in the writablebuffer, norm is always 0 | |
| 857 + */ | |
| 858 + if (ch < ZERO_CC_LIMIT_ || | |
| 859 + // this should propel us out of the loop in the iterator case | |
| 860 + (data->flags & UCOL_ITER_NORM) == 0 || | |
| 861 + (data->fcdPosition != NULL && data->fcdPosition <= data->po
s) | |
| 862 + || data->string == data->pos) { | |
| 863 break; | |
| 864 } | |
| 865 | |
| 866 - /* Not first char of string, do the FCD fast test */ | |
| 867 - if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 868 + if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 869 + /* if next character is FCD */ | |
| 870 + if (data->pos == data->string) { | |
| 871 + /* First char of string is always OK for FCD check */ | |
| 872 + break; | |
| 873 + } | |
| 874 + | |
| 875 + /* Not first char of string, do the FCD fast test */ | |
| 876 + if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { | |
| 877 + break; | |
| 878 + } | |
| 879 + } | |
| 880 + | |
| 881 + /* Need a more complete FCD check and possible normalization. *
/ | |
| 882 + if (collPrevIterFCD(data)) { | |
| 883 + collPrevIterNormalize(data); | |
| 884 + } | |
| 885 + | |
| 886 + if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 887 + /* No normalization. Go ahead and process the char. */ | |
| 888 break; | |
| 889 } | |
| 890 - } | |
| 891 | |
| 892 - /* Need a more complete FCD check and possible normalization. */ | |
| 893 - if (collPrevIterFCD(data)) { | |
| 894 - collPrevIterNormalize(data); | |
| 895 + /* | |
| 896 + Some normalization happened. | |
| 897 + Next loop picks up a char from the normalization buffer. | |
| 898 + */ | |
| 899 } | |
| 900 | |
| 901 - if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { | |
| 902 - /* No normalization. Go ahead and process the char. */ | |
| 903 - break; | |
| 904 - } | |
| 905 - | |
| 906 - /* | |
| 907 - Some normalization happened. | |
| 908 - Next loop picks up a char from the normalization buffer. | |
| 909 + /* attempt to handle contractions, after removal of the backwards | |
| 910 + contraction | |
| 911 */ | |
| 912 - } | |
| 913 - | |
| 914 - /* attempt to handle contractions, after removal of the backwards | |
| 915 - contraction | |
| 916 - */ | |
| 917 - if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { | |
| 918 - result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data
, status); | |
| 919 - } else { | |
| 920 - if (ch <= 0xFF) { | |
| 921 - result = coll->latinOneMapping[ch]; | |
| 922 - } | |
| 923 - else { | |
| 924 - // Always use UCA for [3400..9FFF], [AC00..D7AF] | |
| 925 - // **** [FA0E..FA2F] ?? **** | |
| 926 - if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 927 - (ch >= 0x3400 && ch <= 0xD7AF)) { | |
| 928 - if (ch > 0x9FFF && ch < 0xAC00) { | |
| 929 - // between the two target ranges; do normal lookup | |
| 930 - // **** this range is YI, Modifier tone letters, **** | |
| 931 - // **** Latin-D, Syloti Nagari, Phagas-pa. **** | |
| 932 - // **** Latin-D might be tailored, so we need to **** | |
| 933 - // **** do the normal lookup for these guys. **** | |
| 934 - result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 935 + if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data))
{ | |
| 936 + result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION,
data, status); | |
| 937 + } else { | |
| 938 + if (ch <= 0xFF) { | |
| 939 + result = coll->latinOneMapping[ch]; | |
| 940 + } | |
| 941 + else { | |
| 942 + // Always use UCA for [3400..9FFF], [AC00..D7AF] | |
| 943 + // **** [FA0E..FA2F] ?? **** | |
| 944 + if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && | |
| 945 + (ch >= 0x3400 && ch <= 0xD7AF)) { | |
| 946 + if (ch > 0x9FFF && ch < 0xAC00) { | |
| 947 + // between the two target ranges; do normal lookup | |
| 948 + // **** this range is YI, Modifier tone letters, **
** | |
| 949 + // **** Latin-D, Syloti Nagari, Phagas-pa. **
** | |
| 950 + // **** Latin-D might be tailored, so we need to **
** | |
| 951 + // **** do the normal lookup for these guys. **
** | |
| 952 + result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch)
; | |
| 953 + } else { | |
| 954 + result = UCOL_NOT_FOUND; | |
| 955 + } | |
| 956 } else { | |
| 957 - result = UCOL_NOT_FOUND; | |
| 958 + result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 959 } | |
| 960 - } else { | |
| 961 - result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); | |
| 962 } | |
| 963 - } | |
| 964 - if (result > UCOL_NOT_FOUND) { | |
| 965 - result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, stat
us); | |
| 966 - } | |
| 967 - if (result == UCOL_NOT_FOUND) { // Not found in master list | |
| 968 - if (!isAtStartPrevIterate(data) && | |
| 969 - ucol_contractionEndCP(ch, data->coll)) | |
| 970 - { | |
| 971 - result = UCOL_CONTRACTION; | |
| 972 - } else { | |
| 973 - if(coll->UCA) { | |
| 974 - result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch)
; | |
| 975 + if (result > UCOL_NOT_FOUND) { | |
| 976 + result = ucol_prv_getSpecialPrevCE(coll, ch, result, data,
status); | |
| 977 + } | |
| 978 + if (result == UCOL_NOT_FOUND) { // Not found in master list | |
| 979 + if (!isAtStartPrevIterate(data) && | |
| 980 + ucol_contractionEndCP(ch, data->coll)) | |
| 981 + { | |
| 982 + result = UCOL_CONTRACTION; | |
| 983 + } else { | |
| 984 + if(coll->UCA) { | |
| 985 + result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping,
ch); | |
| 986 + } | |
| 987 } | |
| 988 - } | |
| 989 | |
| 990 - if (result > UCOL_NOT_FOUND) { | |
| 991 - if(coll->UCA) { | |
| 992 - result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, resul
t, data, status); | |
| 993 + if (result > UCOL_NOT_FOUND) { | |
| 994 + if(coll->UCA) { | |
| 995 + result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, r
esult, data, status); | |
| 996 + } | |
| 997 } | |
| 998 } | |
| 999 } | |
| 1000 - } | |
| 1001 + } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <=
UCOL_LAST_HANGUL ); | |
| 1002 | |
| 1003 if(result == UCOL_NOT_FOUND) { | |
| 1004 result = getPrevImplicit(ch, data); | |
| 1005 @@ -3193,6 +3198,7 @@ | |
| 1006 // Since Hanguls pass the FCD check, it is | |
| 1007 // guaranteed that we won't be in | |
| 1008 // the normalization buffer if something like this happens | |
| 1009 + | |
| 1010 // However, if we are using a uchar iterator and normalizat
ion | |
| 1011 // is ON, the Hangul that lead us here is going to be in th
at | |
| 1012 // normalization buffer. Here we want to restore the uchar | |
| 1013 @@ -3201,6 +3207,7 @@ | |
| 1014 source->flags = source->origFlags; // restore the itera
tor | |
| 1015 source->pos = NULL; | |
| 1016 } | |
| 1017 + | |
| 1018 // Move Jamos into normalization buffer | |
| 1019 UChar *buffer = source->writableBuffer.getBuffer(4); | |
| 1020 int32_t bufferLength; | |
| 1021 @@ -3214,8 +3221,9 @@ | |
| 1022 } | |
| 1023 source->writableBuffer.releaseBuffer(bufferLength); | |
| 1024 | |
| 1025 - source->fcdPosition = source->pos; // Indicate wher
e to continue in main input string | |
| 1026 - // after exhausting the writableBuffer | |
| 1027 + // Indicate where to continue in main input string after ex
hausting the writableBuffer | |
| 1028 + source->fcdPosition = source->pos; | |
| 1029 + | |
| 1030 source->pos = source->writableBuffer.getTerminatedBuffer(
); | |
| 1031 source->origFlags = source->flags; | |
| 1032 source->flags |= UCOL_ITER_INNORMBUF; | |
| 1033 @@ -3966,13 +3974,10 @@ | |
| 1034 // Since Hanguls pass the FCD check, it is | |
| 1035 // guaranteed that we won't be in | |
| 1036 // the normalization buffer if something like this happens | |
| 1037 + | |
| 1038 // Move Jamos into normalization buffer | |
| 1039 - /* | |
| 1040 - Move the Jamos into the | |
| 1041 - normalization buffer | |
| 1042 - */ | |
| 1043 UChar *tempbuffer = source->writableBuffer.getBuffer(5); | |
| 1044 - int32_t tempbufferLength; | |
| 1045 + int32_t tempbufferLength, jamoOffset; | |
| 1046 tempbuffer[0] = 0; | |
| 1047 tempbuffer[1] = (UChar)L; | |
| 1048 tempbuffer[2] = (UChar)V; | |
| 1049 @@ -3984,16 +3989,30 @@ | |
| 1050 } | |
| 1051 source->writableBuffer.releaseBuffer(tempbufferLength); | |
| 1052 | |
| 1053 - /* | |
| 1054 - Indicate where to continue in main input string after exhau
sting | |
| 1055 - the writableBuffer | |
| 1056 - */ | |
| 1057 + // Indicate where to continue in main input string after ex
hausting the writableBuffer | |
| 1058 if (source->pos == source->string) { | |
| 1059 + jamoOffset = 0; | |
| 1060 source->fcdPosition = NULL; | |
| 1061 } else { | |
| 1062 + jamoOffset = source->pos - source->string; | |
| 1063 source->fcdPosition = source->pos-1; | |
| 1064 } | |
| 1065 + | |
| 1066 + // Append offsets for the additional cha
rs | |
| 1067 + // (not the 0, and not the L whose offse
ts match the original Hangul) | |
| 1068 + int32_t jamoRemaining = tempbufferLength - 2; | |
| 1069 + jamoOffset++; // appended offsets should match end of origi
nal Hangul | |
| 1070 + while (jamoRemaining-- > 0) { | |
| 1071 + source->appendOffset(jamoOffset, *status); | |
| 1072 + } | |
| 1073 | |
| 1074 + source->offsetRepeatValue = jamoOffset; | |
| 1075 + | |
| 1076 + source->offsetReturn = source->offsetStore - 1; | |
| 1077 + if (source->offsetReturn == source->offsetBuffer) { | |
| 1078 + source->offsetStore = source->offsetBuffer; | |
| 1079 + } | |
| 1080 + | |
| 1081 source->pos = source->writableBuffer.getTermi
natedBuffer() + tempbufferLength; | |
| 1082 source->origFlags = source->flags; | |
| 1083 source->flags |= UCOL_ITER_INNORMBUF; | |
| OLD | NEW |