| OLD | NEW |
| (Empty) |
| 1 /******************************************************************** | |
| 2 * COPYRIGHT: | |
| 3 * Copyright (c) 1998-2014, International Business Machines Corporation and | |
| 4 * others. All Rights Reserved. | |
| 5 ********************************************************************/ | |
| 6 /* | |
| 7 * File utf8tst.c | |
| 8 * | |
| 9 * Modification History: | |
| 10 * | |
| 11 * Date Name Description | |
| 12 * 07/24/2000 Madhu Creation | |
| 13 ******************************************************************************* | |
| 14 */ | |
| 15 | |
| 16 #include "unicode/utypes.h" | |
| 17 #include "unicode/utf8.h" | |
| 18 #include "cmemory.h" | |
| 19 #include "cintltst.h" | |
| 20 | |
| 21 /* lenient UTF-8 ------------------------------------------------------------ */ | |
| 22 | |
| 23 /* | |
| 24 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate | |
| 25 * code points with their "natural" encoding. | |
| 26 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of | |
| 27 * single surrogates. | |
| 28 * | |
| 29 * This is not conformant with UTF-8. | |
| 30 * | |
| 31 * Supplementary code points may be encoded as pairs of 3-byte sequences, but | |
| 32 * the macros below do not attempt to assemble such pairs. | |
| 33 */ | |
| 34 | |
/*
 * L8_NEXT: read one code point from s at index i and advance i past it.
 *
 *   s      - byte string; cast to (const uint8_t *) for the multi-byte path
 *   i      - index lvalue; post-incremented when the first byte is read and
 *            passed by address to utf8_nextCharSafeBody()
 *   length - string length; passed on as int32_t
 *   c      - lvalue that receives the result
 *
 * A first byte below 0x80 is returned as is. A byte for which U8_IS_LEAD()
 * is true is handed to utf8_nextCharSafeBody(); any other byte yields
 * U_SENTINEL.
 * NOTE(review): the -2 mode argument presumably selects the lenient
 * (surrogate-accepting) behavior described above - confirm against
 * utf8_nextCharSafeBody().
 *
 * Every macro argument is parenthesized at each use so that expression
 * arguments expand with the intended precedence.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
| 45 | |
/*
 * L8_PREV: step i back by one byte and read the code point that ends there.
 *
 *   s     - byte string; cast to (const uint8_t *) for the multi-byte path
 *   start - passed through to utf8_prevCharSafeBody()
 *           (presumably the lowest index that may be read - confirm)
 *   i     - index lvalue; pre-decremented before the byte is read and passed
 *           by address to utf8_prevCharSafeBody()
 *   c     - lvalue that receives the result
 *
 * A byte below 0x80 is returned as is. A byte in 0x80..0xbf is handed to
 * utf8_prevCharSafeBody(); any byte above 0xbf yields U_SENTINEL.
 * NOTE(review): the -2 mode argument presumably selects the lenient
 * (surrogate-accepting) behavior described above - confirm against
 * utf8_prevCharSafeBody().
 *
 * Every macro argument is parenthesized at each use so that expression
 * arguments expand with the intended precedence.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
| 56 | |
| 57 /* -------------------------------------------------------------------------- */ | |
| 58 | |
| 59 static void printUChars(const uint8_t *uchars, int16_t len); | |
| 60 | |
| 61 static void TestCodeUnitValues(void); | |
| 62 static void TestCharLength(void); | |
| 63 static void TestGetChar(void); | |
| 64 static void TestNextPrevChar(void); | |
| 65 static void TestNulTerminated(void); | |
| 66 static void TestNextPrevNonCharacters(void); | |
| 67 static void TestNextPrevCharUnsafe(void); | |
| 68 static void TestFwdBack(void); | |
| 69 static void TestFwdBackUnsafe(void); | |
| 70 static void TestSetChar(void); | |
| 71 static void TestSetCharUnsafe(void); | |
| 72 static void TestAppendChar(void); | |
| 73 static void TestAppend(void); | |
| 74 static void TestSurrogates(void); | |
| 75 | |
| 76 void addUTF8Test(TestNode** root); | |
| 77 | |
| 78 void | |
| 79 addUTF8Test(TestNode** root) | |
| 80 { | |
| 81 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues"); | |
| 82 addTest(root, &TestCharLength, "utf8tst/TestCharLength"); | |
| 83 addTest(root, &TestGetChar, "utf8tst/TestGetChar"); | |
| 84 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar"); | |
| 85 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated"); | |
| 86 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacte
rs"); | |
| 87 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe"
); | |
| 88 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack"); | |
| 89 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); | |
| 90 addTest(root, &TestSetChar, "utf8tst/TestSetChar"); | |
| 91 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); | |
| 92 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); | |
| 93 addTest(root, &TestAppend, "utf8tst/TestAppend"); | |
| 94 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); | |
| 95 } | |
| 96 | |
| 97 static void TestCodeUnitValues() | |
| 98 { | |
| 99 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0
xfd, 0x80, 0x81, 0xbc, 0xbe,}; | |
| 100 | |
| 101 int16_t i; | |
| 102 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){ | |
| 103 uint8_t c=codeunit[i]; | |
| 104 log_verbose("Testing code unit value of %x\n", c); | |
| 105 if(i<4){ | |
| 106 if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_
IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){ | |
| 107 log_err("ERROR: 0x%02x is a single byte but results in single: %
c lead: %c trail: %c\n", | |
| 108 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
| 109 } | |
| 110 } else if(i< 8){ | |
| 111 if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_
IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){ | |
| 112 log_err("ERROR: 0x%02x is a lead byte but results in single: %c
lead: %c trail: %c\n", | |
| 113 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
| 114 } | |
| 115 } else if(i< 12){ | |
| 116 if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_
IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){ | |
| 117 log_err("ERROR: 0x%02x is a trail byte but results in single: %c
lead: %c trail: %c\n", | |
| 118 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
| 119 } | |
| 120 } | |
| 121 } | |
| 122 } | |
| 123 | |
| 124 static void TestCharLength() | |
| 125 { | |
| 126 static const uint32_t codepoint[]={ | |
| 127 1, 0x0061, | |
| 128 1, 0x007f, | |
| 129 2, 0x016f, | |
| 130 2, 0x07ff, | |
| 131 3, 0x0865, | |
| 132 3, 0x20ac, | |
| 133 4, 0x20402, | |
| 134 4, 0x23456, | |
| 135 4, 0x24506, | |
| 136 4, 0x20402, | |
| 137 4, 0x10402, | |
| 138 3, 0xd7ff, | |
| 139 3, 0xe000, | |
| 140 | |
| 141 }; | |
| 142 | |
| 143 int16_t i; | |
| 144 UBool multiple; | |
| 145 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){ | |
| 146 UChar32 c=codepoint[i+1]; | |
| 147 if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uin
t16_t)codepoint[i]){ | |
| 148 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n",
c, codepoint[i], UTF8_CHAR_LENGTH(c)); | |
| 149 }else{ | |
| 150 log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_L
ENGTH(c)); | |
| 151 } | |
| 152 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); | |
| 153 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){ | |
| 154 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c); | |
| 155 } | |
| 156 } | |
| 157 } | |
| 158 | |
| 159 static void TestGetChar() | |
| 160 { | |
| 161 static const uint8_t input[]={ | |
| 162 /* code unit,*/ | |
| 163 0x61, | |
| 164 0x7f, | |
| 165 0xe4, | |
| 166 0xba, | |
| 167 0x8c, | |
| 168 0xF0, | |
| 169 0x90, | |
| 170 0x90, | |
| 171 0x81, | |
| 172 0xc0, | |
| 173 0x65, | |
| 174 0x31, | |
| 175 0x9a, | |
| 176 0xc9 | |
| 177 }; | |
| 178 static const UChar32 result[]={ | |
| 179 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */ | |
| 180 0x61, 0x61, 0x61, | |
| 181 0x7f, 0x7f, 0x7f, | |
| 182 0x4e8c, 0x4e8c, 0x4e8c, | |
| 183 0x4e8c, 0x4e8c, 0x4e8c , | |
| 184 0x4e8c, 0x4e8c, 0x4e8c, | |
| 185 0x10401, 0x10401, 0x10401 , | |
| 186 0x10401, 0x10401, 0x10401 , | |
| 187 0x10401, 0x10401, 0x10401 , | |
| 188 0x10401, 0x10401, 0x10401, | |
| 189 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 190 0x65, 0x65, 0x65, | |
| 191 0x31, 0x31, 0x31, | |
| 192 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 193 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1 | |
| 194 }; | |
| 195 uint16_t i=0; | |
| 196 UChar32 c, expected; | |
| 197 uint32_t offset=0; | |
| 198 | |
| 199 for(offset=0; offset<sizeof(input); offset++) { | |
| 200 if (offset < sizeof(input) - 1) { | |
| 201 UTF8_GET_CHAR_UNSAFE(input, offset, c); | |
| 202 if(c != result[i]){ | |
| 203 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expe
cted:%lx Got:%lx\n", offset, result[i], c); | |
| 204 | |
| 205 } | |
| 206 | |
| 207 U8_GET_UNSAFE(input, offset, c); | |
| 208 if(c != result[i]){ | |
| 209 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%l
x Got:%lx\n", offset, result[i], c); | |
| 210 | |
| 211 } | |
| 212 } | |
| 213 | |
| 214 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); | |
| 215 expected=result[i+1]; | |
| 216 if(c != expected){ | |
| 217 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
| 218 } | |
| 219 | |
| 220 U8_GET(input, 0, offset, sizeof(input), c); | |
| 221 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
| 222 if(c != expected){ | |
| 223 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
| 224 } | |
| 225 | |
| 226 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c); | |
| 227 if(expected<0) { expected=0xfffd; } | |
| 228 if(c != expected){ | |
| 229 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
| 230 } | |
| 231 | |
| 232 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE); | |
| 233 if(c != result[i+2]){ | |
| 234 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Ex
pected:%lx Got:%lx\n", offset, result[i+2], c); | |
| 235 } | |
| 236 | |
| 237 i=(uint16_t)(i+3); | |
| 238 } | |
| 239 } | |
| 240 | |
| 241 static void TestNextPrevChar() { | |
| 242 static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd
, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00}; | |
| 243 static const UChar32 result[]={ | |
| 244 /* next_unsafe next_safe_ns next_safe_s prev_unsafe pr
ev_safe_ns prev_safe_s */ | |
| 245 0x0061, 0x0061, 0x0061, 0x0000, 0x
0000, 0x0000, | |
| 246 0x10401, 0x10401, 0x10401, 0xf0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 247 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 248 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 249 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 250 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x
61, 0x61, | |
| 251 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 252 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
| 253 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 254 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
| 255 0x61, 0x61, 0x61, 0xc0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 256 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x
10401, 0x10401, | |
| 257 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UT
F_ERROR_VALUE, UTF_ERROR_VALUE, | |
| 258 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
| 259 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
| 260 0x0000, 0x0000, 0x0000, 0x0061, 0x
0061, 0x0061 | |
| 261 }; | |
| 262 static const int32_t movedOffset[]={ | |
| 263 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns
prev_safe_s */ | |
| 264 1, 1, 1, 15, 15,
15, | |
| 265 5, 5, 5, 14, 14 ,
14, | |
| 266 3, 3, 3, 9, 13,
13, | |
| 267 4, 4, 4, 9, 12,
12, | |
| 268 5, 5, 5, 9, 11,
11, | |
| 269 7, 7, 7, 10, 10,
10, | |
| 270 7, 7, 7, 9, 9,
9, | |
| 271 8, 9, 9, 7, 7,
7, | |
| 272 9, 9, 9, 7, 7,
7, | |
| 273 11, 10, 10, 5, 5,
5, | |
| 274 11, 11, 11, 5, 5,
5, | |
| 275 12, 12, 12, 1, 1,
1, | |
| 276 13, 13, 13, 1, 1,
1, | |
| 277 14, 14, 14, 1, 1,
1, | |
| 278 14, 15, 15, 1, 1,
1, | |
| 279 14, 16, 16, 0, 0,
0, | |
| 280 }; | |
| 281 /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the
test code */ | |
| 282 | |
| 283 UChar32 c, expected; | |
| 284 uint32_t i=0; | |
| 285 uint32_t offset=0; | |
| 286 int32_t setOffset=0; | |
| 287 for(offset=0; offset<sizeof(input); offset++){ | |
| 288 setOffset=offset; | |
| 289 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE); | |
| 290 if(setOffset != movedOffset[i+1]){ | |
| 291 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset corre
ctly at %d\n ExpectedOffset:%d Got %d\n", | |
| 292 offset, movedOffset[i+1], setOffset); | |
| 293 } | |
| 294 expected=result[i+1]; | |
| 295 if(c != expected){ | |
| 296 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
| 297 } | |
| 298 | |
| 299 setOffset=offset; | |
| 300 U8_NEXT(input, setOffset, sizeof(input), c); | |
| 301 if(setOffset != movedOffset[i+1]){ | |
| 302 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n
ExpectedOffset:%d Got %d\n", | |
| 303 offset, movedOffset[i+1], setOffset); | |
| 304 } | |
| 305 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
| 306 if(c != expected){ | |
| 307 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
| 308 } | |
| 309 | |
| 310 setOffset=offset; | |
| 311 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c); | |
| 312 if(setOffset != movedOffset[i+1]){ | |
| 313 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly
at %d\n ExpectedOffset:%d Got %d\n", | |
| 314 offset, movedOffset[i+1], setOffset); | |
| 315 } | |
| 316 if(expected<0) { expected=0xfffd; } | |
| 317 if(c != expected){ | |
| 318 log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
| 319 } | |
| 320 | |
| 321 setOffset=offset; | |
| 322 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); | |
| 323 if(setOffset != movedOffset[i+1]){ | |
| 324 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offs
et correctly at %d\n ExpectedOffset:%d Got %d\n", | |
| 325 offset, movedOffset[i+2], setOffset); | |
| 326 } | |
| 327 if(c != result[i+2]){ | |
| 328 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. E
xpected:%lx Got:%lx\n", offset, result[i+2], c); | |
| 329 } | |
| 330 | |
| 331 i=i+6; | |
| 332 } | |
| 333 | |
| 334 i=0; | |
| 335 for(offset=sizeof(input); offset > 0; --offset){ | |
| 336 setOffset=offset; | |
| 337 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); | |
| 338 if(setOffset != movedOffset[i+4]){ | |
| 339 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset corre
ctly at %d\n ExpectedOffset:%d Got %d\n", | |
| 340 offset, movedOffset[i+4], setOffset); | |
| 341 } | |
| 342 expected=result[i+4]; | |
| 343 if(c != expected){ | |
| 344 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
| 345 } | |
| 346 | |
| 347 setOffset=offset; | |
| 348 U8_PREV(input, 0, setOffset, c); | |
| 349 if(setOffset != movedOffset[i+4]){ | |
| 350 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n
ExpectedOffset:%d Got %d\n", | |
| 351 offset, movedOffset[i+4], setOffset); | |
| 352 } | |
| 353 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
| 354 if(c != expected){ | |
| 355 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
| 356 } | |
| 357 | |
| 358 setOffset=offset; | |
| 359 U8_PREV_OR_FFFD(input, 0, setOffset, c); | |
| 360 if(setOffset != movedOffset[i+4]){ | |
| 361 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly
at %d\n ExpectedOffset:%d Got %d\n", | |
| 362 offset, movedOffset[i+4], setOffset); | |
| 363 } | |
| 364 if(expected<0) { expected=0xfffd; } | |
| 365 if(c != expected){ | |
| 366 log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
| 367 } | |
| 368 | |
| 369 setOffset=offset; | |
| 370 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); | |
| 371 if(setOffset != movedOffset[i+5]){ | |
| 372 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offs
et correctly at %d\n ExpectedOffset:%d Got %d\n", | |
| 373 offset, movedOffset[i+5], setOffset); | |
| 374 } | |
| 375 if(c != result[i+5]){ | |
| 376 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. E
xpected:%lx Got:%lx\n", offset, result[i+5], c); | |
| 377 } | |
| 378 | |
| 379 i=i+6; | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 /* keep this in sync with utf16tst.c's TestNulTerminated() */ | |
| 384 static void TestNulTerminated() { | |
| 385 static const uint8_t input[]={ | |
| 386 /* 0 */ 0x61, | |
| 387 /* 1 */ 0xf0, 0x90, 0x90, 0x81, | |
| 388 /* 5 */ 0xc0, 0x80, | |
| 389 /* 7 */ 0xdf, 0x80, | |
| 390 /* 9 */ 0xc2, | |
| 391 /* 10 */ 0x62, | |
| 392 /* 11 */ 0xfd, 0xbe, | |
| 393 /* 13 */ 0xe0, 0xa0, 0x80, | |
| 394 /* 16 */ 0xe2, 0x82, 0xac, | |
| 395 /* 19 */ 0xf0, 0x90, 0x90, | |
| 396 /* 22 */ 0x00 | |
| 397 /* 23 */ | |
| 398 }; | |
| 399 static const UChar32 result[]={ | |
| 400 0x61, | |
| 401 0x10401, | |
| 402 U_SENTINEL, | |
| 403 0x7c0, | |
| 404 U_SENTINEL, | |
| 405 0x62, | |
| 406 U_SENTINEL, | |
| 407 0x800, | |
| 408 0x20ac, | |
| 409 U_SENTINEL, | |
| 410 0 | |
| 411 }; | |
| 412 | |
| 413 UChar32 c, c2, expected; | |
| 414 int32_t i0, i=0, j, k, expectedIndex; | |
| 415 int32_t cpIndex=0; | |
| 416 do { | |
| 417 i0=i; | |
| 418 U8_NEXT(input, i, -1, c); | |
| 419 expected=result[cpIndex]; | |
| 420 if(c!=expected) { | |
| 421 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected); | |
| 422 } | |
| 423 j=i0; | |
| 424 U8_NEXT_OR_FFFD(input, j, -1, c); | |
| 425 if(expected<0) { expected=0xfffd; } | |
| 426 if(c!=expected) { | |
| 427 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expect
ed); | |
| 428 } | |
| 429 if(j!=i) { | |
| 430 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to
%d\n", j, i); | |
| 431 } | |
| 432 j=i0; | |
| 433 U8_FWD_1(input, j, -1); | |
| 434 if(j!=i) { | |
| 435 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n",
j, i); | |
| 436 } | |
| 437 ++cpIndex; | |
| 438 /* | |
| 439 * Move by this many code points from the start. | |
| 440 * U8_FWD_N() stops at the end of the string, that is, at the NUL if nec
essary. | |
| 441 */ | |
| 442 expectedIndex= (c==0) ? i-1 : i; | |
| 443 k=0; | |
| 444 U8_FWD_N(input, k, -1, cpIndex); | |
| 445 if(k!=expectedIndex) { | |
| 446 log_err("U8_FWD_N(code points from 0) moved to index %d but expected
%d\n", k, expectedIndex); | |
| 447 } | |
| 448 } while(c!=0); | |
| 449 | |
| 450 i=0; | |
| 451 do { | |
| 452 j=i0=i; | |
| 453 U8_NEXT(input, i, -1, c); | |
| 454 do { | |
| 455 U8_GET(input, 0, j, -1, c2); | |
| 456 if(c2!=c) { | |
| 457 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0,
c, c2, j); | |
| 458 } | |
| 459 U8_GET_OR_FFFD(input, 0, j, -1, c2); | |
| 460 expected= (c>=0) ? c : 0xfffd; | |
| 461 if(c2!=expected) { | |
| 462 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFF
D(at %d)\n", i0, expected, c2, j); | |
| 463 } | |
| 464 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the co
de point */ | |
| 465 k=j+1; | |
| 466 U8_SET_CP_LIMIT(input, 0, k, -1); | |
| 467 if(k!=i) { | |
| 468 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to
%d\n", i, j+1, k); | |
| 469 } | |
| 470 } while(++j<i); | |
| 471 } while(c!=0); | |
| 472 } | |
| 473 | |
| 474 static void TestNextPrevNonCharacters() { | |
| 475 /* test non-characters */ | |
| 476 static const uint8_t nonChars[]={ | |
| 477 0xef, 0xb7, 0x90, /* U+fdd0 */ | |
| 478 0xef, 0xbf, 0xbf, /* U+feff */ | |
| 479 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ | |
| 480 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ | |
| 481 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ | |
| 482 }; | |
| 483 | |
| 484 UChar32 ch; | |
| 485 int32_t idx; | |
| 486 | |
| 487 for(idx=0; idx<(int32_t)sizeof(nonChars);) { | |
| 488 U8_NEXT(nonChars, idx, sizeof(nonChars), ch); | |
| 489 if(!U_IS_UNICODE_NONCHAR(ch)) { | |
| 490 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); | |
| 491 } | |
| 492 } | |
| 493 for(idx=(int32_t)sizeof(nonChars); idx>0;) { | |
| 494 U8_PREV(nonChars, 0, idx, ch); | |
| 495 if(!U_IS_UNICODE_NONCHAR(ch)) { | |
| 496 log_err("U8_PREV(at %d) failed to read a non-character\n", idx); | |
| 497 } | |
| 498 } | |
| 499 } | |
| 500 | |
| 501 static void TestNextPrevCharUnsafe() { | |
| 502 /* | |
| 503 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries
. | |
| 504 * The behavior of _UNSAFE macros for ill-formed strings is undefined. | |
| 505 */ | |
| 506 static const uint8_t input[]={ | |
| 507 0x61, | |
| 508 0xf0, 0x90, 0x90, 0x81, | |
| 509 0xc0, 0x80, /* non-shortest form */ | |
| 510 0xe2, 0x82, 0xac, | |
| 511 0xc2, 0xa1, | |
| 512 0xf4, 0x8f, 0xbf, 0xbf, | |
| 513 0x00 | |
| 514 }; | |
| 515 static const UChar32 codePoints[]={ | |
| 516 0x61, | |
| 517 0x10401, | |
| 518 0, | |
| 519 0x20ac, | |
| 520 0xa1, | |
| 521 0x10ffff, | |
| 522 0 | |
| 523 }; | |
| 524 | |
| 525 UChar32 c; | |
| 526 int32_t i; | |
| 527 uint32_t offset; | |
| 528 for(i=0, offset=0; offset<sizeof(input); ++i) { | |
| 529 UTF8_NEXT_CHAR_UNSAFE(input, offset, c); | |
| 530 if(c != codePoints[i]){ | |
| 531 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expecte
d:%lx Got:%lx\n", | |
| 532 offset, codePoints[i], c); | |
| 533 } | |
| 534 } | |
| 535 for(i=0, offset=0; offset<sizeof(input); ++i) { | |
| 536 U8_NEXT_UNSAFE(input, offset, c); | |
| 537 if(c != codePoints[i]){ | |
| 538 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx G
ot:%lx\n", | |
| 539 offset, codePoints[i], c); | |
| 540 } | |
| 541 } | |
| 542 | |
| 543 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ | |
| 544 UTF8_PREV_CHAR_UNSAFE(input, offset, c); | |
| 545 if(c != codePoints[i]){ | |
| 546 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expect
ed:%lx Got:%lx\n", | |
| 547 offset, codePoints[i], c); | |
| 548 } | |
| 549 } | |
| 550 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ | |
| 551 U8_PREV_UNSAFE(input, offset, c); | |
| 552 if(c != codePoints[i]){ | |
| 553 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx
Got:%lx\n", | |
| 554 offset, codePoints[i], c); | |
| 555 } | |
| 556 } | |
| 557 } | |
| 558 | |
/*
 * Checks the safe forward/backward movement macros, in their UTF8_*_SAFE and
 * U8_* spellings, against tables of expected offsets:
 *   - single steps forward from 0 (fwd_safe) and backward from the end
 *     (back_safe);
 *   - steps of Nvalue[k] code points forward (fwd_N_safe) and backward
 *     (back_N_safe), each continuing from where the previous step stopped.
 */
static void TestFwdBack() {
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /*
     * Restart from the beginning: without resetting offsafe the loop below
     * would never run, because the previous loop left it at sizeof(input).
     * NOTE(review): this reuses fwd_safe[], i.e. it assumes U8_FWD_1() moves
     * exactly like UTF8_FWD_1_SAFE() on this input - confirm.
     */
    i=0;
    offsafe=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    /* %ld needs a long argument; offsafe is 32 bits wide, hence the casts below. */
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }
}
| 642 | |
/*
 * Checks the unsafe forward/backward movement macros, in their UTF8_*_UNSAFE
 * and U8_*_UNSAFE spellings, against the list of code point boundaries of
 * input: single steps in both directions, then moves of n code points from
 * the start and from the end for every n in 0..UPRV_LENGTHOF(boundaries)-1.
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;    /* byte offset, moved by the macros */
    int32_t step;   /* index into boundaries, or number of code points to move */

    /* Single steps forward: each step must land on the next boundary. */
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* Single steps backward: each step must land on the previous boundary. */
    step=UPRV_LENGTHOF(boundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        --step;
    }
    step=UPRV_LENGTHOF(boundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        --step;
    }

    /* Moving step code points forward from 0 must reach boundaries[step]. */
    step=0;
    while(step<UPRV_LENGTHOF(boundaries)) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
    step=0;
    while(step<UPRV_LENGTHOF(boundaries)) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* Moving step code points backward from the end must reach the boundary that many entries before the last. */
    step=0;
    while(step<UPRV_LENGTHOF(boundaries)) {
        int32_t mirror=UPRV_LENGTHOF(boundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != boundaries[mirror]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirror], pos);
        }
        ++step;
    }
    step=0;
    while(step<UPRV_LENGTHOF(boundaries)) {
        int32_t mirror=UPRV_LENGTHOF(boundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != boundaries[mirror]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirror], pos);
        }
        ++step;
    }
}
| 719 | |
/*
 * Checks the safe "set to code point start/limit" macros, in their
 * UTF8_SET_CHAR_*_SAFE and U8_SET_CP_* spellings, at every offset of input.
 * The start macros are called for offsets 0..length-1, the limit macros for
 * offsets 0..length; start_safe[] and limit_safe[] hold the expected offset
 * after the call.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;
    int32_t offset=0, setOffset=0;

    /*
     * The %ld conversions below need long arguments; the offsets and table
     * entries are narrower, so they are cast explicitly.
     */
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }

        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}
| 760 | |
/*
 * Checks the unsafe "set to code point start/limit" macros, in their
 * UTF8_SET_CHAR_*_UNSAFE and U8_SET_CP_*_UNSAFE spellings.
 * The start macros are called for offsets 0..length-1, the limit macros for
 * offsets 1..length; start_unsafe[] and limit_unsafe[] hold the expected
 * offset after the call.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    uint32_t i=0;
    int32_t offset=0, setOffset=0;

    /*
     * The %ld conversions below need long arguments; the offsets and table
     * entries are narrower, so they are cast explicitly.
     */
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }
        }

        /*
         * The limit macros are skipped at offset 0 so that they cannot leave the array.
         * NOTE(review): presumably they inspect the byte before the offset,
         * which would be input[-1] here - confirm.
         */
        if (offset != 0) {
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }
        }

        i++;
    }
}
| 803 /* Table-driven check of UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE.
 * test[] holds (append position, code point) pairs. For each pair, s is
 * copied into str, the code point is appended at that position, and then
 * the resulting offset is compared with movedOffset[count] and the first
 * `size` bytes of str with result[count]; mismatches go to log_err.
 * NOTE(review): offset is uint32_t but is printed with %d - confirm. */ | |
| 804 static void TestAppendChar(){ | |
| 805 static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
0x69, 0x6a, 0x00}; | |
| 806 static const uint32_t test[]={ | |
| 807 /* append-position(unsafe), CHAR to be appended */ | |
| 808 0, 0x10401, | |
| 809 2, 0x0028, | |
| 810 2, 0x007f, | |
| 811 3, 0xd801, | |
| 812 1, 0x20402, | |
| 813 8, 0x10401, | |
| 814 5, 0xc0, | |
| 815 5, 0xc1, | |
| 816 5, 0xfd, | |
| 817 6, 0x80, | |
| 818 6, 0x81, | |
| 819 6, 0xbf, | |
| 820 7, 0xfe, | |
| 821 | |
| 822 /* append-position(safe), CHAR to be appended */ | |
| 823 0, 0x10401, | |
| 824 2, 0x0028, | |
| 825 3, 0x7f, | |
| 826 3, 0xd801, /* illegal for UTF-8 starting with U
nicode 3.2 */ | |
| 827 1, 0x20402, | |
| 828 9, 0x10401, | |
| 829 5, 0xc0, | |
| 830 5, 0xc1, | |
| 831 5, 0xfd, | |
| 832 6, 0x80, | |
| 833 6, 0x81, | |
| 834 6, 0xbf, | |
| 835 7, 0xfe, | |
| 836 | |
| 837 }; | |
| 838 static const uint16_t movedOffset[]={ | |
| 839 /* offset-moved-to(unsafe) */ | |
| 840 4, /*for append-pos: 0 , CHAR 0x10401*/ | |
| 841 3, | |
| 842 3, | |
| 843 6, | |
| 844 5, | |
| 845 12, | |
| 846 7, | |
| 847 7, | |
| 848 7, | |
| 849 8, | |
| 850 8, | |
| 851 8, | |
| 852 9, | |
| 853 | |
| 854 /* offset-moved-to(safe) */ | |
| 855 4, /*for append-pos: 0, CHAR 0x10401*/ | |
| 856 3, | |
| 857 4, | |
| 858 6, | |
| 859 5, | |
| 860 11, | |
| 861 7, | |
| 862 7, | |
| 863 7, | |
| 864 8, | |
| 865 8, | |
| 866 8, | |
| 867 9, | |
| 868 | |
| 869 }; | |
| 870 /* result[count]: expected first 11 bytes of str after test case number count. */ | |
| 871 static const uint8_t result[][11]={ | |
| 872 /*unsafe*/ | |
| 873 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 874 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 875 {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 876 {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 877 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 878 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90}, | |
| 879 | |
| 880 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00}, | |
| 881 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00}, | |
| 882 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00}, | |
| 883 | |
| 884 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00}, | |
| 885 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00}, | |
| 886 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00}, | |
| 887 | |
| 888 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00}, | |
| 889 /*safe*/ | |
| 890 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 891 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 892 {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 893 {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 894 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, | |
| 895 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*ge
ts UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/ | |
| 896 | |
| 897 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00}, | |
| 898 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00}, | |
| 899 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00}, | |
| 900 | |
| 901 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00}, | |
| 902 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00}, | |
| 903 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00}, | |
| 904 | |
| 905 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00}, | |
| 906 | |
| 907 }; | |
| 908 uint16_t i, count=0; | |
| 909 uint8_t str[12]; /* one byte larger than s; presumably room for the unsafe case expected to reach offset 12 - TODO confirm */ | |
| 910 uint32_t offset; | |
| 911 /* UChar32 c=0;*/ | |
| 912 uint16_t size=UPRV_LENGTHOF(s); | |
| 913 for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){ | |
| 914 uprv_memcpy(str, s, size); | |
| 915 offset=test[i]; | |
| 916 if(count<13){ /* cases 0..12 use the unsafe macro; later cases use the safe macro */ | |
| 917 UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]); | |
| 918 if(offset != movedOffset[count]){ | |
| 919 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offse
t correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n", | |
| 920 count, movedOffset[count], offset); | |
| 921 | |
| 922 } | |
| 923 if(uprv_memcmp(str, result[count], size) !=0){ | |
| 924 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nE
xpected:", count); | |
| 925 printUChars(result[count], size); | |
| 926 log_err("\nGot: "); | |
| 927 printUChars(str, size); | |
| 928 log_err("\n"); | |
| 929 } | |
| 930 }else{ | |
| 931 UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]); | |
| 932 if(offset != movedOffset[count]){ | |
| 933 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset
correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n", | |
| 934 count, movedOffset[count], offset); | |
| 935 | |
| 936 } | |
| 937 if(uprv_memcmp(str, result[count], size) !=0){ | |
| 938 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExp
ected:", count); | |
| 939 printUChars(result[count], size); | |
| 940 log_err("\nGot: "); | |
| 941 printUChars(str, size); | |
| 942 log_err("\n"); | |
| 943 } | |
| 944 /*call the API instead of MACRO | |
| 945 uprv_memcpy(str, s, size); | |
| 946 offset=test[i]; | |
| 947 c=test[i+1]; | |
| 948 if((uint32_t)(c)<=0x7f) { | |
| 949 (str)[(offset)++]=(uint8_t)(c); | |
| 950 } else { | |
| 951 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32
_t)(size), c); | |
| 952 } | |
| 953 if(offset != movedOffset[count]){ | |
| 954 log_err("ERROR: utf8_appendCharSafeBody() failed to move the off
set correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n", | |
| 955 count, movedOffset[count], offset); | |
| 956 | |
| 957 } | |
| 958 if(uprv_memcmp(str, result[count], size) !=0){ | |
| 959 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \
nExpected:", count); | |
| 960 printUChars(result[count], size); | |
| 961 printf("\nGot: "); | |
| 962 printUChars(str, size); | |
| 963 printf("\n"); | |
| 964 } | |
| 965 */ | |
| 966 } | |
| 967 count++; | |
| 968 } | |
| 969 | |
| 970 | |
| 971 } | |
| 972 /* Appends the entries of codePoints[] to a buffer, first with U8_APPEND_UNSAFE
 * and then with U8_APPEND, and compares the bytes produced with
 * expectUnsafe[] and expectSafe[] respectively. The U8_APPEND pass also
 * checks the isError flag reported for each entry. */ | |
| 973 static void TestAppend() { | |
| 974 static const UChar32 codePoints[]={ | |
| 975 0x61, 0xdf, 0x901, 0x3040, | |
| 976 0xac00, 0xd800, 0xdbff, 0xdcde, | |
| 977 0xdffd, 0xe000, 0xffff, 0x10000, | |
| 978 0x12345, 0xe0021, 0x10ffff, 0x110000, | |
| 979 0x234567, 0x7fffffff, -1, -1000, | |
| 980 0, 0x400 | |
| 981 }; | |
| 982 static const uint8_t expectUnsafe[]={ | |
| 983 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
| 984 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9
e, | |
| 985 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x8
0, 0x80, | |
| 986 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf
, /* not 0x110000 */ | |
| 987 /* none from this line */ | |
| 988 0, 0xd0, 0x80 | |
| 989 }, expectSafe[]={ | |
| 990 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
| 991 0xea, 0xb0, 0x80, /* no surrogates */ | |
| 992 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0
x80, 0x80, | |
| 993 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf
, /* not 0x110000 */ | |
| 994 /* none from this line */ | |
| 995 0, 0xd0, 0x80 | |
| 996 }; | |
| 997 | |
| 998 uint8_t buffer[100]; | |
| 999 UChar32 c; | |
| 1000 int32_t i, length; | |
| 1001 UBool isError, expectIsError, wrongIsError; | |
| 1002 /* Pass 1: U8_APPEND_UNSAFE; values below 0 or above 0x10ffff are never passed to it. */ | |
| 1003 length=0; | |
| 1004 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { | |
| 1005 c=codePoints[i]; | |
| 1006 if(c<0 || 0x10ffff<c) { | |
| 1007 continue; /* skip non-code points for U8_APPEND_UNSAFE */ | |
| 1008 } | |
| 1009 | |
| 1010 U8_APPEND_UNSAFE(buffer, length, c); | |
| 1011 } | |
| 1012 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, le
ngth)) { | |
| 1013 log_err("U8_APPEND_UNSAFE did not generate the expected output\n"); | |
| 1014 } | |
| 1015 /* Pass 2: U8_APPEND gets every entry; an error is expected for out-of-range values and surrogates. */ | |
| 1016 length=0; | |
| 1017 wrongIsError=FALSE; | |
| 1018 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { | |
| 1019 c=codePoints[i]; | |
| 1020 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c); | |
| 1021 isError=FALSE; | |
| 1022 | |
| 1023 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError); | |
| 1024 wrongIsError|= isError!=expectIsError; /* sticky: one mismatch anywhere fails the check below */ | |
| 1025 } | |
| 1026 if(wrongIsError) { | |
| 1027 log_err("U8_APPEND did not set isError correctly\n"); | |
| 1028 } | |
| 1029 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length
)) { | |
| 1030 log_err("U8_APPEND did not generate the expected output\n"); | |
| 1031 } | |
| 1032 } | |
| 1033 /* Decodes the byte sequences in b[] forward and then backward with three macro
 * families: U8_*_UNSAFE, U8_NEXT/U8_PREV, and the lenient L8_NEXT/L8_PREV
 * defined at the top of this file. Expectations checked per sequence:
 * the unsafe result equals cp[k]; the U8_ result is negative when the
 * unsafe result is a surrogate and equal to it otherwise; the L8_ result
 * equals the unsafe result; all three leave the index at the same place. */ | |
| 1034 static void | |
| 1035 TestSurrogates() { | |
| 1036 static const uint8_t b[]={ | |
| 1037 0xc3, 0x9f, /* 00DF */ | |
| 1038 0xed, 0x9f, 0xbf, /* D7FF */ | |
| 1039 0xed, 0xa0, 0x81, /* D801 */ | |
| 1040 0xed, 0xbf, 0xbe, /* DFFE */ | |
| 1041 0xee, 0x80, 0x80, /* E000 */ | |
| 1042 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */ | |
| 1043 }; | |
| 1044 static const UChar32 cp[]={ | |
| 1045 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe | |
| 1046 }; | |
| 1047 /* cu/iu, cs/is, cl/il: code point and index produced by the *_UNSAFE, U8_ and L8_ macro respectively. */ | |
| 1048 UChar32 cu, cs, cl; | |
| 1049 int32_t i, j, k, iu, is, il, length; | |
| 1050 | |
| 1051 k=0; /* index into cp[] */ | |
| 1052 length=UPRV_LENGTHOF(b); | |
| 1053 for(i=0; i<length;) { | |
| 1054 j=i; | |
| 1055 U8_NEXT_UNSAFE(b, j, cu); | |
| 1056 iu=j; | |
| 1057 | |
| 1058 j=i; | |
| 1059 U8_NEXT(b, j, length, cs); | |
| 1060 is=j; | |
| 1061 | |
| 1062 j=i; | |
| 1063 L8_NEXT(b, j, length, cl); | |
| 1064 il=j; | |
| 1065 | |
| 1066 if(cu!=cp[k]) { | |
| 1067 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (lon
g)cu, (long)cp[k]); | |
| 1068 } | |
| 1069 | |
| 1070 /* U8_NEXT() returns <0 for surrogate code points */ | |
| 1071 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
| 1072 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (
long)cu); | |
| 1073 } | |
| 1074 | |
| 1075 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */ | |
| 1076 if(cl!=cu) { | |
| 1077 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (
long)cu); | |
| 1078 } | |
| 1079 | |
| 1080 if(is!=iu || il!=iu) { | |
| 1081 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the inde
x correctly\n", (long)i, (long)i); | |
| 1082 } | |
| 1083 | |
| 1084 ++k; /* next code point */ | |
| 1085 i=iu; /* advance by one UTF-8 sequence */ | |
| 1086 } | |
| 1087 /* Second pass: walk backward from where the forward pass stopped, down to index 0. */ | |
| 1088 while(i>0) { | |
| 1089 --k; /* previous code point */ | |
| 1090 | |
| 1091 j=i; | |
| 1092 U8_PREV_UNSAFE(b, j, cu); | |
| 1093 iu=j; | |
| 1094 | |
| 1095 j=i; | |
| 1096 U8_PREV(b, 0, j, cs); | |
| 1097 is=j; | |
| 1098 | |
| 1099 j=i; | |
| 1100 L8_PREV(b, 0, j, cl); | |
| 1101 il=j; | |
| 1102 | |
| 1103 if(cu!=cp[k]) { | |
| 1104 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (lon
g)cu, (long)cp[k]); | |
| 1105 } | |
| 1106 | |
| 1107 /* U8_PREV() returns <0 for surrogate code points */ | |
| 1108 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
| 1109 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (
long)cu); | |
| 1110 } | |
| 1111 | |
| 1112 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */ | |
| 1113 if(cl!=cu) { | |
| 1114 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (
long)cu); | |
| 1115 } | |
| 1116 | |
| 1117 if(is!=iu || il !=iu) { | |
| 1118 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the inde
x correctly\n", (long)i, (long)i); | |
| 1119 } | |
| 1120 | |
| 1121 i=iu; /* go back by one UTF-8 sequence */ | |
| 1122 } | |
| 1123 } | |
| 1124 /* Writes the first len bytes at uchars through log_err, each formatted as
 * "0x%02x ". Despite the name, the elements are uint8_t bytes. */ | |
| 1125 static void printUChars(const uint8_t *uchars, int16_t len){ | |
| 1126 int16_t i=0; | |
| 1127 for(i=0; i<len; i++){ | |
| 1128 log_err("0x%02x ", *(uchars+i)); | |
| 1129 } | |
| 1130 } | |
| OLD | NEW |