OLD | NEW |
| (Empty) |
1 /******************************************************************** | |
2 * COPYRIGHT: | |
3 * Copyright (c) 1998-2014, International Business Machines Corporation and | |
4 * others. All Rights Reserved. | |
5 ********************************************************************/ | |
6 /* | |
7 * File utf8tst.c | |
8 * | |
9 * Modification History: | |
10 * | |
11 * Date Name Description | |
12 * 07/24/2000 Madhu Creation | |
13 ******************************************************************************* | |
14 */ | |
15 | |
16 #include "unicode/utypes.h" | |
17 #include "unicode/utf8.h" | |
18 #include "cmemory.h" | |
19 #include "cintltst.h" | |
20 | |
21 /* lenient UTF-8 ------------------------------------------------------------ */ | |
22 | |
23 /* | |
24 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate | |
25 * code points with their "natural" encoding. | |
26 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of | |
27 * single surrogates. | |
28 * | |
29 * This is not conformant with UTF-8. | |
30 * | |
31 * Supplementary code points may be encoded as pairs of 3-byte sequences, but | |
32 * the macros below do not attempt to assemble such pairs. | |
33 */ | |
34 | |
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 *   s      - byte buffer; it is cast to const uint8_t * for multi-byte decoding
 *   i      - offset lvalue; post-incremented past the first byte, and moved
 *            further by utf8_nextCharSafeBody() for lead bytes
 *   length - buffer length, passed on as int32_t
 *   c      - output lvalue: the byte itself if it is below 0x80, U_SENTINEL if
 *            the byte is >=0x80 but not a lead byte, otherwise the value
 *            returned by utf8_nextCharSafeBody() called with strict=-2
 *
 * Every parameter is parenthesized in the expansion so that expression
 * arguments keep their intended grouping.
 * The macro expands to a braced block and must be used as a statement.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
45 | |
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 *   s     - byte buffer; it is cast to const uint8_t * for multi-byte decoding
 *   start - lower bound passed to utf8_prevCharSafeBody()
 *   i     - offset lvalue; pre-decremented to the byte that is read, and moved
 *           further by utf8_prevCharSafeBody() for bytes in 0x80..0xbf
 *   c     - output lvalue: the byte itself if it is below 0x80, U_SENTINEL if
 *           the byte is above 0xbf, otherwise the value returned by
 *           utf8_prevCharSafeBody() called with strict=-2
 *
 * Every parameter is parenthesized in the expansion so that expression
 * arguments keep their intended grouping.
 * The macro expands to a braced block and must be used as a statement.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
56 | |
57 /* -------------------------------------------------------------------------- */ | |
58 | |
59 static void printUChars(const uint8_t *uchars, int16_t len); | |
60 | |
61 static void TestCodeUnitValues(void); | |
62 static void TestCharLength(void); | |
63 static void TestGetChar(void); | |
64 static void TestNextPrevChar(void); | |
65 static void TestNulTerminated(void); | |
66 static void TestNextPrevNonCharacters(void); | |
67 static void TestNextPrevCharUnsafe(void); | |
68 static void TestFwdBack(void); | |
69 static void TestFwdBackUnsafe(void); | |
70 static void TestSetChar(void); | |
71 static void TestSetCharUnsafe(void); | |
72 static void TestAppendChar(void); | |
73 static void TestAppend(void); | |
74 static void TestSurrogates(void); | |
75 | |
76 void addUTF8Test(TestNode** root); | |
77 | |
78 void | |
79 addUTF8Test(TestNode** root) | |
80 { | |
81 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues"); | |
82 addTest(root, &TestCharLength, "utf8tst/TestCharLength"); | |
83 addTest(root, &TestGetChar, "utf8tst/TestGetChar"); | |
84 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar"); | |
85 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated"); | |
86 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacte
rs"); | |
87 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe"
); | |
88 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack"); | |
89 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe"); | |
90 addTest(root, &TestSetChar, "utf8tst/TestSetChar"); | |
91 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe"); | |
92 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar"); | |
93 addTest(root, &TestAppend, "utf8tst/TestAppend"); | |
94 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates"); | |
95 } | |
96 | |
97 static void TestCodeUnitValues() | |
98 { | |
99 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0
xfd, 0x80, 0x81, 0xbc, 0xbe,}; | |
100 | |
101 int16_t i; | |
102 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){ | |
103 uint8_t c=codeunit[i]; | |
104 log_verbose("Testing code unit value of %x\n", c); | |
105 if(i<4){ | |
106 if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_
IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){ | |
107 log_err("ERROR: 0x%02x is a single byte but results in single: %
c lead: %c trail: %c\n", | |
108 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
109 } | |
110 } else if(i< 8){ | |
111 if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_
IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){ | |
112 log_err("ERROR: 0x%02x is a lead byte but results in single: %c
lead: %c trail: %c\n", | |
113 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
114 } | |
115 } else if(i< 12){ | |
116 if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_
IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){ | |
117 log_err("ERROR: 0x%02x is a trail byte but results in single: %c
lead: %c trail: %c\n", | |
118 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n
', UTF8_IS_TRAIL(c) ? 'y' : 'n'); | |
119 } | |
120 } | |
121 } | |
122 } | |
123 | |
124 static void TestCharLength() | |
125 { | |
126 static const uint32_t codepoint[]={ | |
127 1, 0x0061, | |
128 1, 0x007f, | |
129 2, 0x016f, | |
130 2, 0x07ff, | |
131 3, 0x0865, | |
132 3, 0x20ac, | |
133 4, 0x20402, | |
134 4, 0x23456, | |
135 4, 0x24506, | |
136 4, 0x20402, | |
137 4, 0x10402, | |
138 3, 0xd7ff, | |
139 3, 0xe000, | |
140 | |
141 }; | |
142 | |
143 int16_t i; | |
144 UBool multiple; | |
145 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){ | |
146 UChar32 c=codepoint[i+1]; | |
147 if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uin
t16_t)codepoint[i]){ | |
148 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n",
c, codepoint[i], UTF8_CHAR_LENGTH(c)); | |
149 }else{ | |
150 log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_L
ENGTH(c)); | |
151 } | |
152 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); | |
153 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){ | |
154 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c); | |
155 } | |
156 } | |
157 } | |
158 | |
159 static void TestGetChar() | |
160 { | |
161 static const uint8_t input[]={ | |
162 /* code unit,*/ | |
163 0x61, | |
164 0x7f, | |
165 0xe4, | |
166 0xba, | |
167 0x8c, | |
168 0xF0, | |
169 0x90, | |
170 0x90, | |
171 0x81, | |
172 0xc0, | |
173 0x65, | |
174 0x31, | |
175 0x9a, | |
176 0xc9 | |
177 }; | |
178 static const UChar32 result[]={ | |
179 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */ | |
180 0x61, 0x61, 0x61, | |
181 0x7f, 0x7f, 0x7f, | |
182 0x4e8c, 0x4e8c, 0x4e8c, | |
183 0x4e8c, 0x4e8c, 0x4e8c , | |
184 0x4e8c, 0x4e8c, 0x4e8c, | |
185 0x10401, 0x10401, 0x10401 , | |
186 0x10401, 0x10401, 0x10401 , | |
187 0x10401, 0x10401, 0x10401 , | |
188 0x10401, 0x10401, 0x10401, | |
189 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
190 0x65, 0x65, 0x65, | |
191 0x31, 0x31, 0x31, | |
192 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
193 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1 | |
194 }; | |
195 uint16_t i=0; | |
196 UChar32 c, expected; | |
197 uint32_t offset=0; | |
198 | |
199 for(offset=0; offset<sizeof(input); offset++) { | |
200 if (offset < sizeof(input) - 1) { | |
201 UTF8_GET_CHAR_UNSAFE(input, offset, c); | |
202 if(c != result[i]){ | |
203 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expe
cted:%lx Got:%lx\n", offset, result[i], c); | |
204 | |
205 } | |
206 | |
207 U8_GET_UNSAFE(input, offset, c); | |
208 if(c != result[i]){ | |
209 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%l
x Got:%lx\n", offset, result[i], c); | |
210 | |
211 } | |
212 } | |
213 | |
214 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); | |
215 expected=result[i+1]; | |
216 if(c != expected){ | |
217 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
218 } | |
219 | |
220 U8_GET(input, 0, offset, sizeof(input), c); | |
221 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
222 if(c != expected){ | |
223 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
224 } | |
225 | |
226 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c); | |
227 if(expected<0) { expected=0xfffd; } | |
228 if(c != expected){ | |
229 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
230 } | |
231 | |
232 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE); | |
233 if(c != result[i+2]){ | |
234 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Ex
pected:%lx Got:%lx\n", offset, result[i+2], c); | |
235 } | |
236 | |
237 i=(uint16_t)(i+3); | |
238 } | |
239 } | |
240 | |
241 static void TestNextPrevChar() { | |
242 static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd
, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00}; | |
243 static const UChar32 result[]={ | |
244 /* next_unsafe next_safe_ns next_safe_s prev_unsafe pr
ev_safe_ns prev_safe_s */ | |
245 0x0061, 0x0061, 0x0061, 0x0000, 0x
0000, 0x0000, | |
246 0x10401, 0x10401, 0x10401, 0xf0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
247 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
248 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
249 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
250 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x
61, 0x61, | |
251 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
252 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
253 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
254 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
255 0x61, 0x61, 0x61, 0xc0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
256 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x
10401, 0x10401, | |
257 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UT
F_ERROR_VALUE, UTF_ERROR_VALUE, | |
258 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UT
F8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, | |
259 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UT
F8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, | |
260 0x0000, 0x0000, 0x0000, 0x0061, 0x
0061, 0x0061 | |
261 }; | |
262 static const int32_t movedOffset[]={ | |
263 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns
prev_safe_s */ | |
264 1, 1, 1, 15, 15,
15, | |
265 5, 5, 5, 14, 14 ,
14, | |
266 3, 3, 3, 9, 13,
13, | |
267 4, 4, 4, 9, 12,
12, | |
268 5, 5, 5, 9, 11,
11, | |
269 7, 7, 7, 10, 10,
10, | |
270 7, 7, 7, 9, 9,
9, | |
271 8, 9, 9, 7, 7,
7, | |
272 9, 9, 9, 7, 7,
7, | |
273 11, 10, 10, 5, 5,
5, | |
274 11, 11, 11, 5, 5,
5, | |
275 12, 12, 12, 1, 1,
1, | |
276 13, 13, 13, 1, 1,
1, | |
277 14, 14, 14, 1, 1,
1, | |
278 14, 15, 15, 1, 1,
1, | |
279 14, 16, 16, 0, 0,
0, | |
280 }; | |
281 /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the
test code */ | |
282 | |
283 UChar32 c, expected; | |
284 uint32_t i=0; | |
285 uint32_t offset=0; | |
286 int32_t setOffset=0; | |
287 for(offset=0; offset<sizeof(input); offset++){ | |
288 setOffset=offset; | |
289 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE); | |
290 if(setOffset != movedOffset[i+1]){ | |
291 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset corre
ctly at %d\n ExpectedOffset:%d Got %d\n", | |
292 offset, movedOffset[i+1], setOffset); | |
293 } | |
294 expected=result[i+1]; | |
295 if(c != expected){ | |
296 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
297 } | |
298 | |
299 setOffset=offset; | |
300 U8_NEXT(input, setOffset, sizeof(input), c); | |
301 if(setOffset != movedOffset[i+1]){ | |
302 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n
ExpectedOffset:%d Got %d\n", | |
303 offset, movedOffset[i+1], setOffset); | |
304 } | |
305 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
306 if(c != expected){ | |
307 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
308 } | |
309 | |
310 setOffset=offset; | |
311 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c); | |
312 if(setOffset != movedOffset[i+1]){ | |
313 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly
at %d\n ExpectedOffset:%d Got %d\n", | |
314 offset, movedOffset[i+1], setOffset); | |
315 } | |
316 if(expected<0) { expected=0xfffd; } | |
317 if(c != expected){ | |
318 log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
319 } | |
320 | |
321 setOffset=offset; | |
322 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); | |
323 if(setOffset != movedOffset[i+1]){ | |
324 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offs
et correctly at %d\n ExpectedOffset:%d Got %d\n", | |
325 offset, movedOffset[i+2], setOffset); | |
326 } | |
327 if(c != result[i+2]){ | |
328 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. E
xpected:%lx Got:%lx\n", offset, result[i+2], c); | |
329 } | |
330 | |
331 i=i+6; | |
332 } | |
333 | |
334 i=0; | |
335 for(offset=sizeof(input); offset > 0; --offset){ | |
336 setOffset=offset; | |
337 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); | |
338 if(setOffset != movedOffset[i+4]){ | |
339 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset corre
ctly at %d\n ExpectedOffset:%d Got %d\n", | |
340 offset, movedOffset[i+4], setOffset); | |
341 } | |
342 expected=result[i+4]; | |
343 if(c != expected){ | |
344 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%
lx Got:%lx\n", offset, expected, c); | |
345 } | |
346 | |
347 setOffset=offset; | |
348 U8_PREV(input, 0, setOffset, c); | |
349 if(setOffset != movedOffset[i+4]){ | |
350 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n
ExpectedOffset:%d Got %d\n", | |
351 offset, movedOffset[i+4], setOffset); | |
352 } | |
353 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; } | |
354 if(c != expected){ | |
355 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n
", offset, expected, c); | |
356 } | |
357 | |
358 setOffset=offset; | |
359 U8_PREV_OR_FFFD(input, 0, setOffset, c); | |
360 if(setOffset != movedOffset[i+4]){ | |
361 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly
at %d\n ExpectedOffset:%d Got %d\n", | |
362 offset, movedOffset[i+4], setOffset); | |
363 } | |
364 if(expected<0) { expected=0xfffd; } | |
365 if(c != expected){ | |
366 log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx G
ot:%lx\n", offset, expected, c); | |
367 } | |
368 | |
369 setOffset=offset; | |
370 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); | |
371 if(setOffset != movedOffset[i+5]){ | |
372 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offs
et correctly at %d\n ExpectedOffset:%d Got %d\n", | |
373 offset, movedOffset[i+5], setOffset); | |
374 } | |
375 if(c != result[i+5]){ | |
376 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. E
xpected:%lx Got:%lx\n", offset, result[i+5], c); | |
377 } | |
378 | |
379 i=i+6; | |
380 } | |
381 } | |
382 | |
383 /* keep this in sync with utf16tst.c's TestNulTerminated() */ | |
384 static void TestNulTerminated() { | |
385 static const uint8_t input[]={ | |
386 /* 0 */ 0x61, | |
387 /* 1 */ 0xf0, 0x90, 0x90, 0x81, | |
388 /* 5 */ 0xc0, 0x80, | |
389 /* 7 */ 0xdf, 0x80, | |
390 /* 9 */ 0xc2, | |
391 /* 10 */ 0x62, | |
392 /* 11 */ 0xfd, 0xbe, | |
393 /* 13 */ 0xe0, 0xa0, 0x80, | |
394 /* 16 */ 0xe2, 0x82, 0xac, | |
395 /* 19 */ 0xf0, 0x90, 0x90, | |
396 /* 22 */ 0x00 | |
397 /* 23 */ | |
398 }; | |
399 static const UChar32 result[]={ | |
400 0x61, | |
401 0x10401, | |
402 U_SENTINEL, | |
403 0x7c0, | |
404 U_SENTINEL, | |
405 0x62, | |
406 U_SENTINEL, | |
407 0x800, | |
408 0x20ac, | |
409 U_SENTINEL, | |
410 0 | |
411 }; | |
412 | |
413 UChar32 c, c2, expected; | |
414 int32_t i0, i=0, j, k, expectedIndex; | |
415 int32_t cpIndex=0; | |
416 do { | |
417 i0=i; | |
418 U8_NEXT(input, i, -1, c); | |
419 expected=result[cpIndex]; | |
420 if(c!=expected) { | |
421 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected); | |
422 } | |
423 j=i0; | |
424 U8_NEXT_OR_FFFD(input, j, -1, c); | |
425 if(expected<0) { expected=0xfffd; } | |
426 if(c!=expected) { | |
427 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expect
ed); | |
428 } | |
429 if(j!=i) { | |
430 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to
%d\n", j, i); | |
431 } | |
432 j=i0; | |
433 U8_FWD_1(input, j, -1); | |
434 if(j!=i) { | |
435 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n",
j, i); | |
436 } | |
437 ++cpIndex; | |
438 /* | |
439 * Move by this many code points from the start. | |
440 * U8_FWD_N() stops at the end of the string, that is, at the NUL if nec
essary. | |
441 */ | |
442 expectedIndex= (c==0) ? i-1 : i; | |
443 k=0; | |
444 U8_FWD_N(input, k, -1, cpIndex); | |
445 if(k!=expectedIndex) { | |
446 log_err("U8_FWD_N(code points from 0) moved to index %d but expected
%d\n", k, expectedIndex); | |
447 } | |
448 } while(c!=0); | |
449 | |
450 i=0; | |
451 do { | |
452 j=i0=i; | |
453 U8_NEXT(input, i, -1, c); | |
454 do { | |
455 U8_GET(input, 0, j, -1, c2); | |
456 if(c2!=c) { | |
457 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0,
c, c2, j); | |
458 } | |
459 U8_GET_OR_FFFD(input, 0, j, -1, c2); | |
460 expected= (c>=0) ? c : 0xfffd; | |
461 if(c2!=expected) { | |
462 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFF
D(at %d)\n", i0, expected, c2, j); | |
463 } | |
464 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the co
de point */ | |
465 k=j+1; | |
466 U8_SET_CP_LIMIT(input, 0, k, -1); | |
467 if(k!=i) { | |
468 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to
%d\n", i, j+1, k); | |
469 } | |
470 } while(++j<i); | |
471 } while(c!=0); | |
472 } | |
473 | |
474 static void TestNextPrevNonCharacters() { | |
475 /* test non-characters */ | |
476 static const uint8_t nonChars[]={ | |
477 0xef, 0xb7, 0x90, /* U+fdd0 */ | |
478 0xef, 0xbf, 0xbf, /* U+feff */ | |
479 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ | |
480 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ | |
481 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ | |
482 }; | |
483 | |
484 UChar32 ch; | |
485 int32_t idx; | |
486 | |
487 for(idx=0; idx<(int32_t)sizeof(nonChars);) { | |
488 U8_NEXT(nonChars, idx, sizeof(nonChars), ch); | |
489 if(!U_IS_UNICODE_NONCHAR(ch)) { | |
490 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); | |
491 } | |
492 } | |
493 for(idx=(int32_t)sizeof(nonChars); idx>0;) { | |
494 U8_PREV(nonChars, 0, idx, ch); | |
495 if(!U_IS_UNICODE_NONCHAR(ch)) { | |
496 log_err("U8_PREV(at %d) failed to read a non-character\n", idx); | |
497 } | |
498 } | |
499 } | |
500 | |
501 static void TestNextPrevCharUnsafe() { | |
502 /* | |
503 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries
. | |
504 * The behavior of _UNSAFE macros for ill-formed strings is undefined. | |
505 */ | |
506 static const uint8_t input[]={ | |
507 0x61, | |
508 0xf0, 0x90, 0x90, 0x81, | |
509 0xc0, 0x80, /* non-shortest form */ | |
510 0xe2, 0x82, 0xac, | |
511 0xc2, 0xa1, | |
512 0xf4, 0x8f, 0xbf, 0xbf, | |
513 0x00 | |
514 }; | |
515 static const UChar32 codePoints[]={ | |
516 0x61, | |
517 0x10401, | |
518 0, | |
519 0x20ac, | |
520 0xa1, | |
521 0x10ffff, | |
522 0 | |
523 }; | |
524 | |
525 UChar32 c; | |
526 int32_t i; | |
527 uint32_t offset; | |
528 for(i=0, offset=0; offset<sizeof(input); ++i) { | |
529 UTF8_NEXT_CHAR_UNSAFE(input, offset, c); | |
530 if(c != codePoints[i]){ | |
531 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expecte
d:%lx Got:%lx\n", | |
532 offset, codePoints[i], c); | |
533 } | |
534 } | |
535 for(i=0, offset=0; offset<sizeof(input); ++i) { | |
536 U8_NEXT_UNSAFE(input, offset, c); | |
537 if(c != codePoints[i]){ | |
538 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx G
ot:%lx\n", | |
539 offset, codePoints[i], c); | |
540 } | |
541 } | |
542 | |
543 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ | |
544 UTF8_PREV_CHAR_UNSAFE(input, offset, c); | |
545 if(c != codePoints[i]){ | |
546 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expect
ed:%lx Got:%lx\n", | |
547 offset, codePoints[i], c); | |
548 } | |
549 } | |
550 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ | |
551 U8_PREV_UNSAFE(input, offset, c); | |
552 if(c != codePoints[i]){ | |
553 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx
Got:%lx\n", | |
554 offset, codePoints[i], c); | |
555 } | |
556 } | |
557 } | |
558 | |
/*
 * Steps through a fixed buffer with the safe forward/backward macros
 * (UTF8_FWD_1_SAFE, U8_FWD_1, UTF8_BACK_1_SAFE, U8_BACK_1 and their _N
 * variants) and checks the offset reached after every step.
 *
 * Each old-style macro and its U8_ counterpart are checked against the same
 * table of expected offsets.
 */
static void TestFwdBack() {
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    /* Expected offsets after each single step forward from 0 / backward from the end. */
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* Step counts for the _N macros and the cumulative offsets expected after each. */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    /*
     * Restart from the beginning of the buffer. Without this reset the offset
     * is still at the end from the loop above, and U8_FWD_1 would never run.
     */
    offsafe=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* The _N loops do not reset the offset between steps, so the expected offsets are cumulative. */
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* The message uses %ld, so pass a long (assuming log_err is printf-style). */
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* The message uses %ld, so pass a long (assuming log_err is printf-style). */
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }
}
642 | |
/*
 * Steps through a fixed buffer with the _UNSAFE forward/backward macros
 * (single-step and N-step, old-style and U8_) and checks that every step
 * lands on the expected entry of boundaries[].
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    /* Expected offsets, from the start of the buffer to its end. */
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t offset;
    int32_t b;

    /* Single steps forward: the first step is expected to reach boundaries[1]. */
    b=1;
    offset=0;
    while(offset<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, offset);
        if(offset != boundaries[b]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
        ++b;
    }
    b=1;
    offset=0;
    while(offset<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, offset);
        if(offset != boundaries[b]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
        ++b;
    }

    /* Single steps backward: the first step is expected to reach the second-to-last boundary. */
    b=UPRV_LENGTHOF(boundaries)-2;
    offset=UPRV_LENGTHOF(input);
    while(offset>0) {
        UTF8_BACK_1_UNSAFE(input, offset);
        if(offset != boundaries[b]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
        --b;
    }
    b=UPRV_LENGTHOF(boundaries)-2;
    offset=UPRV_LENGTHOF(input);
    while(offset>0) {
        U8_BACK_1_UNSAFE(input, offset);
        if(offset != boundaries[b]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
        --b;
    }

    /* b steps forward from offset 0 are expected to reach boundaries[b]. */
    for(b=0; b<UPRV_LENGTHOF(boundaries); ++b) {
        offset=0;
        UTF8_FWD_N_UNSAFE(input, offset, b);
        if(offset != boundaries[b]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
    }
    for(b=0; b<UPRV_LENGTHOF(boundaries); ++b) {
        offset=0;
        U8_FWD_N_UNSAFE(input, offset, b);
        if(offset != boundaries[b]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[b], offset);
        }
    }

    /* b steps backward from the end are expected to reach the b-th boundary counted from the last. */
    for(b=0; b<UPRV_LENGTHOF(boundaries); ++b) {
        int32_t fromEnd=UPRV_LENGTHOF(boundaries)-1-b;
        offset=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, offset, b);
        if(offset != boundaries[fromEnd]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[fromEnd], offset);
        }
    }
    for(b=0; b<UPRV_LENGTHOF(boundaries); ++b) {
        int32_t fromEnd=UPRV_LENGTHOF(boundaries)-1-b;
        offset=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, offset, b);
        if(offset != boundaries[fromEnd]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[fromEnd], offset);
        }
    }
}
719 | |
/*
 * For every offset of a fixed buffer, checks where the safe "set to code
 * point start" and "set to code point limit" macros move the offset
 * (old-style and U8_ spellings).
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    /* Expected offsets, indexed by the starting offset. */
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t row;
    int32_t offset, setOffset=0;

    /* row always equals offset; it is kept as a separate unsigned table index. */
    for(row=0, offset=0; offset<=UPRV_LENGTHOF(input); offset++, row++){
        /* The start macros are only exercised for offsets inside the buffer. */
        if (offset<UPRV_LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[row]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[row], setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[row]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[row], setOffset);
            }
        }

        /* The limit macros are also exercised at offset == length. */
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[row]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[row], setOffset);
        }

        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[row]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[row], setOffset);
        }
    }
}
760 | |
/*
 * For every offset of a fixed buffer, checks where the _UNSAFE "set to code
 * point start" and "set to code point limit" macros move the offset
 * (old-style and U8_ spellings).
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    /* Expected offsets, indexed by the starting offset. */
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    uint32_t row;
    int32_t offset, setOffset=0;

    /* row always equals offset; it is kept as a separate unsigned table index. */
    for(row=0, offset=0; offset<=UPRV_LENGTHOF(input); offset++, row++){
        /* The start macros are only exercised for offsets inside the buffer. */
        if (offset<UPRV_LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[row]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[row], setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[row]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[row], setOffset);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[row]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[row], setOffset);
            }

            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[row]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[row], setOffset);
            }
        }
    }
}
803 | |
/*
 * Exercises UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE.
 *
 * Every test case starts from a fresh copy of s[], appends one code point at
 * the given position, and then checks
 *   - the offset the macro left behind (movedOffset[]), and
 *   - the first UPRV_LENGTHOF(s) bytes of the buffer (result[]).
 *
 * All tables are laid out as two halves of equal length: the first half is
 * run through the unsafe macro, the second half through the safe macro.
 */
static void TestAppendChar(){
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
        /* append-position(unsafe), CHAR to be appended */
        0,  0x10401,
        2,  0x0028,
        2,  0x007f,
        3,  0xd801,
        1,  0x20402,
        8,  0x10401,
        5,  0xc0,
        5,  0xc1,
        5,  0xfd,
        6,  0x80,
        6,  0x81,
        6,  0xbf,
        7,  0xfe,

        /* append-position(safe), CHAR to be appended */
        0,  0x10401,
        2,  0x0028,
        3,  0x7f,
        3,  0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,  0x20402,
        9,  0x10401,
        5,  0xc0,
        5,  0xc1,
        5,  0xfd,
        6,  0x80,
        6,  0x81,
        6,  0xbf,
        7,  0xfe,

    };
    static const uint16_t movedOffset[]={
        /* offset-moved-to(unsafe) */
        4,              /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

        /* offset-moved-to(safe) */
        4,              /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* Number of leading cases that go through the unsafe macro (first half of the tables). */
    const uint16_t unsafeCount=(uint16_t)(UPRV_LENGTHOF(movedOffset)/2);
    uint16_t i, count=0;
    /* One byte larger than s[]: the unsafe append at position 8 writes 4 bytes (indexes 8..11). */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);
    const char *macroName;

    /* test[] holds (position, code point) pairs, hence the step of 2. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];

        if(count<unsafeCount){
            macroName="UTF8_APPEND_CHAR_UNSAFE";
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
        }else{
            macroName="UTF8_APPEND_CHAR_SAFE";
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
        }

        if(offset != movedOffset[count]){
            /* offset is unsigned 32-bit; cast so that it matches the %d conversion. */
            log_err("ERROR: %s failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                    macroName, count, movedOffset[count], (int)offset);
        }
        if(uprv_memcmp(str, result[count], size) !=0){
            log_err("ERROR: %s failed for count=%d. \nExpected:", macroName, count);
            printUChars(result[count], size);
            log_err("\nGot: ");
            printUChars(str, size);
            log_err("\n");
        }
        count++;
    }
}
972 | |
973 static void TestAppend() { | |
974 static const UChar32 codePoints[]={ | |
975 0x61, 0xdf, 0x901, 0x3040, | |
976 0xac00, 0xd800, 0xdbff, 0xdcde, | |
977 0xdffd, 0xe000, 0xffff, 0x10000, | |
978 0x12345, 0xe0021, 0x10ffff, 0x110000, | |
979 0x234567, 0x7fffffff, -1, -1000, | |
980 0, 0x400 | |
981 }; | |
982 static const uint8_t expectUnsafe[]={ | |
983 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
984 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9
e, | |
985 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x8
0, 0x80, | |
986 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf
, /* not 0x110000 */ | |
987 /* none from this line */ | |
988 0, 0xd0, 0x80 | |
989 }, expectSafe[]={ | |
990 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, | |
991 0xea, 0xb0, 0x80, /* no surrogates */ | |
992 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0
x80, 0x80, | |
993 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf
, /* not 0x110000 */ | |
994 /* none from this line */ | |
995 0, 0xd0, 0x80 | |
996 }; | |
997 | |
998 uint8_t buffer[100]; | |
999 UChar32 c; | |
1000 int32_t i, length; | |
1001 UBool isError, expectIsError, wrongIsError; | |
1002 | |
1003 length=0; | |
1004 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { | |
1005 c=codePoints[i]; | |
1006 if(c<0 || 0x10ffff<c) { | |
1007 continue; /* skip non-code points for U8_APPEND_UNSAFE */ | |
1008 } | |
1009 | |
1010 U8_APPEND_UNSAFE(buffer, length, c); | |
1011 } | |
1012 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, le
ngth)) { | |
1013 log_err("U8_APPEND_UNSAFE did not generate the expected output\n"); | |
1014 } | |
1015 | |
1016 length=0; | |
1017 wrongIsError=FALSE; | |
1018 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) { | |
1019 c=codePoints[i]; | |
1020 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c); | |
1021 isError=FALSE; | |
1022 | |
1023 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError); | |
1024 wrongIsError|= isError!=expectIsError; | |
1025 } | |
1026 if(wrongIsError) { | |
1027 log_err("U8_APPEND did not set isError correctly\n"); | |
1028 } | |
1029 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length
)) { | |
1030 log_err("U8_APPEND did not generate the expected output\n"); | |
1031 } | |
1032 } | |
1033 | |
1034 static void | |
1035 TestSurrogates() { | |
1036 static const uint8_t b[]={ | |
1037 0xc3, 0x9f, /* 00DF */ | |
1038 0xed, 0x9f, 0xbf, /* D7FF */ | |
1039 0xed, 0xa0, 0x81, /* D801 */ | |
1040 0xed, 0xbf, 0xbe, /* DFFE */ | |
1041 0xee, 0x80, 0x80, /* E000 */ | |
1042 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */ | |
1043 }; | |
1044 static const UChar32 cp[]={ | |
1045 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe | |
1046 }; | |
1047 | |
1048 UChar32 cu, cs, cl; | |
1049 int32_t i, j, k, iu, is, il, length; | |
1050 | |
1051 k=0; /* index into cp[] */ | |
1052 length=UPRV_LENGTHOF(b); | |
1053 for(i=0; i<length;) { | |
1054 j=i; | |
1055 U8_NEXT_UNSAFE(b, j, cu); | |
1056 iu=j; | |
1057 | |
1058 j=i; | |
1059 U8_NEXT(b, j, length, cs); | |
1060 is=j; | |
1061 | |
1062 j=i; | |
1063 L8_NEXT(b, j, length, cl); | |
1064 il=j; | |
1065 | |
1066 if(cu!=cp[k]) { | |
1067 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (lon
g)cu, (long)cp[k]); | |
1068 } | |
1069 | |
1070 /* U8_NEXT() returns <0 for surrogate code points */ | |
1071 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
1072 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (
long)cu); | |
1073 } | |
1074 | |
1075 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */ | |
1076 if(cl!=cu) { | |
1077 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (
long)cu); | |
1078 } | |
1079 | |
1080 if(is!=iu || il!=iu) { | |
1081 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the inde
x correctly\n", (long)i, (long)i); | |
1082 } | |
1083 | |
1084 ++k; /* next code point */ | |
1085 i=iu; /* advance by one UTF-8 sequence */ | |
1086 } | |
1087 | |
1088 while(i>0) { | |
1089 --k; /* previous code point */ | |
1090 | |
1091 j=i; | |
1092 U8_PREV_UNSAFE(b, j, cu); | |
1093 iu=j; | |
1094 | |
1095 j=i; | |
1096 U8_PREV(b, 0, j, cs); | |
1097 is=j; | |
1098 | |
1099 j=i; | |
1100 L8_PREV(b, 0, j, cl); | |
1101 il=j; | |
1102 | |
1103 if(cu!=cp[k]) { | |
1104 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (lon
g)cu, (long)cp[k]); | |
1105 } | |
1106 | |
1107 /* U8_PREV() returns <0 for surrogate code points */ | |
1108 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) { | |
1109 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (
long)cu); | |
1110 } | |
1111 | |
1112 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */ | |
1113 if(cl!=cu) { | |
1114 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (
long)cu); | |
1115 } | |
1116 | |
1117 if(is!=iu || il !=iu) { | |
1118 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the inde
x correctly\n", (long)i, (long)i); | |
1119 } | |
1120 | |
1121 i=iu; /* go back by one UTF-8 sequence */ | |
1122 } | |
1123 } | |
1124 | |
/*
 * Writes the first len bytes at uchars through log_err(), each one formatted
 * as "0x%02x " (two hex digits followed by a space).
 * Nothing is written when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t idx=0;
    while(idx<len){
        log_err("0x%02x ", uchars[idx]);
        ++idx;
    }
}
OLD | NEW |