OLD | NEW |
| (Empty) |
1 /* | |
2 **************************************************************************** | |
3 * Copyright (c) 2005-2009, International Business Machines Corporation and * | |
4 * others. All Rights Reserved. * | |
5 **************************************************************************** | |
6 */ | |
7 | |
8 #include "unicode/utypes.h" | |
9 | |
10 #include "unicode/ucsdet.h" | |
11 #include "unicode/ucnv.h" | |
12 #include "unicode/ustring.h" | |
13 | |
14 #include "cintltst.h" | |
15 | |
16 #include <stdlib.h> | |
17 #include <string.h> | |
18 | |
19 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) | |
20 | |
21 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type)) | |
22 #define DELETE_ARRAY(array) free(array) | |
23 | |
24 static void TestConstruction(void); | |
25 static void TestUTF8(void); | |
26 static void TestUTF16(void); | |
27 static void TestC1Bytes(void); | |
28 static void TestInputFilter(void); | |
29 static void TestChaining(void); | |
30 static void TestBufferOverflow(void); | |
31 static void TestIBM424(void); | |
32 static void TestIBM420(void); | |
33 | |
34 void addUCsdetTest(TestNode** root); | |
35 | |
36 void addUCsdetTest(TestNode** root) | |
37 { | |
38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); | |
39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); | |
40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); | |
41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); | |
42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); | |
43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); | |
44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); | |
45 #if !UCONFIG_NO_LEGACY_CONVERSION | |
46 addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); | |
47 addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); | |
48 #endif | |
49 } | |
50 | |
51 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) | |
52 { | |
53 UErrorCode status; | |
54 char buffer[1024]; | |
55 char *dest, *destLimit = buffer + sizeof(buffer); | |
56 const UChar *srcLimit = src + length; | |
57 int32_t result = 0; | |
58 | |
59 do { | |
60 dest = buffer; | |
61 status = U_ZERO_ERROR; | |
62 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status
); | |
63 result += (int32_t) (dest - buffer); | |
64 } while (status == U_BUFFER_OVERFLOW_ERROR); | |
65 | |
66 return result; | |
67 } | |
68 | |
69 static char *extractBytes(const UChar *src, int32_t length, const char *codepage
, int32_t *byteLength) | |
70 { | |
71 UErrorCode status = U_ZERO_ERROR; | |
72 UConverter *cnv = ucnv_open(codepage, &status); | |
73 int32_t byteCount = preflight(src, length, cnv); | |
74 const UChar *srcLimit = src + length; | |
75 char *bytes = NEW_ARRAY(char, byteCount + 1); | |
76 char *dest = bytes, *destLimit = bytes + byteCount + 1; | |
77 | |
78 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); | |
79 ucnv_close(cnv); | |
80 | |
81 *byteLength = byteCount; | |
82 return bytes; | |
83 } | |
84 | |
/*
 * Releases a byte buffer through DELETE_ARRAY. The tests in this file use it
 * for buffers returned by extractBytes().
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
89 | |
90 static void TestConstruction(void) | |
91 { | |
92 UErrorCode status = U_ZERO_ERROR; | |
93 UCharsetDetector *csd = ucsdet_open(&status); | |
94 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); | |
95 const char *name; | |
96 int32_t count = uenum_count(e, &status); | |
97 int32_t i, length; | |
98 | |
99 for(i = 0; i < count; i += 1) { | |
100 name = uenum_next(e, &length, &status); | |
101 | |
102 if(name == NULL || length <= 0) { | |
103 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty
name!\n"); | |
104 } | |
105 } | |
106 /* one past the list of all names must return NULL */ | |
107 name = uenum_next(e, &length, &status); | |
108 if(name != NULL || length != 0 || U_FAILURE(status)) { | |
109 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-n
ull name!\n"); | |
110 } | |
111 | |
112 uenum_close(e); | |
113 ucsdet_close(csd); | |
114 } | |
115 | |
116 static void TestUTF8(void) | |
117 { | |
118 UErrorCode status = U_ZERO_ERROR; | |
119 static const char ss[] = "This is a string with some non-ascii characters th
at will " | |
120 "be converted to UTF-8, then shoved through the detection process
. " | |
121 "\\u0391\\u0392\\u0393\\u0394\\u0395" | |
122 "Sure would be nice if our source could contain Unicode directly!
"; | |
123 int32_t byteLength = 0, sLength = 0, dLength = 0; | |
124 UChar s[sizeof(ss)]; | |
125 char *bytes; | |
126 UCharsetDetector *csd = ucsdet_open(&status); | |
127 const UCharsetMatch *match; | |
128 UChar detected[sizeof(ss)]; | |
129 | |
130 sLength = u_unescape(ss, s, sizeof(ss)); | |
131 bytes = extractBytes(s, sLength, "UTF-8", &byteLength); | |
132 | |
133 ucsdet_setText(csd, bytes, byteLength, &status); | |
134 if (U_FAILURE(status)) { | |
135 log_err("status is %s\n", u_errorName(status)); | |
136 goto bail; | |
137 } | |
138 | |
139 match = ucsdet_detect(csd, &status); | |
140 | |
141 if (match == NULL) { | |
142 log_err("Detection failure for UTF-8: got no matches.\n"); | |
143 goto bail; | |
144 } | |
145 | |
146 dLength = ucsdet_getUChars(match, detected, sLength, &status); | |
147 | |
148 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { | |
149 log_err("Round-trip test failed!\n"); | |
150 } | |
151 | |
152 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ | |
153 | |
154 bail: | |
155 freeBytes(bytes); | |
156 ucsdet_close(csd); | |
157 } | |
158 | |
159 static void TestUTF16(void) | |
160 { | |
161 UErrorCode status = U_ZERO_ERROR; | |
162 /* Notice the BOM on the start of this string */ | |
163 static const UChar chars[] = { | |
164 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, | |
165 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, | |
166 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, | |
167 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, | |
168 0x064a, 0x062a, 0x0000}; | |
169 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); | |
170 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); | |
171 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); | |
172 UCharsetDetector *csd = ucsdet_open(&status); | |
173 const UCharsetMatch *match; | |
174 const char *name; | |
175 int32_t conf; | |
176 | |
177 ucsdet_setText(csd, beBytes, beLength, &status); | |
178 match = ucsdet_detect(csd, &status); | |
179 | |
180 if (match == NULL) { | |
181 log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); | |
182 goto try_le; | |
183 } | |
184 | |
185 name = ucsdet_getName(match, &status); | |
186 conf = ucsdet_getConfidence(match, &status); | |
187 | |
188 if (strcmp(name, "UTF-16BE") != 0) { | |
189 log_err("Encoding detection failure for UTF-16BE: got %s\n", name); | |
190 } | |
191 | |
192 if (conf != 100) { | |
193 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); | |
194 } | |
195 | |
196 try_le: | |
197 ucsdet_setText(csd, leBytes, leLength, &status); | |
198 match = ucsdet_detect(csd, &status); | |
199 | |
200 if (match == NULL) { | |
201 log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); | |
202 goto bail; | |
203 } | |
204 | |
205 name = ucsdet_getName(match, &status); | |
206 conf = ucsdet_getConfidence(match, &status); | |
207 | |
208 | |
209 if (strcmp(name, "UTF-16LE") != 0) { | |
210 log_err("Enconding detection failure for UTF-16LE: got %s\n", name); | |
211 } | |
212 | |
213 if (conf != 100) { | |
214 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); | |
215 } | |
216 | |
217 bail: | |
218 freeBytes(leBytes); | |
219 freeBytes(beBytes); | |
220 ucsdet_close(csd); | |
221 } | |
222 | |
223 static void TestC1Bytes(void) | |
224 { | |
225 #if !UCONFIG_NO_LEGACY_CONVERSION | |
226 UErrorCode status = U_ZERO_ERROR; | |
227 static const char ssISO[] = "This is a small sample of some English text. Ju
st enough to be sure that it detects correctly."; | |
228 static const char ssWindows[] = "This is another small sample of some Englis
h text. Just enough to be sure that it detects correctly. It also includes some
\\u201CC1\\u201D bytes."; | |
229 int32_t sISOLength = 0, sWindowsLength = 0; | |
230 UChar sISO[sizeof(ssISO)]; | |
231 UChar sWindows[sizeof(ssWindows)]; | |
232 int32_t lISO = 0, lWindows = 0; | |
233 char *bISO; | |
234 char *bWindows; | |
235 UCharsetDetector *csd = ucsdet_open(&status); | |
236 const UCharsetMatch *match; | |
237 const char *name; | |
238 | |
239 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO)); | |
240 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows)); | |
241 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); | |
242 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows)
; | |
243 | |
244 ucsdet_setText(csd, bWindows, lWindows, &status); | |
245 match = ucsdet_detect(csd, &status); | |
246 | |
247 if (match == NULL) { | |
248 log_err("English test with C1 bytes got no matches.\n"); | |
249 goto bail; | |
250 } | |
251 | |
252 name = ucsdet_getName(match, &status); | |
253 | |
254 if (strcmp(name, "windows-1252") != 0) { | |
255 log_data_err("English text with C1 bytes does not detect as windows-1252
, but as %s. (Are you missing data?)\n", name); | |
256 } | |
257 | |
258 ucsdet_setText(csd, bISO, lISO, &status); | |
259 match = ucsdet_detect(csd, &status); | |
260 | |
261 if (match == NULL) { | |
262 log_err("English text without C1 bytes got no matches.\n"); | |
263 goto bail; | |
264 } | |
265 | |
266 name = ucsdet_getName(match, &status); | |
267 | |
268 if (strcmp(name, "ISO-8859-1") != 0) { | |
269 log_err("English text without C1 bytes does not detect as ISO-8859-1, bu
t as %s\n", name); | |
270 } | |
271 | |
272 bail: | |
273 freeBytes(bWindows); | |
274 freeBytes(bISO); | |
275 | |
276 ucsdet_close(csd); | |
277 #endif | |
278 } | |
279 | |
280 static void TestInputFilter(void) | |
281 { | |
282 UErrorCode status = U_ZERO_ERROR; | |
283 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> U
n tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; | |
284 int32_t sLength = 0; | |
285 UChar s[sizeof(ss)]; | |
286 int32_t byteLength = 0; | |
287 char *bytes; | |
288 UCharsetDetector *csd = ucsdet_open(&status); | |
289 const UCharsetMatch *match; | |
290 const char *lang, *name; | |
291 | |
292 sLength = u_unescape(ss, s, sizeof(ss)); | |
293 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); | |
294 | |
295 ucsdet_enableInputFilter(csd, TRUE); | |
296 | |
297 if (!ucsdet_isInputFilterEnabled(csd)) { | |
298 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter
!\n"); | |
299 } | |
300 | |
301 | |
302 ucsdet_setText(csd, bytes, byteLength, &status); | |
303 match = ucsdet_detect(csd, &status); | |
304 | |
305 if (match == NULL) { | |
306 log_err("Turning on the input filter resulted in no matches.\n"); | |
307 goto turn_off; | |
308 } | |
309 | |
310 name = ucsdet_getName(match, &status); | |
311 | |
312 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
313 log_err("Turning on the input filter resulted in %s rather than ISO-8859
-1\n", name); | |
314 } else { | |
315 lang = ucsdet_getLanguage(match, &status); | |
316 | |
317 if (lang == NULL || strcmp(lang, "fr") != 0) { | |
318 log_err("Input filter did not strip markup!\n"); | |
319 } | |
320 } | |
321 | |
322 turn_off: | |
323 ucsdet_enableInputFilter(csd, FALSE); | |
324 ucsdet_setText(csd, bytes, byteLength, &status); | |
325 match = ucsdet_detect(csd, &status); | |
326 | |
327 if (match == NULL) { | |
328 log_err("Turning off the input filter resulted in no matches.\n"); | |
329 goto bail; | |
330 } | |
331 | |
332 name = ucsdet_getName(match, &status); | |
333 | |
334 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
335 log_err("Turning off the input filter resulted in %s rather than ISO-885
9-1\n", name); | |
336 } else { | |
337 lang = ucsdet_getLanguage(match, &status); | |
338 | |
339 if (lang == NULL || strcmp(lang, "en") != 0) { | |
340 log_err("Unfiltered input did not detect as English!\n"); | |
341 } | |
342 } | |
343 | |
344 bail: | |
345 freeBytes(bytes); | |
346 ucsdet_close(csd); | |
347 } | |
348 | |
349 static void TestChaining(void) { | |
350 UErrorCode status = U_USELESS_COLLATOR_ERROR; | |
351 | |
352 ucsdet_open(&status); | |
353 ucsdet_setText(NULL, NULL, 0, &status); | |
354 ucsdet_getName(NULL, &status); | |
355 ucsdet_getConfidence(NULL, &status); | |
356 ucsdet_getLanguage(NULL, &status); | |
357 ucsdet_detect(NULL, &status); | |
358 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); | |
359 ucsdet_detectAll(NULL, NULL, &status); | |
360 ucsdet_getUChars(NULL, NULL, 0, &status); | |
361 ucsdet_getUChars(NULL, NULL, 0, &status); | |
362 ucsdet_close(NULL); | |
363 | |
364 /* All of this code should have done nothing. */ | |
365 if (status != U_USELESS_COLLATOR_ERROR) { | |
366 log_err("Status got changed to %s\n", u_errorName(status)); | |
367 } | |
368 } | |
369 | |
370 static void TestBufferOverflow(void) { | |
371 UErrorCode status = U_ZERO_ERROR; | |
372 static const char *testStrings[] = { | |
373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b", /* A partial ISO-2022 shift state at the end */ | |
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24", /* A partial ISO-2022 shift state at the end */ | |
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */ | |
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one
at the start */ | |
377 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */ | |
378 "\xa1", /* Could be a single byte shift-jis at the end */ | |
379 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */ | |
380 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but n
ow we have English creeping in. */ | |
381 }; | |
382 static const char *testResults[] = { | |
383 "windows-1252", | |
384 "windows-1252", | |
385 "windows-1252", | |
386 "windows-1252", | |
387 "ISO-2022-JP", | |
388 NULL, | |
389 NULL, | |
390 "ISO-8859-1" | |
391 }; | |
392 int32_t idx = 0; | |
393 UCharsetDetector *csd = ucsdet_open(&status); | |
394 const UCharsetMatch *match; | |
395 | |
396 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status); | |
397 | |
398 if (U_FAILURE(status)) { | |
399 log_err("Couldn't open detector. %s\n", u_errorName(status)); | |
400 goto bail; | |
401 } | |
402 | |
403 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) { | |
404 ucsdet_setText(csd, testStrings[idx], -1, &status); | |
405 match = ucsdet_detect(csd, &status); | |
406 | |
407 if (match == NULL) { | |
408 if (testResults[idx] != NULL) { | |
409 log_err("Unexpectedly got no results at index %d.\n", idx); | |
410 } | |
411 else { | |
412 log_verbose("Got no result as expected at index %d.\n", idx); | |
413 } | |
414 continue; | |
415 } | |
416 | |
417 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), t
estResults[idx]) != 0) { | |
418 log_err("Unexpectedly got %s instead of %s at index %d with confiden
ce %d.\n", | |
419 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_ge
tConfidence(match, &status)); | |
420 goto bail; | |
421 } | |
422 } | |
423 | |
424 bail: | |
425 ucsdet_close(csd); | |
426 } | |
427 | |
428 static void TestIBM424(void) | |
429 { | |
430 UErrorCode status = U_ZERO_ERROR; | |
431 | |
432 static const UChar chars[] = { | |
433 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05
D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, | |
434 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05
D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, | |
435 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05
DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, | |
436 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05
D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, | |
437 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05
E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, | |
438 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05
D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, | |
439 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05
E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, | |
440 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05
EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, | |
441 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00
22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, | |
442 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05
D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, | |
443 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05
D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, | |
444 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00
20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, | |
445 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00
20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, | |
446 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05
D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, | |
447 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05
DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, | |
448 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00
20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, | |
449 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05
D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 | |
450 }; | |
451 | |
452 static const UChar chars_reverse[] = { | |
453 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05
DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, | |
454 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05
E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, | |
455 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05
D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, | |
456 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05
E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, | |
457 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05
DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, | |
458 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05
D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, | |
459 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05
D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, | |
460 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05
DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, | |
461 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05
E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, | |
462 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05
E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, | |
463 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05
E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, | |
464 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05
DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, | |
465 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05
E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, | |
466 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05
D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, | |
467 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05
D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, | |
468 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00
20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, | |
469 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00
20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, | |
470 0x0000 | |
471 }; | |
472 | |
473 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = A
RRAY_SIZE(chars_reverse); | |
474 | |
475 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); | |
476 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); | |
477 | |
478 UCharsetDetector *csd = ucsdet_open(&status); | |
479 const UCharsetMatch *match; | |
480 const char *name; | |
481 | |
482 ucsdet_setText(csd, bytes, bLength, &status); | |
483 match = ucsdet_detect(csd, &status); | |
484 | |
485 if (match == NULL) { | |
486 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); | |
487 goto bail; | |
488 } | |
489 | |
490 name = ucsdet_getName(match, &status); | |
491 if (strcmp(name, "IBM424_rtl") != 0) { | |
492 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are yo
u missing data?)\n", name); | |
493 } | |
494 | |
495 ucsdet_setText(csd, bytes_r, brLength, &status); | |
496 match = ucsdet_detect(csd, &status); | |
497 | |
498 if (match == NULL) { | |
499 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); | |
500 goto bail; | |
501 } | |
502 | |
503 name = ucsdet_getName(match, &status); | |
504 if (strcmp(name, "IBM424_ltr") != 0) { | |
505 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are yo
u missing data?)\n", name); | |
506 } | |
507 | |
508 bail: | |
509 freeBytes(bytes); | |
510 freeBytes(bytes_r); | |
511 ucsdet_close(csd); | |
512 } | |
513 | |
514 static void TestIBM420(void) | |
515 { | |
516 UErrorCode status = U_ZERO_ERROR; | |
517 | |
518 static const UChar chars[] = { | |
519 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F,
0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, | |
520 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, | |
521 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627,
0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, | |
522 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645,
0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, | |
523 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627,
0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, | |
524 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, | |
525 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644,
0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, | |
526 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637,
0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, | |
527 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641,
0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, | |
528 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020,
0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, | |
529 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626,
0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, | |
530 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020,
0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, | |
531 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C,
0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, | |
532 0x0000 | |
533 }; | |
534 static const UChar chars_reverse[] = { | |
535 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627,
0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, | |
536 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631,
0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, | |
537 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627,
0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, | |
538 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646,
0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, | |
539 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F,
0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, | |
540 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A,
0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, | |
541 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648,
0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, | |
542 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644,
0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, | |
543 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645,
0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, | |
544 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020,
0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, | |
545 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020,
0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, | |
546 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646,
0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, | |
547 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646,
0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, | |
548 0x0000, | |
549 }; | |
550 | |
551 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = A
RRAY_SIZE(chars_reverse); | |
552 | |
553 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); | |
554 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); | |
555 | |
556 UCharsetDetector *csd = ucsdet_open(&status); | |
557 const UCharsetMatch *match; | |
558 const char *name; | |
559 | |
560 ucsdet_setText(csd, bytes, bLength, &status); | |
561 match = ucsdet_detect(csd, &status); | |
562 | |
563 if (match == NULL) { | |
564 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); | |
565 goto bail; | |
566 } | |
567 | |
568 name = ucsdet_getName(match, &status); | |
569 if (strcmp(name, "IBM420_rtl") != 0) { | |
570 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are yo
u missing data?)\n", name); | |
571 } | |
572 | |
573 ucsdet_setText(csd, bytes_r, brLength, &status); | |
574 match = ucsdet_detect(csd, &status); | |
575 | |
576 if (match == NULL) { | |
577 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); | |
578 goto bail; | |
579 } | |
580 | |
581 name = ucsdet_getName(match, &status); | |
582 if (strcmp(name, "IBM420_ltr") != 0) { | |
583 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are yo
u missing data?)\n", name); | |
584 } | |
585 | |
586 bail: | |
587 freeBytes(bytes); | |
588 freeBytes(bytes_r); | |
589 ucsdet_close(csd); | |
590 } | |
OLD | NEW |