OLD | NEW |
(Empty) | |
| 1 /* |
| 2 **************************************************************************** |
| 3 * Copyright (c) 2005-2009, International Business Machines Corporation and * |
| 4 * others. All Rights Reserved. * |
| 5 **************************************************************************** |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #include "unicode/ucsdet.h" |
| 11 #include "unicode/ucnv.h" |
| 12 #include "unicode/ustring.h" |
| 13 |
| 14 #include "cintltst.h" |
| 15 |
| 16 #include <stdlib.h> |
| 17 #include <string.h> |
| 18 |
/* Number of elements in a true array (not a pointer); the result is a size_t. */
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

/*
 * Allocate uninitialized storage for `count` objects of `type`.
 * The whole expansion is parenthesized so the cast applies to the malloc
 * result even when the macro is indexed or used inside a larger expression.
 * The result may be NULL.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
/* Release storage obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) free(array)
| 23 |
| 24 static void TestConstruction(void); |
| 25 static void TestUTF8(void); |
| 26 static void TestUTF16(void); |
| 27 static void TestC1Bytes(void); |
| 28 static void TestInputFilter(void); |
| 29 static void TestChaining(void); |
| 30 static void TestBufferOverflow(void); |
| 31 static void TestIBM424(void); |
| 32 static void TestIBM420(void); |
| 33 |
| 34 void addUCsdetTest(TestNode** root); |
| 35 |
| 36 void addUCsdetTest(TestNode** root) |
| 37 { |
| 38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); |
| 39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); |
| 40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); |
| 41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); |
| 42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); |
| 43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); |
| 44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); |
| 45 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 46 addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); |
| 47 addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); |
| 48 #endif |
| 49 } |
| 50 |
| 51 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) |
| 52 { |
| 53 UErrorCode status; |
| 54 char buffer[1024]; |
| 55 char *dest, *destLimit = buffer + sizeof(buffer); |
| 56 const UChar *srcLimit = src + length; |
| 57 int32_t result = 0; |
| 58 |
| 59 do { |
| 60 dest = buffer; |
| 61 status = U_ZERO_ERROR; |
| 62 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status
); |
| 63 result += (int32_t) (dest - buffer); |
| 64 } while (status == U_BUFFER_OVERFLOW_ERROR); |
| 65 |
| 66 return result; |
| 67 } |
| 68 |
| 69 static char *extractBytes(const UChar *src, int32_t length, const char *codepage
, int32_t *byteLength) |
| 70 { |
| 71 UErrorCode status = U_ZERO_ERROR; |
| 72 UConverter *cnv = ucnv_open(codepage, &status); |
| 73 int32_t byteCount = preflight(src, length, cnv); |
| 74 const UChar *srcLimit = src + length; |
| 75 char *bytes = NEW_ARRAY(char, byteCount + 1); |
| 76 char *dest = bytes, *destLimit = bytes + byteCount + 1; |
| 77 |
| 78 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); |
| 79 ucnv_close(cnv); |
| 80 |
| 81 *byteLength = byteCount; |
| 82 return bytes; |
| 83 } |
| 84 |
/*
 * Releases a byte buffer that was handed out by extractBytes().
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
| 89 |
| 90 static void TestConstruction(void) |
| 91 { |
| 92 UErrorCode status = U_ZERO_ERROR; |
| 93 UCharsetDetector *csd = ucsdet_open(&status); |
| 94 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); |
| 95 const char *name; |
| 96 int32_t count = uenum_count(e, &status); |
| 97 int32_t i, length; |
| 98 |
| 99 for(i = 0; i < count; i += 1) { |
| 100 name = uenum_next(e, &length, &status); |
| 101 |
| 102 if(name == NULL || length <= 0) { |
| 103 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty
name!\n"); |
| 104 } |
| 105 } |
| 106 /* one past the list of all names must return NULL */ |
| 107 name = uenum_next(e, &length, &status); |
| 108 if(name != NULL || length != 0 || U_FAILURE(status)) { |
| 109 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-n
ull name!\n"); |
| 110 } |
| 111 |
| 112 uenum_close(e); |
| 113 ucsdet_close(csd); |
| 114 } |
| 115 |
| 116 static void TestUTF8(void) |
| 117 { |
| 118 UErrorCode status = U_ZERO_ERROR; |
| 119 static const char ss[] = "This is a string with some non-ascii characters th
at will " |
| 120 "be converted to UTF-8, then shoved through the detection process
. " |
| 121 "\\u0391\\u0392\\u0393\\u0394\\u0395" |
| 122 "Sure would be nice if our source could contain Unicode directly!
"; |
| 123 int32_t byteLength = 0, sLength = 0, dLength = 0; |
| 124 UChar s[sizeof(ss)]; |
| 125 char *bytes; |
| 126 UCharsetDetector *csd = ucsdet_open(&status); |
| 127 const UCharsetMatch *match; |
| 128 UChar detected[sizeof(ss)]; |
| 129 |
| 130 sLength = u_unescape(ss, s, sizeof(ss)); |
| 131 bytes = extractBytes(s, sLength, "UTF-8", &byteLength); |
| 132 |
| 133 ucsdet_setText(csd, bytes, byteLength, &status); |
| 134 if (U_FAILURE(status)) { |
| 135 log_err("status is %s\n", u_errorName(status)); |
| 136 goto bail; |
| 137 } |
| 138 |
| 139 match = ucsdet_detect(csd, &status); |
| 140 |
| 141 if (match == NULL) { |
| 142 log_err("Detection failure for UTF-8: got no matches.\n"); |
| 143 goto bail; |
| 144 } |
| 145 |
| 146 dLength = ucsdet_getUChars(match, detected, sLength, &status); |
| 147 |
| 148 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { |
| 149 log_err("Round-trip test failed!\n"); |
| 150 } |
| 151 |
| 152 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ |
| 153 |
| 154 bail: |
| 155 freeBytes(bytes); |
| 156 ucsdet_close(csd); |
| 157 } |
| 158 |
| 159 static void TestUTF16(void) |
| 160 { |
| 161 UErrorCode status = U_ZERO_ERROR; |
| 162 /* Notice the BOM on the start of this string */ |
| 163 static const UChar chars[] = { |
| 164 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, |
| 165 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, |
| 166 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, |
| 167 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, |
| 168 0x064a, 0x062a, 0x0000}; |
| 169 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); |
| 170 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); |
| 171 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); |
| 172 UCharsetDetector *csd = ucsdet_open(&status); |
| 173 const UCharsetMatch *match; |
| 174 const char *name; |
| 175 int32_t conf; |
| 176 |
| 177 ucsdet_setText(csd, beBytes, beLength, &status); |
| 178 match = ucsdet_detect(csd, &status); |
| 179 |
| 180 if (match == NULL) { |
| 181 log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); |
| 182 goto try_le; |
| 183 } |
| 184 |
| 185 name = ucsdet_getName(match, &status); |
| 186 conf = ucsdet_getConfidence(match, &status); |
| 187 |
| 188 if (strcmp(name, "UTF-16BE") != 0) { |
| 189 log_err("Encoding detection failure for UTF-16BE: got %s\n", name); |
| 190 } |
| 191 |
| 192 if (conf != 100) { |
| 193 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); |
| 194 } |
| 195 |
| 196 try_le: |
| 197 ucsdet_setText(csd, leBytes, leLength, &status); |
| 198 match = ucsdet_detect(csd, &status); |
| 199 |
| 200 if (match == NULL) { |
| 201 log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); |
| 202 goto bail; |
| 203 } |
| 204 |
| 205 name = ucsdet_getName(match, &status); |
| 206 conf = ucsdet_getConfidence(match, &status); |
| 207 |
| 208 |
| 209 if (strcmp(name, "UTF-16LE") != 0) { |
| 210 log_err("Enconding detection failure for UTF-16LE: got %s\n", name); |
| 211 } |
| 212 |
| 213 if (conf != 100) { |
| 214 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); |
| 215 } |
| 216 |
| 217 bail: |
| 218 freeBytes(leBytes); |
| 219 freeBytes(beBytes); |
| 220 ucsdet_close(csd); |
| 221 } |
| 222 |
| 223 static void TestC1Bytes(void) |
| 224 { |
| 225 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 226 UErrorCode status = U_ZERO_ERROR; |
| 227 static const char ssISO[] = "This is a small sample of some English text. Ju
st enough to be sure that it detects correctly."; |
| 228 static const char ssWindows[] = "This is another small sample of some Englis
h text. Just enough to be sure that it detects correctly. It also includes some
\\u201CC1\\u201D bytes."; |
| 229 int32_t sISOLength = 0, sWindowsLength = 0; |
| 230 UChar sISO[sizeof(ssISO)]; |
| 231 UChar sWindows[sizeof(ssWindows)]; |
| 232 int32_t lISO = 0, lWindows = 0; |
| 233 char *bISO; |
| 234 char *bWindows; |
| 235 UCharsetDetector *csd = ucsdet_open(&status); |
| 236 const UCharsetMatch *match; |
| 237 const char *name; |
| 238 |
| 239 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO)); |
| 240 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows)); |
| 241 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); |
| 242 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows)
; |
| 243 |
| 244 ucsdet_setText(csd, bWindows, lWindows, &status); |
| 245 match = ucsdet_detect(csd, &status); |
| 246 |
| 247 if (match == NULL) { |
| 248 log_err("English test with C1 bytes got no matches.\n"); |
| 249 goto bail; |
| 250 } |
| 251 |
| 252 name = ucsdet_getName(match, &status); |
| 253 |
| 254 if (strcmp(name, "windows-1252") != 0) { |
| 255 log_data_err("English text with C1 bytes does not detect as windows-1252
, but as %s. (Are you missing data?)\n", name); |
| 256 } |
| 257 |
| 258 ucsdet_setText(csd, bISO, lISO, &status); |
| 259 match = ucsdet_detect(csd, &status); |
| 260 |
| 261 if (match == NULL) { |
| 262 log_err("English text without C1 bytes got no matches.\n"); |
| 263 goto bail; |
| 264 } |
| 265 |
| 266 name = ucsdet_getName(match, &status); |
| 267 |
| 268 if (strcmp(name, "ISO-8859-1") != 0) { |
| 269 log_err("English text without C1 bytes does not detect as ISO-8859-1, bu
t as %s\n", name); |
| 270 } |
| 271 |
| 272 bail: |
| 273 freeBytes(bWindows); |
| 274 freeBytes(bISO); |
| 275 |
| 276 ucsdet_close(csd); |
| 277 #endif |
| 278 } |
| 279 |
| 280 static void TestInputFilter(void) |
| 281 { |
| 282 UErrorCode status = U_ZERO_ERROR; |
| 283 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> U
n tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; |
| 284 int32_t sLength = 0; |
| 285 UChar s[sizeof(ss)]; |
| 286 int32_t byteLength = 0; |
| 287 char *bytes; |
| 288 UCharsetDetector *csd = ucsdet_open(&status); |
| 289 const UCharsetMatch *match; |
| 290 const char *lang, *name; |
| 291 |
| 292 sLength = u_unescape(ss, s, sizeof(ss)); |
| 293 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); |
| 294 |
| 295 ucsdet_enableInputFilter(csd, TRUE); |
| 296 |
| 297 if (!ucsdet_isInputFilterEnabled(csd)) { |
| 298 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter
!\n"); |
| 299 } |
| 300 |
| 301 |
| 302 ucsdet_setText(csd, bytes, byteLength, &status); |
| 303 match = ucsdet_detect(csd, &status); |
| 304 |
| 305 if (match == NULL) { |
| 306 log_err("Turning on the input filter resulted in no matches.\n"); |
| 307 goto turn_off; |
| 308 } |
| 309 |
| 310 name = ucsdet_getName(match, &status); |
| 311 |
| 312 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 313 log_err("Turning on the input filter resulted in %s rather than ISO-8859
-1\n", name); |
| 314 } else { |
| 315 lang = ucsdet_getLanguage(match, &status); |
| 316 |
| 317 if (lang == NULL || strcmp(lang, "fr") != 0) { |
| 318 log_err("Input filter did not strip markup!\n"); |
| 319 } |
| 320 } |
| 321 |
| 322 turn_off: |
| 323 ucsdet_enableInputFilter(csd, FALSE); |
| 324 ucsdet_setText(csd, bytes, byteLength, &status); |
| 325 match = ucsdet_detect(csd, &status); |
| 326 |
| 327 if (match == NULL) { |
| 328 log_err("Turning off the input filter resulted in no matches.\n"); |
| 329 goto bail; |
| 330 } |
| 331 |
| 332 name = ucsdet_getName(match, &status); |
| 333 |
| 334 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 335 log_err("Turning off the input filter resulted in %s rather than ISO-885
9-1\n", name); |
| 336 } else { |
| 337 lang = ucsdet_getLanguage(match, &status); |
| 338 |
| 339 if (lang == NULL || strcmp(lang, "en") != 0) { |
| 340 log_err("Unfiltered input did not detect as English!\n"); |
| 341 } |
| 342 } |
| 343 |
| 344 bail: |
| 345 freeBytes(bytes); |
| 346 ucsdet_close(csd); |
| 347 } |
| 348 |
| 349 static void TestChaining(void) { |
| 350 UErrorCode status = U_USELESS_COLLATOR_ERROR; |
| 351 |
| 352 ucsdet_open(&status); |
| 353 ucsdet_setText(NULL, NULL, 0, &status); |
| 354 ucsdet_getName(NULL, &status); |
| 355 ucsdet_getConfidence(NULL, &status); |
| 356 ucsdet_getLanguage(NULL, &status); |
| 357 ucsdet_detect(NULL, &status); |
| 358 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); |
| 359 ucsdet_detectAll(NULL, NULL, &status); |
| 360 ucsdet_getUChars(NULL, NULL, 0, &status); |
| 361 ucsdet_getUChars(NULL, NULL, 0, &status); |
| 362 ucsdet_close(NULL); |
| 363 |
| 364 /* All of this code should have done nothing. */ |
| 365 if (status != U_USELESS_COLLATOR_ERROR) { |
| 366 log_err("Status got changed to %s\n", u_errorName(status)); |
| 367 } |
| 368 } |
| 369 |
| 370 static void TestBufferOverflow(void) { |
| 371 UErrorCode status = U_ZERO_ERROR; |
| 372 static const char *testStrings[] = { |
| 373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b", /* A partial ISO-2022 shift state at the end */ |
| 374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24", /* A partial ISO-2022 shift state at the end */ |
| 375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */ |
| 376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x2
0\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one
at the start */ |
| 377 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */ |
| 378 "\xa1", /* Could be a single byte shift-jis at the end */ |
| 379 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */ |
| 380 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but n
ow we have English creeping in. */ |
| 381 }; |
| 382 static const char *testResults[] = { |
| 383 "windows-1252", |
| 384 "windows-1252", |
| 385 "windows-1252", |
| 386 "windows-1252", |
| 387 "ISO-2022-JP", |
| 388 NULL, |
| 389 NULL, |
| 390 "ISO-8859-1" |
| 391 }; |
| 392 int32_t idx = 0; |
| 393 UCharsetDetector *csd = ucsdet_open(&status); |
| 394 const UCharsetMatch *match; |
| 395 |
| 396 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status); |
| 397 |
| 398 if (U_FAILURE(status)) { |
| 399 log_err("Couldn't open detector. %s\n", u_errorName(status)); |
| 400 goto bail; |
| 401 } |
| 402 |
| 403 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) { |
| 404 ucsdet_setText(csd, testStrings[idx], -1, &status); |
| 405 match = ucsdet_detect(csd, &status); |
| 406 |
| 407 if (match == NULL) { |
| 408 if (testResults[idx] != NULL) { |
| 409 log_err("Unexpectedly got no results at index %d.\n", idx); |
| 410 } |
| 411 else { |
| 412 log_verbose("Got no result as expected at index %d.\n", idx); |
| 413 } |
| 414 continue; |
| 415 } |
| 416 |
| 417 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), t
estResults[idx]) != 0) { |
| 418 log_err("Unexpectedly got %s instead of %s at index %d with confiden
ce %d.\n", |
| 419 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_ge
tConfidence(match, &status)); |
| 420 goto bail; |
| 421 } |
| 422 } |
| 423 |
| 424 bail: |
| 425 ucsdet_close(csd); |
| 426 } |
| 427 |
| 428 static void TestIBM424(void) |
| 429 { |
| 430 UErrorCode status = U_ZERO_ERROR; |
| 431 |
| 432 static const UChar chars[] = { |
| 433 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05
D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, |
| 434 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05
D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, |
| 435 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05
DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, |
| 436 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05
D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, |
| 437 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05
E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, |
| 438 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05
D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, |
| 439 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05
E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, |
| 440 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05
EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, |
| 441 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00
22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, |
| 442 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05
D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, |
| 443 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05
D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, |
| 444 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00
20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, |
| 445 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00
20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, |
| 446 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05
D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, |
| 447 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05
DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, |
| 448 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00
20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, |
| 449 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05
D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 |
| 450 }; |
| 451 |
| 452 static const UChar chars_reverse[] = { |
| 453 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05
DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, |
| 454 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05
E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, |
| 455 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05
D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, |
| 456 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05
E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, |
| 457 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05
DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, |
| 458 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05
D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, |
| 459 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05
D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, |
| 460 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05
DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, |
| 461 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05
E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, |
| 462 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05
E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, |
| 463 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05
E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, |
| 464 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05
DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, |
| 465 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05
E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, |
| 466 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05
D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, |
| 467 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05
D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, |
| 468 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00
20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, |
| 469 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00
20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, |
| 470 0x0000 |
| 471 }; |
| 472 |
| 473 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = A
RRAY_SIZE(chars_reverse); |
| 474 |
| 475 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); |
| 476 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); |
| 477 |
| 478 UCharsetDetector *csd = ucsdet_open(&status); |
| 479 const UCharsetMatch *match; |
| 480 const char *name; |
| 481 |
| 482 ucsdet_setText(csd, bytes, bLength, &status); |
| 483 match = ucsdet_detect(csd, &status); |
| 484 |
| 485 if (match == NULL) { |
| 486 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); |
| 487 goto bail; |
| 488 } |
| 489 |
| 490 name = ucsdet_getName(match, &status); |
| 491 if (strcmp(name, "IBM424_rtl") != 0) { |
| 492 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are yo
u missing data?)\n", name); |
| 493 } |
| 494 |
| 495 ucsdet_setText(csd, bytes_r, brLength, &status); |
| 496 match = ucsdet_detect(csd, &status); |
| 497 |
| 498 if (match == NULL) { |
| 499 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); |
| 500 goto bail; |
| 501 } |
| 502 |
| 503 name = ucsdet_getName(match, &status); |
| 504 if (strcmp(name, "IBM424_ltr") != 0) { |
| 505 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are yo
u missing data?)\n", name); |
| 506 } |
| 507 |
| 508 bail: |
| 509 freeBytes(bytes); |
| 510 freeBytes(bytes_r); |
| 511 ucsdet_close(csd); |
| 512 } |
| 513 |
| 514 static void TestIBM420(void) |
| 515 { |
| 516 UErrorCode status = U_ZERO_ERROR; |
| 517 |
| 518 static const UChar chars[] = { |
| 519 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F,
0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, |
| 520 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, |
| 521 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627,
0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, |
| 522 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645,
0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, |
| 523 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627,
0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, |
| 524 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, |
| 525 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644,
0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, |
| 526 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637,
0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, |
| 527 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641,
0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, |
| 528 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020,
0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, |
| 529 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626,
0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, |
| 530 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020,
0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, |
| 531 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C,
0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, |
| 532 0x0000 |
| 533 }; |
| 534 static const UChar chars_reverse[] = { |
| 535 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627,
0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, |
| 536 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631,
0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, |
| 537 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627,
0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, |
| 538 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646,
0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, |
| 539 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F,
0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, |
| 540 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A,
0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, |
| 541 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648,
0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, |
| 542 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644,
0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, |
| 543 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645,
0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, |
| 544 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020,
0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, |
| 545 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020,
0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, |
| 546 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646,
0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, |
| 547 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646,
0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, |
| 548 0x0000, |
| 549 }; |
| 550 |
| 551 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = A
RRAY_SIZE(chars_reverse); |
| 552 |
| 553 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); |
| 554 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); |
| 555 |
| 556 UCharsetDetector *csd = ucsdet_open(&status); |
| 557 const UCharsetMatch *match; |
| 558 const char *name; |
| 559 |
| 560 ucsdet_setText(csd, bytes, bLength, &status); |
| 561 match = ucsdet_detect(csd, &status); |
| 562 |
| 563 if (match == NULL) { |
| 564 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); |
| 565 goto bail; |
| 566 } |
| 567 |
| 568 name = ucsdet_getName(match, &status); |
| 569 if (strcmp(name, "IBM420_rtl") != 0) { |
| 570 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are yo
u missing data?)\n", name); |
| 571 } |
| 572 |
| 573 ucsdet_setText(csd, bytes_r, brLength, &status); |
| 574 match = ucsdet_detect(csd, &status); |
| 575 |
| 576 if (match == NULL) { |
| 577 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); |
| 578 goto bail; |
| 579 } |
| 580 |
| 581 name = ucsdet_getName(match, &status); |
| 582 if (strcmp(name, "IBM420_ltr") != 0) { |
| 583 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are yo
u missing data?)\n", name); |
| 584 } |
| 585 |
| 586 bail: |
| 587 freeBytes(bytes); |
| 588 freeBytes(bytes_r); |
| 589 ucsdet_close(csd); |
| 590 } |
OLD | NEW |