OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2009, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 |
| 8 |
| 9 #include "unicode/utypes.h" |
| 10 #include "unicode/ucsdet.h" |
| 11 #include "unicode/ucnv.h" |
| 12 #include "unicode/unistr.h" |
| 13 #include "unicode/putil.h" |
| 14 #include "unicode/uniset.h" |
| 15 |
| 16 #include "intltest.h" |
| 17 #include "csdetest.h" |
| 18 |
| 19 #include "xmlparser.h" |
| 20 |
| 21 #include <stdlib.h> |
| 22 #include <string.h> |
| 23 |
| 24 #ifdef DEBUG_DETECT |
| 25 #include <stdio.h> |
| 26 #endif |
| 27 |
// Number of elements in a true array (not a pointer). The argument and the
// whole expansion are parenthesized so the macro composes safely with any
// array-valued expression and inside larger expressions.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw heap helpers. The commented-out uprv_ prefix marks where the ICU
// allocator wrappers would be substituted for the C runtime ones.
// NEW_ARRAY is wrapped in parentheses so that postfix operators applied to
// its result (e.g. NEW_ARRAY(T, n)[0]) bind to the cast pointer, not to the
// void * returned by malloc.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiter code points passed to split(): U+0020 SPACE and U+002F SOLIDUS.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
| 35 |
| 36 //--------------------------------------------------------------------------- |
| 37 // |
| 38 // Test class boilerplate |
| 39 // |
| 40 //--------------------------------------------------------------------------- |
| 41 CharsetDetectionTest::CharsetDetectionTest() |
| 42 { |
| 43 } |
| 44 |
| 45 |
| 46 CharsetDetectionTest::~CharsetDetectionTest() |
| 47 { |
| 48 } |
| 49 |
| 50 |
| 51 |
| 52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
* &name, char* /*par*/ ) |
| 53 { |
| 54 if (exec) logln("TestSuite CharsetDetectionTest: "); |
| 55 switch (index) { |
| 56 case 0: name = "ConstructionTest"; |
| 57 if (exec) ConstructionTest(); |
| 58 break; |
| 59 |
| 60 case 1: name = "UTF8Test"; |
| 61 if (exec) UTF8Test(); |
| 62 break; |
| 63 |
| 64 case 2: name = "UTF16Test"; |
| 65 if (exec) UTF16Test(); |
| 66 break; |
| 67 |
| 68 case 3: name = "C1BytesTest"; |
| 69 if (exec) C1BytesTest(); |
| 70 break; |
| 71 |
| 72 case 4: name = "InputFilterTest"; |
| 73 if (exec) InputFilterTest(); |
| 74 break; |
| 75 |
| 76 case 5: name = "DetectionTest"; |
| 77 if (exec) DetectionTest(); |
| 78 break; |
| 79 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 80 case 6: name = "IBM424Test"; |
| 81 if (exec) IBM424Test(); |
| 82 break; |
| 83 |
| 84 case 7: name = "IBM420Test"; |
| 85 if (exec) IBM420Test(); |
| 86 break; |
| 87 #else |
| 88 case 6: |
| 89 case 7: name = "skip"; break; |
| 90 #endif |
| 91 case 8: name = "Ticket6394Test"; |
| 92 if (exec) Ticket6394Test(); |
| 93 break; |
| 94 |
| 95 default: name = ""; |
| 96 break; //needed to end loop |
| 97 } |
| 98 } |
| 99 |
| 100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) |
| 101 { |
| 102 int32_t offset = -1; |
| 103 |
| 104 splits = 1; |
| 105 while((offset = src.indexOf(ch, offset + 1)) >= 0) { |
| 106 splits += 1; |
| 107 } |
| 108 |
| 109 UnicodeString *result = new UnicodeString[splits]; |
| 110 |
| 111 int32_t start = 0; |
| 112 int32_t split = 0; |
| 113 int32_t end; |
| 114 |
| 115 while((end = src.indexOf(ch, start)) >= 0) { |
| 116 src.extractBetween(start, end, result[split++]); |
| 117 start = end + 1; |
| 118 } |
| 119 |
| 120 src.extractBetween(start, src.length(), result[split]); |
| 121 |
| 122 return result; |
| 123 } |
| 124 |
| 125 static char *extractBytes(const UnicodeString &source, const char *codepage, int
32_t &length) |
| 126 { |
| 127 int32_t sLength = source.length(); |
| 128 char *bytes = NULL; |
| 129 |
| 130 length = source.extract(0, sLength, NULL, codepage); |
| 131 |
| 132 if (length > 0) { |
| 133 bytes = NEW_ARRAY(char, length + 1); |
| 134 source.extract(0, sLength, bytes, codepage); |
| 135 } |
| 136 |
| 137 return bytes; |
| 138 } |
| 139 |
| 140 static void freeBytes(char *bytes) |
| 141 { |
| 142 DELETE_ARRAY(bytes); |
| 143 } |
| 144 |
| 145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
UnicodeString &encoding, const UnicodeString &id) |
| 146 { |
| 147 int32_t splits = 0; |
| 148 int32_t testLength = testString.length(); |
| 149 UnicodeString *eSplit = split(encoding, CH_SLASH, splits); |
| 150 UErrorCode status = U_ZERO_ERROR; |
| 151 int32_t cpLength = eSplit[0].length(); |
| 152 char codepage[64]; |
| 153 |
| 154 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); |
| 155 codepage[cpLength] = '\0'; |
| 156 |
| 157 LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); |
| 158 |
| 159 int32_t byteLength = 0; |
| 160 char *bytes = extractBytes(testString, codepage, byteLength); |
| 161 |
| 162 if (bytes == NULL) { |
| 163 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 164 errln("Can't open a " + encoding + " converter for " + id); |
| 165 #endif |
| 166 return; |
| 167 } |
| 168 |
| 169 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); |
| 170 |
| 171 int32_t matchCount = 0; |
| 172 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount
, &status); |
| 173 |
| 174 |
| 175 UnicodeString name(ucsdet_getName(matches[0], &status)); |
| 176 UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); |
| 177 UChar *decoded = NULL; |
| 178 int32_t dLength = 0; |
| 179 |
| 180 if (matchCount == 0) { |
| 181 errln("Encoding detection failure for " + id + ": expected " + eSplit[0]
+ ", got no matches"); |
| 182 goto bail; |
| 183 } |
| 184 |
| 185 if (name.compare(eSplit[0]) != 0) { |
| 186 errln("Encoding detection failure for " + id + ": expected " + eSplit[0]
+ ", got " + name); |
| 187 |
| 188 #ifdef DEBUG_DETECT |
| 189 for (int32_t m = 0; m < matchCount; m += 1) { |
| 190 const char *name = ucsdet_getName(matches[m], &status); |
| 191 const char *lang = ucsdet_getLanguage(matches[m], &status); |
| 192 int32_t confidence = ucsdet_getConfidence(matches[m], &status); |
| 193 |
| 194 printf("%s (%s) %d\n", name, lang, confidence); |
| 195 } |
| 196 #endif |
| 197 goto bail; |
| 198 } |
| 199 |
| 200 if (splits > 1 && lang.compare(eSplit[1]) != 0) { |
| 201 errln("Language detection failure for " + id + ", " + eSplit[0] + ": exp
ected " + eSplit[1] + ", got " + lang); |
| 202 goto bail; |
| 203 } |
| 204 |
| 205 decoded = NEW_ARRAY(UChar, testLength); |
| 206 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); |
| 207 |
| 208 if (testString.compare(decoded, dLength) != 0) { |
| 209 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() d
idn't yeild the original string."); |
| 210 |
| 211 #ifdef DEBUG_DETECT |
| 212 for(int32_t i = 0; i < testLength; i += 1) { |
| 213 if(testString[i] != decoded[i]) { |
| 214 printf("Strings differ at byte %d\n", i); |
| 215 break; |
| 216 } |
| 217 } |
| 218 #endif |
| 219 |
| 220 } |
| 221 |
| 222 DELETE_ARRAY(decoded); |
| 223 |
| 224 bail: |
| 225 freeBytes(bytes); |
| 226 delete[] eSplit; |
| 227 } |
| 228 |
| 229 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filenam
e) { |
| 230 UErrorCode status = U_ZERO_ERROR; |
| 231 const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| 232 |
| 233 if (U_FAILURE(status)) { |
| 234 errln("ERROR: getPath() failed - %s", u_errorName(status)); |
| 235 return NULL; |
| 236 } |
| 237 |
| 238 strcpy(buffer, testDataDirectory); |
| 239 strcat(buffer, filename); |
| 240 return buffer; |
| 241 } |
| 242 |
| 243 void CharsetDetectionTest::ConstructionTest() |
| 244 { |
| 245 IcuTestErrorCode status(*this, "ConstructionTest"); |
| 246 LocalUCharsetDetectorPointer csd(ucsdet_open(status)); |
| 247 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), s
tatus)); |
| 248 int32_t count = uenum_count(e.getAlias(), status); |
| 249 |
| 250 #ifdef DEBUG_DETECT |
| 251 printf("There are %d recognizers.\n", count); |
| 252 #endif |
| 253 |
| 254 for(int32_t i = 0; i < count; i += 1) { |
| 255 int32_t length; |
| 256 const char *name = uenum_next(e.getAlias(), &length, status); |
| 257 |
| 258 if(name == NULL || length <= 0) { |
| 259 errln("ucsdet_getAllDetectableCharsets() returned a null or empty na
me!"); |
| 260 } |
| 261 |
| 262 #ifdef DEBUG_DETECT |
| 263 printf("%s\n", name); |
| 264 #endif |
| 265 } |
| 266 } |
| 267 |
| 268 void CharsetDetectionTest::UTF8Test() |
| 269 { |
| 270 UErrorCode status = U_ZERO_ERROR; |
| 271 UnicodeString ss = "This is a string with some non-ascii characters that wil
l " |
| 272 "be converted to UTF-8, then shoved through the detection
process. " |
| 273 "\\u0391\\u0392\\u0393\\u0394\\u0395" |
| 274 "Sure would be nice if our source could contain Unicode d
irectly!"; |
| 275 UnicodeString s = ss.unescape(); |
| 276 int32_t byteLength = 0, sLength = s.length(); |
| 277 char *bytes = extractBytes(s, "UTF-8", byteLength); |
| 278 UCharsetDetector *csd = ucsdet_open(&status); |
| 279 const UCharsetMatch *match; |
| 280 UChar *detected = NEW_ARRAY(UChar, sLength); |
| 281 |
| 282 ucsdet_setText(csd, bytes, byteLength, &status); |
| 283 match = ucsdet_detect(csd, &status); |
| 284 |
| 285 if (match == NULL) { |
| 286 errln("Detection failure for UTF-8: got no matches."); |
| 287 goto bail; |
| 288 } |
| 289 |
| 290 ucsdet_getUChars(match, detected, sLength, &status); |
| 291 |
| 292 if (s.compare(detected, sLength) != 0) { |
| 293 errln("Round-trip test failed!"); |
| 294 } |
| 295 |
| 296 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ |
| 297 |
| 298 bail: |
| 299 DELETE_ARRAY(detected); |
| 300 freeBytes(bytes); |
| 301 ucsdet_close(csd); |
| 302 } |
| 303 |
| 304 void CharsetDetectionTest::UTF16Test() |
| 305 { |
| 306 UErrorCode status = U_ZERO_ERROR; |
| 307 /* Notice the BOM on the start of this string */ |
| 308 UChar chars[] = { |
| 309 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, |
| 310 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, |
| 311 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, |
| 312 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, |
| 313 0x064a, 0x062a, 0x0000}; |
| 314 UnicodeString s(chars); |
| 315 int32_t beLength = 0, leLength = 0; |
| 316 char *beBytes = extractBytes(s, "UTF-16BE", beLength); |
| 317 char *leBytes = extractBytes(s, "UTF-16LE", leLength); |
| 318 UCharsetDetector *csd = ucsdet_open(&status); |
| 319 const UCharsetMatch *match; |
| 320 const char *name; |
| 321 int32_t conf; |
| 322 |
| 323 ucsdet_setText(csd, beBytes, beLength, &status); |
| 324 match = ucsdet_detect(csd, &status); |
| 325 |
| 326 if (match == NULL) { |
| 327 errln("Encoding detection failure for UTF-16BE: got no matches."); |
| 328 goto try_le; |
| 329 } |
| 330 |
| 331 name = ucsdet_getName(match, &status); |
| 332 conf = ucsdet_getConfidence(match, &status); |
| 333 |
| 334 if (strcmp(name, "UTF-16BE") != 0) { |
| 335 errln("Encoding detection failure for UTF-16BE: got %s", name); |
| 336 goto try_le; // no point in looking at confidence if we got the wrong ch
aracter set. |
| 337 } |
| 338 |
| 339 if (conf != 100) { |
| 340 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); |
| 341 } |
| 342 |
| 343 try_le: |
| 344 ucsdet_setText(csd, leBytes, leLength, &status); |
| 345 match = ucsdet_detect(csd, &status); |
| 346 |
| 347 if (match == NULL) { |
| 348 errln("Encoding detection failure for UTF-16LE: got no matches."); |
| 349 goto bail; |
| 350 } |
| 351 |
| 352 name = ucsdet_getName(match, &status); |
| 353 conf = ucsdet_getConfidence(match, &status); |
| 354 |
| 355 |
| 356 if (strcmp(name, "UTF-16LE") != 0) { |
| 357 errln("Enconding detection failure for UTF-16LE: got %s", name); |
| 358 goto bail; // no point in looking at confidence if we got the wrong char
acter set. |
| 359 } |
| 360 |
| 361 if (conf != 100) { |
| 362 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); |
| 363 } |
| 364 |
| 365 bail: |
| 366 freeBytes(leBytes); |
| 367 freeBytes(beBytes); |
| 368 ucsdet_close(csd); |
| 369 } |
| 370 |
| 371 void CharsetDetectionTest::InputFilterTest() |
| 372 { |
| 373 UErrorCode status = U_ZERO_ERROR; |
| 374 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\
u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; |
| 375 UnicodeString s = ss.unescape(); |
| 376 int32_t byteLength = 0; |
| 377 char *bytes = extractBytes(s, "ISO-8859-1", byteLength); |
| 378 UCharsetDetector *csd = ucsdet_open(&status); |
| 379 const UCharsetMatch *match; |
| 380 const char *lang, *name; |
| 381 |
| 382 ucsdet_enableInputFilter(csd, TRUE); |
| 383 |
| 384 if (!ucsdet_isInputFilterEnabled(csd)) { |
| 385 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"
); |
| 386 } |
| 387 |
| 388 |
| 389 ucsdet_setText(csd, bytes, byteLength, &status); |
| 390 match = ucsdet_detect(csd, &status); |
| 391 |
| 392 if (match == NULL) { |
| 393 errln("Turning on the input filter resulted in no matches."); |
| 394 goto turn_off; |
| 395 } |
| 396 |
| 397 name = ucsdet_getName(match, &status); |
| 398 |
| 399 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 400 errln("Turning on the input filter resulted in %s rather than ISO-8859-1
.", name); |
| 401 } else { |
| 402 lang = ucsdet_getLanguage(match, &status); |
| 403 |
| 404 if (lang == NULL || strcmp(lang, "fr") != 0) { |
| 405 errln("Input filter did not strip markup!"); |
| 406 } |
| 407 } |
| 408 |
| 409 turn_off: |
| 410 ucsdet_enableInputFilter(csd, FALSE); |
| 411 ucsdet_setText(csd, bytes, byteLength, &status); |
| 412 match = ucsdet_detect(csd, &status); |
| 413 |
| 414 if (match == NULL) { |
| 415 errln("Turning off the input filter resulted in no matches."); |
| 416 goto bail; |
| 417 } |
| 418 |
| 419 name = ucsdet_getName(match, &status); |
| 420 |
| 421 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| 422 errln("Turning off the input filter resulted in %s rather than ISO-8859-
1.", name); |
| 423 } else { |
| 424 lang = ucsdet_getLanguage(match, &status); |
| 425 |
| 426 if (lang == NULL || strcmp(lang, "en") != 0) { |
| 427 errln("Unfiltered input did not detect as English!"); |
| 428 } |
| 429 } |
| 430 |
| 431 bail: |
| 432 freeBytes(bytes); |
| 433 ucsdet_close(csd); |
| 434 } |
| 435 |
| 436 void CharsetDetectionTest::C1BytesTest() |
| 437 { |
| 438 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 439 UErrorCode status = U_ZERO_ERROR; |
| 440 UnicodeString sISO = "This is a small sample of some English text. Just enou
gh to be sure that it detects correctly."; |
| 441 UnicodeString ssWindows("This is another small sample of some English text.
Just enough to be sure that it detects correctly. It also includes some \\u201CC
1\\u201D bytes.", -1, US_INV); |
| 442 UnicodeString sWindows = ssWindows.unescape(); |
| 443 int32_t lISO = 0, lWindows = 0; |
| 444 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); |
| 445 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); |
| 446 UCharsetDetector *csd = ucsdet_open(&status); |
| 447 const UCharsetMatch *match; |
| 448 const char *name; |
| 449 |
| 450 ucsdet_setText(csd, bWindows, lWindows, &status); |
| 451 match = ucsdet_detect(csd, &status); |
| 452 |
| 453 if (match == NULL) { |
| 454 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_
errorName(status)); |
| 455 goto bail; |
| 456 } |
| 457 |
| 458 name = ucsdet_getName(match, &status); |
| 459 |
| 460 if (strcmp(name, "windows-1252") != 0) { |
| 461 errln("English text with C1 bytes does not detect as windows-1252, but a
s %s", name); |
| 462 } |
| 463 |
| 464 ucsdet_setText(csd, bISO, lISO, &status); |
| 465 match = ucsdet_detect(csd, &status); |
| 466 |
| 467 if (match == NULL) { |
| 468 errln("English text without C1 bytes got no matches."); |
| 469 goto bail; |
| 470 } |
| 471 |
| 472 name = ucsdet_getName(match, &status); |
| 473 |
| 474 if (strcmp(name, "ISO-8859-1") != 0) { |
| 475 errln("English text without C1 bytes does not detect as ISO-8859-1, but
as %s", name); |
| 476 } |
| 477 |
| 478 bail: |
| 479 freeBytes(bWindows); |
| 480 freeBytes(bISO); |
| 481 |
| 482 ucsdet_close(csd); |
| 483 #endif |
| 484 } |
| 485 |
| 486 void CharsetDetectionTest::DetectionTest() |
| 487 { |
| 488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 489 UErrorCode status = U_ZERO_ERROR; |
| 490 char path[2048]; |
| 491 const char *testFilePath = getPath(path, "csdetest.xml"); |
| 492 |
| 493 if (testFilePath == NULL) { |
| 494 return; /* Couldn't get path: error message already output. */ |
| 495 } |
| 496 |
| 497 UXMLParser *parser = UXMLParser::createParser(status); |
| 498 if (U_FAILURE(status)) { |
| 499 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); |
| 500 return; |
| 501 } |
| 502 |
| 503 UXMLElement *root = parser->parseFile(testFilePath, status); |
| 504 if (!assertSuccess( "parseFile",status)) return; |
| 505 |
| 506 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); |
| 507 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); |
| 508 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); |
| 509 |
| 510 const UXMLElement *testCase; |
| 511 int32_t tc = 0; |
| 512 |
| 513 while((testCase = root->nextChildElement(tc)) != NULL) { |
| 514 if (testCase->getTagName().compare(test_case) == 0) { |
| 515 const UnicodeString *id = testCase->getAttribute(id_attr); |
| 516 const UnicodeString *encodings = testCase->getAttribute(enc_attr); |
| 517 const UnicodeString text = testCase->getText(TRUE); |
| 518 int32_t encodingCount; |
| 519 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCo
unt); |
| 520 |
| 521 for(int32_t e = 0; e < encodingCount; e += 1) { |
| 522 checkEncoding(text, encodingList[e], *id); |
| 523 } |
| 524 |
| 525 delete[] encodingList; |
| 526 } |
| 527 } |
| 528 |
| 529 delete root; |
| 530 delete parser; |
| 531 #endif |
| 532 } |
| 533 |
| 534 void CharsetDetectionTest::IBM424Test() |
| 535 { |
| 536 UErrorCode status = U_ZERO_ERROR; |
| 537 |
| 538 static const UChar chars[] = { |
| 539 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05
D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, |
| 540 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05
D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, |
| 541 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05
DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, |
| 542 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05
D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, |
| 543 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05
E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, |
| 544 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05
D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, |
| 545 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05
E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, |
| 546 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05
EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, |
| 547 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00
22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, |
| 548 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05
D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, |
| 549 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05
D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, |
| 550 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00
20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, |
| 551 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00
20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, |
| 552 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05
D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, |
| 553 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05
DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, |
| 554 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00
20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, |
| 555 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05
D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 |
| 556 }; |
| 557 |
| 558 static const UChar chars_reverse[] = { |
| 559 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05
DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, |
| 560 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05
E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, |
| 561 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05
D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, |
| 562 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05
E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, |
| 563 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05
DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, |
| 564 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05
D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, |
| 565 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05
D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, |
| 566 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05
DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, |
| 567 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05
E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, |
| 568 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05
E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, |
| 569 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05
E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, |
| 570 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05
DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, |
| 571 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05
E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, |
| 572 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05
D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, |
| 573 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05
D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, |
| 574 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00
20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, |
| 575 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00
20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, |
| 576 0x0000 |
| 577 }; |
| 578 |
| 579 int32_t bLength = 0, brLength = 0; |
| 580 |
| 581 UnicodeString s1(chars); |
| 582 UnicodeString s2(chars_reverse); |
| 583 |
| 584 char *bytes = extractBytes(s1, "IBM424", bLength); |
| 585 char *bytes_r = extractBytes(s2, "IBM424", brLength); |
| 586 |
| 587 UCharsetDetector *csd = ucsdet_open(&status); |
| 588 if (U_FAILURE(status)) { |
| 589 errln("Error opening charset detector. - %s", u_errorName(status)); |
| 590 } |
| 591 const UCharsetMatch *match; |
| 592 const char *name; |
| 593 |
| 594 ucsdet_setText(csd, bytes, bLength, &status); |
| 595 match = ucsdet_detect(csd, &status); |
| 596 |
| 597 if (match == NULL) { |
| 598 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no ma
tches. - %s", u_errorName(status)); |
| 599 goto bail; |
| 600 } |
| 601 |
| 602 name = ucsdet_getName(match, &status); |
| 603 if (strcmp(name, "IBM424_rtl") != 0) { |
| 604 errln("Encoding detection failure for IBM424_rtl: got %s", name); |
| 605 } |
| 606 |
| 607 ucsdet_setText(csd, bytes_r, brLength, &status); |
| 608 match = ucsdet_detect(csd, &status); |
| 609 |
| 610 if (match == NULL) { |
| 611 errln("Encoding detection failure for IBM424_ltr: got no matches."); |
| 612 goto bail; |
| 613 } |
| 614 |
| 615 name = ucsdet_getName(match, &status); |
| 616 if (strcmp(name, "IBM424_ltr") != 0) { |
| 617 errln("Encoding detection failure for IBM424_ltr: got %s", name); |
| 618 } |
| 619 |
| 620 bail: |
| 621 freeBytes(bytes); |
| 622 freeBytes(bytes_r); |
| 623 ucsdet_close(csd); |
| 624 } |
| 625 |
| 626 void CharsetDetectionTest::IBM420Test() |
| 627 { |
| 628 UErrorCode status = U_ZERO_ERROR; |
| 629 |
| 630 static const UChar chars[] = { |
| 631 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F,
0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, |
| 632 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, |
| 633 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627,
0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, |
| 634 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645,
0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, |
| 635 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627,
0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, |
| 636 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, |
| 637 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644,
0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, |
| 638 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637,
0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, |
| 639 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641,
0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, |
| 640 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020,
0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, |
| 641 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626,
0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, |
| 642 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020,
0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, |
| 643 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C,
0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, |
| 644 0x0000 |
| 645 }; |
| 646 static const UChar chars_reverse[] = { |
| 647 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627,
0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, |
| 648 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631,
0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, |
| 649 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627,
0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, |
| 650 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646,
0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, |
| 651 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F,
0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, |
| 652 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A,
0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, |
| 653 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648,
0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, |
| 654 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644,
0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, |
| 655 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645,
0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, |
| 656 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020,
0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, |
| 657 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020,
0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, |
| 658 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646,
0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, |
| 659 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646,
0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, |
| 660 0x0000, |
| 661 }; |
| 662 |
| 663 int32_t bLength = 0, brLength = 0; |
| 664 |
| 665 UnicodeString s1(chars); |
| 666 UnicodeString s2(chars_reverse); |
| 667 |
| 668 char *bytes = extractBytes(s1, "IBM420", bLength); |
| 669 char *bytes_r = extractBytes(s2, "IBM420", brLength); |
| 670 |
| 671 UCharsetDetector *csd = ucsdet_open(&status); |
| 672 if (U_FAILURE(status)) { |
| 673 errln("Error opening charset detector. - %s", u_errorName(status)); |
| 674 } |
| 675 const UCharsetMatch *match; |
| 676 const char *name; |
| 677 |
| 678 ucsdet_setText(csd, bytes, bLength, &status); |
| 679 match = ucsdet_detect(csd, &status); |
| 680 |
| 681 if (match == NULL) { |
| 682 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no ma
tches. - %s", u_errorName(status)); |
| 683 goto bail; |
| 684 } |
| 685 |
| 686 name = ucsdet_getName(match, &status); |
| 687 if (strcmp(name, "IBM420_rtl") != 0) { |
| 688 errln("Encoding detection failure for IBM420_rtl: got %s\n", name); |
| 689 } |
| 690 |
| 691 ucsdet_setText(csd, bytes_r, brLength, &status); |
| 692 match = ucsdet_detect(csd, &status); |
| 693 |
| 694 if (match == NULL) { |
| 695 errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); |
| 696 goto bail; |
| 697 } |
| 698 |
| 699 name = ucsdet_getName(match, &status); |
| 700 if (strcmp(name, "IBM420_ltr") != 0) { |
| 701 errln("Encoding detection failure for IBM420_ltr: got %s\n", name); |
| 702 } |
| 703 |
| 704 bail: |
| 705 freeBytes(bytes); |
| 706 freeBytes(bytes_r); |
| 707 ucsdet_close(csd); |
| 708 } |
| 709 |
| 710 |
| 711 void CharsetDetectionTest::Ticket6394Test() { |
| 712 #if !UCONFIG_NO_CONVERSION |
| 713 const char charText[] = "Here is some random English text that should be de
tected as ISO-8859-1." |
| 714 "Ticket 6394 claims that ISO-8859-1 will appear in
the array of detected " |
| 715 "encodings more than once. The hop through Unicode
String is for platforms " |
| 716 "where this char * string is be EBCDIC and needs co
nversion to Latin1."; |
| 717 char latin1Text[sizeof(charText)]; |
| 718 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(la
tin1Text), "ISO-8859-1"); |
| 719 |
| 720 UErrorCode status = U_ZERO_ERROR; |
| 721 UCharsetDetector *csd = ucsdet_open(&status); |
| 722 ucsdet_setText(csd, latin1Text, -1, &status); |
| 723 if (U_FAILURE(status)) { |
| 724 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er
rorName(status)); |
| 725 return; |
| 726 } |
| 727 |
| 728 int32_t matchCount = 0; |
| 729 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); |
| 730 if (U_FAILURE(status)) { |
| 731 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er
rorName(status)); |
| 732 return; |
| 733 } |
| 734 |
| 735 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. |
| 736 int32_t i; |
| 737 for (i=0; i<matchCount; i++) { |
| 738 UnicodeString charSetName(ucsdet_getName(matches[i], &status)); |
| 739 if (U_FAILURE(status)) { |
| 740 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __L
INE__, u_errorName(status), i); |
| 741 status = U_ZERO_ERROR; |
| 742 } |
| 743 if (setOfCharsetNames.contains(charSetName)) { |
| 744 errln("Fail at file %s, line %d ", __FILE__, __LINE__); |
| 745 errln(UnicodeString(" Duplicate charset name = ") + charSetName); |
| 746 } |
| 747 setOfCharsetNames.add(charSetName); |
| 748 } |
| 749 ucsdet_close(csd); |
| 750 #endif |
| 751 } |
| 752 |
OLD | NEW |