| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 ********************************************************************** | |
| 3 * Copyright (C) 2005-2015, International Business Machines | |
| 4 * Corporation and others. All Rights Reserved. | |
| 5 ********************************************************************** | |
| 6 */ | |
| 7 | |
| 8 | |
| 9 #include "unicode/utypes.h" | |
| 10 #include "unicode/ucsdet.h" | |
| 11 #include "unicode/ucnv.h" | |
| 12 #include "unicode/unistr.h" | |
| 13 #include "unicode/putil.h" | |
| 14 #include "unicode/uniset.h" | |
| 15 | |
| 16 #include "intltest.h" | |
| 17 #include "csdetest.h" | |
| 18 | |
| 19 #include "xmlparser.h" | |
| 20 | |
| 21 #include <stdlib.h> | |
| 22 #include <string.h> | |
| 23 | |
| 24 #ifdef DEBUG_DETECT | |
| 25 #include <stdio.h> | |
| 26 #endif | |
| 27 | |
// Element count of a true array (not a pointer). The expansion and the
// argument are fully parenthesized so the macro can be embedded in a larger
// expression, or given a non-trivial argument, without changing how it parses.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw heap storage for `count` objects of `type`, obtained from malloc();
// constructors are not run. Release the result with DELETE_ARRAY. The outer
// parentheses keep the cast bound to the malloc() result when the macro is
// followed by a postfix operator such as [] or ->.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Code points used as delimiters when splitting strings in this file.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
| 35 | |
// Reports a test error naming the current file and line when `x` is false.
// Execution continues after the report.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// Reports the failing ICU status together with the current file and line,
// then returns from the enclosing function. Only usable in functions that
// return void.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
}
| 42 | |
| 43 | |
| 44 //--------------------------------------------------------------------------- | |
| 45 // | |
| 46 // Test class boilerplate | |
| 47 // | |
| 48 //--------------------------------------------------------------------------- | |
| 49 CharsetDetectionTest::CharsetDetectionTest() | |
| 50 { | |
| 51 } | |
| 52 | |
| 53 | |
| 54 CharsetDetectionTest::~CharsetDetectionTest() | |
| 55 { | |
| 56 } | |
| 57 | |
| 58 | |
| 59 | |
| 60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char
* &name, char* /*par*/ ) | |
| 61 { | |
| 62 if (exec) logln("TestSuite CharsetDetectionTest: "); | |
| 63 switch (index) { | |
| 64 case 0: name = "ConstructionTest"; | |
| 65 if (exec) ConstructionTest(); | |
| 66 break; | |
| 67 | |
| 68 case 1: name = "UTF8Test"; | |
| 69 if (exec) UTF8Test(); | |
| 70 break; | |
| 71 | |
| 72 case 2: name = "UTF16Test"; | |
| 73 if (exec) UTF16Test(); | |
| 74 break; | |
| 75 | |
| 76 case 3: name = "C1BytesTest"; | |
| 77 if (exec) C1BytesTest(); | |
| 78 break; | |
| 79 | |
| 80 case 4: name = "InputFilterTest"; | |
| 81 if (exec) InputFilterTest(); | |
| 82 break; | |
| 83 | |
| 84 case 5: name = "DetectionTest"; | |
| 85 if (exec) DetectionTest(); | |
| 86 break; | |
| 87 #if !UCONFIG_NO_LEGACY_CONVERSION | |
| 88 case 6: name = "IBM424Test"; | |
| 89 if (exec) IBM424Test(); | |
| 90 break; | |
| 91 | |
| 92 case 7: name = "IBM420Test"; | |
| 93 if (exec) IBM420Test(); | |
| 94 break; | |
| 95 #else | |
| 96 case 6: | |
| 97 case 7: name = "skip"; break; | |
| 98 #endif | |
| 99 case 8: name = "Ticket6394Test"; | |
| 100 if (exec) Ticket6394Test(); | |
| 101 break; | |
| 102 | |
| 103 case 9: name = "Ticket6954Test"; | |
| 104 if (exec) Ticket6954Test(); | |
| 105 break; | |
| 106 | |
| 107 default: name = ""; | |
| 108 break; //needed to end loop | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) | |
| 113 { | |
| 114 int32_t offset = -1; | |
| 115 | |
| 116 splits = 1; | |
| 117 while((offset = src.indexOf(ch, offset + 1)) >= 0) { | |
| 118 splits += 1; | |
| 119 } | |
| 120 | |
| 121 UnicodeString *result = new UnicodeString[splits]; | |
| 122 | |
| 123 int32_t start = 0; | |
| 124 int32_t split = 0; | |
| 125 int32_t end; | |
| 126 | |
| 127 while((end = src.indexOf(ch, start)) >= 0) { | |
| 128 src.extractBetween(start, end, result[split++]); | |
| 129 start = end + 1; | |
| 130 } | |
| 131 | |
| 132 src.extractBetween(start, src.length(), result[split]); | |
| 133 | |
| 134 return result; | |
| 135 } | |
| 136 | |
| 137 static char *extractBytes(const UnicodeString &source, const char *codepage, int
32_t &length) | |
| 138 { | |
| 139 int32_t sLength = source.length(); | |
| 140 char *bytes = NULL; | |
| 141 | |
| 142 length = source.extract(0, sLength, NULL, codepage); | |
| 143 | |
| 144 if (length > 0) { | |
| 145 bytes = NEW_ARRAY(char, length + 1); | |
| 146 source.extract(0, sLength, bytes, codepage); | |
| 147 } | |
| 148 | |
| 149 return bytes; | |
| 150 } | |
| 151 | |
| 152 static void freeBytes(char *bytes) | |
| 153 { | |
| 154 DELETE_ARRAY(bytes); | |
| 155 } | |
| 156 | |
| 157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const
UnicodeString &encoding, const UnicodeString &id) | |
| 158 { | |
| 159 int32_t splits = 0; | |
| 160 int32_t testLength = testString.length(); | |
| 161 UnicodeString *eSplit = split(encoding, CH_SLASH, splits); | |
| 162 UErrorCode status = U_ZERO_ERROR; | |
| 163 int32_t cpLength = eSplit[0].length(); | |
| 164 char codepage[64]; | |
| 165 | |
| 166 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); | |
| 167 codepage[cpLength] = '\0'; | |
| 168 | |
| 169 LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); | |
| 170 | |
| 171 int32_t byteLength = 0; | |
| 172 char *bytes = extractBytes(testString, codepage, byteLength); | |
| 173 | |
| 174 if (bytes == NULL) { | |
| 175 #if !UCONFIG_NO_LEGACY_CONVERSION | |
| 176 dataerrln("Can't open a " + encoding + " converter for " + id); | |
| 177 #endif | |
| 178 return; | |
| 179 } | |
| 180 | |
| 181 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); | |
| 182 | |
| 183 int32_t matchCount = 0; | |
| 184 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount
, &status); | |
| 185 | |
| 186 | |
| 187 UnicodeString name(ucsdet_getName(matches[0], &status)); | |
| 188 UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); | |
| 189 UChar *decoded = NULL; | |
| 190 int32_t dLength = 0; | |
| 191 | |
| 192 if (matchCount == 0) { | |
| 193 errln("Encoding detection failure for " + id + ": expected " + eSplit[0]
+ ", got no matches"); | |
| 194 goto bail; | |
| 195 } | |
| 196 | |
| 197 if (name.compare(eSplit[0]) != 0) { | |
| 198 errln("Encoding detection failure for " + id + ": expected " + eSplit[0]
+ ", got " + name); | |
| 199 | |
| 200 #ifdef DEBUG_DETECT | |
| 201 for (int32_t m = 0; m < matchCount; m += 1) { | |
| 202 const char *name = ucsdet_getName(matches[m], &status); | |
| 203 const char *lang = ucsdet_getLanguage(matches[m], &status); | |
| 204 int32_t confidence = ucsdet_getConfidence(matches[m], &status); | |
| 205 | |
| 206 printf("%s (%s) %d\n", name, lang, confidence); | |
| 207 } | |
| 208 #endif | |
| 209 goto bail; | |
| 210 } | |
| 211 | |
| 212 if (splits > 1 && lang.compare(eSplit[1]) != 0) { | |
| 213 errln("Language detection failure for " + id + ", " + eSplit[0] + ": exp
ected " + eSplit[1] + ", got " + lang); | |
| 214 goto bail; | |
| 215 } | |
| 216 | |
| 217 decoded = NEW_ARRAY(UChar, testLength); | |
| 218 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); | |
| 219 | |
| 220 if (testString.compare(decoded, dLength) != 0) { | |
| 221 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() d
idn't yeild the original string."); | |
| 222 | |
| 223 #ifdef DEBUG_DETECT | |
| 224 for(int32_t i = 0; i < testLength; i += 1) { | |
| 225 if(testString[i] != decoded[i]) { | |
| 226 printf("Strings differ at byte %d\n", i); | |
| 227 break; | |
| 228 } | |
| 229 } | |
| 230 #endif | |
| 231 | |
| 232 } | |
| 233 | |
| 234 DELETE_ARRAY(decoded); | |
| 235 | |
| 236 bail: | |
| 237 freeBytes(bytes); | |
| 238 delete[] eSplit; | |
| 239 } | |
| 240 | |
| 241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filenam
e) { | |
| 242 UErrorCode status = U_ZERO_ERROR; | |
| 243 const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
| 244 | |
| 245 if (U_FAILURE(status)) { | |
| 246 errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
| 247 return NULL; | |
| 248 } | |
| 249 | |
| 250 strcpy(buffer, testDataDirectory); | |
| 251 strcat(buffer, filename); | |
| 252 return buffer; | |
| 253 } | |
| 254 | |
| 255 void CharsetDetectionTest::ConstructionTest() | |
| 256 { | |
| 257 IcuTestErrorCode status(*this, "ConstructionTest"); | |
| 258 LocalUCharsetDetectorPointer csd(ucsdet_open(status)); | |
| 259 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), s
tatus)); | |
| 260 int32_t count = uenum_count(e.getAlias(), status); | |
| 261 | |
| 262 #ifdef DEBUG_DETECT | |
| 263 printf("There are %d recognizers.\n", count); | |
| 264 #endif | |
| 265 | |
| 266 for(int32_t i = 0; i < count; i += 1) { | |
| 267 int32_t length; | |
| 268 const char *name = uenum_next(e.getAlias(), &length, status); | |
| 269 | |
| 270 if(name == NULL || length <= 0) { | |
| 271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty na
me!"); | |
| 272 } | |
| 273 | |
| 274 #ifdef DEBUG_DETECT | |
| 275 printf("%s\n", name); | |
| 276 #endif | |
| 277 } | |
| 278 | |
| 279 const char* defDisabled[] = { | |
| 280 "IBM420_rtl", "IBM420_ltr", | |
| 281 "IBM424_rtl", "IBM424_ltr", | |
| 282 0 | |
| 283 }; | |
| 284 | |
| 285 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias()
, status)); | |
| 286 const char *activeName = NULL; | |
| 287 | |
| 288 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { | |
| 289 // the charset must be included in all list | |
| 290 UBool found = FALSE; | |
| 291 | |
| 292 const char *name = NULL; | |
| 293 uenum_reset(e.getAlias(), status); | |
| 294 while ((name = uenum_next(e.getAlias(), NULL, status))) { | |
| 295 if (strcmp(activeName, name) == 0) { | |
| 296 found = TRUE; | |
| 297 break; | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 if (!found) { | |
| 302 errln(UnicodeString(activeName) + " is not included in the all chars
et list."); | |
| 303 } | |
| 304 | |
| 305 // some charsets are disabled by default | |
| 306 found = FALSE; | |
| 307 for (int32_t i = 0; defDisabled[i] != 0; i++) { | |
| 308 if (strcmp(activeName, defDisabled[i]) == 0) { | |
| 309 found = TRUE; | |
| 310 break; | |
| 311 } | |
| 312 } | |
| 313 if (found) { | |
| 314 errln(UnicodeString(activeName) + " should not be included in the de
fault charset list."); | |
| 315 } | |
| 316 } | |
| 317 } | |
| 318 | |
| 319 void CharsetDetectionTest::UTF8Test() | |
| 320 { | |
| 321 UErrorCode status = U_ZERO_ERROR; | |
| 322 UnicodeString ss = "This is a string with some non-ascii characters that wil
l " | |
| 323 "be converted to UTF-8, then shoved through the detection
process. " | |
| 324 "\\u0391\\u0392\\u0393\\u0394\\u0395" | |
| 325 "Sure would be nice if our source could contain Unicode d
irectly!"; | |
| 326 UnicodeString s = ss.unescape(); | |
| 327 int32_t byteLength = 0, sLength = s.length(); | |
| 328 char *bytes = extractBytes(s, "UTF-8", byteLength); | |
| 329 UCharsetDetector *csd = ucsdet_open(&status); | |
| 330 const UCharsetMatch *match; | |
| 331 UChar *detected = NEW_ARRAY(UChar, sLength); | |
| 332 | |
| 333 ucsdet_setText(csd, bytes, byteLength, &status); | |
| 334 match = ucsdet_detect(csd, &status); | |
| 335 | |
| 336 if (match == NULL) { | |
| 337 errln("Detection failure for UTF-8: got no matches."); | |
| 338 goto bail; | |
| 339 } | |
| 340 | |
| 341 ucsdet_getUChars(match, detected, sLength, &status); | |
| 342 | |
| 343 if (s.compare(detected, sLength) != 0) { | |
| 344 errln("Round-trip test failed!"); | |
| 345 } | |
| 346 | |
| 347 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ | |
| 348 | |
| 349 bail: | |
| 350 DELETE_ARRAY(detected); | |
| 351 freeBytes(bytes); | |
| 352 ucsdet_close(csd); | |
| 353 } | |
| 354 | |
| 355 void CharsetDetectionTest::UTF16Test() | |
| 356 { | |
| 357 UErrorCode status = U_ZERO_ERROR; | |
| 358 /* Notice the BOM on the start of this string */ | |
| 359 UChar chars[] = { | |
| 360 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, | |
| 361 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, | |
| 362 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, | |
| 363 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, | |
| 364 0x064a, 0x062a, 0x0000}; | |
| 365 UnicodeString s(chars); | |
| 366 int32_t beLength = 0, leLength = 0; | |
| 367 char *beBytes = extractBytes(s, "UTF-16BE", beLength); | |
| 368 char *leBytes = extractBytes(s, "UTF-16LE", leLength); | |
| 369 UCharsetDetector *csd = ucsdet_open(&status); | |
| 370 const UCharsetMatch *match; | |
| 371 const char *name; | |
| 372 int32_t conf; | |
| 373 | |
| 374 ucsdet_setText(csd, beBytes, beLength, &status); | |
| 375 match = ucsdet_detect(csd, &status); | |
| 376 | |
| 377 if (match == NULL) { | |
| 378 errln("Encoding detection failure for UTF-16BE: got no matches."); | |
| 379 goto try_le; | |
| 380 } | |
| 381 | |
| 382 name = ucsdet_getName(match, &status); | |
| 383 conf = ucsdet_getConfidence(match, &status); | |
| 384 | |
| 385 if (strcmp(name, "UTF-16BE") != 0) { | |
| 386 errln("Encoding detection failure for UTF-16BE: got %s", name); | |
| 387 goto try_le; // no point in looking at confidence if we got the wrong ch
aracter set. | |
| 388 } | |
| 389 | |
| 390 if (conf != 100) { | |
| 391 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); | |
| 392 } | |
| 393 | |
| 394 try_le: | |
| 395 ucsdet_setText(csd, leBytes, leLength, &status); | |
| 396 match = ucsdet_detect(csd, &status); | |
| 397 | |
| 398 if (match == NULL) { | |
| 399 errln("Encoding detection failure for UTF-16LE: got no matches."); | |
| 400 goto bail; | |
| 401 } | |
| 402 | |
| 403 name = ucsdet_getName(match, &status); | |
| 404 conf = ucsdet_getConfidence(match, &status); | |
| 405 | |
| 406 | |
| 407 if (strcmp(name, "UTF-16LE") != 0) { | |
| 408 errln("Enconding detection failure for UTF-16LE: got %s", name); | |
| 409 goto bail; // no point in looking at confidence if we got the wrong char
acter set. | |
| 410 } | |
| 411 | |
| 412 if (conf != 100) { | |
| 413 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); | |
| 414 } | |
| 415 | |
| 416 bail: | |
| 417 freeBytes(leBytes); | |
| 418 freeBytes(beBytes); | |
| 419 ucsdet_close(csd); | |
| 420 } | |
| 421 | |
| 422 void CharsetDetectionTest::InputFilterTest() | |
| 423 { | |
| 424 UErrorCode status = U_ZERO_ERROR; | |
| 425 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\
u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; | |
| 426 UnicodeString s = ss.unescape(); | |
| 427 int32_t byteLength = 0; | |
| 428 char *bytes = extractBytes(s, "ISO-8859-1", byteLength); | |
| 429 UCharsetDetector *csd = ucsdet_open(&status); | |
| 430 const UCharsetMatch *match; | |
| 431 const char *lang, *name; | |
| 432 | |
| 433 ucsdet_enableInputFilter(csd, TRUE); | |
| 434 | |
| 435 if (!ucsdet_isInputFilterEnabled(csd)) { | |
| 436 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"
); | |
| 437 } | |
| 438 | |
| 439 | |
| 440 ucsdet_setText(csd, bytes, byteLength, &status); | |
| 441 match = ucsdet_detect(csd, &status); | |
| 442 | |
| 443 if (match == NULL) { | |
| 444 errln("Turning on the input filter resulted in no matches."); | |
| 445 goto turn_off; | |
| 446 } | |
| 447 | |
| 448 name = ucsdet_getName(match, &status); | |
| 449 | |
| 450 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
| 451 errln("Turning on the input filter resulted in %s rather than ISO-8859-1
.", name); | |
| 452 } else { | |
| 453 lang = ucsdet_getLanguage(match, &status); | |
| 454 | |
| 455 if (lang == NULL || strcmp(lang, "fr") != 0) { | |
| 456 errln("Input filter did not strip markup!"); | |
| 457 } | |
| 458 } | |
| 459 | |
| 460 turn_off: | |
| 461 ucsdet_enableInputFilter(csd, FALSE); | |
| 462 ucsdet_setText(csd, bytes, byteLength, &status); | |
| 463 match = ucsdet_detect(csd, &status); | |
| 464 | |
| 465 if (match == NULL) { | |
| 466 errln("Turning off the input filter resulted in no matches."); | |
| 467 goto bail; | |
| 468 } | |
| 469 | |
| 470 name = ucsdet_getName(match, &status); | |
| 471 | |
| 472 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
| 473 errln("Turning off the input filter resulted in %s rather than ISO-8859-
1.", name); | |
| 474 } else { | |
| 475 lang = ucsdet_getLanguage(match, &status); | |
| 476 | |
| 477 if (lang == NULL || strcmp(lang, "en") != 0) { | |
| 478 errln("Unfiltered input did not detect as English!"); | |
| 479 } | |
| 480 } | |
| 481 | |
| 482 bail: | |
| 483 freeBytes(bytes); | |
| 484 ucsdet_close(csd); | |
| 485 } | |
| 486 | |
| 487 void CharsetDetectionTest::C1BytesTest() | |
| 488 { | |
| 489 #if !UCONFIG_NO_LEGACY_CONVERSION | |
| 490 UErrorCode status = U_ZERO_ERROR; | |
| 491 UnicodeString sISO = "This is a small sample of some English text. Just enou
gh to be sure that it detects correctly."; | |
| 492 UnicodeString ssWindows("This is another small sample of some English text.
Just enough to be sure that it detects correctly. It also includes some \\u201CC
1\\u201D bytes.", -1, US_INV); | |
| 493 UnicodeString sWindows = ssWindows.unescape(); | |
| 494 int32_t lISO = 0, lWindows = 0; | |
| 495 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); | |
| 496 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); | |
| 497 UCharsetDetector *csd = ucsdet_open(&status); | |
| 498 const UCharsetMatch *match; | |
| 499 const char *name; | |
| 500 | |
| 501 ucsdet_setText(csd, bWindows, lWindows, &status); | |
| 502 match = ucsdet_detect(csd, &status); | |
| 503 | |
| 504 if (match == NULL) { | |
| 505 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_
errorName(status)); | |
| 506 goto bail; | |
| 507 } | |
| 508 | |
| 509 name = ucsdet_getName(match, &status); | |
| 510 | |
| 511 if (strcmp(name, "windows-1252") != 0) { | |
| 512 errln("English text with C1 bytes does not detect as windows-1252, but a
s %s", name); | |
| 513 } | |
| 514 | |
| 515 ucsdet_setText(csd, bISO, lISO, &status); | |
| 516 match = ucsdet_detect(csd, &status); | |
| 517 | |
| 518 if (match == NULL) { | |
| 519 errln("English text without C1 bytes got no matches."); | |
| 520 goto bail; | |
| 521 } | |
| 522 | |
| 523 name = ucsdet_getName(match, &status); | |
| 524 | |
| 525 if (strcmp(name, "ISO-8859-1") != 0) { | |
| 526 errln("English text without C1 bytes does not detect as ISO-8859-1, but
as %s", name); | |
| 527 } | |
| 528 | |
| 529 bail: | |
| 530 freeBytes(bWindows); | |
| 531 freeBytes(bISO); | |
| 532 | |
| 533 ucsdet_close(csd); | |
| 534 #endif | |
| 535 } | |
| 536 | |
| 537 void CharsetDetectionTest::DetectionTest() | |
| 538 { | |
| 539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
| 540 UErrorCode status = U_ZERO_ERROR; | |
| 541 char path[2048]; | |
| 542 const char *testFilePath = getPath(path, "csdetest.xml"); | |
| 543 | |
| 544 if (testFilePath == NULL) { | |
| 545 return; /* Couldn't get path: error message already output. */ | |
| 546 } | |
| 547 | |
| 548 UXMLParser *parser = UXMLParser::createParser(status); | |
| 549 if (U_FAILURE(status)) { | |
| 550 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); | |
| 551 return; | |
| 552 } | |
| 553 | |
| 554 UXMLElement *root = parser->parseFile(testFilePath, status); | |
| 555 if (!assertSuccess( "parseFile",status)) return; | |
| 556 | |
| 557 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); | |
| 558 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); | |
| 559 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); | |
| 560 | |
| 561 const UXMLElement *testCase; | |
| 562 int32_t tc = 0; | |
| 563 | |
| 564 while((testCase = root->nextChildElement(tc)) != NULL) { | |
| 565 if (testCase->getTagName().compare(test_case) == 0) { | |
| 566 const UnicodeString *id = testCase->getAttribute(id_attr); | |
| 567 const UnicodeString *encodings = testCase->getAttribute(enc_attr); | |
| 568 const UnicodeString text = testCase->getText(TRUE); | |
| 569 int32_t encodingCount; | |
| 570 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCo
unt); | |
| 571 | |
| 572 for(int32_t e = 0; e < encodingCount; e += 1) { | |
| 573 checkEncoding(text, encodingList[e], *id); | |
| 574 } | |
| 575 | |
| 576 delete[] encodingList; | |
| 577 } | |
| 578 } | |
| 579 | |
| 580 delete root; | |
| 581 delete parser; | |
| 582 #endif | |
| 583 } | |
| 584 | |
| 585 void CharsetDetectionTest::IBM424Test() | |
| 586 { | |
| 587 #if !UCONFIG_ONLY_HTML_CONVERSION | |
| 588 UErrorCode status = U_ZERO_ERROR; | |
| 589 | |
| 590 static const UChar chars[] = { | |
| 591 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05
D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, | |
| 592 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05
D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, | |
| 593 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05
DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, | |
| 594 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05
D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, | |
| 595 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05
E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, | |
| 596 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05
D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, | |
| 597 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05
E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, | |
| 598 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05
EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, | |
| 599 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00
22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, | |
| 600 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05
D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, | |
| 601 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05
D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, | |
| 602 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00
20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, | |
| 603 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00
20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, | |
| 604 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05
D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, | |
| 605 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05
DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, | |
| 606 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00
20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, | |
| 607 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05
D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 | |
| 608 }; | |
| 609 | |
| 610 static const UChar chars_reverse[] = { | |
| 611 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05
DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, | |
| 612 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05
E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, | |
| 613 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05
D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, | |
| 614 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05
E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, | |
| 615 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05
DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, | |
| 616 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05
D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, | |
| 617 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05
D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, | |
| 618 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05
DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, | |
| 619 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05
E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, | |
| 620 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05
E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, | |
| 621 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05
E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, | |
| 622 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05
DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, | |
| 623 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05
E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, | |
| 624 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05
D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, | |
| 625 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05
D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, | |
| 626 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00
20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, | |
| 627 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00
20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, | |
| 628 0x0000 | |
| 629 }; | |
| 630 | |
| 631 int32_t bLength = 0, brLength = 0; | |
| 632 | |
| 633 UnicodeString s1(chars); | |
| 634 UnicodeString s2(chars_reverse); | |
| 635 | |
| 636 char *bytes = extractBytes(s1, "IBM424", bLength); | |
| 637 char *bytes_r = extractBytes(s2, "IBM424", brLength); | |
| 638 | |
| 639 UCharsetDetector *csd = ucsdet_open(&status); | |
| 640 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); | |
| 641 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); | |
| 642 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); | |
| 643 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); | |
| 644 if (U_FAILURE(status)) { | |
| 645 errln("Error opening charset detector. - %s", u_errorName(status)); | |
| 646 } | |
| 647 const UCharsetMatch *match; | |
| 648 const char *name; | |
| 649 | |
| 650 ucsdet_setText(csd, bytes, bLength, &status); | |
| 651 match = ucsdet_detect(csd, &status); | |
| 652 | |
| 653 if (match == NULL) { | |
| 654 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no ma
tches. - %s", u_errorName(status)); | |
| 655 goto bail; | |
| 656 } | |
| 657 | |
| 658 name = ucsdet_getName(match, &status); | |
| 659 if (strcmp(name, "IBM424_rtl") != 0) { | |
| 660 errln("Encoding detection failure for IBM424_rtl: got %s", name); | |
| 661 } | |
| 662 | |
| 663 ucsdet_setText(csd, bytes_r, brLength, &status); | |
| 664 match = ucsdet_detect(csd, &status); | |
| 665 | |
| 666 if (match == NULL) { | |
| 667 errln("Encoding detection failure for IBM424_ltr: got no matches."); | |
| 668 goto bail; | |
| 669 } | |
| 670 | |
| 671 name = ucsdet_getName(match, &status); | |
| 672 if (strcmp(name, "IBM424_ltr") != 0) { | |
| 673 errln("Encoding detection failure for IBM424_ltr: got %s", name); | |
| 674 } | |
| 675 | |
| 676 bail: | |
| 677 freeBytes(bytes); | |
| 678 freeBytes(bytes_r); | |
| 679 ucsdet_close(csd); | |
| 680 #endif | |
| 681 } | |
| 682 | |
| 683 void CharsetDetectionTest::IBM420Test() | |
| 684 { | |
| 685 #if !UCONFIG_ONLY_HTML_CONVERSION | |
| 686 UErrorCode status = U_ZERO_ERROR; | |
| 687 | |
| 688 static const UChar chars[] = { | |
| 689 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F,
0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, | |
| 690 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, | |
| 691 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627,
0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, | |
| 692 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645,
0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, | |
| 693 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627,
0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, | |
| 694 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, | |
| 695 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644,
0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, | |
| 696 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637,
0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, | |
| 697 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641,
0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, | |
| 698 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020,
0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, | |
| 699 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626,
0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, | |
| 700 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020,
0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, | |
| 701 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C,
0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, | |
| 702 0x0000 | |
| 703 }; | |
| 704 static const UChar chars_reverse[] = { | |
| 705 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627,
0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, | |
| 706 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631,
0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, | |
| 707 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627,
0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, | |
| 708 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646,
0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, | |
| 709 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F,
0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, | |
| 710 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A,
0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, | |
| 711 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648,
0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, | |
| 712 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644,
0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, | |
| 713 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645,
0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, | |
| 714 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020,
0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, | |
| 715 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020,
0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, | |
| 716 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646,
0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, | |
| 717 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646,
0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, | |
| 718 0x0000, | |
| 719 }; | |
| 720 | |
| 721 int32_t bLength = 0, brLength = 0; | |
| 722 | |
| 723 UnicodeString s1(chars); | |
| 724 UnicodeString s2(chars_reverse); | |
| 725 | |
| 726 char *bytes = extractBytes(s1, "IBM420", bLength); | |
| 727 char *bytes_r = extractBytes(s2, "IBM420", brLength); | |
| 728 | |
| 729 UCharsetDetector *csd = ucsdet_open(&status); | |
| 730 if (U_FAILURE(status)) { | |
| 731 errln("Error opening charset detector. - %s", u_errorName(status)); | |
| 732 } | |
| 733 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); | |
| 734 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); | |
| 735 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); | |
| 736 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); | |
| 737 const UCharsetMatch *match; | |
| 738 const char *name; | |
| 739 | |
| 740 ucsdet_setText(csd, bytes, bLength, &status); | |
| 741 match = ucsdet_detect(csd, &status); | |
| 742 | |
| 743 if (match == NULL) { | |
| 744 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no ma
tches. - %s", u_errorName(status)); | |
| 745 goto bail; | |
| 746 } | |
| 747 | |
| 748 name = ucsdet_getName(match, &status); | |
| 749 if (strcmp(name, "IBM420_rtl") != 0) { | |
| 750 errln("Encoding detection failure for IBM420_rtl: got %s\n", name); | |
| 751 } | |
| 752 | |
| 753 ucsdet_setText(csd, bytes_r, brLength, &status); | |
| 754 match = ucsdet_detect(csd, &status); | |
| 755 | |
| 756 if (match == NULL) { | |
| 757 errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); | |
| 758 goto bail; | |
| 759 } | |
| 760 | |
| 761 name = ucsdet_getName(match, &status); | |
| 762 if (strcmp(name, "IBM420_ltr") != 0) { | |
| 763 errln("Encoding detection failure for IBM420_ltr: got %s\n", name); | |
| 764 } | |
| 765 | |
| 766 bail: | |
| 767 freeBytes(bytes); | |
| 768 freeBytes(bytes_r); | |
| 769 ucsdet_close(csd); | |
| 770 #endif | |
| 771 } | |
| 772 | |
| 773 | |
| 774 void CharsetDetectionTest::Ticket6394Test() { | |
| 775 #if !UCONFIG_NO_CONVERSION | |
| 776 const char charText[] = "Here is some random English text that should be de
tected as ISO-8859-1." | |
| 777 "Ticket 6394 claims that ISO-8859-1 will appear in
the array of detected " | |
| 778 "encodings more than once. The hop through Unicode
String is for platforms " | |
| 779 "where this char * string is be EBCDIC and needs co
nversion to Latin1."; | |
| 780 char latin1Text[sizeof(charText)]; | |
| 781 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(la
tin1Text), "ISO-8859-1"); | |
| 782 | |
| 783 UErrorCode status = U_ZERO_ERROR; | |
| 784 UCharsetDetector *csd = ucsdet_open(&status); | |
| 785 ucsdet_setText(csd, latin1Text, -1, &status); | |
| 786 if (U_FAILURE(status)) { | |
| 787 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er
rorName(status)); | |
| 788 return; | |
| 789 } | |
| 790 | |
| 791 int32_t matchCount = 0; | |
| 792 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); | |
| 793 if (U_FAILURE(status)) { | |
| 794 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er
rorName(status)); | |
| 795 return; | |
| 796 } | |
| 797 | |
| 798 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. | |
| 799 int32_t i; | |
| 800 for (i=0; i<matchCount; i++) { | |
| 801 UnicodeString charSetName(ucsdet_getName(matches[i], &status)); | |
| 802 if (U_FAILURE(status)) { | |
| 803 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __L
INE__, u_errorName(status), i); | |
| 804 status = U_ZERO_ERROR; | |
| 805 } | |
| 806 if (setOfCharsetNames.contains(charSetName)) { | |
| 807 errln("Fail at file %s, line %d ", __FILE__, __LINE__); | |
| 808 errln(UnicodeString(" Duplicate charset name = ") + charSetName); | |
| 809 } | |
| 810 setOfCharsetNames.add(charSetName); | |
| 811 } | |
| 812 ucsdet_close(csd); | |
| 813 #endif | |
| 814 } | |
| 815 | |
| 816 | |
| 817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish b
etween | |
| 818 // similar Windows and non-Windows SBCS encodings. State was kept
in the shared | |
| 819 // Charset Recognizer objects, and could be overwritten. | |
| 820 void CharsetDetectionTest::Ticket6954Test() { | |
| 821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING | |
| 822 UErrorCode status = U_ZERO_ERROR; | |
| 823 UnicodeString sISO = "This is a small sample of some English text. Just enou
gh to be sure that it detects correctly."; | |
| 824 UnicodeString ssWindows("This is another small sample of some English text.
Just enough to be sure that it detects correctly." | |
| 825 "It also includes some \\u201CC1\\u201D bytes.", -1,
US_INV); | |
| 826 UnicodeString sWindows = ssWindows.unescape(); | |
| 827 int32_t lISO = 0, lWindows = 0; | |
| 828 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); | |
| 829 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); | |
| 830 | |
| 831 // First do a plain vanilla detect of 1252 text | |
| 832 | |
| 833 UCharsetDetector *csd1 = ucsdet_open(&status); | |
| 834 ucsdet_setText(csd1, bWindows, lWindows, &status); | |
| 835 const UCharsetMatch *match1 = ucsdet_detect(csd1, &status); | |
| 836 const char *name1 = ucsdet_getName(match1, &status); | |
| 837 TEST_ASSERT_SUCCESS(status); | |
| 838 TEST_ASSERT(strcmp(name1, "windows-1252")==0); | |
| 839 | |
| 840 // Next, using a completely separate detector, detect some 8859-1 text | |
| 841 | |
| 842 UCharsetDetector *csd2 = ucsdet_open(&status); | |
| 843 ucsdet_setText(csd2, bISO, lISO, &status); | |
| 844 const UCharsetMatch *match2 = ucsdet_detect(csd2, &status); | |
| 845 const char *name2 = ucsdet_getName(match2, &status); | |
| 846 TEST_ASSERT_SUCCESS(status); | |
| 847 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0); | |
| 848 | |
| 849 // Recheck the 1252 results from the first detector, which should not have b
een | |
| 850 // altered by the use of a different detector. | |
| 851 | |
| 852 name1 = ucsdet_getName(match1, &status); | |
| 853 TEST_ASSERT_SUCCESS(status); | |
| 854 TEST_ASSERT(strcmp(name1, "windows-1252")==0); | |
| 855 | |
| 856 ucsdet_close(csd1); | |
| 857 ucsdet_close(csd2); | |
| 858 freeBytes(bISO); | |
| 859 freeBytes(bWindows); | |
| 860 #endif | |
| 861 } | |
| OLD | NEW |