icu46/source/test/intltest/csdetest.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/intltest/csdetest.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8

	9 #include "unicode/utypes.h"

	10 #include "unicode/ucsdet.h"

	11 #include "unicode/ucnv.h"

	12 #include "unicode/unistr.h"

	13 #include "unicode/putil.h"

	14 #include "unicode/uniset.h"

	15

	16 #include "intltest.h"

	17 #include "csdetest.h"

	18

	19 #include "xmlparser.h"

	20

	21 #include <stdlib.h>

	22 #include <string.h>

	23

	24 #ifdef DEBUG_DETECT

	25 #include <stdio.h>

	26 #endif

	27

	// Number of elements in a true array (not a pointer).
	#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

	// Allocate / release an uninitialised array of `count` objects of `type`.
	// Plain malloc/free are used; the uprv_ variants are left commented out.
	#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
	#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

	// Separator characters handed to split().
	#define CH_SPACE 0x0020
	#define CH_SLASH 0x002F

	35

	36 //---------------------------------------------------------------------------

	37 //

	38 // Test class boilerplate

	39 //

	40 //---------------------------------------------------------------------------

	41 CharsetDetectionTest::CharsetDetectionTest()

	42 {

	43 }

	44

	45

	46 CharsetDetectionTest::~CharsetDetectionTest()

	47 {

	48 }

	49

	50

	51

	52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char * &name, char* /par/ )

	53 {

	54 if (exec) logln("TestSuite CharsetDetectionTest: ");

	55 switch (index) {

	56 case 0: name = "ConstructionTest";

	57 if (exec) ConstructionTest();

	58 break;

	59

	60 case 1: name = "UTF8Test";

	61 if (exec) UTF8Test();

	62 break;

	63

	64 case 2: name = "UTF16Test";

	65 if (exec) UTF16Test();

	66 break;

	67

	68 case 3: name = "C1BytesTest";

	69 if (exec) C1BytesTest();

	70 break;

	71

	72 case 4: name = "InputFilterTest";

	73 if (exec) InputFilterTest();

	74 break;

	75

	76 case 5: name = "DetectionTest";

	77 if (exec) DetectionTest();

	78 break;

	79 #if !UCONFIG_NO_LEGACY_CONVERSION

	80 case 6: name = "IBM424Test";

	81 if (exec) IBM424Test();

	82 break;

	83

	84 case 7: name = "IBM420Test";

	85 if (exec) IBM420Test();

	86 break;

	87 #else

	88 case 6:

	89 case 7: name = "skip"; break;

	90 #endif

	91 case 8: name = "Ticket6394Test";

	92 if (exec) Ticket6394Test();

	93 break;

	94

	95 default: name = "";

	96 break; //needed to end loop

	97 }

	98 }

	99

	100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)

	101 {

	102 int32_t offset = -1;

	103

	104 splits = 1;

	105 while((offset = src.indexOf(ch, offset + 1)) >= 0) {

	106 splits += 1;

	107 }

	108

	109 UnicodeString *result = new UnicodeString[splits];

	110

	111 int32_t start = 0;

	112 int32_t split = 0;

	113 int32_t end;

	114

	115 while((end = src.indexOf(ch, start)) >= 0) {

	116 src.extractBetween(start, end, result[split++]);

	117 start = end + 1;

	118 }

	119

	120 src.extractBetween(start, src.length(), result[split]);

	121

	122 return result;

	123 }

	124

	125 static char extractBytes(const UnicodeString &source, const char codepage, int 32_t &length)

	126 {

	127 int32_t sLength = source.length();

	128 char *bytes = NULL;

	129

	130 length = source.extract(0, sLength, NULL, codepage);

	131

	132 if (length > 0) {

	133 bytes = NEW_ARRAY(char, length + 1);

	134 source.extract(0, sLength, bytes, codepage);

	135 }

	136

	137 return bytes;

	138 }

	139

	140 static void freeBytes(char *bytes)

	141 {

	142 DELETE_ARRAY(bytes);

	143 }

	144

	145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)

	146 {

	147 int32_t splits = 0;

	148 int32_t testLength = testString.length();

	149 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);

	150 UErrorCode status = U_ZERO_ERROR;

	151 int32_t cpLength = eSplit[0].length();

	152 char codepage[64];

	153

	154 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);

	155 codepage[cpLength] = '\0';

	156

	157 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));

	158

	159 int32_t byteLength = 0;

	160 char *bytes = extractBytes(testString, codepage, byteLength);

	161

	162 if (bytes == NULL) {

	163 #if !UCONFIG_NO_LEGACY_CONVERSION

	164 errln("Can't open a " + encoding + " converter for " + id);

	165 #endif

	166 return;

	167 }

	168

	169 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);

	170

	171 int32_t matchCount = 0;

	172 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount , &status);

	173

	174

	175 UnicodeString name(ucsdet_getName(matches[0], &status));

	176 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));

	177 UChar *decoded = NULL;

	178 int32_t dLength = 0;

	179

	180 if (matchCount == 0) {

	181 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");

	182 goto bail;

	183 }

	184

	185 if (name.compare(eSplit[0]) != 0) {

	186 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

	187

	188 #ifdef DEBUG_DETECT

	189 for (int32_t m = 0; m < matchCount; m += 1) {

	190 const char *name = ucsdet_getName(matches[m], &status);

	191 const char *lang = ucsdet_getLanguage(matches[m], &status);

	192 int32_t confidence = ucsdet_getConfidence(matches[m], &status);

	193

	194 printf("%s (%s) %d\n", name, lang, confidence);

	195 }

	196 #endif

	197 goto bail;

	198 }

	199

	200 if (splits > 1 && lang.compare(eSplit[1]) != 0) {

	201 errln("Language detection failure for " + id + ", " + eSplit[0] + ": exp ected " + eSplit[1] + ", got " + lang);

	202 goto bail;

	203 }

	204

	205 decoded = NEW_ARRAY(UChar, testLength);

	206 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);

	207

	208 if (testString.compare(decoded, dLength) != 0) {

	209 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() d idn't yeild the original string.");

	210

	211 #ifdef DEBUG_DETECT

	212 for(int32_t i = 0; i < testLength; i += 1) {

	213 if(testString[i] != decoded[i]) {

	214 printf("Strings differ at byte %d\n", i);

	215 break;

	216 }

	217 }

	218 #endif

	219

	220 }

	221

	222 DELETE_ARRAY(decoded);

	223

	224 bail:

	225 freeBytes(bytes);

	226 delete[] eSplit;

	227 }

	228

	229 const char CharsetDetectionTest::getPath(char buffer[2048], const char filenam e) {

	230 UErrorCode status = U_ZERO_ERROR;

	231 const char *testDataDirectory = IntlTest::getSourceTestData(status);

	232

	233 if (U_FAILURE(status)) {

	234 errln("ERROR: getPath() failed - %s", u_errorName(status));

	235 return NULL;

	236 }

	237

	238 strcpy(buffer, testDataDirectory);

	239 strcat(buffer, filename);

	240 return buffer;

	241 }

	242

	243 void CharsetDetectionTest::ConstructionTest()

	244 {

	245 IcuTestErrorCode status(*this, "ConstructionTest");

	246 LocalUCharsetDetectorPointer csd(ucsdet_open(status));

	247 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), s tatus));

	248 int32_t count = uenum_count(e.getAlias(), status);

	249

	250 #ifdef DEBUG_DETECT

	251 printf("There are %d recognizers.\n", count);

	252 #endif

	253

	254 for(int32_t i = 0; i < count; i += 1) {

	255 int32_t length;

	256 const char *name = uenum_next(e.getAlias(), &length, status);

	257

	258 if(name == NULL \|\| length <= 0) {

	259 errln("ucsdet_getAllDetectableCharsets() returned a null or empty na me!");

	260 }

	261

	262 #ifdef DEBUG_DETECT

	263 printf("%s\n", name);

	264 #endif

	265 }

	266 }

	267

	268 void CharsetDetectionTest::UTF8Test()

	269 {

	270 UErrorCode status = U_ZERO_ERROR;

	271 UnicodeString ss = "This is a string with some non-ascii characters that wil l "

	272 "be converted to UTF-8, then shoved through the detection process. "

	273 "\\u0391\\u0392\\u0393\\u0394\\u0395"

	274 "Sure would be nice if our source could contain Unicode d irectly!";

	275 UnicodeString s = ss.unescape();

	276 int32_t byteLength = 0, sLength = s.length();

	277 char *bytes = extractBytes(s, "UTF-8", byteLength);

	278 UCharsetDetector *csd = ucsdet_open(&status);

	279 const UCharsetMatch *match;

	280 UChar *detected = NEW_ARRAY(UChar, sLength);

	281

	282 ucsdet_setText(csd, bytes, byteLength, &status);

	283 match = ucsdet_detect(csd, &status);

	284

	285 if (match == NULL) {

	286 errln("Detection failure for UTF-8: got no matches.");

	287 goto bail;

	288 }

	289

	290 ucsdet_getUChars(match, detected, sLength, &status);

	291

	292 if (s.compare(detected, sLength) != 0) {

	293 errln("Round-trip test failed!");

	294 }

	295

	296 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

	297

	298 bail:

	299 DELETE_ARRAY(detected);

	300 freeBytes(bytes);

	301 ucsdet_close(csd);

	302 }

	303

	304 void CharsetDetectionTest::UTF16Test()

	305 {

	306 UErrorCode status = U_ZERO_ERROR;

	307 /* Notice the BOM on the start of this string */

	308 UChar chars[] = {

	309 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,

	310 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,

	311 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,

	312 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,

	313 0x064a, 0x062a, 0x0000};

	314 UnicodeString s(chars);

	315 int32_t beLength = 0, leLength = 0;

	316 char *beBytes = extractBytes(s, "UTF-16BE", beLength);

	317 char *leBytes = extractBytes(s, "UTF-16LE", leLength);

	318 UCharsetDetector *csd = ucsdet_open(&status);

	319 const UCharsetMatch *match;

	320 const char *name;

	321 int32_t conf;

	322

	323 ucsdet_setText(csd, beBytes, beLength, &status);

	324 match = ucsdet_detect(csd, &status);

	325

	326 if (match == NULL) {

	327 errln("Encoding detection failure for UTF-16BE: got no matches.");

	328 goto try_le;

	329 }

	330

	331 name = ucsdet_getName(match, &status);

	332 conf = ucsdet_getConfidence(match, &status);

	333

	334 if (strcmp(name, "UTF-16BE") != 0) {

	335 errln("Encoding detection failure for UTF-16BE: got %s", name);

	336 goto try_le; // no point in looking at confidence if we got the wrong ch aracter set.

	337 }

	338

	339 if (conf != 100) {

	340 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);

	341 }

	342

	343 try_le:

	344 ucsdet_setText(csd, leBytes, leLength, &status);

	345 match = ucsdet_detect(csd, &status);

	346

	347 if (match == NULL) {

	348 errln("Encoding detection failure for UTF-16LE: got no matches.");

	349 goto bail;

	350 }

	351

	352 name = ucsdet_getName(match, &status);

	353 conf = ucsdet_getConfidence(match, &status);

	354

	355

	356 if (strcmp(name, "UTF-16LE") != 0) {

	357 errln("Enconding detection failure for UTF-16LE: got %s", name);

	358 goto bail; // no point in looking at confidence if we got the wrong char acter set.

	359 }

	360

	361 if (conf != 100) {

	362 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);

	363 }

	364

	365 bail:

	366 freeBytes(leBytes);

	367 freeBytes(beBytes);

	368 ucsdet_close(csd);

	369 }

	370

	371 void CharsetDetectionTest::InputFilterTest()

	372 {

	373 UErrorCode status = U_ZERO_ERROR;

	374 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\ u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";

	375 UnicodeString s = ss.unescape();

	376 int32_t byteLength = 0;

	377 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);

	378 UCharsetDetector *csd = ucsdet_open(&status);

	379 const UCharsetMatch *match;

	380 const char lang, name;

	381

	382 ucsdet_enableInputFilter(csd, TRUE);

	383

	384 if (!ucsdet_isInputFilterEnabled(csd)) {

	385 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!" );

	386 }

	387

	388

	389 ucsdet_setText(csd, bytes, byteLength, &status);

	390 match = ucsdet_detect(csd, &status);

	391

	392 if (match == NULL) {

	393 errln("Turning on the input filter resulted in no matches.");

	394 goto turn_off;

	395 }

	396

	397 name = ucsdet_getName(match, &status);

	398

	399 if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {

	400 errln("Turning on the input filter resulted in %s rather than ISO-8859-1 .", name);

	401 } else {

	402 lang = ucsdet_getLanguage(match, &status);

	403

	404 if (lang == NULL \|\| strcmp(lang, "fr") != 0) {

	405 errln("Input filter did not strip markup!");

	406 }

	407 }

	408

	409 turn_off:

	410 ucsdet_enableInputFilter(csd, FALSE);

	411 ucsdet_setText(csd, bytes, byteLength, &status);

	412 match = ucsdet_detect(csd, &status);

	413

	414 if (match == NULL) {

	415 errln("Turning off the input filter resulted in no matches.");

	416 goto bail;

	417 }

	418

	419 name = ucsdet_getName(match, &status);

	420

	421 if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {

	422 errln("Turning off the input filter resulted in %s rather than ISO-8859- 1.", name);

	423 } else {

	424 lang = ucsdet_getLanguage(match, &status);

	425

	426 if (lang == NULL \|\| strcmp(lang, "en") != 0) {

	427 errln("Unfiltered input did not detect as English!");

	428 }

	429 }

	430

	431 bail:

	432 freeBytes(bytes);

	433 ucsdet_close(csd);

	434 }

	435

	436 void CharsetDetectionTest::C1BytesTest()

	437 {

	438 #if !UCONFIG_NO_LEGACY_CONVERSION

	439 UErrorCode status = U_ZERO_ERROR;

	440 UnicodeString sISO = "This is a small sample of some English text. Just enou gh to be sure that it detects correctly.";

	441 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC 1\\u201D bytes.", -1, US_INV);

	442 UnicodeString sWindows = ssWindows.unescape();

	443 int32_t lISO = 0, lWindows = 0;

	444 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);

	445 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);

	446 UCharsetDetector *csd = ucsdet_open(&status);

	447 const UCharsetMatch *match;

	448 const char *name;

	449

	450 ucsdet_setText(csd, bWindows, lWindows, &status);

	451 match = ucsdet_detect(csd, &status);

	452

	453 if (match == NULL) {

	454 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_ errorName(status));

	455 goto bail;

	456 }

	457

	458 name = ucsdet_getName(match, &status);

	459

	460 if (strcmp(name, "windows-1252") != 0) {

	461 errln("English text with C1 bytes does not detect as windows-1252, but a s %s", name);

	462 }

	463

	464 ucsdet_setText(csd, bISO, lISO, &status);

	465 match = ucsdet_detect(csd, &status);

	466

	467 if (match == NULL) {

	468 errln("English text without C1 bytes got no matches.");

	469 goto bail;

	470 }

	471

	472 name = ucsdet_getName(match, &status);

	473

	474 if (strcmp(name, "ISO-8859-1") != 0) {

	475 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);

	476 }

	477

	478 bail:

	479 freeBytes(bWindows);

	480 freeBytes(bISO);

	481

	482 ucsdet_close(csd);

	483 #endif

	484 }

	485

	486 void CharsetDetectionTest::DetectionTest()

	487 {

	488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	489 UErrorCode status = U_ZERO_ERROR;

	490 char path[2048];

	491 const char *testFilePath = getPath(path, "csdetest.xml");

	492

	493 if (testFilePath == NULL) {

	494 return; /* Couldn't get path: error message already output. */

	495 }

	496

	497 UXMLParser *parser = UXMLParser::createParser(status);

	498 if (U_FAILURE(status)) {

	499 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));

	500 return;

	501 }

	502

	503 UXMLElement *root = parser->parseFile(testFilePath, status);

	504 if (!assertSuccess( "parseFile",status)) return;

	505

	506 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");

	507 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");

	508 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");

	509

	510 const UXMLElement *testCase;

	511 int32_t tc = 0;

	512

	513 while((testCase = root->nextChildElement(tc)) != NULL) {

	514 if (testCase->getTagName().compare(test_case) == 0) {

	515 const UnicodeString *id = testCase->getAttribute(id_attr);

	516 const UnicodeString *encodings = testCase->getAttribute(enc_attr);

	517 const UnicodeString text = testCase->getText(TRUE);

	518 int32_t encodingCount;

	519 UnicodeString encodingList = split(encodings, CH_SPACE, encodingCo unt);

	520

	521 for(int32_t e = 0; e < encodingCount; e += 1) {

	522 checkEncoding(text, encodingList[e], *id);

	523 }

	524

	525 delete[] encodingList;

	526 }

	527 }

	528

	529 delete root;

	530 delete parser;

	531 #endif

	532 }

	533

	534 void CharsetDetectionTest::IBM424Test()

	535 {

	536 UErrorCode status = U_ZERO_ERROR;

	537

	538 static const UChar chars[] = {

	539 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05 D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,

	540 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05 D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,

	541 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05 DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,

	542 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05 D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,

	543 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05 E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,

	544 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05 D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,

	545 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05 E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,

	546 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05 EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,

	547 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00 22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,

	548 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05 D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,

	549 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05 D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,

	550 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00 20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,

	551 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00 20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,

	552 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05 D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,

	553 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05 DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,

	554 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00 20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,

	555 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05 D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000

	556 };

	557

	558 static const UChar chars_reverse[] = {

	559 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05 DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,

	560 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05 E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,

	561 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05 D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,

	562 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05 E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,

	563 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05 DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,

	564 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05 D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,

	565 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05 D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,

	566 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05 DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,

	567 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05 E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,

	568 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05 E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,

	569 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05 E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,

	570 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05 DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,

	571 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05 E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,

	572 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05 D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,

	573 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05 D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,

	574 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00 20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,

	575 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00 20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,

	576 0x0000

	577 };

	578

	579 int32_t bLength = 0, brLength = 0;

	580

	581 UnicodeString s1(chars);

	582 UnicodeString s2(chars_reverse);

	583

	584 char *bytes = extractBytes(s1, "IBM424", bLength);

	585 char *bytes_r = extractBytes(s2, "IBM424", brLength);

	586

	587 UCharsetDetector *csd = ucsdet_open(&status);

	588 if (U_FAILURE(status)) {

	589 errln("Error opening charset detector. - %s", u_errorName(status));

	590 }

	591 const UCharsetMatch *match;

	592 const char *name;

	593

	594 ucsdet_setText(csd, bytes, bLength, &status);

	595 match = ucsdet_detect(csd, &status);

	596

	597 if (match == NULL) {

	598 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no ma tches. - %s", u_errorName(status));

	599 goto bail;

	600 }

	601

	602 name = ucsdet_getName(match, &status);

	603 if (strcmp(name, "IBM424_rtl") != 0) {

	604 errln("Encoding detection failure for IBM424_rtl: got %s", name);

	605 }

	606

	607 ucsdet_setText(csd, bytes_r, brLength, &status);

	608 match = ucsdet_detect(csd, &status);

	609

	610 if (match == NULL) {

	611 errln("Encoding detection failure for IBM424_ltr: got no matches.");

	612 goto bail;

	613 }

	614

	615 name = ucsdet_getName(match, &status);

	616 if (strcmp(name, "IBM424_ltr") != 0) {

	617 errln("Encoding detection failure for IBM424_ltr: got %s", name);

	618 }

	619

	620 bail:

	621 freeBytes(bytes);

	622 freeBytes(bytes_r);

	623 ucsdet_close(csd);

	624 }

	625

	626 void CharsetDetectionTest::IBM420Test()

	627 {

	628 UErrorCode status = U_ZERO_ERROR;

	629

	630 static const UChar chars[] = {

	631 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,

	632 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,

	633 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,

	634 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,

	635 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,

	636 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,

	637 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,

	638 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,

	639 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,

	640 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,

	641 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,

	642 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,

	643 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,

	644 0x0000

	645 };

	646 static const UChar chars_reverse[] = {

	647 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,

	648 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,

	649 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,

	650 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,

	651 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,

	652 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,

	653 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,

	654 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,

	655 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,

	656 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,

	657 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,

	658 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,

	659 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,

	660 0x0000,

	661 };

	662

	663 int32_t bLength = 0, brLength = 0;

	664

	665 UnicodeString s1(chars);

	666 UnicodeString s2(chars_reverse);

	667

	668 char *bytes = extractBytes(s1, "IBM420", bLength);

	669 char *bytes_r = extractBytes(s2, "IBM420", brLength);

	670

	671 UCharsetDetector *csd = ucsdet_open(&status);

	672 if (U_FAILURE(status)) {

	673 errln("Error opening charset detector. - %s", u_errorName(status));

	674 }

	675 const UCharsetMatch *match;

	676 const char *name;

	677

	678 ucsdet_setText(csd, bytes, bLength, &status);

	679 match = ucsdet_detect(csd, &status);

	680

	681 if (match == NULL) {

	682 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no ma tches. - %s", u_errorName(status));

	683 goto bail;

	684 }

	685

	686 name = ucsdet_getName(match, &status);

	687 if (strcmp(name, "IBM420_rtl") != 0) {

	688 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);

	689 }

	690

	691 ucsdet_setText(csd, bytes_r, brLength, &status);

	692 match = ucsdet_detect(csd, &status);

	693

	694 if (match == NULL) {

	695 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");

	696 goto bail;

	697 }

	698

	699 name = ucsdet_getName(match, &status);

	700 if (strcmp(name, "IBM420_ltr") != 0) {

	701 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);

	702 }

	703

	704 bail:

	705 freeBytes(bytes);

	706 freeBytes(bytes_r);

	707 ucsdet_close(csd);

	708 }

	709

	710

	711 void CharsetDetectionTest::Ticket6394Test() {

	712 #if !UCONFIG_NO_CONVERSION

	713 const char charText[] = "Here is some random English text that should be de tected as ISO-8859-1."

	714 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "

	715 "encodings more than once. The hop through Unicode String is for platforms "

	716 "where this char * string is be EBCDIC and needs co nversion to Latin1.";

	717 char latin1Text[sizeof(charText)];

	718 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(la tin1Text), "ISO-8859-1");

	719

	720 UErrorCode status = U_ZERO_ERROR;

	721 UCharsetDetector *csd = ucsdet_open(&status);

	722 ucsdet_setText(csd, latin1Text, -1, &status);

	723 if (U_FAILURE(status)) {

	724 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er rorName(status));

	725 return;

	726 }

	727

	728 int32_t matchCount = 0;

	729 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);

	730 if (U_FAILURE(status)) {

	731 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er rorName(status));

	732 return;

	733 }

	734

	735 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.

	736 int32_t i;

	737 for (i=0; i<matchCount; i++) {

	738 UnicodeString charSetName(ucsdet_getName(matches[i], &status));

	739 if (U_FAILURE(status)) {

	740 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __L INE__, u_errorName(status), i);

	741 status = U_ZERO_ERROR;

	742 }

	743 if (setOfCharsetNames.contains(charSetName)) {

	744 errln("Fail at file %s, line %d ", __FILE__, __LINE__);

	745 errln(UnicodeString(" Duplicate charset name = ") + charSetName);

	746 }

	747 setOfCharsetNames.add(charSetName);

	748 }

	749 ucsdet_close(csd);

	750 #endif

	751 }

	752

OLD	NEW

« no previous file with comments | « icu46/source/test/intltest/csdetest.h ('k') | icu46/source/test/intltest/currcoll.h » ('j') | no next file with comments »