source/test/intltest/csdetest.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/intltest/csdetest.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 **********************************************************************

3 * Copyright (C) 2005-2015, International Business Machines

4 * Corporation and others. All Rights Reserved.

5 **********************************************************************

6 */

7

8

9 #include "unicode/utypes.h"

10 #include "unicode/ucsdet.h"

11 #include "unicode/ucnv.h"

12 #include "unicode/unistr.h"

13 #include "unicode/putil.h"

14 #include "unicode/uniset.h"

15

16 #include "intltest.h"

17 #include "csdetest.h"

18

19 #include "xmlparser.h"

20

21 #include <stdlib.h>

22 #include <string.h>

23

24 #ifdef DEBUG_DETECT

25 #include <stdio.h>

26 #endif

27

// Number of elements in a true array (must not be used on a pointer).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw storage helpers for `count` objects of `type`. They call plain
// malloc()/free(); the uprv_ prefix is commented out.
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiters used when splitting encoding lists and "encoding/language" pairs.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (with file and line) when the condition is false.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure and returns from the enclosing void function when errcode
// is a failure code. NOTE: the early return skips any cleanup that follows it.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}

42

43

44 //---------------------------------------------------------------------------

45 //

46 // Test class boilerplate

47 //

48 //---------------------------------------------------------------------------

49 CharsetDetectionTest::CharsetDetectionTest()

50 {

51 }

52

53

54 CharsetDetectionTest::~CharsetDetectionTest()

55 {

56 }

57

58

59

60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char * &name, char* /par/ )

61 {

62 if (exec) logln("TestSuite CharsetDetectionTest: ");

63 switch (index) {

64 case 0: name = "ConstructionTest";

65 if (exec) ConstructionTest();

66 break;

67

68 case 1: name = "UTF8Test";

69 if (exec) UTF8Test();

70 break;

71

72 case 2: name = "UTF16Test";

73 if (exec) UTF16Test();

74 break;

75

76 case 3: name = "C1BytesTest";

77 if (exec) C1BytesTest();

78 break;

79

80 case 4: name = "InputFilterTest";

81 if (exec) InputFilterTest();

82 break;

83

84 case 5: name = "DetectionTest";

85 if (exec) DetectionTest();

86 break;

87 #if !UCONFIG_NO_LEGACY_CONVERSION

88 case 6: name = "IBM424Test";

89 if (exec) IBM424Test();

90 break;

91

92 case 7: name = "IBM420Test";

93 if (exec) IBM420Test();

94 break;

95 #else

96 case 6:

97 case 7: name = "skip"; break;

98 #endif

99 case 8: name = "Ticket6394Test";

100 if (exec) Ticket6394Test();

101 break;

102

103 case 9: name = "Ticket6954Test";

104 if (exec) Ticket6954Test();

105 break;

106

107 default: name = "";

108 break; //needed to end loop

109 }

110 }

111

112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)

113 {

114 int32_t offset = -1;

115

116 splits = 1;

117 while((offset = src.indexOf(ch, offset + 1)) >= 0) {

118 splits += 1;

119 }

120

121 UnicodeString *result = new UnicodeString[splits];

122

123 int32_t start = 0;

124 int32_t split = 0;

125 int32_t end;

126

127 while((end = src.indexOf(ch, start)) >= 0) {

128 src.extractBetween(start, end, result[split++]);

129 start = end + 1;

130 }

131

132 src.extractBetween(start, src.length(), result[split]);

133

134 return result;

135 }

136

137 static char extractBytes(const UnicodeString &source, const char codepage, int 32_t &length)

138 {

139 int32_t sLength = source.length();

140 char *bytes = NULL;

141

142 length = source.extract(0, sLength, NULL, codepage);

143

144 if (length > 0) {

145 bytes = NEW_ARRAY(char, length + 1);

146 source.extract(0, sLength, bytes, codepage);

147 }

148

149 return bytes;

150 }

151

/** Releases a buffer returned by extractBytes(). */
static void freeBytes(char *bytes)
{
    // Same call DELETE_ARRAY expands to.
    free((void *) (bytes));
}

156

157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)

158 {

159 int32_t splits = 0;

160 int32_t testLength = testString.length();

161 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);

162 UErrorCode status = U_ZERO_ERROR;

163 int32_t cpLength = eSplit[0].length();

164 char codepage[64];

165

166 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);

167 codepage[cpLength] = '\0';

168

169 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));

170

171 int32_t byteLength = 0;

172 char *bytes = extractBytes(testString, codepage, byteLength);

173

174 if (bytes == NULL) {

175 #if !UCONFIG_NO_LEGACY_CONVERSION

176 dataerrln("Can't open a " + encoding + " converter for " + id);

177 #endif

178 return;

179 }

180

181 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);

182

183 int32_t matchCount = 0;

184 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount , &status);

185

186

187 UnicodeString name(ucsdet_getName(matches[0], &status));

188 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));

189 UChar *decoded = NULL;

190 int32_t dLength = 0;

191

192 if (matchCount == 0) {

193 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");

194 goto bail;

195 }

196

197 if (name.compare(eSplit[0]) != 0) {

198 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

199

200 #ifdef DEBUG_DETECT

201 for (int32_t m = 0; m < matchCount; m += 1) {

202 const char *name = ucsdet_getName(matches[m], &status);

203 const char *lang = ucsdet_getLanguage(matches[m], &status);

204 int32_t confidence = ucsdet_getConfidence(matches[m], &status);

205

206 printf("%s (%s) %d\n", name, lang, confidence);

207 }

208 #endif

209 goto bail;

210 }

211

212 if (splits > 1 && lang.compare(eSplit[1]) != 0) {

213 errln("Language detection failure for " + id + ", " + eSplit[0] + ": exp ected " + eSplit[1] + ", got " + lang);

214 goto bail;

215 }

216

217 decoded = NEW_ARRAY(UChar, testLength);

218 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);

219

220 if (testString.compare(decoded, dLength) != 0) {

221 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() d idn't yeild the original string.");

222

223 #ifdef DEBUG_DETECT

224 for(int32_t i = 0; i < testLength; i += 1) {

225 if(testString[i] != decoded[i]) {

226 printf("Strings differ at byte %d\n", i);

227 break;

228 }

229 }

230 #endif

231

232 }

233

234 DELETE_ARRAY(decoded);

235

236 bail:

237 freeBytes(bytes);

238 delete[] eSplit;

239 }

240

241 const char CharsetDetectionTest::getPath(char buffer[2048], const char filenam e) {

242 UErrorCode status = U_ZERO_ERROR;

243 const char *testDataDirectory = IntlTest::getSourceTestData(status);

244

245 if (U_FAILURE(status)) {

246 errln("ERROR: getPath() failed - %s", u_errorName(status));

247 return NULL;

248 }

249

250 strcpy(buffer, testDataDirectory);

251 strcat(buffer, filename);

252 return buffer;

253 }

254

255 void CharsetDetectionTest::ConstructionTest()

256 {

257 IcuTestErrorCode status(*this, "ConstructionTest");

258 LocalUCharsetDetectorPointer csd(ucsdet_open(status));

259 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), s tatus));

260 int32_t count = uenum_count(e.getAlias(), status);

261

262 #ifdef DEBUG_DETECT

263 printf("There are %d recognizers.\n", count);

264 #endif

265

266 for(int32_t i = 0; i < count; i += 1) {

267 int32_t length;

268 const char *name = uenum_next(e.getAlias(), &length, status);

269

270 if(name == NULL \|\| length <= 0) {

271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty na me!");

272 }

273

274 #ifdef DEBUG_DETECT

275 printf("%s\n", name);

276 #endif

277 }

278

279 const char* defDisabled[] = {

280 "IBM420_rtl", "IBM420_ltr",

281 "IBM424_rtl", "IBM424_ltr",

282 0

283 };

284

285 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias() , status));

286 const char *activeName = NULL;

287

288 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {

289 // the charset must be included in all list

290 UBool found = FALSE;

291

292 const char *name = NULL;

293 uenum_reset(e.getAlias(), status);

294 while ((name = uenum_next(e.getAlias(), NULL, status))) {

295 if (strcmp(activeName, name) == 0) {

296 found = TRUE;

297 break;

298 }

299 }

300

301 if (!found) {

302 errln(UnicodeString(activeName) + " is not included in the all chars et list.");

303 }

304

305 // some charsets are disabled by default

306 found = FALSE;

307 for (int32_t i = 0; defDisabled[i] != 0; i++) {

308 if (strcmp(activeName, defDisabled[i]) == 0) {

309 found = TRUE;

310 break;

311 }

312 }

313 if (found) {

314 errln(UnicodeString(activeName) + " should not be included in the de fault charset list.");

315 }

316 }

317 }

318

319 void CharsetDetectionTest::UTF8Test()

320 {

321 UErrorCode status = U_ZERO_ERROR;

322 UnicodeString ss = "This is a string with some non-ascii characters that wil l "

323 "be converted to UTF-8, then shoved through the detection process. "

324 "\\u0391\\u0392\\u0393\\u0394\\u0395"

325 "Sure would be nice if our source could contain Unicode d irectly!";

326 UnicodeString s = ss.unescape();

327 int32_t byteLength = 0, sLength = s.length();

328 char *bytes = extractBytes(s, "UTF-8", byteLength);

329 UCharsetDetector *csd = ucsdet_open(&status);

330 const UCharsetMatch *match;

331 UChar *detected = NEW_ARRAY(UChar, sLength);

332

333 ucsdet_setText(csd, bytes, byteLength, &status);

334 match = ucsdet_detect(csd, &status);

335

336 if (match == NULL) {

337 errln("Detection failure for UTF-8: got no matches.");

338 goto bail;

339 }

340

341 ucsdet_getUChars(match, detected, sLength, &status);

342

343 if (s.compare(detected, sLength) != 0) {

344 errln("Round-trip test failed!");

345 }

346

347 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

348

349 bail:

350 DELETE_ARRAY(detected);

351 freeBytes(bytes);

352 ucsdet_close(csd);

353 }

354

355 void CharsetDetectionTest::UTF16Test()

356 {

357 UErrorCode status = U_ZERO_ERROR;

358 /* Notice the BOM on the start of this string */

359 UChar chars[] = {

360 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,

361 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,

362 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,

363 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,

364 0x064a, 0x062a, 0x0000};

365 UnicodeString s(chars);

366 int32_t beLength = 0, leLength = 0;

367 char *beBytes = extractBytes(s, "UTF-16BE", beLength);

368 char *leBytes = extractBytes(s, "UTF-16LE", leLength);

369 UCharsetDetector *csd = ucsdet_open(&status);

370 const UCharsetMatch *match;

371 const char *name;

372 int32_t conf;

373

374 ucsdet_setText(csd, beBytes, beLength, &status);

375 match = ucsdet_detect(csd, &status);

376

377 if (match == NULL) {

378 errln("Encoding detection failure for UTF-16BE: got no matches.");

379 goto try_le;

380 }

381

382 name = ucsdet_getName(match, &status);

383 conf = ucsdet_getConfidence(match, &status);

384

385 if (strcmp(name, "UTF-16BE") != 0) {

386 errln("Encoding detection failure for UTF-16BE: got %s", name);

387 goto try_le; // no point in looking at confidence if we got the wrong ch aracter set.

388 }

389

390 if (conf != 100) {

391 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);

392 }

393

394 try_le:

395 ucsdet_setText(csd, leBytes, leLength, &status);

396 match = ucsdet_detect(csd, &status);

397

398 if (match == NULL) {

399 errln("Encoding detection failure for UTF-16LE: got no matches.");

400 goto bail;

401 }

402

403 name = ucsdet_getName(match, &status);

404 conf = ucsdet_getConfidence(match, &status);

405

406

407 if (strcmp(name, "UTF-16LE") != 0) {

408 errln("Enconding detection failure for UTF-16LE: got %s", name);

409 goto bail; // no point in looking at confidence if we got the wrong char acter set.

410 }

411

412 if (conf != 100) {

413 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);

414 }

415

416 bail:

417 freeBytes(leBytes);

418 freeBytes(beBytes);

419 ucsdet_close(csd);

420 }

421

422 void CharsetDetectionTest::InputFilterTest()

423 {

424 UErrorCode status = U_ZERO_ERROR;

425 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\ u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";

426 UnicodeString s = ss.unescape();

427 int32_t byteLength = 0;

428 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);

429 UCharsetDetector *csd = ucsdet_open(&status);

430 const UCharsetMatch *match;

431 const char lang, name;

432

433 ucsdet_enableInputFilter(csd, TRUE);

434

435 if (!ucsdet_isInputFilterEnabled(csd)) {

436 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!" );

437 }

438

439

440 ucsdet_setText(csd, bytes, byteLength, &status);

441 match = ucsdet_detect(csd, &status);

442

443 if (match == NULL) {

444 errln("Turning on the input filter resulted in no matches.");

445 goto turn_off;

446 }

447

448 name = ucsdet_getName(match, &status);

449

450 if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {

451 errln("Turning on the input filter resulted in %s rather than ISO-8859-1 .", name);

452 } else {

453 lang = ucsdet_getLanguage(match, &status);

454

455 if (lang == NULL \|\| strcmp(lang, "fr") != 0) {

456 errln("Input filter did not strip markup!");

457 }

458 }

459

460 turn_off:

461 ucsdet_enableInputFilter(csd, FALSE);

462 ucsdet_setText(csd, bytes, byteLength, &status);

463 match = ucsdet_detect(csd, &status);

464

465 if (match == NULL) {

466 errln("Turning off the input filter resulted in no matches.");

467 goto bail;

468 }

469

470 name = ucsdet_getName(match, &status);

471

472 if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {

473 errln("Turning off the input filter resulted in %s rather than ISO-8859- 1.", name);

474 } else {

475 lang = ucsdet_getLanguage(match, &status);

476

477 if (lang == NULL \|\| strcmp(lang, "en") != 0) {

478 errln("Unfiltered input did not detect as English!");

479 }

480 }

481

482 bail:

483 freeBytes(bytes);

484 ucsdet_close(csd);

485 }

486

487 void CharsetDetectionTest::C1BytesTest()

488 {

489 #if !UCONFIG_NO_LEGACY_CONVERSION

490 UErrorCode status = U_ZERO_ERROR;

491 UnicodeString sISO = "This is a small sample of some English text. Just enou gh to be sure that it detects correctly.";

492 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC 1\\u201D bytes.", -1, US_INV);

493 UnicodeString sWindows = ssWindows.unescape();

494 int32_t lISO = 0, lWindows = 0;

495 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);

496 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);

497 UCharsetDetector *csd = ucsdet_open(&status);

498 const UCharsetMatch *match;

499 const char *name;

500

501 ucsdet_setText(csd, bWindows, lWindows, &status);

502 match = ucsdet_detect(csd, &status);

503

504 if (match == NULL) {

505 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_ errorName(status));

506 goto bail;

507 }

508

509 name = ucsdet_getName(match, &status);

510

511 if (strcmp(name, "windows-1252") != 0) {

512 errln("English text with C1 bytes does not detect as windows-1252, but a s %s", name);

513 }

514

515 ucsdet_setText(csd, bISO, lISO, &status);

516 match = ucsdet_detect(csd, &status);

517

518 if (match == NULL) {

519 errln("English text without C1 bytes got no matches.");

520 goto bail;

521 }

522

523 name = ucsdet_getName(match, &status);

524

525 if (strcmp(name, "ISO-8859-1") != 0) {

526 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);

527 }

528

529 bail:

530 freeBytes(bWindows);

531 freeBytes(bISO);

532

533 ucsdet_close(csd);

534 #endif

535 }

536

537 void CharsetDetectionTest::DetectionTest()

538 {

539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

540 UErrorCode status = U_ZERO_ERROR;

541 char path[2048];

542 const char *testFilePath = getPath(path, "csdetest.xml");

543

544 if (testFilePath == NULL) {

545 return; /* Couldn't get path: error message already output. */

546 }

547

548 UXMLParser *parser = UXMLParser::createParser(status);

549 if (U_FAILURE(status)) {

550 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));

551 return;

552 }

553

554 UXMLElement *root = parser->parseFile(testFilePath, status);

555 if (!assertSuccess( "parseFile",status)) return;

556

557 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");

558 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");

559 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");

560

561 const UXMLElement *testCase;

562 int32_t tc = 0;

563

564 while((testCase = root->nextChildElement(tc)) != NULL) {

565 if (testCase->getTagName().compare(test_case) == 0) {

566 const UnicodeString *id = testCase->getAttribute(id_attr);

567 const UnicodeString *encodings = testCase->getAttribute(enc_attr);

568 const UnicodeString text = testCase->getText(TRUE);

569 int32_t encodingCount;

570 UnicodeString encodingList = split(encodings, CH_SPACE, encodingCo unt);

571

572 for(int32_t e = 0; e < encodingCount; e += 1) {

573 checkEncoding(text, encodingList[e], *id);

574 }

575

576 delete[] encodingList;

577 }

578 }

579

580 delete root;

581 delete parser;

582 #endif

583 }

584

585 void CharsetDetectionTest::IBM424Test()

586 {

587 #if !UCONFIG_ONLY_HTML_CONVERSION

588 UErrorCode status = U_ZERO_ERROR;

589

590 static const UChar chars[] = {

591 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05 D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,

592 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05 D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,

593 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05 DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,

594 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05 D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,

595 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05 E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,

596 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05 D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,

597 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05 E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,

598 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05 EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,

599 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x00 22, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,

600 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05 D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,

601 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05 D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,

602 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x00 20, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,

603 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x00 20, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,

604 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05 D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,

605 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05 DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,

606 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x00 20, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,

607 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05 D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000

608 };

609

610 static const UChar chars_reverse[] = {

611 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05 DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,

612 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05 E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,

613 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05 D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,

614 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05 E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,

615 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05 DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,

616 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05 D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,

617 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05 D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,

618 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05 DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,

619 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05 E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,

620 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05 E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,

621 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05 E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,

622 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05 DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,

623 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05 E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,

624 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05 D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,

625 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05 D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,

626 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x00 20, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,

627 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x00 20, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,

628 0x0000

629 };

630

631 int32_t bLength = 0, brLength = 0;

632

633 UnicodeString s1(chars);

634 UnicodeString s2(chars_reverse);

635

636 char *bytes = extractBytes(s1, "IBM424", bLength);

637 char *bytes_r = extractBytes(s2, "IBM424", brLength);

638

639 UCharsetDetector *csd = ucsdet_open(&status);

640 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);

641 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);

642 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);

643 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);

644 if (U_FAILURE(status)) {

645 errln("Error opening charset detector. - %s", u_errorName(status));

646 }

647 const UCharsetMatch *match;

648 const char *name;

649

650 ucsdet_setText(csd, bytes, bLength, &status);

651 match = ucsdet_detect(csd, &status);

652

653 if (match == NULL) {

654 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no ma tches. - %s", u_errorName(status));

655 goto bail;

656 }

657

658 name = ucsdet_getName(match, &status);

659 if (strcmp(name, "IBM424_rtl") != 0) {

660 errln("Encoding detection failure for IBM424_rtl: got %s", name);

661 }

662

663 ucsdet_setText(csd, bytes_r, brLength, &status);

664 match = ucsdet_detect(csd, &status);

665

666 if (match == NULL) {

667 errln("Encoding detection failure for IBM424_ltr: got no matches.");

668 goto bail;

669 }

670

671 name = ucsdet_getName(match, &status);

672 if (strcmp(name, "IBM424_ltr") != 0) {

673 errln("Encoding detection failure for IBM424_ltr: got %s", name);

674 }

675

676 bail:

677 freeBytes(bytes);

678 freeBytes(bytes_r);

679 ucsdet_close(csd);

680 #endif

681 }

682

683 void CharsetDetectionTest::IBM420Test()

684 {

685 #if !UCONFIG_ONLY_HTML_CONVERSION

686 UErrorCode status = U_ZERO_ERROR;

687

688 static const UChar chars[] = {

689 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,

690 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,

691 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,

692 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,

693 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,

694 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,

695 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,

696 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,

697 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,

698 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,

699 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,

700 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,

701 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,

702 0x0000

703 };

704 static const UChar chars_reverse[] = {

705 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,

706 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,

707 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,

708 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,

709 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,

710 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,

711 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,

712 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,

713 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,

714 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,

715 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,

716 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,

717 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,

718 0x0000,

719 };

720

721 int32_t bLength = 0, brLength = 0;

722

723 UnicodeString s1(chars);

724 UnicodeString s2(chars_reverse);

725

726 char *bytes = extractBytes(s1, "IBM420", bLength);

727 char *bytes_r = extractBytes(s2, "IBM420", brLength);

728

729 UCharsetDetector *csd = ucsdet_open(&status);

730 if (U_FAILURE(status)) {

731 errln("Error opening charset detector. - %s", u_errorName(status));

732 }

733 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);

734 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);

735 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);

736 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);

737 const UCharsetMatch *match;

738 const char *name;

739

740 ucsdet_setText(csd, bytes, bLength, &status);

741 match = ucsdet_detect(csd, &status);

742

743 if (match == NULL) {

744 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no ma tches. - %s", u_errorName(status));

745 goto bail;

746 }

747

748 name = ucsdet_getName(match, &status);

749 if (strcmp(name, "IBM420_rtl") != 0) {

750 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);

751 }

752

753 ucsdet_setText(csd, bytes_r, brLength, &status);

754 match = ucsdet_detect(csd, &status);

755

756 if (match == NULL) {

757 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");

758 goto bail;

759 }

760

761 name = ucsdet_getName(match, &status);

762 if (strcmp(name, "IBM420_ltr") != 0) {

763 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);

764 }

765

766 bail:

767 freeBytes(bytes);

768 freeBytes(bytes_r);

769 ucsdet_close(csd);

770 #endif

771 }

772

773

774 void CharsetDetectionTest::Ticket6394Test() {

775 #if !UCONFIG_NO_CONVERSION

776 const char charText[] = "Here is some random English text that should be de tected as ISO-8859-1."

777 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "

778 "encodings more than once. The hop through Unicode String is for platforms "

779 "where this char * string is be EBCDIC and needs co nversion to Latin1.";

780 char latin1Text[sizeof(charText)];

781 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(la tin1Text), "ISO-8859-1");

782

783 UErrorCode status = U_ZERO_ERROR;

784 UCharsetDetector *csd = ucsdet_open(&status);

785 ucsdet_setText(csd, latin1Text, -1, &status);

786 if (U_FAILURE(status)) {

787 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er rorName(status));

788 return;

789 }

790

791 int32_t matchCount = 0;

792 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);

793 if (U_FAILURE(status)) {

794 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_er rorName(status));

795 return;

796 }

797

798 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.

799 int32_t i;

800 for (i=0; i<matchCount; i++) {

801 UnicodeString charSetName(ucsdet_getName(matches[i], &status));

802 if (U_FAILURE(status)) {

803 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __L INE__, u_errorName(status), i);

804 status = U_ZERO_ERROR;

805 }

806 if (setOfCharsetNames.contains(charSetName)) {

807 errln("Fail at file %s, line %d ", __FILE__, __LINE__);

808 errln(UnicodeString(" Duplicate charset name = ") + charSetName);

809 }

810 setOfCharsetNames.add(charSetName);

811 }

812 ucsdet_close(csd);

813 #endif

814 }

815

816

817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish b etween

818 // similar Windows and non-Windows SBCS encodings. State was kept in the shared

819 // Charset Recognizer objects, and could be overwritten.

820 void CharsetDetectionTest::Ticket6954Test() {

821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING

822 UErrorCode status = U_ZERO_ERROR;

823 UnicodeString sISO = "This is a small sample of some English text. Just enou gh to be sure that it detects correctly.";

824 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."

825 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);

826 UnicodeString sWindows = ssWindows.unescape();

827 int32_t lISO = 0, lWindows = 0;

828 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);

829 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);

830

831 // First do a plain vanilla detect of 1252 text

832

833 UCharsetDetector *csd1 = ucsdet_open(&status);

834 ucsdet_setText(csd1, bWindows, lWindows, &status);

835 const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);

836 const char *name1 = ucsdet_getName(match1, &status);

837 TEST_ASSERT_SUCCESS(status);

838 TEST_ASSERT(strcmp(name1, "windows-1252")==0);

839

840 // Next, using a completely separate detector, detect some 8859-1 text

841

842 UCharsetDetector *csd2 = ucsdet_open(&status);

843 ucsdet_setText(csd2, bISO, lISO, &status);

844 const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);

845 const char *name2 = ucsdet_getName(match2, &status);

846 TEST_ASSERT_SUCCESS(status);

847 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);

848

849 // Recheck the 1252 results from the first detector, which should not have b een

850 // altered by the use of a different detector.

851

852 name1 = ucsdet_getName(match1, &status);

853 TEST_ASSERT_SUCCESS(status);

854 TEST_ASSERT(strcmp(name1, "windows-1252")==0);

855

856 ucsdet_close(csd1);

857 ucsdet_close(csd2);

858 freeBytes(bISO);

859 freeBytes(bWindows);

860 #endif

861 }

OLD	NEW

« no previous file with comments | « source/test/intltest/csdetest.h ('k') | source/test/intltest/currcoll.h » ('j') | no next file with comments »