icu46/source/test/thaitest/thaitest.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/thaitest/thaitest.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *

	4 * and others. All Rights Reserved. *

	5 ******************************************************************************

	6 */

	7

	8 #include <errno.h>

	9 #include <stdio.h>

	10 #include <string.h>

	11

	12 #include "unicode/utypes.h"

	13 #include "unicode/uchar.h"

	14 #include "unicode/uchriter.h"

	15 #include "unicode/brkiter.h"

	16 #include "unicode/locid.h"

	17 #include "unicode/unistr.h"

	18 #include "unicode/uniset.h"

	19 #include "unicode/ustring.h"

	20

	21 /*

	22 * This program takes a Unicode text file containing Thai text with

	23 * spaces inserted where the word breaks are. It computes a copy of

	24 * the text without spaces and uses a word instance of a Thai BreakIterator

	25 * to compute the word breaks. The program reports any differences in the

	26 * breaks.

	27 *

	28 * NOTE: by its very nature, Thai word breaking is not exact, so it is

	29 * expected that this program will always report some differences.

	30 */

	31

	32 /*

	33 * This class is a break iterator that counts words and spaces.

	34 */

	35 class SpaceBreakIterator

	36 {

	37 public:

	38 // The constructor:

	39 // text - pointer to an array of UChars to iterate over

	40 // count - the number of UChars in text

	41 SpaceBreakIterator(const UChar *text, int32_t count);

	42

	43 // the destructor

	44 ~SpaceBreakIterator();

	45

	46 // return next break position

	47 int32_t next();

	48

	49 // return current word count

	50 int32_t getWordCount();

	51

	52 // return current space count

	53 int32_t getSpaceCount();

	54

	55 private:

	56 // No arg constructor: private so clients can't call it.

	57 SpaceBreakIterator();

	58

	59 // The underlying BreakIterator

	60 BreakIterator *fBreakIter;

	61

	62 // address of the UChar array

	63 const UChar *fText;

	64

	65 // number of UChars in fText

	66 int32_t fTextCount;

	67

	68 // current word count

	69 int32_t fWordCount;

	70

	71 // current space count

	72 int32_t fSpaceCount;

	73

	74 // UnicodeSet of SA characters

	75 UnicodeSet fComplexContext;

	76

	77 // true when fBreakIter has returned DONE

	78 UBool fDone;

	79 };

	80

	81 /*

	82 * This is the main class. It compares word breaks and reports the differences.

	83 */

	84 class ThaiWordbreakTest

	85 {

	86 public:

	87 // The main constructor:

	88 // spaces - pointer to a UChar array for the text with spaces

	89 // spaceCount - the number of characters in the spaces array

	90 // noSpaces - pointer to a UChar array for the text without spaces

	91 // noSpaceCount - the number of characters in the noSpaces array

	92 // verbose - report all breaks if true, otherwise just report differenc es

	93 ThaiWordbreakTest(const UChar spaces, int32_t spaceCount, const UChar noSp aces, int32_t noSpaceCount, UBool verbose);

	94 ~ThaiWordbreakTest();

	95

	96 // returns the number of breaks that are in the spaces array

	97 // but aren't found in the noSpaces array

	98 int32_t getBreaksNotFound();

	99

	100 // returns the number of breaks which are found in the noSpaces

	101 // array but aren't in the spaces array

	102 int32_t getInvalidBreaks();

	103

	104 // returns the number of words found in the spaces array

	105 int32_t getWordCount();

	106

	107 // reads the input Unicode text file:

	108 // fileName - the path name of the file

	109 // charCount - set to the number of UChars read from the file

	110 // returns - the address of the UChar array containing the characters

	111 static const UChar readFile(char fileName, int32_t &charCount);

	112

	113 // removes spaces form the input UChar array:

	114 // spaces - pointer to the input UChar array

	115 // count - number of UChars in the spaces array

	116 // nonSpaceCount - the number of UChars in the result array

	117 // returns - the address of the UChar array with spaces removed

	118 static const UChar crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount);

	119

	120 private:

	121 // The no arg constructor - private so clients can't call it

	122 ThaiWordbreakTest();

	123

	124 // This does the actual comparison:

	125 // spaces - the address of the UChar array for the text with spaces

	126 // spaceCount - the number of UChars in the spaces array

	127 // noSpaces - the address of the UChar array for the text without spaces

	128 // noSpaceCount - the number of UChars in the noSpaces array

	129 // returns - true if all breaks match, FALSE otherwise

	130 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,

	131 const UChar *noSpaces, int32_t noSpaceCount);

	132

	133 // helper method to report a break in the spaces

	134 // array that's not found in the noSpaces array

	135 void breakNotFound(int32_t br);

	136

	137 // helper method to report a break that's found in

	138 // the noSpaces array that's not in the spaces array

	139 void foundInvalidBreak(int32_t br);

	140

	141 // count of breaks in the spaces array that

	142 // aren't found in the noSpaces array

	143 int32_t fBreaksNotFound;

	144

	145 // count of breaks found in the noSpaces array

	146 // that aren't in the spaces array

	147 int32_t fInvalidBreaks;

	148

	149 // number of words found in the spaces array

	150 int32_t fWordCount;

	151

	152 // report all breaks if true, otherwise just report differences

	153 UBool fVerbose;

	154 };

	155

	156 /*

	157 * The main constructor: it calls compareWordBreaks and reports any differences

	158 */

	159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,

	160 const UChar *noSpaces, int32_t noSpaceCount , UBool verbose)

	161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)

	162 {

	163 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);

	164 }

	165

	166 /*

	167 * The no arg constructor

	168 */

	169 ThaiWordbreakTest::ThaiWordbreakTest()

	170 {

	171 // nothing

	172 }

	173

	174 /*

	175 * The destructor

	176 */

	177 ThaiWordbreakTest::~ThaiWordbreakTest()

	178 {

	179 // nothing?

	180 }

	181

	182 /*

	183 * returns the number of breaks in the spaces array

	184 * that aren't found in the noSpaces array

	185 */

	186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()

	187 {

	188 return fBreaksNotFound;

	189 }

	190

	191 /*

	192 * Returns the number of breaks found in the noSpaces

	193 * array that aren't in the spaces array

	194 */

	195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()

	196 {

	197 return fInvalidBreaks;

	198 }

	199

	200 /*

	201 * Returns the number of words found in the spaces array

	202 */

	203 inline int32_t ThaiWordbreakTest::getWordCount()

	204 {

	205 return fWordCount;

	206 }

	207

	208 /*

	209 * This method does the acutal break comparison and reports the results.

	210 * It uses a SpaceBreakIterator to iterate over the text with spaces,

	211 * and a word instance of a Thai BreakIterator to iterate over the text

	212 * without spaces.

	213 */

	214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCou nt,

	215 const UChar *noSpaces, int32_t noSpac eCount)

	216 {

	217 UBool result = TRUE;

	218 Locale thai("th");

	219 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, n oSpaceCount);

	220 UErrorCode status = U_ZERO_ERROR;

	221

	222 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

	223 breakIter->adoptText(noSpaceIter);

	224

	225 SpaceBreakIterator spaceIter(spaces, spaceCount);

	226

	227 int32_t nextBreak = 0;

	228 int32_t nextSpaceBreak = 0;

	229 int32_t iterCount = 0;

	230

	231 while (TRUE) {

	232 nextSpaceBreak = spaceIter.next();

	233 nextBreak = breakIter->next();

	234

	235 if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator: :DONE) {

	236 if (nextBreak != BreakIterator::DONE) {

	237 fprintf(stderr, "break iterator didn't end.\n");

	238 } else if (nextSpaceBreak != BreakIterator::DONE) {

	239 fprintf(stderr, "premature break iterator end.\n");

	240 }

	241

	242 break;

	243 }

	244

	245 while (nextSpaceBreak != nextBreak &&

	246 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterat or::DONE) {

	247 if (nextSpaceBreak < nextBreak) {

	248 breakNotFound(nextSpaceBreak);

	249 result = FALSE;

	250 nextSpaceBreak = spaceIter.next();

	251 } else if (nextSpaceBreak > nextBreak) {

	252 foundInvalidBreak(nextBreak);

	253 result = FALSE;

	254 nextBreak = breakIter->next();

	255 }

	256 }

	257

	258 if (fVerbose) {

	259 printf("%d %d\n", nextSpaceBreak, nextBreak);

	260 }

	261 }

	262

	263

	264 fWordCount = spaceIter.getWordCount();

	265

	266 delete breakIter;

	267

	268 return result;

	269 }

	270

	271 /*

	272 * Report a break that's in the text with spaces but

	273 * not found in the text without spaces.

	274 */

	275 void ThaiWordbreakTest::breakNotFound(int32_t br)

	276 {

	277 if (fVerbose) {

	278 printf("%d ****\n", br);

	279 } else {

	280 fprintf(stderr, "break not found: %d\n", br);

	281 }

	282

	283 fBreaksNotFound += 1;

	284 }

	285

	286 /*

	287 * Report a break that's found in the text without spaces

	288 * that isn't in the text with spaces.

	289 */

	290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)

	291 {

	292 if (fVerbose) {

	293 printf("**** %d\n", br);

	294 } else {

	295 fprintf(stderr, "found invalid break: %d\n", br);

	296 }

	297

	298 fInvalidBreaks += 1;

	299 }

	300

	301 /*

	302 * Read the text from a file. The text must start with a Unicode Byte

	303 * Order Mark (BOM) so that we know what order to read the bytes in.

	304 */

	305 const UChar ThaiWordbreakTest::readFile(char fileName, int32_t &charCount)

	306 {

	307 FILE *f;

	308 int32_t fileSize;

	309

	310 UChar *buffer;

	311 char *bufferChars;

	312

	313 f = fopen(fileName, "rb");

	314

	315 if( f == NULL ) {

	316 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errn o));

	317 return 0;

	318 }

	319

	320 fseek(f, 0, SEEK_END);

	321 fileSize = ftell(f);

	322

	323 fseek(f, 0, SEEK_SET);

	324 bufferChars = new char[fileSize];

	325

	326 if(bufferChars == 0) {

	327 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN ame, strerror(errno));

	328 fclose(f);

	329 return 0;

	330 }

	331

	332 fread(bufferChars, sizeof(char), fileSize, f);

	333 if( ferror(f) ) {

	334 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errn o));

	335 fclose(f);

	336 delete[] bufferChars;

	337 return 0;

	338 }

	339 fclose(f);

	340

	341 UnicodeString myText(bufferChars, fileSize, "UTF-8");

	342

	343 delete[] bufferChars;

	344

	345 charCount = myText.length();

	346 buffer = new UChar[charCount];

	347 if(buffer == 0) {

	348 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN ame, strerror(errno));

	349 return 0;

	350 }

	351

	352 myText.extract(1, myText.length(), buffer);

	353 charCount--; // skip the BOM

	354 buffer[charCount] = 0; // NULL terminate for easier reading in the debugg er

	355

	356 return buffer;

	357 }

	358

	359 /*

	360 * Remove spaces from the input UChar array.

	361 *

	362 * We check explicitly for a Unicode code value of 0x0020

	363 * because Unicode::isSpaceChar returns true for CR, LF, etc.

	364 *

	365 */

	366 const UChar ThaiWordbreakTest::crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount)

	367 {

	368 int32_t i, out, spaceCount;

	369

	370 spaceCount = 0;

	371 for (i = 0; i < count; i += 1) {

	372 if (spaces[i] == 0x0020 /Unicode::isSpaceChar(spaces[i])/) {

	373 spaceCount += 1;

	374 }

	375 }

	376

	377 nonSpaceCount = count - spaceCount;

	378 UChar *noSpaces = new UChar[nonSpaceCount];

	379

	380 if (noSpaces == 0) {

	381 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n ");

	382 return 0;

	383 }

	384

	385 for (out = 0, i = 0; i < count; i += 1) {

	386 if (spaces[i] != 0x0020 /! Unicode::isSpaceChar(spaces[i])/) {

	387 noSpaces[out++] = spaces[i];

	388 }

	389 }

	390

	391 return noSpaces;

	392 }

	393

	394 /*

	395 * Generate a text file with spaces in it from a file without.

	396 */

	397 int generateFile(const UChar *chars, int32_t length) {

	398 Locale root("");

	399 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, leng th);

	400 UErrorCode status = U_ZERO_ERROR;

	401

	402 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status) ;

	403 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);

	404 breakIter->adoptText(noSpaceIter);

	405 char outbuf[1024];

	406 int32_t strlength;

	407 UChar bom = 0xFEFF;

	408

	409 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &statu s));

	410 int32_t prevbreak = 0;

	411 while (U_SUCCESS(status)) {

	412 int32_t nextbreak = breakIter->next();

	413 if (nextbreak == BreakIterator::DONE) {

	414 break;

	415 }

	416 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prev break],

	417 nextbreak-prevbreak, &status));

	418 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])

	419 && complexContext.contains(chars[nextbreak])) {

	420 printf(" ");

	421 }

	422 prevbreak = nextbreak;

	423 }

	424

	425 if (U_FAILURE(status)) {

	426 fprintf(stderr, "generate failed: %s\n", u_errorName(status));

	427 return status;

	428 }

	429 else {

	430 return 0;

	431 }

	432 }

	433

	434 /*

	435 * The main routine. Read the command line arguments, read the text file,

	436 * remove the spaces, do the comparison and report the final results

	437 */

	438 int main(int argc, char **argv)

	439 {

	440 char *fileName = "space.txt";

	441 int arg = 1;

	442 UBool verbose = FALSE;

	443 UBool generate = FALSE;

	444

	445 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {

	446 generate = TRUE;

	447 arg += 1;

	448 }

	449

	450 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {

	451 verbose = TRUE;

	452 arg += 1;

	453 }

	454

	455 if (arg == argc - 1) {

	456 fileName = argv[arg++];

	457 }

	458

	459 if (arg != argc) {

	460 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);

	461 return 1;

	462 }

	463

	464 int32_t spaceCount, nonSpaceCount;

	465 const UChar spaces, noSpaces;

	466

	467 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

	468

	469 if (spaces == 0) {

	470 return 1;

	471 }

	472

	473 if (generate) {

	474 return generateFile(spaces, spaceCount);

	475 }

	476

	477 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount );

	478

	479 if (noSpaces == 0) {

	480 return 1;

	481 }

	482

	483 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose) ;

	484

	485 printf("word count: %d\n", test.getWordCount());

	486 printf("breaks not found: %d\n", test.getBreaksNotFound());

	487 printf("invalid breaks found: %d\n", test.getInvalidBreaks());

	488

	489 return 0;

	490 }

	491

	492 /*

	493 * The main constructor. Clear all the counts and construct a default

	494 * word instance of a BreakIterator.

	495 */

	496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)

	497 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0) , fDone(FALSE)

	498 {

	499 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);

	500 UErrorCode status = U_ZERO_ERROR;

	501 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), stat us);

	502 Locale root("");

	503

	504 fBreakIter = BreakIterator::createWordInstance(root, status);

	505 fBreakIter->adoptText(iter);

	506 }

	507

	508 SpaceBreakIterator::SpaceBreakIterator()

	509 {

	510 // nothing

	511 }

	512

	513 /*

	514 * The destructor. delete the underlying BreakIterator

	515 */

	516 SpaceBreakIterator::~SpaceBreakIterator()

	517 {

	518 delete fBreakIter;

	519 }

	520

	521 /*

	522 * Return the next break, counting words and spaces.

	523 */

	524 int32_t SpaceBreakIterator::next()

	525 {

	526 if (fDone) {

	527 return BreakIterator::DONE;

	528 }

	529

	530 int32_t nextBreak;

	531 do {

	532 nextBreak = fBreakIter->next();

	533

	534 if (nextBreak == BreakIterator::DONE) {

	535 fDone = TRUE;

	536 return BreakIterator::DONE;

	537 }

	538 }

	539 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])

	540 && fComplexContext.contains(fText[nextBreak]));

	541

	542 int32_t result = nextBreak - fSpaceCount;

	543

	544 if (nextBreak < fTextCount) {

	545 if (fText[nextBreak] == 0x0020 /Unicode::isSpaceChar(fText[nextBreak]) /) {

	546 fSpaceCount += fBreakIter->next() - nextBreak;

	547 }

	548 }

	549

	550 fWordCount += 1;

	551

	552 return result;

	553 }

	554

	555 /*

	556 * Returns the current space count

	557 */

	558 int32_t SpaceBreakIterator::getSpaceCount()

	559 {

	560 return fSpaceCount;

	561 }

	562

	563 /*

	564 * Returns the current word count

	565 */

	566 int32_t SpaceBreakIterator::getWordCount()

	567 {

	568 return fWordCount;

	569 }

	570

	571

OLD	NEW

« no previous file with comments | « icu46/source/test/thaitest/space.txt ('k') | icu46/source/test/thaitest/thaitest.dsp » ('j') | no next file with comments »