icu46/source/test/thaitest/thaitest.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/test/thaitest/thaitest.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/test/thaitest/thaitest.cpp

===================================================================

--- icu46/source/test/thaitest/thaitest.cpp (revision 0)

+++ icu46/source/test/thaitest/thaitest.cpp (revision 0)

@@ -0,0 +1,571 @@

+/*

+ ******************************************************************************

+ */

+#include <errno.h>

+#include <stdio.h>

+#include <string.h>

+#include "unicode/utypes.h"

+#include "unicode/uchar.h"

+#include "unicode/uchriter.h"

+#include "unicode/brkiter.h"

+#include "unicode/locid.h"

+#include "unicode/unistr.h"

+#include "unicode/uniset.h"

+#include "unicode/ustring.h"

+/*

+ * This program takes a Unicode text file containing Thai text with

+ * spaces inserted where the word breaks are. It computes a copy of

+ * the text without spaces and uses a word instance of a Thai BreakIterator

+ * to compute the word breaks. The program reports any differences in the

+ * breaks.

+ *

+ * NOTE: by it's very nature, Thai word breaking is not exact, so it is

+ * exptected that this program will always report some differences.

+ */

+/*

+ * This class is a break iterator that counts words and spaces.

+ */

+class SpaceBreakIterator

+public:

+ // The constructor:

+ // text - pointer to an array of UChars to iterate over

+ // count - the number of UChars in text

+ SpaceBreakIterator(const UChar *text, int32_t count);

+ // the destructor

+ ~SpaceBreakIterator();

+ // return next break position

+ int32_t next();

+ // return current word count

+ int32_t getWordCount();

+ // return current space count

+ int32_t getSpaceCount();

+private:

+ // No arg constructor: private so clients can't call it.

+ SpaceBreakIterator();

+ // The underlying BreakIterator

+ BreakIterator *fBreakIter;

+ // address of the UChar array

+ const UChar *fText;

+ // number of UChars in fText

+ int32_t fTextCount;

+ // current word count

+ int32_t fWordCount;

+ // current space count

+ int32_t fSpaceCount;

+ // UnicodeSet of SA characters

+ UnicodeSet fComplexContext;

+ // true when fBreakIter has returned DONE

+ UBool fDone;

+};

+/*

+ * This is the main class. It compares word breaks and reports the differences.

+ */

+class ThaiWordbreakTest

+public:

+ // The main constructor:

+ // spaces - pointer to a UChar array for the text with spaces

+ // spaceCount - the number of characters in the spaces array

+ // noSpaces - pointer to a UChar array for the text without spaces

+ // noSpaceCount - the number of characters in the noSpaces array

+ // verbose - report all breaks if true, otherwise just report differences

+ ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);

+ ~ThaiWordbreakTest();

+ // returns the number of breaks that are in the spaces array

+ // but aren't found in the noSpaces array

+ int32_t getBreaksNotFound();

+ // returns the number of breaks which are found in the noSpaces

+ // array but aren't in the spaces array

+ int32_t getInvalidBreaks();

+ // returns the number of words found in the spaces array

+ int32_t getWordCount();

+ // reads the input Unicode text file:

+ // fileName - the path name of the file

+ // charCount - set to the number of UChars read from the file

+ // returns - the address of the UChar array containing the characters

+ static const UChar *readFile(char *fileName, int32_t &charCount);

+ // removes spaces form the input UChar array:

+ // spaces - pointer to the input UChar array

+ // count - number of UChars in the spaces array

+ // nonSpaceCount - the number of UChars in the result array

+ // returns - the address of the UChar array with spaces removed

+ static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);

+private:

+ // The no arg constructor - private so clients can't call it

+ ThaiWordbreakTest();

+ // This does the actual comparison:

+ // spaces - the address of the UChar array for the text with spaces

+ // spaceCount - the number of UChars in the spaces array

+ // noSpaces - the address of the UChar array for the text without spaces

+ // noSpaceCount - the number of UChars in the noSpaces array

+ // returns - true if all breaks match, FALSE otherwise

+ UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,

+ const UChar *noSpaces, int32_t noSpaceCount);

+ // helper method to report a break in the spaces

+ // array that's not found in the noSpaces array

+ void breakNotFound(int32_t br);

+ // helper method to report a break that's found in

+ // the noSpaces array that's not in the spaces array

+ void foundInvalidBreak(int32_t br);

+ // count of breaks in the spaces array that

+ // aren't found in the noSpaces array

+ int32_t fBreaksNotFound;

+ // count of breaks found in the noSpaces array

+ // that aren't in the spaces array

+ int32_t fInvalidBreaks;

+ // number of words found in the spaces array

+ int32_t fWordCount;

+ // report all breaks if true, otherwise just report differences

+ UBool fVerbose;

+};

+/*

+ * The main constructor: it calls compareWordBreaks and reports any differences

+ */

+ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,

+ const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)

+: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)

+ compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);

+/*

+ * The no arg constructor

+ */

+ThaiWordbreakTest::ThaiWordbreakTest()

+ // nothing

+/*

+ * The destructor

+ */

+ThaiWordbreakTest::~ThaiWordbreakTest()

+ // nothing?

+/*

+ * returns the number of breaks in the spaces array

+ * that aren't found in the noSpaces array

+ */

+inline int32_t ThaiWordbreakTest::getBreaksNotFound()

+ return fBreaksNotFound;

+/*

+ * Returns the number of breaks found in the noSpaces

+ * array that aren't in the spaces array

+ */

+inline int32_t ThaiWordbreakTest::getInvalidBreaks()

+ return fInvalidBreaks;

+/*

+ * Returns the number of words found in the spaces array

+ */

+inline int32_t ThaiWordbreakTest::getWordCount()

+ return fWordCount;

+/*

+ * This method does the acutal break comparison and reports the results.

+ * It uses a SpaceBreakIterator to iterate over the text with spaces,

+ * and a word instance of a Thai BreakIterator to iterate over the text

+ * without spaces.

+ */

+UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,

+ const UChar *noSpaces, int32_t noSpaceCount)

+ UBool result = TRUE;

+ Locale thai("th");

+ UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);

+ UErrorCode status = U_ZERO_ERROR;

+ BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

+ breakIter->adoptText(noSpaceIter);

+ SpaceBreakIterator spaceIter(spaces, spaceCount);

+ int32_t nextBreak = 0;

+ int32_t nextSpaceBreak = 0;

+ int32_t iterCount = 0;

+ while (TRUE) {

+ nextSpaceBreak = spaceIter.next();

+ nextBreak = breakIter->next();

+ if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {

+ if (nextBreak != BreakIterator::DONE) {

+ fprintf(stderr, "break iterator didn't end.\n");

+ } else if (nextSpaceBreak != BreakIterator::DONE) {

+ fprintf(stderr, "premature break iterator end.\n");

+ }

+ break;

+ }

+ while (nextSpaceBreak != nextBreak &&

+ nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {

+ if (nextSpaceBreak < nextBreak) {

+ breakNotFound(nextSpaceBreak);

+ result = FALSE;

+ nextSpaceBreak = spaceIter.next();

+ } else if (nextSpaceBreak > nextBreak) {

+ foundInvalidBreak(nextBreak);

+ result = FALSE;

+ nextBreak = breakIter->next();

+ }

+ if (fVerbose) {

+ printf("%d %d\n", nextSpaceBreak, nextBreak);

+ }

+ fWordCount = spaceIter.getWordCount();

+ delete breakIter;

+ return result;

+/*

+ * Report a break that's in the text with spaces but

+ * not found in the text without spaces.

+ */

+void ThaiWordbreakTest::breakNotFound(int32_t br)

+ if (fVerbose) {

+ printf("%d ****\n", br);

+ } else {

+ fprintf(stderr, "break not found: %d\n", br);

+ }

+ fBreaksNotFound += 1;

+/*

+ * Report a break that's found in the text without spaces

+ * that isn't in the text with spaces.

+ */

+void ThaiWordbreakTest::foundInvalidBreak(int32_t br)

+ if (fVerbose) {

+ printf("**** %d\n", br);

+ } else {

+ fprintf(stderr, "found invalid break: %d\n", br);

+ }

+ fInvalidBreaks += 1;

+/*

+ * Read the text from a file. The text must start with a Unicode Byte

+ * Order Mark (BOM) so that we know what order to read the bytes in.

+ */

+const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)

+ FILE *f;

+ int32_t fileSize;

+ UChar *buffer;

+ char *bufferChars;

+ f = fopen(fileName, "rb");

+ if( f == NULL ) {

+ fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));

+ return 0;

+ }

+ fseek(f, 0, SEEK_END);

+ fileSize = ftell(f);

+ fseek(f, 0, SEEK_SET);

+ bufferChars = new char[fileSize];

+ if(bufferChars == 0) {

+ fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));

+ fclose(f);

+ return 0;

+ }

+ fread(bufferChars, sizeof(char), fileSize, f);

+ if( ferror(f) ) {

+ fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));

+ fclose(f);

+ delete[] bufferChars;

+ return 0;

+ }

+ fclose(f);

+ UnicodeString myText(bufferChars, fileSize, "UTF-8");

+ delete[] bufferChars;

+ charCount = myText.length();

+ buffer = new UChar[charCount];

+ if(buffer == 0) {

+ fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));

+ return 0;

+ }

+ myText.extract(1, myText.length(), buffer);

+ charCount--; // skip the BOM

+ buffer[charCount] = 0; // NULL terminate for easier reading in the debugger

+ return buffer;

+/*

+ * Remove spaces from the input UChar array.

+ *

+ * We check explicitly for a Unicode code value of 0x0020

+ * because Unicode::isSpaceChar returns true for CR, LF, etc.

+ *

+ */

+const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)

+ int32_t i, out, spaceCount;

+ spaceCount = 0;

+ for (i = 0; i < count; i += 1) {

+ if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {

+ spaceCount += 1;

+ }

+ nonSpaceCount = count - spaceCount;

+ UChar *noSpaces = new UChar[nonSpaceCount];

+ if (noSpaces == 0) {

+ fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");

+ return 0;

+ }

+ for (out = 0, i = 0; i < count; i += 1) {

+ if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {

+ noSpaces[out++] = spaces[i];

+ }

+ return noSpaces;

+/*

+ * Generate a text file with spaces in it from a file without.

+ */

+int generateFile(const UChar *chars, int32_t length) {

+ Locale root("");

+ UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);

+ UErrorCode status = U_ZERO_ERROR;

+ UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);

+ BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);

+ breakIter->adoptText(noSpaceIter);

+ char outbuf[1024];

+ int32_t strlength;

+ UChar bom = 0xFEFF;

+ printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));

+ int32_t prevbreak = 0;

+ while (U_SUCCESS(status)) {

+ int32_t nextbreak = breakIter->next();

+ if (nextbreak == BreakIterator::DONE) {

+ break;

+ }

+ printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],

+ nextbreak-prevbreak, &status));

+ if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])

+ && complexContext.contains(chars[nextbreak])) {

+ printf(" ");

+ }

+ prevbreak = nextbreak;

+ }

+ if (U_FAILURE(status)) {

+ fprintf(stderr, "generate failed: %s\n", u_errorName(status));

+ return status;

+ }

+ else {

+ return 0;

+ }

+/*

+ * The main routine. Read the command line arguments, read the text file,

+ * remove the spaces, do the comparison and report the final results

+ */

+int main(int argc, char **argv)

+ char *fileName = "space.txt";

+ int arg = 1;

+ UBool verbose = FALSE;

+ UBool generate = FALSE;

+ if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {

+ generate = TRUE;

+ arg += 1;

+ }

+ if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {

+ verbose = TRUE;

+ arg += 1;

+ }

+ if (arg == argc - 1) {

+ fileName = argv[arg++];

+ }

+ if (arg != argc) {

+ fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);

+ return 1;

+ }

+ int32_t spaceCount, nonSpaceCount;

+ const UChar *spaces, *noSpaces;

+ spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

+ if (spaces == 0) {

+ return 1;

+ }

+ if (generate) {

+ return generateFile(spaces, spaceCount);

+ }

+ noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);

+ if (noSpaces == 0) {

+ return 1;

+ }

+ ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);

+ printf("word count: %d\n", test.getWordCount());

+ printf("breaks not found: %d\n", test.getBreaksNotFound());

+ printf("invalid breaks found: %d\n", test.getInvalidBreaks());

+ return 0;

+/*

+ * The main constructor. Clear all the counts and construct a default

+ * word instance of a BreakIterator.

+ */

+SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)

+ : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)

+ UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);

+ UErrorCode status = U_ZERO_ERROR;

+ fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);

+ Locale root("");

+ fBreakIter = BreakIterator::createWordInstance(root, status);

+ fBreakIter->adoptText(iter);

+SpaceBreakIterator::SpaceBreakIterator()

+ // nothing

+/*

+ * The destructor. delete the underlying BreakIterator

+ */

+SpaceBreakIterator::~SpaceBreakIterator()

+ delete fBreakIter;

+/*

+ * Return the next break, counting words and spaces.

+ */

+int32_t SpaceBreakIterator::next()

+ if (fDone) {

+ return BreakIterator::DONE;

+ }

+ int32_t nextBreak;

+ do {

+ nextBreak = fBreakIter->next();

+ if (nextBreak == BreakIterator::DONE) {

+ fDone = TRUE;

+ return BreakIterator::DONE;

+ }

+ while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])

+ && fComplexContext.contains(fText[nextBreak]));

+ int32_t result = nextBreak - fSpaceCount;

+ if (nextBreak < fTextCount) {

+ if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {

+ fSpaceCount += fBreakIter->next() - nextBreak;

+ }

+ fWordCount += 1;

+ return result;

+/*

+ * Returns the current space count

+ */

+int32_t SpaceBreakIterator::getSpaceCount()

+ return fSpaceCount;

+/*

+ * Returns the current word count

+ */

+int32_t SpaceBreakIterator::getWordCount()

+ return fWordCount;

Property changes on: icu46/source/test/thaitest/thaitest.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/test/thaitest/space.txt ('k') | icu46/source/test/thaitest/thaitest.dsp » ('j') | no next file with comments »