Index: source/test/thaitest/thaitest.cpp |
diff --git a/source/test/thaitest/thaitest.cpp b/source/test/thaitest/thaitest.cpp |
deleted file mode 100644 |
index 657fab433c09d5b18ee125b2dd5b1f7bb1689f99..0000000000000000000000000000000000000000 |
--- a/source/test/thaitest/thaitest.cpp |
+++ /dev/null |
@@ -1,571 +0,0 @@ |
-/* |
- ****************************************************************************** |
- * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * |
- * and others. All Rights Reserved. * |
- ****************************************************************************** |
- */ |
- |
-#include <errno.h> |
-#include <stdio.h> |
-#include <string.h> |
- |
-#include "unicode/utypes.h" |
-#include "unicode/uchar.h" |
-#include "unicode/uchriter.h" |
-#include "unicode/brkiter.h" |
-#include "unicode/locid.h" |
-#include "unicode/unistr.h" |
-#include "unicode/uniset.h" |
-#include "unicode/ustring.h" |
- |
-/* |
- * This program takes a Unicode text file containing Thai text with |
- * spaces inserted where the word breaks are. It computes a copy of |
- * the text without spaces and uses a word instance of a Thai BreakIterator |
- * to compute the word breaks. The program reports any differences in the |
- * breaks. |
- * |
- * NOTE: by it's very nature, Thai word breaking is not exact, so it is |
- * exptected that this program will always report some differences. |
- */ |
- |
-/* |
- * This class is a break iterator that counts words and spaces. |
- */ |
-class SpaceBreakIterator |
-{ |
-public: |
- // The constructor: |
- // text - pointer to an array of UChars to iterate over |
- // count - the number of UChars in text |
- SpaceBreakIterator(const UChar *text, int32_t count); |
- |
- // the destructor |
- ~SpaceBreakIterator(); |
- |
- // return next break position |
- int32_t next(); |
- |
- // return current word count |
- int32_t getWordCount(); |
- |
- // return current space count |
- int32_t getSpaceCount(); |
- |
-private: |
- // No arg constructor: private so clients can't call it. |
- SpaceBreakIterator(); |
- |
- // The underlying BreakIterator |
- BreakIterator *fBreakIter; |
- |
- // address of the UChar array |
- const UChar *fText; |
- |
- // number of UChars in fText |
- int32_t fTextCount; |
- |
- // current word count |
- int32_t fWordCount; |
- |
- // current space count |
- int32_t fSpaceCount; |
- |
- // UnicodeSet of SA characters |
- UnicodeSet fComplexContext; |
- |
- // true when fBreakIter has returned DONE |
- UBool fDone; |
-}; |
- |
-/* |
- * This is the main class. It compares word breaks and reports the differences. |
- */ |
-class ThaiWordbreakTest |
-{ |
-public: |
- // The main constructor: |
- // spaces - pointer to a UChar array for the text with spaces |
- // spaceCount - the number of characters in the spaces array |
- // noSpaces - pointer to a UChar array for the text without spaces |
- // noSpaceCount - the number of characters in the noSpaces array |
- // verbose - report all breaks if true, otherwise just report differences |
- ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); |
- ~ThaiWordbreakTest(); |
- |
- // returns the number of breaks that are in the spaces array |
- // but aren't found in the noSpaces array |
- int32_t getBreaksNotFound(); |
- |
- // returns the number of breaks which are found in the noSpaces |
- // array but aren't in the spaces array |
- int32_t getInvalidBreaks(); |
- |
- // returns the number of words found in the spaces array |
- int32_t getWordCount(); |
- |
- // reads the input Unicode text file: |
- // fileName - the path name of the file |
- // charCount - set to the number of UChars read from the file |
- // returns - the address of the UChar array containing the characters |
- static const UChar *readFile(char *fileName, int32_t &charCount); |
- |
- // removes spaces form the input UChar array: |
- // spaces - pointer to the input UChar array |
- // count - number of UChars in the spaces array |
- // nonSpaceCount - the number of UChars in the result array |
- // returns - the address of the UChar array with spaces removed |
- static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); |
- |
-private: |
- // The no arg constructor - private so clients can't call it |
- ThaiWordbreakTest(); |
- |
- // This does the actual comparison: |
- // spaces - the address of the UChar array for the text with spaces |
- // spaceCount - the number of UChars in the spaces array |
- // noSpaces - the address of the UChar array for the text without spaces |
- // noSpaceCount - the number of UChars in the noSpaces array |
- // returns - true if all breaks match, FALSE otherwise |
- UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, |
- const UChar *noSpaces, int32_t noSpaceCount); |
- |
- // helper method to report a break in the spaces |
- // array that's not found in the noSpaces array |
- void breakNotFound(int32_t br); |
- |
- // helper method to report a break that's found in |
- // the noSpaces array that's not in the spaces array |
- void foundInvalidBreak(int32_t br); |
- |
- // count of breaks in the spaces array that |
- // aren't found in the noSpaces array |
- int32_t fBreaksNotFound; |
- |
- // count of breaks found in the noSpaces array |
- // that aren't in the spaces array |
- int32_t fInvalidBreaks; |
- |
- // number of words found in the spaces array |
- int32_t fWordCount; |
- |
- // report all breaks if true, otherwise just report differences |
- UBool fVerbose; |
-}; |
- |
-/* |
- * The main constructor: it calls compareWordBreaks and reports any differences |
- */ |
-ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, |
- const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) |
-: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) |
-{ |
- compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); |
-} |
- |
-/* |
- * The no arg constructor |
- */ |
-ThaiWordbreakTest::ThaiWordbreakTest() |
-{ |
- // nothing |
-} |
- |
-/* |
- * The destructor |
- */ |
-ThaiWordbreakTest::~ThaiWordbreakTest() |
-{ |
- // nothing? |
-} |
- |
-/* |
- * returns the number of breaks in the spaces array |
- * that aren't found in the noSpaces array |
- */ |
-inline int32_t ThaiWordbreakTest::getBreaksNotFound() |
-{ |
- return fBreaksNotFound; |
-} |
- |
-/* |
- * Returns the number of breaks found in the noSpaces |
- * array that aren't in the spaces array |
- */ |
-inline int32_t ThaiWordbreakTest::getInvalidBreaks() |
-{ |
- return fInvalidBreaks; |
-} |
- |
-/* |
- * Returns the number of words found in the spaces array |
- */ |
-inline int32_t ThaiWordbreakTest::getWordCount() |
-{ |
- return fWordCount; |
-} |
- |
-/* |
- * This method does the acutal break comparison and reports the results. |
- * It uses a SpaceBreakIterator to iterate over the text with spaces, |
- * and a word instance of a Thai BreakIterator to iterate over the text |
- * without spaces. |
- */ |
-UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, |
- const UChar *noSpaces, int32_t noSpaceCount) |
-{ |
- UBool result = TRUE; |
- Locale thai("th"); |
- UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); |
- UErrorCode status = U_ZERO_ERROR; |
- |
- BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); |
- breakIter->adoptText(noSpaceIter); |
- |
- SpaceBreakIterator spaceIter(spaces, spaceCount); |
- |
- int32_t nextBreak = 0; |
- int32_t nextSpaceBreak = 0; |
- int32_t iterCount = 0; |
- |
- while (TRUE) { |
- nextSpaceBreak = spaceIter.next(); |
- nextBreak = breakIter->next(); |
- |
- if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { |
- if (nextBreak != BreakIterator::DONE) { |
- fprintf(stderr, "break iterator didn't end.\n"); |
- } else if (nextSpaceBreak != BreakIterator::DONE) { |
- fprintf(stderr, "premature break iterator end.\n"); |
- } |
- |
- break; |
- } |
- |
- while (nextSpaceBreak != nextBreak && |
- nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { |
- if (nextSpaceBreak < nextBreak) { |
- breakNotFound(nextSpaceBreak); |
- result = FALSE; |
- nextSpaceBreak = spaceIter.next(); |
- } else if (nextSpaceBreak > nextBreak) { |
- foundInvalidBreak(nextBreak); |
- result = FALSE; |
- nextBreak = breakIter->next(); |
- } |
- } |
- |
- if (fVerbose) { |
- printf("%d %d\n", nextSpaceBreak, nextBreak); |
- } |
- } |
- |
- |
- fWordCount = spaceIter.getWordCount(); |
- |
- delete breakIter; |
- |
- return result; |
-} |
- |
-/* |
- * Report a break that's in the text with spaces but |
- * not found in the text without spaces. |
- */ |
-void ThaiWordbreakTest::breakNotFound(int32_t br) |
-{ |
- if (fVerbose) { |
- printf("%d ****\n", br); |
- } else { |
- fprintf(stderr, "break not found: %d\n", br); |
- } |
- |
- fBreaksNotFound += 1; |
-} |
- |
-/* |
- * Report a break that's found in the text without spaces |
- * that isn't in the text with spaces. |
- */ |
-void ThaiWordbreakTest::foundInvalidBreak(int32_t br) |
-{ |
- if (fVerbose) { |
- printf("**** %d\n", br); |
- } else { |
- fprintf(stderr, "found invalid break: %d\n", br); |
- } |
- |
- fInvalidBreaks += 1; |
-} |
- |
-/* |
- * Read the text from a file. The text must start with a Unicode Byte |
- * Order Mark (BOM) so that we know what order to read the bytes in. |
- */ |
-const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) |
-{ |
- FILE *f; |
- int32_t fileSize; |
- |
- UChar *buffer; |
- char *bufferChars; |
- |
- f = fopen(fileName, "rb"); |
- |
- if( f == NULL ) { |
- fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); |
- return 0; |
- } |
- |
- fseek(f, 0, SEEK_END); |
- fileSize = ftell(f); |
- |
- fseek(f, 0, SEEK_SET); |
- bufferChars = new char[fileSize]; |
- |
- if(bufferChars == 0) { |
- fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); |
- fclose(f); |
- return 0; |
- } |
- |
- fread(bufferChars, sizeof(char), fileSize, f); |
- if( ferror(f) ) { |
- fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); |
- fclose(f); |
- delete[] bufferChars; |
- return 0; |
- } |
- fclose(f); |
- |
- UnicodeString myText(bufferChars, fileSize, "UTF-8"); |
- |
- delete[] bufferChars; |
- |
- charCount = myText.length(); |
- buffer = new UChar[charCount]; |
- if(buffer == 0) { |
- fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); |
- return 0; |
- } |
- |
- myText.extract(1, myText.length(), buffer); |
- charCount--; // skip the BOM |
- buffer[charCount] = 0; // NULL terminate for easier reading in the debugger |
- |
- return buffer; |
-} |
- |
-/* |
- * Remove spaces from the input UChar array. |
- * |
- * We check explicitly for a Unicode code value of 0x0020 |
- * because Unicode::isSpaceChar returns true for CR, LF, etc. |
- * |
- */ |
-const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) |
-{ |
- int32_t i, out, spaceCount; |
- |
- spaceCount = 0; |
- for (i = 0; i < count; i += 1) { |
- if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { |
- spaceCount += 1; |
- } |
- } |
- |
- nonSpaceCount = count - spaceCount; |
- UChar *noSpaces = new UChar[nonSpaceCount]; |
- |
- if (noSpaces == 0) { |
- fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); |
- return 0; |
- } |
- |
- for (out = 0, i = 0; i < count; i += 1) { |
- if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { |
- noSpaces[out++] = spaces[i]; |
- } |
- } |
- |
- return noSpaces; |
-} |
- |
-/* |
- * Generate a text file with spaces in it from a file without. |
- */ |
-int generateFile(const UChar *chars, int32_t length) { |
- Locale root(""); |
- UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); |
- UErrorCode status = U_ZERO_ERROR; |
- |
- UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); |
- BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); |
- breakIter->adoptText(noSpaceIter); |
- char outbuf[1024]; |
- int32_t strlength; |
- UChar bom = 0xFEFF; |
- |
- printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); |
- int32_t prevbreak = 0; |
- while (U_SUCCESS(status)) { |
- int32_t nextbreak = breakIter->next(); |
- if (nextbreak == BreakIterator::DONE) { |
- break; |
- } |
- printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], |
- nextbreak-prevbreak, &status)); |
- if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) |
- && complexContext.contains(chars[nextbreak])) { |
- printf(" "); |
- } |
- prevbreak = nextbreak; |
- } |
- |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "generate failed: %s\n", u_errorName(status)); |
- return status; |
- } |
- else { |
- return 0; |
- } |
-} |
- |
-/* |
- * The main routine. Read the command line arguments, read the text file, |
- * remove the spaces, do the comparison and report the final results |
- */ |
-int main(int argc, char **argv) |
-{ |
- char *fileName = "space.txt"; |
- int arg = 1; |
- UBool verbose = FALSE; |
- UBool generate = FALSE; |
- |
- if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { |
- generate = TRUE; |
- arg += 1; |
- } |
- |
- if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { |
- verbose = TRUE; |
- arg += 1; |
- } |
- |
- if (arg == argc - 1) { |
- fileName = argv[arg++]; |
- } |
- |
- if (arg != argc) { |
- fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); |
- return 1; |
- } |
- |
- int32_t spaceCount, nonSpaceCount; |
- const UChar *spaces, *noSpaces; |
- |
- spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); |
- |
- if (spaces == 0) { |
- return 1; |
- } |
- |
- if (generate) { |
- return generateFile(spaces, spaceCount); |
- } |
- |
- noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); |
- |
- if (noSpaces == 0) { |
- return 1; |
- } |
- |
- ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); |
- |
- printf("word count: %d\n", test.getWordCount()); |
- printf("breaks not found: %d\n", test.getBreaksNotFound()); |
- printf("invalid breaks found: %d\n", test.getInvalidBreaks()); |
- |
- return 0; |
-} |
- |
-/* |
- * The main constructor. Clear all the counts and construct a default |
- * word instance of a BreakIterator. |
- */ |
-SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) |
- : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) |
-{ |
- UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); |
- UErrorCode status = U_ZERO_ERROR; |
- fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); |
- Locale root(""); |
- |
- fBreakIter = BreakIterator::createWordInstance(root, status); |
- fBreakIter->adoptText(iter); |
-} |
- |
-SpaceBreakIterator::SpaceBreakIterator() |
-{ |
- // nothing |
-} |
- |
-/* |
- * The destructor. delete the underlying BreakIterator |
- */ |
-SpaceBreakIterator::~SpaceBreakIterator() |
-{ |
- delete fBreakIter; |
-} |
- |
-/* |
- * Return the next break, counting words and spaces. |
- */ |
-int32_t SpaceBreakIterator::next() |
-{ |
- if (fDone) { |
- return BreakIterator::DONE; |
- } |
- |
- int32_t nextBreak; |
- do { |
- nextBreak = fBreakIter->next(); |
- |
- if (nextBreak == BreakIterator::DONE) { |
- fDone = TRUE; |
- return BreakIterator::DONE; |
- } |
- } |
- while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) |
- && fComplexContext.contains(fText[nextBreak])); |
- |
- int32_t result = nextBreak - fSpaceCount; |
- |
- if (nextBreak < fTextCount) { |
- if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { |
- fSpaceCount += fBreakIter->next() - nextBreak; |
- } |
- } |
- |
- fWordCount += 1; |
- |
- return result; |
-} |
- |
-/* |
- * Returns the current space count |
- */ |
-int32_t SpaceBreakIterator::getSpaceCount() |
-{ |
- return fSpaceCount; |
-} |
- |
-/* |
- * Returns the current word count |
- */ |
-int32_t SpaceBreakIterator::getWordCount() |
-{ |
- return fWordCount; |
-} |
- |
- |