| Index: source/test/intltest/rbbitst.cpp
|
| diff --git a/source/test/intltest/rbbitst.cpp b/source/test/intltest/rbbitst.cpp
|
| index 95bf8dde4df9fd75edfda691e6c79d82ec496fff..278176dc333130ba0576e54b66b26b7ba0a94374 100644
|
| --- a/source/test/intltest/rbbitst.cpp
|
| +++ b/source/test/intltest/rbbitst.cpp
|
| @@ -1,6 +1,6 @@
|
| /********************************************************************
|
| * COPYRIGHT:
|
| - * Copyright (c) 1999-2013, International Business Machines Corporation and
|
| + * Copyright (c) 1999-2014, International Business Machines Corporation and
|
| * others. All Rights Reserved.
|
| ********************************************************************/
|
| /************************************************************************
|
| @@ -31,9 +31,9 @@
|
| #include "intltest.h"
|
| #include "rbbitst.h"
|
| #include <string.h>
|
| +#include "charstr.h"
|
| #include "uvector.h"
|
| #include "uvectr32.h"
|
| -#include <string.h>
|
| #include <stdio.h>
|
| #include <stdlib.h>
|
| #include "unicode/numfmt.h"
|
| @@ -354,27 +354,19 @@ void RBBITest::TestStatusReturn() {
|
| }
|
|
|
|
|
| -static void printStringBreaks(UnicodeString ustr, int expected[],
|
| - int expectedcount)
|
| -{
|
| +static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
|
| UErrorCode status = U_ZERO_ERROR;
|
| char name[100];
|
| printf("code alpha extend alphanum type word sent line name\n");
|
| - int j;
|
| - for (j = 0; j < ustr.length(); j ++) {
|
| - if (expectedcount > 0) {
|
| - int k;
|
| - for (k = 0; k < expectedcount; k ++) {
|
| - if (j == expected[k]) {
|
| - printf("------------------------------------------------ %d\n",
|
| - j);
|
| - }
|
| - }
|
| - }
|
| - UChar32 c = ustr.char32At(j);
|
| - if (c > 0xffff) {
|
| - j ++;
|
| + int nextExpectedIndex = 0;
|
| + utext_setNativeIndex(tstr, 0);
|
| + for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
|
| + if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
|
| + printf("------------------------------------------------ %d\n", j);
|
| + ++nextExpectedIndex;
|
| }
|
| +
|
| + UChar32 c = utext_next32(tstr);
|
| u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
|
| printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
|
| u_isUAlphabetic(c),
|
| @@ -400,6 +392,19 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
|
| }
|
|
|
|
|
| +// Convenience overload: wrap ustr in a UText (utext_openConstUnicodeString) and dump that.
|
| +static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
|
| +    UErrorCode openStatus = U_ZERO_ERROR;
|
| +    UText *wrapped = utext_openConstUnicodeString(NULL, &ustr, &openStatus);
|
| +    if (U_SUCCESS(openStatus)) {
|
| +        printStringBreaks(wrapped, expected, expectedCount);
|
| +        utext_close(wrapped);
|
| +    } else {
|
| +        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(openStatus));
|
| +    }
|
| +}
|
| +
|
| +
|
| void RBBITest::TestBug3818() {
|
| UErrorCode status = U_ZERO_ERROR;
|
|
|
| @@ -830,23 +835,173 @@ void RBBITest::TestBug5775() {
|
| //------------------------------------------------------------------------------
|
|
|
| struct TestParams {
|
| - BreakIterator *bi;
|
| - UnicodeString dataToBreak;
|
| - UVector32 *expectedBreaks;
|
| - UVector32 *srcLine;
|
| + BreakIterator *bi; // Break iterator is set while parsing test source.
|
| + // Changed out whenever test data changes break type.
|
| +
|
| + UnicodeString dataToBreak; // Data that is built up while parsing the test.
|
| + UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
|
| + UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
|
| UVector32 *srcCol;
|
| +
|
| + UText *textToBreak; // UText, could be UTF8 or UTF16.
|
| + UVector32 *textMap; // Map from UText native offsets to UTF-16 dataToBreak offsets; -1 inside a character.
|
| + CharString utf8String; // UTF-8 form of text to break.
|
| + // Allocate the vectors; status is handed to each UVector32 constructor.
|
| + TestParams(UErrorCode &status) : dataToBreak() {
|
| + bi = NULL;
|
| + expectedBreaks = new UVector32(status);
|
| + srcLine = new UVector32(status);
|
| + srcCol = new UVector32(status);
|
| + textToBreak = NULL;
|
| + textMap = new UVector32(status);
|
| + }
|
| + // Release the break iterator, the vectors and the UText.
|
| + ~TestParams() {
|
| + delete bi;
|
| + delete expectedBreaks;
|
| + delete srcLine;
|
| + delete srcCol;
|
| + utext_close(textToBreak);
|
| + delete textMap;
|
| + }
|
| + // Look up test data by break position (an offset in textToBreak), going through textMap.
|
| + int32_t getSrcLine(int32_t bp);
|
| + int32_t getExpectedBreak(int32_t bp);
|
| + int32_t getSrcCol(int32_t bp);
|
| + // Select the form of the text to test; each one re-opens textToBreak and rebuilds textMap.
|
| + void setUTF16(UErrorCode &status);
|
| + void setUTF8(UErrorCode &status);
|
| };
|
|
|
| -void RBBITest::executeTest(TestParams *t) {
|
| +// Convert a UnicodeString to UTF-8 and append the bytes to a CharString.
|
| +// Ill-formed UTF-16 is replaced with U+FFFD instead of being reported as an error;
|
| +// the test data deliberately includes a few unpaired surrogates.
|
| +static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +    const UChar *utf16 = src.getBuffer();
|
| +    const int32_t utf16Length = src.length();
|
| +
|
| +    // Preflight: with a NULL destination the call only reports the UTF-8 length needed.
|
| +    int32_t byteCount;
|
| +    u_strToUTF8WithSub(NULL, 0, &byteCount, utf16, utf16Length, 0xfffd, NULL, &status);
|
| +    const UBool preflightOk = U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR;
|
| +    if (!preflightOk) {
|
| +        return;
|
| +    }
|
| +    status = U_ZERO_ERROR;      // Discard the overflow status left by the preflight.
|
| +    int32_t grantedCapacity;
|
| +    char *out = dest.getAppendBuffer(byteCount, byteCount, grantedCapacity, status);
|
| +    u_strToUTF8WithSub(out, byteCount, NULL, utf16, utf16Length, 0xfffd, NULL, &status);
|
| +    dest.append(out, byteCount, status);
|
| +}
|
| +
|
| +
|
| +// Break the UTF-16 dataToBreak directly: open textToBreak over it and rebuild textMap.
|
| +void TestParams::setUTF16(UErrorCode &status) {
|
| +    textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
|
| +    textMap->removeAllElements();
|
| +    const int32_t utf16Length = dataToBreak.length();
|
| +    for (int32_t offset = 0; offset < utf16Length; ++offset) {
|
| +        // Offsets that are not the start of their code point are mapped to -1.
|
| +        const UBool atCodePointStart = (offset == dataToBreak.getChar32Start(offset));
|
| +        textMap->addElement(atCodePointStart ? offset : -1, status);
|
| +    }
|
| +    textMap->addElement(utf16Length, status);    // Entry for the end-of-text offset.
|
| +    U_ASSERT(utf16Length + 1 == textMap->size());
|
| +}
|
| +
|
| +
|
| +// Break a UTF-8 copy of dataToBreak: convert it into utf8String, open textToBreak
|
| +// over those bytes, and rebuild textMap (UTF-8 byte offset -> UTF-16 offset).
|
| +void TestParams::setUTF8(UErrorCode &status) {
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +    utf8String.clear();
|
| +    CharStringAppend(utf8String, dataToBreak, status);
|
| +    textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +
|
| +    textMap->removeAllElements();
|
| +    int32_t utf16Offset = 0;
|
| +    textMap->addElement(utf16Offset, status);       // Byte offset 0 maps to UTF-16 offset 0.
|
| +    for (UChar32 cp = utext_current32(textToBreak); cp >= 0; cp = utext_current32(textToBreak)) {
|
| +        utf16Offset += U16_LENGTH(cp);
|
| +        utext_next32(textToBreak);
|
| +        // Trail bytes of the character just passed get -1 entries.
|
| +        while (textMap->size() < utext_getNativeIndex(textToBreak)) {
|
| +            textMap->addElement(-1, status);
|
| +        }
|
| +        textMap->addElement(utf16Offset, status);   // Start of the next character, or end of text.
|
| +    }
|
| +    U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
|
| +}
|
| +
|
| +
|
| +// Return the source-file line for break position bp, an offset in textToBreak.
|
| +// Positions past the end of the text report the line of the final entry.
|
| +int32_t TestParams::getSrcLine(int32_t bp) {
|
| +    if (bp >= textMap->size()) {
|
| +        bp = textMap->size() - 1;
|
| +    }
|
| +    int32_t i = 0;
|
| +    // Back up to a character boundary; offsets inside a character map to -1.
|
| +    for (; bp >= 0; --bp) {
|
| +        i = textMap->elementAti(bp);
|
| +        if (i >= 0) { break; }
|
| +    }
|
| +    return srcLine->elementAti(i);
|
| +}
|
| +
|
| +
|
| +// Return the expected-break entry for position bp, an offset in textToBreak.
|
| +// The result is 0 when no break is expected there: for positions outside the
|
| +// text and for offsets inside a character (those map to -1 in textMap).
|
| +int32_t TestParams::getExpectedBreak(int32_t bp) {
|
| +    if (bp < 0 || bp >= textMap->size()) {
|
| +        return 0;       // A negative bp would otherwise read the entry for offset 0.
|
| +    }
|
| +    const int32_t i = textMap->elementAti(bp);
|
| +    return (i >= 0) ? expectedBreaks->elementAti(i) : 0;
|
| +}
|
| +
|
| +
|
| +// Return the source-file column for break position bp, an offset in textToBreak.
|
| +// Positions past the end of the text report the column of the final entry.
|
| +int32_t TestParams::getSrcCol(int32_t bp) {
|
| +    if (bp >= textMap->size()) {
|
| +        bp = textMap->size() - 1;
|
| +    }
|
| +    int32_t i = 0;
|
| +    // Back up to a character boundary; offsets inside a character map to -1.
|
| +    for (; bp >= 0; --bp) {
|
| +        i = textMap->elementAti(bp);
|
| +        if (i >= 0) { break; }
|
| +    }
|
| +    return srcCol->elementAti(i);
|
| +}
|
| +
|
| +
|
| +void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
|
| int32_t bp;
|
| int32_t prevBP;
|
| int32_t i;
|
|
|
| + TEST_ASSERT_SUCCESS(status);
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| if (t->bi == NULL) {
|
| return;
|
| }
|
|
|
| - t->bi->setText(t->dataToBreak);
|
| + t->bi->setText(t->textToBreak, status);
|
| //
|
| // Run the iterator forward
|
| //
|
| @@ -855,93 +1010,92 @@ void RBBITest::executeTest(TestParams *t) {
|
| if (prevBP == bp) {
|
| // Fail for lack of forward progress.
|
| errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
|
| - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
| + bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
| break;
|
| }
|
|
|
| - // Check that there were we didn't miss an expected break between the last one
|
| + // Check that we didn't miss an expected break between the last one
|
| // and this one.
|
| for (i=prevBP+1; i<bp; i++) {
|
| - if (t->expectedBreaks->elementAti(i) != 0) {
|
| + if (t->getExpectedBreak(i) != 0) {
|
| int expected[] = {0, i};
|
| printStringBreaks(t->dataToBreak, expected, 2);
|
| errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
| + i, t->getSrcLine(i), t->getSrcCol(i));
|
| }
|
| }
|
|
|
| // Check that the break we did find was expected
|
| - if (t->expectedBreaks->elementAti(bp) == 0) {
|
| + if (t->getExpectedBreak(bp) == 0) {
|
| int expected[] = {0, bp};
|
| - printStringBreaks(t->dataToBreak, expected, 2);
|
| + printStringBreaks(t->textToBreak, expected, 2);
|
| errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
| - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
| + bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
| } else {
|
| // The break was expected.
|
| // Check that the {nnn} tag value is correct.
|
| - int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
|
| + int32_t expectedTagVal = t->getExpectedBreak(bp);
|
| if (expectedTagVal == -1) {
|
| expectedTagVal = 0;
|
| }
|
| - int32_t line = t->srcLine->elementAti(bp);
|
| + int32_t line = t->getSrcLine(bp);
|
| int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
| if (rs != expectedTagVal) {
|
| errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
|
| " Actual, Expected status = %4d, %4d",
|
| - bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
|
| + bp, line, t->getSrcCol(bp), rs, expectedTagVal);
|
| }
|
| }
|
|
|
| -
|
| prevBP = bp;
|
| }
|
|
|
| // Verify that there were no missed expected breaks after the last one found
|
| - for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
|
| - if (t->expectedBreaks->elementAti(i) != 0) {
|
| + for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
|
| + if (t->getExpectedBreak(i) != 0) {
|
| errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
| + i, t->getSrcLine(i), t->getSrcCol(i));
|
| }
|
| }
|
|
|
| //
|
| // Run the iterator backwards, verify that the same breaks are found.
|
| //
|
| - prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
|
| + prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
|
| for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
|
| if (prevBP == bp) {
|
| // Fail for lack of progress.
|
| errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
|
| - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
| + bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
| break;
|
| }
|
|
|
| - // Check that there were we didn't miss an expected break between the last one
|
| + // Check that we didn't miss an expected break between the last one
|
| // and this one. (UVector returns zeros for index out of bounds.)
|
| for (i=prevBP-1; i>bp; i--) {
|
| - if (t->expectedBreaks->elementAti(i) != 0) {
|
| - errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
| + if (t->getExpectedBreak(i) != 0) {
|
| + errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
| + i, t->getSrcLine(i), t->getSrcCol(i));
|
| }
|
| }
|
|
|
| // Check that the break we did find was expected
|
| - if (t->expectedBreaks->elementAti(bp) == 0) {
|
| + if (t->getExpectedBreak(bp) == 0) {
|
| errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
| - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
| + bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
| } else {
|
| // The break was expected.
|
| // Check that the {nnn} tag value is correct.
|
| - int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
|
| + int32_t expectedTagVal = t->getExpectedBreak(bp);
|
| if (expectedTagVal == -1) {
|
| expectedTagVal = 0;
|
| }
|
| - int line = t->srcLine->elementAti(bp);
|
| - int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
| + int line = t->getSrcLine(bp);
|
| + int32_t rs = t->bi->getRuleStatus();
|
| if (rs != expectedTagVal) {
|
| errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
|
| " Actual, Expected status = %4d, %4d",
|
| - bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
|
| + bp, line, t->getSrcCol(bp), rs, expectedTagVal);
|
| }
|
| }
|
|
|
| @@ -950,30 +1104,30 @@ void RBBITest::executeTest(TestParams *t) {
|
|
|
| // Verify that there were no missed breaks prior to the last one found
|
| for (i=prevBP-1; i>=0; i--) {
|
| - if (t->expectedBreaks->elementAti(i) != 0) {
|
| + if (t->getExpectedBreak(i) != 0) {
|
| errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
| + i, t->getSrcLine(i), t->getSrcCol(i));
|
| }
|
| }
|
|
|
| // Check isBoundary()
|
| - for (i=0; i<t->expectedBreaks->size(); i++) {
|
| - UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
|
| + for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
|
| + UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
|
| UBool boundaryFound = t->bi->isBoundary(i);
|
| if (boundaryExpected != boundaryFound) {
|
| errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
|
| " Expected, Actual= %s, %s",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
|
| + i, t->getSrcLine(i), t->getSrcCol(i),
|
| boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
|
| }
|
| }
|
|
|
| // Check following()
|
| - for (i=0; i<t->expectedBreaks->size(); i++) {
|
| + for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
|
| int32_t actualBreak = t->bi->following(i);
|
| int32_t expectedBreak = BreakIterator::DONE;
|
| - for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
|
| - if (t->expectedBreaks->elementAti(j) != 0) {
|
| + for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
|
| + if (t->getExpectedBreak(j) != 0) {
|
| expectedBreak = j;
|
| break;
|
| }
|
| @@ -981,17 +1135,24 @@ void RBBITest::executeTest(TestParams *t) {
|
| if (expectedBreak != actualBreak) {
|
| errln("following(%d) incorrect. File line,col= %4d,%4d\n"
|
| " Expected, Actual= %d, %d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
|
| + i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
|
| }
|
| }
|
|
|
| // Check preceding()
|
| - for (i=t->expectedBreaks->size(); i>=0; i--) {
|
| + for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
|
| int32_t actualBreak = t->bi->preceding(i);
|
| int32_t expectedBreak = BreakIterator::DONE;
|
|
|
| - for (int32_t j=i-1; j >= 0; j--) {
|
| - if (t->expectedBreaks->elementAti(j) != 0) {
|
| + // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
|
| + // preceding(trailing byte) will return the index of some preceding code point,
|
| + // not the lead byte of the current code point, even though that has a smaller index.
|
| + // Therefore, start looking at the expected break data not at i-1, but at
|
| + // the start of code point index - 1.
|
| + utext_setNativeIndex(t->textToBreak, i);
|
| + int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
|
| + for (; j >= 0; j--) {
|
| + if (t->getExpectedBreak(j) != 0) {
|
| expectedBreak = j;
|
| break;
|
| }
|
| @@ -999,7 +1160,7 @@ void RBBITest::executeTest(TestParams *t) {
|
| if (expectedBreak != actualBreak) {
|
| errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
|
| " Expected, Actual= %d, %d",
|
| - i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
|
| + i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
|
| }
|
| }
|
| }
|
| @@ -1011,11 +1172,7 @@ void RBBITest::TestExtended() {
|
| Locale locale("");
|
|
|
| UnicodeString rules;
|
| - TestParams tp;
|
| - tp.bi = NULL;
|
| - tp.expectedBreaks = new UVector32(status);
|
| - tp.srcLine = new UVector32(status);
|
| - tp.srcCol = new UVector32(status);
|
| + TestParams tp(status);
|
|
|
| RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
|
| if (U_FAILURE(status)) {
|
| @@ -1190,7 +1347,16 @@ void RBBITest::TestExtended() {
|
| charIdx += 6;
|
|
|
| // RUN THE TEST!
|
| - executeTest(&tp);
|
| + status = U_ZERO_ERROR;
|
| + tp.setUTF16(status);
|
| + executeTest(&tp, status);
|
| + TEST_ASSERT_SUCCESS(status);
|
| +
|
| + // Run again, this time with UTF-8 text wrapped in a UText.
|
| + status = U_ZERO_ERROR;
|
| + tp.setUTF8(status);
|
| + TEST_ASSERT_SUCCESS(status);
|
| + executeTest(&tp, status);
|
| break;
|
| }
|
|
|
| @@ -1356,10 +1522,6 @@ void RBBITest::TestExtended() {
|
| }
|
|
|
| end_test:
|
| - delete tp.bi;
|
| - delete tp.expectedBreaks;
|
| - delete tp.srcLine;
|
| - delete tp.srcCol;
|
| delete [] testFile;
|
| #endif
|
| }
|
|
|