icu46/source/test/intltest/rbbitst.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/intltest/rbbitst.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /********************************************************************

	2 * COPYRIGHT:

	3 * Copyright (c) 1999-2010, International Business Machines Corporation and

	4 * others. All Rights Reserved.

	5 ********************************************************************/

	6 /************************************************************************

	7 * Date Name Description

	8 * 12/15/99 Madhu Creation.

	9 * 01/12/2000 Madhu Updated for changed API and added new tests

	10 ************************************************************************/

	11

	12 #include <typeinfo> // for 'typeid' to work

	13

	14 #include "unicode/utypes.h"

	15

	16 #if !UCONFIG_NO_BREAK_ITERATION

	17

	18 #include "unicode/utypes.h"

	19 #include "unicode/brkiter.h"

	20 #include "unicode/rbbi.h"

	21 #include "unicode/uchar.h"

	22 #include "unicode/utf16.h"

	23 #include "unicode/ucnv.h"

	24 #include "unicode/schriter.h"

	25 #include "unicode/uniset.h"

	26 #include "unicode/regex.h" // TODO: make conditional on regexp being buil t.

	27 #include "unicode/ustring.h"

	28 #include "unicode/utext.h"

	29 #include "intltest.h"

	30 #include "rbbitst.h"

	31 #include <string.h>

	32 #include "uvector.h"

	33 #include "uvectr32.h"

	34 #include "triedict.h"

	35 #include <string.h>

	36 #include <stdio.h>

	37 #include <stdlib.h>

	38

	39 #define TEST_ASSERT(x) {if (!(x)) { \

	40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

	41

	42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \

	43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__ , __LINE__, u_errorName(errcode));}}

	44

	45

	46 //---------------------------------------------

	47 // runIndexedTest

	48 //---------------------------------------------

	49

	50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha r* params )

	51 {

	52 if (exec) logln("TestSuite RuleBasedBreakIterator: ");

	53

	54 switch (index) {

	55 #if !UCONFIG_NO_FILE_IO

	56 case 0: name = "TestBug4153072";

	57 if(exec) TestBug4153072(); break;

	58 #else

	59 case 0: name = "skip";

	60 break;

	61 #endif

	62

	63 case 1: name = "TestJapaneseLineBreak";

	64 if(exec) TestJapaneseLineBreak(); break;

	65 case 2: name = "TestStatusReturn";

	66 if(exec) TestStatusReturn(); break;

	67

	68 #if !UCONFIG_NO_FILE_IO

	69 case 3: name = "TestUnicodeFiles";

	70 if(exec) TestUnicodeFiles(); break;

	71 case 4: name = "TestEmptyString";

	72 if(exec) TestEmptyString(); break;

	73 #else

	74 case 3: case 4: name = "skip";

	75 break;

	76 #endif

	77

	78 case 5: name = "TestGetAvailableLocales";

	79 if(exec) TestGetAvailableLocales(); break;

	80

	81 case 6: name = "TestGetDisplayName";

	82 if(exec) TestGetDisplayName(); break;

	83

	84 #if !UCONFIG_NO_FILE_IO

	85 case 7: name = "TestEndBehaviour";

	86 if(exec) TestEndBehaviour(); break;

	87 case 8: name = "TestMixedThaiLineBreak";

	88 if(exec) TestMixedThaiLineBreak(); break;

	89 case 9: name = "TestThaiLineBreak";

	90 if(exec) TestThaiLineBreak(); break;

	91 case 10: name = "TestMaiyamok";

	92 if(exec) TestMaiyamok(); break;

	93 case 11: name = "TestWordBreaks";

	94 if(exec) TestWordBreaks(); break;

	95 case 12: name = "TestWordBoundary";

	96 if(exec) TestWordBoundary(); break;

	97 case 13: name = "TestLineBreaks";

	98 if(exec) TestLineBreaks(); break;

	99 case 14: name = "TestSentBreaks";

	100 if(exec) TestSentBreaks(); break;

	101 case 15: name = "TestExtended";

	102 if(exec) TestExtended(); break;

	103 #else

	104 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: cas e 15: name = "skip";

	105 break;

	106 #endif

	107

	108 case 16:

	109 if(exec) {

	110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO

	111 name = "TestMonkey";

	112 TestMonkey(params);

	113 #else

	114 name = "skip";

	115 #endif

	116 }

	117 break;

	118

	119 #if !UCONFIG_NO_FILE_IO

	120 case 17: name = "TestBug3818";

	121 if(exec) TestBug3818(); break;

	122 case 18: name = "TestJapaneseWordBreak";

	123 if(exec) TestJapaneseWordBreak(); break;

	124 #else

	125 case 17: case 18: name = "skip";

	126 break;

	127 #endif

	128

	129 case 19: name = "TestDebug";

	130 if(exec) TestDebug(); break;

	131 case 20: name = "TestTrieDict";

	132 if(exec) TestTrieDict(); break;

	133

	134 #if !UCONFIG_NO_FILE_IO

	135 case 21: name = "TestBug5775";

	136 if (exec) TestBug5775(); break;

	137 case 22: name = "TestThaiBreaks";

	138 if (exec) TestThaiBreaks(); break;

	139 case 23: name = "TestTailoredBreaks";

	140 if (exec) TestTailoredBreaks(); break;

	141 #else

	142 case 21: case 22: case 23: name = "skip";

	143 break;

	144 #endif

	145 case 24: name = "TestDictRules";

	146 if (exec) TestDictRules(); break;

	147 case 25: name = "TestBug5532";

	148 if (exec) TestBug5532(); break;

	149 default: name = ""; break; //needed to end loop

	150 }

	151 }

	152

	153

	154 //---------------------------------------------------------------------------

	155 //

	156 // class BITestData Holds a set of Break iterator test data and results

	157 // Includes

	158 // - the string data to be broken

	159 // - a vector of the expected break positions.

	160 // - a vector of source line numbers for the data,

	161 // (to help see where errors occurred.)

	162 // - The expected break tag values.

	163 // - Vectors of actual break positions and tag values.

	164 // - Functions for comparing actual with expected and

	165 // reporting errors.

	166 //

	167 //----------------------------------------------------------------------------

	168 class BITestData {

	169 public:

	170 UnicodeString fDataToBreak;

	171 UVector fExpectedBreakPositions;

	172 UVector fExpectedTags;

	173 UVector fLineNum;

	174 UVector fActualBreakPositions; // Test Results.

	175 UVector fActualTags;

	176

	177 BITestData(UErrorCode &status);

	178 void addDataChunk(const char *data, int32_t tag, int32_t lineNum , UErrorCode status);

	179 void checkResults(const char heading, RBBITest test);

	180 void err(const char heading, RBBITest test, int32_t expectedId x, int32_t actualIdx);

	181 void clearResults();

	182 };

	183

	184 //

	185 // Constructor.

	186 //

	187 BITestData::BITestData(UErrorCode &status)

	188 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fAc tualBreakPositions(status),

	189 fActualTags(status)

	190 {

	191 }

	192

	193 //

	194 // addDataChunk. Add a section (non-breaking) piece of data to the test data.

	195 // The macro form collects the line number, which is helpful

	196 // when tracking down failures.

	197 //

	198 // A null data item is inserted at the start of each test's data

	199 // to put the starting zero into the data list. The position saved for

	200 // each non-null item is its ending position.

	201 //

	202 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE __, status);

	203 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UE rrorCode status) {

	204 if (U_FAILURE(status)) {return;}

	205 if (data != NULL) {

	206 fDataToBreak.append(CharsToUnicodeString(data));

	207 }

	208 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);

	209 fExpectedTags.addElement(tag, status);

	210 fLineNum.addElement(lineNum, status);

	211 }

	212

	213

	214 //

	215 // checkResults. Compare the actual and expected break positions, report any differences.

	216 //

	217 void BITestData::checkResults(const char heading, RBBITest test) {

	218 int32_t expectedIndex = 0;

	219 int32_t actualIndex = 0;

	220

	221 for (;;) {

	222 // If we've run through both the expected and actual results vectors, we 're done.

	223 // break out of the loop.

	224 if (expectedIndex >= fExpectedBreakPositions.size() &&

	225 actualIndex >= fActualBreakPositions.size()) {

	226 break;

	227 }

	228

	229

	230 if (expectedIndex >= fExpectedBreakPositions.size()) {

	231 err(heading, test, expectedIndex-1, actualIndex);

	232 actualIndex++;

	233 continue;

	234 }

	235

	236 if (actualIndex >= fActualBreakPositions.size()) {

	237 err(heading, test, expectedIndex, actualIndex-1);

	238 expectedIndex++;

	239 continue;

	240 }

	241

	242 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPosit ions.elementAti(expectedIndex)) {

	243 err(heading, test, expectedIndex, actualIndex);

	244 // Try to resync the positions of the indices, to avoid a rash of sp urious erros.

	245 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPo sitions.elementAti(expectedIndex)) {

	246 actualIndex++;

	247 } else {

	248 expectedIndex++;

	249 }

	250 continue;

	251 }

	252

	253 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expe ctedIndex)) {

	254 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",

	255 heading, fLineNum.elementAt(expectedIndex),

	256 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti( actualIndex));

	257 }

	258

	259 actualIndex++;

	260 expectedIndex++;

	261 }

	262 }

	263

	264 //

	265 // err - An error was found. Report it, along with information about where the

	266 // incorrectly broken test data appeared in the s ource file.

	267 //

	268 void BITestData::err(const char heading, RBBITest test, int32_t expectedIdx , int32_t actualIdx)

	269 {

	270 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);

	271 int32_t actual = fActualBreakPositions.elementAti(actualIdx);

	272 int32_t o = 0;

	273 int32_t line = fLineNum.elementAti(expectedIdx);

	274 if (expectedIdx > 0) {

	275 // The line numbers are off by one because a premature break occurs some where

	276 // within the previous item, rather than at the start of the current (expected) item.

	277 // We want to report the offset of the unexpected break from the star t of

	278 // this previous item.

	279 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);

	280 }

	281 if (actual < expected) {

	282 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);

	283 } else {

	284 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);

	285 }

	286 }

	287

	288

	289 void BITestData::clearResults() {

	290 fActualBreakPositions.removeAllElements();

	291 fActualTags.removeAllElements();

	292 }

	293

	294

	295 //------------------------------------------------------------------------------ -----

	296 //

	297 // Canned Test Characters

	298 //

	299 //------------------------------------------------------------------------------ -----

	300

	301 static const UChar cannedTestArray[] = {

	302 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024 , 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,

	303 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x00 44, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x00 64, 0x0065, 0x007b,

	304 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,

	305 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00 ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,

	306 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02 b4, 0x0300, 0x0301, 0x0302, 0x0303,

	307 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x09 40, 0x0949, 0x0f3a, 0x0f3b, 0x2000,

	308 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x20 28, 0x2029, 0x202a, 0x203e, 0x203f,

	309 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x21 64, 0x0000

	310 };

	311

	312 static UnicodeString* cannedTestChars = 0;

	313

	314 #define halfNA "\\u0928\\u094d\\u200d"

	315 #define halfSA "\\u0938\\u094d\\u200d"

	316 #define halfCHA "\\u091a\\u094d\\u200d"

	317 #define halfKA "\\u0915\\u094d\\u200d"

	318 #define deadTA "\\u0924\\u094d"

	319

	320 //------------------------------------------------------------------------------ --------

	321 //

	322 // RBBITest constructor and destructor

	323 //

	324 //------------------------------------------------------------------------------ --------

	325

	326 RBBITest::RBBITest() {

	327 UnicodeString temp(cannedTestArray);

	328 cannedTestChars = new UnicodeString();

	329 *cannedTestChars += (UChar)0x0000;

	330 *cannedTestChars += temp;

	331 }

	332

	333

	334 RBBITest::~RBBITest() {

	335 delete cannedTestChars;

	336 }

	337

	338

	339 static const int T_NUMBER = 100;

	340 static const int T_LETTER = 200;

	341 static const int T_H_OR_K = 300;

	342 static const int T_IDEO = 400;

	343

	344

	345

	346

	347

	348

	349 //--------------------------------------------------------------------

	350 //Testing the BreakIterator for devanagari script

	351 //--------------------------------------------------------------------

	352

	// Escaped (CharsToUnicodeString-style) spellings of Devanagari sequences.
	// The trailing notes are real comments, so they stay out of the macro expansion.
	#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
	#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
	#define deadTTHA "\\u0920\\u094d"
	#define deadPA   "\\u092a\\u094d"
	#define deadSA   "\\u0938\\u094d"
	#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/

	359

	360

	361

	362

	363

	364

	365 //------------------------------------------------------------------------------ -----

	366 //

	367 // Test for status {tag} return value from break rules.

	368 // TODO: a more thorough test.

	369 //

	370 //------------------------------------------------------------------------------ -----

	371 void RBBITest::TestStatusReturn() {

	372 UnicodeString rulesString1("$Letters = [:L:];\n"

	373 "$Numbers = [:N:];\n"

	374 "$Letters+{1};\n"

	375 "$Numbers+{2};\n"

	376 "Help\\ {4}/me\\!;\n"

	377 "[^$Letters $Numbers];\n"

	378 "!.*;\n", -1, US_INV);

	379 UnicodeString testString1 = "abc123..abc Help me Help me!";

	380 // 01234567890123456789012345678

	381 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, - 1};

	382 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, - 1};

	383

	384 UErrorCode status=U_ZERO_ERROR;

	385 UParseError parseError;

	386

	387 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parse Error, status);

	388 if(U_FAILURE(status)) {

	389 dataerrln("FAIL : in construction - %s", u_errorName(status));

	390 } else {

	391 int32_t pos;

	392 int32_t i = 0;

	393 bi->setText(testString1);

	394 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {

	395 if (pos != bounds1[i]) {

	396 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos) ;

	397 break;

	398 }

	399

	400 int tag = bi->getRuleStatus();

	401 if (tag != brkStatus[i]) {

	402 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);

	403 break;

	404 }

	405 i++;

	406 }

	407 }

	408 delete bi;

	409 }

	410

	411

	412 static void printStringBreaks(UnicodeString ustr, int expected[],

	413 int expectedcount)

	414 {

	415 UErrorCode status = U_ZERO_ERROR;

	416 char name[100];

	417 printf("code alpha extend alphanum type word sent line name\n");

	418 int j;

	419 for (j = 0; j < ustr.length(); j ++) {

	420 if (expectedcount > 0) {

	421 int k;

	422 for (k = 0; k < expectedcount; k ++) {

	423 if (j == expected[k]) {

	424 printf("------------------------------------------------ %d\ n",

	425 j);

	426 }

	427 }

	428 }

	429 UChar32 c = ustr.char32At(j);

	430 if (c > 0xffff) {

	431 j ++;

	432 }

	433 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);

	434 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,

	435 u_isUAlphabetic(c),

	436 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),

	437 u_isalnum(c),

	438 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,

	439 u_charType(c),

	440 U_SHORT_PROPERTY_NAME),

	441 u_getPropertyValueName(UCHAR_WORD_BREAK,

	442 u_getIntPropertyValue(c,

	443 UCHAR_WORD_BREAK),

	444 U_SHORT_PROPERTY_NAME),

	445 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,

	446 u_getIntPropertyValue(c,

	447 UCHAR_SENTENCE_BREAK),

	448 U_SHORT_PROPERTY_NAME),

	449 u_getPropertyValueName(UCHAR_LINE_BREAK,

	450 u_getIntPropertyValue(c,

	451 UCHAR_LINE_BREAK),

	452 U_SHORT_PROPERTY_NAME),

	453 name);

	454 }

	455 }

	456

	457 void RBBITest::TestThaiLineBreak() {

	458 UErrorCode status = U_ZERO_ERROR;

	459 BITestData thaiLineSelection(status);

	460

	461 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol th at

	462 // represents elided letters at the end of a long word. It should be bound to

	463 // the end of the word and not treated as an independent punctuation mark.

	464

	465

	466 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at sta rt of data

	467 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f ", 0, status);

	468 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);

	469 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);

	470 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);

	471 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, st atus);

	472 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);

	473 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35 \\u0e48", 0, status);

	474 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us

	475 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);

	476 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);

	477 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);

	478 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, s tatus);

	479 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, s tatus);

	480 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);

	481

	482 // the one time where the paiyannoi occurs somewhere other than at the end

	483 // of a word is in the Thai abbrevation for "etc.", which both begins and

	484 // ends with a paiyannoi

	485 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);

	486 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);

	487 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);

	488

	489 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createL ineInstance(

	490 Locale("th"), status);

	491 if (U_FAILURE(status))

	492 {

	493 errcheckln(status, "Failed to create the BreakIterator for Thai locale i n TestThaiLineBreak. - %s", u_errorName(status));

	494 return;

	495 }

	496

	497 generalIteratorTest(*e, thaiLineSelection);

	498 delete e;

	499 }

	500

	501

	502

	503 void RBBITest::TestMixedThaiLineBreak()

	504 {

	505 UErrorCode status = U_ZERO_ERROR;

	506 BITestData thaiLineSelection(status);

	507

	508 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at sta rt of data

	509

	510

	511 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English ch aracters

	512 // start

	513

	514 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);

	515 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31 \\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);

	516 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);

	517 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);

	518 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);

	519 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);

	520 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);

	521 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);

	522 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);

	523 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);

	524 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);

	525 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);

	526 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01 \\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);

	527 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E1 7\\u0E1e\\u0E2F", 0, status);

	528 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status) ;

	529 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);

	530

	531 // @suwit - end of changes

	532

	533

	534 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createL ineInstance(Locale("th"), status);

	535 if (U_FAILURE(status))

	536 {

	537 errcheckln(status, "Failed to create the BreakIterator for Thai locale i n TestMixedThaiLineBreak. - %s", u_errorName(status));

	538 return;

	539 }

	540

	541

	542 generalIteratorTest(*e, thaiLineSelection);

	543 delete e;

	544 }

	545

	546

	547 void RBBITest::TestMaiyamok()

	548 {

	549 UErrorCode status = U_ZERO_ERROR;

	550 BITestData thaiLineSelection(status);

	551 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at sta rt of data

	552 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous

	553 // word". Instead of appearing as a word unto itself, however, it's kept to gether

	554 // with the word before it

	555 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);

	556 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);

	557 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32 \\u0e07", 0, status);

	558 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);

	559 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);

	560 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);

	561 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);

	562 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);

	563 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);

	564

	565 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createL ineInstance(

	566 Locale("th"), status);

	567

	568 if (U_FAILURE(status))

	569 {

	570 errcheckln(status, "Failed to create the BreakIterator for Thai locale i n TestMaiyamok. - %s", u_errorName(status));

	571 return;

	572 }

	573 generalIteratorTest(*e, thaiLineSelection);

	574 delete e;

	575 }

	576

	577

	578

	579 void RBBITest::TestBug3818() {

	580 UErrorCode status = U_ZERO_ERROR;

	581

	582 // Four Thai words...

	583 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0 x0E2B,0x0E0D,0x0E48,

	584 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0 x0E2B,0x0E0D,0x0E48, 0 };

	585 UnicodeString thaiStr(thaiWordData);

	586

	587 RuleBasedBreakIterator* bi =

	588 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th") , status);

	589 if (U_FAILURE(status) \|\| bi == NULL) {

	590 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __ LINE__, u_errorName(status));

	591 return;

	592 }

	593 bi->setText(thaiStr);

	594

	595 int32_t startOfSecondWord = bi->following(1);

	596 if (startOfSecondWord != 4) {

	597 errln("Fail at file %s, line %d expected start of word at 4, got %d",

	598 __FILE__, __LINE__, startOfSecondWord);

	599 }

	600 startOfSecondWord = bi->following(0);

	601 if (startOfSecondWord != 4) {

	602 errln("Fail at file %s, line %d expected start of word at 4, got %d",

	603 __FILE__, __LINE__, startOfSecondWord);

	604 }

	605 delete bi;

	606 }

	607

	608

	609 void RBBITest::TestJapaneseWordBreak() {

	610 UErrorCode status = U_ZERO_ERROR;

	611 BITestData japaneseWordSelection(status);

	612

	613 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data

	614 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2

	615 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5

	616 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7

	617 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10

	618 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11

	619 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12

	620

	621 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createW ordInstance(

	622 Locale("ja"), status);

	623 if (U_FAILURE(status))

	624 {

	625 errcheckln(status, "Failed to create the BreakIterator for Japanese loca le in TestJapaneseWordBreak.\n");

	626 return;

	627 }

	628

	629 generalIteratorTest(*e, japaneseWordSelection);

	630 delete e;

	631 }

	632

	633 void RBBITest::TestTrieDict() {

	634 UErrorCode status = U_ZERO_ERROR;

	635

	636 //

	637 // Open and read the test data file.

	638 //

	639 const char *testDataDirectory = IntlTest::getSourceTestData(status);

	640 char testFileName[1000];

	641 if (testDataDirectory == NULL \|\| strlen(testDataDirectory) + strlen("riwords .txt") + 10 >= sizeof(testFileName)) {

	642 errln("Can't open test data. Path too long.");

	643 return;

	644 }

	645 strcpy(testFileName, testDataDirectory);

	646 strcat(testFileName, "riwords.txt");

	647

	648 // Items needing deleting at the end

	649 MutableTrieDictionary *mutableDict = NULL;

	650 CompactTrieDictionary *compactDict = NULL;

	651 UnicodeSet *breaks = NULL;

	652 UChar *testFile = NULL;

	653 StringEnumeration *enumer1 = NULL;

	654 StringEnumeration *enumer2 = NULL;

	655 MutableTrieDictionary *mutable2 = NULL;

	656 StringEnumeration *cloneEnum = NULL;

	657 CompactTrieDictionary *compact2 = NULL;

	658

	659

	660 const UnicodeString *originalWord = NULL;

	661 const UnicodeString *cloneWord = NULL;

	662 UChar *current;

	663 UChar *word;

	664 UChar uc;

	665 int32_t wordLen;

	666 int32_t wordCount;

	667 int32_t testCount;

	668

	669 int len;

	670 testFile = ReadAndConvertFile(testFileName, len, NULL, status);

	671 if (U_FAILURE(status)) {

	672 goto cleanup; /* something went wrong, error already output */

	673 }

	674

	675 mutableDict = new MutableTrieDictionary(0x0E1C, status);

	676 if (U_FAILURE(status)) {

	677 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)) ;

	678 goto cleanup;

	679 }

	680

	681 breaks = new UnicodeSet;

	682 breaks->add(0x000A); // Line Feed

	683 breaks->add(0x000D); // Carriage Return

	684 breaks->add(0x2028); // Line Separator

	685 breaks->add(0x2029); // Paragraph Separator

	686

	687 // Now add each non-comment line of the file as a word.

	688 current = testFile;

	689 word = current;

	690 uc = *current++;

	691 wordLen = 0;

	692 wordCount = 0;

	693

	694 while (uc) {

	695 if (uc == 0x0023) { // #comment line, skip

	696 while (uc && !breaks->contains(uc)) {

	697 uc = *current++;

	698 }

	699 }

	700 else while (uc && !breaks->contains(uc)) {

	701 ++wordLen;

	702 uc = *current++;

	703 }

	704 if (wordLen > 0) {

	705 mutableDict->addWord(word, wordLen, status);

	706 if (U_FAILURE(status)) {

	707 errln("Could not add word to mutable dictionary; status %s\n", u _errorName(status));

	708 goto cleanup;

	709 }

	710 wordCount += 1;

	711 }

	712

	713 // Find beginning of next line

	714 while (uc && breaks->contains(uc)) {

	715 uc = *current++;

	716 }

	717 word = current-1;

	718 wordLen = 0;

	719 }

	720

	721 if (wordCount < 50) {

	722 errln("Word count (%d) unreasonably small\n", wordCount);

	723 goto cleanup;

	724 }

	725

	726 enumer1 = mutableDict->openWords(status);

	727 if (U_FAILURE(status)) {

	728 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName( status));

	729 goto cleanup;

	730 }

	731

	732 testCount = 0;

	733 if (wordCount != (testCount = enumer1->count(status))) {

	734 errln("MutableTrieDictionary word count (%d) differs from file word coun t (%d), with status %s\n",

	735 testCount, wordCount, u_errorName(status));

	736 goto cleanup;

	737 }

	738

	739 // Now compact it

	740 compactDict = new CompactTrieDictionary(*mutableDict, status);

	741 if (U_FAILURE(status)) {

	742 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status ));

	743 goto cleanup;

	744 }

	745

	746 enumer2 = compactDict->openWords(status);

	747 if (U_FAILURE(status)) {

	748 errln("Could not open compact trie dictionary enumerator: %s\n", u_error Name(status));

	749 goto cleanup;

	750 }

	751

	752 if (wordCount != (testCount = enumer2->count(status))) {

	753 errln("CompactTrieDictionary word count (%d) differs from file word coun t (%d), with status %s\n",

	754 testCount, wordCount, u_errorName(status));

	755 goto cleanup;

	756 }

	757

	758 if (typeid(enumer1) == typeid(enumer2)) {

	759 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");

	760 }

	761 delete enumer1;

	762 enumer1 = NULL;

	763 delete enumer2;

	764 enumer2 = NULL;

	765

	766 // Now un-compact it

	767 mutable2 = compactDict->cloneMutable(status);

	768 if (U_FAILURE(status)) {

	769 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: % s\n", u_errorName(status));

	770 goto cleanup;

	771 }

	772

	773 cloneEnum = mutable2->openWords(status);

	774 if (U_FAILURE(status)) {

	775 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(st atus));

	776 goto cleanup;

	777 }

	778

	779 if (wordCount != (testCount = cloneEnum->count(status))) {

	780 errln("Cloned MutableTrieDictionary word count (%d) differs from file wo rd count (%d), with status %s\n",

	781 testCount, wordCount, u_errorName(status));

	782 goto cleanup;

	783 }

	784

	785 // Compact original dictionary to clone. Note that we can only compare the s ame kind of

	786 // dictionary as the order of the enumerators is not guaranteed to be the sa me between

	787 // different kinds

	788 enumer1 = mutableDict->openWords(status);

	789 if (U_FAILURE(status)) {

	790 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorNa me(status));

	791 goto cleanup;

	792 }

	793

	794 originalWord = enumer1->snext(status);

	795 cloneWord = cloneEnum->snext(status);

	796 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {

	797 if (originalWord != cloneWord) {

	798 errln("Original and cloned MutableTrieDictionary word mismatch\n");

	799 goto cleanup;

	800 }

	801 originalWord = enumer1->snext(status);

	802 cloneWord = cloneEnum->snext(status);

	803 }

	804

	805 if (U_FAILURE(status)) {

	806 errln("Enumeration failed: %s\n", u_errorName(status));

	807 goto cleanup;

	808 }

	809

	810 if (originalWord != cloneWord) {

	811 errln("Original and cloned MutableTrieDictionary ended enumeration at di fferent points\n");

	812 goto cleanup;

	813 }

	814

	815 // Test the data copying constructor for CompactTrieDict, and the data acces s APIs.

	816 compact2 = new CompactTrieDictionary(compactDict->data(), status);

	817 if (U_FAILURE(status)) {

	818 errln("CompactTrieDictionary(const void *,...) failed\n");

	819 goto cleanup;

	820 }

	821

	822 if (compact2->dataSize() == 0) {

	823 errln("CompactTrieDictionary->dataSize() == 0\n");

	824 goto cleanup;

	825 }

	826

	827 // Now count the words via the second dictionary

	828 delete enumer1;

	829 enumer1 = compact2->openWords(status);

	830 if (U_FAILURE(status)) {

	831 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_err orName(status));

	832 goto cleanup;

	833 }

	834

	835 if (wordCount != (testCount = enumer1->count(status))) {

	836 errln("CompactTrieDictionary 2 word count (%d) differs from file word co unt (%d), with status %s\n",

	837 testCount, wordCount, u_errorName(status));

	838 goto cleanup;

	839 }

	840

	841 cleanup:

	842 delete compactDict;

	843 delete mutableDict;

	844 delete breaks;

	845 delete[] testFile;

	846 delete enumer1;

	847 delete mutable2;

	848 delete cloneEnum;

	849 delete compact2;

	850 }

	851

	852

	853 //----------------------------------------------------------------------------

	854 //

	855 // generalIteratorTest Given a break iterator and a set of test data,

	856 // Run the tests and report the results.

	857 //

	858 //----------------------------------------------------------------------------

	859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)

	860 {

	861

	862 bi.setText(td.fDataToBreak);

	863

	864 testFirstAndNext(bi, td);

	865

	866 testLastAndPrevious(bi, td);

	867

	868 testFollowing(bi, td);

	869 testPreceding(bi, td);

	870 testIsBoundary(bi, td);

	871 doMultipleSelectionTest(bi, td);

	872 }

	873

	874

	875 //

	876 // testFirstAndNext. Run the iterator forwards in the obvious first(), next( )

	877 // kind of loop.

	878 //

	879 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)

	880 {

	881 UErrorCode status = U_ZERO_ERROR;

	882 int32_t p;

	883 int32_t lastP = -1;

	884 int32_t tag;

	885

	886 logln("Test first and next");

	887 bi.setText(td.fDataToBreak);

	888 td.clearResults();

	889

	890 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {

	891 td.fActualBreakPositions.addElement(p, status); // Save result.

	892 tag = bi.getRuleStatus();

	893 td.fActualTags.addElement(tag, status);

	894 if (p <= lastP) {

	895 // If the iterator is not making forward progress, stop.

	896 // No need to raise an error here, it'll be detected in the normal check of results.

	897 break;

	898 }

	899 lastP = p;

	900 }

	901 td.checkResults("testFirstAndNext", this);

	902 }

	903

	904

	905 //

	906 // TestLastAndPrevious. Run the iterator backwards, starting with last().

	907 //

	908 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)

	909 {

	910 UErrorCode status = U_ZERO_ERROR;

	911 int32_t p;

	912 int32_t lastP = 0x7ffffffe;

	913 int32_t tag;

	914

	915 logln("Test last and previous");

	916 bi.setText(td.fDataToBreak);

	917 td.clearResults();

	918

	919 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {

	920 // Save break position. Insert it at start of vector of results, shovin g

	921 // already-saved results further towards the end.

	922 td.fActualBreakPositions.insertElementAt(p, 0, status);

	923 // bi.previous(); // TODO: Why does this fix things up????

	924 // bi.next();

	925 tag = bi.getRuleStatus();

	926 td.fActualTags.insertElementAt(tag, 0, status);

	927 if (p >= lastP) {

	928 // If the iterator is not making progress, stop.

	929 // No need to raise an error here, it'll be detected in the normal check of results.

	930 break;

	931 }

	932 lastP = p;

	933 }

	934 td.checkResults("testLastAndPrevious", this);

	935 }

	936

	937

	938 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)

	939 {

	940 UErrorCode status = U_ZERO_ERROR;

	941 int32_t p;

	942 int32_t tag;

	943 int32_t lastP = -2; // A value that will never be returned as a bre ak position.

	944 // cannot be -1; that is returned for DONE.

	945 int i;

	946

	947 logln("testFollowing():");

	948 bi.setText(td.fDataToBreak);

	949 td.clearResults();

	950

	951 // Save the starting point, since we won't get that out of following.

	952 p = bi.first();

	953 td.fActualBreakPositions.addElement(p, status); // Save result.

	954 tag = bi.getRuleStatus();

	955 td.fActualTags.addElement(tag, status);

	956

	957 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {

	958 p = bi.following(i);

	959 if (p != lastP) {

	960 if (p == RuleBasedBreakIterator::DONE) {

	961 break;

	962 }

	963 // We've reached a new break position. Save it.

	964 td.fActualBreakPositions.addElement(p, status); // Save result.

	965 tag = bi.getRuleStatus();

	966 td.fActualTags.addElement(tag, status);

	967 lastP = p;

	968 }

	969 }

	970 // The loop normally exits by means of the break in the middle.

	971 // Make sure that the index was at the correct position for the break iterat or to have

	972 // returned DONE.

	973 if (i != td.fDataToBreak.length()) {

	974 errln("testFollowing(): iterator returned DONE prematurely.");

	975 }

	976

	977 // Full check of all results.

	978 td.checkResults("testFollowing", this);

	979 }

	980

	981

	982

	983 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {

	984 UErrorCode status = U_ZERO_ERROR;

	985 int32_t p;

	986 int32_t tag;

	987 int32_t lastP = 0x7ffffffe;

	988 int i;

	989

	990 logln("testPreceding():");

	991 bi.setText(td.fDataToBreak);

	992 td.clearResults();

	993

	994 p = bi.last();

	995 td.fActualBreakPositions.addElement(p, status);

	996 tag = bi.getRuleStatus();

	997 td.fActualTags.addElement(tag, status);

	998

	999 for (i = td.fDataToBreak.length(); i>=-1; i--) {

	1000 p = bi.preceding(i);

	1001 if (p != lastP) {

	1002 if (p == RuleBasedBreakIterator::DONE) {

	1003 break;

	1004 }

	1005 // We've reached a new break position. Save it.

	1006 td.fActualBreakPositions.insertElementAt(p, 0, status);

	1007 lastP = p;

	1008 tag = bi.getRuleStatus();

	1009 td.fActualTags.insertElementAt(tag, 0, status);

	1010 }

	1011 }

	1012 // The loop normally exits by means of the break in the middle.

	1013 // Make sure that the index was at the correct position for the break iterat or to have

	1014 // returned DONE.

	1015 if (i != 0) {

	1016 errln("testPreceding(): iterator returned DONE prematurely.");

	1017 }

	1018

	1019 // Full check of all results.

	1020 td.checkResults("testPreceding", this);

	1021 }

	1022

	1023

	1024

	1025 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {

	1026 UErrorCode status = U_ZERO_ERROR;

	1027 int i;

	1028 int32_t tag;

	1029

	1030 logln("testIsBoundary():");

	1031 bi.setText(td.fDataToBreak);

	1032 td.clearResults();

	1033

	1034 for (i = 0; i <= td.fDataToBreak.length(); i++) {

	1035 if (bi.isBoundary(i)) {

	1036 td.fActualBreakPositions.addElement(i, status); // Save result.

	1037 tag = bi.getRuleStatus();

	1038 td.fActualTags.addElement(tag, status);

	1039 }

	1040 }

	1041 td.checkResults("testIsBoundary: ", this);

	1042 }

	1043

	1044

	1045

	1046 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD ata &td)

	1047 {

	1048 iterator.setText(td.fDataToBreak);

	1049

	1050 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clon e();

	1051 int32_t offset = iterator.first();

	1052 int32_t testOffset;

	1053 int32_t count = 0;

	1054

	1055 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length() );

	1056

	1057 if (*testIterator != iterator)

	1058 errln("clone() or operator!= failed: two clones compared unequal");

	1059

	1060 do {

	1061 testOffset = testIterator->first();

	1062 testOffset = testIterator->next(count);

	1063 if (offset != testOffset)

	1064 errln(UnicodeString("next(n) and next() not returning consistent res ults: for step ") + count + ", next(n) returned " + testOffset + " and next() ha d " + offset);

	1065

	1066 if (offset != RuleBasedBreakIterator::DONE) {

	1067 count++;

	1068 offset = iterator.next();

	1069

	1070 if (offset != RuleBasedBreakIterator::DONE && *testIterator == itera tor) {

	1071 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);

	1072 if (count > 10000 \|\| offset == -1) {

	1073 errln("operator== failed too many times. Stopping test.");

	1074 if (offset == -1) {

	1075 errln("Does (RuleBasedBreakIterator::DONE == -1)?");

	1076 }

	1077 return;

	1078 }

	1079 }

	1080 }

	1081 } while (offset != RuleBasedBreakIterator::DONE);

	1082

	1083 // now do it backwards...

	1084 offset = iterator.last();

	1085 count = 0;

	1086

	1087 do {

	1088 testOffset = testIterator->last();

	1089 testOffset = testIterator->next(count); // next() with a negative arg is same as previous

	1090 if (offset != testOffset)

	1091 errln(UnicodeString("next(n) and next() not returning consistent res ults: for step ") + count + ", next(n) returned " + testOffset + " and next() ha d " + offset);

	1092

	1093 if (offset != RuleBasedBreakIterator::DONE) {

	1094 count--;

	1095 offset = iterator.previous();

	1096 }

	1097 } while (offset != RuleBasedBreakIterator::DONE);

	1098

	1099 delete testIterator;

	1100 }

	1101

	1102

	1103 //---------------------------------------------

	1104 //

	1105 // other tests

	1106 //

	1107 //---------------------------------------------

	1108 void RBBITest::TestEmptyString()

	1109 {

	1110 UnicodeString text = "";

	1111 UErrorCode status = U_ZERO_ERROR;

	1112

	1113 BITestData x(status);

	1114 ADD_DATACHUNK(x, "", 0, status); // Break at start of data

	1115 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::create LineInstance(Locale::getDefault(), status);

	1116 if (U_FAILURE(status))

	1117 {

	1118 errcheckln(status, "Failed to create the BreakIterator for default local e in TestEmptyString. - %s", u_errorName(status));

	1119 return;

	1120 }

	1121 generalIteratorTest(*bi, x);

	1122 delete bi;

	1123 }

	1124

	1125 void RBBITest::TestGetAvailableLocales()

	1126 {

	1127 int32_t locCount = 0;

	1128 const Locale* locList = BreakIterator::getAvailableLocales(locCount);

	1129

	1130 if (locCount == 0)

	1131 dataerrln("getAvailableLocales() returned an empty list!");

	1132 // Just make sure that it's returning good memory.

	1133 int32_t i;

	1134 for (i = 0; i < locCount; ++i) {

	1135 logln(locList[i].getName());

	1136 }

	1137 }

	1138

	1139 //Testing the BreakIterator::getDisplayName() function

	1140 void RBBITest::TestGetDisplayName()

	1141 {

	1142 UnicodeString result;

	1143

	1144 BreakIterator::getDisplayName(Locale::getUS(), result);

	1145 if (Locale::getDefault() == Locale::getUS() && result != "English (United St ates)")

	1146 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (U nited States)\", got \""

	1147 + result);

	1148

	1149 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);

	1150 if (result != "French (France)")

	1151 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (Fr ance)\", got \""

	1152 + result);

	1153 }

	1154 /**

	1155 * Test End Behaviour

	1156 * @bug 4068137

	1157 */

	1158 void RBBITest::TestEndBehaviour()

	1159 {

	1160 UErrorCode status = U_ZERO_ERROR;

	1161 UnicodeString testString("boo.");

	1162 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);

	1163 if (U_FAILURE(status))

	1164 {

	1165 errcheckln(status, "Failed to create the BreakIterator for default local e in TestEndBehaviour. - %s", u_errorName(status));

	1166 return;

	1167 }

	1168 wb->setText(testString);

	1169

	1170 if (wb->first() != 0)

	1171 errln("Didn't get break at beginning of string.");

	1172 if (wb->next() != 3)

	1173 errln("Didn't get break before period in \"boo.\"");

	1174 if (wb->current() != 4 && wb->next() != 4)

	1175 errln("Didn't get break at end of string.");

	1176 delete wb;

	1177 }

	1178 /*

	1179 * @bug 4153072

	1180 */

	1181 void RBBITest::TestBug4153072() {

	1182 UErrorCode status = U_ZERO_ERROR;

	1183 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault() , status);

	1184 if (U_FAILURE(status))

	1185 {

	1186 errcheckln(status, "Failed to create the BreakIterator for default local e in TestBug4153072 - %s", u_errorName(status));

	1187 return;

	1188 }

	1189 UnicodeString str("...Hello, World!...");

	1190 int32_t begin = 3;

	1191 int32_t end = str.length() - 3;

	1192 UBool onBoundary;

	1193

	1194 StringCharacterIterator* textIterator = new StringCharacterIterator(str, beg in, end, begin);

	1195 iter->adoptText(textIterator);

	1196 int index;

	1197 // Note: with the switch to UText, there is no way to restrict the

	1198 // iteration range to begin at an index other than zero.

	1199 // String character iterators created with a non-zero bound are

	1200 // treated by RBBI as being empty.

	1201 for (index = -1; index < begin + 1; ++index) {

	1202 onBoundary = iter->isBoundary(index);

	1203 if (index == 0? !onBoundary : onBoundary) {

	1204 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +

	1205 " and begin index = " + begin);

	1206 }

	1207 }

	1208 delete iter;

	1209 }

	1210

	1211

	1212 //

	1213 // Test for problem reported by Ashok Matoria on 9 July 2007

	1214 // One.<kSoftHyphen><kSpace>Two.

	1215 //

	1216 // Sentence break at start (0) and then on calling next() it breaks at

	1217 // 'T' of "Two". Now, at this point if I do next() and

	1218 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".

	1219 //

	1220 void RBBITest::TestBug5775() {

	1221 UErrorCode status = U_ZERO_ERROR;

	1222 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish (), status);

	1223 TEST_ASSERT_SUCCESS(status);

	1224 if (U_FAILURE(status)) {

	1225 return;

	1226 }

	1227 // Check for status first for better handling of no data errors.

	1228 TEST_ASSERT(bi != NULL);

	1229 if (bi == NULL) {

	1230 return;

	1231 }

	1232

	1233 UnicodeString s("One.\\u00ad Two.", -1, US_INV);

	1234 // 01234 56789

	1235 s = s.unescape();

	1236 bi->setText(s);

	1237 int pos = bi->next();

	1238 TEST_ASSERT(pos == 6);

	1239 pos = bi->next();

	1240 TEST_ASSERT(pos == 10);

	1241 pos = bi->previous();

	1242 TEST_ASSERT(pos == 6);

	1243 delete bi;

	1244 }

	1245

	1246

	1247

	1248 /**

	1249 * Test Japanese Line Break

	1250 * @bug 4095322

	1251 */

	1252 void RBBITest::TestJapaneseLineBreak()

	1253 {

	1254 #if 0

	1255 // Test needs updating some more... Dump it for now.

	1256

	1257

	1258 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count

	1259 // as opening and closing punctuation for line breaking.

	1260 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars

	1261 // from these tests. 6-13-2002

	1262 //

	1263 UErrorCode status = U_ZERO_ERROR;

	1264 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");

	1265 UnicodeString precedingChars = CharsToUnicodeString(

	1266 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\ u201f");

	1267 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");

	1268 UnicodeString followingChars = CharsToUnicodeString(

	1269 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30 e3\\u30e5\\u30e7\\u30fc"

	1270 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\ \u30e7"

	1271 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u 00b0\\u2032\\u2033\\u2034"

	1272 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u203 4"

	1273 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");

	1274 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);

	1275

	1276 int32_t i;

	1277 if (U_FAILURE(status))

	1278 {

	1279 errln("Failed to create the BreakIterator for Japanese locale in TestJap aneseLineBreak.\n");

	1280 return;

	1281 }

	1282

	1283 for (i = 0; i < precedingChars.length(); i++) {

	1284 testString.setCharAt(1, precedingChars[i]);

	1285 iter->setText(testString);

	1286 int32_t j = iter->first();

	1287 if (j != 0)

	1288 errln("ja line break failure: failed to start at 0");

	1289 j = iter->next();

	1290 if (j != 1)

	1291 errln("ja line break failure: failed to stop before '" + UCharToUnic odeString(precedingChars[i])

	1292 + "' (" + ((int)(precedingChars[i])) + ")");

	1293 j = iter->next();

	1294 if (j != 3)

	1295 errln("ja line break failure: failed to skip position after '" + UCh arToUnicodeString(precedingChars[i])

	1296 + "' (" + ((int)(precedingChars[i])) + ")");

	1297 }

	1298

	1299 for (i = 0; i < followingChars.length(); i++) {

	1300 testString.setCharAt(1, followingChars[i]);

	1301 iter->setText(testString);

	1302 int j = iter->first();

	1303 if (j != 0)

	1304 errln("ja line break failure: failed to start at 0");

	1305 j = iter->next();

	1306 if (j != 2)

	1307 errln("ja line break failure: failed to skip position before '" + UC harToUnicodeString(followingChars[i])

	1308 + "' (" + ((int)(followingChars[i])) + ")");

	1309 j = iter->next();

	1310 if (j != 3)

	1311 errln("ja line break failure: failed to stop after '" + UCharToUnico deString(followingChars[i])

	1312 + "' (" + ((int)(followingChars[i])) + ")");

	1313 }

	1314 delete iter;

	1315 #endif

	1316 }

	1317

	1318

	1319 //------------------------------------------------------------------------------

	1320 //

	1321 // RBBITest::Extended Run RBBI Tests from an external test data file

	1322 //

	1323 //------------------------------------------------------------------------------

	1324

	1325 struct TestParams {

	1326 BreakIterator *bi;

	1327 UnicodeString dataToBreak;

	1328 UVector32 *expectedBreaks;

	1329 UVector32 *srcLine;

	1330 UVector32 *srcCol;

	1331 };

	1332

	1333 void RBBITest::executeTest(TestParams *t) {

	1334 int32_t bp;

	1335 int32_t prevBP;

	1336 int32_t i;

	1337

	1338 if (t->bi == NULL) {

	1339 return;

	1340 }

	1341

	1342 t->bi->setText(t->dataToBreak);

	1343 //

	1344 // Run the iterator forward

	1345 //

	1346 prevBP = -1;

	1347 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {

	1348 if (prevBP == bp) {

	1349 // Fail for lack of forward progress.

	1350 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",

	1351 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));

	1352 break;

	1353 }

	1354

	1355 // Check that there were we didn't miss an expected break between the la st one

	1356 // and this one.

	1357 for (i=prevBP+1; i<bp; i++) {

	1358 if (t->expectedBreaks->elementAti(i) != 0) {

	1359 int expected[] = {0, i};

	1360 printStringBreaks(t->dataToBreak, expected, 2);

	1361 errln("Forward Iteration, break expected, but not found. Pos=%4 d File line,col= %4d,%4d",

	1362 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));

	1363 }

	1364 }

	1365

	1366 // Check that the break we did find was expected

	1367 if (t->expectedBreaks->elementAti(bp) == 0) {

	1368 int expected[] = {0, bp};

	1369 printStringBreaks(t->dataToBreak, expected, 2);

	1370 errln("Forward Iteration, break found, but not expected. Pos=%4d F ile line,col= %4d,%4d",

	1371 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));

	1372 } else {

	1373 // The break was expected.

	1374 // Check that the {nnn} tag value is correct.

	1375 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);

	1376 if (expectedTagVal == -1) {

	1377 expectedTagVal = 0;

	1378 }

	1379 int32_t line = t->srcLine->elementAti(bp);

	1380 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();

	1381 if (rs != expectedTagVal) {

	1382 errln("Incorrect status for forward break. Pos=%4d File line,c ol= %4d,%4d.\n"

	1383 " Actual, Expected status = %4d, %4d",

	1384 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);

	1385 }

	1386 }

	1387

	1388

	1389 prevBP = bp;

	1390 }

	1391

	1392 // Verify that there were no missed expected breaks after the last one found

	1393 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {

	1394 if (t->expectedBreaks->elementAti(i) != 0) {

	1395 errln("Forward Iteration, break expected, but not found. Pos=%4d F ile line,col= %4d,%4d",

	1396 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));

	1397 }

	1398 }

	1399

	1400 //

	1401 // Run the iterator backwards, verify that the same breaks are found.

	1402 //

	1403 prevBP = t->dataToBreak.length()+2; // start with a phony value for the las t break pos seen.

	1404 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {

	1405 if (prevBP == bp) {

	1406 // Fail for lack of progress.

	1407 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col =%4d,%4d",

	1408 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));

	1409 break;

	1410 }

	1411

	1412 // Check that there were we didn't miss an expected break between the la st one

	1413 // and this one. (UVector returns zeros for index out of bounds.)

	1414 for (i=prevBP-1; i>bp; i--) {

	1415 if (t->expectedBreaks->elementAti(i) != 0) {

	1416 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",

	1417 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));

	1418 }

	1419 }

	1420

	1421 // Check that the break we did find was expected

	1422 if (t->expectedBreaks->elementAti(bp) == 0) {

	1423 errln("Reverse Itertion, break found, but not expected. Pos=%4d Fi le line,col= %4d,%4d",

	1424 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));

	1425 } else {

	1426 // The break was expected.

	1427 // Check that the {nnn} tag value is correct.

	1428 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);

	1429 if (expectedTagVal == -1) {

	1430 expectedTagVal = 0;

	1431 }

	1432 int line = t->srcLine->elementAti(bp);

	1433 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();

	1434 if (rs != expectedTagVal) {

	1435 errln("Incorrect status for reverse break. Pos=%4d File line,c ol= %4d,%4d.\n"

	1436 " Actual, Expected status = %4d, %4d",

	1437 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);

	1438 }

	1439 }

	1440

	1441 prevBP = bp;

	1442 }

	1443

	1444 // Verify that there were no missed breaks prior to the last one found

	1445 for (i=prevBP-1; i>=0; i--) {

	1446 if (t->expectedBreaks->elementAti(i) != 0) {

	1447 errln("Forward Itertion, break expected, but not found. Pos=%4d Fi le line,col= %4d,%4d",

	1448 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));

	1449 }

	1450 }

	1451 }

	1452

	1453

	1454 void RBBITest::TestExtended() {

	1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	1456 UErrorCode status = U_ZERO_ERROR;

	1457 Locale locale("");

	1458

	1459 UnicodeString rules;

	1460 TestParams tp;

	1461 tp.bi = NULL;

	1462 tp.expectedBreaks = new UVector32(status);

	1463 tp.srcLine = new UVector32(status);

	1464 tp.srcCol = new UVector32(status);

	1465

	1466 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale ([\\p{L}\\p{ Nd}_]) *>"), 0, status);

	1467 if (U_FAILURE(status)) {

	1468 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LI NE__, u_errorName(status));

	1469 }

	1470

	1471

	1472 //

	1473 // Open and read the test data file.

	1474 //

	1475 const char *testDataDirectory = IntlTest::getSourceTestData(status);

	1476 char testFileName[1000];

	1477 if (testDataDirectory == NULL \|\| strlen(testDataDirectory) >= sizeof(testFil eName)) {

	1478 errln("Can't open test data. Path too long.");

	1479 return;

	1480 }

	1481 strcpy(testFileName, testDataDirectory);

	1482 strcat(testFileName, "rbbitst.txt");

	1483

	1484 int len;

	1485 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);

	1486 if (U_FAILURE(status)) {

	1487 return; /* something went wrong, error already output */

	1488 }

	1489

	1490

	1491

	1492

	1493 //

	1494 // Put the test data into a UnicodeString

	1495 //

	1496 UnicodeString testString(FALSE, testFile, len);

	1497

	1498 enum EParseState{

	1499 PARSE_COMMENT,

	1500 PARSE_TAG,

	1501 PARSE_DATA,

	1502 PARSE_NUM

	1503 }

	1504 parseState = PARSE_TAG;

	1505

	1506 EParseState savedState = PARSE_TAG;

	1507

	1508 static const UChar CH_LF = 0x0a;

	1509 static const UChar CH_CR = 0x0d;

	1510 static const UChar CH_HASH = 0x23;

	1511 /static const UChar CH_PERIOD = 0x2e;/

	1512 static const UChar CH_LT = 0x3c;

	1513 static const UChar CH_GT = 0x3e;

	1514 static const UChar CH_BACKSLASH = 0x5c;

	1515 static const UChar CH_BULLET = 0x2022;

	1516

	1517 int32_t lineNum = 1;

	1518 int32_t colStart = 0;

	1519 int32_t column = 0;

	1520 int32_t charIdx = 0;

	1521

	1522 int32_t tagValue = 0; // The numeric value of a <nnn> tag.

	1523

	1524 for (charIdx = 0; charIdx < len; ) {

	1525 status = U_ZERO_ERROR;

	1526 UChar c = testString.charAt(charIdx);

	1527 charIdx++;

	1528 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {

	1529 // treat CRLF as a unit

	1530 c = CH_LF;

	1531 charIdx++;

	1532 }

	1533 if (c == CH_LF \|\| c == CH_CR) {

	1534 lineNum++;

	1535 colStart = charIdx;

	1536 }

	1537 column = charIdx - colStart + 1;

	1538

	1539 switch (parseState) {

	1540 case PARSE_COMMENT:

	1541 if (c == 0x0a \|\| c == 0x0d) {

	1542 parseState = savedState;

	1543 }

	1544 break;

	1545

	1546 case PARSE_TAG:

	1547 {

	1548 if (c == CH_HASH) {

	1549 parseState = PARSE_COMMENT;

	1550 savedState = PARSE_TAG;

	1551 break;

	1552 }

	1553 if (u_isUWhiteSpace(c)) {

	1554 break;

	1555 }

	1556 if (testString.compare(charIdx-1, 6, "<word>") == 0) {

	1557 delete tp.bi;

	1558 tp.bi = BreakIterator::createWordInstance(locale, status);

	1559 charIdx += 5;

	1560 break;

	1561 }

	1562 if (testString.compare(charIdx-1, 6, "<char>") == 0) {

	1563 delete tp.bi;

	1564 tp.bi = BreakIterator::createCharacterInstance(locale, status);

	1565 charIdx += 5;

	1566 break;

	1567 }

	1568 if (testString.compare(charIdx-1, 6, "<line>") == 0) {

	1569 delete tp.bi;

	1570 tp.bi = BreakIterator::createLineInstance(locale, status);

	1571 charIdx += 5;

	1572 break;

	1573 }

	1574 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {

	1575 delete tp.bi;

	1576 tp.bi = NULL;

	1577 tp.bi = BreakIterator::createSentenceInstance(locale, status);

	1578 charIdx += 5;

	1579 break;

	1580 }

	1581 if (testString.compare(charIdx-1, 7, "<title>") == 0) {

	1582 delete tp.bi;

	1583 tp.bi = BreakIterator::createTitleInstance(locale, status);

	1584 charIdx += 6;

	1585 break;

	1586 }

	1587

	1588 // <locale loc_name>

	1589 localeMatcher.reset(testString);

	1590 if (localeMatcher.lookingAt(charIdx-1, status)) {

	1591 UnicodeString localeName = localeMatcher.group(1, status);

	1592 char localeName8[100];

	1593 localeName.extract(0, localeName.length(), localeName8, sizeof(l ocaleName8), 0);

	1594 locale = Locale::createFromName(localeName8);

	1595 charIdx += localeMatcher.group(0, status).length();

	1596 TEST_ASSERT_SUCCESS(status);

	1597 break;

	1598 }

	1599 if (testString.compare(charIdx-1, 6, "<data>") == 0) {

	1600 parseState = PARSE_DATA;

	1601 charIdx += 5;

	1602 tp.dataToBreak = "";

	1603 tp.expectedBreaks->removeAllElements();

	1604 tp.srcCol ->removeAllElements();

	1605 tp.srcLine->removeAllElements();

	1606 break;

	1607 }

	1608

	1609 errln("line %d: Tag expected in test file.", lineNum);

	1610 parseState = PARSE_COMMENT;

	1611 savedState = PARSE_DATA;

	1612 goto end_test; // Stop the test.

	1613 }

	1614 break;

	1615

	1616 case PARSE_DATA:

	1617 if (c == CH_BULLET) {

	1618 int32_t breakIdx = tp.dataToBreak.length();

	1619 tp.expectedBreaks->setSize(breakIdx+1);

	1620 tp.expectedBreaks->setElementAt(-1, breakIdx);

	1621 tp.srcLine->setSize(breakIdx+1);

	1622 tp.srcLine->setElementAt(lineNum, breakIdx);

	1623 tp.srcCol ->setSize(breakIdx+1);

	1624 tp.srcCol ->setElementAt(column, breakIdx);

	1625 break;

	1626 }

	1627

	1628 if (testString.compare(charIdx-1, 7, "</data>") == 0) {

	1629 // Add final entry to mappings from break location to source fil e position.

	1630 // Need one extra because last break position returned is after the

	1631 // last char in the data, not at the last char.

	1632 tp.srcLine->addElement(lineNum, status);

	1633 tp.srcCol ->addElement(column, status);

	1634

	1635 parseState = PARSE_TAG;

	1636 charIdx += 6;

	1637

	1638 // RUN THE TEST!

	1639 executeTest(&tp);

	1640 break;

	1641 }

	1642

	1643 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {

	1644 // Named character, e.g. \N{COMBINING GRAVE ACCENT}

	1645 // Get the code point from the name and insert it into the test data.

	1646 // (Damn, no API takes names in Unicode !!!

	1647 // we've got to take it back to char *)

	1648 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/'}'/, char Idx);

	1649 int32_t nameLength = nameEndIdx - (charIdx+2);

	1650 char charNameBuf[200];

	1651 UChar32 theChar = -1;

	1652 if (nameEndIdx != -1) {

	1653 UErrorCode status = U_ZERO_ERROR;

	1654 testString.extract(charIdx+2, nameLength, charNameBuf, sizeo f(charNameBuf));

	1655 charNameBuf[sizeof(charNameBuf)-1] = 0;

	1656 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, & status);

	1657 if (U_FAILURE(status)) {

	1658 theChar = -1;

	1659 }

	1660 }

	1661 if (theChar == -1) {

	1662 errln("Error in named character in test file at line %d, col %d",

	1663 lineNum, column);

	1664 } else {

	1665 // Named code point was recognized. Insert it

	1666 // into the test data.

	1667 tp.dataToBreak.append(theChar);

	1668 while (tp.dataToBreak.length() > tp.srcLine->size()) {

	1669 tp.srcLine->addElement(lineNum, status);

	1670 tp.srcCol ->addElement(column, status);

	1671 }

	1672 }

	1673 if (nameEndIdx > charIdx) {

	1674 charIdx = nameEndIdx+1;

	1675

	1676 }

	1677 break;

	1678 }

	1679

	1680

	1681

	1682

	1683 if (testString.compare(charIdx-1, 2, "<>") == 0) {

	1684 charIdx++;

	1685 int32_t breakIdx = tp.dataToBreak.length();

	1686 tp.expectedBreaks->setSize(breakIdx+1);

	1687 tp.expectedBreaks->setElementAt(-1, breakIdx);

	1688 tp.srcLine->setSize(breakIdx+1);

	1689 tp.srcLine->setElementAt(lineNum, breakIdx);

	1690 tp.srcCol ->setSize(breakIdx+1);

	1691 tp.srcCol ->setElementAt(column, breakIdx);

	1692 break;

	1693 }

	1694

	1695 if (c == CH_LT) {

	1696 tagValue = 0;

	1697 parseState = PARSE_NUM;

	1698 break;

	1699 }

	1700

	1701 if (c == CH_HASH && column==3) { // TODO: why is column off so fa r?

	1702 parseState = PARSE_COMMENT;

	1703 savedState = PARSE_DATA;

	1704 break;

	1705 }

	1706

	1707 if (c == CH_BACKSLASH) {

	1708 // Check for \ at end of line, a line continuation.

	1709 // Advance over (discard) the newline

	1710 UChar32 cp = testString.char32At(charIdx);

	1711 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) = = CH_LF) {

	1712 // We have a CR LF

	1713 // Need an extra increment of the input ptr to move over bo th of them

	1714 charIdx++;

	1715 }

	1716 if (cp == CH_LF \|\| cp == CH_CR) {

	1717 lineNum++;

	1718 colStart = charIdx;

	1719 charIdx++;

	1720 break;

	1721 }

	1722

	1723 // Let unescape handle the back slash.

	1724 cp = testString.unescapeAt(charIdx);

	1725 if (cp != -1) {

	1726 // Escape sequence was recognized. Insert the char

	1727 // into the test data.

	1728 tp.dataToBreak.append(cp);

	1729 while (tp.dataToBreak.length() > tp.srcLine->size()) {

	1730 tp.srcLine->addElement(lineNum, status);

	1731 tp.srcCol ->addElement(column, status);

	1732 }

	1733 break;

	1734 }

	1735

	1736

	1737 // Not a recognized backslash escape sequence.

	1738 // Take the next char as a literal.

	1739 // TODO: Should this be an error?

	1740 c = testString.charAt(charIdx);

	1741 charIdx = testString.moveIndex32(charIdx, 1);

	1742 }

	1743

	1744 // Normal, non-escaped data char.

	1745 tp.dataToBreak.append(c);

	1746

	1747 // Save the mapping from offset in the data to line/column numbers i n

	1748 // the original input file. Will be used for better error message s only.

	1749 // If there's an expected break before this char, the slot in the mapping

	1750 // vector will already be set for this char; don't overwrite it.

	1751 if (tp.dataToBreak.length() > tp.srcLine->size()) {

	1752 tp.srcLine->addElement(lineNum, status);

	1753 tp.srcCol ->addElement(column, status);

	1754 }

	1755 break;

	1756

	1757

	1758 case PARSE_NUM:

	1759 // We are parsing an expected numeric tag value, like <1234>,

	1760 // within a chunk of data.

	1761 if (u_isUWhiteSpace(c)) {

	1762 break;

	1763 }

	1764

	1765 if (c == CH_GT) {

	1766 // Finished the number. Add the info to the expected break data ,

	1767 // and switch parse state back to doing plain data.

	1768 parseState = PARSE_DATA;

	1769 if (tagValue == 0) {

	1770 tagValue = -1;

	1771 }

	1772 int32_t breakIdx = tp.dataToBreak.length();

	1773 tp.expectedBreaks->setSize(breakIdx+1);

	1774 tp.expectedBreaks->setElementAt(tagValue, breakIdx);

	1775 tp.srcLine->setSize(breakIdx+1);

	1776 tp.srcLine->setElementAt(lineNum, breakIdx);

	1777 tp.srcCol ->setSize(breakIdx+1);

	1778 tp.srcCol ->setElementAt(column, breakIdx);

	1779 break;

	1780 }

	1781

	1782 if (u_isdigit(c)) {

	1783 tagValue = tagValue*10 + u_charDigitValue(c);

	1784 break;

	1785 }

	1786

	1787 errln("Syntax Error in test file at line %d, col %d",

	1788 lineNum, column);

	1789 parseState = PARSE_COMMENT;

	1790 goto end_test; // Stop the test

	1791 break;

	1792 }

	1793

	1794

	1795 if (U_FAILURE(status)) {

	1796 errln("ICU Error %s while parsing test file at line %d.",

	1797 u_errorName(status), lineNum);

	1798 status = U_ZERO_ERROR;

	1799 goto end_test; // Stop the test

	1800 }

	1801

	1802 }

	1803

	1804 end_test:

	1805 delete tp.bi;

	1806 delete tp.expectedBreaks;

	1807 delete tp.srcLine;

	1808 delete tp.srcCol;

	1809 delete [] testFile;

	1810 #endif

	1811 }

	1812

	1813 void RBBITest::TestThaiBreaks() {

	1814 UErrorCode status=U_ZERO_ERROR;

	1815 BreakIterator* b;

	1816 Locale locale = Locale("th");

	1817 int32_t p, index;

	1818 UChar c[]= {

	1819 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E 49, 0x0E07, 0x0020, 0x0E1B,

	1820 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E 43, 0x0E19,

	1821 0x0E16, 0x0E49, 0x0E33, 0x0000

	1822 };

	1823 int32_t expectedWordResult[] = {

	1824 2, 3, 6, 10, 11, 15, 17, 20, 22

	1825 };

	1826 int32_t expectedLineResult[] = {

	1827 3, 6, 11, 15, 17, 20, 22

	1828 };

	1829

	1830 int32_t size = u_strlen(c);

	1831 UnicodeString text=UnicodeString(c);

	1832

	1833 b = BreakIterator::createWordInstance(locale, status);

	1834 if (U_FAILURE(status)) {

	1835 errcheckln(status, "Unable to create thai word break iterator. - %s", u_ errorName(status));

	1836 return;

	1837 }

	1838 b->setText(text);

	1839 p = index = 0;

	1840 while ((p=b->next())!=BreakIterator::DONE && p < size) {

	1841 if (p != expectedWordResult[index++]) {

	1842 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);

	1843 }

	1844 }

	1845 delete b;

	1846

	1847 b = BreakIterator::createLineInstance(locale, status);

	1848 if (U_FAILURE(status)) {

	1849 printf("Unable to create thai line break iterator.\n");

	1850 return;

	1851 }

	1852 b->setText(text);

	1853 p = index = 0;

	1854 while ((p=b->next())!=BreakIterator::DONE && p < size) {

	1855 if (p != expectedLineResult[index++]) {

	1856 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);

	1857 }

	1858 }

	1859

	1860 delete b;

	1861 }

	1862

	// Test data for the tbItems table: word breaking (UBRK_WORD) with locale "en_US_POSIX".
	1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"

	1864 // Words don't include colon or period (cldrbug #1969).

	// posxWordText is the text to break.  In tbItems, posxWordTOffsets fills the
	// tailored-locale expectation slot and posxWordROffsets the root-locale slot.
	// NOTE(review): the literal shows "struct. field" with a space, but the offsets
	// 36, 37, 42 only line up with "struct.field" -- the space looks like a
	// line-wrap artifact of this listing; confirm against the upstream file.
	1865 static const char posxWordText[] = "Can't have breaks in xx:yy or struct. field for CS-types.";

	1866 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24 , 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };

	1867 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };

	1868

	// Test data for the tbItems table: word breaking (UBRK_WORD) with locale "ja".
	1869 // UBreakIteratorType UBRK_WORD, Locale "ja"

	1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).

	// The text is written with doubled backslashes, so the C string holds literal
	// "\uXXXX" sequences; TBTest converts it with CharsToUnicodeString().
	// NOTE(review): "\\u3 007" and "\\u3 005" contain a space that looks like a
	// line-wrap artifact of this listing (expected "\\u3007" / "\\u3005"); confirm
	// against the upstream file.
	1871 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3 007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"

	1872 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3 005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";

	// jaWordTOffsets: expectation for the "ja" iterator; jaWordROffsets: expectation for the root iterator.
	1873 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17 , 18, 20, 21, 24, 27, 28 };

	1874 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17 , 18, 19, 20, 21, 24, 25, 26, 27, 28 };

	1875

	// Test data for the tbItems table: sentence breaking (UBRK_SENTENCE) with locale "el".
	1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el"

	1877 // Add break after Greek question mark (cldrbug #2069).

	// The text is written with doubled backslashes, so the C string holds literal
	// "\uXXXX" sequences; TBTest converts it with CharsToUnicodeString().
	// NOTE(review): "\\u039F\ \u03C0" on the second line is split by what looks like
	// a line-wrap artifact of this listing (expected "\\u039F\\u03C0"); confirm
	// against the upstream file.
	1878 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "

	1879 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\ \u03C0, \\u03A1\\u03C2? \\u03A3";

	// elSentTOffsets: expectation for the "el" iterator; elSentROffsets: expectation for the root iterator.
	1880 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };

	1881 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 };

	1882

	// Test data for the tbItems table: character breaking (UBRK_CHARACTER) with locale "th".
	1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th"

	1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).

	// The text is written with doubled backslashes, so the C string holds literal
	// "\uXXXX" sequences; TBTest converts it with CharsToUnicodeString().
	// NOTE(review): "\\u0 E2D", "\\u 0E34" and "\\u0 E35" contain a space that looks
	// like a line-wrap artifact of this listing; confirm against the upstream file.
	1885 static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0 E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "

	1886 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u 0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "

	1887 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0 E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";

	// thCharTOffsets: expectation for the "th" iterator.
	1888 static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,

	1889 12, 13, 15, 16, 17, 19, 20, 22, 23, 24 , 25, 26, 27, 28,

	1890 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };

	// thCharROffsets: expectation for the root iterator.
	1891 static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11,

	1892 12, 13, 15, 17, 19, 20, 22, 24 , 26, 27, 28,

	1893 29, 32, 33, 35, 37, 38, 40, 41 };

	1894

	1895 typedef struct {

	1896 UBreakIteratorType type;

	1897 const char * locale;

	1898 const char * escapedText;

	1899 const int32_t * tailoredOffsets;

	1900 int32_t tailoredOffsetsCount;

	1901 const int32_t * rootOffsets;

	1902 int32_t rootOffsetsCount;

	1903 } TailoredBreakItem;

	1904

	// Expands to two comma-separated initializers: the array itself (decaying to a
	// pointer) and its element count.  Only valid for true arrays, not pointers.
	// The argument is parenthesized in every use so any array expression is safe.
	#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof((array)[0]))

	1906

	// Table driving TestTailoredBreaks(): one row per (iterator type, locale, text).
	// Each ARRAY_PTR_LEN(x) supplies both a pointer field and its count field; the
	// first pair is the tailored-locale expectation, the second the root expectation.
	// The loop in TestTailoredBreaks() stops at the row whose escapedText is NULL.
	// NOTE(review): "posxWordTOffset s" and the blanks before some closing
	// parentheses look like line-wrap artifacts of this listing.
	1907 static const TailoredBreakItem tbItems[] = {

	1908 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffset s), ARRAY_PTR_LEN(posxWordROffsets) },

	1909 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets) , ARRAY_PTR_LEN(jaWordROffsets) },

	1910 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets) , ARRAY_PTR_LEN(elSentROffsets) },

	1911 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets) , ARRAY_PTR_LEN(thCharROffsets) },

	1912 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator

	1913 };

	1914

	// Format `count` integers from `offsets` into `buffer` as " n1 n2 n3...".
	//
	//   buffer   destination; always NUL-terminated on return when buflen > 0
	//   buflen   capacity of buffer in bytes, including the terminating NUL
	//   count    number of entries to format
	//   offsets  values to format
	//
	// Output is truncated at a whole-number boundary when the buffer is too
	// small; nothing is ever written past buffer[buflen-1].  Each number is
	// formatted into a small scratch array first, so neither snprintf nor the
	// "%n" conversion is required.
	static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
	    if (buffer == NULL || buflen <= 0) {
	        return;
	    }
	    buffer[0] = 0;    // well-defined result even when count is 0
	    while (count-- > 0) {
	        char item[16];    // " -2147483648" is 12 chars + NUL
	        const int32_t itemLen = (int32_t)sprintf(item, " %d", (int)*offsets++);
	        if (itemLen <= 0 || itemLen >= buflen) {
	            break;        // does not fit (need room for the NUL too)
	        }
	        memcpy(buffer, item, (size_t)itemLen + 1);
	        buffer += itemLen;
	        buflen -= itemLen;
	    }
	}

	1923

	// Capacity of the foundOffsets array in TBTest(); forward iteration there stops
	// collecting boundaries once this many have been recorded.
	1924 enum { kMaxOffsetCount = 128 };

	1925

	1926 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char locale, const char escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {

	1927 brkitr->setText( CharsToUnicodeString(escapedText) );

	1928 int32_t foundOffsets[kMaxOffsetCount];

	1929 int32_t offset, foundOffsetsCount = 0;

	1930 // do forwards iteration test

	1931 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {

	1932 foundOffsets[foundOffsetsCount++] = offset;

	1933 }

	1934 if ( foundOffsetsCount != expectOffsetsCount \|\| memcmp(expectOffsets, foundO ffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {

	1935 // log error for forwards test

	1936 char formatExpect[512], formatFound[512];

	1937 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, ex pectOffsets);

	1938 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, found Offsets);

	1939 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",

	1940 type, locale, escapedText, expectOffsetsCount, formatExpect, fou ndOffsetsCount, formatFound);

	1941 } else {

	1942 // do backwards iteration test

	1943 --foundOffsetsCount; // back off one from the end offset

	1944 while ( foundOffsetsCount > 0 ) {

	1945 offset = brkitr->previous();

	1946 if ( offset != foundOffsets[--foundOffsetsCount] ) {

	1947 // log error for backwards test

	1948 char formatExpect[512];

	1949 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsC ount, expectOffsets);

	1950 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s ; found rev offset %d where expect %d\n",

	1951 type, locale, escapedText, expectOffsetsCount, formatExp ect, offset, foundOffsets[foundOffsetsCount]);

	1952 break;

	1953 }

	1954 }

	1955 }

	1956 }

	1957

	1958 void RBBITest::TestTailoredBreaks() {

	1959 const TailoredBreakItem * tbItemPtr;

	1960 Locale rootLocale = Locale("root");

	1961 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {

	1962 Locale testLocale = Locale(tbItemPtr->locale);

	1963 BreakIterator * tailoredBrkiter = NULL;

	1964 BreakIterator * rootBrkiter = NULL;

	1965 UErrorCode status = U_ZERO_ERROR;

	1966 switch (tbItemPtr->type) {

	1967 case UBRK_CHARACTER:

	1968 tailoredBrkiter = BreakIterator::createCharacterInstance(testLoc ale, status);

	1969 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);

	1970 break;

	1971 case UBRK_WORD:

	1972 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);

	1973 rootBrkiter = BreakIterator::createWordInstance(rootLocale, stat us);

	1974 break;

	1975 case UBRK_LINE:

	1976 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);

	1977 rootBrkiter = BreakIterator::createLineInstance(rootLocale, stat us);

	1978 break;

	1979 case UBRK_SENTENCE:

	1980 tailoredBrkiter = BreakIterator::createSentenceInstance(testLoca le, status);

	1981 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);

	1982 break;

	1983 default:

	1984 status = U_UNSUPPORTED_ERROR;

	1985 break;

	1986 }

	1987 if (U_FAILURE(status)) {

	1988 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName (status));

	1989 continue;

	1990 }

	1991 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbIte mPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);

	1992 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbIte mPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount);

	1993

	1994 delete rootBrkiter;

	1995 delete tailoredBrkiter;

	1996 }

	1997 }

	1998

	1999

	2000 //------------------------------------------------------------------------------ -

	2001 //

	2002 // TestDictRules create a break iterator from source rules that includes a

	2003 // dictionary range. Regression for bug #7130. Source rules

	2004 // do not declare a break iterator type (word, line, sentence, etc.

	2005 // but the dictionary code, without a type, would loop.

	2006 //

	2007 //------------------------------------------------------------------------------ -

	2008 void RBBITest::TestDictRules() {

	2009 const char *rules = "$dictionary = [a-z]; \n"

	2010 "!!forward; \n"

	2011 "$dictionary $dictionary; \n"

	2012 "!!reverse; \n"

	2013 "$dictionary $dictionary; \n";

	2014 const char *text = "aa";

	2015 UErrorCode status = U_ZERO_ERROR;

	2016 UParseError parseError;

	2017

	2018 RuleBasedBreakIterator bi(rules, parseError, status);

	2019 if (U_SUCCESS(status)) {

	2020 UnicodeString utext = text;

	2021 bi.setText(utext);

	2022 int32_t position;

	2023 int32_t loops;

	2024 for (loops = 0; loops<10; loops++) {

	2025 position = bi.next();

	2026 if (position == RuleBasedBreakIterator::DONE) {

	2027 break;

	2028 }

	2029 }

	2030 TEST_ASSERT(loops == 1);

	2031 } else {

	2032 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(statu s));

	2033 }

	2034 }

	2035

	2036

	2037

	2038 //------------------------------------------------------------------------------ -

	2039 //

	2040 // ReadAndConvertFile Read a text data file, convert it to UChars, and

	2041 // return the datain one big UChar * buffer, which the caller must delete.

	2042 //

	2043 // parameters:

	2044 // fileName: the name of the file, with no directory part. The test data directory

	2045 // is assumed.

	2046 // ulen an out parameter, receives the actual length (in UChars) of the file data.

	2047 // encoding The file encoding. If the file contains a BOM, that wil l override the encoding

	2048 // specified here. The BOM, if it exists, will be stripped from the returned data.

	2049 // Pass NULL for the system default encoding.

	2050 // status

	2051 // returns:

	2052 // The file data, converted to UChar.

	2053 // The caller must delete this when done with

	2054 // delete [] theBuffer;

	2055 //

	2056 // TODO: This is a clone of RegexTest::ReadAndConvertFile.

	2057 // Move this function to some common place.

	2058 //

	2059 //------------------------------------------------------------------------------ --

	2060 UChar RBBITest::ReadAndConvertFile(const char fileName, int &ulen, const char *encoding, UErrorCode &status) {

	2061 UChar *retPtr = NULL;

	2062 char *fileBuf = NULL;

	2063 UConverter* conv = NULL;

	2064 FILE *f = NULL;

	2065

	2066 ulen = 0;

	2067 if (U_FAILURE(status)) {

	2068 return retPtr;

	2069 }

	2070

	2071 //

	2072 // Open the file.

	2073 //

	2074 f = fopen(fileName, "rb");

	2075 if (f == 0) {

	2076 dataerrln("Error opening test data file %s\n", fileName);

	2077 status = U_FILE_ACCESS_ERROR;

	2078 return NULL;

	2079 }

	2080 //

	2081 // Read it in

	2082 //

	2083 int fileSize;

	2084 int amt_read;

	2085

	2086 fseek( f, 0, SEEK_END);

	2087 fileSize = ftell(f);

	2088 fileBuf = new char[fileSize];

	2089 fseek(f, 0, SEEK_SET);

	2090 amt_read = fread(fileBuf, 1, fileSize, f);

	2091 if (amt_read != fileSize \|\| fileSize <= 0) {

	2092 errln("Error reading test data file.");

	2093 goto cleanUpAndReturn;

	2094 }

	2095

	2096 //

	2097 // Look for a Unicode Signature (BOM) on the data just read

	2098 //

	2099 int32_t signatureLength;

	2100 const char * fileBufC;

	2101 const char* bomEncoding;

	2102

	2103 fileBufC = fileBuf;

	2104 bomEncoding = ucnv_detectUnicodeSignature(

	2105 fileBuf, fileSize, &signatureLength, &status);

	2106 if(bomEncoding!=NULL ){

	2107 fileBufC += signatureLength;

	2108 fileSize -= signatureLength;

	2109 encoding = bomEncoding;

	2110 }

	2111

	2112 //

	2113 // Open a converter to take the rule file to UTF-16

	2114 //

	2115 conv = ucnv_open(encoding, &status);

	2116 if (U_FAILURE(status)) {

	2117 goto cleanUpAndReturn;

	2118 }

	2119

	2120 //

	2121 // Convert the rules to UChar.

	2122 // Preflight first to determine required buffer size.

	2123 //

	2124 ulen = ucnv_toUChars(conv,

	2125 NULL, // dest,

	2126 0, // destCapacity,

	2127 fileBufC,

	2128 fileSize,

	2129 &status);

	2130 if (status == U_BUFFER_OVERFLOW_ERROR) {

	2131 // Buffer Overflow is expected from the preflight operation.

	2132 status = U_ZERO_ERROR;

	2133

	2134 retPtr = new UChar[ulen+1];

	2135 ucnv_toUChars(conv,

	2136 retPtr, // dest,

	2137 ulen+1,

	2138 fileBufC,

	2139 fileSize,

	2140 &status);

	2141 }

	2142

	2143 cleanUpAndReturn:

	2144 fclose(f);

	2145 delete []fileBuf;

	2146 ucnv_close(conv);

	2147 if (U_FAILURE(status)) {

	2148 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

	2149 delete retPtr;

	2150 retPtr = 0;

	2151 ulen = 0;

	2152 };

	2153 return retPtr;

	2154 }

	2155

	2156

	2157

	2158 //------------------------------------------------------------------------------ --------------

	2159 //

	2160 // Run tests from each of the boundary test data files distributed by the Unic ode Consortium

	2161 //

	2162 //------------------------------------------------------------------------------ -------------

	2163 void RBBITest::TestUnicodeFiles() {

	2164 RuleBasedBreakIterator *bi;

	2165 UErrorCode status = U_ZERO_ERROR;

	2166

	2167 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Local e::getEnglish(), status);

	2168 TEST_ASSERT_SUCCESS(status);

	2169 if (U_SUCCESS(status)) {

	2170 runUnicodeTestData("GraphemeBreakTest.txt", bi);

	2171 }

	2172 delete bi;

	2173

	2174 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::ge tEnglish(), status);

	2175 TEST_ASSERT_SUCCESS(status);

	2176 if (U_SUCCESS(status)) {

	2177 runUnicodeTestData("WordBreakTest.txt", bi);

	2178 }

	2179 delete bi;

	2180

	2181 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale ::getEnglish(), status);

	2182 TEST_ASSERT_SUCCESS(status);

	2183 if (U_SUCCESS(status)) {

	2184 runUnicodeTestData("SentenceBreakTest.txt", bi);

	2185 }

	2186 delete bi;

	2187

	2188 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::ge tEnglish(), status);

	2189 TEST_ASSERT_SUCCESS(status);

	2190 if (U_SUCCESS(status)) {

	2191 runUnicodeTestData("LineBreakTest.txt", bi);

	2192 }

	2193 delete bi;

	2194 }

	2195

	2196

	2197 //------------------------------------------------------------------------------ --------------

	2198 //

	2199 // Run tests from one of the boundary test data files distributed by the Unico de Consortium

	2200 //

	2201 //------------------------------------------------------------------------------ -------------

	2202 void RBBITest::runUnicodeTestData(const char fileName, RuleBasedBreakIterator bi) {

	2203 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	2204 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bom b.

	2205 UVersionInfo icu4601 = { 4, 6, 0, 1 };

	2206 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601);

	2207 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");

	2208 UErrorCode status = U_ZERO_ERROR;

	2209

	2210 //

	2211 // Open and read the test data file, put it into a UnicodeString.

	2212 //

	2213 const char *testDataDirectory = IntlTest::getSourceTestData(status);

	2214 char testFileName[1000];

	2215 if (testDataDirectory == NULL \|\| strlen(testDataDirectory) >= sizeof(testFil eName)) {

	2216 dataerrln("Can't open test data. Path too long.");

	2217 return;

	2218 }

	2219 strcpy(testFileName, testDataDirectory);

	2220 strcat(testFileName, fileName);

	2221

	2222 logln("Opening data file %s\n", fileName);

	2223

	2224 int len;

	2225 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);

	2226 if (status != U_FILE_ACCESS_ERROR) {

	2227 TEST_ASSERT_SUCCESS(status);

	2228 TEST_ASSERT(testFile != NULL);

	2229 }

	2230 if (U_FAILURE(status) \|\| testFile == NULL) {

	2231 return; /* something went wrong, error already output */

	2232 }

	2233 UnicodeString testFileAsString(TRUE, testFile, len);

	2234

	2235 //

	2236 // Parse the test data file using a regular expression.

	2237 // Each kind of token is recognized in its own capture group; what type of item was scanned

	2238 // is identified by which group had a match.

	2239 //

	2240 // Caputure Group # 1 2 3 4 5

	2241 // Parses this item: divide x hex digits comme nt \n unrecognized \n

	2242 //

	2243 UnicodeString tokenExpr("[ \t](?:(\\u00F7)\|(\\u00D7)\|([0-9a-fA-F]+)\|((?:#. ?)?$.)\|(.*?$.))", -1, US_INV);

	2244 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE \| UREGEX_DOTALL, status);

	2245 UnicodeString testString;

	2246 UVector32 breakPositions(status);

	2247 int lineNumber = 1;

	2248 TEST_ASSERT_SUCCESS(status);

	2249 if (U_FAILURE(status)) {

	2250 return;

	2251 }

	2252

	2253 //

	2254 // Scan through each test case, building up the string to be broken in test String,

	2255 // and the positions that should be boundaries in the breakPositions vecto r.

	2256 //

	2257 int spin = 0;

	2258 while (tokenMatcher.find()) {

	2259 if(tokenMatcher.hitEnd()) {

	2260 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.

	2261 This occurred when the text file was corrupt (wasn't marked as UTF- 8)

	2262 and caused an infinite loop here on EBCDIC systems!

	2263 */

	2264 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt d ata file?\r", fileName, ++spin);

	2265 // return;

	2266 }

	2267 if (tokenMatcher.start(1, status) >= 0) {

	2268 // Scanned a divide sign, indicating a break position in the test da ta.

	2269 if (testString.length()>0) {

	2270 breakPositions.addElement(testString.length(), status);

	2271 }

	2272 }

	2273 else if (tokenMatcher.start(2, status) >= 0) {

	2274 // Scanned an 'x', meaning no break at this position in the test dat a

	2275 // Nothing to be done here.

	2276 }

	2277 else if (tokenMatcher.start(3, status) >= 0) {

	2278 // Scanned Hex digits. Convert them to binary, append to the charac ter data string.

	2279 const UnicodeString &hexNumber = tokenMatcher.group(3, status);

	2280 int length = hexNumber.length();

	2281 if (length<=8) {

	2282 char buf[10];

	2283 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);

	2284 UChar32 c = (UChar32)strtol(buf, NULL, 16);

	2285 if (c<=0x10ffff) {

	2286 testString.append(c);

	2287 } else {

	2288 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",

	2289 fileName, lineNumber);

	2290 }

	2291 } else {

	2292 errln("Syntax Error: Hex Unicode Character value must have no mo re than 8 digits at \'%s\', line %d.\n",

	2293 fileName, lineNumber);

	2294 }

	2295 }

	2296 else if (tokenMatcher.start(4, status) >= 0) {

	2297 // Scanned to end of a line, possibly skipping over a comment in the process.

	2298 // If the line from the file contained test data, run the test now .

	2299 //

	2300 if (testString.length() > 0) {

	2301 // TODO(andy): Remove this time bomb code.

	2302 if (!isLineBreak \|\| isICUVersionPast46 \|\| !(4658 <= lineNumber && lineNumber <= 4758)) {

	2303 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPos itions, bi);

	2304 }

	2305 }

	2306

	2307 // Clear out this test case.

	2308 // The string and breakPositions vector will be refilled as the n ext

	2309 // test case is parsed.

	2310 testString.remove();

	2311 breakPositions.removeAllElements();

	2312 lineNumber++;

	2313 } else {

	2314 // Scanner catchall. Something unrecognized appeared on the line.

	2315 char token[16];

	2316 UnicodeString uToken = tokenMatcher.group(0, status);

	2317 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));

	2318 token[sizeof(token)-1] = 0;

	2319 errln("Syntax error in test data file \'%s\', line %d. Scanning \"% s\"\n", fileName, lineNumber, token);

	2320

	2321 // Clean up, in preparation for continuing with the next line.

	2322 testString.remove();

	2323 breakPositions.removeAllElements();

	2324 lineNumber++;

	2325 }

	2326 TEST_ASSERT_SUCCESS(status);

	2327 if (U_FAILURE(status)) {

	2328 break;

	2329 }

	2330 }

	2331

	2332 delete [] testFile;

	2333 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

	2334 }

	2335

	2336 //------------------------------------------------------------------------------ --------------

	2337 //

	2338 // checkUnicodeTestCase() Run one test case from one of the Unicode Consorti um

	2339 // test data files. Do only a simple, forward-only c heck -

	2340 // this test is mostly to check that ICU and the Unic ode

	2341 // data agree with each other.

	2342 //

	2343 //------------------------------------------------------------------------------ --------------

	2344 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,

	2345 const UnicodeString &testString, // Text data to be b roken

	2346 UVector32 *breakPositions, // Positions where b reaks should be found.

	2347 RuleBasedBreakIterator *bi) {

	2348 int32_t pos; // Break Position in the test string

	2349 int32_t expectedI = 0; // Index of expected break position in the vect or of expected results.

	2350 int32_t expectedPos; // Expected break position (index into test str ing)

	2351

	2352 bi->setText(testString);

	2353 pos = bi->first();

	2354 pos = bi->next();

	2355

	2356 while (pos != BreakIterator::DONE) {

	2357 if (expectedI >= breakPositions->size()) {

	2358 errln("Test file \"%s\", line %d, unexpected break found at position %d",

	2359 testFileName, lineNumber, pos);

	2360 break;

	2361 }

	2362 expectedPos = breakPositions->elementAti(expectedI);

	2363 if (pos < expectedPos) {

	2364 errln("Test file \"%s\", line %d, unexpected break found at position %d",

	2365 testFileName, lineNumber, pos);

	2366 break;

	2367 }

	2368 if (pos > expectedPos) {

	2369 errln("Test file \"%s\", line %d, failed to find expected break at p osition %d",

	2370 testFileName, lineNumber, expectedPos);

	2371 break;

	2372 }

	2373 pos = bi->next();

	2374 expectedI++;

	2375 }

	2376

	2377 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {

	2378 errln("Test file \"%s\", line %d, failed to find expected break at posit ion %d",

	2379 testFileName, lineNumber, breakPositions->elementAti(expectedI));

	2380 }

	2381 }

	2382

	2383

	2384

	2385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	2386 //------------------------------------------------------------------------------ ---------

	2387 //

	2388 // classs RBBIMonkeyKind

	2389 //

	2390 // Monkey Test for Break Iteration

	2391 // Abstract interface class. Concrete derived classes independently

	2392 // implement the break rules for different iterator types.

	2393 //

	2394 // The Monkey Test itself uses doesn't know which type of break iterator it is

	2395 // testing, but works purely in terms of the interface defined here.

	2396 //

	2397 //------------------------------------------------------------------------------ ---------

	2398 class RBBIMonkeyKind {

	2399 public:

	2400 // Return a UVector of UnicodeSets, representing the character classes used

	2401 // for this type of iterator.

	2402 virtual UVector *charClasses() = 0;

	2403

	2404 // Set the test text on which subsequent calls to next() will operate

	2405 virtual void setText(const UnicodeString &s) = 0;

	2406

	2407 // Find the next break postion, starting from the prev break position, or fr om zero.

	2408 // Return -1 after reaching end of string.

	2409 virtual int32_t next(int32_t i) = 0;

	2410

	2411 virtual ~RBBIMonkeyKind();

	2412 UErrorCode deferredStatus;

	2413

	2414

	2415 protected:

	2416 RBBIMonkeyKind();

	2417

	2418 private:

	2419 };

	2420

	2421 RBBIMonkeyKind::RBBIMonkeyKind() {

	2422 deferredStatus = U_ZERO_ERROR;

	2423 }

	2424

	2425 RBBIMonkeyKind::~RBBIMonkeyKind() {

	2426 }

	2427

	2428

	//----------------------------------------------------------------------------------------
	//
	//   Random Numbers.  Similar to standard lib rand() and srand().
	//                    The library versions are not used, in order to
	//                      1.  Get the same results on all platforms.
	//                      2.  Have access to the current seed, to more easily
	//                          reproduce failures.
	//
	//---------------------------------------------------------------------------------------
	static uint32_t m_seed = 1;

	// Advance the generator one step and return a value in the range 0..32767,
	// taken from bits 16..30 of the new seed.
	static uint32_t m_rand()
	{
	    const uint32_t nextSeed = m_seed * 1103515245 + 12345;
	    m_seed = nextSeed;
	    return (nextSeed >> 16) & 0x7fff;
	}

	2444

	2445

	2446 //------------------------------------------------------------------------------ ------------

	2447 //

	2448 // class RBBICharMonkey Character (Grapheme Cluster) specific implementat ion

	2449 // of RBBIMonkeyKind.

	2450 //

	2451 //------------------------------------------------------------------------------ ------------

	2452 class RBBICharMonkey: public RBBIMonkeyKind {

	2453 public:

	2454 RBBICharMonkey();

	2455 virtual ~RBBICharMonkey();

	2456 virtual UVector *charClasses();

	2457 virtual void setText(const UnicodeString &s);

	2458 virtual int32_t next(int32_t i);

	2459 private:

	2460 UVector *fSets;

	2461

	2462 UnicodeSet *fCRLFSet;

	2463 UnicodeSet *fControlSet;

	2464 UnicodeSet *fExtendSet;

	2465 UnicodeSet *fPrependSet;

	2466 UnicodeSet *fSpacingSet;

	2467 UnicodeSet *fLSet;

	2468 UnicodeSet *fVSet;

	2469 UnicodeSet *fTSet;

	2470 UnicodeSet *fLVSet;

	2471 UnicodeSet *fLVTSet;

	2472 UnicodeSet *fHangulSet;

	2473 UnicodeSet *fAnySet;

	2474

	2475 const UnicodeString *fText;

	2476 };

	2477

	2478

	2479 RBBICharMonkey::RBBICharMonkey() {

	2480 UErrorCode status = U_ZERO_ERROR;

	2481

	2482 fText = NULL;

	2483

	2484 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);

	2485 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Control}]"), status);

	2486 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Extend}]"), status);

	2487 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Prepend}]"), status);

	2488 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = SpacingMark}]"), status);

	2489 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = L}]"), status);

	2490 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = V}]"), status);

	2491 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = T}]"), status);

	2492 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = LV}]"), status);

	2493 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = LVT}]"), status);

	2494 fHangulSet = new UnicodeSet();

	2495 fHangulSet->addAll(*fLSet);

	2496 fHangulSet->addAll(*fVSet);

	2497 fHangulSet->addAll(*fTSet);

	2498 fHangulSet->addAll(*fLVSet);

	2499 fHangulSet->addAll(*fLVTSet);

	2500 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);

	2501

	2502 fSets = new UVector(status);

	2503 fSets->addElement(fCRLFSet, status);

	2504 fSets->addElement(fControlSet, status);

	2505 fSets->addElement(fExtendSet, status);

	2506 fSets->addElement(fPrependSet, status);

	2507 fSets->addElement(fSpacingSet, status);

	2508 fSets->addElement(fHangulSet, status);

	2509 fSets->addElement(fAnySet, status);

	2510 if (U_FAILURE(status)) {

	2511 deferredStatus = status;

	2512 }

	2513 }

	2514

	2515

	2516 void RBBICharMonkey::setText(const UnicodeString &s) {

	2517 fText = &s;

	2518 }

	2519

	2520

	2521

	2522 int32_t RBBICharMonkey::next(int32_t prevPos) {

	2523 int p0, p1, p2, p3; // Indices of the significant code points around t he

	2524 // break position being tested. The candidate b reak

	2525 // location is before p2.

	2526

	2527 int breakPos = -1;

	2528

	2529 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

	2530

	2531 if (U_FAILURE(deferredStatus)) {

	2532 return -1;

	2533 }

	2534

	2535 // Previous break at end of string. return DONE.

	2536 if (prevPos >= fText->length()) {

	2537 return -1;

	2538 }

	2539 p0 = p1 = p2 = p3 = prevPos;

	2540 c3 = fText->char32At(prevPos);

	2541 c0 = c1 = c2 = 0;

	2542

	2543 // Loop runs once per "significant" character position in the input text.

	2544 for (;;) {

	2545 // Move all of the positions forward in the input string.

	2546 p0 = p1; c0 = c1;

	2547 p1 = p2; c1 = c2;

	2548 p2 = p3; c2 = c3;

	2549

	2550 // Advancd p3 by one codepoint

	2551 p3 = fText->moveIndex32(p3, 1);

	2552 c3 = fText->char32At(p3);

	2553

	2554 if (p1 == p2) {

	2555 // Still warming up the loop. (won't work with zero length strings, but we don't care)

	2556 continue;

	2557 }

	2558 if (p2 == fText->length()) {

	2559 // Reached end of string. Always a break position.

	2560 break;

	2561 }

	2562

	2563 // Rule GB3 CR x LF

	2564 // No Extend or Format characters may appear between the CR and LF,

	2565 // which requires the additional check for p2 immediately following p1.

	2566 //

	2567 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {

	2568 continue;

	2569 }

	2570

	2571 // Rule (GB4). ( Control \| CR \| LF ) <break>

	2572 if (fControlSet->contains(c1) \|\|

	2573 c1 == 0x0D \|\|

	2574 c1 == 0x0A) {

	2575 break;

	2576 }

	2577

	2578 // Rule (GB5) <break> ( Control \| CR \| LF )

	2579 //

	2580 if (fControlSet->contains(c2) \|\|

	2581 c2 == 0x0D \|\|

	2582 c2 == 0x0A) {

	2583 break;

	2584 }

	2585

	2586

	2587 // Rule (GB6) L x ( L \| V \| LV \| LVT )

	2588 if (fLSet->contains(c1) &&

	2589 (fLSet->contains(c2) \|\|

	2590 fVSet->contains(c2) \|\|

	2591 fLVSet->contains(c2) \|\|

	2592 fLVTSet->contains(c2))) {

	2593 continue;

	2594 }

	2595

	2596 // Rule (GB7) ( LV \| V ) x ( V \| T )

	2597 if ((fLVSet->contains(c1) \|\| fVSet->contains(c1)) &&

	2598 (fVSet->contains(c2) \|\| fTSet->contains(c2))) {

	2599 continue;

	2600 }

	2601

	2602 // Rule (GB8) ( LVT \| T) x T

	2603 if ((fLVTSet->contains(c1) \|\| fTSet->contains(c1)) &&

	2604 fTSet->contains(c2)) {

	2605 continue;

	2606 }

	2607

	2608 // Rule (GB9) Numeric x ALetter

	2609 if (fExtendSet->contains(c2)) {

	2610 continue;

	2611 }

	2612

	2613 // Rule (GB9a) x SpacingMark

	2614 if (fSpacingSet->contains(c2)) {

	2615 continue;

	2616 }

	2617

	2618 // Rule (GB9b) Prepend x

	2619 if (fPrependSet->contains(c1)) {

	2620 continue;

	2621 }

	2622

	2623 // Rule (GB10) Any <break> Any

	2624 break;

	2625 }

	2626

	2627 breakPos = p2;

	2628 return breakPos;

	2629 }

	2630

	2631

	2632

	2633 UVector *RBBICharMonkey::charClasses() {

	2634 return fSets;

	2635 }

	2636

	2637

	2638 RBBICharMonkey::~RBBICharMonkey() {

	2639 delete fSets;

	2640 delete fCRLFSet;

	2641 delete fControlSet;

	2642 delete fExtendSet;

	2643 delete fPrependSet;

	2644 delete fSpacingSet;

	2645 delete fLSet;

	2646 delete fVSet;

	2647 delete fTSet;

	2648 delete fLVSet;

	2649 delete fLVTSet;

	2650 delete fHangulSet;

	2651 delete fAnySet;

	2652 }

	2653

	2654 //------------------------------------------------------------------------------ ------------

	2655 //

	2656 // class RBBIWordMonkey Word Break specific implementation

	2657 // of RBBIMonkeyKind.

	2658 //

	2659 //------------------------------------------------------------------------------ ------------

	2660 class RBBIWordMonkey: public RBBIMonkeyKind {

	2661 public:

	2662 RBBIWordMonkey();

	2663 virtual ~RBBIWordMonkey();

	2664 virtual UVector *charClasses();

	2665 virtual void setText(const UnicodeString &s);

	2666 virtual int32_t next(int32_t i);

	2667 private:

	2668 UVector *fSets;

	2669

	2670 UnicodeSet *fCRSet;

	2671 UnicodeSet *fLFSet;

	2672 UnicodeSet *fNewlineSet;

	2673 UnicodeSet *fKatakanaSet;

	2674 UnicodeSet *fALetterSet;

	2675 UnicodeSet *fMidNumLetSet;

	2676 UnicodeSet *fMidLetterSet;

	2677 UnicodeSet *fMidNumSet;

	2678 UnicodeSet *fNumericSet;

	2679 UnicodeSet *fFormatSet;

	2680 UnicodeSet *fOtherSet;

	2681 UnicodeSet *fExtendSet;

	2682 UnicodeSet *fExtendNumLetSet;

	2683

	2684 RegexMatcher *fMatcher;

	2685

	2686 const UnicodeString *fText;

	2687 };

	2688

	2689

	2690 RBBIWordMonkey::RBBIWordMonkey()

	2691 {

	2692 UErrorCode status = U_ZERO_ERROR;

	2693

	2694 fSets = new UVector(status);

	2695

	2696 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C R}]"), status);

	2697 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L F}]"), status);

	2698 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N ewline}]"), status);

	2699 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = A Letter}]"), status);

	2700 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = K atakana}]"), status);

	2701 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNumLet}]"), status);

	2702 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idLetter}]"), status);

	2703 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNum}]"), status);

	2704 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N umeric}]"), status);

	2705 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = F ormat}]"), status);

	2706 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtendNumLet}]"), status);

	2707 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtend}]"), status);

	2708

	2709 fOtherSet = new UnicodeSet();

	2710 if(U_FAILURE(status)) {

	2711 deferredStatus = status;

	2712 return;

	2713 }

	2714

	2715 fOtherSet->complement();

	2716 fOtherSet->removeAll(*fCRSet);

	2717 fOtherSet->removeAll(*fLFSet);

	2718 fOtherSet->removeAll(*fNewlineSet);

	2719 fOtherSet->removeAll(*fKatakanaSet);

	2720 fOtherSet->removeAll(*fALetterSet);

	2721 fOtherSet->removeAll(*fMidLetterSet);

	2722 fOtherSet->removeAll(*fMidNumSet);

	2723 fOtherSet->removeAll(*fNumericSet);

	2724 fOtherSet->removeAll(*fExtendNumLetSet);

	2725 fOtherSet->removeAll(*fFormatSet);

	2726 fOtherSet->removeAll(*fExtendSet);

	2727 // Inhibit dictionary characters from being tested at all.

	2728 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp lex_Context}]"), status));

	2729

	2730 fSets->addElement(fCRSet, status);

	2731 fSets->addElement(fLFSet, status);

	2732 fSets->addElement(fNewlineSet, status);

	2733 fSets->addElement(fALetterSet, status);

	2734 fSets->addElement(fKatakanaSet, status);

	2735 fSets->addElement(fMidLetterSet, status);

	2736 fSets->addElement(fMidNumLetSet, status);

	2737 fSets->addElement(fMidNumSet, status);

	2738 fSets->addElement(fNumericSet, status);

	2739 fSets->addElement(fFormatSet, status);

	2740 fSets->addElement(fExtendSet, status);

	2741 fSets->addElement(fOtherSet, status);

	2742 fSets->addElement(fExtendNumLetSet, status);

	2743

	2744 if (U_FAILURE(status)) {

	2745 deferredStatus = status;

	2746 }

	2747 }

	2748

	2749 void RBBIWordMonkey::setText(const UnicodeString &s) {

	2750 fText = &s;

	2751 }

	2752

	2753

	2754 int32_t RBBIWordMonkey::next(int32_t prevPos) {

	2755 int p0, p1, p2, p3; // Indices of the significant code points around t he

	2756 // break position being tested. The candidate b reak

	2757 // location is before p2.

	2758

	2759 int breakPos = -1;

	2760

	2761 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

	2762

	2763 if (U_FAILURE(deferredStatus)) {

	2764 return -1;

	2765 }

	2766

	2767 // Prev break at end of string. return DONE.

	2768 if (prevPos >= fText->length()) {

	2769 return -1;

	2770 }

	2771 p0 = p1 = p2 = p3 = prevPos;

	2772 c3 = fText->char32At(prevPos);

	2773 c0 = c1 = c2 = 0;

	2774

	2775 // Loop runs once per "significant" character position in the input text.

	2776 for (;;) {

	2777 // Move all of the positions forward in the input string.

	2778 p0 = p1; c0 = c1;

	2779 p1 = p2; c1 = c2;

	2780 p2 = p3; c2 = c3;

	2781

	2782 // Advancd p3 by X(Extend \| Format)* Rule 4

	2783 // But do not advance over Extend & Format following a new line. (Uni code 5.1 change)

	2784 do {

	2785 p3 = fText->moveIndex32(p3, 1);

	2786 c3 = fText->char32At(p3);

	2787 if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->con tains(c2)) {

	2788 break;

	2789 };

	2790 }

	2791 while (fFormatSet->contains(c3) \|\| fExtendSet->contains(c3));

	2792

	2793

	2794 if (p1 == p2) {

	2795 // Still warming up the loop. (won't work with zero length strings, but we don't care)

	2796 continue;

	2797 }

	2798 if (p2 == fText->length()) {

	2799 // Reached end of string. Always a break position.

	2800 break;

	2801 }

	2802

	2803 // Rule (3) CR x LF

	2804 // No Extend or Format characters may appear between the CR and LF,

	2805 // which requires the additional check for p2 immediately following p1.

	2806 //

	2807 if (c1==0x0D && c2==0x0A) {

	2808 continue;

	2809 }

	2810

	2811 // Rule (3a) Break before and after newlines (including CR and LF)

	2812 //

	2813 if (fCRSet->contains(c1) \|\| fLFSet->contains(c1) \|\| fNewlineSet->contain s(c1)) {

	2814 break;

	2815 };

	2816 if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->contain s(c2)) {

	2817 break;

	2818 };

	2819

	2820 // Rule (5). ALetter x ALetter

	2821 if (fALetterSet->contains(c1) &&

	2822 fALetterSet->contains(c2)) {

	2823 continue;

	2824 }

	2825

	2826 // Rule (6) ALetter x (MidLetter \| MidNumLet) ALetter

	2827 //

	2828 if ( fALetterSet->contains(c1) &&

	2829 (fMidLetterSet->contains(c2) \|\| fMidNumLetSet->contains(c2)) &&

	2830 fALetterSet->contains(c3)) {

	2831 continue;

	2832 }

	2833

	2834

	2835 // Rule (7) ALetter (MidLetter \| MidNumLet) x ALetter

	2836 if (fALetterSet->contains(c0) &&

	2837 (fMidLetterSet->contains(c1) \|\| fMidNumLetSet->contains(c1)) &&

	2838 fALetterSet->contains(c2)) {

	2839 continue;

	2840 }

	2841

	2842 // Rule (8) Numeric x Numeric

	2843 if (fNumericSet->contains(c1) &&

	2844 fNumericSet->contains(c2)) {

	2845 continue;

	2846 }

	2847

	2848 // Rule (9) ALetter x Numeric

	2849 if (fALetterSet->contains(c1) &&

	2850 fNumericSet->contains(c2)) {

	2851 continue;

	2852 }

	2853

	2854 // Rule (10) Numeric x ALetter

	2855 if (fNumericSet->contains(c1) &&

	2856 fALetterSet->contains(c2)) {

	2857 continue;

	2858 }

	2859

	2860 // Rule (11) Numeric (MidNum \| MidNumLet) x Numeric

	2861 if (fNumericSet->contains(c0) &&

	2862 (fMidNumSet->contains(c1) \|\| fMidNumLetSet->contains(c1)) &&

	2863 fNumericSet->contains(c2)) {

	2864 continue;

	2865 }

	2866

	2867 // Rule (12) Numeric x (MidNum \| MidNumLet) Numeric

	2868 if (fNumericSet->contains(c1) &&

	2869 (fMidNumSet->contains(c2) \|\| fMidNumLetSet->contains(c2)) &&

	2870 fNumericSet->contains(c3)) {

	2871 continue;

	2872 }

	2873

	2874 // Rule (13) Katakana x Katakana

	2875 if (fKatakanaSet->contains(c1) &&

	2876 fKatakanaSet->contains(c2)) {

	2877 continue;

	2878 }

	2879

	2880 // Rule 13a

	2881 if ((fALetterSet->contains(c1) \|\| fNumericSet->contains(c1) \|\|

	2882 fKatakanaSet->contains(c1) \|\| fExtendNumLetSet->contains(c1)) &&

	2883 fExtendNumLetSet->contains(c2)) {

	2884 continue;

	2885 }

	2886

	2887 // Rule 13b

	2888 if (fExtendNumLetSet->contains(c1) &&

	2889 (fALetterSet->contains(c2) \|\| fNumericSet->contains(c2) \|\|

	2890 fKatakanaSet->contains(c2))) {

	2891 continue;

	2892 }

	2893

	2894 // Rule 14. Break found here.

	2895 break;

	2896 }

	2897

	2898 breakPos = p2;

	2899 return breakPos;

	2900 }

	2901

	2902

	2903 UVector *RBBIWordMonkey::charClasses() {

	2904 return fSets;

	2905 }

	2906

	2907

	2908 RBBIWordMonkey::~RBBIWordMonkey() {

	2909 delete fSets;

	2910 delete fCRSet;

	2911 delete fLFSet;

	2912 delete fNewlineSet;

	2913 delete fKatakanaSet;

	2914 delete fALetterSet;

	2915 delete fMidNumLetSet;

	2916 delete fMidLetterSet;

	2917 delete fMidNumSet;

	2918 delete fNumericSet;

	2919 delete fFormatSet;

	2920 delete fExtendSet;

	2921 delete fExtendNumLetSet;

	2922 delete fOtherSet;

	2923 }

	2924

	2925

	2926

	2927

	2928 //------------------------------------------------------------------------------ ------------

	2929 //

	2930 // class RBBISentMonkey Sentence Break specific implementation

	2931 // of RBBIMonkeyKind.

	2932 //

	2933 //------------------------------------------------------------------------------ ------------

	2934 class RBBISentMonkey: public RBBIMonkeyKind {

	2935 public:

	2936 RBBISentMonkey();

	2937 virtual ~RBBISentMonkey();

	2938 virtual UVector *charClasses();

	2939 virtual void setText(const UnicodeString &s);

	2940 virtual int32_t next(int32_t i);

	2941 private:

	2942 int moveBack(int posFrom);

	2943 int moveForward(int posFrom);

	2944 UChar32 cAt(int pos);

	2945

	2946 UVector *fSets;

	2947

	2948 UnicodeSet *fSepSet;

	2949 UnicodeSet *fFormatSet;

	2950 UnicodeSet *fSpSet;

	2951 UnicodeSet *fLowerSet;

	2952 UnicodeSet *fUpperSet;

	2953 UnicodeSet *fOLetterSet;

	2954 UnicodeSet *fNumericSet;

	2955 UnicodeSet *fATermSet;

	2956 UnicodeSet *fSContinueSet;

	2957 UnicodeSet *fSTermSet;

	2958 UnicodeSet *fCloseSet;

	2959 UnicodeSet *fOtherSet;

	2960 UnicodeSet *fExtendSet;

	2961

	2962 const UnicodeString *fText;

	2963

	2964 };

	2965

	2966 RBBISentMonkey::RBBISentMonkey()

	2967 {

	2968 UErrorCode status = U_ZERO_ERROR;

	2969

	2970 fSets = new UVector(status);

	2971

	2972 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator

	2973 // set and made into character classes of their own. For the monkey impl,

	2974 // they remain in SEP, since Sep always appears with C R and LF in the rules.

	2975 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);

	2976 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);

	2977 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);

	2978 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);

	2979 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);

	2980 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);

	2981 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);

	2982 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);

	2983 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);

	2984 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);

	2985 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);

	2986 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);

	2987 fOtherSet = new UnicodeSet();

	2988

	2989 if(U_FAILURE(status)) {

	2990 deferredStatus = status;

	2991 return;

	2992 }

	2993

	2994 fOtherSet->complement();

	2995 fOtherSet->removeAll(*fSepSet);

	2996 fOtherSet->removeAll(*fFormatSet);

	2997 fOtherSet->removeAll(*fSpSet);

	2998 fOtherSet->removeAll(*fLowerSet);

	2999 fOtherSet->removeAll(*fUpperSet);

	3000 fOtherSet->removeAll(*fOLetterSet);

	3001 fOtherSet->removeAll(*fNumericSet);

	3002 fOtherSet->removeAll(*fATermSet);

	3003 fOtherSet->removeAll(*fSContinueSet);

	3004 fOtherSet->removeAll(*fSTermSet);

	3005 fOtherSet->removeAll(*fCloseSet);

	3006 fOtherSet->removeAll(*fExtendSet);

	3007

	3008 fSets->addElement(fSepSet, status);

	3009 fSets->addElement(fFormatSet, status);

	3010 fSets->addElement(fSpSet, status);

	3011 fSets->addElement(fLowerSet, status);

	3012 fSets->addElement(fUpperSet, status);

	3013 fSets->addElement(fOLetterSet, status);

	3014 fSets->addElement(fNumericSet, status);

	3015 fSets->addElement(fATermSet, status);

	3016 fSets->addElement(fSContinueSet, status);

	3017 fSets->addElement(fSTermSet, status);

	3018 fSets->addElement(fCloseSet, status);

	3019 fSets->addElement(fOtherSet, status);

	3020 fSets->addElement(fExtendSet, status);

	3021

	3022 if (U_FAILURE(status)) {

	3023 deferredStatus = status;

	3024 }

	3025 }

	3026

	3027

	3028

	3029 void RBBISentMonkey::setText(const UnicodeString &s) {

	3030 fText = &s;

	3031 }

	3032

	3033 UVector *RBBISentMonkey::charClasses() {

	3034 return fSets;

	3035 }

	3036

	3037

	3038 // moveBack() Find the "significant" code point preceding the index i.

	3039 // Skips over ($Extend \| $Format)* .

	3040 //

	3041 int RBBISentMonkey::moveBack(int i) {

	3042 if (i <= 0) {

	3043 return -1;

	3044 }

	3045 UChar32 c;

	3046 int32_t j = i;

	3047 do {

	3048 j = fText->moveIndex32(j, -1);

	3049 c = fText->char32At(j);

	3050 }

	3051 while (j>0 &&(fFormatSet->contains(c) \|\| fExtendSet->contains(c)));

	3052 return j;

	3053

	3054 }

	3055

	3056

	3057 int RBBISentMonkey::moveForward(int i) {

	3058 if (i>=fText->length()) {

	3059 return fText->length();

	3060 }

	3061 UChar32 c;

	3062 int32_t j = i;

	3063 do {

	3064 j = fText->moveIndex32(j, 1);

	3065 c = cAt(j);

	3066 }

	3067 while (fFormatSet->contains(c) \|\| fExtendSet->contains(c));

	3068 return j;

	3069 }

	3070

	3071 UChar32 RBBISentMonkey::cAt(int pos) {

	3072 if (pos<0 \|\| pos>=fText->length()) {

	3073 return -1;

	3074 } else {

	3075 return fText->char32At(pos);

	3076 }

	3077 }

	3078

	3079 int32_t RBBISentMonkey::next(int32_t prevPos) {

	3080 int p0, p1, p2, p3; // Indices of the significant code points around t he

	3081 // break position being tested. The candidate b reak

	3082 // location is before p2.

	3083

	3084 int breakPos = -1;

	3085

	3086 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

	3087 UChar32 c;

	3088

	3089 if (U_FAILURE(deferredStatus)) {

	3090 return -1;

	3091 }

	3092

	3093 // Prev break at end of string. return DONE.

	3094 if (prevPos >= fText->length()) {

	3095 return -1;

	3096 }

	3097 p0 = p1 = p2 = p3 = prevPos;

	3098 c3 = fText->char32At(prevPos);

	3099 c0 = c1 = c2 = 0;

	3100

	3101 // Loop runs once per "significant" character position in the input text.

	3102 for (;;) {

	3103 // Move all of the positions forward in the input string.

	3104 p0 = p1; c0 = c1;

	3105 p1 = p2; c1 = c2;

	3106 p2 = p3; c2 = c3;

	3107

	3108 // Advancd p3 by X(Extend \| Format)* Rule 4

	3109 p3 = moveForward(p3);

	3110 c3 = cAt(p3);

	3111

	3112 // Rule (3) CR x LF

	3113 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {

	3114 continue;

	3115 }

	3116

	3117 // Rule (4). Sep <break>

	3118 if (fSepSet->contains(c1)) {

	3119 p2 = p1+1; // Separators don't combine with Extend or Format.

	3120 break;

	3121 }

	3122

	3123 if (p2 >= fText->length()) {

	3124 // Reached end of string. Always a break position.

	3125 break;

	3126 }

	3127

	3128 if (p2 == prevPos) {

	3129 // Still warming up the loop. (won't work with zero length strings, but we don't care)

	3130 continue;

	3131 }

	3132

	3133 // Rule (6). ATerm x Numeric

	3134 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {

	3135 continue;

	3136 }

	3137

	3138 // Rule (7). Upper ATerm x Uppper

	3139 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->con tains(c2)) {

	3140 continue;

	3141 }

	3142

	3143 // Rule (8) ATerm Close* Sp* x (not (OLettter \| Upper \| Lower \| Sep \| STerm \| ATerm))* Lower

	3144 // Note: STerm \| ATerm are added to the negated part of the e xpression by a

	3145 // note to the Unicode 5.0 documents.

	3146 int p8 = p1;

	3147 while (fSpSet->contains(cAt(p8))) {

	3148 p8 = moveBack(p8);

	3149 }

	3150 while (fCloseSet->contains(cAt(p8))) {

	3151 p8 = moveBack(p8);

	3152 }

	3153 if (fATermSet->contains(cAt(p8))) {

	3154 p8=p2;

	3155 for (;;) {

	3156 c = cAt(p8);

	3157 if (c==-1 \|\| fOLetterSet->contains(c) \|\| fUpperSet->contains(c) \|\|

	3158 fLowerSet->contains(c) \|\| fSepSet->contains(c) \|\|

	3159 fATermSet->contains(c) \|\| fSTermSet->contains(c)) {

	3160 break;

	3161 }

	3162 p8 = moveForward(p8);

	3163 }

	3164 if (fLowerSet->contains(cAt(p8))) {

	3165 continue;

	3166 }

	3167 }

	3168

	3169 // Rule 8a (STerm \| ATerm) Close* Sp* x (SContinue \| STerm \| ATerm);

	3170 if (fSContinueSet->contains(c2) \|\| fSTermSet->contains(c2) \|\| fATermSet- >contains(c2)) {

	3171 p8 = p1;

	3172 while (fSpSet->contains(cAt(p8))) {

	3173 p8 = moveBack(p8);

	3174 }

	3175 while (fCloseSet->contains(cAt(p8))) {

	3176 p8 = moveBack(p8);

	3177 }

	3178 c = cAt(p8);

	3179 if (fSTermSet->contains(c) \|\| fATermSet->contains(c)) {

	3180 continue;

	3181 }

	3182 }

	3183

	3184 // Rule (9) (STerm \| ATerm) Close* x (Close \| Sp \| Sep \| CR \| LF)

	3185 int p9 = p1;

	3186 while (fCloseSet->contains(cAt(p9))) {

	3187 p9 = moveBack(p9);

	3188 }

	3189 c = cAt(p9);

	3190 if ((fSTermSet->contains(c) \|\| fATermSet->contains(c))) {

	3191 if (fCloseSet->contains(c2) \|\| fSpSet->contains(c2) \|\| fSepSet->cont ains(c2)) {

	3192 continue;

	3193 }

	3194 }

	3195

	3196 // Rule (10) (Sterm \| ATerm) Close* Sp* x (Sp \| Sep \| CR \| LF)

	3197 int p10 = p1;

	3198 while (fSpSet->contains(cAt(p10))) {

	3199 p10 = moveBack(p10);

	3200 }

	3201 while (fCloseSet->contains(cAt(p10))) {

	3202 p10 = moveBack(p10);

	3203 }

	3204 if (fSTermSet->contains(cAt(p10)) \|\| fATermSet->contains(cAt(p10))) {

	3205 if (fSpSet->contains(c2) \|\| fSepSet->contains(c2)) {

	3206 continue;

	3207 }

	3208 }

	3209

	3210 // Rule (11) (STerm \| ATerm) Close* Sp* (Sep \| CR \| LF)? <break>

	3211 int p11 = p1;

	3212 if (fSepSet->contains(cAt(p11))) {

	3213 p11 = moveBack(p11);

	3214 }

	3215 while (fSpSet->contains(cAt(p11))) {

	3216 p11 = moveBack(p11);

	3217 }

	3218 while (fCloseSet->contains(cAt(p11))) {

	3219 p11 = moveBack(p11);

	3220 }

	3221 if (fSTermSet->contains(cAt(p11)) \|\| fATermSet->contains(cAt(p11))) {

	3222 break;

	3223 }

	3224

	3225 // Rule (12) Any x Any

	3226 continue;

	3227 }

	3228 breakPos = p2;

	3229 return breakPos;

	3230 }

	3231

	3232 RBBISentMonkey::~RBBISentMonkey() {

	3233 delete fSets;

	3234 delete fSepSet;

	3235 delete fFormatSet;

	3236 delete fSpSet;

	3237 delete fLowerSet;

	3238 delete fUpperSet;

	3239 delete fOLetterSet;

	3240 delete fNumericSet;

	3241 delete fATermSet;

	3242 delete fSContinueSet;

	3243 delete fSTermSet;

	3244 delete fCloseSet;

	3245 delete fOtherSet;

	3246 delete fExtendSet;

	3247 }

	3248

	3249

	3250

	3251 //------------------------------------------------------------------------------ -------------

	3252 //

	3253 // RBBILineMonkey

	3254 //

	3255 //------------------------------------------------------------------------------ -------------

	3256

	3257 class RBBILineMonkey: public RBBIMonkeyKind {

	3258 public:

	3259 RBBILineMonkey();

	3260 virtual ~RBBILineMonkey();

	3261 virtual UVector *charClasses();

	3262 virtual void setText(const UnicodeString &s);

	3263 virtual int32_t next(int32_t i);

	3264 virtual void rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPo s, UChar32 *nextChar);

	3265 private:

	3266 UVector *fSets;

	3267

	3268 UnicodeSet *fBK;

	3269 UnicodeSet *fCR;

	3270 UnicodeSet *fLF;

	3271 UnicodeSet *fCM;

	3272 UnicodeSet *fNL;

	3273 UnicodeSet *fSG;

	3274 UnicodeSet *fWJ;

	3275 UnicodeSet *fZW;

	3276 UnicodeSet *fGL;

	3277 UnicodeSet *fCB;

	3278 UnicodeSet *fSP;

	3279 UnicodeSet *fB2;

	3280 UnicodeSet *fBA;

	3281 UnicodeSet *fBB;

	3282 UnicodeSet *fHY;

	3283 UnicodeSet *fH2;

	3284 UnicodeSet *fH3;

	3285 UnicodeSet *fCL;

	3286 UnicodeSet *fCP;

	3287 UnicodeSet *fEX;

	3288 UnicodeSet *fIN;

	3289 UnicodeSet *fJL;

	3290 UnicodeSet *fJV;

	3291 UnicodeSet *fJT;

	3292 UnicodeSet *fNS;

	3293 UnicodeSet *fOP;

	3294 UnicodeSet *fQU;

	3295 UnicodeSet *fIS;

	3296 UnicodeSet *fNU;

	3297 UnicodeSet *fPO;

	3298 UnicodeSet *fPR;

	3299 UnicodeSet *fSY;

	3300 UnicodeSet *fAI;

	3301 UnicodeSet *fAL;

	3302 UnicodeSet *fID;

	3303 UnicodeSet *fSA;

	3304 UnicodeSet *fXX;

	3305

	3306 BreakIterator *fCharBI;

	3307

	3308 const UnicodeString *fText;

	3309 int32_t *fOrigPositions;

	3310

	3311 RegexMatcher *fNumberMatcher;

	3312 RegexMatcher *fLB11Matcher;

	3313 };

	3314

	3315

	3316 RBBILineMonkey::RBBILineMonkey()

	3317 {

	3318 UErrorCode status = U_ZERO_ERROR;

	3319

	3320 fSets = new UVector(status);

	3321

	3322 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), statu s);

	3323 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), statu s);

	3324 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), statu s);

	3325 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), statu s);

	3326 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), statu s);

	3327 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), statu s);

	3328 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), statu s);

	3329 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), statu s);

	3330 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), statu s);

	3331 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), statu s);

	3332 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), statu s);

	3333 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), statu s);

	3334 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), statu s);

	3335 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), statu s);

	3336 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), statu s);

	3337 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), statu s);

	3338 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), statu s);

	3339 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), statu s);

	3340 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), statu s);

	3341 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), statu s);

	3342 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), statu s);

	3343 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), statu s);

	3344 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), statu s);

	3345 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), statu s);

	3346 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), statu s);

	3347 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), statu s);

	3348 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), statu s);

	3349 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), statu s);

	3350 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), statu s);

	3351 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), statu s);

	3352 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), statu s);

	3353 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), statu s);

	3354 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), statu s);

	3355 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), statu s);

	3356 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), statu s);

	3357 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);

	3358 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), statu s);

	3359

	3360 if (U_FAILURE(status)) {

	3361 deferredStatus = status;

	3362 fCharBI = NULL;

	3363 fNumberMatcher = NULL;

	3364 return;

	3365 }

	3366

	3367 fAL->addAll(*fXX); // Default behavior for XX is identical to AL

	3368 fAL->addAll(*fAI); // Default behavior for AI is identical to AL

	3369 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to A L

	3370 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.

	3371

	3372 fSets->addElement(fBK, status);

	3373 fSets->addElement(fCR, status);

	3374 fSets->addElement(fLF, status);

	3375 fSets->addElement(fCM, status);

	3376 fSets->addElement(fNL, status);

	3377 fSets->addElement(fWJ, status);

	3378 fSets->addElement(fZW, status);

	3379 fSets->addElement(fGL, status);

	3380 fSets->addElement(fCB, status);

	3381 fSets->addElement(fSP, status);

	3382 fSets->addElement(fB2, status);

	3383 fSets->addElement(fBA, status);

	3384 fSets->addElement(fBB, status);

	3385 fSets->addElement(fHY, status);

	3386 fSets->addElement(fH2, status);

	3387 fSets->addElement(fH3, status);

	3388 fSets->addElement(fCL, status);

	3389 fSets->addElement(fCP, status);

	3390 fSets->addElement(fEX, status);

	3391 fSets->addElement(fIN, status);

	3392 fSets->addElement(fJL, status);

	3393 fSets->addElement(fJT, status);

	3394 fSets->addElement(fJV, status);

	3395 fSets->addElement(fNS, status);

	3396 fSets->addElement(fOP, status);

	3397 fSets->addElement(fQU, status);

	3398 fSets->addElement(fIS, status);

	3399 fSets->addElement(fNU, status);

	3400 fSets->addElement(fPO, status);

	3401 fSets->addElement(fPR, status);

	3402 fSets->addElement(fSY, status);

	3403 fSets->addElement(fAI, status);

	3404 fSets->addElement(fAL, status);

	3405 fSets->addElement(fID, status);

	3406 fSets->addElement(fWJ, status);

	3407 fSets->addElement(fSA, status);

	3408 fSets->addElement(fSG, status);

	3409

	3410 const char *rules =

	3411 "((\\p{Line_Break=PR}\|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"

	3412 "((\\p{Line_Break=OP}\|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"

	3413 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"

	3414 "((\\p{Line_Break=NU}\|\\p{Line_Break=IS}\|\\p{Line_Break=SY})\\p{Line _Break=CM})"

	3415 "((\\p{Line_Break=CL}\|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"

	3416 "((\\p{Line_Break=PR}\|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";

	3417

	3418 fNumberMatcher = new RegexMatcher(

	3419 UnicodeString(rules, -1, US_INV), 0, status);

	3420

	3421 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), statu s);

	3422

	3423 if (U_FAILURE(status)) {

	3424 deferredStatus = status;

	3425 }

	3426 }

	3427

	3428

	3429 void RBBILineMonkey::setText(const UnicodeString &s) {

	3430 fText = &s;

	3431 fCharBI->setText(s);

	3432 fNumberMatcher->reset(s);

	3433 }

	3434

	3435 //

	3436 // rule9Adjust

	3437 // Line Break TR rules 9 and 10 implementation.

	3438 // This deals with combining marks and other sequences that

	3439 // that must be treated as if they were something other than what they actua lly are.

	3440 //

	3441 // This is factored out into a separate function because it must be applied twice for

	3442 // each potential break, once to the chars before the position being checked , then

	3443 // again to the text following the possible break.

	3444 //

	3445 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPos , UChar32 *nextChar) {

	3446 if (pos == -1) {

	3447 // Invalid initial position. Happens during the warmup iteration of the

	3448 // main loop in next().

	3449 return;

	3450 }

	3451

	3452 int32_t nPos = *nextPos;

	3453

	3454 // LB 9 Keep combining sequences together.

	3455 // advance over any CM class chars. Note that Line Break CM is different

	3456 // from the normal Grapheme Extend property.

	3457 if (!(fSP->contains(posChar) \|\| fBK->contains(posChar) \|\| *posChar==0x0d \| \|

	3458 posChar==0x0a \|\|fNL->contains(posChar) \|\| fZW->contains(*posChar))) {

	3459 for (;;) {

	3460 *nextChar = fText->char32At(nPos);

	3461 if (!fCM->contains(*nextChar)) {

	3462 break;

	3463 }

	3464 nPos = fText->moveIndex32(nPos, 1);

	3465 }

	3466 }

	3467

	3468

	3469 // LB 9 Treat X CM* as if it were x.

	3470 // No explicit action required.

	3471

	3472 // LB 10 Treat any remaining combining mark as AL

	3473 if (fCM->contains(*posChar)) {

	3474 *posChar = 0x41; // thisChar = 'A';

	3475 }

	3476

	3477 // Push the updated nextPos and nextChar back to our caller.

	3478 // This only makes a difference if posChar got bigger by consuming a

	3479 // combining sequence.

	3480 *nextPos = nPos;

	3481 *nextChar = fText->char32At(nPos);

	3482 }

	3483

	3484

	3485

	3486 int32_t RBBILineMonkey::next(int32_t startPos) {

	3487 UErrorCode status = U_ZERO_ERROR;

	3488 int32_t pos; // Index of the char following a potential break posi tion

	3489 UChar32 thisChar; // Character at above position "pos"

	3490

	3491 int32_t prevPos; // Index of the char preceding a potential break posi tion

	3492 UChar32 prevChar; // Character at above position. Note that prevChar

	3493 // and thisChar may not be adjacent because combinin g

	3494 // characters between them will be ignored.

	3495

	3496 int32_t nextPos; // Index of the next character following pos.

	3497 // Usually skips over combining marks.

	3498 int32_t nextCPPos; // Index of the code point following "pos."

	3499 // May point to a combining mark.

	3500 int32_t tPos; // temp value.

	3501 UChar32 c;

	3502

	3503 if (U_FAILURE(deferredStatus)) {

	3504 return -1;

	3505 }

	3506

	3507 if (startPos >= fText->length()) {

	3508 return -1;

	3509 }

	3510

	3511

	3512 // Initial values for loop. Loop will run the first time without finding br eaks,

	3513 // while the invalid values shift out and the "thi s" and

	3514 // "prev" positions are filled in with good values .

	3515 pos = prevPos = -1; // Invalid value, serves as flag for initial l oop iteration.

	3516 thisChar = prevChar = 0;

	3517 nextPos = nextCPPos = startPos;

	3518

	3519

	3520 // Loop runs once per position in the test text, until a break position

	3521 // is found.

	3522 for (;;) {

	3523 prevPos = pos;

	3524 prevChar = thisChar;

	3525

	3526 pos = nextPos;

	3527 thisChar = fText->char32At(pos);

	3528

	3529 nextCPPos = fText->moveIndex32(pos, 1);

	3530 nextPos = nextCPPos;

	3531

	3532 // Rule LB2 - Break at end of text.

	3533 if (pos >= fText->length()) {

	3534 break;

	3535 }

	3536

	3537 // Rule LB 9 - adjust for combining sequences.

	3538 // We do this one out-of-order because the adjustment does n ot change anything

	3539 // that would match rules LB 3 - LB 6, but after the adjustm ent, LB 3-6 do need to

	3540 // be applied.

	3541 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);

	3542 nextCPPos = nextPos = fText->moveIndex32(pos, 1);

	3543 c = fText->char32At(nextPos);

	3544 rule9Adjust(pos, &thisChar, &nextPos, &c);

	3545

	3546 // If the loop is still warming up - if we haven't shifted the initial

	3547 // -1 positions out of prevPos yet - loop back to advance the

	3548 // position in the input without any further looking for breaks.

	3549 if (prevPos == -1) {

	3550 continue;

	3551 }

	3552

	3553 // LB 4 Always break after hard line breaks,

	3554 if (fBK->contains(prevChar)) {

	3555 break;

	3556 }

	3557

	3558 // LB 5 Break after CR, LF, NL, but not inside CR LF

	3559 if (prevChar == 0x0d && thisChar == 0x0a) {

	3560 continue;

	3561 }

	3562 if (prevChar == 0x0d \|\|

	3563 prevChar == 0x0a \|\|

	3564 prevChar == 0x85) {

	3565 break;

	3566 }

	3567

	3568 // LB 6 Don't break before hard line breaks

	3569 if (thisChar == 0x0d \|\| thisChar == 0x0a \|\| thisChar == 0x85 \|\|

	3570 fBK->contains(thisChar)) {

	3571 continue;

	3572 }

	3573

	3574

	3575 // LB 7 Don't break before spaces or zero-width space.

	3576 if (fSP->contains(thisChar)) {

	3577 continue;

	3578 }

	3579

	3580 if (fZW->contains(thisChar)) {

	3581 continue;

	3582 }

	3583

	3584 // LB 8 Break after zero width space

	3585 if (fZW->contains(prevChar)) {

	3586 break;

	3587 }

	3588

	3589 // LB 9, 10 Already done, at top of loop.

	3590 //

	3591

	3592

	3593 // LB 11 Do not break before or after WORD JOINER and related character s.

	3594 // x WJ

	3595 // WJ x

	3596 //

	3597 if (fWJ->contains(thisChar) \|\| fWJ->contains(prevChar)) {

	3598 continue;

	3599 }

	3600

	3601 // LB 12

	3602 // GL x

	3603 if (fGL->contains(prevChar)) {

	3604 continue;

	3605 }

	3606

	3607 // LB 12a

	3608 // [^SP BA HY] x GL

	3609 if (!(fSP->contains(prevChar) \|\|

	3610 fBA->contains(prevChar) \|\|

	3611 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {

	3612 continue;

	3613 }

	3614

	3615

	3616

	3617 // LB 13 Don't break before closings.

	3618 // NU x CL, NU x CP and NU x IS are not matched here so that th ey will

	3619 // fall into LB 17 and the more general number regular expression .

	3620 //

	3621 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) \|\|

	3622 (!fNU->contains(prevChar) && fCP->contains(thisChar)) \|\|

	3623 fEX->contains(thisChar) \|\|

	3624 (!fNU->contains(prevChar) && fIS->contains(thisChar)) \|\|

	3625 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {

	3626 continue;

	3627 }

	3628

	3629 // LB 14 Don't break after OP SP*

	3630 // Scan backwards, checking for this sequence.

	3631 // The OP char could include combining marks, so we actually check for

	3632 // OP CM* SP*

	3633 // Another Twist: The Rule 67 fixes may have changed a SP CM

	3634 // sequence into a ID char, so before scanning back through spaces ,

	3635 // verify that prevChar is indeed a space. The prevChar variable

	3636 // may differ from fText[prevPos]

	3637 tPos = prevPos;

	3638 if (fSP->contains(prevChar)) {

	3639 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {

	3640 tPos=fText->moveIndex32(tPos, -1);

	3641 }

	3642 }

	3643 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {

	3644 tPos=fText->moveIndex32(tPos, -1);

	3645 }

	3646 if (fOP->contains(fText->char32At(tPos))) {

	3647 continue;

	3648 }

	3649

	3650

	3651 // LB 15 QU SP* x OP

	3652 if (fOP->contains(thisChar)) {

	3653 // Scan backwards from prevChar to see if it is preceded by QU CM* S P*

	3654 int tPos = prevPos;

	3655 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {

	3656 tPos = fText->moveIndex32(tPos, -1);

	3657 }

	3658 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {

	3659 tPos = fText->moveIndex32(tPos, -1);

	3660 }

	3661 if (fQU->contains(fText->char32At(tPos))) {

	3662 continue;

	3663 }

	3664 }

	3665

	3666

	3667

	3668 // LB 16 (CL \| CP) SP* x NS

	3669 // Scan backwards for SP* CM* (CL \| CP)

	3670 if (fNS->contains(thisChar)) {

	3671 int tPos = prevPos;

	3672 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {

	3673 tPos = fText->moveIndex32(tPos, -1);

	3674 }

	3675 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {

	3676 tPos = fText->moveIndex32(tPos, -1);

	3677 }

	3678 if (fCL->contains(fText->char32At(tPos)) \|\| fCP->contains(fText->cha r32At(tPos))) {

	3679 continue;

	3680 }

	3681 }

	3682

	3683

	3684 // LB 17 B2 SP* x B2

	3685 if (fB2->contains(thisChar)) {

	3686 // Scan backwards, checking for the B2 CM* SP* sequence.

	3687 tPos = prevPos;

	3688 if (fSP->contains(prevChar)) {

	3689 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {

	3690 tPos=fText->moveIndex32(tPos, -1);

	3691 }

	3692 }

	3693 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {

	3694 tPos=fText->moveIndex32(tPos, -1);

	3695 }

	3696 if (fB2->contains(fText->char32At(tPos))) {

	3697 continue;

	3698 }

	3699 }

	3700

	3701

	3702 // LB 18 break after space

	3703 if (fSP->contains(prevChar)) {

	3704 break;

	3705 }

	3706

	3707 // LB 19

	3708 // x QU

	3709 // QU x

	3710 if (fQU->contains(thisChar) \|\| fQU->contains(prevChar)) {

	3711 continue;

	3712 }

	3713

	3714 // LB 20 Break around a CB

	3715 if (fCB->contains(thisChar) \|\| fCB->contains(prevChar)) {

	3716 break;

	3717 }

	3718

	3719 // LB 21

	3720 if (fBA->contains(thisChar) \|\|

	3721 fHY->contains(thisChar) \|\|

	3722 fNS->contains(thisChar) \|\|

	3723 fBB->contains(prevChar) ) {

	3724 continue;

	3725 }

	3726

	3727 // LB 22

	3728 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) \|\|

	3729 (fID->contains(prevChar) && fIN->contains(thisChar)) \|\|

	3730 (fIN->contains(prevChar) && fIN->contains(thisChar)) \|\|

	3731 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {

	3732 continue;

	3733 }

	3734

	3735

	3736 // LB 23 ID x PO

	3737 // AL x NU

	3738 // NU x AL

	3739 if ((fID->contains(prevChar) && fPO->contains(thisChar)) \|\|

	3740 (fAL->contains(prevChar) && fNU->contains(thisChar)) \|\|

	3741 (fNU->contains(prevChar) && fAL->contains(thisChar)) ) {

	3742 continue;

	3743 }

	3744

	3745 // LB 24 Do not break between prefix and letters or ideographs.

	3746 // PR x ID

	3747 // PR x AL

	3748 // PO x AL

	3749 if ((fPR->contains(prevChar) && fID->contains(thisChar)) \|\|

	3750 (fPR->contains(prevChar) && fAL->contains(thisChar)) \|\|

	3751 (fPO->contains(prevChar) && fAL->contains(thisChar)) ) {

	3752 continue;

	3753 }

	3754

	3755

	3756

	3757 // LB 25 Numbers

	3758 if (fNumberMatcher->lookingAt(prevPos, status)) {

	3759 if (U_FAILURE(status)) {

	3760 break;

	3761 }

	3762 // Matched a number. But could have been just a single digit, which would

	3763 // not represent a "no break here" between prevChar and thisChar

	3764 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first ch ar following num

	3765 if (numEndIdx > pos) {

	3766 // Number match includes at least our two chars being checked

	3767 if (numEndIdx > nextPos) {

	3768 // Number match includes additional chars. Update pos and n extPos

	3769 // so that next loop iteration will continue at the end of the number,

	3770 // checking for breaks between last char in number & whate ver follows.

	3771 pos = nextPos = numEndIdx;

	3772 do {

	3773 pos = fText->moveIndex32(pos, -1);

	3774 thisChar = fText->char32At(pos);

	3775 } while (fCM->contains(thisChar));

	3776 }

	3777 continue;

	3778 }

	3779 }

	3780

	3781

	3782 // LB 26 Do not break a Korean syllable.

	3783 if (fJL->contains(prevChar) && (fJL->contains(thisChar) \|\|

	3784 fJV->contains(thisChar) \|\|

	3785 fH2->contains(thisChar) \|\|

	3786 fH3->contains(thisChar))) {

	3787 continue;

	3788 }

	3789

	3790 if ((fJV->contains(prevChar) \|\| fH2->contains(prevChar)) &&

	3791 (fJV->contains(thisChar) \|\| fJT->contains(thisChar))) {

	3792 continue;

	3793 }

	3794

	3795 if ((fJT->contains(prevChar) \|\| fH3->contains(prevChar)) &&

	3796 fJT->contains(thisChar)) {

	3797 continue;

	3798 }

	3799

	3800 // LB 27 Treat a Korean Syllable Block the same as ID.

	3801 if ((fJL->contains(prevChar) \|\| fJV->contains(prevChar) \|\|

	3802 fJT->contains(prevChar) \|\| fH2->contains(prevChar) \|\| fH3->contains( prevChar)) &&

	3803 fIN->contains(thisChar)) {

	3804 continue;

	3805 }

	3806 if ((fJL->contains(prevChar) \|\| fJV->contains(prevChar) \|\|

	3807 fJT->contains(prevChar) \|\| fH2->contains(prevChar) \|\| fH3->contains( prevChar)) &&

	3808 fPO->contains(thisChar)) {

	3809 continue;

	3810 }

	3811 if (fPR->contains(prevChar) && (fJL->contains(thisChar) \|\| fJV->contains (thisChar) \|\|

	3812 fJT->contains(thisChar) \|\| fH2->contains(thisChar) \|\| fH3->contains( thisChar))) {

	3813 continue;

	3814 }

	3815

	3816

	3817

	3818 // LB 28 Do not break between alphabetics ("at").

	3819 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {

	3820 continue;

	3821 }

	3822

	3823 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g .").

	3824 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {

	3825 continue;

	3826 }

	3827

	3828 // LB 30 Do not break between letters, numbers, or ordinary symbols a nd opening or closing punctuation.

	3829 // (AL \| NU) x OP

	3830 // CP x (AL \| NU)

	3831 if ((fAL->contains(prevChar) \|\| fNU->contains(prevChar)) && fOP->contain s(thisChar)) {

	3832 continue;

	3833 }

	3834 if (fCP->contains(prevChar) && (fAL->contains(thisChar) \|\| fNU->contains (thisChar))) {

	3835 continue;

	3836 }

	3837

	3838 // LB 31 Break everywhere else

	3839 break;

	3840

	3841 }

	3842

	3843 return pos;

	3844 }

	3845

	3846

	3847 UVector *RBBILineMonkey::charClasses() {

	3848 return fSets;

	3849 }

	3850

	3851

	3852 RBBILineMonkey::~RBBILineMonkey() {

	3853 delete fSets;

	3854

	3855 delete fBK;

	3856 delete fCR;

	3857 delete fLF;

	3858 delete fCM;

	3859 delete fNL;

	3860 delete fWJ;

	3861 delete fZW;

	3862 delete fGL;

	3863 delete fCB;

	3864 delete fSP;

	3865 delete fB2;

	3866 delete fBA;

	3867 delete fBB;

	3868 delete fHY;

	3869 delete fH2;

	3870 delete fH3;

	3871 delete fCL;

	3872 delete fCP;

	3873 delete fEX;

	3874 delete fIN;

	3875 delete fJL;

	3876 delete fJV;

	3877 delete fJT;

	3878 delete fNS;

	3879 delete fOP;

	3880 delete fQU;

	3881 delete fIS;

	3882 delete fNU;

	3883 delete fPO;

	3884 delete fPR;

	3885 delete fSY;

	3886 delete fAI;

	3887 delete fAL;

	3888 delete fID;

	3889 delete fSA;

	3890 delete fSG;

	3891 delete fXX;

	3892

	3893 delete fCharBI;

	3894 delete fNumberMatcher;

	3895 }

	3896

	3897

	3898 //------------------------------------------------------------------------------ -------------

	3899 //

	3900 // TestMonkey

	3901 //

	3902 // params

	3903 // seed=nnnnn Random number starting seed.

	3904 // Setting the seed allows errors to be reproduced.

	3905 // loop=nnn Looping count. Controls running time.

	3906 // -1: run forever.

	3907 // 0 or greater: run length.

	3908 //

	3909 // type = char \| word \| line \| sent \| title

	3910 //

	3911 //------------------------------------------------------------------------------ -------------

	3912

	3913 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d efaultVal) {

	3914 int32_t val = defaultVal;

	3915 name.append(" = (-?\\d+)");

	3916 UErrorCode status = U_ZERO_ERROR;

	3917 RegexMatcher m(name, params, 0, status);

	3918 if (m.find()) {

	3919 // The param exists. Convert the string to an int.

	3920 char valString[100];

	3921 int32_t paramLength = m.end(1, status) - m.start(1, status);

	3922 if (paramLength >= (int32_t)(sizeof(valString)-1)) {

	3923 paramLength = (int32_t)(sizeof(valString)-2);

	3924 }

	3925 params.extract(m.start(1, status), paramLength, valString, sizeof(valStr ing));

	3926 val = strtol(valString, NULL, 10);

	3927

	3928 // Delete this parameter from the params string.

	3929 m.reset();

	3930 params = m.replaceFirst("", status);

	3931 }

	3932 U_ASSERT(U_SUCCESS(status));

	3933 return val;

	3934 }

	3935 #endif

	3936

	3937 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,

	3938 BreakIterator *bi,

	3939 int expected[],

	3940 int expectedcount)

	3941 {

	3942 int count = 0;

	3943 int i = 0;

	3944 int forward[50];

	3945 bi->setText(ustr);

	3946 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {

	3947 forward[count] = i;

	3948 if (count < expectedcount && expected[count] != i) {

	3949 test->errln("break forward test failed: expected %d but got %d",

	3950 expected[count], i);

	3951 break;

	3952 }

	3953 count ++;

	3954 }

	3955 if (count != expectedcount) {

	3956 printStringBreaks(ustr, expected, expectedcount);

	3957 test->errln("break forward test failed: missed %d match",

	3958 expectedcount - count);

	3959 return;

	3960 }

	3961 // testing boundaries

	3962 for (i = 1; i < expectedcount; i ++) {

	3963 int j = expected[i - 1];

	3964 if (!bi->isBoundary(j)) {

	3965 printStringBreaks(ustr, expected, expectedcount);

	3966 test->errln("isBoundary() failed. Expected boundary at position %d" , j);

	3967 return;

	3968 }

	3969 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {

	3970 if (bi->isBoundary(j)) {

	3971 printStringBreaks(ustr, expected, expectedcount);

	3972 test->errln("isBoundary() failed. Not expecting boundary at pos ition %d", j);

	3973 return;

	3974 }

	3975 }

	3976 }

	3977

	3978 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

	3979 count --;

	3980 if (forward[count] != i) {

	3981 test->errln("happy break test previous() failed: expected %d but got %d",

	3982 forward[count], i);

	3983 break;

	3984 }

	3985 }

	3986 if (count != 0) {

	3987 printStringBreaks(ustr, expected, expectedcount);

	3988 test->errln("break test previous() failed: missed a match");

	3989 return;

	3990 }

	3991

	3992 // testing preceding

	3993 for (i = 0; i < expectedcount - 1; i ++) {

	3994 // int j = expected[i] + 1;

	3995 int j = ustr.moveIndex32(expected[i], 1);

	3996 for (; j <= expected[i + 1]; j ++) {

	3997 if (bi->preceding(j) != expected[i]) {

	3998 printStringBreaks(ustr, expected, expectedcount);

	3999 test->errln("preceding(): Not expecting boundary at position %d" , j);

	4000 return;

	4001 }

	4002 }

	4003 }

	4004 }

	4005

	4006 void RBBITest::TestWordBreaks(void)

	4007 {

	4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	4009

	4010 Locale locale("en");

	4011 UErrorCode status = U_ZERO_ERROR;

	4012 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu s);

	4013 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

	4014 static const char *strlist[] =

	4015 {

	4016 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",

	4017 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040 \\u003b",

	4018 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e 0061\\u003a",

	4019 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",

	4020 "\\u90ca\\u3588\\u009c\\u0953\\u194b",

	4021 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

	4022 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e" ,

	4023 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",

	4024 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

	4025 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

	4026 "\\u2027\\U000e0067\\u0a47\\u00b7",

	4027 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

	4028 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

	4029 "\\u0589\\U000e006e\\u0a42\\U000104a5",

	4030 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

	4031 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

	4032 "\\u0027\\u11af\\U000e0057\\u0602",

	4033 "\\U0001d7f2\\U000e007\\u0004\\u0589",

	4034 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b ",

	4035 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

	4036 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

	4037 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

	4038 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

	4039 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

	4040 "\\u0233\\U000e0020\\u0a69\\u0d6a",

	4041 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

	4042 "\\u58f4\\U000e0049\\u20e7\\u2027",

	4043 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

	4044 "\\ua183\\u102d\\u0bec\\u003a",

	4045 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

	4046 "\\u003a\\u0e57\\u0fad\\u002e",

	4047 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",

	4048 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",

	4049 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",

	4050 "\\u003a\\u0664\\u00b7\\u1fba",

	4051 "\\u003b\\u0027\\u00b7\\u47a3",

	4052 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",

	4053 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u 0e51\\u1058\\U000e0058\\u00b7\\u0673",

	4054 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

	4055 };

	4056 int loop;

	4057 if (U_FAILURE(status)) {

	4058 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

	4059 return;

	4060 }

	4061 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

	4062 // printf("looping %d\n", loop);

	4063 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);

	4064 // RBBICharMonkey monkey;

	4065 RBBIWordMonkey monkey;

	4066

	4067 int expected[50];

	4068 int expectedcount = 0;

	4069

	4070 monkey.setText(ustr);

	4071 int i;

	4072 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

	4073 expected[expectedcount ++] = i;

	4074 }

	4075

	4076 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

	4077 }

	4078 delete bi;

	4079 #endif

	4080 }

	4081

	4082 void RBBITest::TestWordBoundary(void)

	4083 {

	4084 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>

	4085 Locale locale("en");

	4086 UErrorCode status = U_ZERO_ERROR;

	4087 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu s);

	4088 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

	4089 UChar str[50];

	4090 static const char *strlist[] =

	4091 {

	4092 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

	4093 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

	4094 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

	4095 "\\u2027\\U000e0067\\u0a47\\u00b7",

	4096 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

	4097 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

	4098 "\\u0589\\U000e006e\\u0a42\\U000104a5",

	4099 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

	4100 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

	4101 "\\u0027\\u11af\\U000e0057\\u0602",

	4102 "\\U0001d7f2\\U000e007\\u0004\\u0589",

	4103 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b ",

	4104 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

	4105 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

	4106 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

	4107 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

	4108 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

	4109 "\\u0233\\U000e0020\\u0a69\\u0d6a",

	4110 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

	4111 "\\u58f4\\U000e0049\\u20e7\\u2027",

	4112 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

	4113 "\\ua183\\u102d\\u0bec\\u003a",

	4114 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

	4115 "\\u003a\\u0e57\\u0fad\\u002e",

	4116 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",

	4117 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",

	4118 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019" ,

	4119 "\\u003a\\u0664\\u00b7\\u1fba",

	4120 "\\u003b\\u0027\\u00b7\\u47a3",

	4121 };

	4122 int loop;

	4123 if (U_FAILURE(status)) {

	4124 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

	4125 return;

	4126 }

	4127 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

	4128 // printf("looping %d\n", loop);

	4129 u_unescape(strlist[loop], str, 20);

	4130 UnicodeString ustr(str);

	4131 int forward[50];

	4132 int count = 0;

	4133

	4134 bi->setText(ustr);

	4135 int prev = 0;

	4136 int i;

	4137 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {

	4138 forward[count ++] = i;

	4139 if (i > prev) {

	4140 int j;

	4141 for (j = prev + 1; j < i; j ++) {

	4142 if (bi->isBoundary(j)) {

	4143 printStringBreaks(ustr, forward, count);

	4144 errln("happy boundary test failed: expected %d not a bou ndary",

	4145 j);

	4146 return;

	4147 }

	4148 }

	4149 }

	4150 if (!bi->isBoundary(i)) {

	4151 printStringBreaks(ustr, forward, count);

	4152 errln("happy boundary test failed: expected %d a boundary",

	4153 i);

	4154 return;

	4155 }

	4156 prev = i;

	4157 }

	4158 }

	4159 delete bi;

	4160 }

	4161

	4162 void RBBITest::TestLineBreaks(void)

	4163 {

	4164 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	4165 Locale locale("en");

	4166 UErrorCode status = U_ZERO_ERROR;

	4167 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);

	4168 const int32_t STRSIZE = 50;

	4169 UChar str[STRSIZE];

	4170 static const char *strlist[] =

	4171 {

	4172 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",

	4173 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"

	4174 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",

	4175 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"

	4176 "u2014\\U000e0105\\u118c\\u000a\\u07f8",

	4177 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",

	4178 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a \\U000e0123",

	4179 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u1 7a4",

	4180 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",

	4181 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2 009\\u000a\\u06f7\\u02cc\\u1019\\u2060",

	4182 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e 007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",

	4183 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7 \\u0f3b\\u002f",

	4184 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c \\u002f\\u17b1",

	4185 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u0 60d\\u02c8\\ua4e8\\u002f\\u17d5",

	4186 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",

	4187 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc ",

	4188 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",

	4189 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020 \\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",

	4190 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d \\u02c8\\u003b",

	4191 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u2 9fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",

	4192 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d \\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",

	4193 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uf f09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",

	4194 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u0 02d\\u09cc\\u1782\\u000d\\uff6f\\u0025",

	4195 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0 f0c\\u0085\\u2763",

	4196 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a \\u3d0a\\ufe57\\u2035\\u2028\\u2029",

	4197 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc \\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",

	4198 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",

	4199 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",

	4200 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",

	4201 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",

	4202 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",

	4203 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",

	4204 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",

	4205 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",

	4206 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a \\u180e\\u2009\\u3111",

	4207 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",

	4208 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",

	4209 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",

	4210 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",

	4211 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",

	4212 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",

	4213 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",

	4214 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"

	4215 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"

	4216 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",

	4217 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",

	4218 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",

	4219 };

	4220 int loop;

	4221 TEST_ASSERT_SUCCESS(status);

	4222 if (U_FAILURE(status)) {

	4223 return;

	4224 }

	4225 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

	4226 // printf("looping %d\n", loop);

	4227 int32_t t = u_unescape(strlist[loop], str, STRSIZE);

	4228 if (t >= STRSIZE) {

	4229 TEST_ASSERT(FALSE);

	4230 continue;

	4231 }

	4232

	4233

	4234 UnicodeString ustr(str);

	4235 RBBILineMonkey monkey;

	4236 if (U_FAILURE(monkey.deferredStatus)) {

	4237 continue;

	4238 }

	4239

	4240 const int EXPECTEDSIZE = 50;

	4241 int expected[EXPECTEDSIZE];

	4242 int expectedcount = 0;

	4243

	4244 monkey.setText(ustr);

	4245 int i;

	4246 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

	4247 if (expectedcount >= EXPECTEDSIZE) {

	4248 TEST_ASSERT(expectedcount < EXPECTEDSIZE);

	4249 return;

	4250 }

	4251 expected[expectedcount ++] = i;

	4252 }

	4253

	4254 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

	4255 }

	4256 delete bi;

	4257 #endif

	4258 }

	4259

	4260 void RBBITest::TestSentBreaks(void)

	4261 {

	4262 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	4263 Locale locale("en");

	4264 UErrorCode status = U_ZERO_ERROR;

	4265 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);

	4266 UChar str[200];

	4267 static const char *strlist[] =

	4268 {

	4269 "Now\ris\nthe\r\ntime\n\rfor\r\r",

	4270 "This\n",

	4271 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $ 20,00,000.",

	4272 "\"Sentence ending with a quote.\" Bye.",

	4273 " (This is it). Testing the sentence iterator. \"This isn't it.\"",

	4274 "Hi! This is a simple sample sentence. (This is it.) This is a simple sampl e sentence. \"This isn't it.\"",

	4275 "Hi! This is a simple sample sentence. It does not have to make any sense a s you can see. ",

	4276 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",

	4277 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",

	4278 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",

	4279 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a \\ufe56\\ufe52"

	4280 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u00 2e\\ua6ab\\u104a"

	4281 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u 5f61\\u202f"

	4282 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",

	4283 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U 0001d171"

	4284 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc7 2\\u0030"

	4285 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180 e\\u000b"

	4286 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\ u202b"

	4287 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\ u2e05"

	4288 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"

	4289 };

	4290 int loop;

	4291 if (U_FAILURE(status)) {

	4292 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

	4293 return;

	4294 }

	4295 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

	4296 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));

	4297 UnicodeString ustr(str);

	4298

	4299 RBBISentMonkey monkey;

	4300 if (U_FAILURE(monkey.deferredStatus)) {

	4301 continue;

	4302 }

	4303

	4304 const int EXPECTEDSIZE = 50;

	4305 int expected[EXPECTEDSIZE];

	4306 int expectedcount = 0;

	4307

	4308 monkey.setText(ustr);

	4309 int i;

	4310 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

	4311 if (expectedcount >= EXPECTEDSIZE) {

	4312 TEST_ASSERT(expectedcount < EXPECTEDSIZE);

	4313 return;

	4314 }

	4315 expected[expectedcount ++] = i;

	4316 }

	4317

	4318 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

	4319 }

	4320 delete bi;

	4321 #endif

	4322 }

	4323

	4324 void RBBITest::TestMonkey(char *params) {

	4325 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	4326

	4327 UErrorCode status = U_ZERO_ERROR;

	4328 int32_t loopCount = 500;

	4329 int32_t seed = 1;

	4330 UnicodeString breakType = "all";

	4331 Locale locale("en");

	4332 UBool useUText = FALSE;

	4333

	4334 if (quick == FALSE) {

	4335 loopCount = 10000;

	4336 }

	4337

	4338 if (params) {

	4339 UnicodeString p(params);

	4340 loopCount = getIntParam("loop", p, loopCount);

	4341 seed = getIntParam("seed", p, seed);

	4342

	4343 RegexMatcher m(" type = (char\|word\|line\|sent\|title) ", p, 0, status) ;

	4344 if (m.find()) {

	4345 breakType = m.group(1, status);

	4346 m.reset();

	4347 p = m.replaceFirst("", status);

	4348 }

	4349

	4350 RegexMatcher u(" *utext", p, 0, status);

	4351 if (u.find()) {

	4352 useUText = TRUE;

	4353 u.reset();

	4354 p = u.replaceFirst("", status);

	4355 }

	4356

	4357

	4358 // m.reset(p);

	4359 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {

	4360 // Each option is stripped out of the option string as it is process ed.

	4361 // All options have been checked. The option string should have bee n completely emptied..

	4362 char buf[100];

	4363 p.extract(buf, sizeof(buf), NULL, status);

	4364 buf[sizeof(buf)-1] = 0;

	4365 errln("Unrecognized or extra parameter: %s\n", buf);

	4366 return;

	4367 }

	4368

	4369 }

	4370

	4371 if (breakType == "char" \|\| breakType == "all") {

	4372 RBBICharMonkey m;

	4373 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);

	4374 if (U_SUCCESS(status)) {

	4375 RunMonkey(bi, m, "char", seed, loopCount, useUText);

	4376 if (breakType == "all" && useUText==FALSE) {

	4377 // Also run a quick test with UText when "all" is specified

	4378 RunMonkey(bi, m, "char", seed, loopCount, TRUE);

	4379 }

	4380 }

	4381 else {

	4382 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));

	4383 }

	4384 delete bi;

	4385 }

	4386

	4387 if (breakType == "word" \|\| breakType == "all") {

	4388 logln("Word Break Monkey Test");

	4389 RBBIWordMonkey m;

	4390 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

	4391 if (U_SUCCESS(status)) {

	4392 RunMonkey(bi, m, "word", seed, loopCount, useUText);

	4393 }

	4394 else {

	4395 errcheckln(status, "Creation of word break iterator failed %s", u_er rorName(status));

	4396 }

	4397 delete bi;

	4398 }

	4399

	4400 if (breakType == "line" \|\| breakType == "all") {

	4401 logln("Line Break Monkey Test");

	4402 RBBILineMonkey m;

	4403 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);

	4404 if (loopCount >= 10) {

	4405 loopCount = loopCount / 5; // Line break runs slower than the othe rs.

	4406 }

	4407 if (U_SUCCESS(status)) {

	4408 RunMonkey(bi, m, "line", seed, loopCount, useUText);

	4409 }

	4410 else {

	4411 errcheckln(status, "Creation of line break iterator failed %s", u_er rorName(status));

	4412 }

	4413 delete bi;

	4414 }

	4415

	4416 if (breakType == "sent" \|\| breakType == "all" ) {

	4417 logln("Sentence Break Monkey Test");

	4418 RBBISentMonkey m;

	4419 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, statu s);

	4420 if (loopCount >= 10) {

	4421 loopCount = loopCount / 10; // Sentence runs slower than the other break types

	4422 }

	4423 if (U_SUCCESS(status)) {

	4424 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);

	4425 }

	4426 else {

	4427 errcheckln(status, "Creation of line break iterator failed %s", u_er rorName(status));

	4428 }

	4429 delete bi;

	4430 }

	4431

	4432 #endif

	4433 }

	4434

	4435 //

	4436 // Run a RBBI monkey test. Common routine, for all break iterator types.

	4437 // Parameters:

	4438 // bi - the break iterator to use

	4439 // mk - MonkeyKind, abstraction for obtaining expected results

	4440 // name - Name of test (char, word, etc.) for use in error messages

	4441 // seed - Seed for starting random number generator (parameter from use r)

	4442 // numIterations

	4443 //

	4444 void RBBITest::RunMonkey(BreakIterator bi, RBBIMonkeyKind &mk, const char name , uint32_t seed,

	4445 int32_t numIterations, UBool useUText) {

	4446

	4447 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	4448

	4449 const int32_t TESTSTRINGLEN = 500;

	4450 UnicodeString testText;

	4451 int32_t numCharClasses;

	4452 UVector *chClasses;

	4453 int expected[TESTSTRINGLEN*2 + 1];

	4454 int expectedCount = 0;

	4455 char expectedBreaks[TESTSTRINGLEN*2 + 1];

	4456 char forwardBreaks[TESTSTRINGLEN*2 + 1];

	4457 char reverseBreaks[TESTSTRINGLEN*2+1];

	4458 char isBoundaryBreaks[TESTSTRINGLEN*2+1];

	4459 char followingBreaks[TESTSTRINGLEN*2+1];

	4460 char precedingBreaks[TESTSTRINGLEN*2+1];

	4461 int i;

	4462 int loopCount = 0;

	4463

	4464 m_seed = seed;

	4465

	4466 numCharClasses = mk.charClasses()->size();

	4467 chClasses = mk.charClasses();

	4468

	4469 // Check for errors that occured during the construction of the MonkeyKind o bject.

	4470 // Can't report them where they occured because errln() is a method coming from intlTest,

	4471 // and is not visible outside of RBBITest :-(

	4472 if (U_FAILURE(mk.deferredStatus)) {

	4473 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk. deferredStatus));

	4474 return;

	4475 }

	4476

	4477 // Verify that the character classes all have at least one member.

	4478 for (i=0; i<numCharClasses; i++) {

	4479 UnicodeSet s = (UnicodeSet )chClasses->elementAt(i);

	4480 if (s == NULL \|\| s->size() == 0) {

	4481 errln("Character Class #%d is null or of zero size.", i);

	4482 return;

	4483 }

	4484 }

	4485

	4486 while (loopCount < numIterations \|\| numIterations == -1) {

	4487 if (numIterations == -1 && loopCount % 10 == 0) {

	4488 // If test is running in an infinite loop, display a periodic tic so

	4489 // we can tell that it is making progress.

	4490 fprintf(stderr, ".");

	4491 }

	4492 // Save current random number seed, so that we can recreate the random n umbers

	4493 // for this loop iteration in event of an error.

	4494 seed = m_seed;

	4495

	4496 // Populate a test string with data.

	4497 testText.truncate(0);

	4498 for (i=0; i<TESTSTRINGLEN; i++) {

	4499 int32_t aClassNum = m_rand() % numCharClasses;

	4500 UnicodeSet classSet = (UnicodeSet )chClasses->elementAt(aClassNum) ;

	4501 int32_t charIdx = m_rand() % classSet->size();

	4502 UChar32 c = classSet->charAt(charIdx);

	4503 if (c < 0) { // TODO: deal with sets containing strings.

	4504 errln("c < 0");

	4505 break;

	4506 }

	4507 testText.append(c);

	4508 }

	4509

	4510 // Calculate the expected results for this test string.

	4511 mk.setText(testText);

	4512 memset(expectedBreaks, 0, sizeof(expectedBreaks));

	4513 expectedBreaks[0] = 1;

	4514 int32_t breakPos = 0;

	4515 expectedCount = 0;

	4516 for (;;) {

	4517 breakPos = mk.next(breakPos);

	4518 if (breakPos == -1) {

	4519 break;

	4520 }

	4521 if (breakPos > testText.length()) {

	4522 errln("breakPos > testText.length()");

	4523 }

	4524 expectedBreaks[breakPos] = 1;

	4525 U_ASSERT(expectedCount<testText.length());

	4526 expected[expectedCount ++] = breakPos;

	4527 }

	4528

	4529 // Find the break positions using forward iteration

	4530 memset(forwardBreaks, 0, sizeof(forwardBreaks));

	4531 if (useUText) {

	4532 UErrorCode status = U_ZERO_ERROR;

	4533 UText *testUText = utext_openReplaceable(NULL, &testText, &status);

	4534 // testUText = utext_openUnicodeString(testUText, &testText, &status );

	4535 bi->setText(testUText, status);

	4536 TEST_ASSERT_SUCCESS(status);

	4537 utext_close(testUText); // The break iterator does a shallow clone of the UText

	4538 // This UText can be closed immediately, so long as the

	4539 // testText string continues to exist.

	4540 } else {

	4541 bi->setText(testText);

	4542 }

	4543

	4544 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {

	4545 if (i < 0 \|\| i > testText.length()) {

	4546 errln("%s break monkey test: Out of range value returned by brea kIterator::next()", name);

	4547 break;

	4548 }

	4549 forwardBreaks[i] = 1;

	4550 }

	4551

	4552 // Find the break positions using reverse iteration

	4553 memset(reverseBreaks, 0, sizeof(reverseBreaks));

	4554 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {

	4555 if (i < 0 \|\| i > testText.length()) {

	4556 errln("%s break monkey test: Out of range value returned by brea kIterator::next()", name);

	4557 break;

	4558 }

	4559 reverseBreaks[i] = 1;

	4560 }

	4561

	4562 // Find the break positions using isBoundary() tests.

	4563 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));

	4564 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());

	4565 for (i=0; i<=testText.length(); i++) {

	4566 isBoundaryBreaks[i] = bi->isBoundary(i);

	4567 }

	4568

	4569

	4570 // Find the break positions using the following() function.

	4571 // printf(".");

	4572 memset(followingBreaks, 0, sizeof(followingBreaks));

	4573 int32_t lastBreakPos = 0;

	4574 followingBreaks[0] = 1;

	4575 for (i=0; i<testText.length(); i++) {

	4576 breakPos = bi->following(i);

	4577 if (breakPos <= i \|\|

	4578 breakPos < lastBreakPos \|\|

	4579 breakPos > testText.length() \|\|

	4580 (breakPos > lastBreakPos && lastBreakPos > i)) {

	4581 errln("%s break monkey test: "

	4582 "Out of range value returned by BreakIterator::following().\ n"

	4583 "Random seed=%d index=%d; following returned %d; lastb reak=%d",

	4584 name, seed, i, breakPos, lastBreakPos);

	4585 break;

	4586 }

	4587 followingBreaks[breakPos] = 1;

	4588 lastBreakPos = breakPos;

	4589 }

	4590

	4591 // Find the break positions using the preceding() function.

	4592 memset(precedingBreaks, 0, sizeof(precedingBreaks));

	4593 lastBreakPos = testText.length();

	4594 precedingBreaks[testText.length()] = 1;

	4595 for (i=testText.length(); i>0; i--) {

	4596 breakPos = bi->preceding(i);

	4597 if (breakPos >= i \|\|

	4598 breakPos > lastBreakPos \|\|

	4599 (breakPos < 0 && testText.getChar32Start(i)>0) \|\|

	4600 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Sta rt(i)) ) {

	4601 errln("%s break monkey test: "

	4602 "Out of range value returned by BreakIterator::preceding().\ n"

	4603 "index=%d; prev returned %d; lastBreak=%d" ,

	4604 name, i, breakPos, lastBreakPos);

	4605 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks) ) {

	4606 precedingBreaks[i] = 2; // Forces an error.

	4607 }

	4608 } else {

	4609 if (breakPos >= 0) {

	4610 precedingBreaks[breakPos] = 1;

	4611 }

	4612 lastBreakPos = breakPos;

	4613 }

	4614 }

	4615

	4616 // Compare the expected and actual results.

	4617 for (i=0; i<=testText.length(); i++) {

	4618 const char *errorType = NULL;

	4619 if (forwardBreaks[i] != expectedBreaks[i]) {

	4620 errorType = "next()";

	4621 } else if (reverseBreaks[i] != forwardBreaks[i]) {

	4622 errorType = "previous()";

	4623 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {

	4624 errorType = "isBoundary()";

	4625 } else if (followingBreaks[i] != expectedBreaks[i]) {

	4626 errorType = "following()";

	4627 } else if (precedingBreaks[i] != expectedBreaks[i]) {

	4628 errorType = "preceding()";

	4629 }

	4630

	4631

	4632 if (errorType != NULL) {

	4633 // Format a range of the test text that includes the failure as

	4634 // a data item that can be included in the rbbi test data file.

	4635

	4636 // Start of the range is the last point where expected and actua l results

	4637 // both agreed that there was a break position.

	4638 int startContext = i;

	4639 int32_t count = 0;

	4640 for (;;) {

	4641 if (startContext==0) { break; }

	4642 startContext --;

	4643 if (expectedBreaks[startContext] != 0) {

	4644 if (count == 2) break;

	4645 count ++;

	4646 }

	4647 }

	4648

	4649 // End of range is two expected breaks past the start position.

	4650 int endContext = i + 1;

	4651 int ci;

	4652 for (ci=0; ci<2; ci++) { // Number of items to include in error text.

	4653 for (;;) {

	4654 if (endContext >= testText.length()) {break;}

	4655 if (expectedBreaks[endContext-1] != 0) {

	4656 if (count == 0) break;

	4657 count --;

	4658 }

	4659 endContext ++;

	4660 }

	4661 }

	4662

	4663 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</dat a>"

	4664 UnicodeString errorText = "<data>";

	4665 /***if (strcmp(errorType, "next()") == 0) {

	4666 startContext = 0;

	4667 endContext = testText.length();

	4668

	4669 printStringBreaks(testText, expected, expectedCount);

	4670 }***/

	4671

	4672 for (ci=startContext; ci<endContext;) {

	4673 UnicodeString hexChars("0123456789abcdef");

	4674 UChar32 c;

	4675 int bn;

	4676 c = testText.char32At(ci);

	4677 if (ci == i) {

	4678 // This is the location of the error.

	4679 errorText.append("<?>");

	4680 } else if (expectedBreaks[ci] != 0) {

	4681 // This a non-error expected break position.

	4682 errorText.append("\\");

	4683 }

	4684 if (c < 0x10000) {

	4685 errorText.append("\\u");

	4686 for (bn=12; bn>=0; bn-=4) {

	4687 errorText.append(hexChars.charAt((c>>bn)&0xf));

	4688 }

	4689 } else {

	4690 errorText.append("\\U");

	4691 for (bn=28; bn>=0; bn-=4) {

	4692 errorText.append(hexChars.charAt((c>>bn)&0xf));

	4693 }

	4694 }

	4695 ci = testText.moveIndex32(ci, 1);

	4696 }

	4697 errorText.append("\\");

	4698 errorText.append("</data>\n");

	4699

	4700 // Output the error

	4701 char charErrorTxt[500];

	4702 UErrorCode status = U_ZERO_ERROR;

	4703 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, stat us);

	4704 charErrorTxt[sizeof(charErrorTxt)-1] = 0;

	4705 errln("%s break monkey test error. %s. Operation = %s; Random s eed = %d; buf Idx = %d\n%s",

	4706 name, (expectedBreaks[i]? "break expected but not found" : " break found but not expected"),

	4707 errorType, seed, i, charErrorTxt);

	4708 break;

	4709 }

	4710 }

	4711

	4712 loopCount++;

	4713 }

	4714 #endif

	4715 }

	4716

	4717

	4718 // Bug 5532. UTF-8 based UText fails in dictionary code.

	4719 // This test checks the initial patch,

	4720 // which is to just keep it from crashing. Correct word boundaries

	4721 // await a proper fix to the dictionary code.

	4722 //

	4723 void RBBITest::TestBug5532(void) {

	4724 // Text includes a mixture of Thai and Latin.

	4725 const unsigned char utf8Data[] = {

	4726 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,

	4727 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,

	4728 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,

	4729 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,

	4730 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,

	4731 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,

	4732 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,

	4733 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,

	4734 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,

	4735 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,

	4736 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};

	4737

	4738 UErrorCode status = U_ZERO_ERROR;

	4739 UText utext=UTEXT_INITIALIZER;

	4740 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);

	4741 TEST_ASSERT_SUCCESS(status);

	4742

	4743 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);

	4744 TEST_ASSERT_SUCCESS(status);

	4745 if (U_SUCCESS(status)) {

	4746 bi->setText(&utext, status);

	4747 TEST_ASSERT_SUCCESS(status);

	4748

	4749 int32_t breakCount = 0;

	4750 int32_t previousBreak = -1;

	4751 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {

	4752 // For now, just make sure that the break iterator doesn't hang.

	4753 TEST_ASSERT(previousBreak < bi->current());

	4754 previousBreak = bi->current();

	4755 }

	4756 TEST_ASSERT(breakCount > 0);

	4757 }

	4758 delete bi;

	4759 utext_close(&utext);

	4760 }

	4761

	4762

	4763 //

	4764 // TestDebug - A place-holder test for debugging purposes.

	4765 // For putting in fragments of other tests that can be invoked

	4766 // for tracing without a lot of unwanted extra stuff happening .

	4767 //

	4768 void RBBITest::TestDebug(void) {

	4769 #if 0

	4770 UErrorCode status = U_ZERO_ERROR;

	4771 int pos = 0;

	4772 int ruleStatus = 0;

	4773

	4774 RuleBasedBreakIterator* bi =

	4775 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::ge tDefault(), status);

	4776 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Lo cale("th"), status);

	4777 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::g etDefault(), status);

	4778 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002 e\\u0046\\ufd3f\\u000a\\u002e");

	4779 // UnicodeString s("Aaa. Bcd");

	4780 s = s.unescape();

	4781 bi->setText(s);

	4782 UBool r = bi->isBoundary(8);

	4783 printf("%s", r?"true":"false");

	4784 return;

	4785 pos = bi->last();

	4786 do {

	4787 // ruleStatus = bi->getRuleStatus();

	4788 printf("%d\t%d\n", pos, ruleStatus);

	4789 pos = bi->previous();

	4790 } while (pos != BreakIterator::DONE);

	4791 #endif

	4792 }

	4793

	4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

OLD	NEW

« no previous file with comments | « icu46/source/test/intltest/rbbitst.h ('k') | icu46/source/test/intltest/regcoll.h » ('j') | no next file with comments »