source/test/intltest/rbbitst.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/intltest/rbbitst.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /********************************************************************

2 * COPYRIGHT:

3 * Copyright (c) 1999-2015, International Business Machines Corporation and

4 * others. All Rights Reserved.

5 ********************************************************************/

6 /************************************************************************

7 * Date Name Description

8 * 12/15/99 Madhu Creation.

9 * 01/12/2000 Madhu Updated for changed API and added new tests

10 ************************************************************************/

11

12 #include "utypeinfo.h" // for 'typeid' to work

13

14 #include "unicode/utypes.h"

15

16 #if !UCONFIG_NO_BREAK_ITERATION

17

18 #include "unicode/utypes.h"

19 #include "unicode/brkiter.h"

20 #include "unicode/rbbi.h"

21 #include "unicode/uchar.h"

22 #include "unicode/utf16.h"

23 #include "unicode/ucnv.h"

24 #include "unicode/schriter.h"

25 #include "unicode/uniset.h"

26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

27 #include "unicode/regex.h"

28 #endif

29 #include "unicode/ustring.h"

30 #include "unicode/utext.h"

31 #include "intltest.h"

32 #include "rbbitst.h"

33 #include <string.h>

34 #include "charstr.h"

35 #include "uvector.h"

36 #include "uvectr32.h"

37 #include <stdio.h>

38 #include <stdlib.h>

39 #include "unicode/numfmt.h"

40 #include "unicode/uscript.h"

41 #include "cmemory.h"

42

43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION

44 #include "unicode/filteredbrk.h"

45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION

46

// TEST_ASSERT(x): if the condition x is false, report a failure through
// errln(), tagged with the source file and line of the macro invocation.
// errln() is called unqualified, so it must be callable where the macro expands.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it
// through errcheckln() together with the source location and the error name.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}

52

53

54 //---------------------------------------------

55 // runIndexedTest

56 //---------------------------------------------

57

58

59 // Note: Before adding new tests to this file, check whether the desired test data can

60 // simply be added to the file testdata/rbbitest.txt. In most cases it can,

61 // it's much less work than writing a new test, diagnostic output in the event of failures

62 // is good, and the test data file will is shared with ICU4J, so eventua lly the test

63 // will run there as well, without additional effort.

64

65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha r* params )

66 {

67 if (exec) logln("TestSuite RuleBasedBreakIterator: ");

68

69 switch (index) {

70 #if !UCONFIG_NO_FILE_IO

71 case 0: name = "TestBug4153072";

72 if(exec) TestBug4153072(); break;

73 #else

74 case 0: name = "skip";

75 break;

76 #endif

77

78 case 1: name = "skip";

79 break;

80 case 2: name = "TestStatusReturn";

81 if(exec) TestStatusReturn(); break;

82

83 #if !UCONFIG_NO_FILE_IO

84 case 3: name = "TestUnicodeFiles";

85 if(exec) TestUnicodeFiles(); break;

86 case 4: name = "TestEmptyString";

87 if(exec) TestEmptyString(); break;

88 #else

89 case 3: case 4: name = "skip";

90 break;

91 #endif

92

93 case 5: name = "TestGetAvailableLocales";

94 if(exec) TestGetAvailableLocales(); break;

95

96 case 6: name = "TestGetDisplayName";

97 if(exec) TestGetDisplayName(); break;

98

99 #if !UCONFIG_NO_FILE_IO

100 case 7: name = "TestEndBehaviour";

101 if(exec) TestEndBehaviour(); break;

102 case 8: case 9: case 10: name = "skip";

103 break;

104 case 11: name = "TestWordBreaks";

105 if(exec) TestWordBreaks(); break;

106 case 12: name = "TestWordBoundary";

107 if(exec) TestWordBoundary(); break;

108 case 13: name = "TestLineBreaks";

109 if(exec) TestLineBreaks(); break;

110 case 14: name = "TestSentBreaks";

111 if(exec) TestSentBreaks(); break;

112 case 15: name = "TestExtended";

113 if(exec) TestExtended(); break;

114 #else

115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: cas e 15: name = "skip";

116 break;

117 #endif

118

119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO

120 case 16:

121 name = "TestMonkey"; if(exec) TestMonkey(params); break;

122 #else

123 case 16:

124 name = "skip"; break;

125 #endif

126

127 #if !UCONFIG_NO_FILE_IO

128 case 17: name = "TestBug3818";

129 if(exec) TestBug3818(); break;

130 #else

131 case 17: name = "skip";

132 break;

133 #endif

134

135 case 18: name = "skip";

136 break;

137 case 19: name = "TestDebug";

138 if(exec) TestDebug(); break;

139 case 20: name = "skip";

140 break;

141

142 #if !UCONFIG_NO_FILE_IO

143 case 21: name = "TestBug5775";

144 if (exec) TestBug5775(); break;

145 #else

146 case 21: name = "skip";

147 break;

148 #endif

149

150 case 22: name = "TestBug9983";

151 if (exec) TestBug9983(); break;

152 case 23: name = "TestDictRules";

153 if (exec) TestDictRules(); break;

154 case 24: name = "TestBug5532";

155 if (exec) TestBug5532(); break;

156 default: name = ""; break; //needed to end loop

157 }

158 }

159

160

161 //---------------------------------------------------------------------------

162 //

163 // class BITestData Holds a set of Break iterator test data and results

164 // Includes

165 // - the string data to be broken

166 // - a vector of the expected break positions.

167 // - a vector of source line numbers for the data,

168 // (to help see where errors occured.)

169 // - The expected break tag values.

170 // - Vectors of actual break positions and tag values.

171 // - Functions for comparing actual with expected and

172 // reporting errors.

173 //

174 //----------------------------------------------------------------------------

175 class BITestData {

176 public:

177 UnicodeString fDataToBreak;

178 UVector fExpectedBreakPositions;

179 UVector fExpectedTags;

180 UVector fLineNum;

181 UVector fActualBreakPositions; // Test Results.

182 UVector fActualTags;

183

184 BITestData(UErrorCode &status);

185 void addDataChunk(const char *data, int32_t tag, int32_t lineNum , UErrorCode status);

186 void checkResults(const char heading, RBBITest test);

187 void err(const char heading, RBBITest test, int32_t expectedId x, int32_t actualIdx);

188 void clearResults();

189 };

190

191 //

192 // Constructor.

193 //

194 BITestData::BITestData(UErrorCode &status)

195 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fAc tualBreakPositions(status),

196 fActualTags(status)

197 {

198 }

199

200 //

201 // addDataChunk. Add a section (non-breaking) piece if data to the test data.

202 // The macro form collects the line number, which is helpful

203 // when tracking down failures.

204 //

205 // A null data item is inserted at the start of each test's data

206 // to put the starting zero into the data list. The position s aved for

207 // each non-null item is its ending position.

208 //

209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE __, status);

210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UE rrorCode status) {

211 if (U_FAILURE(status)) {return;}

212 if (data != NULL) {

213 fDataToBreak.append(CharsToUnicodeString(data));

214 }

215 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);

216 fExpectedTags.addElement(tag, status);

217 fLineNum.addElement(lineNum, status);

218 }

219

220

221 //

222 // checkResults. Compare the actual and expected break positions, report any differences.

223 //

224 void BITestData::checkResults(const char heading, RBBITest test) {

225 int32_t expectedIndex = 0;

226 int32_t actualIndex = 0;

227

228 for (;;) {

229 // If we've run through both the expected and actual results vectors, we 're done.

230 // break out of the loop.

231 if (expectedIndex >= fExpectedBreakPositions.size() &&

232 actualIndex >= fActualBreakPositions.size()) {

233 break;

234 }

235

236

237 if (expectedIndex >= fExpectedBreakPositions.size()) {

238 err(heading, test, expectedIndex-1, actualIndex);

239 actualIndex++;

240 continue;

241 }

242

243 if (actualIndex >= fActualBreakPositions.size()) {

244 err(heading, test, expectedIndex, actualIndex-1);

245 expectedIndex++;

246 continue;

247 }

248

249 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPosit ions.elementAti(expectedIndex)) {

250 err(heading, test, expectedIndex, actualIndex);

251 // Try to resync the positions of the indices, to avoid a rash of sp urious erros.

252 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPo sitions.elementAti(expectedIndex)) {

253 actualIndex++;

254 } else {

255 expectedIndex++;

256 }

257 continue;

258 }

259

260 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expe ctedIndex)) {

261 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",

262 heading, fLineNum.elementAt(expectedIndex),

263 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti( actualIndex));

264 }

265

266 actualIndex++;

267 expectedIndex++;

268 }

269 }

270

271 //

272 // err - An error was found. Report it, along with information about where the

273 // incorrectly broken test data appeared in the s ource file.

274 //

275 void BITestData::err(const char heading, RBBITest test, int32_t expectedIdx , int32_t actualIdx)

276 {

277 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);

278 int32_t actual = fActualBreakPositions.elementAti(actualIdx);

279 int32_t o = 0;

280 int32_t line = fLineNum.elementAti(expectedIdx);

281 if (expectedIdx > 0) {

282 // The line numbers are off by one because a premature break occurs some where

283 // within the previous item, rather than at the start of the current (expected) item.

284 // We want to report the offset of the unexpected break from the star t of

285 // this previous item.

286 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);

287 }

288 if (actual < expected) {

289 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);

290 } else {

291 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);

292 }

293 }

294

295

296 void BITestData::clearResults() {

297 fActualBreakPositions.removeAllElements();

298 fActualTags.removeAllElements();

299 }

300

301

302 //------------------------------------------------------------------------------ --------

303 //

304 // RBBITest constructor and destructor

305 //

306 //------------------------------------------------------------------------------ --------

307

308 RBBITest::RBBITest() {

309 }

310

311

312 RBBITest::~RBBITest() {

313 }

314

315 //------------------------------------------------------------------------------ -----

316 //

317 // Test for status {tag} return value from break rules.

318 // TODO: a more thorough test.

319 //

320 //------------------------------------------------------------------------------ -----

321 void RBBITest::TestStatusReturn() {

322 UnicodeString rulesString1("$Letters = [:L:];\n"

323 "$Numbers = [:N:];\n"

324 "$Letters+{1};\n"

325 "$Numbers+{2};\n"

326 "Help\\ {4}/me\\!;\n"

327 "[^$Letters $Numbers];\n"

328 "!.*;\n", -1, US_INV);

329 UnicodeString testString1 = "abc123..abc Help me Help me!";

330 // 01234567890123456789012345678

331 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, - 1};

332 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, - 1};

333

334 UErrorCode status=U_ZERO_ERROR;

335 UParseError parseError;

336

337 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, st atus);

338 if(U_FAILURE(status)) {

339 dataerrln("FAIL : in construction - %s", u_errorName(status));

340 } else {

341 int32_t pos;

342 int32_t i = 0;

343 bi->setText(testString1);

344 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {

345 if (pos != bounds1[i]) {

346 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos) ;

347 break;

348 }

349

350 int tag = bi->getRuleStatus();

351 if (tag != brkStatus[i]) {

352 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);

353 break;

354 }

355 i++;

356 }

357 }

358 delete bi;

359 }

360

361

362 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {

363 UErrorCode status = U_ZERO_ERROR;

364 char name[100];

365 printf("code alpha extend alphanum type word sent line name\n");

366 int nextExpectedIndex = 0;

367 utext_setNativeIndex(tstr, 0);

368 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {

369 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex ] ) {

370 printf("------------------------------------------------ %d\n", j);

371 ++nextExpectedIndex;

372 }

373

374 UChar32 c = utext_next32(tstr);

375 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);

376 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,

377 u_isUAlphabetic(c),

378 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),

379 u_isalnum(c),

380 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,

381 u_charType(c),

382 U_SHORT_PROPERTY_NAME),

383 u_getPropertyValueName(UCHAR_WORD_BREAK,

384 u_getIntPropertyValue(c,

385 UCHAR_WORD_BREAK),

386 U_SHORT_PROPERTY_NAME),

387 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,

388 u_getIntPropertyValue(c,

389 UCHAR_SENTENCE_BREAK),

390 U_SHORT_PROPERTY_NAME),

391 u_getPropertyValueName(UCHAR_LINE_BREAK,

392 u_getIntPropertyValue(c,

393 UCHAR_LINE_BREAK),

394 U_SHORT_PROPERTY_NAME),

395 name);

396 }

397 }

398

399

400 static void printStringBreaks(const UnicodeString &ustr, int expected[], int exp ectedCount) {

401 UErrorCode status = U_ZERO_ERROR;

402 UText *tstr = NULL;

403 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);

404 if (U_FAILURE(status)) {

405 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));

406 return;

407 }

408 printStringBreaks(tstr, expected, expectedCount);

409 utext_close(tstr);

410 }

411

412

413 void RBBITest::TestBug3818() {

414 UErrorCode status = U_ZERO_ERROR;

415

416 // Four Thai words...

417 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0 x0E2B,0x0E0D,0x0E48,

418 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0 x0E2B,0x0E0D,0x0E48, 0 };

419 UnicodeString thaiStr(thaiWordData);

420

421 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);

422 if (U_FAILURE(status) \|\| bi == NULL) {

423 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __ LINE__, u_errorName(status));

424 return;

425 }

426 bi->setText(thaiStr);

427

428 int32_t startOfSecondWord = bi->following(1);

429 if (startOfSecondWord != 4) {

430 errln("Fail at file %s, line %d expected start of word at 4, got %d",

431 __FILE__, __LINE__, startOfSecondWord);

432 }

433 startOfSecondWord = bi->following(0);

434 if (startOfSecondWord != 4) {

435 errln("Fail at file %s, line %d expected start of word at 4, got %d",

436 __FILE__, __LINE__, startOfSecondWord);

437 }

438 delete bi;

439 }

440

441 //----------------------------------------------------------------------------

442 //

443 // generalIteratorTest Given a break iterator and a set of test data,

444 // Run the tests and report the results.

445 //

446 //----------------------------------------------------------------------------

447 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)

448 {

449

450 bi.setText(td.fDataToBreak);

451

452 testFirstAndNext(bi, td);

453

454 testLastAndPrevious(bi, td);

455

456 testFollowing(bi, td);

457 testPreceding(bi, td);

458 testIsBoundary(bi, td);

459 doMultipleSelectionTest(bi, td);

460 }

461

462

463 //

464 // testFirstAndNext. Run the iterator forwards in the obvious first(), next( )

465 // kind of loop.

466 //

467 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)

468 {

469 UErrorCode status = U_ZERO_ERROR;

470 int32_t p;

471 int32_t lastP = -1;

472 int32_t tag;

473

474 logln("Test first and next");

475 bi.setText(td.fDataToBreak);

476 td.clearResults();

477

478 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {

479 td.fActualBreakPositions.addElement(p, status); // Save result.

480 tag = bi.getRuleStatus();

481 td.fActualTags.addElement(tag, status);

482 if (p <= lastP) {

483 // If the iterator is not making forward progress, stop.

484 // No need to raise an error here, it'll be detected in the normal check of results.

485 break;

486 }

487 lastP = p;

488 }

489 td.checkResults("testFirstAndNext", this);

490 }

491

492

493 //

494 // TestLastAndPrevious. Run the iterator backwards, starting with last().

495 //

496 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)

497 {

498 UErrorCode status = U_ZERO_ERROR;

499 int32_t p;

500 int32_t lastP = 0x7ffffffe;

501 int32_t tag;

502

503 logln("Test last and previous");

504 bi.setText(td.fDataToBreak);

505 td.clearResults();

506

507 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {

508 // Save break position. Insert it at start of vector of results, shovin g

509 // already-saved results further towards the end.

510 td.fActualBreakPositions.insertElementAt(p, 0, status);

511 // bi.previous(); // TODO: Why does this fix things up????

512 // bi.next();

513 tag = bi.getRuleStatus();

514 td.fActualTags.insertElementAt(tag, 0, status);

515 if (p >= lastP) {

516 // If the iterator is not making progress, stop.

517 // No need to raise an error here, it'll be detected in the normal check of results.

518 break;

519 }

520 lastP = p;

521 }

522 td.checkResults("testLastAndPrevious", this);

523 }

524

525

526 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)

527 {

528 UErrorCode status = U_ZERO_ERROR;

529 int32_t p;

530 int32_t tag;

531 int32_t lastP = -2; // A value that will never be returned as a bre ak position.

532 // cannot be -1; that is returned for DONE.

533 int i;

534

535 logln("testFollowing():");

536 bi.setText(td.fDataToBreak);

537 td.clearResults();

538

539 // Save the starting point, since we won't get that out of following.

540 p = bi.first();

541 td.fActualBreakPositions.addElement(p, status); // Save result.

542 tag = bi.getRuleStatus();

543 td.fActualTags.addElement(tag, status);

544

545 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {

546 p = bi.following(i);

547 if (p != lastP) {

548 if (p == RuleBasedBreakIterator::DONE) {

549 break;

550 }

551 // We've reached a new break position. Save it.

552 td.fActualBreakPositions.addElement(p, status); // Save result.

553 tag = bi.getRuleStatus();

554 td.fActualTags.addElement(tag, status);

555 lastP = p;

556 }

557 }

558 // The loop normally exits by means of the break in the middle.

559 // Make sure that the index was at the correct position for the break iterat or to have

560 // returned DONE.

561 if (i != td.fDataToBreak.length()) {

562 errln("testFollowing(): iterator returned DONE prematurely.");

563 }

564

565 // Full check of all results.

566 td.checkResults("testFollowing", this);

567 }

568

569

570

571 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {

572 UErrorCode status = U_ZERO_ERROR;

573 int32_t p;

574 int32_t tag;

575 int32_t lastP = 0x7ffffffe;

576 int i;

577

578 logln("testPreceding():");

579 bi.setText(td.fDataToBreak);

580 td.clearResults();

581

582 p = bi.last();

583 td.fActualBreakPositions.addElement(p, status);

584 tag = bi.getRuleStatus();

585 td.fActualTags.addElement(tag, status);

586

587 for (i = td.fDataToBreak.length(); i>=-1; i--) {

588 p = bi.preceding(i);

589 if (p != lastP) {

590 if (p == RuleBasedBreakIterator::DONE) {

591 break;

592 }

593 // We've reached a new break position. Save it.

594 td.fActualBreakPositions.insertElementAt(p, 0, status);

595 lastP = p;

596 tag = bi.getRuleStatus();

597 td.fActualTags.insertElementAt(tag, 0, status);

598 }

599 }

600 // The loop normally exits by means of the break in the middle.

601 // Make sure that the index was at the correct position for the break iterat or to have

602 // returned DONE.

603 if (i != 0) {

604 errln("testPreceding(): iterator returned DONE prematurely.");

605 }

606

607 // Full check of all results.

608 td.checkResults("testPreceding", this);

609 }

610

611

612

613 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {

614 UErrorCode status = U_ZERO_ERROR;

615 int i;

616 int32_t tag;

617

618 logln("testIsBoundary():");

619 bi.setText(td.fDataToBreak);

620 td.clearResults();

621

622 for (i = 0; i <= td.fDataToBreak.length(); i++) {

623 if (bi.isBoundary(i)) {

624 td.fActualBreakPositions.addElement(i, status); // Save result.

625 tag = bi.getRuleStatus();

626 td.fActualTags.addElement(tag, status);

627 }

628 }

629 td.checkResults("testIsBoundary: ", this);

630 }

631

632

633

634 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD ata &td)

635 {

636 iterator.setText(td.fDataToBreak);

637

638 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clon e();

639 int32_t offset = iterator.first();

640 int32_t testOffset;

641 int32_t count = 0;

642

643 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length() );

644

645 if (*testIterator != iterator)

646 errln("clone() or operator!= failed: two clones compared unequal");

647

648 do {

649 testOffset = testIterator->first();

650 testOffset = testIterator->next(count);

651 if (offset != testOffset)

652 errln(UnicodeString("next(n) and next() not returning consistent res ults: for step ") + count + ", next(n) returned " + testOffset + " and next() ha d " + offset);

653

654 if (offset != RuleBasedBreakIterator::DONE) {

655 count++;

656 offset = iterator.next();

657

658 if (offset != RuleBasedBreakIterator::DONE && *testIterator == itera tor) {

659 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);

660 if (count > 10000 \|\| offset == -1) {

661 errln("operator== failed too many times. Stopping test.");

662 if (offset == -1) {

663 errln("Does (RuleBasedBreakIterator::DONE == -1)?");

664 }

665 return;

666 }

667 }

668 }

669 } while (offset != RuleBasedBreakIterator::DONE);

670

671 // now do it backwards...

672 offset = iterator.last();

673 count = 0;

674

675 do {

676 testOffset = testIterator->last();

677 testOffset = testIterator->next(count); // next() with a negative arg is same as previous

678 if (offset != testOffset)

679 errln(UnicodeString("next(n) and next() not returning consistent res ults: for step ") + count + ", next(n) returned " + testOffset + " and next() ha d " + offset);

680

681 if (offset != RuleBasedBreakIterator::DONE) {

682 count--;

683 offset = iterator.previous();

684 }

685 } while (offset != RuleBasedBreakIterator::DONE);

686

687 delete testIterator;

688 }

689

690

691 //---------------------------------------------

692 //

693 // other tests

694 //

695 //---------------------------------------------

696 void RBBITest::TestEmptyString()

697 {

698 UnicodeString text = "";

699 UErrorCode status = U_ZERO_ERROR;

700

701 BITestData x(status);

702 ADD_DATACHUNK(x, "", 0, status); // Break at start of data

703 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::create LineInstance(Locale::getDefault(), status);

704 if (U_FAILURE(status))

705 {

706 errcheckln(status, "Failed to create the BreakIterator for default local e in TestEmptyString. - %s", u_errorName(status));

707 return;

708 }

709 generalIteratorTest(*bi, x);

710 delete bi;

711 }

712

713 void RBBITest::TestGetAvailableLocales()

714 {

715 int32_t locCount = 0;

716 const Locale* locList = BreakIterator::getAvailableLocales(locCount);

717

718 if (locCount == 0)

719 dataerrln("getAvailableLocales() returned an empty list!");

720 // Just make sure that it's returning good memory.

721 int32_t i;

722 for (i = 0; i < locCount; ++i) {

723 logln(locList[i].getName());

724 }

725 }

726

727 //Testing the BreakIterator::getDisplayName() function

728 void RBBITest::TestGetDisplayName()

729 {

730 UnicodeString result;

731

732 BreakIterator::getDisplayName(Locale::getUS(), result);

733 if (Locale::getDefault() == Locale::getUS() && result != "English (United St ates)")

734 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (U nited States)\", got \""

735 + result);

736

737 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);

738 if (result != "French (France)")

739 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (Fr ance)\", got \""

740 + result);

741 }

742 /**

743 * Test End Behaviour

744 * @bug 4068137

745 */

746 void RBBITest::TestEndBehaviour()

747 {

748 UErrorCode status = U_ZERO_ERROR;

749 UnicodeString testString("boo.");

750 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);

751 if (U_FAILURE(status))

752 {

753 errcheckln(status, "Failed to create the BreakIterator for default local e in TestEndBehaviour. - %s", u_errorName(status));

754 return;

755 }

756 wb->setText(testString);

757

758 if (wb->first() != 0)

759 errln("Didn't get break at beginning of string.");

760 if (wb->next() != 3)

761 errln("Didn't get break before period in \"boo.\"");

762 if (wb->current() != 4 && wb->next() != 4)

763 errln("Didn't get break at end of string.");

764 delete wb;

765 }

766 /*

767 * @bug 4153072

768 */

769 void RBBITest::TestBug4153072() {

770 UErrorCode status = U_ZERO_ERROR;

771 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault() , status);

772 if (U_FAILURE(status))

773 {

774 errcheckln(status, "Failed to create the BreakIterator for default local e in TestBug4153072 - %s", u_errorName(status));

775 return;

776 }

777 UnicodeString str("...Hello, World!...");

778 int32_t begin = 3;

779 int32_t end = str.length() - 3;

780 UBool onBoundary;

781

782 StringCharacterIterator* textIterator = new StringCharacterIterator(str, beg in, end, begin);

783 iter->adoptText(textIterator);

784 int index;

785 // Note: with the switch to UText, there is no way to restrict the

786 // iteration range to begin at an index other than zero.

787 // String character iterators created with a non-zero bound are

788 // treated by RBBI as being empty.

789 for (index = -1; index < begin + 1; ++index) {

790 onBoundary = iter->isBoundary(index);

791 if (index == 0? !onBoundary : onBoundary) {

792 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +

793 " and begin index = " + begin);

794 }

795 }

796 delete iter;

797 }

798

799

800 //

801 // Test for problem reported by Ashok Matoria on 9 July 2007

802 // One.<kSoftHyphen><kSpace>Two.

803 //

804 // Sentence break at start (0) and then on calling next() it breaks at

805 // 'T' of "Two". Now, at this point if I do next() and

806 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".

807 //

808 void RBBITest::TestBug5775() {

809 UErrorCode status = U_ZERO_ERROR;

810 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish (), status);

811 TEST_ASSERT_SUCCESS(status);

812 if (U_FAILURE(status)) {

813 return;

814 }

815 // Check for status first for better handling of no data errors.

816 TEST_ASSERT(bi != NULL);

817 if (bi == NULL) {

818 return;

819 }

820

821 UnicodeString s("One.\\u00ad Two.", -1, US_INV);

822 // 01234 56789

823 s = s.unescape();

824 bi->setText(s);

825 int pos = bi->next();

826 TEST_ASSERT(pos == 6);

827 pos = bi->next();

828 TEST_ASSERT(pos == 10);

829 pos = bi->previous();

830 TEST_ASSERT(pos == 6);

831 delete bi;

832 }

833

834

835

836 //------------------------------------------------------------------------------

837 //

838 // RBBITest::Extended Run RBBI Tests from an external test data file

839 //

840 //------------------------------------------------------------------------------

841

842 struct TestParams {

843 BreakIterator *bi; // Break iterator is set while parsin g test source.

844 // Changed out whenever test data c hanges break type.

845

846 UnicodeString dataToBreak; // Data that is built up while parsin g the test.

847 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.

848 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.

849 UVector32 *srcCol;

850

851 UText *textToBreak; // UText, could be UTF8 or UTF16.

852 UVector32 *textMap; // Map from UTF-16 dataToBreak offset s to UText offsets.

853 CharString utf8String; // UTF-8 form of text to break.

854

855 TestParams(UErrorCode &status) : dataToBreak() {

856 bi = NULL;

857 expectedBreaks = new UVector32(status);

858 srcLine = new UVector32(status);

859 srcCol = new UVector32(status);

860 textToBreak = NULL;

861 textMap = new UVector32(status);

862 }

863

864 ~TestParams() {

865 delete bi;

866 delete expectedBreaks;

867 delete srcLine;

868 delete srcCol;

869 utext_close(textToBreak);

870 delete textMap;

871 }

872

873 int32_t getSrcLine(int32_t bp);

874 int32_t getExpectedBreak(int32_t bp);

875 int32_t getSrcCol(int32_t bp);

876

877 void setUTF16(UErrorCode &status);

878 void setUTF8(UErrorCode &status);

879 };

880

881 // Append a UnicodeString to a CharString with UTF-8 encoding.

882 // Substitute any invalid chars.

883 // Note: this is used with test data that includes a few unpaired surrogates i n the UTF-16 that will be substituted.

884 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorC ode &status) {

885 if (U_FAILURE(status)) {

886 return;

887 }

888 int32_t utf8Length;

889 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL f or preflight.

890 src.getBuffer(), src.length(), // UTF-16 data

891 0xfffd, NULL, // Substitution char, nu mber of subs.

892 &status);

893 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {

894 return;

895 }

896 status = U_ZERO_ERROR;

897 int32_t capacity;

898 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status );

899 u_strToUTF8WithSub(buffer, utf8Length, NULL,

900 src.getBuffer(), src.length(),

901 0xfffd, NULL, &status);

902 dest.append(buffer, utf8Length, status);

903 }

904

905

906 void TestParams::setUTF16(UErrorCode &status) {

907 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);

908 textMap->removeAllElements();

909 for (int32_t i=0; i<dataToBreak.length(); i++) {

910 if (i == dataToBreak.getChar32Start(i)) {

911 textMap->addElement(i, status);

912 } else {

913 textMap->addElement(-1, status);

914 }

915 }

916 textMap->addElement(dataToBreak.length(), status);

917 U_ASSERT(dataToBreak.length() + 1 == textMap->size());

918 }

919

920

921 void TestParams::setUTF8(UErrorCode &status) {

922 if (U_FAILURE(status)) {

923 return;

924 }

925 utf8String.clear();

926 CharStringAppend(utf8String, dataToBreak, status);

927 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.leng th(), &status);

928 if (U_FAILURE(status)) {

929 return;

930 }

931

932 textMap->removeAllElements();

933 int32_t utf16Index = 0;

934 for (;;) {

935 textMap->addElement(utf16Index, status);

936 UChar32 c32 = utext_current32(textToBreak);

937 if (c32 < 0) {

938 break;

939 }

940 utf16Index += U16_LENGTH(c32);

941 utext_next32(textToBreak);

942 while (textMap->size() < utext_getNativeIndex(textToBreak)) {

943 textMap->addElement(-1, status);

944 }

945 }

946 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());

947 }

948

949

950 int32_t TestParams::getSrcLine(int bp) {

951 if (bp >= textMap->size()) {

952 bp = textMap->size() - 1;

953 }

954 int32_t i = 0;

955 for(; bp >= 0 ; --bp) {

956 // Move to a character boundary if we are not on one already.

957 i = textMap->elementAti(bp);

958 if (i >= 0) {

959 break;

960 }

961 }

962 return srcLine->elementAti(i);

963 }

964

965

966 int32_t TestParams::getExpectedBreak(int bp) {

967 if (bp >= textMap->size()) {

968 return 0;

969 }

970 int32_t i = textMap->elementAti(bp);

971 int32_t retVal = 0;

972 if (i >= 0) {

973 retVal = expectedBreaks->elementAti(i);

974 }

975 return retVal;

976 }

977

978

979 int32_t TestParams::getSrcCol(int bp) {

980 if (bp >= textMap->size()) {

981 bp = textMap->size() - 1;

982 }

983 int32_t i = 0;

984 for(; bp >= 0; --bp) {

985 // Move bp to a character boundary if we are not on one already.

986 i = textMap->elementAti(bp);

987 if (i >= 0) {

988 break;

989 }

990 }

991 return srcCol->elementAti(i);

992 }

993

994

995 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {

996 int32_t bp;

997 int32_t prevBP;

998 int32_t i;

999

1000 TEST_ASSERT_SUCCESS(status);

1001 if (U_FAILURE(status)) {

1002 return;

1003 }

1004

1005 if (t->bi == NULL) {

1006 return;

1007 }

1008

1009 t->bi->setText(t->textToBreak, status);

1010 //

1011 // Run the iterator forward

1012 //

1013 prevBP = -1;

1014 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {

1015 if (prevBP == bp) {

1016 // Fail for lack of forward progress.

1017 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",

1018 bp, t->getSrcLine(bp), t->getSrcCol(bp));

1019 break;

1020 }

1021

1022 // Check that there we didn't miss an expected break between the last on e

1023 // and this one.

1024 for (i=prevBP+1; i<bp; i++) {

1025 if (t->getExpectedBreak(i) != 0) {

1026 int expected[] = {0, i};

1027 printStringBreaks(t->dataToBreak, expected, 2);

1028 errln("Forward Iteration, break expected, but not found. Pos=%4 d File line,col= %4d,%4d",

1029 i, t->getSrcLine(i), t->getSrcCol(i));

1030 }

1031 }

1032

1033 // Check that the break we did find was expected

1034 if (t->getExpectedBreak(bp) == 0) {

1035 int expected[] = {0, bp};

1036 printStringBreaks(t->textToBreak, expected, 2);

1037 errln("Forward Iteration, break found, but not expected. Pos=%4d F ile line,col= %4d,%4d",

1038 bp, t->getSrcLine(bp), t->getSrcCol(bp));

1039 } else {

1040 // The break was expected.

1041 // Check that the {nnn} tag value is correct.

1042 int32_t expectedTagVal = t->getExpectedBreak(bp);

1043 if (expectedTagVal == -1) {

1044 expectedTagVal = 0;

1045 }

1046 int32_t line = t->getSrcLine(bp);

1047 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();

1048 if (rs != expectedTagVal) {

1049 errln("Incorrect status for forward break. Pos=%4d File line,c ol= %4d,%4d.\n"

1050 " Actual, Expected status = %4d, %4d",

1051 bp, line, t->getSrcCol(bp), rs, expectedTagVal);

1052 }

1053 }

1054

1055 prevBP = bp;

1056 }

1057

1058 // Verify that there were no missed expected breaks after the last one found

1059 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {

1060 if (t->getExpectedBreak(i) != 0) {

1061 errln("Forward Iteration, break expected, but not found. Pos=%4d F ile line,col= %4d,%4d",

1062 i, t->getSrcLine(i), t->getSrcCol(i));

1063 }

1064 }

1065

1066 //

1067 // Run the iterator backwards, verify that the same breaks are found.

1068 //

1069 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.

1070 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {

1071 if (prevBP == bp) {

1072 // Fail for lack of progress.

1073 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col =%4d,%4d",

1074 bp, t->getSrcLine(bp), t->getSrcCol(bp));

1075 break;

1076 }

1077

1078 // Check that we didn't miss an expected break between the last one

1079 // and this one. (UVector returns zeros for index out of bounds.)

1080 for (i=prevBP-1; i>bp; i--) {

1081 if (t->getExpectedBreak(i) != 0) {

1082 errln("Reverse Iteration, break expected, but not found. Pos=%4 d File line,col= %4d,%4d",

1083 i, t->getSrcLine(i), t->getSrcCol(i));

1084 }

1085 }

1086

1087 // Check that the break we did find was expected

1088 if (t->getExpectedBreak(bp) == 0) {

1089 errln("Reverse Itertion, break found, but not expected. Pos=%4d Fi le line,col= %4d,%4d",

1090 bp, t->getSrcLine(bp), t->getSrcCol(bp));

1091 } else {

1092 // The break was expected.

1093 // Check that the {nnn} tag value is correct.

1094 int32_t expectedTagVal = t->getExpectedBreak(bp);

1095 if (expectedTagVal == -1) {

1096 expectedTagVal = 0;

1097 }

1098 int line = t->getSrcLine(bp);

1099 int32_t rs = t->bi->getRuleStatus();

1100 if (rs != expectedTagVal) {

1101 errln("Incorrect status for reverse break. Pos=%4d File line,c ol= %4d,%4d.\n"

1102 " Actual, Expected status = %4d, %4d",

1103 bp, line, t->getSrcCol(bp), rs, expectedTagVal);

1104 }

1105 }

1106

1107 prevBP = bp;

1108 }

1109

1110 // Verify that there were no missed breaks prior to the last one found

1111 for (i=prevBP-1; i>=0; i--) {

1112 if (t->getExpectedBreak(i) != 0) {

1113 errln("Forward Itertion, break expected, but not found. Pos=%4d Fi le line,col= %4d,%4d",

1114 i, t->getSrcLine(i), t->getSrcCol(i));

1115 }

1116 }

1117

1118 // Check isBoundary()

1119 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {

1120 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);

1121 UBool boundaryFound = t->bi->isBoundary(i);

1122 if (boundaryExpected != boundaryFound) {

1123 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"

1124 " Expected, Actual= %s, %s",

1125 i, t->getSrcLine(i), t->getSrcCol(i),

1126 boundaryExpected ? "true":"false", boundaryFound? "true" : "fa lse");

1127 }

1128 }

1129

1130 // Check following()

1131 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {

1132 int32_t actualBreak = t->bi->following(i);

1133 int32_t expectedBreak = BreakIterator::DONE;

1134 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {

1135 if (t->getExpectedBreak(j) != 0) {

1136 expectedBreak = j;

1137 break;

1138 }

1139 }

1140 if (expectedBreak != actualBreak) {

1141 errln("following(%d) incorrect. File line,col= %4d,%4d\n"

1142 " Expected, Actual= %d, %d",

1143 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBre ak);

1144 }

1145 }

1146

1147 // Check preceding()

1148 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {

1149 int32_t actualBreak = t->bi->preceding(i);

1150 int32_t expectedBreak = BreakIterator::DONE;

1151

1152 // For UTF-8 & UTF-16 supplementals, all code units of a character are e quivalent.

1153 // preceding(trailing byte) will return the index of some preceding code point,

1154 // not the lead byte of the current code point, even though that has a s maller index.

1155 // Therefore, start looking at the expected break data not at i-1, but a t

1156 // the start of code point index - 1.

1157 utext_setNativeIndex(t->textToBreak, i);

1158 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;

1159 for (; j >= 0; j--) {

1160 if (t->getExpectedBreak(j) != 0) {

1161 expectedBreak = j;

1162 break;

1163 }

1164 }

1165 if (expectedBreak != actualBreak) {

1166 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"

1167 " Expected, Actual= %d, %d",

1168 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBre ak);

1169 }

1170 }

1171 }

1172

1173

1174 void RBBITest::TestExtended() {

1175 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

1176 UErrorCode status = U_ZERO_ERROR;

1177 Locale locale("");

1178

1179 UnicodeString rules;

1180 TestParams tp(status);

1181

1182 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale ([\\p{L}\\p{ Nd}_@&=-]) *>"), 0, status);

1183 if (U_FAILURE(status)) {

1184 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LI NE__, u_errorName(status));

1185 }

1186

1187

1188 //

1189 // Open and read the test data file.

1190 //

1191 const char *testDataDirectory = IntlTest::getSourceTestData(status);

1192 char testFileName[1000];

1193 if (testDataDirectory == NULL \|\| strlen(testDataDirectory) >= sizeof(testFil eName)) {

1194 errln("Can't open test data. Path too long.");

1195 return;

1196 }

1197 strcpy(testFileName, testDataDirectory);

1198 strcat(testFileName, "rbbitst.txt");

1199

1200 int len;

1201 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);

1202 if (U_FAILURE(status)) {

1203 return; /* something went wrong, error already output */

1204 }

1205

1206

1207 bool skipTest = false; // Skip this test?

1208

1209 //

1210 // Put the test data into a UnicodeString

1211 //

1212 UnicodeString testString(FALSE, testFile, len);

1213

1214 enum EParseState{

1215 PARSE_COMMENT,

1216 PARSE_TAG,

1217 PARSE_DATA,

1218 PARSE_NUM

1219 }

1220 parseState = PARSE_TAG;

1221

1222 EParseState savedState = PARSE_TAG;

1223

1224 static const UChar CH_LF = 0x0a;

1225 static const UChar CH_CR = 0x0d;

1226 static const UChar CH_HASH = 0x23;

1227 /static const UChar CH_PERIOD = 0x2e;/

1228 static const UChar CH_LT = 0x3c;

1229 static const UChar CH_GT = 0x3e;

1230 static const UChar CH_BACKSLASH = 0x5c;

1231 static const UChar CH_BULLET = 0x2022;

1232

1233 int32_t lineNum = 1;

1234 int32_t colStart = 0;

1235 int32_t column = 0;

1236 int32_t charIdx = 0;

1237

1238 int32_t tagValue = 0; // The numeric value of a <nnn> tag.

1239

1240 for (charIdx = 0; charIdx < len; ) {

1241 status = U_ZERO_ERROR;

1242 UChar c = testString.charAt(charIdx);

1243 charIdx++;

1244 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {

1245 // treat CRLF as a unit

1246 c = CH_LF;

1247 charIdx++;

1248 }

1249 if (c == CH_LF \|\| c == CH_CR) {

1250 lineNum++;

1251 colStart = charIdx;

1252 }

1253 column = charIdx - colStart + 1;

1254

1255 switch (parseState) {

1256 case PARSE_COMMENT:

1257 if (c == 0x0a \|\| c == 0x0d) {

1258 parseState = savedState;

1259 }

1260 break;

1261

1262 case PARSE_TAG:

1263 {

1264 if (c == CH_HASH) {

1265 parseState = PARSE_COMMENT;

1266 savedState = PARSE_TAG;

1267 break;

1268 }

1269 if (u_isUWhiteSpace(c)) {

1270 break;

1271 }

1272 if (testString.compare(charIdx-1, 6, "<word>") == 0) {

1273 delete tp.bi;

1274 tp.bi = BreakIterator::createWordInstance(locale, status);

1275 skipTest = false;

1276 charIdx += 5;

1277 break;

1278 }

1279 if (testString.compare(charIdx-1, 6, "<char>") == 0) {

1280 delete tp.bi;

1281 tp.bi = BreakIterator::createCharacterInstance(locale, status);

1282 skipTest = false;

1283 charIdx += 5;

1284 break;

1285 }

1286 if (testString.compare(charIdx-1, 6, "<line>") == 0) {

1287 delete tp.bi;

1288 tp.bi = BreakIterator::createLineInstance(locale, status);

1289 skipTest = false;

1290 charIdx += 5;

1291 break;

1292 }

1293 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {

1294 delete tp.bi;

1295 tp.bi = BreakIterator::createSentenceInstance(locale, status);

1296 skipTest = false;

1297 charIdx += 5;

1298 break;

1299 }

1300 if (testString.compare(charIdx-1, 7, "<title>") == 0) {

1301 delete tp.bi;

1302 tp.bi = BreakIterator::createTitleInstance(locale, status);

1303 charIdx += 6;

1304 break;

1305 }

1306

1307 // <locale loc_name>

1308 localeMatcher.reset(testString);

1309 if (localeMatcher.lookingAt(charIdx-1, status)) {

1310 UnicodeString localeName = localeMatcher.group(1, status);

1311 char localeName8[100];

1312 localeName.extract(0, localeName.length(), localeName8, sizeof(l ocaleName8), 0);

1313 locale = Locale::createFromName(localeName8);

1314 charIdx += localeMatcher.group(0, status).length() - 1;

1315 TEST_ASSERT_SUCCESS(status);

1316 break;

1317 }

1318 if (testString.compare(charIdx-1, 6, "<data>") == 0) {

1319 parseState = PARSE_DATA;

1320 charIdx += 5;

1321 tp.dataToBreak = "";

1322 tp.expectedBreaks->removeAllElements();

1323 tp.srcCol ->removeAllElements();

1324 tp.srcLine->removeAllElements();

1325 break;

1326 }

1327

1328 errln("line %d: Tag expected in test file.", lineNum);

1329 parseState = PARSE_COMMENT;

1330 savedState = PARSE_DATA;

1331 goto end_test; // Stop the test.

1332 }

1333 break;

1334

1335 case PARSE_DATA:

1336 if (c == CH_BULLET) {

1337 int32_t breakIdx = tp.dataToBreak.length();

1338 tp.expectedBreaks->setSize(breakIdx+1);

1339 tp.expectedBreaks->setElementAt(-1, breakIdx);

1340 tp.srcLine->setSize(breakIdx+1);

1341 tp.srcLine->setElementAt(lineNum, breakIdx);

1342 tp.srcCol ->setSize(breakIdx+1);

1343 tp.srcCol ->setElementAt(column, breakIdx);

1344 break;

1345 }

1346

1347 if (testString.compare(charIdx-1, 7, "</data>") == 0) {

1348 // Add final entry to mappings from break location to source fil e position.

1349 // Need one extra because last break position returned is after the

1350 // last char in the data, not at the last char.

1351 tp.srcLine->addElement(lineNum, status);

1352 tp.srcCol ->addElement(column, status);

1353

1354 parseState = PARSE_TAG;

1355 charIdx += 6;

1356

1357 if (!skipTest) {

1358 // RUN THE TEST!

1359 status = U_ZERO_ERROR;

1360 tp.setUTF16(status);

1361 executeTest(&tp, status);

1362 TEST_ASSERT_SUCCESS(status);

1363

1364 // Run again, this time with UTF-8 text wrapped in a UText.

1365 status = U_ZERO_ERROR;

1366 tp.setUTF8(status);

1367 TEST_ASSERT_SUCCESS(status);

1368 executeTest(&tp, status);

1369 }

1370 break;

1371 }

1372

1373 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {

1374 // Named character, e.g. \N{COMBINING GRAVE ACCENT}

1375 // Get the code point from the name and insert it into the test data.

1376 // (Damn, no API takes names in Unicode !!!

1377 // we've got to take it back to char *)

1378 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/'}'/, char Idx);

1379 int32_t nameLength = nameEndIdx - (charIdx+2);

1380 char charNameBuf[200];

1381 UChar32 theChar = -1;

1382 if (nameEndIdx != -1) {

1383 UErrorCode status = U_ZERO_ERROR;

1384 testString.extract(charIdx+2, nameLength, charNameBuf, sizeo f(charNameBuf));

1385 charNameBuf[sizeof(charNameBuf)-1] = 0;

1386 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, & status);

1387 if (U_FAILURE(status)) {

1388 theChar = -1;

1389 }

1390 }

1391 if (theChar == -1) {

1392 errln("Error in named character in test file at line %d, col %d",

1393 lineNum, column);

1394 } else {

1395 // Named code point was recognized. Insert it

1396 // into the test data.

1397 tp.dataToBreak.append(theChar);

1398 while (tp.dataToBreak.length() > tp.srcLine->size()) {

1399 tp.srcLine->addElement(lineNum, status);

1400 tp.srcCol ->addElement(column, status);

1401 }

1402 }

1403 if (nameEndIdx > charIdx) {

1404 charIdx = nameEndIdx+1;

1405

1406 }

1407 break;

1408 }

1409

1410

1411

1412

1413 if (testString.compare(charIdx-1, 2, "<>") == 0) {

1414 charIdx++;

1415 int32_t breakIdx = tp.dataToBreak.length();

1416 tp.expectedBreaks->setSize(breakIdx+1);

1417 tp.expectedBreaks->setElementAt(-1, breakIdx);

1418 tp.srcLine->setSize(breakIdx+1);

1419 tp.srcLine->setElementAt(lineNum, breakIdx);

1420 tp.srcCol ->setSize(breakIdx+1);

1421 tp.srcCol ->setElementAt(column, breakIdx);

1422 break;

1423 }

1424

1425 if (c == CH_LT) {

1426 tagValue = 0;

1427 parseState = PARSE_NUM;

1428 break;

1429 }

1430

1431 if (c == CH_HASH && column==3) { // TODO: why is column off so fa r?

1432 parseState = PARSE_COMMENT;

1433 savedState = PARSE_DATA;

1434 break;

1435 }

1436

1437 if (c == CH_BACKSLASH) {

1438 // Check for \ at end of line, a line continuation.

1439 // Advance over (discard) the newline

1440 UChar32 cp = testString.char32At(charIdx);

1441 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) = = CH_LF) {

1442 // We have a CR LF

1443 // Need an extra increment of the input ptr to move over bo th of them

1444 charIdx++;

1445 }

1446 if (cp == CH_LF \|\| cp == CH_CR) {

1447 lineNum++;

1448 colStart = charIdx;

1449 charIdx++;

1450 break;

1451 }

1452

1453 // Let unescape handle the back slash.

1454 cp = testString.unescapeAt(charIdx);

1455 if (cp != -1) {

1456 // Escape sequence was recognized. Insert the char

1457 // into the test data.

1458 tp.dataToBreak.append(cp);

1459 while (tp.dataToBreak.length() > tp.srcLine->size()) {

1460 tp.srcLine->addElement(lineNum, status);

1461 tp.srcCol ->addElement(column, status);

1462 }

1463 break;

1464 }

1465

1466

1467 // Not a recognized backslash escape sequence.

1468 // Take the next char as a literal.

1469 // TODO: Should this be an error?

1470 c = testString.charAt(charIdx);

1471 charIdx = testString.moveIndex32(charIdx, 1);

1472 }

1473

1474 // Normal, non-escaped data char.

1475 tp.dataToBreak.append(c);

1476

1477 // Save the mapping from offset in the data to line/column numbers i n

1478 // the original input file. Will be used for better error message s only.

1479 // If there's an expected break before this char, the slot in the mapping

1480 // vector will already be set for this char; don't overwrite it.

1481 if (tp.dataToBreak.length() > tp.srcLine->size()) {

1482 tp.srcLine->addElement(lineNum, status);

1483 tp.srcCol ->addElement(column, status);

1484 }

1485 break;

1486

1487

1488 case PARSE_NUM:

1489 // We are parsing an expected numeric tag value, like <1234>,

1490 // within a chunk of data.

1491 if (u_isUWhiteSpace(c)) {

1492 break;

1493 }

1494

1495 if (c == CH_GT) {

1496 // Finished the number. Add the info to the expected break data ,

1497 // and switch parse state back to doing plain data.

1498 parseState = PARSE_DATA;

1499 if (tagValue == 0) {

1500 tagValue = -1;

1501 }

1502 int32_t breakIdx = tp.dataToBreak.length();

1503 tp.expectedBreaks->setSize(breakIdx+1);

1504 tp.expectedBreaks->setElementAt(tagValue, breakIdx);

1505 tp.srcLine->setSize(breakIdx+1);

1506 tp.srcLine->setElementAt(lineNum, breakIdx);

1507 tp.srcCol ->setSize(breakIdx+1);

1508 tp.srcCol ->setElementAt(column, breakIdx);

1509 break;

1510 }

1511

1512 if (u_isdigit(c)) {

1513 tagValue = tagValue*10 + u_charDigitValue(c);

1514 break;

1515 }

1516

1517 errln("Syntax Error in test file at line %d, col %d",

1518 lineNum, column);

1519 parseState = PARSE_COMMENT;

1520 goto end_test; // Stop the test

1521 break;

1522 }

1523

1524

1525 if (U_FAILURE(status)) {

1526 dataerrln("ICU Error %s while parsing test file at line %d.",

1527 u_errorName(status), lineNum);

1528 status = U_ZERO_ERROR;

1529 goto end_test; // Stop the test

1530 }

1531

1532 }

1533

1534 end_test:

1535 delete [] testFile;

1536 #endif

1537 }

1538

1539

1540 //------------------------------------------------------------------------------ -

1541 //

1542 // TestDictRules create a break iterator from source rules that includes a

1543 // dictionary range. Regression for bug #7130. Source rules

1544 // do not declare a break iterator type (word, line, sentence, etc.

1545 // but the dictionary code, without a type, would loop.

1546 //

1547 //------------------------------------------------------------------------------ -

1548 void RBBITest::TestDictRules() {

1549 const char *rules = "$dictionary = [a-z]; \n"

1550 "!!forward; \n"

1551 "$dictionary $dictionary; \n"

1552 "!!reverse; \n"

1553 "$dictionary $dictionary; \n";

1554 const char *text = "aa";

1555 UErrorCode status = U_ZERO_ERROR;

1556 UParseError parseError;

1557

1558 RuleBasedBreakIterator bi(rules, parseError, status);

1559 if (U_SUCCESS(status)) {

1560 UnicodeString utext = text;

1561 bi.setText(utext);

1562 int32_t position;

1563 int32_t loops;

1564 for (loops = 0; loops<10; loops++) {

1565 position = bi.next();

1566 if (position == RuleBasedBreakIterator::DONE) {

1567 break;

1568 }

1569 }

1570 TEST_ASSERT(loops == 1);

1571 } else {

1572 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(statu s));

1573 }

1574 }

1575

1576

1577

1578 //------------------------------------------------------------------------------ -

1579 //

1580 // ReadAndConvertFile Read a text data file, convert it to UChars, and

1581 // return the datain one big UChar * buffer, which the caller must delete.

1582 //

1583 // parameters:

1584 // fileName: the name of the file, with no directory part. The test data directory

1585 // is assumed.

1586 // ulen an out parameter, receives the actual length (in UChars) of the file data.

1587 // encoding The file encoding. If the file contains a BOM, that wil l override the encoding

1588 // specified here. The BOM, if it exists, will be stripped from the returned data.

1589 // Pass NULL for the system default encoding.

1590 // status

1591 // returns:

1592 // The file data, converted to UChar.

1593 // The caller must delete this when done with

1594 // delete [] theBuffer;

1595 //

1596 // TODO: This is a clone of RegexTest::ReadAndConvertFile.

1597 // Move this function to some common place.

1598 //

1599 //------------------------------------------------------------------------------ --

1600 UChar RBBITest::ReadAndConvertFile(const char fileName, int &ulen, const char *encoding, UErrorCode &status) {

1601 UChar *retPtr = NULL;

1602 char *fileBuf = NULL;

1603 UConverter* conv = NULL;

1604 FILE *f = NULL;

1605

1606 ulen = 0;

1607 if (U_FAILURE(status)) {

1608 return retPtr;

1609 }

1610

1611 //

1612 // Open the file.

1613 //

1614 f = fopen(fileName, "rb");

1615 if (f == 0) {

1616 dataerrln("Error opening test data file %s\n", fileName);

1617 status = U_FILE_ACCESS_ERROR;

1618 return NULL;

1619 }

1620 //

1621 // Read it in

1622 //

1623 int fileSize;

1624 int amt_read;

1625

1626 fseek( f, 0, SEEK_END);

1627 fileSize = ftell(f);

1628 fileBuf = new char[fileSize];

1629 fseek(f, 0, SEEK_SET);

1630 amt_read = fread(fileBuf, 1, fileSize, f);

1631 if (amt_read != fileSize \|\| fileSize <= 0) {

1632 errln("Error reading test data file.");

1633 goto cleanUpAndReturn;

1634 }

1635

1636 //

1637 // Look for a Unicode Signature (BOM) on the data just read

1638 //

1639 int32_t signatureLength;

1640 const char * fileBufC;

1641 const char* bomEncoding;

1642

1643 fileBufC = fileBuf;

1644 bomEncoding = ucnv_detectUnicodeSignature(

1645 fileBuf, fileSize, &signatureLength, &status);

1646 if(bomEncoding!=NULL ){

1647 fileBufC += signatureLength;

1648 fileSize -= signatureLength;

1649 encoding = bomEncoding;

1650 }

1651

1652 //

1653 // Open a converter to take the rule file to UTF-16

1654 //

1655 conv = ucnv_open(encoding, &status);

1656 if (U_FAILURE(status)) {

1657 goto cleanUpAndReturn;

1658 }

1659

1660 //

1661 // Convert the rules to UChar.

1662 // Preflight first to determine required buffer size.

1663 //

1664 ulen = ucnv_toUChars(conv,

1665 NULL, // dest,

1666 0, // destCapacity,

1667 fileBufC,

1668 fileSize,

1669 &status);

1670 if (status == U_BUFFER_OVERFLOW_ERROR) {

1671 // Buffer Overflow is expected from the preflight operation.

1672 status = U_ZERO_ERROR;

1673

1674 retPtr = new UChar[ulen+1];

1675 ucnv_toUChars(conv,

1676 retPtr, // dest,

1677 ulen+1,

1678 fileBufC,

1679 fileSize,

1680 &status);

1681 }

1682

1683 cleanUpAndReturn:

1684 fclose(f);

1685 delete []fileBuf;

1686 ucnv_close(conv);

1687 if (U_FAILURE(status)) {

1688 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

1689 delete []retPtr;

1690 retPtr = 0;

1691 ulen = 0;

1692 };

1693 return retPtr;

1694 }

1695

1696

1697

1698 //------------------------------------------------------------------------------ --------------

1699 //

1700 // Run tests from each of the boundary test data files distributed by the Unic ode Consortium

1701 //

1702 //------------------------------------------------------------------------------ -------------

1703 void RBBITest::TestUnicodeFiles() {

1704 RuleBasedBreakIterator *bi;

1705 UErrorCode status = U_ZERO_ERROR;

1706

1707 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Local e::getEnglish(), status);

1708 TEST_ASSERT_SUCCESS(status);

1709 if (U_SUCCESS(status)) {

1710 runUnicodeTestData("GraphemeBreakTest.txt", bi);

1711 }

1712 delete bi;

1713

1714 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::ge tEnglish(), status);

1715 TEST_ASSERT_SUCCESS(status);

1716 if (U_SUCCESS(status)) {

1717 runUnicodeTestData("WordBreakTest.txt", bi);

1718 }

1719 delete bi;

1720

1721 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale ::getEnglish(), status);

1722 TEST_ASSERT_SUCCESS(status);

1723 if (U_SUCCESS(status)) {

1724 runUnicodeTestData("SentenceBreakTest.txt", bi);

1725 }

1726 delete bi;

1727

1728 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::ge tEnglish(), status);

1729 TEST_ASSERT_SUCCESS(status);

1730 if (U_SUCCESS(status)) {

1731 runUnicodeTestData("LineBreakTest.txt", bi);

1732 }

1733 delete bi;

1734 }

1735

1736

1737 // Check for test cases from the Unicode test data files that are known to fail

1738 // and should be skipped because ICU is not yet able to fully implement the spec .

1739 // See ticket #7270.

1740

1741 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char * fileName) {

1742 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.

1743 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198

1744 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202

1745 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214

1746 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246

1747 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298

1748 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302

1749 };

1750 if (strcmp(fileName, "LineBreakTest.txt") != 0) {

1751 return FALSE;

1752 }

1753

1754 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {

1755 if (testCase == UnicodeString(badTestCases[i])) {

1756 return logKnownIssue("7270");

1757 }

1758 }

1759 return FALSE;

1760 }

1761

1762

1763 //------------------------------------------------------------------------------ --------------

1764 //

1765 // Run tests from one of the boundary test data files distributed by the Unico de Consortium

1766 //

1767 //------------------------------------------------------------------------------ -------------

1768 void RBBITest::runUnicodeTestData(const char fileName, RuleBasedBreakIterator bi) {

1769 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

1770 UErrorCode status = U_ZERO_ERROR;

1771

1772 //

1773 // Open and read the test data file, put it into a UnicodeString.

1774 //

1775 const char *testDataDirectory = IntlTest::getSourceTestData(status);

1776 char testFileName[1000];

1777 if (testDataDirectory == NULL \|\| strlen(testDataDirectory) >= sizeof(testFil eName)) {

1778 dataerrln("Can't open test data. Path too long.");

1779 return;

1780 }

1781 strcpy(testFileName, testDataDirectory);

1782 strcat(testFileName, fileName);

1783

1784 logln("Opening data file %s\n", fileName);

1785

1786 int len;

1787 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);

1788 if (status != U_FILE_ACCESS_ERROR) {

1789 TEST_ASSERT_SUCCESS(status);

1790 TEST_ASSERT(testFile != NULL);

1791 }

1792 if (U_FAILURE(status) \|\| testFile == NULL) {

1793 return; /* something went wrong, error already output */

1794 }

1795 UnicodeString testFileAsString(TRUE, testFile, len);

1796

1797 //

1798 // Parse the test data file using a regular expression.

1799 // Each kind of token is recognized in its own capture group; what type of item was scanned

1800 // is identified by which group had a match.

1801 //

1802 // Caputure Group # 1 2 3 4 5

1803 // Parses this item: divide x hex digits comme nt \n unrecognized \n

1804 //

1805 UnicodeString tokenExpr("[ \t](?:(\\u00F7)\|(\\u00D7)\|([0-9a-fA-F]+)\|((?:#. ?)?$.)\|(.*?$.))", -1, US_INV);

1806 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE \| UREGEX_DOTALL, status);

1807 UnicodeString testString;

1808 UVector32 breakPositions(status);

1809 int lineNumber = 1;

1810 TEST_ASSERT_SUCCESS(status);

1811 if (U_FAILURE(status)) {

1812 return;

1813 }

1814

1815 //

1816 // Scan through each test case, building up the string to be broken in test String,

1817 // and the positions that should be boundaries in the breakPositions vecto r.

1818 //

1819 int spin = 0;

1820 while (tokenMatcher.find()) {

1821 if(tokenMatcher.hitEnd()) {

1822 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.

1823 This occurred when the text file was corrupt (wasn't marked as UTF- 8)

1824 and caused an infinite loop here on EBCDIC systems!

1825 */

1826 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt d ata file?\r", fileName, ++spin);

1827 // return;

1828 }

1829 if (tokenMatcher.start(1, status) >= 0) {

1830 // Scanned a divide sign, indicating a break position in the test da ta.

1831 if (testString.length()>0) {

1832 breakPositions.addElement(testString.length(), status);

1833 }

1834 }

1835 else if (tokenMatcher.start(2, status) >= 0) {

1836 // Scanned an 'x', meaning no break at this position in the test dat a

1837 // Nothing to be done here.

1838 }

1839 else if (tokenMatcher.start(3, status) >= 0) {

1840 // Scanned Hex digits. Convert them to binary, append to the charac ter data string.

1841 const UnicodeString &hexNumber = tokenMatcher.group(3, status);

1842 int length = hexNumber.length();

1843 if (length<=8) {

1844 char buf[10];

1845 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);

1846 UChar32 c = (UChar32)strtol(buf, NULL, 16);

1847 if (c<=0x10ffff) {

1848 testString.append(c);

1849 } else {

1850 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",

1851 fileName, lineNumber);

1852 }

1853 } else {

1854 errln("Syntax Error: Hex Unicode Character value must have no mo re than 8 digits at \'%s\', line %d.\n",

1855 fileName, lineNumber);

1856 }

1857 }

1858 else if (tokenMatcher.start(4, status) >= 0) {

1859 // Scanned to end of a line, possibly skipping over a comment in the process.

1860 // If the line from the file contained test data, run the test now .

1861 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fil eName)) {

1862 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPos itions, bi);

1863 }

1864

1865 // Clear out this test case.

1866 // The string and breakPositions vector will be refilled as the n ext

1867 // test case is parsed.

1868 testString.remove();

1869 breakPositions.removeAllElements();

1870 lineNumber++;

1871 } else {

1872 // Scanner catchall. Something unrecognized appeared on the line.

1873 char token[16];

1874 UnicodeString uToken = tokenMatcher.group(0, status);

1875 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));

1876 token[sizeof(token)-1] = 0;

1877 errln("Syntax error in test data file \'%s\', line %d. Scanning \"% s\"\n", fileName, lineNumber, token);

1878

1879 // Clean up, in preparation for continuing with the next line.

1880 testString.remove();

1881 breakPositions.removeAllElements();

1882 lineNumber++;

1883 }

1884 TEST_ASSERT_SUCCESS(status);

1885 if (U_FAILURE(status)) {

1886 break;

1887 }

1888 }

1889

1890 delete [] testFile;

1891 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

1892 }

1893

1894 //------------------------------------------------------------------------------ --------------

1895 //

1896 // checkUnicodeTestCase() Run one test case from one of the Unicode Consorti um

1897 // test data files. Do only a simple, forward-only c heck -

1898 // this test is mostly to check that ICU and the Unic ode

1899 // data agree with each other.

1900 //

1901 //------------------------------------------------------------------------------ --------------

1902 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,

1903 const UnicodeString &testString, // Text data to be b roken

1904 UVector32 *breakPositions, // Positions where b reaks should be found.

1905 RuleBasedBreakIterator *bi) {

1906 int32_t pos; // Break Position in the test string

1907 int32_t expectedI = 0; // Index of expected break position in the vect or of expected results.

1908 int32_t expectedPos; // Expected break position (index into test str ing)

1909

1910 bi->setText(testString);

1911 pos = bi->first();

1912 pos = bi->next();

1913

1914 while (pos != BreakIterator::DONE) {

1915 if (expectedI >= breakPositions->size()) {

1916 errln("Test file \"%s\", line %d, unexpected break found at position %d",

1917 testFileName, lineNumber, pos);

1918 break;

1919 }

1920 expectedPos = breakPositions->elementAti(expectedI);

1921 if (pos < expectedPos) {

1922 errln("Test file \"%s\", line %d, unexpected break found at position %d",

1923 testFileName, lineNumber, pos);

1924 break;

1925 }

1926 if (pos > expectedPos) {

1927 errln("Test file \"%s\", line %d, failed to find expected break at p osition %d",

1928 testFileName, lineNumber, expectedPos);

1929 break;

1930 }

1931 pos = bi->next();

1932 expectedI++;

1933 }

1934

1935 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {

1936 errln("Test file \"%s\", line %d, failed to find expected break at posit ion %d",

1937 testFileName, lineNumber, breakPositions->elementAti(expectedI));

1938 }

1939 }

1940

1941

1942

1943 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

1944 //------------------------------------------------------------------------------ ---------

1945 //

1946 // classs RBBIMonkeyKind

1947 //

1948 // Monkey Test for Break Iteration

1949 // Abstract interface class. Concrete derived classes independently

1950 // implement the break rules for different iterator types.

1951 //

1952 // The Monkey Test itself uses doesn't know which type of break iterator it is

1953 // testing, but works purely in terms of the interface defined here.

1954 //

1955 //------------------------------------------------------------------------------ ---------

1956 class RBBIMonkeyKind {

1957 public:

1958 // Return a UVector of UnicodeSets, representing the character classes used

1959 // for this type of iterator.

1960 virtual UVector *charClasses() = 0;

1961

1962 // Set the test text on which subsequent calls to next() will operate

1963 virtual void setText(const UnicodeString &s) = 0;

1964

1965 // Find the next break postion, starting from the prev break position, or fr om zero.

1966 // Return -1 after reaching end of string.

1967 virtual int32_t next(int32_t i) = 0;

1968

1969 virtual ~RBBIMonkeyKind();

1970 UErrorCode deferredStatus;

1971

1972

1973 protected:

1974 RBBIMonkeyKind();

1975

1976 private:

1977 };

1978

1979 RBBIMonkeyKind::RBBIMonkeyKind() {

1980 deferredStatus = U_ZERO_ERROR;

1981 }

1982

1983 RBBIMonkeyKind::~RBBIMonkeyKind() {

1984 }

1985

1986

//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, so that failures
//                          are easier to reproduce.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator state held in m_seed and return a value in the range 0..32767,
//   taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t advanced = m_seed * 1103515245 + 12345;
    m_seed = advanced;
    return (advanced >> 16) & 0x7fff;
}

2002

2003

2004 //------------------------------------------------------------------------------ ------------

2005 //

2006 // class RBBICharMonkey Character (Grapheme Cluster) specific implementat ion

2007 // of RBBIMonkeyKind.

2008 //

2009 //------------------------------------------------------------------------------ ------------

2010 class RBBICharMonkey: public RBBIMonkeyKind {

2011 public:

2012 RBBICharMonkey();

2013 virtual ~RBBICharMonkey();

2014 virtual UVector *charClasses();

2015 virtual void setText(const UnicodeString &s);

2016 virtual int32_t next(int32_t i);

2017 private:

2018 UVector *fSets;

2019

2020 UnicodeSet *fCRLFSet;

2021 UnicodeSet *fControlSet;

2022 UnicodeSet *fExtendSet;

2023 UnicodeSet *fRegionalIndicatorSet;

2024 UnicodeSet *fPrependSet;

2025 UnicodeSet *fSpacingSet;

2026 UnicodeSet *fLSet;

2027 UnicodeSet *fVSet;

2028 UnicodeSet *fTSet;

2029 UnicodeSet *fLVSet;

2030 UnicodeSet *fLVTSet;

2031 UnicodeSet *fHangulSet;

2032 UnicodeSet *fAnySet;

2033

2034 const UnicodeString *fText;

2035 };

2036

2037

2038 RBBICharMonkey::RBBICharMonkey() {

2039 UErrorCode status = U_ZERO_ERROR;

2040

2041 fText = NULL;

2042

2043 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);

2044 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Control}]"), status);

2045 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Extend}]"), status);

2046 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_ Cluster_Break = Regional_Indicator}]"), status);

2047 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = Prepend}]"), status);

2048 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = SpacingMark}]"), status);

2049 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = L}]"), status);

2050 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = V}]"), status);

2051 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = T}]"), status);

2052 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = LV}]"), status);

2053 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Br eak = LVT}]"), status);

2054 fHangulSet = new UnicodeSet();

2055 fHangulSet->addAll(*fLSet);

2056 fHangulSet->addAll(*fVSet);

2057 fHangulSet->addAll(*fTSet);

2058 fHangulSet->addAll(*fLVSet);

2059 fHangulSet->addAll(*fLVTSet);

2060 fAnySet = new UnicodeSet(0, 0x10ffff);

2061

2062 fSets = new UVector(status);

2063 fSets->addElement(fCRLFSet, status);

2064 fSets->addElement(fControlSet, status);

2065 fSets->addElement(fExtendSet, status);

2066 fSets->addElement(fRegionalIndicatorSet, status);

2067 if (!fPrependSet->isEmpty()) {

2068 fSets->addElement(fPrependSet, status);

2069 }

2070 fSets->addElement(fSpacingSet, status);

2071 fSets->addElement(fHangulSet, status);

2072 fSets->addElement(fAnySet, status);

2073 if (U_FAILURE(status)) {

2074 deferredStatus = status;

2075 }

2076 }

2077

2078

2079 void RBBICharMonkey::setText(const UnicodeString &s) {

2080 fText = &s;

2081 }

2082

2083

2084

2085 int32_t RBBICharMonkey::next(int32_t prevPos) {

2086 int p0, p1, p2, p3; // Indices of the significant code points around t he

2087 // break position being tested. The candidate b reak

2088 // location is before p2.

2089

2090 int breakPos = -1;

2091

2092 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

2093

2094 if (U_FAILURE(deferredStatus)) {

2095 return -1;

2096 }

2097

2098 // Previous break at end of string. return DONE.

2099 if (prevPos >= fText->length()) {

2100 return -1;

2101 }

2102 p0 = p1 = p2 = p3 = prevPos;

2103 c3 = fText->char32At(prevPos);

2104 c0 = c1 = c2 = 0;

2105 (void)p0; // suppress set but not used warning.

2106 (void)c0;

2107

2108 // Loop runs once per "significant" character position in the input text.

2109 for (;;) {

2110 // Move all of the positions forward in the input string.

2111 p0 = p1; c0 = c1;

2112 p1 = p2; c1 = c2;

2113 p2 = p3; c2 = c3;

2114

2115 // Advancd p3 by one codepoint

2116 p3 = fText->moveIndex32(p3, 1);

2117 c3 = fText->char32At(p3);

2118

2119 if (p1 == p2) {

2120 // Still warming up the loop. (won't work with zero length strings, but we don't care)

2121 continue;

2122 }

2123 if (p2 == fText->length()) {

2124 // Reached end of string. Always a break position.

2125 break;

2126 }

2127

2128 // Rule GB3 CR x LF

2129 // No Extend or Format characters may appear between the CR and LF,

2130 // which requires the additional check for p2 immediately following p1.

2131 //

2132 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {

2133 continue;

2134 }

2135

2136 // Rule (GB4). ( Control \| CR \| LF ) <break>

2137 if (fControlSet->contains(c1) \|\|

2138 c1 == 0x0D \|\|

2139 c1 == 0x0A) {

2140 break;

2141 }

2142

2143 // Rule (GB5) <break> ( Control \| CR \| LF )

2144 //

2145 if (fControlSet->contains(c2) \|\|

2146 c2 == 0x0D \|\|

2147 c2 == 0x0A) {

2148 break;

2149 }

2150

2151

2152 // Rule (GB6) L x ( L \| V \| LV \| LVT )

2153 if (fLSet->contains(c1) &&

2154 (fLSet->contains(c2) \|\|

2155 fVSet->contains(c2) \|\|

2156 fLVSet->contains(c2) \|\|

2157 fLVTSet->contains(c2))) {

2158 continue;

2159 }

2160

2161 // Rule (GB7) ( LV \| V ) x ( V \| T )

2162 if ((fLVSet->contains(c1) \|\| fVSet->contains(c1)) &&

2163 (fVSet->contains(c2) \|\| fTSet->contains(c2))) {

2164 continue;

2165 }

2166

2167 // Rule (GB8) ( LVT \| T) x T

2168 if ((fLVTSet->contains(c1) \|\| fTSet->contains(c1)) &&

2169 fTSet->contains(c2)) {

2170 continue;

2171 }

2172

2173 // Rule (GB8a) Regional_Indicator x Regional_Indicator

2174 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contai ns(c2)) {

2175 continue;

2176 }

2177

2178 // Rule (GB9) Numeric x ALetter

2179 if (fExtendSet->contains(c2)) {

2180 continue;

2181 }

2182

2183 // Rule (GB9a) x SpacingMark

2184 if (fSpacingSet->contains(c2)) {

2185 continue;

2186 }

2187

2188 // Rule (GB9b) Prepend x

2189 if (fPrependSet->contains(c1)) {

2190 continue;

2191 }

2192

2193 // Rule (GB10) Any <break> Any

2194 break;

2195 }

2196

2197 breakPos = p2;

2198 return breakPos;

2199 }

2200

2201

2202

2203 UVector *RBBICharMonkey::charClasses() {

2204 return fSets;

2205 }

2206

2207

2208 RBBICharMonkey::~RBBICharMonkey() {

2209 delete fSets;

2210 delete fCRLFSet;

2211 delete fControlSet;

2212 delete fExtendSet;

2213 delete fRegionalIndicatorSet;

2214 delete fPrependSet;

2215 delete fSpacingSet;

2216 delete fLSet;

2217 delete fVSet;

2218 delete fTSet;

2219 delete fLVSet;

2220 delete fLVTSet;

2221 delete fHangulSet;

2222 delete fAnySet;

2223 }

2224

2225 //------------------------------------------------------------------------------ ------------

2226 //

2227 // class RBBIWordMonkey Word Break specific implementation

2228 // of RBBIMonkeyKind.

2229 //

2230 //------------------------------------------------------------------------------ ------------

2231 class RBBIWordMonkey: public RBBIMonkeyKind {

2232 public:

2233 RBBIWordMonkey();

2234 virtual ~RBBIWordMonkey();

2235 virtual UVector *charClasses();

2236 virtual void setText(const UnicodeString &s);

2237 virtual int32_t next(int32_t i);

2238 private:

2239 UVector *fSets;

2240

2241 UnicodeSet *fCRSet;

2242 UnicodeSet *fLFSet;

2243 UnicodeSet *fNewlineSet;

2244 UnicodeSet *fRegionalIndicatorSet;

2245 UnicodeSet *fKatakanaSet;

2246 UnicodeSet *fHebrew_LetterSet;

2247 UnicodeSet *fALetterSet;

2248 // TODO(jungshik): Do we still need this change?

2249 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt

2250 UnicodeSet *fSingle_QuoteSet;

2251 UnicodeSet *fDouble_QuoteSet;

2252 UnicodeSet *fMidNumLetSet;

2253 UnicodeSet *fMidLetterSet;

2254 UnicodeSet *fMidNumSet;

2255 UnicodeSet *fNumericSet;

2256 UnicodeSet *fFormatSet;

2257 UnicodeSet *fOtherSet;

2258 UnicodeSet *fExtendSet;

2259 UnicodeSet *fExtendNumLetSet;

2260 UnicodeSet *fDictionaryCjkSet;

2261

2262 const UnicodeString *fText;

2263 };

2264

2265

2266 RBBIWordMonkey::RBBIWordMonkey()

2267 {

2268 UErrorCode status = U_ZERO_ERROR;

2269

2270 fSets = new UVector(status);

2271

2272 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C R}]"), status);

2273 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L F}]"), status);

2274 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N ewline}]"), status);

2275 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Ka takana:]]", status);

2276 // Exclude Hangul syllables from ALetterSet during testing.

2277 // Leave CJK dictionary characters out from the monkey tests!

2278 #if 0

2279 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"

2280 "[\\p{Line_Break = Complex_Context}"

2281 "-\\p{Grapheme_Cluster_Break = Extend}"

2282 "-\\p{Grapheme_Cluster_Break = Control}"

2283 "]]",

2284 status);

2285 #endif

2286 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Bre ak = Regional_Indicator}]"), status);

2287 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);

2288 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);

2289 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);

2290 fALetterSet->removeAll(*fDictionaryCjkSet);

2291 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);

2292 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);

2293 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);

2294 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);

2295 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);

2296 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test

2297 // we should figure out why

2298 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);

2299 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);

2300 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);

2301 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);

2302

2303 fOtherSet = new UnicodeSet();

2304 if(U_FAILURE(status)) {

2305 deferredStatus = status;

2306 return;

2307 }

2308

2309 fOtherSet->complement();

2310 fOtherSet->removeAll(*fCRSet);

2311 fOtherSet->removeAll(*fLFSet);

2312 fOtherSet->removeAll(*fNewlineSet);

2313 fOtherSet->removeAll(*fKatakanaSet);

2314 fOtherSet->removeAll(*fHebrew_LetterSet);

2315 fOtherSet->removeAll(*fALetterSet);

2316 fOtherSet->removeAll(*fSingle_QuoteSet);

2317 fOtherSet->removeAll(*fDouble_QuoteSet);

2318 fOtherSet->removeAll(*fMidLetterSet);

2319 fOtherSet->removeAll(*fMidNumSet);

2320 fOtherSet->removeAll(*fNumericSet);

2321 fOtherSet->removeAll(*fExtendNumLetSet);

2322 fOtherSet->removeAll(*fFormatSet);

2323 fOtherSet->removeAll(*fExtendSet);

2324 fOtherSet->removeAll(*fRegionalIndicatorSet);

2325 // Inhibit dictionary characters from being tested at all.

2326 fOtherSet->removeAll(*fDictionaryCjkSet);

2327 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp lex_Context}]"), status));

2328

2329 fSets->addElement(fCRSet, status);

2330 fSets->addElement(fLFSet, status);

2331 fSets->addElement(fNewlineSet, status);

2332 fSets->addElement(fRegionalIndicatorSet, status);

2333 fSets->addElement(fHebrew_LetterSet, status);

2334 fSets->addElement(fALetterSet, status);

2335 fSets->addElement(fSingle_QuoteSet, status);

2336 fSets->addElement(fDouble_QuoteSet, status);

2337 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana

2338 fSets->addElement(fMidLetterSet, status);

2339 fSets->addElement(fMidNumLetSet, status);

2340 fSets->addElement(fMidNumSet, status);

2341 fSets->addElement(fNumericSet, status);

2342 fSets->addElement(fFormatSet, status);

2343 fSets->addElement(fExtendSet, status);

2344 fSets->addElement(fOtherSet, status);

2345 fSets->addElement(fExtendNumLetSet, status);

2346

2347 if (U_FAILURE(status)) {

2348 deferredStatus = status;

2349 }

2350 }

2351

2352 void RBBIWordMonkey::setText(const UnicodeString &s) {

2353 fText = &s;

2354 }

2355

2356

2357 int32_t RBBIWordMonkey::next(int32_t prevPos) {

2358 int p0, p1, p2, p3; // Indices of the significant code points around t he

2359 // break position being tested. The candidate b reak

2360 // location is before p2.

2361

2362 int breakPos = -1;

2363

2364 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

2365

2366 if (U_FAILURE(deferredStatus)) {

2367 return -1;

2368 }

2369

2370 // Prev break at end of string. return DONE.

2371 if (prevPos >= fText->length()) {

2372 return -1;

2373 }

2374 p0 = p1 = p2 = p3 = prevPos;

2375 c3 = fText->char32At(prevPos);

2376 c0 = c1 = c2 = 0;

2377 (void)p0; // Suppress set but not used warning.

2378

2379 // Loop runs once per "significant" character position in the input text.

2380 for (;;) {

2381 // Move all of the positions forward in the input string.

2382 p0 = p1; c0 = c1;

2383 p1 = p2; c1 = c2;

2384 p2 = p3; c2 = c3;

2385

2386 // Advancd p3 by X(Extend \| Format)* Rule 4

2387 // But do not advance over Extend & Format following a new line. (Uni code 5.1 change)

2388 do {

2389 p3 = fText->moveIndex32(p3, 1);

2390 c3 = fText->char32At(p3);

2391 if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->con tains(c2)) {

2392 break;

2393 };

2394 }

2395 while (fFormatSet->contains(c3) \|\| fExtendSet->contains(c3));

2396

2397

2398 if (p1 == p2) {

2399 // Still warming up the loop. (won't work with zero length strings, but we don't care)

2400 continue;

2401 }

2402 if (p2 == fText->length()) {

2403 // Reached end of string. Always a break position.

2404 break;

2405 }

2406

2407 // Rule (3) CR x LF

2408 // No Extend or Format characters may appear between the CR and LF,

2409 // which requires the additional check for p2 immediately following p1.

2410 //

2411 if (c1==0x0D && c2==0x0A) {

2412 continue;

2413 }

2414

2415 // Rule (3a) Break before and after newlines (including CR and LF)

2416 //

2417 if (fCRSet->contains(c1) \|\| fLFSet->contains(c1) \|\| fNewlineSet->contain s(c1)) {

2418 break;

2419 };

2420 if (fCRSet->contains(c2) \|\| fLFSet->contains(c2) \|\| fNewlineSet->contain s(c2)) {

2421 break;

2422 };

2423

2424 // Rule (5). (ALetter \| Hebrew_Letter) x (ALetter \| Hebrew_Letter)

2425 if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&

2426 (fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {

2427 continue;

2428 }

2429

2430 // Rule (6) (ALetter \| Hebrew_Letter) x (MidLetter \| MidNumLet \| Sing le_Quote) (ALetter \| Hebrew_Letter)

2431 //

2432 if ( (fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&

2433 (fMidLetterSet->contains(c2) \|\| fMidNumLetSet->contains(c2) \|\| fSin gle_QuoteSet->contains(c2)) &&

2434 (fALetterSet->contains(c3) \|\| fHebrew_LetterSet->contains(c3))) {

2435 continue;

2436 }

2437

2438 // Rule (7) (ALetter \| Hebrew_Letter) (MidLetter \| MidNumLet \| Single_Q uote) x (ALetter \| Hebrew_Letter)

2439 if ((fALetterSet->contains(c0) \|\| fHebrew_LetterSet->contains(c0)) &&

2440 (fMidLetterSet->contains(c1) \|\| fMidNumLetSet->contains(c1) \|\| fSing le_QuoteSet->contains(c1)) &&

2441 (fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {

2442 continue;

2443 }

2444

2445 // Rule (7a) Hebrew_Letter x Single_Quote

2446 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {

2447 continue;

2448 }

2449

2450 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter

2451 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {

2452 continue;

2453 }

2454

2455 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter

2456 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {

2457 continue;

2458 }

2459

2460 // Rule (8) Numeric x Numeric

2461 if (fNumericSet->contains(c1) &&

2462 fNumericSet->contains(c2)) {

2463 continue;

2464 }

2465

2466 // Rule (9) (ALetter \| Hebrew_Letter) x Numeric

2467 if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1)) &&

2468 fNumericSet->contains(c2)) {

2469 continue;

2470 }

2471

2472 // Rule (10) Numeric x (ALetter \| Hebrew_Letter)

2473 if (fNumericSet->contains(c1) &&

2474 (fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2))) {

2475 continue;

2476 }

2477

2478 // Rule (11) Numeric (MidNum \| MidNumLet \| Single_Quote) x Numeric

2479 if (fNumericSet->contains(c0) &&

2480 (fMidNumSet->contains(c1) \|\| fMidNumLetSet->contains(c1) \|\| fSingle_ QuoteSet->contains(c1)) &&

2481 fNumericSet->contains(c2)) {

2482 continue;

2483 }

2484

2485 // Rule (12) Numeric x (MidNum \| MidNumLet \| SingleQuote) Numeric

2486 if (fNumericSet->contains(c1) &&

2487 (fMidNumSet->contains(c2) \|\| fMidNumLetSet->contains(c2) \|\| fSingle_ QuoteSet->contains(c2)) &&

2488 fNumericSet->contains(c3)) {

2489 continue;

2490 }

2491

2492 // Rule (13) Katakana x Katakana

2493 if (fKatakanaSet->contains(c1) &&

2494 fKatakanaSet->contains(c2)) {

2495 continue;

2496 }

2497

2498 // Rule 13a (ALetter \| Hebrew_Letter \| Numeric \| KataKana \| ExtendNum Let) x ExtendNumLet

2499 if ((fALetterSet->contains(c1) \|\| fHebrew_LetterSet->contains(c1) \|\|fNum ericSet->contains(c1) \|\|

2500 fKatakanaSet->contains(c1) \|\| fExtendNumLetSet->contains(c1)) &&

2501 fExtendNumLetSet->contains(c2)) {

2502 continue;

2503 }

2504

2505 // Rule 13b ExtendNumLet x (ALetter \| Hebrew_Letter \| Numeric \| Kataka na)

2506 if (fExtendNumLetSet->contains(c1) &&

2507 (fALetterSet->contains(c2) \|\| fHebrew_LetterSet->contains(c2) \|\|

2508 fNumericSet->contains(c2) \|\| fKatakanaSet->contains(c2))) {

2509 continue;

2510 }

2511

2512 // Rule 13c

2513 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contai ns(c2)) {

2514 continue;

2515 }

2516

2517 // Rule 14. Break found here.

2518 break;

2519 }

2520

2521 breakPos = p2;

2522 return breakPos;

2523 }

2524

2525

2526 UVector *RBBIWordMonkey::charClasses() {

2527 return fSets;

2528 }

2529

2530

2531 RBBIWordMonkey::~RBBIWordMonkey() {

2532 delete fSets;

2533 delete fCRSet;

2534 delete fLFSet;

2535 delete fNewlineSet;

2536 delete fKatakanaSet;

2537 delete fHebrew_LetterSet;

2538 delete fALetterSet;

2539 delete fSingle_QuoteSet;

2540 delete fDouble_QuoteSet;

2541 delete fMidNumLetSet;

2542 delete fMidLetterSet;

2543 delete fMidNumSet;

2544 delete fNumericSet;

2545 delete fFormatSet;

2546 delete fExtendSet;

2547 delete fExtendNumLetSet;

2548 delete fRegionalIndicatorSet;

2549 delete fDictionaryCjkSet;

2550 delete fOtherSet;

2551 }

2552

2553

2554

2555

2556 //------------------------------------------------------------------------------ ------------

2557 //

2558 // class RBBISentMonkey Sentence Break specific implementation

2559 // of RBBIMonkeyKind.

2560 //

2561 //------------------------------------------------------------------------------ ------------

2562 class RBBISentMonkey: public RBBIMonkeyKind {

2563 public:

2564 RBBISentMonkey();

2565 virtual ~RBBISentMonkey();

2566 virtual UVector *charClasses();

2567 virtual void setText(const UnicodeString &s);

2568 virtual int32_t next(int32_t i);

2569 private:

2570 int moveBack(int posFrom);

2571 int moveForward(int posFrom);

2572 UChar32 cAt(int pos);

2573

2574 UVector *fSets;

2575

2576 UnicodeSet *fSepSet;

2577 UnicodeSet *fFormatSet;

2578 UnicodeSet *fSpSet;

2579 UnicodeSet *fLowerSet;

2580 UnicodeSet *fUpperSet;

2581 UnicodeSet *fOLetterSet;

2582 UnicodeSet *fNumericSet;

2583 UnicodeSet *fATermSet;

2584 UnicodeSet *fSContinueSet;

2585 UnicodeSet *fSTermSet;

2586 UnicodeSet *fCloseSet;

2587 UnicodeSet *fOtherSet;

2588 UnicodeSet *fExtendSet;

2589

2590 const UnicodeString *fText;

2591

2592 };

2593

2594 RBBISentMonkey::RBBISentMonkey()

2595 {

2596 UErrorCode status = U_ZERO_ERROR;

2597

2598 fSets = new UVector(status);

2599

2600 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator

2601 // set and made into character classes of their own. For the monkey impl,

2602 // they remain in SEP, since Sep always appears with C R and LF in the rules.

2603 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);

2604 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);

2605 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);

2606 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);

2607 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);

2608 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);

2609 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);

2610 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);

2611 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);

2612 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);

2613 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);

2614 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);

2615 fOtherSet = new UnicodeSet();

2616

2617 if(U_FAILURE(status)) {

2618 deferredStatus = status;

2619 return;

2620 }

2621

2622 fOtherSet->complement();

2623 fOtherSet->removeAll(*fSepSet);

2624 fOtherSet->removeAll(*fFormatSet);

2625 fOtherSet->removeAll(*fSpSet);

2626 fOtherSet->removeAll(*fLowerSet);

2627 fOtherSet->removeAll(*fUpperSet);

2628 fOtherSet->removeAll(*fOLetterSet);

2629 fOtherSet->removeAll(*fNumericSet);

2630 fOtherSet->removeAll(*fATermSet);

2631 fOtherSet->removeAll(*fSContinueSet);

2632 fOtherSet->removeAll(*fSTermSet);

2633 fOtherSet->removeAll(*fCloseSet);

2634 fOtherSet->removeAll(*fExtendSet);

2635

2636 fSets->addElement(fSepSet, status);

2637 fSets->addElement(fFormatSet, status);

2638 fSets->addElement(fSpSet, status);

2639 fSets->addElement(fLowerSet, status);

2640 fSets->addElement(fUpperSet, status);

2641 fSets->addElement(fOLetterSet, status);

2642 fSets->addElement(fNumericSet, status);

2643 fSets->addElement(fATermSet, status);

2644 fSets->addElement(fSContinueSet, status);

2645 fSets->addElement(fSTermSet, status);

2646 fSets->addElement(fCloseSet, status);

2647 fSets->addElement(fOtherSet, status);

2648 fSets->addElement(fExtendSet, status);

2649

2650 if (U_FAILURE(status)) {

2651 deferredStatus = status;

2652 }

2653 }

2654

2655

2656

2657 void RBBISentMonkey::setText(const UnicodeString &s) {

2658 fText = &s;

2659 }

2660

2661 UVector *RBBISentMonkey::charClasses() {

2662 return fSets;

2663 }

2664

2665

2666 // moveBack() Find the "significant" code point preceding the index i.

2667 // Skips over ($Extend \| $Format)* .

2668 //

2669 int RBBISentMonkey::moveBack(int i) {

2670 if (i <= 0) {

2671 return -1;

2672 }

2673 UChar32 c;

2674 int32_t j = i;

2675 do {

2676 j = fText->moveIndex32(j, -1);

2677 c = fText->char32At(j);

2678 }

2679 while (j>0 &&(fFormatSet->contains(c) \|\| fExtendSet->contains(c)));

2680 return j;

2681

2682 }

2683

2684

2685 int RBBISentMonkey::moveForward(int i) {

2686 if (i>=fText->length()) {

2687 return fText->length();

2688 }

2689 UChar32 c;

2690 int32_t j = i;

2691 do {

2692 j = fText->moveIndex32(j, 1);

2693 c = cAt(j);

2694 }

2695 while (fFormatSet->contains(c) \|\| fExtendSet->contains(c));

2696 return j;

2697 }

2698

2699 UChar32 RBBISentMonkey::cAt(int pos) {

2700 if (pos<0 \|\| pos>=fText->length()) {

2701 return -1;

2702 } else {

2703 return fText->char32At(pos);

2704 }

2705 }

2706

2707 int32_t RBBISentMonkey::next(int32_t prevPos) {

2708 int p0, p1, p2, p3; // Indices of the significant code points around t he

2709 // break position being tested. The candidate b reak

2710 // location is before p2.

2711

2712 int breakPos = -1;

2713

2714 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.

2715 UChar32 c;

2716

2717 if (U_FAILURE(deferredStatus)) {

2718 return -1;

2719 }

2720

2721 // Prev break at end of string. return DONE.

2722 if (prevPos >= fText->length()) {

2723 return -1;

2724 }

2725 p0 = p1 = p2 = p3 = prevPos;

2726 c3 = fText->char32At(prevPos);

2727 c0 = c1 = c2 = 0;

2728 (void)p0; // Suppress set but not used warning.

2729

2730 // Loop runs once per "significant" character position in the input text.

2731 for (;;) {

2732 // Move all of the positions forward in the input string.

2733 p0 = p1; c0 = c1;

2734 p1 = p2; c1 = c2;

2735 p2 = p3; c2 = c3;

2736

2737 // Advancd p3 by X(Extend \| Format)* Rule 4

2738 p3 = moveForward(p3);

2739 c3 = cAt(p3);

2740

2741 // Rule (3) CR x LF

2742 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {

2743 continue;

2744 }

2745

2746 // Rule (4). Sep <break>

2747 if (fSepSet->contains(c1)) {

2748 p2 = p1+1; // Separators don't combine with Extend or Format.

2749 break;

2750 }

2751

2752 if (p2 >= fText->length()) {

2753 // Reached end of string. Always a break position.

2754 break;

2755 }

2756

2757 if (p2 == prevPos) {

2758 // Still warming up the loop. (won't work with zero length strings, but we don't care)

2759 continue;

2760 }

2761

2762 // Rule (6). ATerm x Numeric

2763 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {

2764 continue;

2765 }

2766

2767 // Rule (7). (Upper \| Lower) ATerm x Uppper

2768 if ((fUpperSet->contains(c0) \|\| fLowerSet->contains(c0)) &&

2769 fATermSet->contains(c1) && fUpperSet->contains(c2)) {

2770 continue;

2771 }

2772

2773 // Rule (8) ATerm Close* Sp* x (not (OLettter \| Upper \| Lower \| Sep \| STerm \| ATerm))* Lower

2774 // Note: STerm \| ATerm are added to the negated part of the e xpression by a

2775 // note to the Unicode 5.0 documents.

2776 int p8 = p1;

2777 while (fSpSet->contains(cAt(p8))) {

2778 p8 = moveBack(p8);

2779 }

2780 while (fCloseSet->contains(cAt(p8))) {

2781 p8 = moveBack(p8);

2782 }

2783 if (fATermSet->contains(cAt(p8))) {

2784 p8=p2;

2785 for (;;) {

2786 c = cAt(p8);

2787 if (c==-1 \|\| fOLetterSet->contains(c) \|\| fUpperSet->contains(c) \|\|

2788 fLowerSet->contains(c) \|\| fSepSet->contains(c) \|\|

2789 fATermSet->contains(c) \|\| fSTermSet->contains(c)) {

2790 break;

2791 }

2792 p8 = moveForward(p8);

2793 }

2794 if (fLowerSet->contains(cAt(p8))) {

2795 continue;

2796 }

2797 }

2798

2799 // Rule 8a (STerm \| ATerm) Close* Sp* x (SContinue \| STerm \| ATerm);

2800 if (fSContinueSet->contains(c2) \|\| fSTermSet->contains(c2) \|\| fATermSet- >contains(c2)) {

2801 p8 = p1;

2802 while (fSpSet->contains(cAt(p8))) {

2803 p8 = moveBack(p8);

2804 }

2805 while (fCloseSet->contains(cAt(p8))) {

2806 p8 = moveBack(p8);

2807 }

2808 c = cAt(p8);

2809 if (fSTermSet->contains(c) \|\| fATermSet->contains(c)) {

2810 continue;

2811 }

2812 }

2813

2814 // Rule (9) (STerm \| ATerm) Close* x (Close \| Sp \| Sep \| CR \| LF)

2815 int p9 = p1;

2816 while (fCloseSet->contains(cAt(p9))) {

2817 p9 = moveBack(p9);

2818 }

2819 c = cAt(p9);

2820 if ((fSTermSet->contains(c) \|\| fATermSet->contains(c))) {

2821 if (fCloseSet->contains(c2) \|\| fSpSet->contains(c2) \|\| fSepSet->cont ains(c2)) {

2822 continue;

2823 }

2824 }

2825

2826 // Rule (10) (Sterm \| ATerm) Close* Sp* x (Sp \| Sep \| CR \| LF)

2827 int p10 = p1;

2828 while (fSpSet->contains(cAt(p10))) {

2829 p10 = moveBack(p10);

2830 }

2831 while (fCloseSet->contains(cAt(p10))) {

2832 p10 = moveBack(p10);

2833 }

2834 if (fSTermSet->contains(cAt(p10)) \|\| fATermSet->contains(cAt(p10))) {

2835 if (fSpSet->contains(c2) \|\| fSepSet->contains(c2)) {

2836 continue;

2837 }

2838 }

2839

2840 // Rule (11) (STerm \| ATerm) Close* Sp* (Sep \| CR \| LF)? <break>

2841 int p11 = p1;

2842 if (fSepSet->contains(cAt(p11))) {

2843 p11 = moveBack(p11);

2844 }

2845 while (fSpSet->contains(cAt(p11))) {

2846 p11 = moveBack(p11);

2847 }

2848 while (fCloseSet->contains(cAt(p11))) {

2849 p11 = moveBack(p11);

2850 }

2851 if (fSTermSet->contains(cAt(p11)) \|\| fATermSet->contains(cAt(p11))) {

2852 break;

2853 }

2854

2855 // Rule (12) Any x Any

2856 continue;

2857 }

2858 breakPos = p2;

2859 return breakPos;

2860 }

2861

2862 RBBISentMonkey::~RBBISentMonkey() {

2863 delete fSets;

2864 delete fSepSet;

2865 delete fFormatSet;

2866 delete fSpSet;

2867 delete fLowerSet;

2868 delete fUpperSet;

2869 delete fOLetterSet;

2870 delete fNumericSet;

2871 delete fATermSet;

2872 delete fSContinueSet;

2873 delete fSTermSet;

2874 delete fCloseSet;

2875 delete fOtherSet;

2876 delete fExtendSet;

2877 }

2878

2879

2880

//-------------------------------------------------------------------------------------------
//
//  RBBILineMonkey
//
//-------------------------------------------------------------------------------------------

2886

2887 class RBBILineMonkey: public RBBIMonkeyKind {

2888 public:

2889 RBBILineMonkey();

2890 virtual ~RBBILineMonkey();

2891 virtual UVector *charClasses();

2892 virtual void setText(const UnicodeString &s);

2893 virtual int32_t next(int32_t i);

2894 virtual void rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPo s, UChar32 *nextChar);

2895 private:

2896 UVector *fSets;

2897

2898 UnicodeSet *fBK;

2899 UnicodeSet *fCR;

2900 UnicodeSet *fLF;

2901 UnicodeSet *fCM;

2902 UnicodeSet *fNL;

2903 UnicodeSet *fSG;

2904 UnicodeSet *fWJ;

2905 UnicodeSet *fZW;

2906 UnicodeSet *fGL;

2907 UnicodeSet *fCB;

2908 UnicodeSet *fSP;

2909 UnicodeSet *fB2;

2910 UnicodeSet *fBA;

2911 UnicodeSet *fBB;

2912 UnicodeSet *fHY;

2913 UnicodeSet *fH2;

2914 UnicodeSet *fH3;

2915 UnicodeSet *fCL;

2916 UnicodeSet *fCP;

2917 UnicodeSet *fEX;

2918 UnicodeSet *fIN;

2919 UnicodeSet *fJL;

2920 UnicodeSet *fJV;

2921 UnicodeSet *fJT;

2922 UnicodeSet *fNS;

2923 UnicodeSet *fOP;

2924 UnicodeSet *fQU;

2925 UnicodeSet *fIS;

2926 UnicodeSet *fNU;

2927 UnicodeSet *fPO;

2928 UnicodeSet *fPR;

2929 UnicodeSet *fSY;

2930 UnicodeSet *fAI;

2931 UnicodeSet *fAL;

2932 UnicodeSet *fCJ;

2933 UnicodeSet *fHL;

2934 UnicodeSet *fID;

2935 UnicodeSet *fRI;

2936 UnicodeSet *fSA;

2937 UnicodeSet *fXX;

2938

2939 BreakIterator *fCharBI;

2940 const UnicodeString *fText;

2941 RegexMatcher *fNumberMatcher;

2942 };

2943

2944

2945 RBBILineMonkey::RBBILineMonkey()

2946 {

2947 UErrorCode status = U_ZERO_ERROR;

2948

2949 fSets = new UVector(status);

2950

2951 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), statu s);

2952 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), statu s);

2953 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), statu s);

2954 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), statu s);

2955 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), statu s);

2956 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), statu s);

2957 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), statu s);

2958 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), statu s);

2959 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), statu s);

2960 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), statu s);

2961 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), statu s);

2962 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), statu s);

2963 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), statu s);

2964 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), statu s);

2965 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), statu s);

2966 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), statu s);

2967 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), statu s);

2968 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), statu s);

2969 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), statu s);

2970 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), statu s);

2971 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), statu s);

2972 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), statu s);

2973 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), statu s);

2974 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), statu s);

2975 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), statu s);

2976 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), statu s);

2977 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), statu s);

2978 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), statu s);

2979 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), statu s);

2980 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), statu s);

2981 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), statu s);

2982 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), statu s);

2983 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), statu s);

2984 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), statu s);

2985 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), statu s);

2986 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), statu s);

2987 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), statu s);

2988 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), statu s);

2989 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);

2990 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), statu s);

2991

2992 if (U_FAILURE(status)) {

2993 deferredStatus = status;

2994 fCharBI = NULL;

2995 fNumberMatcher = NULL;

2996 return;

2997 }

2998

2999 fAL->addAll(*fXX); // Default behavior for XX is identical to AL

3000 fAL->addAll(*fAI); // Default behavior for AI is identical to AL

3001 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to A L

3002 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.

3003

3004 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.

3005

3006 fSets->addElement(fBK, status);

3007 fSets->addElement(fCR, status);

3008 fSets->addElement(fLF, status);

3009 fSets->addElement(fCM, status);

3010 fSets->addElement(fNL, status);

3011 fSets->addElement(fWJ, status);

3012 fSets->addElement(fZW, status);

3013 fSets->addElement(fGL, status);

3014 fSets->addElement(fCB, status);

3015 fSets->addElement(fSP, status);

3016 fSets->addElement(fB2, status);

3017 fSets->addElement(fBA, status);

3018 fSets->addElement(fBB, status);

3019 fSets->addElement(fHY, status);

3020 fSets->addElement(fH2, status);

3021 fSets->addElement(fH3, status);

3022 fSets->addElement(fCL, status);

3023 fSets->addElement(fCP, status);

3024 fSets->addElement(fEX, status);

3025 fSets->addElement(fIN, status);

3026 fSets->addElement(fJL, status);

3027 fSets->addElement(fJT, status);

3028 fSets->addElement(fJV, status);

3029 fSets->addElement(fNS, status);

3030 fSets->addElement(fOP, status);

3031 fSets->addElement(fQU, status);

3032 fSets->addElement(fIS, status);

3033 fSets->addElement(fNU, status);

3034 fSets->addElement(fPO, status);

3035 fSets->addElement(fPR, status);

3036 fSets->addElement(fSY, status);

3037 fSets->addElement(fAI, status);

3038 fSets->addElement(fAL, status);

3039 fSets->addElement(fHL, status);

3040 fSets->addElement(fID, status);

3041 fSets->addElement(fWJ, status);

3042 fSets->addElement(fRI, status);

3043 fSets->addElement(fSA, status);

3044 fSets->addElement(fSG, status);

3045

3046 const char *rules =

3047 "((\\p{Line_Break=PR}\|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"

3048 "((\\p{Line_Break=OP}\|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"

3049 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"

3050 "((\\p{Line_Break=NU}\|\\p{Line_Break=IS}\|\\p{Line_Break=SY})\\p{Line _Break=CM})"

3051 "((\\p{Line_Break=CL}\|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"

3052 "((\\p{Line_Break=PR}\|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";

3053

3054 fNumberMatcher = new RegexMatcher(

3055 UnicodeString(rules, -1, US_INV), 0, status);

3056

3057 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), statu s);

3058

3059 if (U_FAILURE(status)) {

3060 deferredStatus = status;

3061 }

3062 }

3063

3064

3065 void RBBILineMonkey::setText(const UnicodeString &s) {

3066 fText = &s;

3067 fCharBI->setText(s);

3068 fNumberMatcher->reset(s);

3069 }

3070

3071 //

3072 // rule9Adjust

3073 // Line Break TR rules 9 and 10 implementation.

3074 // This deals with combining marks and other sequences that

3075 // that must be treated as if they were something other than what they actua lly are.

3076 //

3077 // This is factored out into a separate function because it must be applied twice for

3078 // each potential break, once to the chars before the position being checked , then

3079 // again to the text following the possible break.

3080 //

3081 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 posChar, int32_t nextPos , UChar32 *nextChar) {

3082 if (pos == -1) {

3083 // Invalid initial position. Happens during the warmup iteration of the

3084 // main loop in next().

3085 return;

3086 }

3087

3088 int32_t nPos = *nextPos;

3089

3090 // LB 9 Keep combining sequences together.

3091 // advance over any CM class chars. Note that Line Break CM is different

3092 // from the normal Grapheme Extend property.

3093 if (!(fSP->contains(posChar) \|\| fBK->contains(posChar) \|\| *posChar==0x0d \| \|

3094 posChar==0x0a \|\|fNL->contains(posChar) \|\| fZW->contains(*posChar))) {

3095 for (;;) {

3096 *nextChar = fText->char32At(nPos);

3097 if (!fCM->contains(*nextChar)) {

3098 break;

3099 }

3100 nPos = fText->moveIndex32(nPos, 1);

3101 }

3102 }

3103

3104

3105 // LB 9 Treat X CM* as if it were x.

3106 // No explicit action required.

3107

3108 // LB 10 Treat any remaining combining mark as AL

3109 if (fCM->contains(*posChar)) {

3110 *posChar = 0x41; // thisChar = 'A';

3111 }

3112

3113 // Push the updated nextPos and nextChar back to our caller.

3114 // This only makes a difference if posChar got bigger by consuming a

3115 // combining sequence.

3116 *nextPos = nPos;

3117 *nextChar = fText->char32At(nPos);

3118 }

3119

3120

3121

3122 int32_t RBBILineMonkey::next(int32_t startPos) {

3123 UErrorCode status = U_ZERO_ERROR;

3124 int32_t pos; // Index of the char following a potential break posi tion

3125 UChar32 thisChar; // Character at above position "pos"

3126

3127 int32_t prevPos; // Index of the char preceding a potential break posi tion

3128 UChar32 prevChar; // Character at above position. Note that prevChar

3129 // and thisChar may not be adjacent because combinin g

3130 // characters between them will be ignored.

3131

3132 int32_t prevPosX2; // Second previous character. Wider context for LB21 a.

3133 UChar32 prevCharX2;

3134

3135 int32_t nextPos; // Index of the next character following pos.

3136 // Usually skips over combining marks.

3137 int32_t nextCPPos; // Index of the code point following "pos."

3138 // May point to a combining mark.

3139 int32_t tPos; // temp value.

3140 UChar32 c;

3141

3142 if (U_FAILURE(deferredStatus)) {

3143 return -1;

3144 }

3145

3146 if (startPos >= fText->length()) {

3147 return -1;

3148 }

3149

3150

3151 // Initial values for loop. Loop will run the first time without finding br eaks,

3152 // while the invalid values shift out and the "thi s" and

3153 // "prev" positions are filled in with good values .

3154 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.

3155 thisChar = prevChar = prevCharX2 = 0;

3156 nextPos = nextCPPos = startPos;

3157

3158

3159 // Loop runs once per position in the test text, until a break position

3160 // is found.

3161 for (;;) {

3162 prevPosX2 = prevPos;

3163 prevCharX2 = prevChar;

3164

3165 prevPos = pos;

3166 prevChar = thisChar;

3167

3168 pos = nextPos;

3169 thisChar = fText->char32At(pos);

3170

3171 nextCPPos = fText->moveIndex32(pos, 1);

3172 nextPos = nextCPPos;

3173

3174 // Rule LB2 - Break at end of text.

3175 if (pos >= fText->length()) {

3176 break;

3177 }

3178

3179 // Rule LB 9 - adjust for combining sequences.

3180 // We do this one out-of-order because the adjustment does n ot change anything

3181 // that would match rules LB 3 - LB 6, but after the adjustm ent, LB 3-6 do need to

3182 // be applied.

3183 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);

3184 nextCPPos = nextPos = fText->moveIndex32(pos, 1);

3185 c = fText->char32At(nextPos);

3186 rule9Adjust(pos, &thisChar, &nextPos, &c);

3187

3188 // If the loop is still warming up - if we haven't shifted the initial

3189 // -1 positions out of prevPos yet - loop back to advance the

3190 // position in the input without any further looking for breaks.

3191 if (prevPos == -1) {

3192 continue;

3193 }

3194

3195 // LB 4 Always break after hard line breaks,

3196 if (fBK->contains(prevChar)) {

3197 break;

3198 }

3199

3200 // LB 5 Break after CR, LF, NL, but not inside CR LF

3201 if (prevChar == 0x0d && thisChar == 0x0a) {

3202 continue;

3203 }

3204 if (prevChar == 0x0d \|\|

3205 prevChar == 0x0a \|\|

3206 prevChar == 0x85) {

3207 break;

3208 }

3209

3210 // LB 6 Don't break before hard line breaks

3211 if (thisChar == 0x0d \|\| thisChar == 0x0a \|\| thisChar == 0x85 \|\|

3212 fBK->contains(thisChar)) {

3213 continue;

3214 }

3215

3216

3217 // LB 7 Don't break before spaces or zero-width space.

3218 if (fSP->contains(thisChar)) {

3219 continue;

3220 }

3221

3222 if (fZW->contains(thisChar)) {

3223 continue;

3224 }

3225

3226 // LB 8 Break after zero width space

3227 if (fZW->contains(prevChar)) {

3228 break;

3229 }

3230

3231 // LB 9, 10 Already done, at top of loop.

3232 //

3233

3234

3235 // LB 11 Do not break before or after WORD JOINER and related character s.

3236 // x WJ

3237 // WJ x

3238 //

3239 if (fWJ->contains(thisChar) \|\| fWJ->contains(prevChar)) {

3240 continue;

3241 }

3242

3243 // LB 12

3244 // GL x

3245 if (fGL->contains(prevChar)) {

3246 continue;

3247 }

3248

3249 // LB 12a

3250 // [^SP BA HY] x GL

3251 if (!(fSP->contains(prevChar) \|\|

3252 fBA->contains(prevChar) \|\|

3253 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {

3254 continue;

3255 }

3256

3257

3258

3259 // LB 13 Don't break before closings.

3260 // NU x CL, NU x CP and NU x IS are not matched here so that th ey will

3261 // fall into LB 17 and the more general number regular expression .

3262 //

3263 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) \|\|

3264 (!fNU->contains(prevChar) && fCP->contains(thisChar)) \|\|

3265 fEX->contains(thisChar) \|\|

3266 (!fNU->contains(prevChar) && fIS->contains(thisChar)) \|\|

3267 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {

3268 continue;

3269 }

3270

3271 // LB 14 Don't break after OP SP*

3272 // Scan backwards, checking for this sequence.

3273 // The OP char could include combining marks, so we actually check for

3274 // OP CM* SP*

3275 // Another Twist: The Rule 67 fixes may have changed a SP CM

3276 // sequence into a ID char, so before scanning back through spaces ,

3277 // verify that prevChar is indeed a space. The prevChar variable

3278 // may differ from fText[prevPos]

3279 tPos = prevPos;

3280 if (fSP->contains(prevChar)) {

3281 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {

3282 tPos=fText->moveIndex32(tPos, -1);

3283 }

3284 }

3285 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {

3286 tPos=fText->moveIndex32(tPos, -1);

3287 }

3288 if (fOP->contains(fText->char32At(tPos))) {

3289 continue;

3290 }

3291

3292

3293 // LB 15 QU SP* x OP

3294 if (fOP->contains(thisChar)) {

3295 // Scan backwards from prevChar to see if it is preceded by QU CM* S P*

3296 int tPos = prevPos;

3297 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {

3298 tPos = fText->moveIndex32(tPos, -1);

3299 }

3300 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {

3301 tPos = fText->moveIndex32(tPos, -1);

3302 }

3303 if (fQU->contains(fText->char32At(tPos))) {

3304 continue;

3305 }

3306 }

3307

3308

3309

3310 // LB 16 (CL \| CP) SP* x NS

3311 // Scan backwards for SP* CM* (CL \| CP)

3312 if (fNS->contains(thisChar)) {

3313 int tPos = prevPos;

3314 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {

3315 tPos = fText->moveIndex32(tPos, -1);

3316 }

3317 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {

3318 tPos = fText->moveIndex32(tPos, -1);

3319 }

3320 if (fCL->contains(fText->char32At(tPos)) \|\| fCP->contains(fText->cha r32At(tPos))) {

3321 continue;

3322 }

3323 }

3324

3325

3326 // LB 17 B2 SP* x B2

3327 if (fB2->contains(thisChar)) {

3328 // Scan backwards, checking for the B2 CM* SP* sequence.

3329 tPos = prevPos;

3330 if (fSP->contains(prevChar)) {

3331 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {

3332 tPos=fText->moveIndex32(tPos, -1);

3333 }

3334 }

3335 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {

3336 tPos=fText->moveIndex32(tPos, -1);

3337 }

3338 if (fB2->contains(fText->char32At(tPos))) {

3339 continue;

3340 }

3341 }

3342

3343

3344 // LB 18 break after space

3345 if (fSP->contains(prevChar)) {

3346 break;

3347 }

3348

3349 // LB 19

3350 // x QU

3351 // QU x

3352 if (fQU->contains(thisChar) \|\| fQU->contains(prevChar)) {

3353 continue;

3354 }

3355

3356 // LB 20 Break around a CB

3357 if (fCB->contains(thisChar) \|\| fCB->contains(prevChar)) {

3358 break;

3359 }

3360

3361 // LB 21

3362 if (fBA->contains(thisChar) \|\|

3363 fHY->contains(thisChar) \|\|

3364 fNS->contains(thisChar) \|\|

3365 fBB->contains(prevChar) ) {

3366 continue;

3367 }

3368

3369 // LB 21a

3370 // HL (HY \| BA) x

3371 if (fHL->contains(prevCharX2) &&

3372 (fHY->contains(prevChar) \|\| fBA->contains(prevChar))) {

3373 continue;

3374 }

3375

3376 // LB 21b

3377 // SY x HL

3378 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {

3379 continue;

3380 }

3381

3382 // LB 22

3383 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) \|\|

3384 (fEX->contains(prevChar) && fIN->contains(thisChar)) \|\|

3385 (fHL->contains(prevChar) && fIN->contains(thisChar)) \|\|

3386 (fID->contains(prevChar) && fIN->contains(thisChar)) \|\|

3387 (fIN->contains(prevChar) && fIN->contains(thisChar)) \|\|

3388 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {

3389 continue;

3390 }

3391

3392

3393 // LB 23 ID x PO

3394 // AL x NU

3395 // HL x NU

3396 // NU x AL

3397 if ((fID->contains(prevChar) && fPO->contains(thisChar)) \|\|

3398 (fAL->contains(prevChar) && fNU->contains(thisChar)) \|\|

3399 (fHL->contains(prevChar) && fNU->contains(thisChar)) \|\|

3400 (fNU->contains(prevChar) && fAL->contains(thisChar)) \|\|

3401 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {

3402 continue;

3403 }

3404

3405 // LB 24 Do not break between prefix and letters or ideographs.

3406 // PR x ID

3407 // PR x (AL \| HL)

3408 // PO x (AL \| HL)

3409 if ((fPR->contains(prevChar) && fID->contains(thisChar)) \|\|

3410 (fPR->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contain s(thisChar))) \|\|

3411 (fPO->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contain s(thisChar)))) {

3412 continue;

3413 }

3414

3415

3416

3417 // LB 25 Numbers

3418 if (fNumberMatcher->lookingAt(prevPos, status)) {

3419 if (U_FAILURE(status)) {

3420 break;

3421 }

3422 // Matched a number. But could have been just a single digit, which would

3423 // not represent a "no break here" between prevChar and thisChar

3424 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first ch ar following num

3425 if (numEndIdx > pos) {

3426 // Number match includes at least our two chars being checked

3427 if (numEndIdx > nextPos) {

3428 // Number match includes additional chars. Update pos and n extPos

3429 // so that next loop iteration will continue at the end of the number,

3430 // checking for breaks between last char in number & whate ver follows.

3431 pos = nextPos = numEndIdx;

3432 do {

3433 pos = fText->moveIndex32(pos, -1);

3434 thisChar = fText->char32At(pos);

3435 } while (fCM->contains(thisChar));

3436 }

3437 continue;

3438 }

3439 }

3440

3441

3442 // LB 26 Do not break a Korean syllable.

3443 if (fJL->contains(prevChar) && (fJL->contains(thisChar) \|\|

3444 fJV->contains(thisChar) \|\|

3445 fH2->contains(thisChar) \|\|

3446 fH3->contains(thisChar))) {

3447 continue;

3448 }

3449

3450 if ((fJV->contains(prevChar) \|\| fH2->contains(prevChar)) &&

3451 (fJV->contains(thisChar) \|\| fJT->contains(thisChar))) {

3452 continue;

3453 }

3454

3455 if ((fJT->contains(prevChar) \|\| fH3->contains(prevChar)) &&

3456 fJT->contains(thisChar)) {

3457 continue;

3458 }

3459

3460 // LB 27 Treat a Korean Syllable Block the same as ID.

3461 if ((fJL->contains(prevChar) \|\| fJV->contains(prevChar) \|\|

3462 fJT->contains(prevChar) \|\| fH2->contains(prevChar) \|\| fH3->contains( prevChar)) &&

3463 fIN->contains(thisChar)) {

3464 continue;

3465 }

3466 if ((fJL->contains(prevChar) \|\| fJV->contains(prevChar) \|\|

3467 fJT->contains(prevChar) \|\| fH2->contains(prevChar) \|\| fH3->contains( prevChar)) &&

3468 fPO->contains(thisChar)) {

3469 continue;

3470 }

3471 if (fPR->contains(prevChar) && (fJL->contains(thisChar) \|\| fJV->contains (thisChar) \|\|

3472 fJT->contains(thisChar) \|\| fH2->contains(thisChar) \|\| fH3->contains( thisChar))) {

3473 continue;

3474 }

3475

3476

3477

3478 // LB 28 Do not break between alphabetics ("at").

3479 if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar)) && (fAL->contai ns(thisChar) \|\| fHL->contains(thisChar))) {

3480 continue;

3481 }

3482

3483 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g .").

3484 if (fIS->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contains (thisChar))) {

3485 continue;

3486 }

3487

3488 // LB 30 Do not break between letters, numbers, or ordinary symbols a nd opening or closing punctuation.

3489 // (AL \| NU) x OP

3490 // CP x (AL \| NU)

3491 if ((fAL->contains(prevChar) \|\| fHL->contains(prevChar) \|\| fNU->contains (prevChar)) && fOP->contains(thisChar)) {

3492 continue;

3493 }

3494 if (fCP->contains(prevChar) && (fAL->contains(thisChar) \|\| fHL->contains (thisChar) \|\| fNU->contains(thisChar))) {

3495 continue;

3496 }

3497

3498 // LB30a Do not break between regional indicators.

3499 // RI x RI

3500 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {

3501 continue;

3502 }

3503

3504 // LB 31 Break everywhere else

3505 break;

3506

3507 }

3508

3509 return pos;

3510 }

3511

3512

3513 UVector *RBBILineMonkey::charClasses() {

3514 return fSets;

3515 }

3516

3517

3518 RBBILineMonkey::~RBBILineMonkey() {

3519 delete fSets;

3520

3521 delete fBK;

3522 delete fCR;

3523 delete fLF;

3524 delete fCM;

3525 delete fNL;

3526 delete fWJ;

3527 delete fZW;

3528 delete fGL;

3529 delete fCB;

3530 delete fSP;

3531 delete fB2;

3532 delete fBA;

3533 delete fBB;

3534 delete fHY;

3535 delete fH2;

3536 delete fH3;

3537 delete fCL;

3538 delete fCP;

3539 delete fEX;

3540 delete fIN;

3541 delete fJL;

3542 delete fJV;

3543 delete fJT;

3544 delete fNS;

3545 delete fOP;

3546 delete fQU;

3547 delete fIS;

3548 delete fNU;

3549 delete fPO;

3550 delete fPR;

3551 delete fSY;

3552 delete fAI;

3553 delete fAL;

3554 delete fCJ;

3555 delete fHL;

3556 delete fID;

3557 delete fRI;

3558 delete fSA;

3559 delete fSG;

3560 delete fXX;

3561

3562 delete fCharBI;

3563 delete fNumberMatcher;

3564 }

3565

3566

//-------------------------------------------------------------------------------------------
//
//  TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//
//-------------------------------------------------------------------------------------------

3581

3582 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d efaultVal) {

3583 int32_t val = defaultVal;

3584 name.append(" = (-?\\d+)");

3585 UErrorCode status = U_ZERO_ERROR;

3586 RegexMatcher m(name, params, 0, status);

3587 if (m.find()) {

3588 // The param exists. Convert the string to an int.

3589 char valString[100];

3590 int32_t paramLength = m.end(1, status) - m.start(1, status);

3591 if (paramLength >= (int32_t)(sizeof(valString)-1)) {

3592 paramLength = (int32_t)(sizeof(valString)-2);

3593 }

3594 params.extract(m.start(1, status), paramLength, valString, sizeof(valStr ing));

3595 val = strtol(valString, NULL, 10);

3596

3597 // Delete this parameter from the params string.

3598 m.reset();

3599 params = m.replaceFirst("", status);

3600 }

3601 U_ASSERT(U_SUCCESS(status));

3602 return val;

3603 }

3604 #endif

3605

3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

3607 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,

3608 BreakIterator *bi,

3609 int expected[],

3610 int expectedcount)

3611 {

3612 int count = 0;

3613 int i = 0;

3614 int forward[50];

3615 bi->setText(ustr);

3616 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {

3617 forward[count] = i;

3618 if (count < expectedcount && expected[count] != i) {

3619 test->errln("break forward test failed: expected %d but got %d",

3620 expected[count], i);

3621 break;

3622 }

3623 count ++;

3624 }

3625 if (count != expectedcount) {

3626 printStringBreaks(ustr, expected, expectedcount);

3627 test->errln("break forward test failed: missed %d match",

3628 expectedcount - count);

3629 return;

3630 }

3631 // testing boundaries

3632 for (i = 1; i < expectedcount; i ++) {

3633 int j = expected[i - 1];

3634 if (!bi->isBoundary(j)) {

3635 printStringBreaks(ustr, expected, expectedcount);

3636 test->errln("isBoundary() failed. Expected boundary at position %d" , j);

3637 return;

3638 }

3639 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {

3640 if (bi->isBoundary(j)) {

3641 printStringBreaks(ustr, expected, expectedcount);

3642 test->errln("isBoundary() failed. Not expecting boundary at pos ition %d", j);

3643 return;

3644 }

3645 }

3646 }

3647

3648 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

3649 count --;

3650 if (forward[count] != i) {

3651 printStringBreaks(ustr, expected, expectedcount);

3652 test->errln("happy break test previous() failed: expected %d but got %d",

3653 forward[count], i);

3654 break;

3655 }

3656 }

3657 if (count != 0) {

3658 printStringBreaks(ustr, expected, expectedcount);

3659 test->errln("break test previous() failed: missed a match");

3660 return;

3661 }

3662

3663 // testing preceding

3664 for (i = 0; i < expectedcount - 1; i ++) {

3665 // int j = expected[i] + 1;

3666 int j = ustr.moveIndex32(expected[i], 1);

3667 for (; j <= expected[i + 1]; j ++) {

3668 if (bi->preceding(j) != expected[i]) {

3669 printStringBreaks(ustr, expected, expectedcount);

3670 test->errln("preceding(): Not expecting boundary at position %d" , j);

3671 return;

3672 }

3673 }

3674 }

3675 }

3676 #endif

3677

3678 void RBBITest::TestWordBreaks(void)

3679 {

3680 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

3681

3682 Locale locale("en");

3683 UErrorCode status = U_ZERO_ERROR;

3684 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu s);

3685 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

3686 // Replaced any C+J characters in a row with a random sequence of characters

3687 // of the same length to make our C+J segmentation not get in the way.

3688 static const char *strlist[] =

3689 {

3690 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",

3691 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040 \\u003b",

3692 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e 0061\\u003a",

3693 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",

3694 "\\uac00\\u3588\\u009c\\u0953\\u194b",

3695 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

3696 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e" ,

3697 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",

3698 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

3699 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

3700 "\\u2027\\U000e0067\\u0a47\\u00b7",

3701 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

3702 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

3703 "\\u0589\\U000e006e\\u0a42\\U000104a5",

3704 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",

3705 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

3706 "\\u0027\\u11af\\U000e0057\\u0602",

3707 "\\U0001d7f2\\U000e007\\u0004\\u0589",

3708 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b ",

3709 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

3710 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

3711 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

3712 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",

3713 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3714 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3715 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3716 "\\u18f4\\U000e0049\\u20e7\\u2027",

3717 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3718 "\\ua183\\u102d\\u0bec\\u003a",

3719 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3720 "\\u003a\\u0e57\\u0fad\\u002e",

3721 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",

3722 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",

3723 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",

3724 "\\u003a\\u0664\\u00b7\\u1fba",

3725 "\\u003b\\u0027\\u00b7\\u47a3",

3726 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",

3727 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u 0e51\\u1058\\U000e0058\\u00b7\\u0673",

3728 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

3729 };

3730 int loop;

3731 if (U_FAILURE(status)) {

3732 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

3733 return;

3734 }

3735 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

3736 // printf("looping %d\n", loop);

3737 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);

3738 // RBBICharMonkey monkey;

3739 RBBIWordMonkey monkey;

3740

3741 int expected[50];

3742 int expectedcount = 0;

3743

3744 monkey.setText(ustr);

3745 int i;

3746 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

3747 expected[expectedcount ++] = i;

3748 }

3749

3750 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

3751 }

3752 delete bi;

3753 #endif

3754 }

3755

3756 void RBBITest::TestWordBoundary(void)

3757 {

3758 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>

3759 Locale locale("en");

3760 UErrorCode status = U_ZERO_ERROR;

3761 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu s);

3762 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

3763 UChar str[50];

3764 static const char *strlist[] =

3765 {

3766 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",

3767 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",

3768 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",

3769 "\\u2027\\U000e0067\\u0a47\\u00b7",

3770 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",

3771 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",

3772 "\\u0589\\U000e006e\\u0a42\\U000104a5",

3773 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",

3774 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

3775 "\\u0027\\u11af\\U000e0057\\u0602",

3776 "\\U0001d7f2\\U000e007\\u0004\\u0589",

3777 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b ",

3778 "\\U0001d7f2\\U000e007d\\u0004\\u0589",

3779 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

3780 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

3781 "\\U000e0065\\u302c\\u09ee\\U000e0068",

3782 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

3783 "\\u0233\\U000e0020\\u0a69\\u0d6a",

3784 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

3785 "\\u58f4\\U000e0049\\u20e7\\u2027",

3786 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

3787 "\\ua183\\u102d\\u0bec\\u003a",

3788 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",

3789 "\\u003a\\u0e57\\u0fad\\u002e",

3790 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",

3791 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",

3792 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019" ,

3793 "\\u003a\\u0664\\u00b7\\u1fba",

3794 "\\u003b\\u0027\\u00b7\\u47a3",

3795 };

3796 int loop;

3797 if (U_FAILURE(status)) {

3798 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

3799 return;

3800 }

3801 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

3802 // printf("looping %d\n", loop);

3803 u_unescape(strlist[loop], str, 20);

3804 UnicodeString ustr(str);

3805 int forward[50];

3806 int count = 0;

3807

3808 bi->setText(ustr);

3809 int prev = 0;

3810 int i;

3811 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {

3812 forward[count ++] = i;

3813 if (i > prev) {

3814 int j;

3815 for (j = prev + 1; j < i; j ++) {

3816 if (bi->isBoundary(j)) {

3817 printStringBreaks(ustr, forward, count);

3818 errln("happy boundary test failed: expected %d not a bou ndary",

3819 j);

3820 return;

3821 }

3822 }

3823 }

3824 if (!bi->isBoundary(i)) {

3825 printStringBreaks(ustr, forward, count);

3826 errln("happy boundary test failed: expected %d a boundary",

3827 i);

3828 return;

3829 }

3830 prev = i;

3831 }

3832 }

3833 delete bi;

3834 }

3835

3836 void RBBITest::TestLineBreaks(void)

3837 {

3838 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

3839 Locale locale("en");

3840 UErrorCode status = U_ZERO_ERROR;

3841 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);

3842 const int32_t STRSIZE = 50;

3843 UChar str[STRSIZE];

3844 static const char *strlist[] =

3845 {

3846 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",

3847 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"

3848 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",

3849 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"

3850 "u2014\\U000e0105\\u118c\\u000a\\u07f8",

3851 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",

3852 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a \\U000e0123",

3853 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u1 7a4",

3854 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",

3855 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2 009\\u000a\\u06f7\\u02cc\\u1019\\u2060",

3856 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e 007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",

3857 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7 \\u0f3b\\u002f",

3858 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c \\u002f\\u17b1",

3859 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u0 60d\\u02c8\\ua4e8\\u002f\\u17d5",

3860 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",

3861 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc ",

3862 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",

3863 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020 \\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",

3864 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d \\u02c8\\u003b",

3865 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u2 9fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",

3866 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d \\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",

3867 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uf f09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",

3868 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u0 02d\\u09cc\\u1782\\u000d\\uff6f\\u0025",

3869 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0 f0c\\u0085\\u2763",

3870 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a \\u3d0a\\ufe57\\u2035\\u2028\\u2029",

3871 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc \\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",

3872 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",

3873 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",

3874 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",

3875 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",

3876 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",

3877 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",

3878 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",

3879 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",

3880 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a \\u180e\\u2009\\u3111",

3881 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",

3882 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",

3883 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",

3884 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",

3885 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",

3886 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",

3887 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",

3888 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"

3889 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"

3890 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",

3891 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",

3892 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",

3893 };

3894 int loop;

3895 TEST_ASSERT_SUCCESS(status);

3896 if (U_FAILURE(status)) {

3897 return;

3898 }

3899 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

3900 // printf("looping %d\n", loop);

3901 int32_t t = u_unescape(strlist[loop], str, STRSIZE);

3902 if (t >= STRSIZE) {

3903 TEST_ASSERT(FALSE);

3904 continue;

3905 }

3906

3907

3908 UnicodeString ustr(str);

3909 RBBILineMonkey monkey;

3910 if (U_FAILURE(monkey.deferredStatus)) {

3911 continue;

3912 }

3913

3914 const int EXPECTEDSIZE = 50;

3915 int expected[EXPECTEDSIZE];

3916 int expectedcount = 0;

3917

3918 monkey.setText(ustr);

3919 int i;

3920 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

3921 if (expectedcount >= EXPECTEDSIZE) {

3922 TEST_ASSERT(expectedcount < EXPECTEDSIZE);

3923 return;

3924 }

3925 expected[expectedcount ++] = i;

3926 }

3927

3928 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

3929 }

3930 delete bi;

3931 #endif

3932 }

3933

3934 void RBBITest::TestSentBreaks(void)

3935 {

3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

3937 Locale locale("en");

3938 UErrorCode status = U_ZERO_ERROR;

3939 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);

3940 UChar str[200];

3941 static const char *strlist[] =

3942 {

3943 "Now\ris\nthe\r\ntime\n\rfor\r\r",

3944 "This\n",

3945 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $ 20,00,000.",

3946 "\"Sentence ending with a quote.\" Bye.",

3947 " (This is it). Testing the sentence iterator. \"This isn't it.\"",

3948 "Hi! This is a simple sample sentence. (This is it.) This is a simple sampl e sentence. \"This isn't it.\"",

3949 "Hi! This is a simple sample sentence. It does not have to make any sense a s you can see. ",

3950 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",

3951 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",

3952 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",

3953 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a \\ufe56\\ufe52"

3954 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u00 2e\\ua6ab\\u104a"

3955 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u 5f61\\u202f"

3956 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",

3957 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U 0001d171"

3958 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc7 2\\u0030"

3959 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180 e\\u000b"

3960 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\ u202b"

3961 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\ u2e05"

3962 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"

3963 };

3964 int loop;

3965 if (U_FAILURE(status)) {

3966 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s tatus));

3967 return;

3968 }

3969 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {

3970 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));

3971 UnicodeString ustr(str);

3972

3973 RBBISentMonkey monkey;

3974 if (U_FAILURE(monkey.deferredStatus)) {

3975 continue;

3976 }

3977

3978 const int EXPECTEDSIZE = 50;

3979 int expected[EXPECTEDSIZE];

3980 int expectedcount = 0;

3981

3982 monkey.setText(ustr);

3983 int i;

3984 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {

3985 if (expectedcount >= EXPECTEDSIZE) {

3986 TEST_ASSERT(expectedcount < EXPECTEDSIZE);

3987 return;

3988 }

3989 expected[expectedcount ++] = i;

3990 }

3991

3992 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);

3993 }

3994 delete bi;

3995 #endif

3996 }

3997

3998 void RBBITest::TestMonkey(char *params) {

3999 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

4000

4001 UErrorCode status = U_ZERO_ERROR;

4002 int32_t loopCount = 500;

4003 int32_t seed = 1;

4004 UnicodeString breakType = "all";

4005 Locale locale("en");

4006 UBool useUText = FALSE;

4007

4008 if (quick == FALSE) {

4009 loopCount = 10000;

4010 }

4011

4012 if (params) {

4013 UnicodeString p(params);

4014 loopCount = getIntParam("loop", p, loopCount);

4015 seed = getIntParam("seed", p, seed);

4016

4017 RegexMatcher m(" type = (char\|word\|line\|sent\|title) ", p, 0, status) ;

4018 if (m.find()) {

4019 breakType = m.group(1, status);

4020 m.reset();

4021 p = m.replaceFirst("", status);

4022 }

4023

4024 RegexMatcher u(" *utext", p, 0, status);

4025 if (u.find()) {

4026 useUText = TRUE;

4027 u.reset();

4028 p = u.replaceFirst("", status);

4029 }

4030

4031

4032 // m.reset(p);

4033 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {

4034 // Each option is stripped out of the option string as it is process ed.

4035 // All options have been checked. The option string should have bee n completely emptied..

4036 char buf[100];

4037 p.extract(buf, sizeof(buf), NULL, status);

4038 buf[sizeof(buf)-1] = 0;

4039 errln("Unrecognized or extra parameter: %s\n", buf);

4040 return;

4041 }

4042

4043 }

4044

4045 if (breakType == "char" \|\| breakType == "all") {

4046 RBBICharMonkey m;

4047 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, stat us);

4048 if (U_SUCCESS(status)) {

4049 RunMonkey(bi, m, "char", seed, loopCount, useUText);

4050 if (breakType == "all" && useUText==FALSE) {

4051 // Also run a quick test with UText when "all" is specified

4052 RunMonkey(bi, m, "char", seed, loopCount, TRUE);

4053 }

4054 }

4055 else {

4056 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));

4057 }

4058 delete bi;

4059 }

4060

4061 if (breakType == "word" \|\| breakType == "all") {

4062 logln("Word Break Monkey Test");

4063 RBBIWordMonkey m;

4064 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

4065 if (U_SUCCESS(status)) {

4066 RunMonkey(bi, m, "word", seed, loopCount, useUText);

4067 }

4068 else {

4069 errcheckln(status, "Creation of word break iterator failed %s", u_er rorName(status));

4070 }

4071 delete bi;

4072 }

4073

4074 if (breakType == "line" \|\| breakType == "all") {

4075 logln("Line Break Monkey Test");

4076 RBBILineMonkey m;

4077 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);

4078 if (loopCount >= 10) {

4079 loopCount = loopCount / 5; // Line break runs slower than the othe rs.

4080 }

4081 if (U_SUCCESS(status)) {

4082 RunMonkey(bi, m, "line", seed, loopCount, useUText);

4083 }

4084 else {

4085 errcheckln(status, "Creation of line break iterator failed %s", u_er rorName(status));

4086 }

4087 delete bi;

4088 }

4089

4090 if (breakType == "sent" \|\| breakType == "all" ) {

4091 logln("Sentence Break Monkey Test");

4092 RBBISentMonkey m;

4093 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, statu s);

4094 if (loopCount >= 10) {

4095 loopCount = loopCount / 10; // Sentence runs slower than the other break types

4096 }

4097 if (U_SUCCESS(status)) {

4098 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);

4099 }

4100 else {

4101 errcheckln(status, "Creation of line break iterator failed %s", u_er rorName(status));

4102 }

4103 delete bi;

4104 }

4105

4106 #endif

4107 }

4108

4109 //

4110 // Run a RBBI monkey test. Common routine, for all break iterator types.

4111 // Parameters:

4112 // bi - the break iterator to use

4113 // mk - MonkeyKind, abstraction for obtaining expected results

4114 // name - Name of test (char, word, etc.) for use in error messages

4115 // seed - Seed for starting random number generator (parameter from use r)

4116 // numIterations

4117 //

4118 void RBBITest::RunMonkey(BreakIterator bi, RBBIMonkeyKind &mk, const char name , uint32_t seed,

4119 int32_t numIterations, UBool useUText) {

4120

4121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

4122

4123 const int32_t TESTSTRINGLEN = 500;

4124 UnicodeString testText;

4125 int32_t numCharClasses;

4126 UVector *chClasses;

4127 int expected[TESTSTRINGLEN*2 + 1];

4128 int expectedCount = 0;

4129 char expectedBreaks[TESTSTRINGLEN*2 + 1];

4130 char forwardBreaks[TESTSTRINGLEN*2 + 1];

4131 char reverseBreaks[TESTSTRINGLEN*2+1];

4132 char isBoundaryBreaks[TESTSTRINGLEN*2+1];

4133 char followingBreaks[TESTSTRINGLEN*2+1];

4134 char precedingBreaks[TESTSTRINGLEN*2+1];

4135 int i;

4136 int loopCount = 0;

4137

4138 m_seed = seed;

4139

4140 numCharClasses = mk.charClasses()->size();

4141 chClasses = mk.charClasses();

4142

4143 // Check for errors that occured during the construction of the MonkeyKind o bject.

4144 // Can't report them where they occured because errln() is a method coming from intlTest,

4145 // and is not visible outside of RBBITest :-(

4146 if (U_FAILURE(mk.deferredStatus)) {

4147 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk. deferredStatus));

4148 return;

4149 }

4150

4151 // Verify that the character classes all have at least one member.

4152 for (i=0; i<numCharClasses; i++) {

4153 UnicodeSet s = (UnicodeSet )chClasses->elementAt(i);

4154 if (s == NULL \|\| s->size() == 0) {

4155 errln("Character Class #%d is null or of zero size.", i);

4156 return;

4157 }

4158 }

4159

4160 while (loopCount < numIterations \|\| numIterations == -1) {

4161 if (numIterations == -1 && loopCount % 10 == 0) {

4162 // If test is running in an infinite loop, display a periodic tic so

4163 // we can tell that it is making progress.

4164 fprintf(stderr, ".");

4165 }

4166 // Save current random number seed, so that we can recreate the random n umbers

4167 // for this loop iteration in event of an error.

4168 seed = m_seed;

4169

4170 // Populate a test string with data.

4171 testText.truncate(0);

4172 for (i=0; i<TESTSTRINGLEN; i++) {

4173 int32_t aClassNum = m_rand() % numCharClasses;

4174 UnicodeSet classSet = (UnicodeSet )chClasses->elementAt(aClassNum) ;

4175 int32_t charIdx = m_rand() % classSet->size();

4176 UChar32 c = classSet->charAt(charIdx);

4177 if (c < 0) { // TODO: deal with sets containing strings.

4178 errln("c < 0");

4179 break;

4180 }

4181 testText.append(c);

4182 }

4183

4184 // Calculate the expected results for this test string.

4185 mk.setText(testText);

4186 memset(expectedBreaks, 0, sizeof(expectedBreaks));

4187 expectedBreaks[0] = 1;

4188 int32_t breakPos = 0;

4189 expectedCount = 0;

4190 for (;;) {

4191 breakPos = mk.next(breakPos);

4192 if (breakPos == -1) {

4193 break;

4194 }

4195 if (breakPos > testText.length()) {

4196 errln("breakPos > testText.length()");

4197 }

4198 expectedBreaks[breakPos] = 1;

4199 U_ASSERT(expectedCount<testText.length());

4200 expected[expectedCount ++] = breakPos;

4201 (void)expected; // Set but not used warning.

4202 // TODO (andy): check it out.

4203 }

4204

4205 // Find the break positions using forward iteration

4206 memset(forwardBreaks, 0, sizeof(forwardBreaks));

4207 if (useUText) {

4208 UErrorCode status = U_ZERO_ERROR;

4209 UText *testUText = utext_openReplaceable(NULL, &testText, &status);

4210 // testUText = utext_openUnicodeString(testUText, &testText, &status );

4211 bi->setText(testUText, status);

4212 TEST_ASSERT_SUCCESS(status);

4213 utext_close(testUText); // The break iterator does a shallow clone of the UText

4214 // This UText can be closed immediately, so long as the

4215 // testText string continues to exist.

4216 } else {

4217 bi->setText(testText);

4218 }

4219

4220 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {

4221 if (i < 0 \|\| i > testText.length()) {

4222 errln("%s break monkey test: Out of range value returned by brea kIterator::next()", name);

4223 break;

4224 }

4225 forwardBreaks[i] = 1;

4226 }

4227

4228 // Find the break positions using reverse iteration

4229 memset(reverseBreaks, 0, sizeof(reverseBreaks));

4230 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {

4231 if (i < 0 \|\| i > testText.length()) {

4232 errln("%s break monkey test: Out of range value returned by brea kIterator::next()", name);

4233 break;

4234 }

4235 reverseBreaks[i] = 1;

4236 }

4237

4238 // Find the break positions using isBoundary() tests.

4239 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));

4240 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());

4241 for (i=0; i<=testText.length(); i++) {

4242 isBoundaryBreaks[i] = bi->isBoundary(i);

4243 }

4244

4245

4246 // Find the break positions using the following() function.

4247 // printf(".");

4248 memset(followingBreaks, 0, sizeof(followingBreaks));

4249 int32_t lastBreakPos = 0;

4250 followingBreaks[0] = 1;

4251 for (i=0; i<testText.length(); i++) {

4252 breakPos = bi->following(i);

4253 if (breakPos <= i \|\|

4254 breakPos < lastBreakPos \|\|

4255 breakPos > testText.length() \|\|

4256 (breakPos > lastBreakPos && lastBreakPos > i)) {

4257 errln("%s break monkey test: "

4258 "Out of range value returned by BreakIterator::following().\ n"

4259 "Random seed=%d index=%d; following returned %d; lastb reak=%d",

4260 name, seed, i, breakPos, lastBreakPos);

4261 break;

4262 }

4263 followingBreaks[breakPos] = 1;

4264 lastBreakPos = breakPos;

4265 }

4266

4267 // Find the break positions using the preceding() function.

4268 memset(precedingBreaks, 0, sizeof(precedingBreaks));

4269 lastBreakPos = testText.length();

4270 precedingBreaks[testText.length()] = 1;

4271 for (i=testText.length(); i>0; i--) {

4272 breakPos = bi->preceding(i);

4273 if (breakPos >= i \|\|

4274 breakPos > lastBreakPos \|\|

4275 (breakPos < 0 && testText.getChar32Start(i)>0) \|\|

4276 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Sta rt(i)) ) {

4277 errln("%s break monkey test: "

4278 "Out of range value returned by BreakIterator::preceding().\ n"

4279 "index=%d; prev returned %d; lastBreak=%d" ,

4280 name, i, breakPos, lastBreakPos);

4281 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks) ) {

4282 precedingBreaks[i] = 2; // Forces an error.

4283 }

4284 } else {

4285 if (breakPos >= 0) {

4286 precedingBreaks[breakPos] = 1;

4287 }

4288 lastBreakPos = breakPos;

4289 }

4290 }

4291

4292 // Compare the expected and actual results.

4293 for (i=0; i<=testText.length(); i++) {

4294 const char *errorType = NULL;

4295 if (forwardBreaks[i] != expectedBreaks[i]) {

4296 errorType = "next()";

4297 } else if (reverseBreaks[i] != forwardBreaks[i]) {

4298 errorType = "previous()";

4299 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {

4300 errorType = "isBoundary()";

4301 } else if (followingBreaks[i] != expectedBreaks[i]) {

4302 errorType = "following()";

4303 } else if (precedingBreaks[i] != expectedBreaks[i]) {

4304 errorType = "preceding()";

4305 }

4306

4307

4308 if (errorType != NULL) {

4309 // Format a range of the test text that includes the failure as

4310 // a data item that can be included in the rbbi test data file.

4311

4312 // Start of the range is the last point where expected and actua l results

4313 // both agreed that there was a break position.

4314 int startContext = i;

4315 int32_t count = 0;

4316 for (;;) {

4317 if (startContext==0) { break; }

4318 startContext --;

4319 if (expectedBreaks[startContext] != 0) {

4320 if (count == 2) break;

4321 count ++;

4322 }

4323 }

4324

4325 // End of range is two expected breaks past the start position.

4326 int endContext = i + 1;

4327 int ci;

4328 for (ci=0; ci<2; ci++) { // Number of items to include in error text.

4329 for (;;) {

4330 if (endContext >= testText.length()) {break;}

4331 if (expectedBreaks[endContext-1] != 0) {

4332 if (count == 0) break;

4333 count --;

4334 }

4335 endContext ++;

4336 }

4337 }

4338

4339 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</dat a>"

4340 UnicodeString errorText = "<data>";

4341 /***if (strcmp(errorType, "next()") == 0) {

4342 startContext = 0;

4343 endContext = testText.length();

4344

4345 printStringBreaks(testText, expected, expectedCount);

4346 }***/

4347

4348 for (ci=startContext; ci<endContext;) {

4349 UnicodeString hexChars("0123456789abcdef");

4350 UChar32 c;

4351 int bn;

4352 c = testText.char32At(ci);

4353 if (ci == i) {

4354 // This is the location of the error.

4355 errorText.append("<?>");

4356 } else if (expectedBreaks[ci] != 0) {

4357 // This a non-error expected break position.

4358 errorText.append("\\");

4359 }

4360 if (c < 0x10000) {

4361 errorText.append("\\u");

4362 for (bn=12; bn>=0; bn-=4) {

4363 errorText.append(hexChars.charAt((c>>bn)&0xf));

4364 }

4365 } else {

4366 errorText.append("\\U");

4367 for (bn=28; bn>=0; bn-=4) {

4368 errorText.append(hexChars.charAt((c>>bn)&0xf));

4369 }

4370 }

4371 ci = testText.moveIndex32(ci, 1);

4372 }

4373 errorText.append("\\");

4374 errorText.append("</data>\n");

4375

4376 // Output the error

4377 char charErrorTxt[500];

4378 UErrorCode status = U_ZERO_ERROR;

4379 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, stat us);

4380 charErrorTxt[sizeof(charErrorTxt)-1] = 0;

4381 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, stat us);

4382

4383 errln("%s break monkey test error [%s]. %s. Operation = %s; Ran dom seed = %d; buf Idx = %d\n%s",

4384 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),

4385 errorType, seed, i, charErrorTxt);

4386 break;

4387 }

4388 }

4389

4390 loopCount++;

4391 }

4392 #endif

4393 }

4394

4395

4396 // Bug 5532. UTF-8 based UText fails in dictionary code.

4397 // This test checks the initial patch,

4398 // which is to just keep it from crashing. Correct word boundaries

4399 // await a proper fix to the dictionary code.

4400 //

4401 void RBBITest::TestBug5532(void) {

4402 // Text includes a mixture of Thai and Latin.

4403 const unsigned char utf8Data[] = {

4404 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,

4405 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,

4406 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,

4407 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,

4408 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,

4409 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,

4410 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,

4411 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,

4412 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,

4413 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,

4414 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};

4415

4416 UErrorCode status = U_ZERO_ERROR;

4417 UText utext=UTEXT_INITIALIZER;

4418 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);

4419 TEST_ASSERT_SUCCESS(status);

4420

4421 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);

4422 TEST_ASSERT_SUCCESS(status);

4423 if (U_SUCCESS(status)) {

4424 bi->setText(&utext, status);

4425 TEST_ASSERT_SUCCESS(status);

4426

4427 int32_t breakCount = 0;

4428 int32_t previousBreak = -1;

4429 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {

4430 // For now, just make sure that the break iterator doesn't hang.

4431 TEST_ASSERT(previousBreak < bi->current());

4432 previousBreak = bi->current();

4433 }

4434 TEST_ASSERT(breakCount > 0);

4435 }

4436 delete bi;

4437 utext_close(&utext);

4438 }

4439

4440

4441 void RBBITest::TestBug9983(void) {

4442 UnicodeString text = UnicodeString("\\u002A" // * Other

4443 "\\uFF65" // Other

4444 "\\u309C" // Katakana

4445 "\\uFF9F" // Extend

4446 "\\uFF65" // Other

4447 "\\u0020" // Other

4448 "\\u0000").unescape();

4449

4450 UErrorCode status = U_ZERO_ERROR;

4451 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakItera tor *>(

4452 BreakIterator::createWordInstance(Locale::getRoot(), status)));

4453 TEST_ASSERT_SUCCESS(status);

4454 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreak Iterator *>(

4455 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));

4456 TEST_ASSERT_SUCCESS(status);

4457 if (U_FAILURE(status)) {

4458 return;

4459 }

4460 int32_t offset, rstatus, iterationCount;

4461

4462 brkiter->setText(text);

4463 brkiter->last();

4464 iterationCount = 0;

4465 while ( (offset = brkiter->previous()) != UBRK_DONE ) {

4466 iterationCount++;

4467 rstatus = brkiter->getRuleStatus();

4468 (void)rstatus; // Suppress set but not used warning.

4469 if (iterationCount >= 10) {

4470 break;

4471 }

4472 }

4473 TEST_ASSERT(iterationCount == 6);

4474

4475 brkiterPOSIX->setText(text);

4476 brkiterPOSIX->last();

4477 iterationCount = 0;

4478 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {

4479 iterationCount++;

4480 rstatus = brkiterPOSIX->getRuleStatus();

4481 (void)rstatus; // Suppress set but not used warning.

4482 if (iterationCount >= 10) {

4483 break;

4484 }

4485 }

4486 TEST_ASSERT(iterationCount == 6);

4487 }

4488

4489

4490 //

4491 // TestDebug - A place-holder test for debugging purposes.

4492 // For putting in fragments of other tests that can be invoked

4493 // for tracing without a lot of unwanted extra stuff happening .

4494 //

4495 void RBBITest::TestDebug(void) {

4496 #if 0

4497 UErrorCode status = U_ZERO_ERROR;

4498 int pos = 0;

4499 int ruleStatus = 0;

4500

4501 RuleBasedBreakIterator* bi =

4502 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::ge tDefault(), status);

4503 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Lo cale("th"), status);

4504 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::g etDefault(), status);

4505 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002 e\\u0046\\ufd3f\\u000a\\u002e");

4506 // UnicodeString s("Aaa. Bcd");

4507 s = s.unescape();

4508 bi->setText(s);

4509 UBool r = bi->isBoundary(8);

4510 printf("%s", r?"true":"false");

4511 return;

4512 pos = bi->last();

4513 do {

4514 // ruleStatus = bi->getRuleStatus();

4515 printf("%d\t%d\n", pos, ruleStatus);

4516 pos = bi->previous();

4517 } while (pos != BreakIterator::DONE);

4518 #endif

4519 }

4520

4521 void RBBITest::TestProperties() {

4522 UErrorCode errorCode = U_ZERO_ERROR;

4523 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);

4524 if (!prependSet.isEmpty()) {

4525 errln(

4526 "[:GCB=Prepend:] is not empty any more. "

4527 "Uncomment relevant lines in source/data/brkitr/char.txt and "

4528 "change this test to the opposite condition.");

4529 }

4530 }

4531

4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

OLD	NEW

« no previous file with comments | « source/test/intltest/rbbitst.h ('k') | source/test/intltest/regcoll.h » ('j') | no next file with comments »