source/test/intltest/regextst.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/intltest/regextst.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /********************************************************************

2 * COPYRIGHT:

3 * Copyright (c) 2002-2015, International Business Machines Corporation and

4 * others. All Rights Reserved.

5 ********************************************************************/

6

7 //

8 // regextst.cpp

9 //

10 // ICU Regular Expressions test, part of intltest.

11 //

12

13 /*

14 NOTE!!

15

16 PLEASE be careful about ASCII assumptions in this test.

17 This test is one of the worst repeat offenders.

18 If you have questions, contact someone on the ICU PMC

19 who has access to an EBCDIC system.

20

21 */

22

23 #include "intltest.h"

24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

25

26 #include <stdlib.h>

27 #include <stdio.h>

28 #include <string.h>

29

30 #include "unicode/localpointer.h"

31 #include "unicode/regex.h"

32 #include "unicode/uchar.h"

33 #include "unicode/ucnv.h"

34 #include "unicode/uniset.h"

35 #include "unicode/uregex.h"

36 #include "unicode/usetiter.h"

37 #include "unicode/ustring.h"

38 #include "unicode/utext.h"

39

40 #include "regextst.h"

41 #include "regexcmp.h"

42 #include "uvector.h"

43 #include "util.h"

44 #include "cmemory.h"

45 #include "cstring.h"

46 #include "uinvchar.h"

47

48 #define SUPPORT_MUTATING_INPUT_STRING 0

49

50 //---------------------------------------------------------------------------

51 //

52 // Test class boilerplate

53 //

54 //---------------------------------------------------------------------------

55 RegexTest::RegexTest()

56 {

57 }

58

59

60 RegexTest::~RegexTest()

61 {

62 }

63

64

65

66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch ar* /par/ )

67 {

68 if (exec) logln("TestSuite RegexTest: ");

69 switch (index) {

70

71 case 0: name = "Basic";

72 if (exec) Basic();

73 break;

74 case 1: name = "API_Match";

75 if (exec) API_Match();

76 break;

77 case 2: name = "API_Replace";

78 if (exec) API_Replace();

79 break;

80 case 3: name = "API_Pattern";

81 if (exec) API_Pattern();

82 break;

83 case 4:

84 #if !UCONFIG_NO_FILE_IO

85 name = "Extended";

86 if (exec) Extended();

87 #else

88 name = "skip";

89 #endif

90 break;

91 case 5: name = "Errors";

92 if (exec) Errors();

93 break;

94 case 6: name = "PerlTests";

95 if (exec) PerlTests();

96 break;

97 case 7: name = "Callbacks";

98 if (exec) Callbacks();

99 break;

100 case 8: name = "FindProgressCallbacks";

101 if (exec) FindProgressCallbacks();

102 break;

103 case 9: name = "Bug 6149";

104 if (exec) Bug6149();

105 break;

106 case 10: name = "UTextBasic";

107 if (exec) UTextBasic();

108 break;

109 case 11: name = "API_Match_UTF8";

110 if (exec) API_Match_UTF8();

111 break;

112 case 12: name = "API_Replace_UTF8";

113 if (exec) API_Replace_UTF8();

114 break;

115 case 13: name = "API_Pattern_UTF8";

116 if (exec) API_Pattern_UTF8();

117 break;

118 case 14: name = "PerlTestsUTF8";

119 if (exec) PerlTestsUTF8();

120 break;

121 case 15: name = "PreAllocatedUTextCAPI";

122 if (exec) PreAllocatedUTextCAPI();

123 break;

124 case 16: name = "Bug 7651";

125 if (exec) Bug7651();

126 break;

127 case 17: name = "Bug 7740";

128 if (exec) Bug7740();

129 break;

130 case 18: name = "Bug 8479";

131 if (exec) Bug8479();

132 break;

133 case 19: name = "Bug 7029";

134 if (exec) Bug7029();

135 break;

136 case 20: name = "CheckInvBufSize";

137 if (exec) CheckInvBufSize();

138 break;

139 case 21: name = "Bug 9283";

140 if (exec) Bug9283();

141 break;

142 case 22: name = "Bug10459";

143 if (exec) Bug10459();

144 break;

145 case 23: name = "TestCaseInsensitiveStarters";

146 if (exec) TestCaseInsensitiveStarters();

147 break;

148 case 24: name = "TestBug11049";

149 if (exec) TestBug11049();

150 break;

151 case 25: name = "TestBug11371";

152 if (exec) TestBug11371();

153 break;

154 case 26: name = "TestBug11480";

155 if (exec) TestBug11480();

156 break;

157 case 27: name = "NamedCapture";

158 if (exec) NamedCapture();

159 break;

160 case 28: name = "NamedCaptureLimits";

161 if (exec) NamedCaptureLimits();

162 break;

163 default: name = "";

164 break; //needed to end loop

165 }

166 }

167

168

169

170 /**

171 * Calls utext_openUTF8 after, potentially, converting invariant text from the c ompilation codepage

172 * into ASCII.

173 * @see utext_openUTF8

174 */

175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);

176

177 //---------------------------------------------------------------------------

178 //

179 // Error Checking / Reporting macros used in all of the tests.

180 //

181 //---------------------------------------------------------------------------

182

183 static void utextToPrintable(char buf, int32_t bufLen, UText text) {

184 int64_t oldIndex = utext_getNativeIndex(text);

185 utext_setNativeIndex(text, 0);

186 char *bufPtr = buf;

187 UChar32 c = utext_next32From(text, 0);

188 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {

189 if (0x000020<=c && c<0x00007e) {

190 *bufPtr = c;

191 } else {

192 #if 0

193 sprintf(bufPtr,"U+%04X", c);

194 bufPtr+= strlen(bufPtr)-1;

195 #else

196 *bufPtr = '%';

197 #endif

198 }

199 bufPtr++;

200 c = UTEXT_NEXT32(text);

201 }

202 *bufPtr = 0;

203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)

204 char ebuf = (char)malloc(bufLen);

205 uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);

206 uprv_strncpy(buf, ebuf, bufLen);

207 free((void*)ebuf);

208 #endif

209 utext_setNativeIndex(text, oldIndex);

210 }

211

212

213 static char ASSERT_BUF[1024];

214

215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {

216 if(message.length()==0) {

217 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");

218 } else {

219 UnicodeString buf;

220 IntlTest::prettify(message,buf);

221 if(buf.length()==0) {

222 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");

223 } else {

224 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);

225 if(ASSERT_BUF[0]==0) {

226 ASSERT_BUF[0]=0;

227 for(int32_t i=0;i<buf.length();i++) {

228 UChar ch = buf[i];

229 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);

230 }

231 }

232 }

233 }

234 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;

235 return ASSERT_BUF;

236 }

237

// Logs the printable form of a UText, tagged with file, line and the
// expression text of the argument.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// Reports a failure and returns from the enclosing (void) function when the
// local variable "status" holds a failure code.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
    __FILE__, __LINE__ , u_errorName(status)); return;}}

// Reports a failure when expr evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local "status" (shadowing any outer one) and
// reports a failure unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= %s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number and
// does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also prints a caller-supplied line number and
// returns from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}

261

262

263 static UBool testUTextEqual(UText uta, UText utb) {

264 UChar32 ca = 0;

265 UChar32 cb = 0;

266 utext_setNativeIndex(uta, 0);

267 utext_setNativeIndex(utb, 0);

268 do {

269 ca = utext_next32(uta);

270 cb = utext_next32(utb);

271 if (ca != cb) {

272 break;

273 }

274 } while (ca != U_SENTINEL);

275 return ca == cb;

276 }

277

278

279 /**

280 * @param expected expected text in UTF-8 (not platform) codepage

281 */

282 void RegexTest::assertUText(const char expected, UText actual, const char *fil e, int line) {

283 UErrorCode status = U_ZERO_ERROR;

284 UText expectedText = UTEXT_INITIALIZER;

285 utext_openUTF8(&expectedText, expected, -1, &status);

286 if(U_FAILURE(status)) {

287 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d ch ars)\n", file, line, u_errorName(status), strlen(expected));

288 return;

289 }

290 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {

291 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLe ngth(expectedText) returned 0.", file, line, strlen(expected));

292 return;

293 }

294 utext_setNativeIndex(actual, 0);

295 if (!testUTextEqual(&expectedText, actual)) {

296 char buf[201 /21/];

297 char expectedBuf[201];

298 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);

299 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]) , &expectedText);

300 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s \" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));

301 }

302 utext_close(&expectedText);

303 }

304 /**

305 * @param expected invariant (platform local text) input

306 */

307

308 void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {

309 UErrorCode status = U_ZERO_ERROR;

310 UText expectedText = UTEXT_INITIALIZER;

311 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);

312 if(U_FAILURE(status)) {

313 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8From Invariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expect ed));

314 return;

315 }

316 utext_setNativeIndex(actual, 0);

317 if (!testUTextEqual(&expectedText, actual)) {

318 char buf[201 /21/];

319 char expectedBuf[201];

320 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);

321 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]) , &expectedText);

322 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars) , got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expe ctedText), buf, (int)utext_nativeLength(actual));

323 }

324 utext_close(&expectedText);

325 }

326

/**
 * Assumes utf-8 input.
 * Forwards to assertUText with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Assumes Invariant input.
 * Forwards to assertUTextInvariant with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)

335

336 /**

337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings

338 * passed into utext_openUTF8. An error will be given if

339 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.

340 */

341

342 #define INV_BUFSIZ 2048 /* increase this if too small */

343

344 static int64_t inv_next=0;

345

346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY

347 static char inv_buf[INV_BUFSIZ];

348 #endif

349

350 static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {

351 if(length==-1) length=strlen(inv);

352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY

353 inv_next+=length;

354 return utext_openUTF8(ut, inv, length, status);

355 #else

356 if(inv_next+length+1>INV_BUFSIZ) {

357 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be a t least %d.\n",

358 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));

359 *status = U_MEMORY_ALLOCATION_ERROR;

360 return NULL;

361 }

362

363 unsigned char buf = (unsigned char)inv_buf+inv_next;

364 uprv_aestrncpy(buf, (const uint8_t*)inv, length);

365 inv_next+=length;

366

367 #if 0

368 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);

369 #endif

370

371 return utext_openUTF8(ut, (const char*)buf, length, status);

372 #endif

373 }

374

375

//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the LookingAt() and Match() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once through doRegexLMTest and
//          once through doRegexLMTestUTF8.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}

391

392 UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {

393 const UnicodeString pattern(pat, -1, US_INV);

394 const UnicodeString inputText(text, -1, US_INV);

395 UErrorCode status = U_ZERO_ERROR;

396 UParseError pe;

397 RegexPattern *REPattern = NULL;

398 RegexMatcher *REMatcher = NULL;

399 UBool retVal = TRUE;

400

401 UnicodeString patString(pat, -1, US_INV);

402 REPattern = RegexPattern::compile(patString, 0, pe, status);

403 if (U_FAILURE(status)) {

404 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Sta tus = %s",

405 line, u_errorName(status));

406 return FALSE;

407 }

408 if (line==376) { REPattern->dumpPattern();}

409

410 UnicodeString inputString(inputText);

411 UnicodeString unEscapedInput = inputString.unescape();

412 REMatcher = REPattern->matcher(unEscapedInput, status);

413 if (U_FAILURE(status)) {

414 errln("RegexTest failure in REPattern::matcher() at line %d. Status = % s\n",

415 line, u_errorName(status));

416 return FALSE;

417 }

418

419 UBool actualmatch;

420 actualmatch = REMatcher->lookingAt(status);

421 if (U_FAILURE(status)) {

422 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",

423 line, u_errorName(status));

424 retVal = FALSE;

425 }

426 if (actualmatch != looking) {

427 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);

428 retVal = FALSE;

429 }

430

431 status = U_ZERO_ERROR;

432 actualmatch = REMatcher->matches(status);

433 if (U_FAILURE(status)) {

434 errln("RegexTest failure in matches() at line %d. Status = %s\n",

435 line, u_errorName(status));

436 retVal = FALSE;

437 }

438 if (actualmatch != match) {

439 errln("RegexTest: wrong return from matches() at line %d.\n", line);

440 retVal = FALSE;

441 }

442

443 if (retVal == FALSE) {

444 REPattern->dumpPattern();

445 }

446

447 delete REPattern;

448 delete REMatcher;

449 return retVal;

450 }

451

452

453 UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool look ing, UBool match, int32_t line) {

454 UText pattern = UTEXT_INITIALIZER;

455 int32_t inputUTF8Length;

456 char *textChars = NULL;

457 UText inputText = UTEXT_INITIALIZER;

458 UErrorCode status = U_ZERO_ERROR;

459 UParseError pe;

460 RegexPattern *REPattern = NULL;

461 RegexMatcher *REMatcher = NULL;

462 UBool retVal = TRUE;

463

464 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);

465 REPattern = RegexPattern::compile(&pattern, 0, pe, status);

466 if (U_FAILURE(status)) {

467 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8 ). Status = %s\n",

468 line, u_errorName(status));

469 return FALSE;

470 }

471

472 UnicodeString inputString(text, -1, US_INV);

473 UnicodeString unEscapedInput = inputString.unescape();

474 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));

475 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, N ULL, NULL, NULL, &status);

476

477 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);

478 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {

479 // UTF-8 does not allow unpaired surrogates, so this could actually happ en

480 logln("RegexTest unable to convert input to UTF8 at line %d. Status = % s\n", line, u_errorName(status));

481 return TRUE; // not a failure of the Regex engine

482 }

483 status = U_ZERO_ERROR; // buffer overflow

484 textChars = new char[inputUTF8Length+1];

485 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias( ), status);

486 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);

487

488 REMatcher = &REPattern->matcher(status)->reset(&inputText);

489 if (U_FAILURE(status)) {

490 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Sta tus = %s\n",

491 line, u_errorName(status));

492 return FALSE;

493 }

494

495 UBool actualmatch;

496 actualmatch = REMatcher->lookingAt(status);

497 if (U_FAILURE(status)) {

498 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\ n",

499 line, u_errorName(status));

500 retVal = FALSE;

501 }

502 if (actualmatch != looking) {

503 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", l ine);

504 retVal = FALSE;

505 }

506

507 status = U_ZERO_ERROR;

508 actualmatch = REMatcher->matches(status);

509 if (U_FAILURE(status)) {

510 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n" ,

511 line, u_errorName(status));

512 retVal = FALSE;

513 }

514 if (actualmatch != match) {

515 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", lin e);

516 retVal = FALSE;

517 }

518

519 if (retVal == FALSE) {

520 REPattern->dumpPattern();

521 }

522

523 delete REPattern;

524 delete REMatcher;

525 utext_close(&inputText);

526 utext_close(&pattern);

527 delete[] textChars;

528 return retVal;

529 }

530

531

532

//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing tests
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);

543

544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,

545 UErrorCode expectedStatus, int32_t line) {

546 UnicodeString pattern(pat);

547

548 UErrorCode status = U_ZERO_ERROR;

549 UParseError pe;

550 RegexPattern *callerPattern = NULL;

551

552 //

553 // Compile the caller's pattern

554 //

555 UnicodeString patString(pat);

556 callerPattern = RegexPattern::compile(patString, 0, pe, status);

557 if (status != expectedStatus) {

558 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_err orName(status));

559 } else {

560 if (status != U_ZERO_ERROR) {

561 if (pe.line != errLine \|\| pe.offset != errCol) {

562 errln("Line %d: incorrect line/offset from UParseError. Expecte d %d/%d; got %d/%d.\n",

563 line, errLine, errCol, pe.line, pe.offset);

564 }

565 }

566 }

567

568 delete callerPattern;

569

570 //

571 // Compile again, using a UTF-8-based UText

572 //

573 UText patternText = UTEXT_INITIALIZER;

574 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);

575 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);

576 if (status != expectedStatus) {

577 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_err orName(status));

578 } else {

579 if (status != U_ZERO_ERROR) {

580 if (pe.line != errLine \|\| pe.offset != errCol) {

581 errln("Line %d: incorrect line/offset from UParseError. Expecte d %d/%d; got %d/%d.\n",

582 line, errLine, errCol, pe.line, pe.offset);

583 }

584 }

585 }

586

587 delete callerPattern;

588 utext_close(&patternText);

589 }

590

591

592

593 //---------------------------------------------------------------------------

594 //

595 // Basic Check for basic functionality of regex pattern matching.

596 // Avoid the use of REGEX_FIND test macro, which has

597 // substantial dependencies on basic Regex functionality.

598 //

599 //---------------------------------------------------------------------------

600 void RegexTest::Basic() {

601

602

603 //

604 // Debug - slide failing test cases early

605 //

606 #if 0

607 {

608 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);

609 UParseError pe;

610 UErrorCode status = U_ZERO_ERROR;

611 RegexPattern *pattern;

612 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unesc ape(), UREGEX_CASE_INSENSITIVE, pe, status);

613 pattern->dumpPattern();

614 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz") .unescape(), status);

615 UBool result = m->find();

616 printf("result = %d\n", result);

617 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");

618 // REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");

619 }

620 exit(1);

621 #endif

622

623

624 //

625 // Pattern with parentheses

626 //

627 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);

628 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);

629 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

630

631 //

632 // Patterns with *

633 //

634 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);

635 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);

636 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);

637 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);

638 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

639

640 REGEX_TESTLM("a*", "", TRUE, TRUE);

641 REGEX_TESTLM("a*", "b", TRUE, FALSE);

642

643

644 //

645 // Patterns with "."

646 //

647 REGEX_TESTLM(".", "abc", TRUE, FALSE);

648 REGEX_TESTLM("...", "abc", TRUE, TRUE);

649 REGEX_TESTLM("....", "abc", FALSE, FALSE);

650 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);

651 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);

652 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);

653 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);

654 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

655

656 //

657 // Patterns with * applied to chars at end of literal string

658 //

659 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);

660 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

661

662 //

663 // Supplemental chars match as single chars, not a pair of surrogates.

664 //

665 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);

666 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);

667 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);

668

669

670 //

671 // UnicodeSets in the pattern

672 //

673 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);

674 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);

675 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);

676 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);

677 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);

678 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

679

680 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);

681 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);

682 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);

683 REGEX_TESTLM("[\\p{Nd}]", "a123456", TRUE, FALSE); // note that matches 0 occurences.

684 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

685

686 //

687 // OR operator in patterns

688 //

689 REGEX_TESTLM("(a\|b)", "a", TRUE, TRUE);

690 REGEX_TESTLM("(a\|b)", "b", TRUE, TRUE);

691 REGEX_TESTLM("(a\|b)", "c", FALSE, FALSE);

692 REGEX_TESTLM("a\|b", "b", TRUE, TRUE);

693

694 REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", TRUE, TRUE);

695 REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", TRUE, FALSE);

696 REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", TRUE, TRUE);

697 REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", TRUE, TRUE);

698 REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", TRUE, TRUE);

699 REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", TRUE, FALSE );

700

701 //

702 // +

703 //

704 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);

705 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);

706 REGEX_TESTLM("b+", "", FALSE, FALSE);

707 REGEX_TESTLM("(abc\|def)+", "defabc", TRUE, TRUE);

708 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);

709 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

710

711 //

712 // ?

713 //

714 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);

715 REGEX_TESTLM("ab?", "a", TRUE, TRUE);

716 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);

717 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);

718 REGEX_TESTLM("a(b\|c)?d", "abd", TRUE, TRUE);

719 REGEX_TESTLM("a(b\|c)?d", "acd", TRUE, TRUE);

720 REGEX_TESTLM("a(b\|c)?d", "ad", TRUE, TRUE);

721 REGEX_TESTLM("a(b\|c)?d", "abcd", FALSE, FALSE);

722 REGEX_TESTLM("a(b\|c)?d", "ab", FALSE, FALSE);

723

724 //

725 // Escape sequences that become single literal chars, handled internally

726 // by ICU's Unescape.

727 //

728

729 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not i mplemented yet.

730 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL

731 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L

732 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape

733 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed

734 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line

735 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR

736 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab

737 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);

738 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

739

740 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the begi nning of input

741 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the be ginning of input

742

743 // Escape of special chars in patterns

744 REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", TRU E, TRUE);

745 }

746

747

748 //---------------------------------------------------------------------------

749 //

750 // UTextBasic Check for quirks that are specific to the UText

751 // implementation.

752 //

753 //---------------------------------------------------------------------------

754 void RegexTest::UTextBasic() {

755 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

756 UErrorCode status = U_ZERO_ERROR;

757 UText pattern = UTEXT_INITIALIZER;

758 utext_openUTF8(&pattern, str_abc, -1, &status);

759 RegexMatcher matcher(&pattern, 0, status);

760 REGEX_CHECK_STATUS;

761

762 UText input = UTEXT_INITIALIZER;

763 utext_openUTF8(&input, str_abc, -1, &status);

764 REGEX_CHECK_STATUS;

765 matcher.reset(&input);

766 REGEX_CHECK_STATUS;

767 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

768

769 matcher.reset(matcher.inputText());

770 REGEX_CHECK_STATUS;

771 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

772

773 utext_close(&pattern);

774 utext_close(&input);

775 }

776

777

778 //---------------------------------------------------------------------------

779 //

780 // API_Match Test that the API for class RegexMatcher

781 // is present and nominally working, but excluding functions

782 // implementing replace operations.

783 //

784 //---------------------------------------------------------------------------

785 void RegexTest::API_Match() {

786 UParseError pe;

787 UErrorCode status=U_ZERO_ERROR;

788 int32_t flags = 0;

789

790 //

791 // Debug - slide failing test cases early

792 //

793 #if 0

794 {

795 }

796 return;

797 #endif

798

799 //

800 // Simple pattern compilation

801 //

802 {

803 UnicodeString re("abc");

804 RegexPattern *pat2;

805 pat2 = RegexPattern::compile(re, flags, pe, status);

806 REGEX_CHECK_STATUS;

807

808 UnicodeString inStr1 = "abcdef this is a test";

809 UnicodeString instr2 = "not abc";

810 UnicodeString empty = "";

811

812

813 //

814 // Matcher creation and reset.

815 //

816 RegexMatcher *m1 = pat2->matcher(inStr1, status);

817 REGEX_CHECK_STATUS;

818 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

819 REGEX_ASSERT(m1->input() == inStr1);

820 m1->reset(instr2);

821 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

822 REGEX_ASSERT(m1->input() == instr2);

823 m1->reset(inStr1);

824 REGEX_ASSERT(m1->input() == inStr1);

825 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

826 m1->reset(empty);

827 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

828 REGEX_ASSERT(m1->input() == empty);

829 REGEX_ASSERT(&m1->pattern() == pat2);

830

831 //

832 // reset(pos, status)

833 //

834 m1->reset(inStr1);

835 m1->reset(4, status);

836 REGEX_CHECK_STATUS;

837 REGEX_ASSERT(m1->input() == inStr1);

838 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

839

840 m1->reset(-1, status);

841 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

842 status = U_ZERO_ERROR;

843

844 m1->reset(0, status);

845 REGEX_CHECK_STATUS;

846 status = U_ZERO_ERROR;

847

848 int32_t len = m1->input().length();

849 m1->reset(len-1, status);

850 REGEX_CHECK_STATUS;

851 status = U_ZERO_ERROR;

852

853 m1->reset(len, status);

854 REGEX_CHECK_STATUS;

855 status = U_ZERO_ERROR;

856

857 m1->reset(len+1, status);

858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

859 status = U_ZERO_ERROR;

860

861 //

862 // match(pos, status)

863 //

864 m1->reset(instr2);

865 REGEX_ASSERT(m1->matches(4, status) == TRUE);

866 m1->reset();

867 REGEX_ASSERT(m1->matches(3, status) == FALSE);

868 m1->reset();

869 REGEX_ASSERT(m1->matches(5, status) == FALSE);

870 REGEX_ASSERT(m1->matches(4, status) == TRUE);

871 REGEX_ASSERT(m1->matches(-1, status) == FALSE);

872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

873

874 // Match() at end of string should fail, but should not

875 // be an error.

876 status = U_ZERO_ERROR;

877 len = m1->input().length();

878 REGEX_ASSERT(m1->matches(len, status) == FALSE);

879 REGEX_CHECK_STATUS;

880

881 // Match beyond end of string should fail with an error.

882 status = U_ZERO_ERROR;

883 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);

884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

885

886 // Successful match at end of string.

887 {

888 status = U_ZERO_ERROR;

889 RegexMatcher m("A?", 0, status); // will match zero length string.

890 REGEX_CHECK_STATUS;

891 m.reset(inStr1);

892 len = inStr1.length();

893 REGEX_ASSERT(m.matches(len, status) == TRUE);

894 REGEX_CHECK_STATUS;

895 m.reset(empty);

896 REGEX_ASSERT(m.matches(0, status) == TRUE);

897 REGEX_CHECK_STATUS;

898 }

899

900

901 //

902 // lookingAt(pos, status)

903 //

904 status = U_ZERO_ERROR;

905 m1->reset(instr2); // "not abc"

906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

907 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);

908 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);

909 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

910 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);

911 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

912 status = U_ZERO_ERROR;

913 len = m1->input().length();

914 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);

915 REGEX_CHECK_STATUS;

916 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);

917 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

918

919 delete m1;

920 delete pat2;

921 }

922

923

924 //

925 // Capture Group.

926 // RegexMatcher::start();

927 // RegexMatcher::end();

928 // RegexMatcher::groupCount();

929 //

930 {

931 int32_t flags=0;

932 UParseError pe;

933 UErrorCode status=U_ZERO_ERROR;

934

935 UnicodeString re("01(23(45)67)(.*)");

936 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

937 REGEX_CHECK_STATUS;

938 UnicodeString data = "0123456789";

939

940 RegexMatcher *matcher = pat->matcher(data, status);

941 REGEX_CHECK_STATUS;

942 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);

943 static const int32_t matchStarts[] = {0, 2, 4, 8};

944 static const int32_t matchEnds[] = {10, 8, 6, 10};

945 int32_t i;

946 for (i=0; i<4; i++) {

947 int32_t actualStart = matcher->start(i, status);

948 REGEX_CHECK_STATUS;

949 if (actualStart != matchStarts[i]) {

950 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",

951 __LINE__, i, matchStarts[i], actualStart);

952 }

953 int32_t actualEnd = matcher->end(i, status);

954 REGEX_CHECK_STATUS;

955 if (actualEnd != matchEnds[i]) {

956 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",

957 __LINE__, i, matchEnds[i], actualEnd);

958 }

959 }

960

961 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));

962 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));

963

964 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

965 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

966 matcher->reset();

967 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);

968

969 matcher->lookingAt(status);

970 REGEX_ASSERT(matcher->group(status) == "0123456789");

971 REGEX_ASSERT(matcher->group(0, status) == "0123456789");

972 REGEX_ASSERT(matcher->group(1, status) == "234567" );

973 REGEX_ASSERT(matcher->group(2, status) == "45" );

974 REGEX_ASSERT(matcher->group(3, status) == "89" );

975 REGEX_CHECK_STATUS;

976 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

977 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

978 matcher->reset();

979 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);

980

981 delete matcher;

982 delete pat;

983

984 }

985

986 //

987 // find

988 //

989 {

990 int32_t flags=0;

991 UParseError pe;

992 UErrorCode status=U_ZERO_ERROR;

993

994 UnicodeString re("abc");

995 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

996 REGEX_CHECK_STATUS;

997 UnicodeString data = ".abc..abc...abc..";

998 // 012345678901234567

999

1000 RegexMatcher *matcher = pat->matcher(data, status);

1001 REGEX_CHECK_STATUS;

1002 REGEX_ASSERT(matcher->find());

1003 REGEX_ASSERT(matcher->start(status) == 1);

1004 REGEX_ASSERT(matcher->find());

1005 REGEX_ASSERT(matcher->start(status) == 6);

1006 REGEX_ASSERT(matcher->find());

1007 REGEX_ASSERT(matcher->start(status) == 12);

1008 REGEX_ASSERT(matcher->find() == FALSE);

1009 REGEX_ASSERT(matcher->find() == FALSE);

1010

1011 matcher->reset();

1012 REGEX_ASSERT(matcher->find());

1013 REGEX_ASSERT(matcher->start(status) == 1);

1014

1015 REGEX_ASSERT(matcher->find(0, status));

1016 REGEX_ASSERT(matcher->start(status) == 1);

1017 REGEX_ASSERT(matcher->find(1, status));

1018 REGEX_ASSERT(matcher->start(status) == 1);

1019 REGEX_ASSERT(matcher->find(2, status));

1020 REGEX_ASSERT(matcher->start(status) == 6);

1021 REGEX_ASSERT(matcher->find(12, status));

1022 REGEX_ASSERT(matcher->start(status) == 12);

1023 REGEX_ASSERT(matcher->find(13, status) == FALSE);

1024 REGEX_ASSERT(matcher->find(16, status) == FALSE);

1025 REGEX_ASSERT(matcher->find(17, status) == FALSE);

1026 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);

1027

1028 status = U_ZERO_ERROR;

1029 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);

1030 status = U_ZERO_ERROR;

1031 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);

1032

1033 REGEX_ASSERT(matcher->groupCount() == 0);

1034

1035 delete matcher;

1036 delete pat;

1037 }

1038

1039

1040 //

1041 // find, with \G in pattern (true if at the end of a previous match).

1042 //

1043 {

1044 int32_t flags=0;

1045 UParseError pe;

1046 UErrorCode status=U_ZERO_ERROR;

1047

1048 UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);

1049 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

1050 REGEX_CHECK_STATUS;

1051 UnicodeString data = ".abcabc.abc..";

1052 // 012345678901234567

1053

1054 RegexMatcher *matcher = pat->matcher(data, status);

1055 REGEX_CHECK_STATUS;

1056 REGEX_ASSERT(matcher->find());

1057 REGEX_ASSERT(matcher->start(status) == 0);

1058 REGEX_ASSERT(matcher->start(1, status) == -1);

1059 REGEX_ASSERT(matcher->start(2, status) == 1);

1060

1061 REGEX_ASSERT(matcher->find());

1062 REGEX_ASSERT(matcher->start(status) == 4);

1063 REGEX_ASSERT(matcher->start(1, status) == 4);

1064 REGEX_ASSERT(matcher->start(2, status) == -1);

1065 REGEX_CHECK_STATUS;

1066

1067 delete matcher;

1068 delete pat;

1069 }

1070

1071 //

1072 // find with zero length matches, match position should bump ahead

1073 // to prevent loops.

1074 //

1075 {

1076 int32_t i;

1077 UErrorCode status=U_ZERO_ERROR;

1078 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero- length matches anywhere,

1079 // using an always-true look-ahead.

1080 REGEX_CHECK_STATUS;

1081 UnicodeString s(" ");

1082 m.reset(s);

1083 for (i=0; ; i++) {

1084 if (m.find() == FALSE) {

1085 break;

1086 }

1087 REGEX_ASSERT(m.start(status) == i);

1088 REGEX_ASSERT(m.end(status) == i);

1089 }

1090 REGEX_ASSERT(i==5);

1091

1092 // Check that the bump goes over surrogate pairs OK

1093 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004" );

1094 s = s.unescape();

1095 m.reset(s);

1096 for (i=0; ; i+=2) {

1097 if (m.find() == FALSE) {

1098 break;

1099 }

1100 REGEX_ASSERT(m.start(status) == i);

1101 REGEX_ASSERT(m.end(status) == i);

1102 }

1103 REGEX_ASSERT(i==10);

1104 }

1105 {

1106 // find() loop breaking test.

1107 // with pattern of /.?/, should see a series of one char matches, then a single

1108 // match of zero length at the end of the input string.

1109 int32_t i;

1110 UErrorCode status=U_ZERO_ERROR;

1111 RegexMatcher m(".?", 0, status);

1112 REGEX_CHECK_STATUS;

1113 UnicodeString s(" ");

1114 m.reset(s);

1115 for (i=0; ; i++) {

1116 if (m.find() == FALSE) {

1117 break;

1118 }

1119 REGEX_ASSERT(m.start(status) == i);

1120 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));

1121 }

1122 REGEX_ASSERT(i==5);

1123 }

1124

1125

1126 //

1127 // Matchers with no input string behave as if they had an empty input string .

1128 //

1129

1130 {

1131 UErrorCode status = U_ZERO_ERROR;

1132 RegexMatcher m(".?", 0, status);

1133 REGEX_CHECK_STATUS;

1134 REGEX_ASSERT(m.find());

1135 REGEX_ASSERT(m.start(status) == 0);

1136 REGEX_ASSERT(m.input() == "");

1137 }

1138 {

1139 UErrorCode status = U_ZERO_ERROR;

1140 RegexPattern *p = RegexPattern::compile(".", 0, status);

1141 RegexMatcher *m = p->matcher(status);

1142 REGEX_CHECK_STATUS;

1143

1144 REGEX_ASSERT(m->find() == FALSE);

1145 REGEX_ASSERT(m->input() == "");

1146 delete m;

1147 delete p;

1148 }

1149

1150 //

1151 // Regions

1152 //

1153 {

1154 UErrorCode status = U_ZERO_ERROR;

1155 UnicodeString testString("This is test data");

1156 RegexMatcher m(".*", testString, 0, status);

1157 REGEX_CHECK_STATUS;

1158 REGEX_ASSERT(m.regionStart() == 0);

1159 REGEX_ASSERT(m.regionEnd() == testString.length());

1160 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

1162

1163 m.region(2,4, status);

1164 REGEX_CHECK_STATUS;

1165 REGEX_ASSERT(m.matches(status));

1166 REGEX_ASSERT(m.start(status)==2);

1167 REGEX_ASSERT(m.end(status)==4);

1168 REGEX_CHECK_STATUS;

1169

1170 m.reset();

1171 REGEX_ASSERT(m.regionStart() == 0);

1172 REGEX_ASSERT(m.regionEnd() == testString.length());

1173

1174 UnicodeString shorterString("short");

1175 m.reset(shorterString);

1176 REGEX_ASSERT(m.regionStart() == 0);

1177 REGEX_ASSERT(m.regionEnd() == shorterString.length());

1178

1179 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

1180 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));

1181 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

1182 REGEX_ASSERT(&m == &m.reset());

1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

1184

1185 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));

1186 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

1187 REGEX_ASSERT(&m == &m.reset());

1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

1189

1190 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

1191 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));

1192 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

1193 REGEX_ASSERT(&m == &m.reset());

1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

1195

1196 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));

1197 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

1198 REGEX_ASSERT(&m == &m.reset());

1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

1200

1201 }

1202

1203 //

1204 // hitEnd() and requireEnd()

1205 //

1206 {

1207 UErrorCode status = U_ZERO_ERROR;

1208 UnicodeString testString("aabb");

1209 RegexMatcher m1(".*", testString, 0, status);

1210 REGEX_ASSERT(m1.lookingAt(status) == TRUE);

1211 REGEX_ASSERT(m1.hitEnd() == TRUE);

1212 REGEX_ASSERT(m1.requireEnd() == FALSE);

1213 REGEX_CHECK_STATUS;

1214

1215 status = U_ZERO_ERROR;

1216 RegexMatcher m2("a*", testString, 0, status);

1217 REGEX_ASSERT(m2.lookingAt(status) == TRUE);

1218 REGEX_ASSERT(m2.hitEnd() == FALSE);

1219 REGEX_ASSERT(m2.requireEnd() == FALSE);

1220 REGEX_CHECK_STATUS;

1221

1222 status = U_ZERO_ERROR;

1223 RegexMatcher m3(".*$", testString, 0, status);

1224 REGEX_ASSERT(m3.lookingAt(status) == TRUE);

1225 REGEX_ASSERT(m3.hitEnd() == TRUE);

1226 REGEX_ASSERT(m3.requireEnd() == TRUE);

1227 REGEX_CHECK_STATUS;

1228 }

1229

1230

1231 //

1232 // Compilation error on reset with UChar *

1233 // These were a hazard that people were stumbling over with runtime errors .

1234 // Changed them to compiler errors by adding private methods that more clo sely

1235 // matched the incorrect use of the functions.

1236 //

1237 #if 0

1238 {

1239 UErrorCode status = U_ZERO_ERROR;

1240 UChar ucharString[20];

1241 RegexMatcher m(".", 0, status);

1242 m.reset(ucharString); // should not compile.

1243

1244 RegexPattern *p = RegexPattern::compile(".", 0, status);

1245 RegexMatcher *m2 = p->matcher(ucharString, status); // should not co mpile.

1246

1247 RegexMatcher m3(".", ucharString, 0, status); // Should not compile

1248 }

1249 #endif

1250

1251 //

1252 // Time Outs.

1253 // Note: These tests will need to be changed when the regexp engine i s

1254 // able to detect and cut short the exponential time behavior o n

1255 // this type of match.

1256 //

1257 {

1258 UErrorCode status = U_ZERO_ERROR;

1259 // Enough 'a's in the string to cause the match to time out.

1260 // (Each on additonal 'a' doubles the time)

1261 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");

1262 RegexMatcher matcher("(a+)+b", testString, 0, status);

1263 REGEX_CHECK_STATUS;

1264 REGEX_ASSERT(matcher.getTimeLimit() == 0);

1265 matcher.setTimeLimit(100, status);

1266 REGEX_ASSERT(matcher.getTimeLimit() == 100);

1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

1268 REGEX_ASSERT(status == U_REGEX_TIME_OUT);

1269 }

1270 {

1271 UErrorCode status = U_ZERO_ERROR;

1272 // Few enough 'a's to slip in under the time limit.

1273 UnicodeString testString("aaaaaaaaaaaaaaaaaa");

1274 RegexMatcher matcher("(a+)+b", testString, 0, status);

1275 REGEX_CHECK_STATUS;

1276 matcher.setTimeLimit(100, status);

1277 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

1278 REGEX_CHECK_STATUS;

1279 }

1280

1281 //

1282 // Stack Limits

1283 //

1284 {

1285 UErrorCode status = U_ZERO_ERROR;

1286 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'

1287

1288 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits opt imizations

1289 // of the '+', and makes the stack frames larger.

1290 RegexMatcher matcher("(A)+A$", testString, 0, status);

1291

1292 // With the default stack, this match should fail to run

1293 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

1294 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);

1295

1296 // With unlimited stack, it should run

1297 status = U_ZERO_ERROR;

1298 matcher.setStackLimit(0, status);

1299 REGEX_CHECK_STATUS;

1300 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);

1301 REGEX_CHECK_STATUS;

1302 REGEX_ASSERT(matcher.getStackLimit() == 0);

1303

1304 // With a limited stack, it the match should fail

1305 status = U_ZERO_ERROR;

1306 matcher.setStackLimit(10000, status);

1307 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

1308 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);

1309 REGEX_ASSERT(matcher.getStackLimit() == 10000);

1310 }

1311

1312 // A pattern that doesn't save state should work with

1313 // a minimal sized stack

1314 {

1315 UErrorCode status = U_ZERO_ERROR;

1316 UnicodeString testString = "abc";

1317 RegexMatcher matcher("abc", testString, 0, status);

1318 REGEX_CHECK_STATUS;

1319 matcher.setStackLimit(30, status);

1320 REGEX_CHECK_STATUS;

1321 REGEX_ASSERT(matcher.matches(status) == TRUE);

1322 REGEX_CHECK_STATUS;

1323 REGEX_ASSERT(matcher.getStackLimit() == 30);

1324

1325 // Negative stack sizes should fail

1326 status = U_ZERO_ERROR;

1327 matcher.setStackLimit(1000, status);

1328 REGEX_CHECK_STATUS;

1329 matcher.setStackLimit(-1, status);

1330 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

1331 REGEX_ASSERT(matcher.getStackLimit() == 1000);

1332 }

1333

1334

1335 }

1336

1337

1338

1339

1340

1341

1342 //---------------------------------------------------------------------------

1343 //

1344 // API_Replace API test for class RegexMatcher, testing the

1345 // Replace family of functions.

1346 //

1347 //---------------------------------------------------------------------------

1348 void RegexTest::API_Replace() {

1349 //

1350 // Replace

1351 //

1352 int32_t flags=0;

1353 UParseError pe;

1354 UErrorCode status=U_ZERO_ERROR;

1355

1356 UnicodeString re("abc");

1357 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

1358 REGEX_CHECK_STATUS;

1359 UnicodeString data = ".abc..abc...abc..";

1360 // 012345678901234567

1361 RegexMatcher *matcher = pat->matcher(data, status);

1362

1363 //

1364 // Plain vanilla matches.

1365 //

1366 UnicodeString dest;

1367 dest = matcher->replaceFirst("yz", status);

1368 REGEX_CHECK_STATUS;

1369 REGEX_ASSERT(dest == ".yz..abc...abc..");

1370

1371 dest = matcher->replaceAll("yz", status);

1372 REGEX_CHECK_STATUS;

1373 REGEX_ASSERT(dest == ".yz..yz...yz..");

1374

1375 //

1376 // Plain vanilla non-matches.

1377 //

1378 UnicodeString d2 = ".abx..abx...abx..";

1379 matcher->reset(d2);

1380 dest = matcher->replaceFirst("yz", status);

1381 REGEX_CHECK_STATUS;

1382 REGEX_ASSERT(dest == ".abx..abx...abx..");

1383

1384 dest = matcher->replaceAll("yz", status);

1385 REGEX_CHECK_STATUS;

1386 REGEX_ASSERT(dest == ".abx..abx...abx..");

1387

1388 //

1389 // Empty source string

1390 //

1391 UnicodeString d3 = "";

1392 matcher->reset(d3);

1393 dest = matcher->replaceFirst("yz", status);

1394 REGEX_CHECK_STATUS;

1395 REGEX_ASSERT(dest == "");

1396

1397 dest = matcher->replaceAll("yz", status);

1398 REGEX_CHECK_STATUS;

1399 REGEX_ASSERT(dest == "");

1400

1401 //

1402 // Empty substitution string

1403 //

1404 matcher->reset(data); // ".abc..abc...abc.."

1405 dest = matcher->replaceFirst("", status);

1406 REGEX_CHECK_STATUS;

1407 REGEX_ASSERT(dest == "...abc...abc..");

1408

1409 dest = matcher->replaceAll("", status);

1410 REGEX_CHECK_STATUS;

1411 REGEX_ASSERT(dest == "........");

1412

1413 //

1414 // match whole string

1415 //

1416 UnicodeString d4 = "abc";

1417 matcher->reset(d4);

1418 dest = matcher->replaceFirst("xyz", status);

1419 REGEX_CHECK_STATUS;

1420 REGEX_ASSERT(dest == "xyz");

1421

1422 dest = matcher->replaceAll("xyz", status);

1423 REGEX_CHECK_STATUS;

1424 REGEX_ASSERT(dest == "xyz");

1425

1426 //

1427 // Capture Group, simple case

1428 //

1429 UnicodeString re2("a(..)");

1430 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);

1431 REGEX_CHECK_STATUS;

1432 UnicodeString d5 = "abcdefg";

1433 RegexMatcher *matcher2 = pat2->matcher(d5, status);

1434 REGEX_CHECK_STATUS;

1435 dest = matcher2->replaceFirst("$1$1", status);

1436 REGEX_CHECK_STATUS;

1437 REGEX_ASSERT(dest == "bcbcdefg");

1438

1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1 ."), status);

1440 REGEX_CHECK_STATUS;

1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg");

1442

1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);

1444 REGEX_ASSERT(U_FAILURE(status));

1445 status = U_ZERO_ERROR;

1446

1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U 0001D7CF.");

1448 replacement = replacement.unescape();

1449 dest = matcher2->replaceFirst(replacement, status);

1450 REGEX_CHECK_STATUS;

1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");

1452

1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st atus), U_INDEX_OUTOFBOUNDS_ERROR);

1454

1455

1456 //

1457 // Replacement String with \u hex escapes

1458 //

1459 {

1460 UnicodeString src = "abc 1 abc 2 abc 3";

1461 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");

1462 matcher->reset(src);

1463 UnicodeString result = matcher->replaceAll(substitute, status);

1464 REGEX_CHECK_STATUS;

1465 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");

1466 }

1467 {

1468 UnicodeString src = "abc !";

1469 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");

1470 matcher->reset(src);

1471 UnicodeString result = matcher->replaceAll(substitute, status);

1472 REGEX_CHECK_STATUS;

1473 UnicodeString expected = UnicodeString("--");

1474 expected.append((UChar32)0x10000);

1475 expected.append("-- !");

1476 REGEX_ASSERT(result == expected);

1477 }

1478 // TODO: need more through testing of capture substitutions.

1479

1480 // Bug 4057

1481 //

1482 {

1483 status = U_ZERO_ERROR;

1484 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";

1485 RegexMatcher m("ss(.*?)ee", 0, status);

1486 REGEX_CHECK_STATUS;

1487 UnicodeString result;

1488

1489 // Multiple finds do NOT bump up the previous appendReplacement postion.

1490 m.reset(s);

1491 m.find();

1492 m.find();

1493 m.appendReplacement(result, "ooh", status);

1494 REGEX_CHECK_STATUS;

1495 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

1496

1497 // After a reset into the interior of a string, appendReplacemnt still s tarts at beginning.

1498 status = U_ZERO_ERROR;

1499 result.truncate(0);

1500 m.reset(10, status);

1501 m.find();

1502 m.find();

1503 m.appendReplacement(result, "ooh", status);

1504 REGEX_CHECK_STATUS;

1505 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

1506

1507 // find() at interior of string, appendReplacemnt still starts at beginn ing.

1508 status = U_ZERO_ERROR;

1509 result.truncate(0);

1510 m.reset();

1511 m.find(10, status);

1512 m.find();

1513 m.appendReplacement(result, "ooh", status);

1514 REGEX_CHECK_STATUS;

1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

1516

1517 m.appendTail(result);

1518 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fi n");

1519

1520 }

1521

1522 delete matcher2;

1523 delete pat2;

1524 delete matcher;

1525 delete pat;

1526 }

1527

1528

1529 //---------------------------------------------------------------------------

1530 //

1531 // API_Pattern Test that the API for class RegexPattern is

1532 // present and nominally working.

1533 //

1534 //---------------------------------------------------------------------------

1535 void RegexTest::API_Pattern() {

1536 RegexPattern pata; // Test default constructor to not crash.

1537 RegexPattern patb;

1538

1539 REGEX_ASSERT(pata == patb);

1540 REGEX_ASSERT(pata == pata);

1541

1542 UnicodeString re1("abc[a-l][m-z]");

1543 UnicodeString re2("def");

1544 UErrorCode status = U_ZERO_ERROR;

1545 UParseError pe;

1546

1547 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);

1548 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);

1549 REGEX_CHECK_STATUS;

1550 REGEX_ASSERT(pat1 == pat1);

1551 REGEX_ASSERT(*pat1 != pata);

1552

1553 // Assign

1554 patb = *pat1;

1555 REGEX_ASSERT(patb == *pat1);

1556

1557 // Copy Construct

1558 RegexPattern patc(*pat1);

1559 REGEX_ASSERT(patc == *pat1);

1560 REGEX_ASSERT(patb == patc);

1561 REGEX_ASSERT(pat1 != pat2);

1562 patb = *pat2;

1563 REGEX_ASSERT(patb != patc);

1564 REGEX_ASSERT(patb == *pat2);

1565

1566 // Compile with no flags.

1567 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);

1568 REGEX_ASSERT(pat1a == pat1);

1569

1570 REGEX_ASSERT(pat1a->flags() == 0);

1571

1572 // Compile with different flags should be not equal

1573 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSIT IVE, pe, status);

1574 REGEX_CHECK_STATUS;

1575

1576 REGEX_ASSERT(pat1b != pat1a);

1577 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);

1578 REGEX_ASSERT(pat1a->flags() == 0);

1579 delete pat1b;

1580

1581 // clone

1582 RegexPattern *pat1c = pat1->clone();

1583 REGEX_ASSERT(pat1c == pat1);

1584 REGEX_ASSERT(pat1c != pat2);

1585

1586 delete pat1c;

1587 delete pat1a;

1588 delete pat1;

1589 delete pat2;

1590

1591

1592 //

1593 // Verify that a matcher created from a cloned pattern works.

1594 // (Jitterbug 3423)

1595 //

1596 {

1597 UErrorCode status = U_ZERO_ERROR;

1598 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE( "\\p{L}+"), 0, status);

1599 RegexPattern *pClone = pSource->clone();

1600 delete pSource;

1601 RegexMatcher *mFromClone = pClone->matcher(status);

1602 REGEX_CHECK_STATUS;

1603 UnicodeString s = "Hello World";

1604 mFromClone->reset(s);

1605 REGEX_ASSERT(mFromClone->find() == TRUE);

1606 REGEX_ASSERT(mFromClone->group(status) == "Hello");

1607 REGEX_ASSERT(mFromClone->find() == TRUE);

1608 REGEX_ASSERT(mFromClone->group(status) == "World");

1609 REGEX_ASSERT(mFromClone->find() == FALSE);

1610 delete mFromClone;

1611 delete pClone;

1612 }

1613

1614 //

1615 // matches convenience API

1616 //

1617 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE );

1618 REGEX_CHECK_STATUS;

1619 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FAL SE);

1620 REGEX_CHECK_STATUS;

1621 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);

1622 REGEX_CHECK_STATUS;

1623 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, statu s) == TRUE);

1624 REGEX_CHECK_STATUS;

1625 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FAL SE);

1626 REGEX_CHECK_STATUS;

1627 status = U_INDEX_OUTOFBOUNDS_ERROR;

1628 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);

1629 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1630

1631

1632 //

1633 // Split()

1634 //

1635 status = U_ZERO_ERROR;

1636 pat1 = RegexPattern::compile(" +", pe, status);

1637 REGEX_CHECK_STATUS;

1638 UnicodeString fields[10];

1639

1640 int32_t n;

1641 n = pat1->split("Now is the time", fields, 10, status);

1642 REGEX_CHECK_STATUS;

1643 REGEX_ASSERT(n==4);

1644 REGEX_ASSERT(fields[0]=="Now");

1645 REGEX_ASSERT(fields[1]=="is");

1646 REGEX_ASSERT(fields[2]=="the");

1647 REGEX_ASSERT(fields[3]=="time");

1648 REGEX_ASSERT(fields[4]=="");

1649

1650 n = pat1->split("Now is the time", fields, 2, status);

1651 REGEX_CHECK_STATUS;

1652 REGEX_ASSERT(n==2);

1653 REGEX_ASSERT(fields[0]=="Now");

1654 REGEX_ASSERT(fields[1]=="is the time");

1655 REGEX_ASSERT(fields[2]=="the"); // left over from previous test

1656

1657 fields[1] = "*";

1658 status = U_ZERO_ERROR;

1659 n = pat1->split("Now is the time", fields, 1, status);

1660 REGEX_CHECK_STATUS;

1661 REGEX_ASSERT(n==1);

1662 REGEX_ASSERT(fields[0]=="Now is the time");

1663 REGEX_ASSERT(fields[1]=="*");

1664 status = U_ZERO_ERROR;

1665

1666 n = pat1->split(" Now is the time ", fields, 10, status);

1667 REGEX_CHECK_STATUS;

1668 REGEX_ASSERT(n==6);

1669 REGEX_ASSERT(fields[0]=="");

1670 REGEX_ASSERT(fields[1]=="Now");

1671 REGEX_ASSERT(fields[2]=="is");

1672 REGEX_ASSERT(fields[3]=="the");

1673 REGEX_ASSERT(fields[4]=="time");

1674 REGEX_ASSERT(fields[5]=="");

1675

1676 n = pat1->split(" ", fields, 10, status);

1677 REGEX_CHECK_STATUS;

1678 REGEX_ASSERT(n==2);

1679 REGEX_ASSERT(fields[0]=="");

1680 REGEX_ASSERT(fields[1]=="");

1681

1682 fields[0] = "foo";

1683 n = pat1->split("", fields, 10, status);

1684 REGEX_CHECK_STATUS;

1685 REGEX_ASSERT(n==0);

1686 REGEX_ASSERT(fields[0]=="foo");

1687

1688 delete pat1;

1689

1690 // split, with a pattern with (capture)

1691 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status) ;

1692 REGEX_CHECK_STATUS;

1693

1694 status = U_ZERO_ERROR;

1695 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);

1696 REGEX_CHECK_STATUS;

1697 REGEX_ASSERT(n==7);

1698 REGEX_ASSERT(fields[0]=="");

1699 REGEX_ASSERT(fields[1]=="a");

1700 REGEX_ASSERT(fields[2]=="Now is ");

1701 REGEX_ASSERT(fields[3]=="b");

1702 REGEX_ASSERT(fields[4]=="the time");

1703 REGEX_ASSERT(fields[5]=="c");

1704 REGEX_ASSERT(fields[6]=="");

1705 REGEX_ASSERT(status==U_ZERO_ERROR);

1706

1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);

1708 REGEX_CHECK_STATUS;

1709 REGEX_ASSERT(n==7);

1710 REGEX_ASSERT(fields[0]==" ");

1711 REGEX_ASSERT(fields[1]=="a");

1712 REGEX_ASSERT(fields[2]=="Now is ");

1713 REGEX_ASSERT(fields[3]=="b");

1714 REGEX_ASSERT(fields[4]=="the time");

1715 REGEX_ASSERT(fields[5]=="c");

1716 REGEX_ASSERT(fields[6]=="");

1717

1718 status = U_ZERO_ERROR;

1719 fields[6] = "foo";

1720 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);

1721 REGEX_CHECK_STATUS;

1722 REGEX_ASSERT(n==6);

1723 REGEX_ASSERT(fields[0]==" ");

1724 REGEX_ASSERT(fields[1]=="a");

1725 REGEX_ASSERT(fields[2]=="Now is ");

1726 REGEX_ASSERT(fields[3]=="b");

1727 REGEX_ASSERT(fields[4]=="the time");

1728 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.

1729 REGEX_ASSERT(fields[6]=="foo");

1730

1731 status = U_ZERO_ERROR;

1732 fields[5] = "foo";

1733 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);

1734 REGEX_CHECK_STATUS;

1735 REGEX_ASSERT(n==5);

1736 REGEX_ASSERT(fields[0]==" ");

1737 REGEX_ASSERT(fields[1]=="a");

1738 REGEX_ASSERT(fields[2]=="Now is ");

1739 REGEX_ASSERT(fields[3]=="b");

1740 REGEX_ASSERT(fields[4]=="the time<c>");

1741 REGEX_ASSERT(fields[5]=="foo");

1742

1743 status = U_ZERO_ERROR;

1744 fields[5] = "foo";

1745 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);

1746 REGEX_CHECK_STATUS;

1747 REGEX_ASSERT(n==5);

1748 REGEX_ASSERT(fields[0]==" ");

1749 REGEX_ASSERT(fields[1]=="a");

1750 REGEX_ASSERT(fields[2]=="Now is ");

1751 REGEX_ASSERT(fields[3]=="b");

1752 REGEX_ASSERT(fields[4]=="the time");

1753 REGEX_ASSERT(fields[5]=="foo");

1754

1755 status = U_ZERO_ERROR;

1756 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);

1757 REGEX_CHECK_STATUS;

1758 REGEX_ASSERT(n==4);

1759 REGEX_ASSERT(fields[0]==" ");

1760 REGEX_ASSERT(fields[1]=="a");

1761 REGEX_ASSERT(fields[2]=="Now is ");

1762 REGEX_ASSERT(fields[3]=="the time<c>");

1763 status = U_ZERO_ERROR;

1764 delete pat1;

1765

1766 pat1 = RegexPattern::compile("([-,])", pe, status);

1767 REGEX_CHECK_STATUS;

1768 n = pat1->split("1-10,20", fields, 10, status);

1769 REGEX_CHECK_STATUS;

1770 REGEX_ASSERT(n==5);

1771 REGEX_ASSERT(fields[0]=="1");

1772 REGEX_ASSERT(fields[1]=="-");

1773 REGEX_ASSERT(fields[2]=="10");

1774 REGEX_ASSERT(fields[3]==",");

1775 REGEX_ASSERT(fields[4]=="20");

1776 delete pat1;

1777

1778 // Test split of string with empty trailing fields

1779 pat1 = RegexPattern::compile(",", pe, status);

1780 REGEX_CHECK_STATUS;

1781 n = pat1->split("a,b,c,", fields, 10, status);

1782 REGEX_CHECK_STATUS;

1783 REGEX_ASSERT(n==4);

1784 REGEX_ASSERT(fields[0]=="a");

1785 REGEX_ASSERT(fields[1]=="b");

1786 REGEX_ASSERT(fields[2]=="c");

1787 REGEX_ASSERT(fields[3]=="");

1788

1789 n = pat1->split("a,,,", fields, 10, status);

1790 REGEX_CHECK_STATUS;

1791 REGEX_ASSERT(n==4);

1792 REGEX_ASSERT(fields[0]=="a");

1793 REGEX_ASSERT(fields[1]=="");

1794 REGEX_ASSERT(fields[2]=="");

1795 REGEX_ASSERT(fields[3]=="");

1796 delete pat1;

1797

1798 // Split Separator with zero length match.

1799 pat1 = RegexPattern::compile(":?", pe, status);

1800 REGEX_CHECK_STATUS;

1801 n = pat1->split("abc", fields, 10, status);

1802 REGEX_CHECK_STATUS;

1803 REGEX_ASSERT(n==5);

1804 REGEX_ASSERT(fields[0]=="");

1805 REGEX_ASSERT(fields[1]=="a");

1806 REGEX_ASSERT(fields[2]=="b");

1807 REGEX_ASSERT(fields[3]=="c");

1808 REGEX_ASSERT(fields[4]=="");

1809

1810 delete pat1;

1811

1812 //

1813 // RegexPattern::pattern()

1814 //

1815 pat1 = new RegexPattern();

1816 REGEX_ASSERT(pat1->pattern() == "");

1817 delete pat1;

1818

1819 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);

1820 REGEX_CHECK_STATUS;

1821 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");

1822 delete pat1;

1823

1824

1825 //

1826 // classID functions

1827 //

1828 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);

1829 REGEX_CHECK_STATUS;

1830 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());

1831 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);

1832 UnicodeString Hello("Hello, world.");

1833 RegexMatcher *m = pat1->matcher(Hello, status);

1834 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());

1835 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());

1836 REGEX_ASSERT(m->getDynamicClassID() != NULL);

1837 delete m;

1838 delete pat1;

1839

1840 }

1841

1842 //---------------------------------------------------------------------------

1843 //

1844 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher

1845 // is present and working, but excluding functions

1846 // implementing replace operations.

1847 //

1848 //---------------------------------------------------------------------------

1849 void RegexTest::API_Match_UTF8() {
// Exercises RegexPattern / RegexMatcher through their UText-taking overloads.
// Patterns and inputs are mostly opened as UTF-8 UTexts (utext_openUTF8 or
// regextst_openUTF8FromInvariant). The body is a series of independent scoped
// sections: reset, matches, lookingAt, start/end/group, find, zero-length
// find loops, matchers without input, regions, and hitEnd/requireEnd.
// Each section closes the UTexts and deletes the heap objects it created.
// Results are checked with the REGEX_* macros, which are defined outside this
// view; their exact failure behaviour is not visible here.
//
// The three locals declared next are used directly only by the first section;
// the later sections declare their own status (and, where needed, pe/flags)
// that shadow these.

1850 UParseError pe;

1851 UErrorCode status=U_ZERO_ERROR;

1852 int32_t flags = 0;

1853

1854 //

1855 // Debug - slide failing test cases early

1856 //

// Disabled scratch area: if enabled, the function would return right after
// the (currently empty) block, skipping everything below.
1857 #if 0

1858 {

1859 }

1860 return;

1861 #endif

1862

1863 //

1864 // Simple pattern compilation

1865 //

1866 {

1867 UText re = UTEXT_INITIALIZER;

1868 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);

1869 REGEX_VERBOSE_TEXT(&re);

1870 RegexPattern *pat2;

1871 pat2 = RegexPattern::compile(&re, flags, pe, status);

1872 REGEX_CHECK_STATUS;

1873

1874 UText input1 = UTEXT_INITIALIZER;

1875 UText input2 = UTEXT_INITIALIZER;

1876 UText empty = UTEXT_INITIALIZER;

1877 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &st atus);

1878 REGEX_VERBOSE_TEXT(&input1);

1879 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);

1880 REGEX_VERBOSE_TEXT(&input2);

// Unlike the other inputs, "empty" is opened over a zero-length UChar buffer.
1881 utext_openUChars(&empty, NULL, 0, &status);

1882

// Lengths are taken from the C literals rather than from the UTexts; this
// presumably relies on one byte per character for these strings - TODO confirm.
1883 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not na tivelen (input1) ? */

1884 int32_t input2Len = strlen("not abc");

1885

1886

1887 //

1888 // Matcher creation and reset.

1889 //

// matcher() yields a heap object (deleted at the end of this section); reset()
// returns a reference to that same matcher, hence the address-of.
1890 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);

1891 REGEX_CHECK_STATUS;

1892 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

1893 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x6 6, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */

1894 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

// Switching the input back and forth: inputText() must follow each reset().
1895 m1->reset(&input2);

1896 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

1897 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x 00 }; /* not abc */

1898 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());

1899 m1->reset(&input1);

1900 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

1901 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

1902 m1->reset(&empty);

1903 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

1904 REGEX_ASSERT(utext_nativeLength(&empty) == 0);

1905

1906 //

1907 // reset(pos, status)

1908 //

// Expected below: positions 0, input1Len-1 and input1Len are accepted, while
// -1 and input1Len+1 set U_INDEX_OUTOFBOUNDS_ERROR. status is cleared by hand
// after each step so one expected error does not leak into the next check.
1909 m1->reset(&input1);

1910 m1->reset(4, status);

1911 REGEX_CHECK_STATUS;

1912 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

1913 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

1914

1915 m1->reset(-1, status);

1916 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1917 status = U_ZERO_ERROR;

1918

1919 m1->reset(0, status);

1920 REGEX_CHECK_STATUS;

1921 status = U_ZERO_ERROR;

1922

1923 m1->reset(input1Len-1, status);

1924 REGEX_CHECK_STATUS;

1925 status = U_ZERO_ERROR;

1926

1927 m1->reset(input1Len, status);

1928 REGEX_CHECK_STATUS;

1929 status = U_ZERO_ERROR;

1930

1931 m1->reset(input1Len+1, status);

1932 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1933 status = U_ZERO_ERROR;

1934

1935 //

1936 // match(pos, status)

1937 //

// With input "not abc" the pattern "abc" is expected to match only when the
// match is started at index 4.
1938 m1->reset(&input2);

1939 REGEX_ASSERT(m1->matches(4, status) == TRUE);

1940 m1->reset();

1941 REGEX_ASSERT(m1->matches(3, status) == FALSE);

1942 m1->reset();

1943 REGEX_ASSERT(m1->matches(5, status) == FALSE);

1944 REGEX_ASSERT(m1->matches(4, status) == TRUE);

// A negative start index must both return FALSE and set the error status.
1945 REGEX_ASSERT(m1->matches(-1, status) == FALSE);

1946 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1947

1948 // Match() at end of string should fail, but should not

1949 // be an error.

1950 status = U_ZERO_ERROR;

1951 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);

1952 REGEX_CHECK_STATUS;

1953

1954 // Match beyond end of string should fail with an error.

1955 status = U_ZERO_ERROR;

1956 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);

1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1958

1959 // Successful match at end of string.

1960 {

1961 status = U_ZERO_ERROR;

1962 RegexMatcher m("A?", 0, status); // will match zero length string.

1963 REGEX_CHECK_STATUS;

1964 m.reset(&input1);

1965 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);

1966 REGEX_CHECK_STATUS;

1967 m.reset(&empty);

1968 REGEX_ASSERT(m.matches(0, status) == TRUE);

1969 REGEX_CHECK_STATUS;

1970 }

1971

1972

1973 //

1974 // lookingAt(pos, status)

1975 //

// Same index expectations as the matches(pos) checks above: TRUE only at 4,
// FALSE without error at input2Len, error at -1 and input2Len+1.
1976 status = U_ZERO_ERROR;

1977 m1->reset(&input2); // "not abc"

1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

1979 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);

1980 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);

1981 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

1982 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);

1983 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1984 status = U_ZERO_ERROR;

1985 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);

1986 REGEX_CHECK_STATUS;

1987 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);

1988 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

1989

// Section teardown: heap objects first, then the stack UTexts opened above.
1990 delete m1;

1991 delete pat2;

1992

1993 utext_close(&re);

1994 utext_close(&input1);

1995 utext_close(&input2);

1996 utext_close(&empty);

1997 }

1998

1999

2000 //

2001 // Capture Group.

2002 // RegexMatcher::start();

2003 // RegexMatcher::end();

2004 // RegexMatcher::groupCount();

2005 //

2006 {

// These three shadow the function-level flags / pe / status.
2007 int32_t flags=0;

2008 UParseError pe;

2009 UErrorCode status=U_ZERO_ERROR;

2010 UText re=UTEXT_INITIALIZER;

// NOTE(review): the byte values on the next line spell 01(23(45)67)(.*) ; the
// trailing comment there seems to have lost characters (including its closing
// star-slash) in this rendering - confirm against the original file.
2011 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x 34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67 )(.) /

2012 utext_openUTF8(&re, str_01234567_pat, -1, &status);

2013

2014 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

2015 REGEX_CHECK_STATUS;

2016

2017 UText input = UTEXT_INITIALIZER;

2018 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36 , 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */

2019 utext_openUTF8(&input, str_0123456789, -1, &status);

2020

2021 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);

2022 REGEX_CHECK_STATUS;

2023 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);

// Expected [start, end) per group index: 0 -> [0,10), 1 -> [2,8),
// 2 -> [4,6), 3 -> [8,10).
2024 static const int32_t matchStarts[] = {0, 2, 4, 8};

2025 static const int32_t matchEnds[] = {10, 8, 6, 10};

2026 int32_t i;

2027 for (i=0; i<4; i++) {

2028 int32_t actualStart = matcher->start(i, status);

2029 REGEX_CHECK_STATUS;

2030 if (actualStart != matchStarts[i]) {

2031 errln("RegexTest failure at %s:%d, index %d. Expected %d, got % d\n",

2032 __FILE__, __LINE__, i, matchStarts[i], actualStart);

2033 }

2034 int32_t actualEnd = matcher->end(i, status);

2035 REGEX_CHECK_STATUS;

2036 if (actualEnd != matchEnds[i]) {

2037 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d \n",

2038 __FILE__, __LINE__, i, matchEnds[i], actualEnd);

2039 }

2040 }

2041

// The no-index overloads must agree with group 0.
2042 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));

2043 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));

2044

// Group numbers outside 0..3 are expected to be rejected; after reset() there
// is no current match, so even group 0 is expected to fail.
2045 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

2046 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

2047 matcher->reset();

2048 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);

2049

// Match again so that the group() calls below have a match to query.
2050 matcher->lookingAt(status);

2051

2052 UnicodeString dest;

2053 UText destText = UTEXT_INITIALIZER;

2054 utext_openUnicodeString(&destText, &dest, &status);

2055 UText *result;

2056 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x 36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */

2057 // Test shallow-clone API

// Pattern used throughout the rest of this section: with a NULL destination
// group() hands back a UText that the test closes itself; with &destText the
// returned pointer is asserted to be &destText.
2058 int64_t group_len;

2059 result = matcher->group((UText *)NULL, group_len, status);

2060 REGEX_CHECK_STATUS;

2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

2062 utext_close(result);

2063 result = matcher->group(0, &destText, group_len, status);

2064 REGEX_CHECK_STATUS;

2065 REGEX_ASSERT(result == &destText);

2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

2067 // destText is now immutable, reopen it

2068 utext_close(&destText);

2069 utext_openUnicodeString(&destText, &dest, &status);

2070

2071 int64_t length;

2072 result = matcher->group(0, NULL, length, status);

2073 REGEX_CHECK_STATUS;

2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

2075 utext_close(result);

2076 result = matcher->group(0, &destText, length, status);

2077 REGEX_CHECK_STATUS;

2078 REGEX_ASSERT(result == &destText);

2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0);

2080 REGEX_ASSERT(length == 10);

2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2082

// For groups 1-3 the returned UText is still compared against the whole input
// "0123456789"; the group itself is identified by the UText's native index
// (group start) together with the reported length.
2083 // Capture Group 1 == "234567"

2084 result = matcher->group(1, NULL, length, status);

2085 REGEX_CHECK_STATUS;

2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2);

2087 REGEX_ASSERT(length == 6);

2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2089 utext_close(result);

2090

2091 result = matcher->group(1, &destText, length, status);

2092 REGEX_CHECK_STATUS;

2093 REGEX_ASSERT(result == &destText);

2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2);

2095 REGEX_ASSERT(length == 6);

2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

// result is &destText here, so this closes destText; the same destText is
// passed as the destination again further down.
2097 utext_close(result);

2098

2099 // Capture Group 2 == "45"

2100 result = matcher->group(2, NULL, length, status);

2101 REGEX_CHECK_STATUS;

2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4);

2103 REGEX_ASSERT(length == 2);

2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2105 utext_close(result);

2106

2107 result = matcher->group(2, &destText, length, status);

2108 REGEX_CHECK_STATUS;

2109 REGEX_ASSERT(result == &destText);

2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4);

2111 REGEX_ASSERT(length == 2);

2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2113 utext_close(result);

2114

2115 // Capture Group 3 == "89"

2116 result = matcher->group(3, NULL, length, status);

2117 REGEX_CHECK_STATUS;

2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8);

2119 REGEX_ASSERT(length == 2);

2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2121 utext_close(result);

2122

2123 result = matcher->group(3, &destText, length, status);

2124 REGEX_CHECK_STATUS;

2125 REGEX_ASSERT(result == &destText);

2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8);

2127 REGEX_ASSERT(length == 2);

2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);

2129 utext_close(result);

2130

2131 // Capture Group number out of range.

// status is cleared before each call so every expected failure is observed
// from a clean state.
2132 status = U_ZERO_ERROR;

2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

2134 status = U_ZERO_ERROR;

2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

2136 status = U_ZERO_ERROR;

2137 matcher->reset();

2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);

2139

2140 delete matcher;

2141 delete pat;

2142

2143 utext_close(&destText);

2144 utext_close(&input);

2145 utext_close(&re);

2146 }

2147

2148 //

2149 // find

2150 //

2151 {

2152 int32_t flags=0;

2153 UParseError pe;

2154 UErrorCode status=U_ZERO_ERROR;

2155 UText re=UTEXT_INITIALIZER;

2156 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

2157 utext_openUTF8(&re, str_abc, -1, &status);

2158

2159 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

2160 REGEX_CHECK_STATUS;

2161 UText input = UTEXT_INITIALIZER;

// 17 bytes of input; "abc" occurs at indexes 1, 6 and 12.
2162 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..ab c...abc.. */

2163 utext_openUTF8(&input, str_abcabcabc, -1, &status);

2164 // 012345678901234567

2165

2166 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);

2167 REGEX_CHECK_STATUS;

// Successive find() calls walk the three occurrences, then keep returning FALSE.
2168 REGEX_ASSERT(matcher->find());

2169 REGEX_ASSERT(matcher->start(status) == 1);

2170 REGEX_ASSERT(matcher->find());

2171 REGEX_ASSERT(matcher->start(status) == 6);

2172 REGEX_ASSERT(matcher->find());

2173 REGEX_ASSERT(matcher->start(status) == 12);

2174 REGEX_ASSERT(matcher->find() == FALSE);

2175 REGEX_ASSERT(matcher->find() == FALSE);

2176

2177 matcher->reset();

2178 REGEX_ASSERT(matcher->find());

2179 REGEX_ASSERT(matcher->start(status) == 1);

2180

// find(pos): expected to report the first occurrence at or after pos.
2181 REGEX_ASSERT(matcher->find(0, status));

2182 REGEX_ASSERT(matcher->start(status) == 1);

2183 REGEX_ASSERT(matcher->find(1, status));

2184 REGEX_ASSERT(matcher->start(status) == 1);

2185 REGEX_ASSERT(matcher->find(2, status));

2186 REGEX_ASSERT(matcher->start(status) == 6);

2187 REGEX_ASSERT(matcher->find(12, status));

2188 REGEX_ASSERT(matcher->start(status) == 12);

2189 REGEX_ASSERT(matcher->find(13, status) == FALSE);

2190 REGEX_ASSERT(matcher->find(16, status) == FALSE);

2191 REGEX_ASSERT(matcher->find(17, status) == FALSE);

// After a failed find() there is no current match to take start() from.
2192 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);

2193

// Index 17 (end of input) was accepted above; -1 and 18 must be rejected.
2194 status = U_ZERO_ERROR;

2195 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);

2196 status = U_ZERO_ERROR;

2197 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);

2198

2199 REGEX_ASSERT(matcher->groupCount() == 0);

2200

2201 delete matcher;

2202 delete pat;

2203

2204 utext_close(&input);

2205 utext_close(&re);

2206 }

2207

2208

2209 //

2210 // find, with \G in pattern (true if at the end of a previous match).

2211 //

2212 {

2213 int32_t flags=0;

2214 UParseError pe;

2215 UErrorCode status=U_ZERO_ERROR;

2216 UText re=UTEXT_INITIALIZER;

// NOTE(review): the byte values on the next line spell .*?(?:(\Gabc)|(abc)) ;
// the trailing comment there seems to have lost characters (including its
// closing star-slash) in this rendering - confirm against the original file.
2217 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0 x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x0 0 }; /* .?(?:(\\Gabc)\|(abc)) /

2218 utext_openUTF8(&re, str_Gabcabc, -1, &status);

2219

2220 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

2221

2222 REGEX_CHECK_STATUS;

2223 UText input = UTEXT_INITIALIZER;

2224 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */

2225 utext_openUTF8(&input, str_abcabcabc, -1, &status);

2226 // 012345678901234567

2227

2228 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);

2229 REGEX_CHECK_STATUS;

// First find(): the plain (abc) alternative, group 2, matches at index 1 and
// group 1 is unset (start reported as -1).
2230 REGEX_ASSERT(matcher->find());

2231 REGEX_ASSERT(matcher->start(status) == 0);

2232 REGEX_ASSERT(matcher->start(1, status) == -1);

2233 REGEX_ASSERT(matcher->start(2, status) == 1);

2234

// Second find(): starts at 4, where the previous match ended, so the \G
// alternative (group 1) is the one expected to match and group 2 is unset.
2235 REGEX_ASSERT(matcher->find());

2236 REGEX_ASSERT(matcher->start(status) == 4);

2237 REGEX_ASSERT(matcher->start(1, status) == 4);

2238 REGEX_ASSERT(matcher->start(2, status) == -1);

2239 REGEX_CHECK_STATUS;

2240

2241 delete matcher;

2242 delete pat;

2243

2244 utext_close(&input);

2245 utext_close(&re);

2246 }

2247

2248 //

2249 // find with zero length matches, match position should bump ahead

2250 // to prevent loops.

2251 //

2252 {

2253 int32_t i;

2254 UErrorCode status=U_ZERO_ERROR;

2255 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero- length matches anywhere,

2256 // using an always-true look-ahead.

2257 REGEX_CHECK_STATUS;

2258 UText s = UTEXT_INITIALIZER;

// NOTE(review): the i==5 assertion after the loop implies a 4-character input
// (empty matches at indexes 0..4). The literal on the next line shows a single
// space; whitespace was probably collapsed in this rendering - confirm against
// the original file.
2259 utext_openUTF8(&s, " ", -1, &status);

2260 m.reset(&s);

// i counts successful find() calls; each must be an empty match at index i.
2261 for (i=0; ; i++) {

2262 if (m.find() == FALSE) {

2263 break;

2264 }

2265 REGEX_ASSERT(m.start(status) == i);

2266 REGEX_ASSERT(m.end(status) == i);

2267 }

2268 REGEX_ASSERT(i==5);

2269

2270 // Check that the bump goes over characters outside the BMP OK

2271 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8

// Four 4-byte sequences: empty matches are expected at byte indexes
// 0, 4, 8, 12 and 16, so the loop below leaves i at 20.
2272 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x 82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};

// s is opened a second time here without an intervening utext_close().
2273 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);

2274 m.reset(&s);

2275 for (i=0; ; i+=4) {

2276 if (m.find() == FALSE) {

2277 break;

2278 }

2279 REGEX_ASSERT(m.start(status) == i);

2280 REGEX_ASSERT(m.end(status) == i);

2281 }

2282 REGEX_ASSERT(i==20);

2283

2284 utext_close(&s);

2285 }

2286 {

2287 // find() loop breaking test.

2288 // with pattern of /.?/, should see a series of one char matches, then a single

2289 // match of zero length at the end of the input string.

2290 int32_t i;

2291 UErrorCode status=U_ZERO_ERROR;

2292 RegexMatcher m(".?", 0, status);

2293 REGEX_CHECK_STATUS;

2294 UText s = UTEXT_INITIALIZER;

// NOTE(review): as above, the assertions below (one-char matches for i<4 and
// i==5 at the end) imply a 4-character input; the single space shown on the
// next line is probably a rendering artifact - confirm against the original.
2295 utext_openUTF8(&s, " ", -1, &status);

2296 m.reset(&s);

2297 for (i=0; ; i++) {

2298 if (m.find() == FALSE) {

2299 break;

2300 }

2301 REGEX_ASSERT(m.start(status) == i);

// Matches 0..3 consume one character; the fifth match (i==4) is empty.
2302 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));

2303 }

2304 REGEX_ASSERT(i==5);

2305

2306 utext_close(&s);

2307 }

2308

2309

2310 //

2311 // Matchers with no input string behave as if they had an empty input string .

2312 //

2313

2314 {

2315 UErrorCode status = U_ZERO_ERROR;

// ".?" can match empty, so find() is expected to succeed at index 0.
2316 RegexMatcher m(".?", 0, status);

2317 REGEX_CHECK_STATUS;

2318 REGEX_ASSERT(m.find());

2319 REGEX_ASSERT(m.start(status) == 0);

2320 REGEX_ASSERT(m.input() == "");

2321 }

2322 {

2323 UErrorCode status = U_ZERO_ERROR;

// "." needs one character, so with no input find() is expected to fail.
2324 RegexPattern *p = RegexPattern::compile(".", 0, status);

2325 RegexMatcher *m = p->matcher(status);

2326 REGEX_CHECK_STATUS;

2327

2328 REGEX_ASSERT(m->find() == FALSE);

2329 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);

2330 delete m;

2331 delete p;

2332 }

2333

2334 //

2335 // Regions

2336 //

2337 {

2338 UErrorCode status = U_ZERO_ERROR;

2339 UText testPattern = UTEXT_INITIALIZER;

2340 UText testText = UTEXT_INITIALIZER;

2341 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);

2342 REGEX_VERBOSE_TEXT(&testPattern);

2343 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &stat us);

2344 REGEX_VERBOSE_TEXT(&testText);

2345

// Expected state of a freshly constructed matcher: region spans the whole
// input, transparent bounds off, anchoring bounds on.
2346 RegexMatcher m(&testPattern, &testText, 0, status);

2347 REGEX_CHECK_STATUS;

2348 REGEX_ASSERT(m.regionStart() == 0);

2349 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));

2350 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

2352

// With the region narrowed to [2,4), ".*" must match exactly that span.
2353 m.region(2,4, status);

2354 REGEX_CHECK_STATUS;

2355 REGEX_ASSERT(m.matches(status));

2356 REGEX_ASSERT(m.start(status)==2);

2357 REGEX_ASSERT(m.end(status)==4);

2358 REGEX_CHECK_STATUS;

2359

// reset() is expected to restore the region to the full input.
2360 m.reset();

2361 REGEX_ASSERT(m.regionStart() == 0);

2362 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));

2363

// testText is re-opened over new content and handed to the matcher again;
// the region must then track the new input's length.
2364 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);

2365 REGEX_VERBOSE_TEXT(&testText);

2366 m.reset(&testText);

2367 REGEX_ASSERT(m.regionStart() == 0);

2368 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));

2369

// The use*Bounds() setters and reset() must return the matcher itself, and
// the bounds settings must survive a reset().
2370 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

2371 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));

2372 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

2373 REGEX_ASSERT(&m == &m.reset());

2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

2375

2376 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));

2377 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

2378 REGEX_ASSERT(&m == &m.reset());

2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

2380

2381 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

2382 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));

2383 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

2384 REGEX_ASSERT(&m == &m.reset());

2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

2386

2387 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));

2388 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

2389 REGEX_ASSERT(&m == &m.reset());

2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

2391

2392 utext_close(&testText);

2393 utext_close(&testPattern);

2394 }

2395

2396 //

2397 // hitEnd() and requireEnd()

2398 //

2399 {

2400 UErrorCode status = U_ZERO_ERROR;

2401 UText testPattern = UTEXT_INITIALIZER;

2402 UText testText = UTEXT_INITIALIZER;

2403 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */

2404 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */

2405 utext_openUTF8(&testPattern, str_, -1, &status);

2406 utext_openUTF8(&testText, str_aabb, -1, &status);

2407

// Pattern ".*" on "aabb": expected hitEnd TRUE, requireEnd FALSE.
2408 RegexMatcher m1(&testPattern, &testText, 0, status);

2409 REGEX_ASSERT(m1.lookingAt(status) == TRUE);

2410 REGEX_ASSERT(m1.hitEnd() == TRUE);

2411 REGEX_ASSERT(m1.requireEnd() == FALSE);

2412 REGEX_CHECK_STATUS;

2413

// Pattern "a*" on "aabb": expected hitEnd FALSE, requireEnd FALSE.
// testPattern is re-opened over the new pattern bytes for each matcher.
2414 status = U_ZERO_ERROR;

2415 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */

2416 utext_openUTF8(&testPattern, str_a, -1, &status);

2417 RegexMatcher m2(&testPattern, &testText, 0, status);

2418 REGEX_ASSERT(m2.lookingAt(status) == TRUE);

2419 REGEX_ASSERT(m2.hitEnd() == FALSE);

2420 REGEX_ASSERT(m2.requireEnd() == FALSE);

2421 REGEX_CHECK_STATUS;

2422

// Pattern ".*$" on "aabb": expected hitEnd TRUE and requireEnd TRUE.
// NOTE(review): the byte values on the declaration below spell .*$ ; its
// trailing comment seems to have lost characters (including the closing
// star-slash) in this rendering - confirm against the original file.
2423 status = U_ZERO_ERROR;

2424 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .$ /

2425 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);

2426 RegexMatcher m3(&testPattern, &testText, 0, status);

2427 REGEX_ASSERT(m3.lookingAt(status) == TRUE);

2428 REGEX_ASSERT(m3.hitEnd() == TRUE);

2429 REGEX_ASSERT(m3.requireEnd() == TRUE);

2430 REGEX_CHECK_STATUS;

2431

2432 utext_close(&testText);

2433 utext_close(&testPattern);

2434 }

2435 }

2436

2437

2438 //---------------------------------------------------------------------------

2439 //

2440 // API_Replace_UTF8 API test for class RegexMatcher, testing the

2441 // Replace family of functions.

2442 //

2443 //---------------------------------------------------------------------------

2444 void RegexTest::API_Replace_UTF8() {

2445 //

2446 // Replace

2447 //

2448 int32_t flags=0;

2449 UParseError pe;

2450 UErrorCode status=U_ZERO_ERROR;

2451

2452 UText re=UTEXT_INITIALIZER;

2453 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);

2454 REGEX_VERBOSE_TEXT(&re);

2455 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

2456 REGEX_CHECK_STATUS;

2457

2458 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */

2459 // 012345678901234567

2460 UText dataText = UTEXT_INITIALIZER;

2461 utext_openUTF8(&dataText, data, -1, &status);

2462 REGEX_CHECK_STATUS;

2463 REGEX_VERBOSE_TEXT(&dataText);

2464 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);

2465

2466 //

2467 // Plain vanilla matches.

2468 //

2469 UnicodeString dest;

2470 UText destText = UTEXT_INITIALIZER;

2471 utext_openUnicodeString(&destText, &dest, &status);

2472 UText *result;

2473

2474 UText replText = UTEXT_INITIALIZER;

2475

2476 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */

2477 utext_openUTF8(&replText, str_yz, -1, &status);

2478 REGEX_VERBOSE_TEXT(&replText);

2479 result = matcher->replaceFirst(&replText, NULL, status);

2480 REGEX_CHECK_STATUS;

2481 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63 , 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */

2482 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

2483 utext_close(result);

2484 result = matcher->replaceFirst(&replText, &destText, status);

2485 REGEX_CHECK_STATUS;

2486 REGEX_ASSERT(result == &destText);

2487 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

2488

2489 result = matcher->replaceAll(&replText, NULL, status);

2490 REGEX_CHECK_STATUS;

2491 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */

2492 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

2493 utext_close(result);

2494

2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2496 result = matcher->replaceAll(&replText, &destText, status);

2497 REGEX_CHECK_STATUS;

2498 REGEX_ASSERT(result == &destText);

2499 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

2500

2501 //

2502 // Plain vanilla non-matches.

2503 //

2504 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x6 2, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx... abx.. */

2505 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);

2506 matcher->reset(&dataText);

2507

2508 result = matcher->replaceFirst(&replText, NULL, status);

2509 REGEX_CHECK_STATUS;

2510 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

2511 utext_close(result);

2512 result = matcher->replaceFirst(&replText, &destText, status);

2513 REGEX_CHECK_STATUS;

2514 REGEX_ASSERT(result == &destText);

2515 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

2516

2517 result = matcher->replaceAll(&replText, NULL, status);

2518 REGEX_CHECK_STATUS;

2519 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

2520 utext_close(result);

2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2522 result = matcher->replaceAll(&replText, &destText, status);

2523 REGEX_CHECK_STATUS;

2524 REGEX_ASSERT(result == &destText);

2525 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

2526

2527 //

2528 // Empty source string

2529 //

2530 utext_openUTF8(&dataText, NULL, 0, &status);

2531 matcher->reset(&dataText);

2532

2533 result = matcher->replaceFirst(&replText, NULL, status);

2534 REGEX_CHECK_STATUS;

2535 REGEX_ASSERT_UTEXT_UTF8("", result);

2536 utext_close(result);

2537 result = matcher->replaceFirst(&replText, &destText, status);

2538 REGEX_CHECK_STATUS;

2539 REGEX_ASSERT(result == &destText);

2540 REGEX_ASSERT_UTEXT_UTF8("", result);

2541

2542 result = matcher->replaceAll(&replText, NULL, status);

2543 REGEX_CHECK_STATUS;

2544 REGEX_ASSERT_UTEXT_UTF8("", result);

2545 utext_close(result);

2546 result = matcher->replaceAll(&replText, &destText, status);

2547 REGEX_CHECK_STATUS;

2548 REGEX_ASSERT(result == &destText);

2549 REGEX_ASSERT_UTEXT_UTF8("", result);

2550

2551 //

2552 // Empty substitution string

2553 //

2554 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."

2555 matcher->reset(&dataText);

2556

2557 utext_openUTF8(&replText, NULL, 0, &status);

2558 result = matcher->replaceFirst(&replText, NULL, status);

2559 REGEX_CHECK_STATUS;

2560 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */

2561 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

2562 utext_close(result);

2563 result = matcher->replaceFirst(&replText, &destText, status);

2564 REGEX_CHECK_STATUS;

2565 REGEX_ASSERT(result == &destText);

2566 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

2567

2568 result = matcher->replaceAll(&replText, NULL, status);

2569 REGEX_CHECK_STATUS;

2570 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x 00 }; /* ........ */

2571 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

2572 utext_close(result);

2573 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2574 result = matcher->replaceAll(&replText, &destText, status);

2575 REGEX_CHECK_STATUS;

2576 REGEX_ASSERT(result == &destText);

2577 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

2578

2579 //

2580 // match whole string

2581 //

2582 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

2583 utext_openUTF8(&dataText, str_abc, -1, &status);

2584 matcher->reset(&dataText);

2585

2586 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */

2587 utext_openUTF8(&replText, str_xyz, -1, &status);

2588 result = matcher->replaceFirst(&replText, NULL, status);

2589 REGEX_CHECK_STATUS;

2590 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

2591 utext_close(result);

2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2593 result = matcher->replaceFirst(&replText, &destText, status);

2594 REGEX_CHECK_STATUS;

2595 REGEX_ASSERT(result == &destText);

2596 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

2597

2598 result = matcher->replaceAll(&replText, NULL, status);

2599 REGEX_CHECK_STATUS;

2600 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

2601 utext_close(result);

2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2603 result = matcher->replaceAll(&replText, &destText, status);

2604 REGEX_CHECK_STATUS;

2605 REGEX_ASSERT(result == &destText);

2606 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

2607

2608 //

2609 // Capture Group, simple case

2610 //

2611 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */

2612 utext_openUTF8(&re, str_add, -1, &status);

2613 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);

2614 REGEX_CHECK_STATUS;

2615

2616 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */

2617 utext_openUTF8(&dataText, str_abcdefg, -1, &status);

2618 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);

2619 REGEX_CHECK_STATUS;

2620

2621 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */

2622 utext_openUTF8(&replText, str_11, -1, &status);

2623 result = matcher2->replaceFirst(&replText, NULL, status);

2624 REGEX_CHECK_STATUS;

2625 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67 , 0x00 }; /* bcbcdefg */

2626 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

2627 utext_close(result);

2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2629 result = matcher2->replaceFirst(&replText, &destText, status);

2630 REGEX_CHECK_STATUS;

2631 REGEX_ASSERT(result == &destText);

2632 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

2633

2634 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x6 5, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */

2635 utext_openUTF8(&replText, str_v, -1, &status);

2636 REGEX_VERBOSE_TEXT(&replText);

2637 result = matcher2->replaceFirst(&replText, NULL, status);

2638 REGEX_CHECK_STATUS;

2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0 x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg * /

2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

2641 utext_close(result);

2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2643 result = matcher2->replaceFirst(&replText, &destText, status);

2644 REGEX_CHECK_STATUS;

2645 REGEX_ASSERT(result == &destText);

2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

2647

2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x2 0, 0x69, 0x74, 0x73, 0x65, 0x6c,

2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,

2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */

2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);

2652 result = matcher2->replaceFirst(&replText, NULL, status);

2653 REGEX_CHECK_STATUS;

2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0 x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2 4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */

2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

2656 utext_close(result);

2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2658 result = matcher2->replaceFirst(&replText, &destText, status);

2659 REGEX_CHECK_STATUS;

2660 REGEX_ASSERT(result == &destText);

2661 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

2662

2663 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d , 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */

2664 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001 D7CF, MATHEMATICAL BOLD DIGIT ONE

2665 // 012345678901234567890123456

2666 supplDigitChars[22] = 0xF0;

2667 supplDigitChars[23] = 0x9D;

2668 supplDigitChars[24] = 0x9F;

2669 supplDigitChars[25] = 0x8F;

2670 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);

2671

2672 result = matcher2->replaceFirst(&replText, NULL, status);

2673 REGEX_CHECK_STATUS;

2674 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x 20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplementa l Digit 1 bc.defg */

2675 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

2676 utext_close(result);

2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2678 result = matcher2->replaceFirst(&replText, &destText, status);

2679 REGEX_CHECK_STATUS;

2680 REGEX_ASSERT(result == &destText);

2681 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

2682 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x 61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e , 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */

2683 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);

2684 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)) , U_INDEX_OUTOFBOUNDS_ERROR);

2685 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

2686 utext_close(result);

2687 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

2688 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, sta tus)), U_INDEX_OUTOFBOUNDS_ERROR);

2689 REGEX_ASSERT(result == &destText);

2690 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

2691

2692 //

2693 // Replacement String with \u hex escapes

2694 //

2695 {

2696 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61 , 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 a bc 2 abc 3 */

2697 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */

2698 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);

2699 utext_openUTF8(&replText, str_u0043, -1, &status);

2700 matcher->reset(&dataText);

2701

2702 result = matcher->replaceAll(&replText, NULL, status);

2703 REGEX_CHECK_STATUS;

2704 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x 20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d , 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */

2705 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);

2706 utext_close(result);

2707 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &sta tus);

2708 result = matcher->replaceAll(&replText, &destText, status);

2709 REGEX_CHECK_STATUS;

2710 REGEX_ASSERT(result == &destText);

2711 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);

2712 }

2713 {

2714 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */

2715 utext_openUTF8(&dataText, str_abc, -1, &status);

2716 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */

2717 utext_openUTF8(&replText, str_U00010000, -1, &status);

2718 matcher->reset(&dataText);

2719

2720 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0 x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"

2721 // 0123456789

2722 expected[2] = 0xF0;

2723 expected[3] = 0x90;

2724 expected[4] = 0x80;

2725 expected[5] = 0x80;

2726

2727 result = matcher->replaceAll(&replText, NULL, status);

2728 REGEX_CHECK_STATUS;

2729 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);

2730 utext_close(result);

2731 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &sta tus);

2732 result = matcher->replaceAll(&replText, &destText, status);

2733 REGEX_CHECK_STATUS;

2734 REGEX_ASSERT(result == &destText);

2735 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);

2736 }

2737 // TODO: need more thorough testing of capture substitutions.

2738

2739 // Bug 4057

2740 //

2741 {

2742 status = U_ZERO_ERROR;

2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.?)ee /

2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x 20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69 , 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start wit h ss and end with ee ss stuff ee fin */

2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */

2746 utext_openUTF8(&re, str_ssee, -1, &status);

2747 utext_openUTF8(&dataText, str_blah, -1, &status);

2748 utext_openUTF8(&replText, str_ooh, -1, &status);

2749

2750 RegexMatcher m(&re, 0, status);

2751 REGEX_CHECK_STATUS;

2752

2753 UnicodeString result;

2754 UText resultText = UTEXT_INITIALIZER;

2755 utext_openUnicodeString(&resultText, &result, &status);

2756

2757 // Multiple finds do NOT bump up the previous appendReplacement position.

2758 m.reset(&dataText);

2759 m.find();

2760 m.find();

2761 m.appendReplacement(&resultText, &replText, status);

2762 REGEX_CHECK_STATUS;

2763 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

2764 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);

2765

2766 // After a reset into the interior of a string, appendReplacement still starts at beginning.

2767 status = U_ZERO_ERROR;

2768 result.truncate(0);

2769 utext_openUnicodeString(&resultText, &result, &status);

2770 m.reset(10, status);

2771 m.find();

2772 m.find();

2773 m.appendReplacement(&resultText, &replText, status);

2774 REGEX_CHECK_STATUS;

2775 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

2776 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);

2777

2778 // find() at interior of string, appendReplacement still starts at begin ning.

2779 status = U_ZERO_ERROR;

2780 result.truncate(0);

2781 utext_openUnicodeString(&resultText, &result, &status);

2782 m.reset();

2783 m.find(10, status);

2784 m.find();

2785 m.appendReplacement(&resultText, &replText, status);

2786 REGEX_CHECK_STATUS;

2787 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

2788 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);

2789

2790 m.appendTail(&resultText, status);

2791 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x6 9, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */

2792 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);

2793

2794 utext_close(&resultText);

2795 }

2796

2797 delete matcher2;

2798 delete pat2;

2799 delete matcher;

2800 delete pat;

2801

2802 utext_close(&dataText);

2803 utext_close(&replText);

2804 utext_close(&destText);

2805 utext_close(&re);

2806 }

2807

2808

2809 //---------------------------------------------------------------------------

2810 //

2811 // API_Pattern_UTF8 Test that the API for class RegexPattern is

2812 // present and nominally working.

2813 //

2814 //---------------------------------------------------------------------------

2815 void RegexTest::API_Pattern_UTF8() {

2816 RegexPattern pata; // Test default constructor to not crash.

2817 RegexPattern patb;

2818

2819 REGEX_ASSERT(pata == patb);

2820 REGEX_ASSERT(pata == pata);

2821

2822 UText re1 = UTEXT_INITIALIZER;

2823 UText re2 = UTEXT_INITIALIZER;

2824 UErrorCode status = U_ZERO_ERROR;

2825 UParseError pe;

2826

2827 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */

2828 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */

2829 utext_openUTF8(&re1, str_abcalmz, -1, &status);

2830 utext_openUTF8(&re2, str_def, -1, &status);

2831

2832 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);

2833 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);

2834 REGEX_CHECK_STATUS;

2835 REGEX_ASSERT(pat1 == pat1);

2836 REGEX_ASSERT(*pat1 != pata);

2837

2838 // Assign

2839 patb = *pat1;

2840 REGEX_ASSERT(patb == *pat1);

2841

2842 // Copy Construct

2843 RegexPattern patc(*pat1);

2844 REGEX_ASSERT(patc == *pat1);

2845 REGEX_ASSERT(patb == patc);

2846 REGEX_ASSERT(pat1 != pat2);

2847 patb = *pat2;

2848 REGEX_ASSERT(patb != patc);

2849 REGEX_ASSERT(patb == *pat2);

2850

2851 // Compile with no flags.

2852 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);

2853 REGEX_ASSERT(pat1a == pat1);

2854

2855 REGEX_ASSERT(pat1a->flags() == 0);

2856

2857 // Compile with different flags should be not equal

2858 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSI TIVE, pe, status);

2859 REGEX_CHECK_STATUS;

2860

2861 REGEX_ASSERT(pat1b != pat1a);

2862 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);

2863 REGEX_ASSERT(pat1a->flags() == 0);

2864 delete pat1b;

2865

2866 // clone

2867 RegexPattern *pat1c = pat1->clone();

2868 REGEX_ASSERT(pat1c == pat1);

2869 REGEX_ASSERT(pat1c != pat2);

2870

2871 delete pat1c;

2872 delete pat1a;

2873 delete pat1;

2874 delete pat2;

2875

2876 utext_close(&re1);

2877 utext_close(&re2);

2878

2879

2880 //

2881 // Verify that a matcher created from a cloned pattern works.

2882 // (Jitterbug 3423)

2883 //

2884 {

2885 UErrorCode status = U_ZERO_ERROR;

2886 UText pattern = UTEXT_INITIALIZER;

2887 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \ p{L}+ */

2888 utext_openUTF8(&pattern, str_pL, -1, &status);

2889

2890 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);

2891 RegexPattern *pClone = pSource->clone();

2892 delete pSource;

2893 RegexMatcher *mFromClone = pClone->matcher(status);

2894 REGEX_CHECK_STATUS;

2895

2896 UText input = UTEXT_INITIALIZER;

2897 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57 , 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */

2898 utext_openUTF8(&input, str_HelloWorld, -1, &status);

2899 mFromClone->reset(&input);

2900 REGEX_ASSERT(mFromClone->find() == TRUE);

2901 REGEX_ASSERT(mFromClone->group(status) == "Hello");

2902 REGEX_ASSERT(mFromClone->find() == TRUE);

2903 REGEX_ASSERT(mFromClone->group(status) == "World");

2904 REGEX_ASSERT(mFromClone->find() == FALSE);

2905 delete mFromClone;

2906 delete pClone;

2907

2908 utext_close(&input);

2909 utext_close(&pattern);

2910 }

2911

2912 //

2913 // matches convenience API

2914 //

2915 {

2916 UErrorCode status = U_ZERO_ERROR;

2917 UText pattern = UTEXT_INITIALIZER;

2918 UText input = UTEXT_INITIALIZER;

2919

2920 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x2 0, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */

2921 utext_openUTF8(&input, str_randominput, -1, &status);

2922

2923 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */

2924 utext_openUTF8(&pattern, str_dotstar, -1, &status);

2925 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE );

2926 REGEX_CHECK_STATUS;

2927

2928 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

2929 utext_openUTF8(&pattern, str_abc, -1, &status);

2930 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);

2931 REGEX_CHECK_STATUS;

2932

2933 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .nput /

2934 utext_openUTF8(&pattern, str_nput, -1, &status);

2935 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);

2936 REGEX_CHECK_STATUS;

2937

2938 utext_openUTF8(&pattern, str_randominput, -1, &status);

2939 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, s tatus) == TRUE);

2940 REGEX_CHECK_STATUS;

2941

2942 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .u /

2943 utext_openUTF8(&pattern, str_u, -1, &status);

2944 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);

2945 REGEX_CHECK_STATUS;

2946

2947 utext_openUTF8(&input, str_abc, -1, &status);

2948 utext_openUTF8(&pattern, str_abc, -1, &status);

2949 status = U_INDEX_OUTOFBOUNDS_ERROR;

2950 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);

2951 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

2952

2953 utext_close(&input);

2954 utext_close(&pattern);

2955 }

2956

2957

2958 //

2959 // Split()

2960 //

2961 status = U_ZERO_ERROR;

2962 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */

2963 utext_openUTF8(&re1, str_spaceplus, -1, &status);

2964 pat1 = RegexPattern::compile(&re1, pe, status);

2965 REGEX_CHECK_STATUS;

2966 UnicodeString fields[10];

2967

2968 int32_t n;

2969 n = pat1->split("Now is the time", fields, 10, status);

2970 REGEX_CHECK_STATUS;

2971 REGEX_ASSERT(n==4);

2972 REGEX_ASSERT(fields[0]=="Now");

2973 REGEX_ASSERT(fields[1]=="is");

2974 REGEX_ASSERT(fields[2]=="the");

2975 REGEX_ASSERT(fields[3]=="time");

2976 REGEX_ASSERT(fields[4]=="");

2977

2978 n = pat1->split("Now is the time", fields, 2, status);

2979 REGEX_CHECK_STATUS;

2980 REGEX_ASSERT(n==2);

2981 REGEX_ASSERT(fields[0]=="Now");

2982 REGEX_ASSERT(fields[1]=="is the time");

2983 REGEX_ASSERT(fields[2]=="the"); // left over from previous test

2984

2985 fields[1] = "*";

2986 status = U_ZERO_ERROR;

2987 n = pat1->split("Now is the time", fields, 1, status);

2988 REGEX_CHECK_STATUS;

2989 REGEX_ASSERT(n==1);

2990 REGEX_ASSERT(fields[0]=="Now is the time");

2991 REGEX_ASSERT(fields[1]=="*");

2992 status = U_ZERO_ERROR;

2993

2994 n = pat1->split(" Now is the time ", fields, 10, status);

2995 REGEX_CHECK_STATUS;

2996 REGEX_ASSERT(n==6);

2997 REGEX_ASSERT(fields[0]=="");

2998 REGEX_ASSERT(fields[1]=="Now");

2999 REGEX_ASSERT(fields[2]=="is");

3000 REGEX_ASSERT(fields[3]=="the");

3001 REGEX_ASSERT(fields[4]=="time");

3002 REGEX_ASSERT(fields[5]=="");

3003 REGEX_ASSERT(fields[6]=="");

3004

3005 fields[2] = "*";

3006 n = pat1->split(" ", fields, 10, status);

3007 REGEX_CHECK_STATUS;

3008 REGEX_ASSERT(n==2);

3009 REGEX_ASSERT(fields[0]=="");

3010 REGEX_ASSERT(fields[1]=="");

3011 REGEX_ASSERT(fields[2]=="*");

3012

3013 fields[0] = "foo";

3014 n = pat1->split("", fields, 10, status);

3015 REGEX_CHECK_STATUS;

3016 REGEX_ASSERT(n==0);

3017 REGEX_ASSERT(fields[0]=="foo");

3018

3019 delete pat1;

3020

3021 // split, with a pattern with (capture)

3022 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);

3023 pat1 = RegexPattern::compile(&re1, pe, status);

3024 REGEX_CHECK_STATUS;

3025

3026 status = U_ZERO_ERROR;

3027 fields[6] = fields[7] = "*";

3028 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);

3029 REGEX_CHECK_STATUS;

3030 REGEX_ASSERT(n==7);

3031 REGEX_ASSERT(fields[0]=="");

3032 REGEX_ASSERT(fields[1]=="a");

3033 REGEX_ASSERT(fields[2]=="Now is ");

3034 REGEX_ASSERT(fields[3]=="b");

3035 REGEX_ASSERT(fields[4]=="the time");

3036 REGEX_ASSERT(fields[5]=="c");

3037 REGEX_ASSERT(fields[6]=="");

3038 REGEX_ASSERT(fields[7]=="*");

3039 REGEX_ASSERT(status==U_ZERO_ERROR);

3040

3041 fields[6] = fields[7] = "*";

3042 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);

3043 REGEX_CHECK_STATUS;

3044 REGEX_ASSERT(n==7);

3045 REGEX_ASSERT(fields[0]==" ");

3046 REGEX_ASSERT(fields[1]=="a");

3047 REGEX_ASSERT(fields[2]=="Now is ");

3048 REGEX_ASSERT(fields[3]=="b");

3049 REGEX_ASSERT(fields[4]=="the time");

3050 REGEX_ASSERT(fields[5]=="c");

3051 REGEX_ASSERT(fields[6]=="");

3052 REGEX_ASSERT(fields[7]=="*");

3053

3054 status = U_ZERO_ERROR;

3055 fields[6] = "foo";

3056 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);

3057 REGEX_CHECK_STATUS;

3058 REGEX_ASSERT(n==6);

3059 REGEX_ASSERT(fields[0]==" ");

3060 REGEX_ASSERT(fields[1]=="a");

3061 REGEX_ASSERT(fields[2]=="Now is ");

3062 REGEX_ASSERT(fields[3]=="b");

3063 REGEX_ASSERT(fields[4]=="the time");

3064 REGEX_ASSERT(fields[5]==" ");

3065 REGEX_ASSERT(fields[6]=="foo");

3066

3067 status = U_ZERO_ERROR;

3068 fields[5] = "foo";

3069 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);

3070 REGEX_CHECK_STATUS;

3071 REGEX_ASSERT(n==5);

3072 REGEX_ASSERT(fields[0]==" ");

3073 REGEX_ASSERT(fields[1]=="a");

3074 REGEX_ASSERT(fields[2]=="Now is ");

3075 REGEX_ASSERT(fields[3]=="b");

3076 REGEX_ASSERT(fields[4]=="the time<c>");

3077 REGEX_ASSERT(fields[5]=="foo");

3078

3079 status = U_ZERO_ERROR;

3080 fields[5] = "foo";

3081 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);

3082 REGEX_CHECK_STATUS;

3083 REGEX_ASSERT(n==5);

3084 REGEX_ASSERT(fields[0]==" ");

3085 REGEX_ASSERT(fields[1]=="a");

3086 REGEX_ASSERT(fields[2]=="Now is ");

3087 REGEX_ASSERT(fields[3]=="b");

3088 REGEX_ASSERT(fields[4]=="the time");

3089 REGEX_ASSERT(fields[5]=="foo");

3090

3091 status = U_ZERO_ERROR;

3092 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);

3093 REGEX_CHECK_STATUS;

3094 REGEX_ASSERT(n==4);

3095 REGEX_ASSERT(fields[0]==" ");

3096 REGEX_ASSERT(fields[1]=="a");

3097 REGEX_ASSERT(fields[2]=="Now is ");

3098 REGEX_ASSERT(fields[3]=="the time<c>");

3099 status = U_ZERO_ERROR;

3100 delete pat1;

3101

3102 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);

3103 pat1 = RegexPattern::compile(&re1, pe, status);

3104 REGEX_CHECK_STATUS;

3105 n = pat1->split("1-10,20", fields, 10, status);

3106 REGEX_CHECK_STATUS;

3107 REGEX_ASSERT(n==5);

3108 REGEX_ASSERT(fields[0]=="1");

3109 REGEX_ASSERT(fields[1]=="-");

3110 REGEX_ASSERT(fields[2]=="10");

3111 REGEX_ASSERT(fields[3]==",");

3112 REGEX_ASSERT(fields[4]=="20");

3113 delete pat1;

3114

3115

3116 //

3117 // split of a UText based string, with library allocating output UTexts.

3118 //

3119 {

3120 status = U_ZERO_ERROR;

3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status);

3122 UnicodeString stringToSplit("first:second:third");

3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &stat us);

3124 REGEX_CHECK_STATUS;

3125

3126 UText *splits[10] = {NULL};

3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(spl its), status);

3128 REGEX_CHECK_STATUS;

3129 REGEX_ASSERT(numFields == 5);

3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);

3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);

3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);

3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);

3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);

3135 REGEX_ASSERT(splits[5] == NULL);

3136

3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {

3138 if (splits[i]) {

3139 utext_close(splits[i]);

3140 splits[i] = NULL;

3141 }

3142 }

3143 utext_close(textToSplit);

3144 }

3145

3146

3147 //

3148 // RegexPattern::pattern() and patternText()

3149 //

3150 pat1 = new RegexPattern();

3151 REGEX_ASSERT(pat1->pattern() == "");

3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));

3153 delete pat1;

3154 const char helloWorldInvariant = "(Hello, world)";

3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);

3156 pat1 = RegexPattern::compile(&re1, pe, status);

3157 REGEX_CHECK_STATUS;

3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());

3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));

3160 delete pat1;

3161

3162 utext_close(&re1);

3163 }

3164

3165

3166 //---------------------------------------------------------------------------

3167 //

3168 // Extended A more thorough check for features of regex patterns

3169 // The test cases are in a separate data file,

3170 // source/tests/testdata/regextst.txt

3171 // A description of the test data format is included in that file.

3172 //

3173 //---------------------------------------------------------------------------

3174

3175 const char *

3176 RegexTest::getPath(char buffer[2048], const char *filename) {

3177 UErrorCode status=U_ZERO_ERROR;

3178 const char *testDataDirectory = IntlTest::getSourceTestData(status);

3179 if (U_FAILURE(status)) {

3180 errln("ERROR: loadTestData() failed - %s", u_errorName(status));

3181 return NULL;

3182 }

3183

3184 strcpy(buffer, testDataDirectory);

3185 strcat(buffer, filename);

3186 return buffer;

3187 }

3188

3189 void RegexTest::Extended() {

3190 char tdd[2048];

3191 const char *srcPath;

3192 UErrorCode status = U_ZERO_ERROR;

3193 int32_t lineNum = 0;

3194

3195 //

3196 // Open and read the test data file.

3197 //

3198 srcPath=getPath(tdd, "regextst.txt");

3199 if(srcPath==NULL) {

3200 return; /* something went wrong, error already output */

3201 }

3202

3203 int32_t len;

3204 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);

3205 if (U_FAILURE(status)) {

3206 return; /* something went wrong, error already output */

3207 }

3208

3209 //

3210 // Put the test data into a UnicodeString

3211 //

3212 UnicodeString testString(FALSE, testData, len);

3213

3214 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\ \1"), 0, status);

3215 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, stat us);

3216 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMQvab tyYzZ2-9])([:letter:]*)"), 0, status);

3217

3218 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0 , status);

3219 UnicodeString testPattern; // The pattern for test from the test file.

3220 UnicodeString testFlags; // the flags for a test.

3221 UnicodeString matchString; // The marked up string to be used as input

3222

3223 if (U_FAILURE(status)){

3224 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));

3225 delete [] testData;

3226 return;

3227 }

3228

3229 //

3230 // Loop over the test data file, once per line.

3231 //

3232 while (lineMat.find()) {

3233 lineNum++;

3234 if (U_FAILURE(status)) {

3235 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status) );

3236 }

3237

3238 status = U_ZERO_ERROR;

3239 UnicodeString testLine = lineMat.group(1, status);

3240 if (testLine.length() == 0) {

3241 continue;

3242 }

3243

3244 //

3245 // Parse the test line. Skip blank and comment only lines.

3246 // Separate out the three main fields - pattern, flags, target.

3247 //

3248

3249 commentMat.reset(testLine);

3250 if (commentMat.lookingAt(status)) {

3251 // This line is a comment, or blank.

3252 continue;

3253 }

3254

3255 //

3256 // Pull out the pattern field, remove it from the test file line.

3257 //

3258 quotedStuffMat.reset(testLine);

3259 if (quotedStuffMat.lookingAt(status)) {

3260 testPattern = quotedStuffMat.group(2, status);

3261 testLine.remove(0, quotedStuffMat.end(0, status));

3262 } else {

3263 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);

3264 continue;

3265 }

3266

3267

3268 //

3269 // Pull out the flags from the test file line.

3270 //

3271 flagsMat.reset(testLine);

3272 flagsMat.lookingAt(status); // Will always match, possi bly an empty string.

3273 testFlags = flagsMat.group(1, status);

3274 if (flagsMat.group(2, status).length() > 0) {

3275 errln("Bad Match flag at line %d. Scanning %c\n",

3276 lineNum, flagsMat.group(2, status).charAt(0));

3277 continue;

3278 }

3279 testLine.remove(0, flagsMat.end(0, status));

3280

3281 //

3282 // Pull out the match string, as a whole.

3283 // We'll process the <tags> later.

3284 //

3285 quotedStuffMat.reset(testLine);

3286 if (quotedStuffMat.lookingAt(status)) {

3287 matchString = quotedStuffMat.group(2, status);

3288 testLine.remove(0, quotedStuffMat.end(0, status));

3289 } else {

3290 errln("Bad match string at test file line %d", lineNum);

3291 continue;

3292 }

3293

3294 //

3295 // The only thing left from the input line should be an optional traili ng comment.

3296 //

3297 commentMat.reset(testLine);

3298 if (commentMat.lookingAt(status) == FALSE) {

3299 errln("Line %d: unexpected characters at end of test line.", lineNum );

3300 continue;

3301 }

3302

3303 //

3304 // Run the test

3305 //

3306 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);

3307 }

3308

3309 delete [] testData;

3310

3311 }

3312

3313

3314

3315 //---------------------------------------------------------------------------

3316 //

3317 // regex_find(pattern, flags, inputString, lineNumber)

3318 //

3319 // Function to run a single test from the Extended (data driven) tests.

3320 // See file test/testdata/regextst.txt for a description of the

3321 // pattern and inputString fields, and the allowed flags.

3322 // lineNumber is the source line in regextst.txt of the test.

3323 //

3324 //---------------------------------------------------------------------------

3325

3326

3327 // Set a value into a UVector at position specified by a decimal number in

3328 // a UnicodeString. This is a utility function needed by the actual test fun ction,

3329 // which follows.

3330 static void set(UVector &vec, int32_t val, UnicodeString index) {

3331 UErrorCode status=U_ZERO_ERROR;

3332 int32_t idx = 0;

3333 for (int32_t i=0; i<index.length(); i++) {

3334 int32_t d=u_charDigitValue(index.charAt(i));

3335 if (d<0) {return;}

3336 idx = idx*10 + d;

3337 }

3338 while (vec.size()<idx+1) {vec.addElement(-1, status);}

3339 vec.setElementAt(val, idx);

3340 }

3341

3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {

3343 UErrorCode status=U_ZERO_ERROR;

3344 while (vec.size()<idx+1) {vec.addElement(-1, status);}

3345 vec.setElementAt(val, idx);

3346 }

3347

3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& na tiveIndex)

3349 {

3350 UBool couldFind = TRUE;

3351 UTEXT_SETNATIVEINDEX(utext, 0);

3352 int32_t i = 0;

3353 while (i < unistrOffset) {

3354 UChar32 c = UTEXT_NEXT32(utext);

3355 if (c != U_SENTINEL) {

3356 i += U16_LENGTH(c);

3357 } else {

3358 couldFind = FALSE;

3359 break;

3360 }

3361 }

3362 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);

3363 return couldFind;

3364 }

3365

3366

3367 void RegexTest::regex_find(const UnicodeString &pattern,

3368 const UnicodeString &flags,

3369 const UnicodeString &inputString,

3370 const char *srcPath,

3371 int32_t line) {

3372 UnicodeString unEscapedInput;

3373 UnicodeString deTaggedInput;

3374

3375 int32_t patternUTF8Length, inputUTF8Length;

3376 char patternChars = NULL, inputChars = NULL;

3377 UText patternText = UTEXT_INITIALIZER;

3378 UText inputText = UTEXT_INITIALIZER;

3379 UConverter *UTF8Converter = NULL;

3380

3381 UErrorCode status = U_ZERO_ERROR;

3382 UParseError pe;

3383 RegexPattern *parsePat = NULL;

3384 RegexMatcher *parseMatcher = NULL;

3385 RegexPattern callerPattern = NULL, UTF8Pattern = NULL;

3386 RegexMatcher matcher = NULL, UTF8Matcher = NULL;

3387 UVector groupStarts(status);

3388 UVector groupEnds(status);

3389 UVector groupStartsUTF8(status);

3390 UVector groupEndsUTF8(status);

3391 UBool isMatch = FALSE, isUTF8Match = FALSE;

3392 UBool failed = FALSE;

3393 int32_t numFinds;

3394 int32_t i;

3395 UBool useMatchesFunc = FALSE;

3396 UBool useLookingAtFunc = FALSE;

3397 int32_t regionStart = -1;

3398 int32_t regionEnd = -1;

3399 int32_t regionStartUTF8 = -1;

3400 int32_t regionEndUTF8 = -1;

3401

3402

3403 //

3404 // Compile the caller's pattern

3405 //

3406 uint32_t bflags = 0;

3407 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag

3408 bflags \|= UREGEX_CASE_INSENSITIVE;

3409 }

3410 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag

3411 bflags \|= UREGEX_COMMENTS;

3412 }

3413 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag

3414 bflags \|= UREGEX_DOTALL;

3415 }

3416 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag

3417 bflags \|= UREGEX_MULTILINE;

3418 }

3419

3420 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag

3421 bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;

3422 }

3423 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag

3424 bflags \|= UREGEX_UNIX_LINES;

3425 }

3426 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag

3427 bflags \|= UREGEX_LITERAL;

3428 }

3429

3430

3431 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);

3432 if (status != U_ZERO_ERROR) {

3433 #if UCONFIG_NO_BREAK_ITERATION==1

3434 // 'v' test flag means that the test pattern should not compile if ICU w as configured

3435 // to not include break iteration. RBBI is needed for Unicode word boundaries.

3436 if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_E RROR) {

3437 goto cleanupAndReturn;

3438 }

3439 #endif

3440 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'

3441 // Expected pattern compilation error.

3442 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'

3443 logln("Pattern Compile returns \"%s\"", u_errorName(status));

3444 }

3445 goto cleanupAndReturn;

3446 } else {

3447 // Unexpected pattern compilation error.

3448 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName( status));

3449 goto cleanupAndReturn;

3450 }

3451 }

3452

3453 UTF8Converter = ucnv_open("UTF8", &status);

3454 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

3455

3456 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);

3457 status = U_ZERO_ERROR; // buffer overflow

3458 patternChars = new char[patternUTF8Length+1];

3459 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);

3460 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);

3461

3462 if (status == U_ZERO_ERROR) {

3463 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);

3464

3465 if (status != U_ZERO_ERROR) {

3466 #if UCONFIG_NO_BREAK_ITERATION==1

3467 // 'v' test flag means that the test pattern should not compile if I CU was configured

3468 // to not include break iteration. RBBI is needed for Unicode w ord boundaries.

3469 if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORT ED_ERROR) {

3470 goto cleanupAndReturn;

3471 }

3472 #endif

3473 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'

3474 // Expected pattern compilation error.

3475 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'

3476 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(s tatus));

3477 }

3478 goto cleanupAndReturn;

3479 } else {

3480 // Unexpected pattern compilation error.

3481 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_err orName(status));

3482 goto cleanupAndReturn;

3483 }

3484 }

3485 }

3486

3487 if (UTF8Pattern == NULL) {

3488 // UTF-8 does not allow unpaired surrogates, so this could actually happ en without being a failure of the engine

3489 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);

3490 status = U_ZERO_ERROR;

3491 }

3492

3493 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag

3494 callerPattern->dumpPattern();

3495 }

3496

3497 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag

3498 errln("%s, Line %d: Expected, but did not get, a pattern compilation err or.", srcPath, line);

3499 goto cleanupAndReturn;

3500 }

3501

3502

3503 //

3504 // Number of times find() should be called on the test string, default to 1

3505 //

3506 numFinds = 1;

3507 for (i=2; i<=9; i++) {

3508 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag

3509 if (numFinds != 1) {

3510 errln("Line %d: more than one digit flag. Scanning %d.", line, i);

3511 goto cleanupAndReturn;

3512 }

3513 numFinds = i;

3514 }

3515 }

3516

3517 // 'M' flag. Use matches() instead of find()

3518 if (flags.indexOf((UChar)0x4d) >= 0) {

3519 useMatchesFunc = TRUE;

3520 }

3521 if (flags.indexOf((UChar)0x4c) >= 0) {

3522 useLookingAtFunc = TRUE;

3523 }

3524

3525 //

3526 // Find the tags in the input data, remove them, and record the group bound ary

3527 // positions.

3528 //

3529 parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);

3530 REGEX_CHECK_STATUS_L(line);

3531

3532 unEscapedInput = inputString.unescape();

3533 parseMatcher = parsePat->matcher(unEscapedInput, status);

3534 REGEX_CHECK_STATUS_L(line);

3535 while(parseMatcher->find()) {

3536 parseMatcher->appendReplacement(deTaggedInput, "", status);

3537 REGEX_CHECK_STATUS;

3538 UnicodeString groupNum = parseMatcher->group(2, status);

3539 if (groupNum == "r") {

3540 // <r> or </r>, a region specification within the string

3541 if (parseMatcher->group(1, status) == "/") {

3542 regionEnd = deTaggedInput.length();

3543 } else {

3544 regionStart = deTaggedInput.length();

3545 }

3546 } else {

3547 // <digits> or </digits>, a group match boundary tag.

3548 if (parseMatcher->group(1, status) == "/") {

3549 set(groupEnds, deTaggedInput.length(), groupNum);

3550 } else {

3551 set(groupStarts, deTaggedInput.length(), groupNum);

3552 }

3553 }

3554 }

3555 parseMatcher->appendTail(deTaggedInput);

3556 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);

3557 if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>region End)) {

3558 errln("mismatched <r> tags");

3559 failed = TRUE;

3560 goto cleanupAndReturn;

3561 }

3562

3563 //

3564 // Configure the matcher according to the flags specified with this test.

3565 //

3566 matcher = callerPattern->matcher(deTaggedInput, status);

3567 REGEX_CHECK_STATUS_L(line);

3568 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag

3569 matcher->setTrace(TRUE);

3570 }

3571

3572 if (UTF8Pattern != NULL) {

3573 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);

3574 status = U_ZERO_ERROR; // buffer overflow

3575 inputChars = new char[inputUTF8Length+1];

3576 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, stat us);

3577 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);

3578

3579 if (status == U_ZERO_ERROR) {

3580 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);

3581 REGEX_CHECK_STATUS_L(line);

3582 }

3583

3584 if (UTF8Matcher == NULL) {

3585 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine

3586 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d" , srcPath, line);

3587 status = U_ZERO_ERROR;

3588 }

3589 }

3590

3591 //

3592 // Generate native indices for UTF8 versions of region and capture group in fo

3593 //

3594 if (UTF8Matcher != NULL) {

3595 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStar t, regionStartUTF8);

3596 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);

3597

3598 // Fill out the native index UVector info.

3599 // Only need 1 loop, from above we know groupStarts.size() = groupEnds. size()

3600 for (i=0; i<groupStarts.size(); i++) {

3601 int32_t start = groupStarts.elementAti(i);

3602 // -1 means there was no UVector slot and we won't be requesting th at capture group for this test, don't bother inserting

3603 if (start >= 0) {

3604 int32_t startUTF8;

3605 if (!utextOffsetToNative(&inputText, start, startUTF8)) {

3606 errln("Error at line %d: could not find native index for gro up start %d. UTF16 index %d", line, i, start);

3607 failed = TRUE;

3608 goto cleanupAndReturn; // Good chance of subsequent bogus e rrors. Stop now.

3609 }

3610 setInt(groupStartsUTF8, startUTF8, i);

3611 }

3612

3613 int32_t end = groupEnds.elementAti(i);

3614 // -1 means there was no UVector slot and we won't be requesting th at capture group for this test, don't bother inserting

3615 if (end >= 0) {

3616 int32_t endUTF8;

3617 if (!utextOffsetToNative(&inputText, end, endUTF8)) {

3618 errln("Error at line %d: could not find native index for gro up end %d. UTF16 index %d", line, i, end);

3619 failed = TRUE;

3620 goto cleanupAndReturn; // Good chance of subsequent bogus e rrors. Stop now.

3621 }

3622 setInt(groupEndsUTF8, endUTF8, i);

3623 }

3624 }

3625 }

3626

3627 if (regionStart>=0) {

3628 matcher->region(regionStart, regionEnd, status);

3629 REGEX_CHECK_STATUS_L(line);

3630 if (UTF8Matcher != NULL) {

3631 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);

3632 REGEX_CHECK_STATUS_L(line);

3633 }

3634 }

3635 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag

3636 matcher->useAnchoringBounds(FALSE);

3637 if (UTF8Matcher != NULL) {

3638 UTF8Matcher->useAnchoringBounds(FALSE);

3639 }

3640 }

3641 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag

3642 matcher->useTransparentBounds(TRUE);

3643 if (UTF8Matcher != NULL) {

3644 UTF8Matcher->useTransparentBounds(TRUE);

3645 }

3646 }

3647

3648

3649

3650 //

3651 // Do a find on the de-tagged input using the caller's pattern

3652 // TODO: error on count>1 and not find().

3653 // error on both matches() and lookingAt().

3654 //

3655 for (i=0; i<numFinds; i++) {

3656 if (useMatchesFunc) {

3657 isMatch = matcher->matches(status);

3658 if (UTF8Matcher != NULL) {

3659 isUTF8Match = UTF8Matcher->matches(status);

3660 }

3661 } else if (useLookingAtFunc) {

3662 isMatch = matcher->lookingAt(status);

3663 if (UTF8Matcher != NULL) {

3664 isUTF8Match = UTF8Matcher->lookingAt(status);

3665 }

3666 } else {

3667 isMatch = matcher->find();

3668 if (UTF8Matcher != NULL) {

3669 isUTF8Match = UTF8Matcher->find();

3670 }

3671 }

3672 }

3673 matcher->setTrace(FALSE);

3674 if (U_FAILURE(status)) {

3675 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));

3676 }

3677

3678 //

3679 // Match up the groups from the find() with the groups from the tags

3680 //

3681

3682 // number of tags should match number of groups from find operation.

3683 // matcher->groupCount does not include group 0, the entire match, hence the +1.

3684 // G option in test means that capture group data is not available in the

3685 // expected results, so the check needs to be suppressed.

3686 if (isMatch == FALSE && groupStarts.size() != 0) {

3687 dataerrln("Error at line %d: Match expected, but none found.", line);

3688 failed = TRUE;

3689 goto cleanupAndReturn;

3690 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {

3691 errln("Error at line %d: Match expected, but none found. (UTF8)", line) ;

3692 failed = TRUE;

3693 goto cleanupAndReturn;

3694 }

3695

3696 if (flags.indexOf((UChar)0x47 /G/) >= 0) {

3697 // Only check for match / no match. Don't check capture groups.

3698 if (isMatch && groupStarts.size() == 0) {

3699 errln("Error at line %d: No match expected, but one found.", line);

3700 failed = TRUE;

3701 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0 ) {

3702 errln("Error at line %d: No match expected, but one found. (UTF8)", line);

3703 failed = TRUE;

3704 }

3705 goto cleanupAndReturn;

3706 }

3707

3708 REGEX_CHECK_STATUS_L(line);

3709 for (i=0; i<=matcher->groupCount(); i++) {

3710 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elem entAti(i));

3711 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupSta rtsUTF8.elementAti(i));

3712 if (matcher->start(i, status) != expectedStart) {

3713 errln("Error at line %d: incorrect start position for group %d. Exp ected %d, got %d",

3714 line, i, expectedStart, matcher->start(i, status));

3715 failed = TRUE;

3716 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.

3717 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expec tedStartUTF8) {

3718 errln("Error at line %d: incorrect start position for group %d. Exp ected %d, got %d (UTF8)",

3719 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));

3720 failed = TRUE;

3721 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.

3722 }

3723

3724 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti (i));

3725 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF 8.elementAti(i));

3726 if (matcher->end(i, status) != expectedEnd) {

3727 errln("Error at line %d: incorrect end position for group %d. Expec ted %d, got %d",

3728 line, i, expectedEnd, matcher->end(i, status));

3729 failed = TRUE;

3730 // Error on end position; keep going; real error is probably yet to come as group

3731 // end positions work from end of the input data towards the front .

3732 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expecte dEndUTF8) {

3733 errln("Error at line %d: incorrect end position for group %d. Expec ted %d, got %d (UTF8)",

3734 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));

3735 failed = TRUE;

3736 // Error on end position; keep going; real error is probably yet to come as group

3737 // end positions work from end of the input data towards the front .

3738 }

3739 }

3740 if ( matcher->groupCount()+1 < groupStarts.size()) {

3741 errln("Error at line %d: Expected %d capture groups, found %d.",

3742 line, groupStarts.size()-1, matcher->groupCount());

3743 failed = TRUE;

3744 }

3745 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.si ze()) {

3746 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",

3747 line, groupStarts.size()-1, UTF8Matcher->groupCount());

3748 failed = TRUE;

3749 }

3750

3751 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == fa lse

3752 matcher->requireEnd() == TRUE) {

3753 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", l ine);

3754 failed = TRUE;

3755 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false

3756 UTF8Matcher->requireEnd() == TRUE) {

3757 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UT F8)", line);

3758 failed = TRUE;

3759 }

3760

3761 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == tr ue

3762 matcher->requireEnd() == FALSE) {

3763 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", l ine);

3764 failed = TRUE;

3765 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false

3766 UTF8Matcher->requireEnd() == FALSE) {

3767 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UT F8)", line);

3768 failed = TRUE;

3769 }

3770

3771 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false

3772 matcher->hitEnd() == TRUE) {

3773 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line) ;

3774 failed = TRUE;

3775 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false

3776 UTF8Matcher->hitEnd() == TRUE) {

3777 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)" , line);

3778 failed = TRUE;

3779 }

3780

3781 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true

3782 matcher->hitEnd() == FALSE) {

3783 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line) ;

3784 failed = TRUE;

3785 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true

3786 UTF8Matcher->hitEnd() == FALSE) {

3787 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)" , line);

3788 failed = TRUE;

3789 }

3790

3791

3792 cleanupAndReturn:

3793 if (failed) {

3794 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "

3795 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");

3796 // callerPattern->dump();

3797 }

3798 delete parseMatcher;

3799 delete parsePat;

3800 delete UTF8Matcher;

3801 delete UTF8Pattern;

3802 delete matcher;

3803 delete callerPattern;

3804

3805 utext_close(&inputText);

3806 delete[] inputChars;

3807 utext_close(&patternText);

3808 delete[] patternChars;

3809 ucnv_close(UTF8Converter);

3810 }

3811

3812

3813

3814

3815 //---------------------------------------------------------------------------

3816 //

3817 // Errors Check for error handling in patterns.

3818 //

3819 //---------------------------------------------------------------------------

3820 void RegexTest::Errors() {

3821 // \escape sequences that aren't implemented yet.

3822 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEME NTED);

3823

3824 // Missing close parentheses

3825 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);

3826 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);

3827 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PA REN);

3828

3829 // Extra close paren

3830 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_ PAREN);

3831 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);

3832 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

3833

3834 // Look-ahead, Look-behind

3835 // TODO: add tests for unbounded length look-behinds.

3836 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal cons truct

3837

3838 // Attempt to use non-default flags

3839 {

3840 UParseError pe;

3841 UErrorCode status = U_ZERO_ERROR;

3842 int32_t flags = UREGEX_CANON_EQ \|

3843 UREGEX_COMMENTS \| UREGEX_DOTALL \|

3844 UREGEX_MULTILINE;

3845 RegexPattern pat1= RegexPattern::compile(".", flags, pe, status);

3846 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);

3847 delete pat1;

3848 }

3849

3850

3851 // Quantifiers are allowed only after something that can be quantified.

3852 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);

3853 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);

3854 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

3855

3856 // Mal-formed {min,max} quantifiers

3857 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);

3858 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);

3859 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);

3860 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);

3861 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);

3862 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);

3863 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov erflows int during scan

3864 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov erflows regex binary format

3865 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

3866

3867 // Ticket 5389

3868 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

3869

3870 // Invalid Back Reference \0

3871 // For ICU 3.8 and earlier

3872 // For ICU versions newer than 3.8, \0 introduces an octal escape.

3873 //

3874 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

3875

3876 }

3877

3878

3879 //------------------------------------------------------------------------------ -

3880 //

3881 // Read a text data file, convert it to UChars, and return the data

3882 // in one big UChar * buffer, which the caller must delete.

3883 //

3884 //------------------------------------------------------------------------------ --

3885 UChar RegexTest::ReadAndConvertFile(const char fileName, int32_t &ulen,

3886 const char *defEncoding, UErrorCode &status ) {

3887 UChar *retPtr = NULL;

3888 char *fileBuf = NULL;

3889 UConverter* conv = NULL;

3890 FILE *f = NULL;

3891

3892 ulen = 0;

3893 if (U_FAILURE(status)) {

3894 return retPtr;

3895 }

3896

3897 //

3898 // Open the file.

3899 //

3900 f = fopen(fileName, "rb");

3901 if (f == 0) {

3902 dataerrln("Error opening test data file %s\n", fileName);

3903 status = U_FILE_ACCESS_ERROR;

3904 return NULL;

3905 }

3906 //

3907 // Read it in

3908 //

3909 int32_t fileSize;

3910 int32_t amt_read;

3911

3912 fseek( f, 0, SEEK_END);

3913 fileSize = ftell(f);

3914 fileBuf = new char[fileSize];

3915 fseek(f, 0, SEEK_SET);

3916 amt_read = fread(fileBuf, 1, fileSize, f);

3917 if (amt_read != fileSize \|\| fileSize <= 0) {

3918 errln("Error reading test data file.");

3919 goto cleanUpAndReturn;

3920 }

3921

3922 //

3923 // Look for a Unicode Signature (BOM) on the data just read

3924 //

3925 int32_t signatureLength;

3926 const char * fileBufC;

3927 const char* encoding;

3928

3929 fileBufC = fileBuf;

3930 encoding = ucnv_detectUnicodeSignature(

3931 fileBuf, fileSize, &signatureLength, &status);

3932 if(encoding!=NULL ){

3933 fileBufC += signatureLength;

3934 fileSize -= signatureLength;

3935 } else {

3936 encoding = defEncoding;

3937 if (strcmp(encoding, "utf-8") == 0) {

3938 errln("file %s is missing its BOM", fileName);

3939 }

3940 }

3941

3942 //

3943 // Open a converter to take the rule file to UTF-16

3944 //

3945 conv = ucnv_open(encoding, &status);

3946 if (U_FAILURE(status)) {

3947 goto cleanUpAndReturn;

3948 }

3949

3950 //

3951 // Convert the rules to UChar.

3952 // Preflight first to determine required buffer size.

3953 //

3954 ulen = ucnv_toUChars(conv,

3955 NULL, // dest,

3956 0, // destCapacity,

3957 fileBufC,

3958 fileSize,

3959 &status);

3960 if (status == U_BUFFER_OVERFLOW_ERROR) {

3961 // Buffer Overflow is expected from the preflight operation.

3962 status = U_ZERO_ERROR;

3963

3964 retPtr = new UChar[ulen+1];

3965 ucnv_toUChars(conv,

3966 retPtr, // dest,

3967 ulen+1,

3968 fileBufC,

3969 fileSize,

3970 &status);

3971 }

3972

3973 cleanUpAndReturn:

3974 fclose(f);

3975 delete[] fileBuf;

3976 ucnv_close(conv);

3977 if (U_FAILURE(status)) {

3978 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

3979 delete []retPtr;

3980 retPtr = 0;

3981 ulen = 0;

3982 };

3983 return retPtr;

3984 }

3985

3986

3987 //------------------------------------------------------------------------------ -

3988 //

3989 // PerlTests - Run Perl's regular expression tests

3990 // The input file for this test is re_tests, the standard regular

3991 // expression test data distributed with the Perl source code.

3992 //

3993 // Here is Perl's description of the test data file:

3994 //

3995 // # The tests are in a separate file 't/op/re_tests'.

3996 // # Each line in that file is a separate test.

3997 // # There are five columns, separated by tabs.

3998 // #

3999 // # Column 1 contains the pattern, optionally enclosed in C<''>.

4000 // # Modifiers can be put after the closing C<'>.

4001 // #

4002 // # Column 2 contains the string to be matched.

4003 // #

4004 // # Column 3 contains the expected result:

4005 // # y expect a match

4006 // # n expect no match

4007 // # c expect an error

4008 // # B test exposes a known bug in Perl, should be skipped

4009 // # b test exposes a known bug in Perl, should be skipped if noamp

4010 // #

4011 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.

4012 // #

4013 // # Column 4 contains a string, usually C<$&>.

4014 // #

4015 // # Column 5 contains the expected result of double-quote

4016 // # interpolating that string after the match, or start of error message .

4017 // #

4018 // # Column 6, if present, contains a reason why the test is skipped.

4019 // # This is printed with "skipped", for harness to pick up.

4020 // #

4021 // # \n in the tests are interpolated, as are variables of the form ${\w+ }.

4022 // #

4023 // # If you want to add a regular expression test that can't be expressed

4024 // # in this format, don't add it here: put it in op/pat.t instead.

4025 //

4026 // For ICU, if field 3 contains an 'i', the test will be skipped.

4027 // Such a test exposes some known incompatibility between ICU and Perl regexps.

4028 // (The i is in addition to whatever was there before.)

4029 //

4030 //------------------------------------------------------------------------------ -

4031 void RegexTest::PerlTests() {

4032 char tdd[2048];

4033 const char *srcPath;

4034 UErrorCode status = U_ZERO_ERROR;

4035 UParseError pe;

4036

4037 //

4038 // Open and read the test data file.

4039 //

4040 srcPath=getPath(tdd, "re_tests.txt");

4041 if(srcPath==NULL) {

4042 return; /* something went wrong, error already output */

4043 }

4044

4045 int32_t len;

4046 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);

4047 if (U_FAILURE(status)) {

4048 return; /* something went wrong, error already output */

4049 }

4050

4051 //

4052 // Put the test data into a UnicodeString

4053 //

4054 UnicodeString testDataString(FALSE, testData, len);

4055

4056 //

4057 // Regex to break the input file into lines, and strip the new lines.

4058 // One line per match, capture group one is the desired data.

4059 //

4060 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\ \r\\n]+"), 0, pe, status);

4061 if (U_FAILURE(status)) {

4062 dataerrln("RegexPattern::compile() error");

4063 return;

4064 }

4065 RegexMatcher* lineMat = linePat->matcher(testDataString, status);

4066

4067 //

4068 // Regex to split a test file line into fields.

4069 // There are six fields, separated by tabs.

4070 //

4071 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);

4072

4073 //

4074 // Regex to identify test patterns with flag settings, and to separate them .

4075 // Test patterns with flags look like 'pattern'i

4076 // Test patterns without flags are not quoted: pattern

4077 // Coming out, capture group 2 is the pattern, capture group 3 is the flag s.

4078 //

4079 RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(. )\\1(.*)"), 0, pe, status);

4080 RegexMatcher* flagMat = flagPat->matcher(status);

4081

4082 //

4083 // The Perl tests reference several perl-isms, which are evaluated/substitut ed

4084 // in the test data. Not being perl, this must be done explicitly. Here

4085 // are string constants and REs for these constructs.

4086 //

4087 UnicodeString nulnulSrc("${nulnul}");

4088 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);

4089 nulnul = nulnul.unescape();

4090

4091 UnicodeString ffffSrc("${ffff}");

4092 UnicodeString ffff("\\uffff", -1, US_INV);

4093 ffff = ffff.unescape();

4094

4095 // regexp for $-[0], $+[2], etc.

4096 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([ +\\-])\\[(\\d+)\\]"), 0, pe, status);

4097 RegexMatcher *groupsMat = groupsPat->matcher(status);

4098

4099 // regexp for $0, $1, $2, etc.

4100 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+) "), 0, pe, status);

4101 RegexMatcher *cgMat = cgPat->matcher(status);

4102

4103

4104 //

4105 // Main Loop for the Perl Tests, runs once per line from the

4106 // test data file.

4107 //

4108 int32_t lineNum = 0;

4109 int32_t skippedUnimplementedCount = 0;

4110 while (lineMat->find()) {

4111 lineNum++;

4112

4113 //

4114 // Get a line, break it into its fields, do the Perl

4115 // variable substitutions.

4116 //

4117 UnicodeString line = lineMat->group(1, status);

4118 UnicodeString fields[7];

4119 fieldPat->split(line, fields, 7, status);

4120

4121 flagMat->reset(fields[0]);

4122 flagMat->matches(status);

4123 UnicodeString pattern = flagMat->group(2, status);

4124 pattern.findAndReplace("${bang}", "!");

4125 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000" ));

4126 pattern.findAndReplace(ffffSrc, ffff);

4127

4128 //

4129 // Identify patterns that include match flag settings,

4130 // split off the flags, remove the extra quotes.

4131 //

4132 UnicodeString flagStr = flagMat->group(3, status);

4133 if (U_FAILURE(status)) {

4134 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

4135 return;

4136 }

4137 int32_t flags = 0;

4138 const UChar UChar_c = 0x63; // Char constants for the flag letters.

4139 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C )

4140 const UChar UChar_m = 0x6d;

4141 const UChar UChar_x = 0x78;

4142 const UChar UChar_y = 0x79;

4143 if (flagStr.indexOf(UChar_i) != -1) {

4144 flags \|= UREGEX_CASE_INSENSITIVE;

4145 }

4146 if (flagStr.indexOf(UChar_m) != -1) {

4147 flags \|= UREGEX_MULTILINE;

4148 }

4149 if (flagStr.indexOf(UChar_x) != -1) {

4150 flags \|= UREGEX_COMMENTS;

4151 }

4152

4153 //

4154 // Compile the test pattern.

4155 //

4156 status = U_ZERO_ERROR;

4157 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status );

4158 if (status == U_REGEX_UNIMPLEMENTED) {

4159 //

4160 // Test of a feature that is planned for ICU, but not yet implemente d.

4161 // skip the test.

4162 skippedUnimplementedCount++;

4163 delete testPat;

4164 status = U_ZERO_ERROR;

4165 continue;

4166 }

4167

4168 if (U_FAILURE(status)) {

4169 // Some tests are supposed to generate errors.

4170 // Only report an error for tests that are supposed to succeed.

4171 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supp osed to fail AND

4172 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility

4173 {

4174 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status ));

4175 }

4176 status = U_ZERO_ERROR;

4177 delete testPat;

4178 continue;

4179 }

4180

4181 if (fields[2].indexOf(UChar_i) >= 0) {

4182 // ICU should skip this test.

4183 delete testPat;

4184 continue;

4185 }

4186

4187 if (fields[2].indexOf(UChar_c) >= 0) {

4188 // This pattern should have caused a compilation error, but didn't/

4189 errln("line %d: Expected a pattern compile error, got success.", lin eNum);

4190 delete testPat;

4191 continue;

4192 }

4193

4194 //

4195 // replace the Perl variables that appear in some of the

4196 // match data strings.

4197 //

4198 UnicodeString matchString = fields[1];

4199 matchString.findAndReplace(nulnulSrc, nulnul);

4200 matchString.findAndReplace(ffffSrc, ffff);

4201

4202 // Replace any \n in the match string with an actual new-line char.

4203 // Don't do full unescape, as this unescapes more than Perl does, which

4204 // causes other spurious failures in the tests.

4205 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

4206

4207

4208

4209 //

4210 // Run the test, check for expected match/don't match result.

4211 //

4212 RegexMatcher *testMat = testPat->matcher(matchString, status);

4213 UBool found = testMat->find();

4214 UBool expected = FALSE;

4215 if (fields[2].indexOf(UChar_y) >=0) {

4216 expected = TRUE;

4217 }

4218 if (expected != found) {

4219 errln("line %d: Expected %smatch, got %smatch",

4220 lineNum, expected?"":"no ", found?"":"no " );

4221 continue;

4222 }

4223

4224 // Don't try to check expected results if there is no match.

4225 // (Some have stuff in the expected fields)

4226 if (!found) {

4227 delete testMat;

4228 delete testPat;

4229 continue;

4230 }

4231

4232 //

4233 // Interpret the Perl expression from the fourth field of the data file,

4234 // building up an ICU string from the results of the ICU match.

4235 // The Perl expression will contain references to the results of

4236 // a regex match, including the matched string, capture group string s,

4237 // group starting and ending indicies, etc.

4238 //

4239 UnicodeString resultString;

4240 UnicodeString perlExpr = fields[3];

4241 #if SUPPORT_MUTATING_INPUT_STRING

4242 groupsMat->reset(perlExpr);

4243 cgMat->reset(perlExpr);

4244 #endif

4245

4246 while (perlExpr.length() > 0) {

4247 #if !SUPPORT_MUTATING_INPUT_STRING

4248 // Perferred usage. Reset after any modification to input string.

4249 groupsMat->reset(perlExpr);

4250 cgMat->reset(perlExpr);

4251 #endif

4252

4253 if (perlExpr.startsWith("$&")) {

4254 resultString.append(testMat->group(status));

4255 perlExpr.remove(0, 2);

4256 }

4257

4258 else if (groupsMat->lookingAt(status)) {

4259 // $-[0] $+[2] etc.

4260 UnicodeString digitString = groupsMat->group(2, status);

4261 int32_t t = 0;

4262 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

4263 UnicodeString plusOrMinus = groupsMat->group(1, status);

4264 int32_t matchPosition;

4265 if (plusOrMinus.compare("+") == 0) {

4266 matchPosition = testMat->end(groupNum, status);

4267 } else {

4268 matchPosition = testMat->start(groupNum, status);

4269 }

4270 if (matchPosition != -1) {

4271 ICU_Utility::appendNumber(resultString, matchPosition);

4272 }

4273 perlExpr.remove(0, groupsMat->end(status));

4274 }

4275

4276 else if (cgMat->lookingAt(status)) {

4277 // $1, $2, $3, etc.

4278 UnicodeString digitString = cgMat->group(1, status);

4279 int32_t t = 0;

4280 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

4281 if (U_SUCCESS(status)) {

4282 resultString.append(testMat->group(groupNum, status));

4283 status = U_ZERO_ERROR;

4284 }

4285 perlExpr.remove(0, cgMat->end(status));

4286 }

4287

4288 else if (perlExpr.startsWith("@-")) {

4289 int32_t i;

4290 for (i=0; i<=testMat->groupCount(); i++) {

4291 if (i>0) {

4292 resultString.append(" ");

4293 }

4294 ICU_Utility::appendNumber(resultString, testMat->start(i, st atus));

4295 }

4296 perlExpr.remove(0, 2);

4297 }

4298

4299 else if (perlExpr.startsWith("@+")) {

4300 int32_t i;

4301 for (i=0; i<=testMat->groupCount(); i++) {

4302 if (i>0) {

4303 resultString.append(" ");

4304 }

4305 ICU_Utility::appendNumber(resultString, testMat->end(i, stat us));

4306 }

4307 perlExpr.remove(0, 2);

4308 }

4309

4310 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \ Escape. Take following char as a literal.

4311 // or as an escap ed sequence (e.g. \n)

4312 if (perlExpr.length() > 1) {

4313 perlExpr.remove(0, 1); // Remove the '\', but only if not l ast char.

4314 }

4315 UChar c = perlExpr.charAt(0);

4316 switch (c) {

4317 case 'n': c = '\n'; break;

4318 // add any other escape sequences that show up in the test expec ted results.

4319 }

4320 resultString.append(c);

4321 perlExpr.remove(0, 1);

4322 }

4323

4324 else {

4325 // Any characters from the perl expression that we don't explici tly

4326 // recognize before here are assumed to be literals and copied

4327 // as-is to the expected results.

4328 resultString.append(perlExpr.charAt(0));

4329 perlExpr.remove(0, 1);

4330 }

4331

4332 if (U_FAILURE(status)) {

4333 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)) ;

4334 break;

4335 }

4336 }

4337

4338 //

4339 // Expected Results Compare

4340 //

4341 UnicodeString expectedS(fields[4]);

4342 expectedS.findAndReplace(nulnulSrc, nulnul);

4343 expectedS.findAndReplace(ffffSrc, ffff);

4344 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

4345

4346

4347 if (expectedS.compare(resultString) != 0) {

4348 err("Line %d: Incorrect perl expression results.", lineNum);

4349 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; go t \""+resultString+(UnicodeString)"\"");

4350 }

4351

4352 delete testMat;

4353 delete testPat;

4354 }

4355

4356 //

4357 // All done. Clean up allocated stuff.

4358 //

4359 delete cgMat;

4360 delete cgPat;

4361

4362 delete groupsMat;

4363 delete groupsPat;

4364

4365 delete flagMat;

4366 delete flagPat;

4367

4368 delete lineMat;

4369 delete linePat;

4370

4371 delete fieldPat;

4372 delete [] testData;

4373

4374

4375 logln("%d tests skipped because of unimplemented regexp features.", skippedU nimplementedCount);

4376

4377 }

4378

4379

4380 //------------------------------------------------------------------------------ -

4381 //

4382 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts

4383 // (instead of using UnicodeStrings) to test the alternate engi ne.

4384 // The input file for this test is re_tests, the standard regul ar

4385 // expression test data distributed with the Perl source code.

4386 // See PerlTests() for more information.

4387 //

4388 //------------------------------------------------------------------------------ -

4389 void RegexTest::PerlTestsUTF8() {

4390 char tdd[2048];

4391 const char *srcPath;

4392 UErrorCode status = U_ZERO_ERROR;

4393 UParseError pe;

4394 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));

4395 UText patternText = UTEXT_INITIALIZER;

4396 char *patternChars = NULL;

4397 int32_t patternLength;

4398 int32_t patternCapacity = 0;

4399 UText inputText = UTEXT_INITIALIZER;

4400 char *inputChars = NULL;

4401 int32_t inputLength;

4402 int32_t inputCapacity = 0;

4403

4404 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, N ULL, NULL, NULL, &status);

4405

4406 //

4407 // Open and read the test data file.

4408 //

4409 srcPath=getPath(tdd, "re_tests.txt");

4410 if(srcPath==NULL) {

4411 return; /* something went wrong, error already output */

4412 }

4413

4414 int32_t len;

4415 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);

4416 if (U_FAILURE(status)) {

4417 return; /* something went wrong, error already output */

4418 }

4419

4420 //

4421 // Put the test data into a UnicodeString

4422 //

4423 UnicodeString testDataString(FALSE, testData, len);

4424

4425 //

4426 // Regex to break the input file into lines, and strip the new lines.

4427 // One line per match, capture group one is the desired data.

4428 //

4429 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\ \r\\n]+"), 0, pe, status);

4430 if (U_FAILURE(status)) {

4431 dataerrln("RegexPattern::compile() error");

4432 return;

4433 }

4434 RegexMatcher* lineMat = linePat->matcher(testDataString, status);

4435

4436 //

4437 // Regex to split a test file line into fields.

4438 // There are six fields, separated by tabs.

4439 //

4440 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);

4441

4442 //

4443 // Regex to identify test patterns with flag settings, and to separate them .

4444 // Test patterns with flags look like 'pattern'i

4445 // Test patterns without flags are not quoted: pattern

4446 // Coming out, capture group 2 is the pattern, capture group 3 is the flag s.

4447 //

4448 RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(. )\\1(.*)"), 0, pe, status);

4449 RegexMatcher* flagMat = flagPat->matcher(status);

4450

4451 //

4452 // The Perl tests reference several perl-isms, which are evaluated/substitut ed

4453 // in the test data. Not being perl, this must be done explicitly. Here

4454 // are string constants and REs for these constructs.

4455 //

4456 UnicodeString nulnulSrc("${nulnul}");

4457 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);

4458 nulnul = nulnul.unescape();

4459

4460 UnicodeString ffffSrc("${ffff}");

4461 UnicodeString ffff("\\uffff", -1, US_INV);

4462 ffff = ffff.unescape();

4463

4464 // regexp for $-[0], $+[2], etc.

4465 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([ +\\-])\\[(\\d+)\\]"), 0, pe, status);

4466 RegexMatcher *groupsMat = groupsPat->matcher(status);

4467

4468 // regexp for $0, $1, $2, etc.

4469 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+) "), 0, pe, status);

4470 RegexMatcher *cgMat = cgPat->matcher(status);

4471

4472

4473 //

4474 // Main Loop for the Perl Tests, runs once per line from the

4475 // test data file.

4476 //

4477 int32_t lineNum = 0;

4478 int32_t skippedUnimplementedCount = 0;

4479 while (lineMat->find()) {

4480 lineNum++;

4481

4482 //

4483 // Get a line, break it into its fields, do the Perl

4484 // variable substitutions.

4485 //

4486 UnicodeString line = lineMat->group(1, status);

4487 UnicodeString fields[7];

4488 fieldPat->split(line, fields, 7, status);

4489

4490 flagMat->reset(fields[0]);

4491 flagMat->matches(status);

4492 UnicodeString pattern = flagMat->group(2, status);

4493 pattern.findAndReplace("${bang}", "!");

4494 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000" ));

4495 pattern.findAndReplace(ffffSrc, ffff);

4496

4497 //

4498 // Identify patterns that include match flag settings,

4499 // split off the flags, remove the extra quotes.

4500 //

4501 UnicodeString flagStr = flagMat->group(3, status);

4502 if (U_FAILURE(status)) {

4503 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

4504 return;

4505 }

4506 int32_t flags = 0;

4507 const UChar UChar_c = 0x63; // Char constants for the flag letters.

4508 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C )

4509 const UChar UChar_m = 0x6d;

4510 const UChar UChar_x = 0x78;

4511 const UChar UChar_y = 0x79;

4512 if (flagStr.indexOf(UChar_i) != -1) {

4513 flags \|= UREGEX_CASE_INSENSITIVE;

4514 }

4515 if (flagStr.indexOf(UChar_m) != -1) {

4516 flags \|= UREGEX_MULTILINE;

4517 }

4518 if (flagStr.indexOf(UChar_x) != -1) {

4519 flags \|= UREGEX_COMMENTS;

4520 }

4521

4522 //

4523 // Put the pattern in a UTF-8 UText

4524 //

4525 status = U_ZERO_ERROR;

4526 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Conve rter.getAlias(), status);

4527 if (status == U_BUFFER_OVERFLOW_ERROR) {

4528 status = U_ZERO_ERROR;

4529 delete[] patternChars;

4530 patternCapacity = patternLength + 1;

4531 patternChars = new char[patternCapacity];

4532 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlia s(), status);

4533 }

4534 utext_openUTF8(&patternText, patternChars, patternLength, &status);

4535

4536 //

4537 // Compile the test pattern.

4538 //

4539 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, s tatus);

4540 if (status == U_REGEX_UNIMPLEMENTED) {

4541 //

4542 // Test of a feature that is planned for ICU, but not yet implemente d.

4543 // skip the test.

4544 skippedUnimplementedCount++;

4545 delete testPat;

4546 status = U_ZERO_ERROR;

4547 continue;

4548 }

4549

4550 if (U_FAILURE(status)) {

4551 // Some tests are supposed to generate errors.

4552 // Only report an error for tests that are supposed to succeed.

4553 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supp osed to fail AND

4554 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility

4555 {

4556 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status ));

4557 }

4558 status = U_ZERO_ERROR;

4559 delete testPat;

4560 continue;

4561 }

4562

4563 if (fields[2].indexOf(UChar_i) >= 0) {

4564 // ICU should skip this test.

4565 delete testPat;

4566 continue;

4567 }

4568

4569 if (fields[2].indexOf(UChar_c) >= 0) {

4570 // This pattern should have caused a compilation error, but didn't/

4571 errln("line %d: Expected a pattern compile error, got success.", lin eNum);

4572 delete testPat;

4573 continue;

4574 }

4575

4576

4577 //

4578 // replace the Perl variables that appear in some of the

4579 // match data strings.

4580 //

4581 UnicodeString matchString = fields[1];

4582 matchString.findAndReplace(nulnulSrc, nulnul);

4583 matchString.findAndReplace(ffffSrc, ffff);

4584

4585 // Replace any \n in the match string with an actual new-line char.

4586 // Don't do full unescape, as this unescapes more than Perl does, which

4587 // causes other spurious failures in the tests.

4588 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

4589

4590 //

4591 // Put the input in a UTF-8 UText

4592 //

4593 status = U_ZERO_ERROR;

4594 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Convert er.getAlias(), status);

4595 if (status == U_BUFFER_OVERFLOW_ERROR) {

4596 status = U_ZERO_ERROR;

4597 delete[] inputChars;

4598 inputCapacity = inputLength + 1;

4599 inputChars = new char[inputCapacity];

4600 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlia s(), status);

4601 }

4602 utext_openUTF8(&inputText, inputChars, inputLength, &status);

4603

4604 //

4605 // Run the test, check for expected match/don't match result.

4606 //

4607 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);

4608 UBool found = testMat->find();

4609 UBool expected = FALSE;

4610 if (fields[2].indexOf(UChar_y) >=0) {

4611 expected = TRUE;

4612 }

4613 if (expected != found) {

4614 errln("line %d: Expected %smatch, got %smatch",

4615 lineNum, expected?"":"no ", found?"":"no " );

4616 continue;

4617 }

4618

4619 // Don't try to check expected results if there is no match.

4620 // (Some have stuff in the expected fields)

4621 if (!found) {

4622 delete testMat;

4623 delete testPat;

4624 continue;

4625 }

4626

4627 //

4628 // Interpret the Perl expression from the fourth field of the data file,

4629 // building up an ICU string from the results of the ICU match.

4630 // The Perl expression will contain references to the results of

4631 // a regex match, including the matched string, capture group string s,

4632 // group starting and ending indicies, etc.

4633 //

4634 UnicodeString resultString;

4635 UnicodeString perlExpr = fields[3];

4636

4637 while (perlExpr.length() > 0) {

4638 groupsMat->reset(perlExpr);

4639 cgMat->reset(perlExpr);

4640

4641 if (perlExpr.startsWith("$&")) {

4642 resultString.append(testMat->group(status));

4643 perlExpr.remove(0, 2);

4644 }

4645

4646 else if (groupsMat->lookingAt(status)) {

4647 // $-[0] $+[2] etc.

4648 UnicodeString digitString = groupsMat->group(2, status);

4649 int32_t t = 0;

4650 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

4651 UnicodeString plusOrMinus = groupsMat->group(1, status);

4652 int32_t matchPosition;

4653 if (plusOrMinus.compare("+") == 0) {

4654 matchPosition = testMat->end(groupNum, status);

4655 } else {

4656 matchPosition = testMat->start(groupNum, status);

4657 }

4658 if (matchPosition != -1) {

4659 ICU_Utility::appendNumber(resultString, matchPosition);

4660 }

4661 perlExpr.remove(0, groupsMat->end(status));

4662 }

4663

4664 else if (cgMat->lookingAt(status)) {

4665 // $1, $2, $3, etc.

4666 UnicodeString digitString = cgMat->group(1, status);

4667 int32_t t = 0;

4668 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

4669 if (U_SUCCESS(status)) {

4670 resultString.append(testMat->group(groupNum, status));

4671 status = U_ZERO_ERROR;

4672 }

4673 perlExpr.remove(0, cgMat->end(status));

4674 }

4675

4676 else if (perlExpr.startsWith("@-")) {

4677 int32_t i;

4678 for (i=0; i<=testMat->groupCount(); i++) {

4679 if (i>0) {

4680 resultString.append(" ");

4681 }

4682 ICU_Utility::appendNumber(resultString, testMat->start(i, st atus));

4683 }

4684 perlExpr.remove(0, 2);

4685 }

4686

4687 else if (perlExpr.startsWith("@+")) {

4688 int32_t i;

4689 for (i=0; i<=testMat->groupCount(); i++) {

4690 if (i>0) {

4691 resultString.append(" ");

4692 }

4693 ICU_Utility::appendNumber(resultString, testMat->end(i, stat us));

4694 }

4695 perlExpr.remove(0, 2);

4696 }

4697

4698 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \ Escape. Take following char as a literal.

4699 // or as an escap ed sequence (e.g. \n)

4700 if (perlExpr.length() > 1) {

4701 perlExpr.remove(0, 1); // Remove the '\', but only if not l ast char.

4702 }

4703 UChar c = perlExpr.charAt(0);

4704 switch (c) {

4705 case 'n': c = '\n'; break;

4706 // add any other escape sequences that show up in the test expec ted results.

4707 }

4708 resultString.append(c);

4709 perlExpr.remove(0, 1);

4710 }

4711

4712 else {

4713 // Any characters from the perl expression that we don't explici tly

4714 // recognize before here are assumed to be literals and copied

4715 // as-is to the expected results.

4716 resultString.append(perlExpr.charAt(0));

4717 perlExpr.remove(0, 1);

4718 }

4719

4720 if (U_FAILURE(status)) {

4721 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)) ;

4722 break;

4723 }

4724 }

4725

4726 //

4727 // Expected Results Compare

4728 //

4729 UnicodeString expectedS(fields[4]);

4730 expectedS.findAndReplace(nulnulSrc, nulnul);

4731 expectedS.findAndReplace(ffffSrc, ffff);

4732 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

4733

4734

4735 if (expectedS.compare(resultString) != 0) {

4736 err("Line %d: Incorrect perl expression results.", lineNum);

4737 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; go t \""+resultString+(UnicodeString)"\"");

4738 }

4739

4740 delete testMat;

4741 delete testPat;

4742 }

4743

4744 //

4745 // All done. Clean up allocated stuff.

4746 //

4747 delete cgMat;

4748 delete cgPat;

4749

4750 delete groupsMat;

4751 delete groupsPat;

4752

4753 delete flagMat;

4754 delete flagPat;

4755

4756 delete lineMat;

4757 delete linePat;

4758

4759 delete fieldPat;

4760 delete [] testData;

4761

4762 utext_close(&patternText);

4763 utext_close(&inputText);

4764

4765 delete [] patternChars;

4766 delete [] inputChars;

4767

4768

4769 logln("%d tests skipped because of unimplemented regexp features.", skippedU nimplementedCount);

4770

4771 }

4772

4773

4774 //--------------------------------------------------------------

4775 //

4776 // Bug6149 Verify limits to heap expansion for backtrack stack.

4777 // Use this pattern,

4778 // "(a?){1,8000000}"

4779 // Note: was an unbounded upperbounds, but that now has loop-breakin g enabled.

4780 // This test is likely to be fragile, as further optimizations stop

4781 // more cases of pointless looping in the match engine.

4782 //

4783 //---------------------------------------------------------------

4784 void RegexTest::Bug6149() {

4785 UnicodeString pattern("(a?){1,8000000}");

4786 UnicodeString s("xyz");

4787 uint32_t flags = 0;

4788 UErrorCode status = U_ZERO_ERROR;

4789

4790 RegexMatcher matcher(pattern, s, flags, status);

4791 UBool result = false;

4792 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);

4793 REGEX_ASSERT(result == FALSE);

4794 }

4795

4796

//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//

4803

4804 struct callBackContext {

4805 RegexTest *test;

4806 int32_t maxCalls;

4807 int32_t numCalls;

4808 int32_t lastSteps;

4809 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};

4810 };

4811

4812 U_CDECL_BEGIN

4813 static UBool U_CALLCONV

4814 testCallBackFn(const void *context, int32_t steps) {

4815 callBackContext info = (callBackContext )context;

4816 if (info->lastSteps+1 != steps) {

4817 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);

4818 }

4819 info->lastSteps = steps;

4820 info->numCalls++;

4821 return (info->numCalls < info->maxCalls);

4822 }

4823 U_CDECL_END

4824

4825 void RegexTest::Callbacks() {

4826 {

4827 // Getter returns NULLs if no callback has been set

4828

4829 // The variables that the getter will fill in.

4830 // Init to non-null values so that the action of the getter can be see n.

4831 const void *returnedContext = &returnedContext;

4832 URegexMatchCallback *returnedFn = &testCallBackFn;

4833

4834 UErrorCode status = U_ZERO_ERROR;

4835 RegexMatcher matcher("x", 0, status);

4836 REGEX_CHECK_STATUS;

4837 matcher.getMatchCallback(returnedFn, returnedContext, status);

4838 REGEX_CHECK_STATUS;

4839 REGEX_ASSERT(returnedFn == NULL);

4840 REGEX_ASSERT(returnedContext == NULL);

4841 }

4842

4843 {

4844 // Set and Get work

4845 callBackContext cbInfo = {this, 0, 0, 0};

4846 const void *returnedContext;

4847 URegexMatchCallback *returnedFn;

4848 UErrorCode status = U_ZERO_ERROR;

4849 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.

4850 REGEX_CHECK_STATUS;

4851 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);

4852 REGEX_CHECK_STATUS;

4853 matcher.getMatchCallback(returnedFn, returnedContext, status);

4854 REGEX_CHECK_STATUS;

4855 REGEX_ASSERT(returnedFn == testCallBackFn);

4856 REGEX_ASSERT(returnedContext == &cbInfo);

4857

4858 // A short-running match shouldn't invoke the callback

4859 status = U_ZERO_ERROR;

4860 cbInfo.reset(1);

4861 UnicodeString s = "xxx";

4862 matcher.reset(s);

4863 REGEX_ASSERT(matcher.matches(status));

4864 REGEX_CHECK_STATUS;

4865 REGEX_ASSERT(cbInfo.numCalls == 0);

4866

4867 // A medium-length match that runs long enough to invoke the

4868 // callback, but not so long that the callback aborts it.

4869 status = U_ZERO_ERROR;

4870 cbInfo.reset(4);

4871 s = "aaaaaaaaaaaaaaaaaaab";

4872 matcher.reset(s);

4873 REGEX_ASSERT(matcher.matches(status)==FALSE);

4874 REGEX_CHECK_STATUS;

4875 REGEX_ASSERT(cbInfo.numCalls > 0);

4876

4877 // A longer running match that the callback function will abort.

4878 status = U_ZERO_ERROR;

4879 cbInfo.reset(4);

4880 s = "aaaaaaaaaaaaaaaaaaaaaaab";

4881 matcher.reset(s);

4882 REGEX_ASSERT(matcher.matches(status)==FALSE);

4883 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);

4884 REGEX_ASSERT(cbInfo.numCalls == 4);

4885

4886 // A longer running find that the callback function will abort.

4887 status = U_ZERO_ERROR;

4888 cbInfo.reset(4);

4889 s = "aaaaaaaaaaaaaaaaaaaaaaab";

4890 matcher.reset(s);

4891 REGEX_ASSERT(matcher.find(status)==FALSE);

4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);

4893 REGEX_ASSERT(cbInfo.numCalls == 4);

4894 }

4895

4896

4897 }

4898

4899

//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//

4906

4907 struct progressCallBackContext {

4908 RegexTest *test;

4909 int64_t lastIndex;

4910 int32_t maxCalls;

4911 int32_t numCalls;

4912 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};

4913 };

4914

4915 // call-back function for find().

4916 // Return TRUE to continue the find().

4917 // Return FALSE to stop the find().

4918 U_CDECL_BEGIN

4919 static UBool U_CALLCONV

4920 testProgressCallBackFn(const void *context, int64_t matchIndex) {

4921 progressCallBackContext info = (progressCallBackContext )context;

4922 info->numCalls++;

4923 info->lastIndex = matchIndex;

4924 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);

4925 return (info->numCalls < info->maxCalls);

4926 }

4927 U_CDECL_END

4928

4929 void RegexTest::FindProgressCallbacks() {

4930 {

4931 // Getter returns NULLs if no callback has been set

4932

4933 // The variables that the getter will fill in.

4934 // Init to non-null values so that the action of the getter can be see n.

4935 const void *returnedContext = &returnedContext;

4936 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;

4937

4938 UErrorCode status = U_ZERO_ERROR;

4939 RegexMatcher matcher("x", 0, status);

4940 REGEX_CHECK_STATUS;

4941 matcher.getFindProgressCallback(returnedFn, returnedContext, status);

4942 REGEX_CHECK_STATUS;

4943 REGEX_ASSERT(returnedFn == NULL);

4944 REGEX_ASSERT(returnedContext == NULL);

4945 }

4946

4947 {

4948 // Set and Get work

4949 progressCallBackContext cbInfo = {this, 0, 0, 0};

4950 const void *returnedContext;

4951 URegexFindProgressCallback *returnedFn;

4952 UErrorCode status = U_ZERO_ERROR;

4953 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);

4954 REGEX_CHECK_STATUS;

4955 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status) ;

4956 REGEX_CHECK_STATUS;

4957 matcher.getFindProgressCallback(returnedFn, returnedContext, status);

4958 REGEX_CHECK_STATUS;

4959 REGEX_ASSERT(returnedFn == testProgressCallBackFn);

4960 REGEX_ASSERT(returnedContext == &cbInfo);

4961

4962 // A find that matches on the initial position does NOT invoke the callb ack.

4963 status = U_ZERO_ERROR;

4964 cbInfo.reset(100);

4965 UnicodeString s = "aaxxx";

4966 matcher.reset(s);

4967 #if 0

4968 matcher.setTrace(TRUE);

4969 #endif

4970 REGEX_ASSERT(matcher.find(0, status));

4971 REGEX_CHECK_STATUS;

4972 REGEX_ASSERT(cbInfo.numCalls == 0);

4973

4974 // A medium running find() that causes matcher.find() to invoke our call back for each index,

4975 // but not so many times that we interrupt the operation.

4976 status = U_ZERO_ERROR;

4977 s = "aaaaaaaaaaaaaaaaaaab";

4978 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string

4979 matcher.reset(s);

4980 REGEX_ASSERT(matcher.find(0, status)==FALSE);

4981 REGEX_CHECK_STATUS;

4982 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

4983

4984 // A longer running match that causes matcher.find() to invoke our callb ack which we cancel/interrupt at some point.

4985 status = U_ZERO_ERROR;

4986 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";

4987 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string

4988 matcher.reset(s1);

4989 REGEX_ASSERT(matcher.find(0, status)==FALSE);

4990 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);

4991 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

4992

4993 // Now a match that will succeed, but after an interruption

4994 status = U_ZERO_ERROR;

4995 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";

4996 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string

4997 matcher.reset(s2);

4998 REGEX_ASSERT(matcher.find(0, status)==FALSE);

4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);

5000 // Now retry the match from where left off

5001 cbInfo.maxCalls = 100; // No callback limit

5002 status = U_ZERO_ERROR;

5003 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));

5004 REGEX_CHECK_STATUS;

5005 }

5006

5007

5008 }

5009

5010

5011 //---------------------------------------------------------------------------

5012 //

5013 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable

5014 // UTexts. The pure-C implementation of UText

5015 // has no mutable backing stores, but we can

5016 // use UnicodeString here to test the functionality.

5017 //

5018 //---------------------------------------------------------------------------

5019 void RegexTest::PreAllocatedUTextCAPI () {

5020 UErrorCode status = U_ZERO_ERROR;

5021 URegularExpression *re;

5022 UText patternText = UTEXT_INITIALIZER;

5023 UnicodeString buffer;

5024 UText bufferText = UTEXT_INITIALIZER;

5025

5026 utext_openUnicodeString(&bufferText, &buffer, &status);

5027

5028 /*

5029 * getText() and getUText()

5030 */

5031 {

5032 UText text1 = UTEXT_INITIALIZER;

5033 UText text2 = UTEXT_INITIALIZER;

5034 UChar text2Chars[20];

5035 UText *resultText;

5036

5037 status = U_ZERO_ERROR;

5038 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);

5039 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);

5040 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);

5041 utext_openUChars(&text2, text2Chars, -1, &status);

5042

5043 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);

5044 re = uregex_openUText(&patternText, 0, NULL, &status);

5045

5046 /* First set a UText */

5047 uregex_setUText(re, &text1, &status);

5048 resultText = uregex_getUText(re, &bufferText, &status);

5049 REGEX_CHECK_STATUS;

5050 REGEX_ASSERT(resultText == &bufferText);

5051 utext_setNativeIndex(resultText, 0);

5052 utext_setNativeIndex(&text1, 0);

5053 REGEX_ASSERT(testUTextEqual(resultText, &text1));

5054

5055 resultText = uregex_getUText(re, &bufferText, &status);

5056 REGEX_CHECK_STATUS;

5057 REGEX_ASSERT(resultText == &bufferText);

5058 utext_setNativeIndex(resultText, 0);

5059 utext_setNativeIndex(&text1, 0);

5060 REGEX_ASSERT(testUTextEqual(resultText, &text1));

5061

5062 /* Then set a UChar * */

5063 uregex_setText(re, text2Chars, 7, &status);

5064 resultText = uregex_getUText(re, &bufferText, &status);

5065 REGEX_CHECK_STATUS;

5066 REGEX_ASSERT(resultText == &bufferText);

5067 utext_setNativeIndex(resultText, 0);

5068 utext_setNativeIndex(&text2, 0);

5069 REGEX_ASSERT(testUTextEqual(resultText, &text2));

5070

5071 uregex_close(re);

5072 utext_close(&text1);

5073 utext_close(&text2);

5074 }

5075

5076 /*

5077 * group()

5078 */

5079 {

5080 UChar text1[80];

5081 UText *actual;

5082 UBool result;

5083 int64_t length = 0;

5084

5085 u_uastrncpy(text1, "noise abc interior def, and this is off the end", U PRV_LENGTHOF(text1));

5086 // 012345678901234567890123456789012345678901234567

5087 // 0 1 2 3 4

5088

5089 status = U_ZERO_ERROR;

5090 re = uregex_openC("abc(.*?)def", 0, NULL, &status);

5091 REGEX_CHECK_STATUS;

5092

5093 uregex_setText(re, text1, -1, &status);

5094 result = uregex_find(re, 0, &status);

5095 REGEX_ASSERT(result==TRUE);

5096

5097 /* Capture Group 0, the full match. Should succeed. "abc interior def" */

5098 status = U_ZERO_ERROR;

5099 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);

5100 REGEX_CHECK_STATUS;

5101 REGEX_ASSERT(actual == &bufferText);

5102 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);

5103 REGEX_ASSERT(length == 16);

5104 REGEX_ASSERT(utext_nativeLength(actual) == 47);

5105

5106 /* Capture group #1. Should succeed, matching " interior ". */

5107 status = U_ZERO_ERROR;

5108 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);

5109 REGEX_CHECK_STATUS;

5110 REGEX_ASSERT(actual == &bufferText);

5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " inte rior "

5112 REGEX_ASSERT(length == 10);

5113 REGEX_ASSERT(utext_nativeLength(actual) == 47);

5114

5115 /* Capture group out of range. Error. */

5116 status = U_ZERO_ERROR;

5117 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);

5118 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

5119 REGEX_ASSERT(actual == &bufferText);

5120 uregex_close(re);

5121

5122 }

5123

5124 /*

5125 * replaceFirst()

5126 */

5127 {

5128 UChar text1[80];

5129 UChar text2[80];

5130 UText replText = UTEXT_INITIALIZER;

5131 UText *result;

5132 status = U_ZERO_ERROR;

5133 utext_openUnicodeString(&bufferText, &buffer, &status);

5134

5135 status = U_ZERO_ERROR;

5136 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));

5137 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);

5138 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);

5139

5140 re = uregex_openC("x(.*?)x", 0, NULL, &status);

5141 REGEX_CHECK_STATUS;

5142

5143 /* Normal case, with match */

5144 uregex_setText(re, text1, -1, &status);

5145 REGEX_CHECK_STATUS;

5146 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

5147 REGEX_CHECK_STATUS;

5148 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

5149 REGEX_CHECK_STATUS;

5150 REGEX_ASSERT(result == &bufferText);

5151 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);

5152

5153 /* No match. Text should copy to output with no changes. */

5154 uregex_setText(re, text2, -1, &status);

5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

5156 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

5157 REGEX_CHECK_STATUS;

5158 REGEX_ASSERT(result == &bufferText);

5159 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);

5160

5161 /* Unicode escapes */

5162 uregex_setText(re, text1, -1, &status);

5163 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\ a", -1, &status);

5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

5166 REGEX_CHECK_STATUS;

5167 REGEX_ASSERT(result == &bufferText);

5168 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);

5169

5170 uregex_close(re);

5171 utext_close(&replText);

5172 }

5173

5174

5175 /*

5176 * replaceAll()

5177 */

5178 {

5179 UChar text1[80];

5180 UChar text2[80];

5181 UText replText = UTEXT_INITIALIZER;

5182 UText *result;

5183

5184 status = U_ZERO_ERROR;

5185 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);

5186 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);

5187 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);

5188

5189 re = uregex_openC("x(.*?)x", 0, NULL, &status);

5190 REGEX_CHECK_STATUS;

5191

5192 /* Normal case, with match */

5193 uregex_setText(re, text1, -1, &status);

5194 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

5195 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);

5196 REGEX_CHECK_STATUS;

5197 REGEX_ASSERT(result == &bufferText);

5198 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);

5199

5200 /* No match. Text should copy to output with no changes. */

5201 uregex_setText(re, text2, -1, &status);

5202 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

5203 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);

5204 REGEX_CHECK_STATUS;

5205 REGEX_ASSERT(result == &bufferText);

5206 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);

5207

5208 uregex_close(re);

5209 utext_close(&replText);

5210 }

5211

5212

5213 /*

5214 * splitUText() uses the C++ API directly, and the UnicodeString version us es mutable UTexts,

5215 * so we don't need to test it here.

5216 */

5217

5218 utext_close(&bufferText);

5219 utext_close(&patternText);

5220 }

5221

5222

5223 //--------------------------------------------------------------

5224 //

5225 // NamedCapture Check basic named capture group functionality

5226 //

5227 //--------------------------------------------------------------

5228 void RegexTest::NamedCapture() {

5229 UErrorCode status = U_ZERO_ERROR;

5230 RegexPattern *pat = RegexPattern::compile(UnicodeString(

5231 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, stat us);

5232 REGEX_CHECK_STATUS;

5233 int32_t group = pat->groupNumberFromName("five", -1, status);

5234 REGEX_CHECK_STATUS;

5235 REGEX_ASSERT(5 == group);

5236 group = pat->groupNumberFromName("three", -1, status);

5237 REGEX_CHECK_STATUS;

5238 REGEX_ASSERT(3 == group);

5239

5240 status = U_ZERO_ERROR;

5241 group = pat->groupNumberFromName(UnicodeString("six"), status);

5242 REGEX_CHECK_STATUS;

5243 REGEX_ASSERT(6 == group);

5244

5245 status = U_ZERO_ERROR;

5246 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);

5247 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5248

5249 status = U_ZERO_ERROR;

5250

5251 // After copying a pattern, named capture should still work in the copy.

5252 RegexPattern copiedPat = new RegexPattern(pat);

5253 REGEX_ASSERT(copiedPat == pat);

5254 delete pat; pat = NULL; // Delete original, copy should have no references back to it.

5255

5256 group = copiedPat->groupNumberFromName("five", -1, status);

5257 REGEX_CHECK_STATUS;

5258 REGEX_ASSERT(5 == group);

5259 group = copiedPat->groupNumberFromName("three", -1, status);

5260 REGEX_CHECK_STATUS;

5261 REGEX_ASSERT(3 == group);

5262 delete copiedPat;

5263

5264 // ReplaceAll with named capture group.

5265 status = U_ZERO_ERROR;

5266 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");

5267 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0 , status);

5268 REGEX_CHECK_STATUS;

5269 // m.pattern().dumpPattern();

5270 UnicodeString replacedText = m->replaceAll("'${mid}'", status);

5271 REGEX_CHECK_STATUS;

5272 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);

5273 delete m;

5274

5275 // ReplaceAll, allowed capture group numbers.

5276 text = UnicodeString("abcmxyz");

5277 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);

5278 REGEX_CHECK_STATUS;

5279

5280 status = U_ZERO_ERROR;

5281 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.

5282 REGEX_CHECK_STATUS;

5283 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);

5284

5285 status = U_ZERO_ERROR;

5286 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.

5287 REGEX_CHECK_STATUS;

5288 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);

5289

5290 status = U_ZERO_ERROR;

5291 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.

5292 REGEX_CHECK_STATUS;

5293 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);

5294

5295 status = U_ZERO_ERROR;

5296 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.

5297 REGEX_CHECK_STATUS;

5298 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);

5299

5300 status = U_ZERO_ERROR;

5301 replacedText = m->replaceAll(UnicodeString("<$3>"), status);

5302 REGEX_CHECK_STATUS;

5303 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);

5304

5305 status = U_ZERO_ERROR;

5306 replacedText = m->replaceAll(UnicodeString("<$4>"), status);

5307 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

5308

5309 status = U_ZERO_ERROR;

5310 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,

5311 REGEX_CHECK_STATUS; // tr ailing out-of-range 4 passes through.

5312 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);

5313

5314 status = U_ZERO_ERROR;

5315 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consu me leading zeroes. Don't consume digits

5316 REGEX_CHECK_STATUS; // tha t push group num out of range.

5317 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // Thi s is group 1.

5318

5319 status = U_ZERO_ERROR;

5320 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);

5321 REGEX_CHECK_STATUS;

5322 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);

5323

5324 status = U_ZERO_ERROR;

5325 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);

5326 REGEX_CHECK_STATUS;

5327 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);

5328

5329 status = U_ZERO_ERROR;

5330 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);

5331 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5332

5333 status = U_ZERO_ERROR;

5334 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);

5335 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5336

5337 status = U_ZERO_ERROR;

5338 replacedText = m->replaceAll(UnicodeString("<${one"), status);

5339 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5340

5341 status = U_ZERO_ERROR;

5342 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status) ;

5343 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5344

5345 delete m;

5346

5347 // Repeat the above replaceAll() tests using the plain C API, which

5348 // has a separate implementation internally.

5349 // TODO: factor out the test data.

5350

5351 status = U_ZERO_ERROR;

5352 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status) ;

5353 REGEX_CHECK_STATUS;

5354 text = UnicodeString("abcmxyz");

5355 uregex_setText(re, text.getBuffer(), text.length(), &status);

5356 REGEX_CHECK_STATUS;

5357

5358 UChar resultBuf[100];

5359 int32_t resultLength;

5360 UnicodeString repl;

5361

5362 status = U_ZERO_ERROR;

5363 repl = UnicodeString("<$0>");

5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5365 REGEX_CHECK_STATUS;

5366 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLe ngth));

5367

5368 status = U_ZERO_ERROR;

5369 repl = UnicodeString("<$1>");

5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5371 REGEX_CHECK_STATUS;

5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength ));

5373

5374 status = U_ZERO_ERROR;

5375 repl = UnicodeString("<${one}>");

5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5377 REGEX_CHECK_STATUS;

5378 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength ));

5379

5380 status = U_ZERO_ERROR;

5381 repl = UnicodeString("<$2>");

5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5383 REGEX_CHECK_STATUS;

5384 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength ));

5385

5386 status = U_ZERO_ERROR;

5387 repl = UnicodeString("<$3>");

5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5389 REGEX_CHECK_STATUS;

5390 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength ));

5391

5392 status = U_ZERO_ERROR;

5393 repl = UnicodeString("<$4>");

5394 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5395 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

5396

5397 status = U_ZERO_ERROR;

5398 repl = UnicodeString("<$04>");

5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5400 REGEX_CHECK_STATUS;

5401 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultL ength));

5402

5403 status = U_ZERO_ERROR;

5404 repl = UnicodeString("<$000016>");

5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5406 REGEX_CHECK_STATUS;

5407 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLengt h));

5408

5409 status = U_ZERO_ERROR;

5410 repl = UnicodeString("<$3$2$1${one}>");

5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5412 REGEX_CHECK_STATUS;

5413 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLen gth));

5414

5415 status = U_ZERO_ERROR;

5416 repl = UnicodeString("$3$2$1${one}");

5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5418 REGEX_CHECK_STATUS;

5419 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLengt h));

5420

5421 status = U_ZERO_ERROR;

5422 repl = UnicodeString("<${noSuchName}>");

5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5425

5426 status = U_ZERO_ERROR;

5427 repl = UnicodeString("<${invalid-name}>");

5428 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5429 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5430

5431 status = U_ZERO_ERROR;

5432 repl = UnicodeString("<${one");

5433 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5434 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5435

5436 status = U_ZERO_ERROR;

5437 repl = UnicodeString("$not a capture group");

5438 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);

5439 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);

5440

5441 uregex_close(re);

5442 }

5443

5444 //--------------------------------------------------------------

5445 //

5446 // NamedCaptureLimits Patterns with huge numbers of named capture groups.

5447 // The point is not so much what the exact limit is,

5448 // but that a largish number doesn't hit bad non-linear pe rformance,

5449 // and that exceeding the limit fails cleanly.

5450 //

5451 //--------------------------------------------------------------

5452 void RegexTest::NamedCaptureLimits() {

5453 if (quick) {

5454 logln("Skipping test. Runs in exhuastive mode only.");

5455 return;

5456 }

5457 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.

5458 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, f ails to compile.

5459 char nnbuf[100];

5460 UnicodeString pattern;

5461 int32_t nn;

5462

5463 for (nn=1; nn<goodLimit; nn++) {

5464 sprintf(nnbuf, "(?<nn%d>)", nn);

5465 pattern.append(UnicodeString(nnbuf, -1, US_INV));

5466 }

5467 UErrorCode status = U_ZERO_ERROR;

5468 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);

5469 REGEX_CHECK_STATUS;

5470 for (nn=1; nn<goodLimit; nn++) {

5471 sprintf(nnbuf, "nn%d", nn);

5472 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);

5473 REGEX_ASSERT(nn == groupNum);

5474 if (nn != groupNum) {

5475 break;

5476 }

5477 }

5478 delete pat;

5479

5480 pattern.remove();

5481 for (nn=1; nn<failLimit; nn++) {

5482 sprintf(nnbuf, "(?<nn%d>)", nn);

5483 pattern.append(UnicodeString(nnbuf, -1, US_INV));

5484 }

5485 status = U_ZERO_ERROR;

5486 pat = RegexPattern::compile(pattern, 0, status);

5487 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);

5488 delete pat;

5489 }

5490

5491

5492 //--------------------------------------------------------------

5493 //

5494 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher .

5495 //

5496 //---------------------------------------------------------------

5497 void RegexTest::Bug7651() {

5498 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\ u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z 0-9_]+(?:\\/[\\w-]+)?\|(https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\ uFFFF])\|\\$[A-Za-z]+)");

5499 // The following should exceed the default operator stack depth in the matc her, i.e. force the matcher to malloc instead of using fSmallData.

5500 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allo cation.

5501 UnicodeString pattern2("((https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u 0080-\\uFFFF])\|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?\|(?<![ A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0 0f8-\\u00ff]*\|\\$[A-Za-z]+)");

5502 UnicodeString s("#ff @abcd This is test");

5503 RegexPattern *REPattern = NULL;

5504 RegexMatcher *REMatcher = NULL;

5505 UErrorCode status = U_ZERO_ERROR;

5506 UParseError pe;

5507

5508 REPattern = RegexPattern::compile(pattern1, 0, pe, status);

5509 REGEX_CHECK_STATUS;

5510 REMatcher = REPattern->matcher(s, status);

5511 REGEX_CHECK_STATUS;

5512 REGEX_ASSERT(REMatcher->find());

5513 REGEX_ASSERT(REMatcher->start(status) == 0);

5514 delete REPattern;

5515 delete REMatcher;

5516 status = U_ZERO_ERROR;

5517

5518 REPattern = RegexPattern::compile(pattern2, 0, pe, status);

5519 REGEX_CHECK_STATUS;

5520 REMatcher = REPattern->matcher(s, status);

5521 REGEX_CHECK_STATUS;

5522 REGEX_ASSERT(REMatcher->find());

5523 REGEX_ASSERT(REMatcher->start(status) == 0);

5524 delete REPattern;

5525 delete REMatcher;

5526 status = U_ZERO_ERROR;

5527 }

5528

5529 void RegexTest::Bug7740() {

5530 UErrorCode status = U_ZERO_ERROR;

5531 UnicodeString pattern = "(a)";

5532 UnicodeString text = "abcdef";

5533 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);

5534 REGEX_CHECK_STATUS;

5535 REGEX_ASSERT(m->lookingAt(status));

5536 REGEX_CHECK_STATUS;

5537 status = U_ILLEGAL_ARGUMENT_ERROR;

5538 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.

5539 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

5540 REGEX_ASSERT(s == "");

5541 delete m;

5542 }

5543

5544 // Bug 8479: was crashing whith a Bogus UnicodeString as input.

5545

5546 void RegexTest::Bug8479() {

5547 UErrorCode status = U_ZERO_ERROR;

5548

5549 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL\|U REGEX_CASE_INSENSITIVE, status);

5550 REGEX_CHECK_STATUS;

5551 if (U_SUCCESS(status))

5552 {

5553 UnicodeString str;

5554 str.setToBogus();

5555 pMatcher->reset(str);

5556 status = U_ZERO_ERROR;

5557 pMatcher->matches(status);

5558 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

5559 delete pMatcher;

5560 }

5561 }

5562

5563

5564 // Bug 7029

5565 void RegexTest::Bug7029() {

5566 UErrorCode status = U_ZERO_ERROR;

5567

5568 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);

5569 UnicodeString text = "abc.def";

5570 UnicodeString splits[10];

5571 REGEX_CHECK_STATUS;

5572 int32_t numFields = pMatcher->split(text, splits, 10, status);

5573 REGEX_CHECK_STATUS;

5574 REGEX_ASSERT(numFields == 8);

5575 delete pMatcher;

5576 }

5577

5578 // Bug 9283

5579 // This test is checking for the existance of any supplemental characters that case-fold

5580 // to a bmp character.

5581 //

5582 // At the time of this writing there are none. If any should appear in a subse quent release

5583 // of Unicode, the code in regular expressions compilation that determines the longest

5584 // posssible match for a literal string will need to be enhanced.

5585 //

5586 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()

5587 // for details on what to do in case of a failure of this test.

5588 //

5589 void RegexTest::Bug9283() {

5590 #if !UCONFIG_NO_NORMALIZATION

5591 UErrorCode status = U_ZERO_ERROR;

5592 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF] ]", status);

5593 REGEX_CHECK_STATUS;

5594 int32_t index;

5595 UChar32 c;

5596 for (index=0; ; index++) {

5597 c = supplementalsWithCaseFolding.charAt(index);

5598 if (c == -1) {

5599 break;

5600 }

5601 UnicodeString cf = UnicodeString(c).foldCase();

5602 REGEX_ASSERT(cf.length() >= 2);

5603 }

5604 #endif /* #if !UCONFIG_NO_NORMALIZATION */

5605 }

5606

5607

5608 void RegexTest::CheckInvBufSize() {

5609 if(inv_next>=INV_BUFSIZ) {

5610 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least % d )\n",

5611 __FILE__, INV_BUFSIZ, inv_next);

5612 } else {

5613 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);

5614 }

5615 }

5616

5617

5618 void RegexTest::Bug10459() {

5619 UErrorCode status = U_ZERO_ERROR;

5620 UnicodeString patternString("(txt)");

5621 UnicodeString txtString("txt");

5622

5623 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);

5624 REGEX_CHECK_STATUS;

5625 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);

5626 REGEX_CHECK_STATUS;

5627

5628 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);

5629 REGEX_CHECK_STATUS;

5630

5631 uregex_setUText(icu_re, utext_txt, &status);

5632 REGEX_CHECK_STATUS;

5633

5634 // The bug was that calling uregex_group() before doing a matching operation

5635 // was causing a segfault. Only for Regular Expressions created from UText .

5636 // It should set an U_REGEX_INVALID_STATE.

5637

5638 UChar buf[100];

5639 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);

5640 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);

5641 REGEX_ASSERT(len == 0);

5642

5643 uregex_close(icu_re);

5644 utext_close(utext_pat);

5645 utext_close(utext_txt);

5646 }

5647

5648 void RegexTest::TestCaseInsensitiveStarters() {

5649 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() ha sn't

5650 // become stale because of new Unicode characters.

5651 // If it is stale, rerun the generation tool

5652 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genre gexcasing

5653 // and replace the embedded data in i18n/regexcmp.cpp

5654

5655 for (UChar32 cp=0; cp<=0x10ffff; cp++) {

5656 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {

5657 continue;

5658 }

5659 UnicodeSet s(cp, cp);

5660 s.closeOver(USET_CASE_INSENSITIVE);

5661 UnicodeSetIterator setIter(s);

5662 while (setIter.next()) {

5663 if (!setIter.isString()) {

5664 continue;

5665 }

5666 const UnicodeString &str = setIter.getString();

5667 UChar32 firstChar = str.char32At(0);

5668 UnicodeSet starters;

5669 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);

5670 if (!starters.contains(cp)) {

5671 errln("CaseInsensitiveStarters for \\u%x is missing character \\ u%x.", cp, firstChar);

5672 return;

5673 }

5674 }

5675 }

5676 }

5677

5678

5679 void RegexTest::TestBug11049() {

5680 // Original bug report: pattern with match start consisting of one of severa l individual characters,

5681 // and the text being matched ending with a supplementary character. find() would read past the

5682 // end of the input text when searching for potential match starting points .

5683

5684 // To see the problem, the text must exactly fill an allocated buffer, so th at valgrind will

5685 // detect the bad read.

5686

5687 TestCase11049("A\|B\|C", "a string \\ud800\\udc00", FALSE, __LINE__);

5688 TestCase11049("A\|B\|C", "string matches at end C", TRUE, __LINE__);

5689

5690 // Test again with a pattern starting with a single character,

5691 // which takes a different code path than starting with an OR expression,

5692 // but with similar logic.

5693 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);

5694 TestCase11049("C", "string matches at end C", TRUE, __LINE__);

5695 }

5696

5697 // Run a single test case from TestBug11049(). Internal function.

5698 void RegexTest::TestCase11049(const char pattern, const char data, UBool expec tMatch, int32_t lineNumber) {

5699 UErrorCode status = U_ZERO_ERROR;

5700 UnicodeString patternString = UnicodeString(pattern).unescape();

5701 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));

5702

5703 UnicodeString dataString = UnicodeString(data).unescape();

5704 UChar *exactBuffer = new UChar[dataString.length()];

5705 dataString.extract(exactBuffer, dataString.length(), status);

5706 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status );

5707

5708 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));

5709 REGEX_CHECK_STATUS;

5710 matcher->reset(ut);

5711 UBool result = matcher->find();

5712 if (result != expectMatch) {

5713 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \ "%s\"",

5714 __FILE__, lineNumber, expectMatch, result, pattern, data);

5715 }

5716

5717 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could s ee

5718 // off-by-one on find() with match at the last code point.

5719 // Size of the original char * data (invariant charset) will be <= than th e equivalent UTF-8

5720 // because string.unescape() will only shrink it.

5721 char * utf8Buffer = new char[uprv_strlen(data)+1];

5722 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), d ataString.length(), &status);

5723 REGEX_CHECK_STATUS;

5724 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);

5725 REGEX_CHECK_STATUS;

5726 matcher->reset(ut);

5727 result = matcher->find();

5728 if (result != expectMatch) {

5729 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \" %s\", text = \"%s\"",

5730 __FILE__, lineNumber, expectMatch, result, pattern, data);

5731 }

5732 delete [] utf8Buffer;

5733

5734 utext_close(ut);

5735 delete [] exactBuffer;

5736 }

5737

5738

5739 void RegexTest::TestBug11371() {

5740 if (quick) {

5741 logln("Skipping test. Runs in exhuastive mode only.");

5742 return;

5743 }

5744 UErrorCode status = U_ZERO_ERROR;

5745 UnicodeString patternString;

5746

5747 for (int i=0; i<8000000; i++) {

5748 patternString.append(UnicodeString("()"));

5749 }

5750 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));

5751 if (status != U_REGEX_PATTERN_TOO_BIG) {

5752 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s. ",

5753 __FILE__, __LINE__, u_errorName(status));

5754 }

5755

5756 status = U_ZERO_ERROR;

5757 patternString = "(";

5758 for (int i=0; i<20000000; i++) {

5759 patternString.append(UnicodeString("A++"));

5760 }

5761 patternString.append(UnicodeString("){0}B++"));

5762 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));

5763 if (status != U_REGEX_PATTERN_TOO_BIG) {

5764 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s. ",

5765 __FILE__, __LINE__, u_errorName(status));

5766 }

5767

5768 // Pattern with too much string data, such that string indexes overflow oper and data field size

5769 // in compiled instruction.

5770 status = U_ZERO_ERROR;

5771 patternString = "";

5772 while (patternString.length() < 0x00ffffff) {

5773 patternString.append(UnicodeString("stuff and things dont you know, thes e are a few of my favorite strings\n"));

5774 }

5775 patternString.append(UnicodeString("X? trailing string"));

5776 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));

5777 if (status != U_REGEX_PATTERN_TOO_BIG) {

5778 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s. ",

5779 __FILE__, __LINE__, u_errorName(status));

5780 }

5781 }

5782

5783 void RegexTest::TestBug11480() {

5784 // C API, get capture group of a group that does not participate in the matc h.

5785 // (Returns a zero length string, with nul termination,

5786 // indistinguishable from a group with a zero length match.)

5787

5788 UErrorCode status = U_ZERO_ERROR;

5789 URegularExpression *re = uregex_openC("(A)\|(B)", 0, NULL, &status);

5790 REGEX_CHECK_STATUS;

5791 UnicodeString text = UNICODE_STRING_SIMPLE("A");

5792 uregex_setText(re, text.getBuffer(), text.length(), &status);

5793 REGEX_CHECK_STATUS;

5794 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));

5795 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};

5796 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);

5797 REGEX_ASSERT(length == 0);

5798 REGEX_ASSERT(buf[0] == 13);

5799 REGEX_ASSERT(buf[1] == 0);

5800 REGEX_ASSERT(buf[2] == 13);

5801 uregex_close(re);

5802

5803 // UText C++ API, length of match is 0 for non-participating matches.

5804 UText ut = UTEXT_INITIALIZER;

5805 utext_openUnicodeString(&ut, &text, &status);

5806 RegexMatcher matcher(UnicodeString("(A)\|(B)"), 0, status);

5807 REGEX_CHECK_STATUS;

5808 matcher.reset(&ut);

5809 REGEX_ASSERT(matcher.lookingAt(0, status));

5810

5811 // UText C++ API, Capture group 1 matches "A", position 0, length 1.

5812 int64_t groupLen = -666;

5813 UText group = UTEXT_INITIALIZER;

5814 matcher.group(1, &group, groupLen, status);

5815 REGEX_CHECK_STATUS;

5816 REGEX_ASSERT(groupLen == 1);

5817 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);

5818

5819 // Capture group 2, the (B), does not participate in the match.

5820 matcher.group(2, &group, groupLen, status);

5821 REGEX_CHECK_STATUS;

5822 REGEX_ASSERT(groupLen == 0);

5823 REGEX_ASSERT(matcher.start(2, status) == -1);

5824 REGEX_CHECK_STATUS;

5825 }

5826

5827

5828 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

OLD	NEW

« no previous file with comments | « source/test/intltest/regextst.h ('k') | source/test/intltest/regiontst.h » ('j') | no next file with comments »