icu46/source/test/intltest/regextst.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/intltest/regextst.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /********************************************************************

	2 * COPYRIGHT:

	3 * Copyright (c) 2002-2010, International Business Machines Corporation and

	4 * others. All Rights Reserved.

	5 ********************************************************************/

	6

	7 //

	8 // regextst.cpp

	9 //

	10 // ICU Regular Expressions test, part of intltest.

	11 //

	12

	13 #include "intltest.h"

	14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	15

	16 #include "unicode/regex.h"

	17 #include "unicode/uchar.h"

	18 #include "unicode/ucnv.h"

	19 #include "unicode/ustring.h"

	20 #include "regextst.h"

	21 #include "uvector.h"

	22 #include "util.h"

	23 #include <stdlib.h>

	24 #include <string.h>

	25 #include <stdio.h>

	26 #include "cstring.h"

	27 #include "uinvchar.h"

	28

	29 #define SUPPORT_MUTATING_INPUT_STRING 0

	30

	31 //---------------------------------------------------------------------------

	32 //

	33 // Test class boilerplate

	34 //

	35 //---------------------------------------------------------------------------

	36 RegexTest::RegexTest()

	37 {

	38 }

	39

	40

	41 RegexTest::~RegexTest()

	42 {

	43 }

	44

	45

	46

	47 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch ar* /par/ )

	48 {

	49 if (exec) logln("TestSuite RegexTest: ");

	50 switch (index) {

	51

	52 case 0: name = "Basic";

	53 if (exec) Basic();

	54 break;

	55 case 1: name = "API_Match";

	56 if (exec) API_Match();

	57 break;

	58 case 2: name = "API_Replace";

	59 if (exec) API_Replace();

	60 break;

	61 case 3: name = "API_Pattern";

	62 if (exec) API_Pattern();

	63 break;

	64 case 4:

	65 #if !UCONFIG_NO_FILE_IO

	66 name = "Extended";

	67 if (exec) Extended();

	68 #else

	69 name = "skip";

	70 #endif

	71 break;

	72 case 5: name = "Errors";

	73 if (exec) Errors();

	74 break;

	75 case 6: name = "PerlTests";

	76 if (exec) PerlTests();

	77 break;

	78 case 7: name = "Callbacks";

	79 if (exec) Callbacks();

	80 break;

	81 case 8: name = "FindProgressCallbacks";

	82 if (exec) FindProgressCallbacks();

	83 break;

	84 case 9: name = "Bug 6149";

	85 if (exec) Bug6149();

	86 break;

	87 case 10: name = "UTextBasic";

	88 if (exec) UTextBasic();

	89 break;

	90 case 11: name = "API_Match_UTF8";

	91 if (exec) API_Match_UTF8();

	92 break;

	93 case 12: name = "API_Replace_UTF8";

	94 if (exec) API_Replace_UTF8();

	95 break;

	96 case 13: name = "API_Pattern_UTF8";

	97 if (exec) API_Pattern_UTF8();

	98 break;

	99 case 14: name = "PerlTestsUTF8";

	100 if (exec) PerlTestsUTF8();

	101 break;

	102 case 15: name = "PreAllocatedUTextCAPI";

	103 if (exec) PreAllocatedUTextCAPI();

	104 break;

	105 case 16: name = "Bug 7651";

	106 if (exec) Bug7651();

	107 break;

	108 case 17: name = "Bug 7740";

	109 if (exec) Bug7740();

	110 break;

	111

	112 default: name = "";

	113 break; //needed to end loop

	114 }

	115 }

	116

	117

	118 /**

	119 * Calls utext_openUTF8 after, potentially, converting invariant text from the c ompilation codepage

	120 * into ASCII.

	121 * @see utext_openUTF8

	122 */

	123 static UText* regextst_openUTF8FromInvariant(UText* ut, const char inv, int64_t length, UErrorCode status);

	124

	125 static UText* regextst_openUTF8FromInvariant(UText ut, const char inv, int64_t length, UErrorCode *status) {

	126 #if U_CHARSET_FAMILY==U_ASCII_FAMILY

	127 return utext_openUTF8(ut, inv, length, status);

	128 #else

	129 char buf[1024];

	130

	131 uprv_aestrncpy((uint8_t)buf, (const uint8_t)inv, length);

	132

	133 return utext_openUTF8(ut, buf, length, status);

	134 #endif

	135 }

	136

	137 //---------------------------------------------------------------------------

	138 //

	139 // Error Checking / Reporting macros used in all of the tests.

	140 //

	141 //---------------------------------------------------------------------------

	142

	143 static void utextToPrintable(char buf, int32_t bufLen, UText text) {

	144 int64_t oldIndex = utext_getNativeIndex(text);

	145 utext_setNativeIndex(text, 0);

	146 char *bufPtr = buf;

	147 UChar32 c = utext_next32From(text, 0);

	148 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {

	149 if (0x000020<=c && c<0x00007e) {

	150 *bufPtr = c;

	151 } else {

	152 #if 0

	153 sprintf(bufPtr,"U+%04X", c);

	154 bufPtr+= strlen(bufPtr)-1;

	155 #else

	156 *bufPtr = '%';

	157 #endif

	158 }

	159 bufPtr++;

	160 c = UTEXT_NEXT32(text);

	161 }

	162 *bufPtr = 0;

	163 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)

	164 char ebuf = (char)malloc(bufLen);

	165 uprv_eastrncpy((unsigned char)ebuf, (const unsigned char)buf, bufLen);

	166 uprv_strncpy(buf, ebuf, bufLen);

	167 free((void*)ebuf);

	168 #endif

	169 utext_setNativeIndex(text, oldIndex);

	170 }

	171

	// Logs the printable form of a UText, tagged with file, line and the
	// argument expression as written.
	#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

	// Reports a failure and returns from the enclosing (void) function if the
	// in-scope variable `status` holds a failure code.
	#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
	    __FILE__, __LINE__ , u_errorName(status)); return;}}

	// Reports a failure if expr evaluates to FALSE; execution continues.
	#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

	// Evaluates expr against a fresh local `status` (shadowing any outer one) and
	// reports a failure unless that status ends up equal to errcode.
	#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
	    if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
	    __LINE__, u_errorName(errcode), u_errorName(status));};}

	// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number and
	// does not return.
	#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
	    "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

	// Like REGEX_ASSERT, but also reports a caller-supplied line number and
	// returns from the enclosing (void) function on failure.
	#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
	    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

	188

	189 /**

	190 * @param expected expected text in UTF-8 (not platform) codepage

	191 */

	192 void RegexTest::assertUText(const char expected, UText actual, const char *fil e, int line) {

	193 UErrorCode status = U_ZERO_ERROR;

	194 UText expectedText = UTEXT_INITIALIZER;

	195 utext_openUTF8(&expectedText, expected, -1, &status);

	196 if(U_FAILURE(status)) {

	197 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d ch ars)\n", file, line, u_errorName(status), strlen(expected));

	198 return;

	199 }

	200 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {

	201 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLe ngth(expectedText) returned 0.", file, line, strlen(expected));

	202 return;

	203 }

	204 utext_setNativeIndex(actual, 0);

	205 if (utext_compare(&expectedText, -1, actual, -1) != 0) {

	206 char buf[201 /21/];

	207 char expectedBuf[201];

	208 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);

	209 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]) , &expectedText);

	210 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s \" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));

	211 }

	212 utext_close(&expectedText);

	213 }

	214 /**

	215 * @param expected invariant (platform local text) input

	216 */

	217

	218 void RegexTest::assertUTextInvariant(const char expected, UText actual, const char *file, int line) {

	219 UErrorCode status = U_ZERO_ERROR;

	220 UText expectedText = UTEXT_INITIALIZER;

	221 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);

	222 if(U_FAILURE(status)) {

	223 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8From Invariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expect ed));

	224 return;

	225 }

	226 utext_setNativeIndex(actual, 0);

	227 if (utext_compare(&expectedText, -1, actual, -1) != 0) {

	228 char buf[201 /21/];

	229 char expectedBuf[201];

	230 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);

	231 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]) , &expectedText);

	232 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars) , got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expe ctedText), buf, (int)utext_nativeLength(actual));

	233 }

	234 utext_close(&expectedText);

	235 }

	236

	/**
	 * Asserts that the UText `actual` equals `expected`, where expected is UTF-8
	 * text. Forwards to assertUText with the call site's file and line.
	 */
	#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
	/**
	 * Asserts that the UText `actual` equals `expected`, where expected is
	 * invariant-character text. Forwards to assertUTextInvariant with the call
	 * site's file and line.
	 */
	#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)

	245

	246

	247 //---------------------------------------------------------------------------

	248 //

	249 // REGEX_TESTLM Macro + invocation function to simplify writing quick t ests

	250 // for the LookingAt() and Match() functions.

	251 //

	252 // usage:

	253 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);

	254 //

	255 // The expected results are UBool - TRUE or FALSE.

	256 // The input text is unescaped. The pattern is not.

	257 //

	258 //

	259 //---------------------------------------------------------------------------

	260

	// Runs the same lookingAt()/matches() expectation through both the
	// UnicodeString-based and the UTF-8 UText-based test functions.
	#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}

	262

	263 UBool RegexTest::doRegexLMTest(const char pat, const char text, UBool looking, UBool match, int32_t line) {

	264 const UnicodeString pattern(pat, -1, US_INV);

	265 const UnicodeString inputText(text, -1, US_INV);

	266 UErrorCode status = U_ZERO_ERROR;

	267 UParseError pe;

	268 RegexPattern *REPattern = NULL;

	269 RegexMatcher *REMatcher = NULL;

	270 UBool retVal = TRUE;

	271

	272 UnicodeString patString(pat, -1, US_INV);

	273 REPattern = RegexPattern::compile(patString, 0, pe, status);

	274 if (U_FAILURE(status)) {

	275 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Sta tus = %s",

	276 line, u_errorName(status));

	277 return FALSE;

	278 }

	279 if (line==376) { RegexPatternDump(REPattern);}

	280

	281 UnicodeString inputString(inputText);

	282 UnicodeString unEscapedInput = inputString.unescape();

	283 REMatcher = REPattern->matcher(unEscapedInput, status);

	284 if (U_FAILURE(status)) {

	285 errln("RegexTest failure in REPattern::matcher() at line %d. Status = % s\n",

	286 line, u_errorName(status));

	287 return FALSE;

	288 }

	289

	290 UBool actualmatch;

	291 actualmatch = REMatcher->lookingAt(status);

	292 if (U_FAILURE(status)) {

	293 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",

	294 line, u_errorName(status));

	295 retVal = FALSE;

	296 }

	297 if (actualmatch != looking) {

	298 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);

	299 retVal = FALSE;

	300 }

	301

	302 status = U_ZERO_ERROR;

	303 actualmatch = REMatcher->matches(status);

	304 if (U_FAILURE(status)) {

	305 errln("RegexTest failure in matches() at line %d. Status = %s\n",

	306 line, u_errorName(status));

	307 retVal = FALSE;

	308 }

	309 if (actualmatch != match) {

	310 errln("RegexTest: wrong return from matches() at line %d.\n", line);

	311 retVal = FALSE;

	312 }

	313

	314 if (retVal == FALSE) {

	315 RegexPatternDump(REPattern);

	316 }

	317

	318 delete REPattern;

	319 delete REMatcher;

	320 return retVal;

	321 }

	322

	323

	324 UBool RegexTest::doRegexLMTestUTF8(const char pat, const char text, UBool look ing, UBool match, int32_t line) {

	325 UText pattern = UTEXT_INITIALIZER;

	326 int32_t inputUTF8Length;

	327 char *textChars = NULL;

	328 UText inputText = UTEXT_INITIALIZER;

	329 UErrorCode status = U_ZERO_ERROR;

	330 UParseError pe;

	331 RegexPattern *REPattern = NULL;

	332 RegexMatcher *REMatcher = NULL;

	333 UBool retVal = TRUE;

	334

	335 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);

	336 REPattern = RegexPattern::compile(&pattern, 0, pe, status);

	337 if (U_FAILURE(status)) {

	338 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8 ). Status = %s\n",

	339 line, u_errorName(status));

	340 return FALSE;

	341 }

	342

	343 UnicodeString inputString(text, -1, US_INV);

	344 UnicodeString unEscapedInput = inputString.unescape();

	345 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));

	346 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, N ULL, NULL, NULL, &status);

	347

	348 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);

	349 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {

	350 // UTF-8 does not allow unpaired surrogates, so this could actually happ en

	351 logln("RegexTest unable to convert input to UTF8 at line %d. Status = % s\n", line, u_errorName(status));

	352 return TRUE; // not a failure of the Regex engine

	353 }

	354 status = U_ZERO_ERROR; // buffer overflow

	355 textChars = new char[inputUTF8Length+1];

	356 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias( ), status);

	357 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);

	358

	359 REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, s tatus);

	360 if (U_FAILURE(status)) {

	361 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Sta tus = %s\n",

	362 line, u_errorName(status));

	363 return FALSE;

	364 }

	365

	366 UBool actualmatch;

	367 actualmatch = REMatcher->lookingAt(status);

	368 if (U_FAILURE(status)) {

	369 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\ n",

	370 line, u_errorName(status));

	371 retVal = FALSE;

	372 }

	373 if (actualmatch != looking) {

	374 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", l ine);

	375 retVal = FALSE;

	376 }

	377

	378 status = U_ZERO_ERROR;

	379 actualmatch = REMatcher->matches(status);

	380 if (U_FAILURE(status)) {

	381 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n" ,

	382 line, u_errorName(status));

	383 retVal = FALSE;

	384 }

	385 if (actualmatch != match) {

	386 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", lin e);

	387 retVal = FALSE;

	388 }

	389

	390 if (retVal == FALSE) {

	391 RegexPatternDump(REPattern);

	392 }

	393

	394 delete REPattern;

	395 delete REMatcher;

	396 utext_close(&inputText);

	397 utext_close(&pattern);

	398 delete[] textChars;

	399 return retVal;

	400 }

	401

	402

	403

	404 //---------------------------------------------------------------------------

	405 //

	406 // REGEX_ERR Macro + invocation function to simplify writing tests

	407 // regex tests for incorrect patterns

	408 //

	409 // usage:

	410 // REGEX_ERR("pattern", expected error line, column, expected status) ;

	411 //

	412 //---------------------------------------------------------------------------

	// Forwards to regex_err, adding the call site's line number. The expansion
	// already ends in a semicolon.
	#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);

	414

	415 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,

	416 UErrorCode expectedStatus, int32_t line) {

	417 UnicodeString pattern(pat);

	418

	419 UErrorCode status = U_ZERO_ERROR;

	420 UParseError pe;

	421 RegexPattern *callerPattern = NULL;

	422

	423 //

	424 // Compile the caller's pattern

	425 //

	426 UnicodeString patString(pat);

	427 callerPattern = RegexPattern::compile(patString, 0, pe, status);

	428 if (status != expectedStatus) {

	429 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_err orName(status));

	430 } else {

	431 if (status != U_ZERO_ERROR) {

	432 if (pe.line != errLine \|\| pe.offset != errCol) {

	433 errln("Line %d: incorrect line/offset from UParseError. Expecte d %d/%d; got %d/%d.\n",

	434 line, errLine, errCol, pe.line, pe.offset);

	435 }

	436 }

	437 }

	438

	439 delete callerPattern;

	440

	441 //

	442 // Compile again, using a UTF-8-based UText

	443 //

	444 UText patternText = UTEXT_INITIALIZER;

	445 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);

	446 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);

	447 if (status != expectedStatus) {

	448 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_err orName(status));

	449 } else {

	450 if (status != U_ZERO_ERROR) {

	451 if (pe.line != errLine \|\| pe.offset != errCol) {

	452 errln("Line %d: incorrect line/offset from UParseError. Expecte d %d/%d; got %d/%d.\n",

	453 line, errLine, errCol, pe.line, pe.offset);

	454 }

	455 }

	456 }

	457

	458 delete callerPattern;

	459 utext_close(&patternText);

	460 }

	461

	462

	463

	464 //---------------------------------------------------------------------------

	465 //

	466 // Basic Check for basic functionality of regex pattern matching.

	467 // Avoid the use of REGEX_FIND test macro, which has

	468 // substantial dependencies on basic Regex functionality.

	469 //

	470 //---------------------------------------------------------------------------

	471 void RegexTest::Basic() {

	472

	473

	474 //

	475 // Debug - slide failing test cases early

	476 //

	477 #if 0

	478 {

	479 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);

	480 UParseError pe;

	481 UErrorCode status = U_ZERO_ERROR;

	482 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);

	483 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");

	484 // REGEX_FIND("(X([abc=X]+)+X)\|(y[abc=]+)", "=XX====================");

	485 }

	486 exit(1);

	487 #endif

	488

	489

	490 //

	491 // Pattern with parentheses

	492 //

	493 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);

	494 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);

	495 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

	496

	497 //

	498 // Patterns with *

	499 //

	500 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);

	501 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);

	502 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);

	503 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);

	504 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

	505

	506 REGEX_TESTLM("a*", "", TRUE, TRUE);

	507 REGEX_TESTLM("a*", "b", TRUE, FALSE);

	508

	509

	510 //

	511 // Patterns with "."

	512 //

	513 REGEX_TESTLM(".", "abc", TRUE, FALSE);

	514 REGEX_TESTLM("...", "abc", TRUE, TRUE);

	515 REGEX_TESTLM("....", "abc", FALSE, FALSE);

	516 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);

	517 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);

	518 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);

	519 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);

	520 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

	521

	522 //

	523 // Patterns with * applied to chars at end of literal string

	524 //

	525 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);

	526 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

	527

	528 //

	529 // Supplemental chars match as single chars, not a pair of surrogates.

	530 //

	531 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);

	532 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);

	533 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);

	534

	535

	536 //

	537 // UnicodeSets in the pattern

	538 //

	539 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);

	540 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);

	541 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);

	542 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);

	543 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);

	544 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

	545

	546 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);

	547 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);

	548 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);

	549 REGEX_TESTLM("[\\p{Nd}]", "a123456", TRUE, FALSE); // note that matches 0 occurences.

	550 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

	551

	552 //

	553 // OR operator in patterns

	554 //

	555 REGEX_TESTLM("(a\|b)", "a", TRUE, TRUE);

	556 REGEX_TESTLM("(a\|b)", "b", TRUE, TRUE);

	557 REGEX_TESTLM("(a\|b)", "c", FALSE, FALSE);

	558 REGEX_TESTLM("a\|b", "b", TRUE, TRUE);

	559

	560 REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabc", TRUE, TRUE);

	561 REGEX_TESTLM("(a\|b\|c)*", "aabcaaccbcabdc", TRUE, FALSE);

	562 REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "ac", TRUE, TRUE);

	563 REGEX_TESTLM("(a(b\|c\|d)(x\|y\|z)*\|123)", "123", TRUE, TRUE);

	564 REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "123", TRUE, TRUE);

	565 REGEX_TESTLM("(a\|(1\|2))(b\|c\|d)(x\|y\|z)\|123", "222211111czzzzw", TRUE, FALSE );

	566

	567 //

	568 // +

	569 //

	570 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);

	571 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);

	572 REGEX_TESTLM("b+", "", FALSE, FALSE);

	573 REGEX_TESTLM("(abc\|def)+", "defabc", TRUE, TRUE);

	574 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);

	575 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

	576

	577 //

	578 // ?

	579 //

	580 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);

	581 REGEX_TESTLM("ab?", "a", TRUE, TRUE);

	582 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);

	583 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);

	584 REGEX_TESTLM("a(b\|c)?d", "abd", TRUE, TRUE);

	585 REGEX_TESTLM("a(b\|c)?d", "acd", TRUE, TRUE);

	586 REGEX_TESTLM("a(b\|c)?d", "ad", TRUE, TRUE);

	587 REGEX_TESTLM("a(b\|c)?d", "abcd", FALSE, FALSE);

	588 REGEX_TESTLM("a(b\|c)?d", "ab", FALSE, FALSE);

	589

	590 //

	591 // Escape sequences that become single literal chars, handled internally

	592 // by ICU's Unescape.

	593 //

	594

	595 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not i mplemented yet.

	596 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL

	597 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L

	598 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape

	599 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed

	600 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line

	601 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR

	602 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab

	603 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);

	604 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

	605

	606 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the begi nning of input

	607 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the be ginning of input

	608

	609 // Escape of special chars in patterns

	610 REGEX_TESTLM("\\\\\\\|\$\$\\[\\{\\~\\$\\\\+\\?\\.", "\\\\\|()[{~$+?.", TRU E, TRUE);

	611 }

	612

	613

	614 //---------------------------------------------------------------------------

	615 //

	616 // UTextBasic Check for quirks that are specific to the UText

	617 // implementation.

	618 //

	619 //---------------------------------------------------------------------------

	620 void RegexTest::UTextBasic() {

	621 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

	622 UErrorCode status = U_ZERO_ERROR;

	623 UText pattern = UTEXT_INITIALIZER;

	624 utext_openUTF8(&pattern, str_abc, -1, &status);

	625 RegexMatcher matcher(&pattern, 0, status);

	626 REGEX_CHECK_STATUS;

	627

	628 UText input = UTEXT_INITIALIZER;

	629 utext_openUTF8(&input, str_abc, -1, &status);

	630 REGEX_CHECK_STATUS;

	631 matcher.reset(&input);

	632 REGEX_CHECK_STATUS;

	633 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

	634

	635 matcher.reset(matcher.inputText());

	636 REGEX_CHECK_STATUS;

	637 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

	638

	639 utext_close(&pattern);

	640 utext_close(&input);

	641 }

	642

	643

	644 //---------------------------------------------------------------------------

	645 //

	646 // API_Match Test that the API for class RegexMatcher

	647 // is present and nominally working, but excluding functions

	648 // implementing replace operations.

	649 //

	650 //---------------------------------------------------------------------------

	651 void RegexTest::API_Match() {

	652 UParseError pe;

	653 UErrorCode status=U_ZERO_ERROR;

	654 int32_t flags = 0;

	655

	656 //

	657 // Debug - slide failing test cases early

	658 //

	659 #if 0

	660 {

	661 }

	662 return;

	663 #endif

	664

	665 //

	666 // Simple pattern compilation

	667 //

	668 {

	669 UnicodeString re("abc");

	670 RegexPattern *pat2;

	671 pat2 = RegexPattern::compile(re, flags, pe, status);

	672 REGEX_CHECK_STATUS;

	673

	674 UnicodeString inStr1 = "abcdef this is a test";

	675 UnicodeString instr2 = "not abc";

	676 UnicodeString empty = "";

	677

	678

	679 //

	680 // Matcher creation and reset.

	681 //

	682 RegexMatcher *m1 = pat2->matcher(inStr1, status);

	683 REGEX_CHECK_STATUS;

	684 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	685 REGEX_ASSERT(m1->input() == inStr1);

	686 m1->reset(instr2);

	687 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

	688 REGEX_ASSERT(m1->input() == instr2);

	689 m1->reset(inStr1);

	690 REGEX_ASSERT(m1->input() == inStr1);

	691 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	692 m1->reset(empty);

	693 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

	694 REGEX_ASSERT(m1->input() == empty);

	695 REGEX_ASSERT(&m1->pattern() == pat2);

	696

	697 //

	698 // reset(pos, status)

	699 //

	700 m1->reset(inStr1);

	701 m1->reset(4, status);

	702 REGEX_CHECK_STATUS;

	703 REGEX_ASSERT(m1->input() == inStr1);

	704 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	705

	706 m1->reset(-1, status);

	707 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	708 status = U_ZERO_ERROR;

	709

	710 m1->reset(0, status);

	711 REGEX_CHECK_STATUS;

	712 status = U_ZERO_ERROR;

	713

	714 int32_t len = m1->input().length();

	715 m1->reset(len-1, status);

	716 REGEX_CHECK_STATUS;

	717 status = U_ZERO_ERROR;

	718

	719 m1->reset(len, status);

	720 REGEX_CHECK_STATUS;

	721 status = U_ZERO_ERROR;

	722

	723 m1->reset(len+1, status);

	724 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	725 status = U_ZERO_ERROR;

	726

	727 //

	728 // match(pos, status)

	729 //

	730 m1->reset(instr2);

	731 REGEX_ASSERT(m1->matches(4, status) == TRUE);

	732 m1->reset();

	733 REGEX_ASSERT(m1->matches(3, status) == FALSE);

	734 m1->reset();

	735 REGEX_ASSERT(m1->matches(5, status) == FALSE);

	736 REGEX_ASSERT(m1->matches(4, status) == TRUE);

	737 REGEX_ASSERT(m1->matches(-1, status) == FALSE);

	738 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	739

	740 // Match() at end of string should fail, but should not

	741 // be an error.

	742 status = U_ZERO_ERROR;

	743 len = m1->input().length();

	744 REGEX_ASSERT(m1->matches(len, status) == FALSE);

	745 REGEX_CHECK_STATUS;

	746

	747 // Match beyond end of string should fail with an error.

	748 status = U_ZERO_ERROR;

	749 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);

	750 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	751

	752 // Successful match at end of string.

	753 {

	754 status = U_ZERO_ERROR;

	755 RegexMatcher m("A?", 0, status); // will match zero length string.

	756 REGEX_CHECK_STATUS;

	757 m.reset(inStr1);

	758 len = inStr1.length();

	759 REGEX_ASSERT(m.matches(len, status) == TRUE);

	760 REGEX_CHECK_STATUS;

	761 m.reset(empty);

	762 REGEX_ASSERT(m.matches(0, status) == TRUE);

	763 REGEX_CHECK_STATUS;

	764 }

	765

	766

	767 //

	768 // lookingAt(pos, status)

	769 //

	770 status = U_ZERO_ERROR;

	771 m1->reset(instr2); // "not abc"

	772 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

	773 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);

	774 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);

	775 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

	776 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);

	777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	778 status = U_ZERO_ERROR;

	779 len = m1->input().length();

	780 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);

	781 REGEX_CHECK_STATUS;

	782 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);

	783 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	784

	785 delete m1;

	786 delete pat2;

	787 }

	788

	789

	790 //

	791 // Capture Group.

	792 // RegexMatcher::start();

	793 // RegexMatcher::end();

	794 // RegexMatcher::groupCount();

	795 //

	796 {

	797 int32_t flags=0;

	798 UParseError pe;

	799 UErrorCode status=U_ZERO_ERROR;

	800

	801 UnicodeString re("01(23(45)67)(.*)");

	802 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

	803 REGEX_CHECK_STATUS;

	804 UnicodeString data = "0123456789";

	805

	806 RegexMatcher *matcher = pat->matcher(data, status);

	807 REGEX_CHECK_STATUS;

	808 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);

	809 static const int32_t matchStarts[] = {0, 2, 4, 8};

	810 static const int32_t matchEnds[] = {10, 8, 6, 10};

	811 int32_t i;

	812 for (i=0; i<4; i++) {

	813 int32_t actualStart = matcher->start(i, status);

	814 REGEX_CHECK_STATUS;

	815 if (actualStart != matchStarts[i]) {

	816 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",

	817 __LINE__, i, matchStarts[i], actualStart);

	818 }

	819 int32_t actualEnd = matcher->end(i, status);

	820 REGEX_CHECK_STATUS;

	821 if (actualEnd != matchEnds[i]) {

	822 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",

	823 __LINE__, i, matchEnds[i], actualEnd);

	824 }

	825 }

	826

	827 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));

	828 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));

	829

	830 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	831 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	832 matcher->reset();

	833 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);

	834

	835 matcher->lookingAt(status);

	836 REGEX_ASSERT(matcher->group(status) == "0123456789");

	837 REGEX_ASSERT(matcher->group(0, status) == "0123456789");

	838 REGEX_ASSERT(matcher->group(1, status) == "234567" );

	839 REGEX_ASSERT(matcher->group(2, status) == "45" );

	840 REGEX_ASSERT(matcher->group(3, status) == "89" );

	841 REGEX_CHECK_STATUS;

	842 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	843 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	844 matcher->reset();

	845 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);

	846

	847 delete matcher;

	848 delete pat;

	849

	850 }

	851

	852 //

	853 // find

	854 //

	855 {

	856 int32_t flags=0;

	857 UParseError pe;

	858 UErrorCode status=U_ZERO_ERROR;

	859

	860 UnicodeString re("abc");

	861 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

	862 REGEX_CHECK_STATUS;

	863 UnicodeString data = ".abc..abc...abc..";

	864 // 012345678901234567

	865

	866 RegexMatcher *matcher = pat->matcher(data, status);

	867 REGEX_CHECK_STATUS;

	868 REGEX_ASSERT(matcher->find());

	869 REGEX_ASSERT(matcher->start(status) == 1);

	870 REGEX_ASSERT(matcher->find());

	871 REGEX_ASSERT(matcher->start(status) == 6);

	872 REGEX_ASSERT(matcher->find());

	873 REGEX_ASSERT(matcher->start(status) == 12);

	874 REGEX_ASSERT(matcher->find() == FALSE);

	875 REGEX_ASSERT(matcher->find() == FALSE);

	876

	877 matcher->reset();

	878 REGEX_ASSERT(matcher->find());

	879 REGEX_ASSERT(matcher->start(status) == 1);

	880

	881 REGEX_ASSERT(matcher->find(0, status));

	882 REGEX_ASSERT(matcher->start(status) == 1);

	883 REGEX_ASSERT(matcher->find(1, status));

	884 REGEX_ASSERT(matcher->start(status) == 1);

	885 REGEX_ASSERT(matcher->find(2, status));

	886 REGEX_ASSERT(matcher->start(status) == 6);

	887 REGEX_ASSERT(matcher->find(12, status));

	888 REGEX_ASSERT(matcher->start(status) == 12);

	889 REGEX_ASSERT(matcher->find(13, status) == FALSE);

	890 REGEX_ASSERT(matcher->find(16, status) == FALSE);

	891 REGEX_ASSERT(matcher->find(17, status) == FALSE);

	892 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);

	893

	894 status = U_ZERO_ERROR;

	895 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);

	896 status = U_ZERO_ERROR;

	897 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);

	898

	899 REGEX_ASSERT(matcher->groupCount() == 0);

	900

	901 delete matcher;

	902 delete pat;

	903 }

	904

	905

	906 //

	907 // find, with \G in pattern (true if at the end of a previous match).

	908 //

	909 {

	910 int32_t flags=0;

	911 UParseError pe;

	912 UErrorCode status=U_ZERO_ERROR;

	913

	914 UnicodeString re(".*?(?:(\\Gabc)\|(abc))", -1, US_INV);

	915 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

	916 REGEX_CHECK_STATUS;

	917 UnicodeString data = ".abcabc.abc..";

	918 // 012345678901234567

	919

	920 RegexMatcher *matcher = pat->matcher(data, status);

	921 REGEX_CHECK_STATUS;

	922 REGEX_ASSERT(matcher->find());

	923 REGEX_ASSERT(matcher->start(status) == 0);

	924 REGEX_ASSERT(matcher->start(1, status) == -1);

	925 REGEX_ASSERT(matcher->start(2, status) == 1);

	926

	927 REGEX_ASSERT(matcher->find());

	928 REGEX_ASSERT(matcher->start(status) == 4);

	929 REGEX_ASSERT(matcher->start(1, status) == 4);

	930 REGEX_ASSERT(matcher->start(2, status) == -1);

	931 REGEX_CHECK_STATUS;

	932

	933 delete matcher;

	934 delete pat;

	935 }

	936

	937 //

	938 // find with zero length matches, match position should bump ahead

	939 // to prevent loops.

	940 //

	941 {

	942 int32_t i;

	943 UErrorCode status=U_ZERO_ERROR;

	944 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero- length matches anywhere,

	945 // using an always-true look-ahead.

	946 REGEX_CHECK_STATUS;

	947 UnicodeString s(" ");

	948 m.reset(s);

	949 for (i=0; ; i++) {

	950 if (m.find() == FALSE) {

	951 break;

	952 }

	953 REGEX_ASSERT(m.start(status) == i);

	954 REGEX_ASSERT(m.end(status) == i);

	955 }

	956 REGEX_ASSERT(i==5);

	957

	958 // Check that the bump goes over surrogate pairs OK

	959 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004" );

	960 s = s.unescape();

	961 m.reset(s);

	962 for (i=0; ; i+=2) {

	963 if (m.find() == FALSE) {

	964 break;

	965 }

	966 REGEX_ASSERT(m.start(status) == i);

	967 REGEX_ASSERT(m.end(status) == i);

	968 }

	969 REGEX_ASSERT(i==10);

	970 }

	971 {

	972 // find() loop breaking test.

	973 // with pattern of /.?/, should see a series of one char matches, then a single

	974 // match of zero length at the end of the input string.

	975 int32_t i;

	976 UErrorCode status=U_ZERO_ERROR;

	977 RegexMatcher m(".?", 0, status);

	978 REGEX_CHECK_STATUS;

	979 UnicodeString s(" ");

	980 m.reset(s);

	981 for (i=0; ; i++) {

	982 if (m.find() == FALSE) {

	983 break;

	984 }

	985 REGEX_ASSERT(m.start(status) == i);

	986 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));

	987 }

	988 REGEX_ASSERT(i==5);

	989 }

	990

	991

	992 //

	993 // Matchers with no input string behave as if they had an empty input string .

	994 //

	995

	996 {

	997 UErrorCode status = U_ZERO_ERROR;

	998 RegexMatcher m(".?", 0, status);

	999 REGEX_CHECK_STATUS;

	1000 REGEX_ASSERT(m.find());

	1001 REGEX_ASSERT(m.start(status) == 0);

	1002 REGEX_ASSERT(m.input() == "");

	1003 }

	1004 {

	1005 UErrorCode status = U_ZERO_ERROR;

	1006 RegexPattern *p = RegexPattern::compile(".", 0, status);

	1007 RegexMatcher *m = p->matcher(status);

	1008 REGEX_CHECK_STATUS;

	1009

	1010 REGEX_ASSERT(m->find() == FALSE);

	1011 REGEX_ASSERT(m->input() == "");

	1012 delete m;

	1013 delete p;

	1014 }

	1015

	1016 //

	1017 // Regions

	1018 //

	1019 {

	1020 UErrorCode status = U_ZERO_ERROR;

	1021 UnicodeString testString("This is test data");

	1022 RegexMatcher m(".*", testString, 0, status);

	1023 REGEX_CHECK_STATUS;

	1024 REGEX_ASSERT(m.regionStart() == 0);

	1025 REGEX_ASSERT(m.regionEnd() == testString.length());

	1026 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	1027 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	1028

	1029 m.region(2,4, status);

	1030 REGEX_CHECK_STATUS;

	1031 REGEX_ASSERT(m.matches(status));

	1032 REGEX_ASSERT(m.start(status)==2);

	1033 REGEX_ASSERT(m.end(status)==4);

	1034 REGEX_CHECK_STATUS;

	1035

	1036 m.reset();

	1037 REGEX_ASSERT(m.regionStart() == 0);

	1038 REGEX_ASSERT(m.regionEnd() == testString.length());

	1039

	1040 UnicodeString shorterString("short");

	1041 m.reset(shorterString);

	1042 REGEX_ASSERT(m.regionStart() == 0);

	1043 REGEX_ASSERT(m.regionEnd() == shorterString.length());

	1044

	1045 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	1046 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));

	1047 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

	1048 REGEX_ASSERT(&m == &m.reset());

	1049 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

	1050

	1051 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));

	1052 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	1053 REGEX_ASSERT(&m == &m.reset());

	1054 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	1055

	1056 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	1057 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));

	1058 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

	1059 REGEX_ASSERT(&m == &m.reset());

	1060 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

	1061

	1062 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));

	1063 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	1064 REGEX_ASSERT(&m == &m.reset());

	1065 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	1066

	1067 }

	1068

	1069 //

	1070 // hitEnd() and requireEnd()

	1071 //

	1072 {

	1073 UErrorCode status = U_ZERO_ERROR;

	1074 UnicodeString testString("aabb");

	1075 RegexMatcher m1(".*", testString, 0, status);

	1076 REGEX_ASSERT(m1.lookingAt(status) == TRUE);

	1077 REGEX_ASSERT(m1.hitEnd() == TRUE);

	1078 REGEX_ASSERT(m1.requireEnd() == FALSE);

	1079 REGEX_CHECK_STATUS;

	1080

	1081 status = U_ZERO_ERROR;

	1082 RegexMatcher m2("a*", testString, 0, status);

	1083 REGEX_ASSERT(m2.lookingAt(status) == TRUE);

	1084 REGEX_ASSERT(m2.hitEnd() == FALSE);

	1085 REGEX_ASSERT(m2.requireEnd() == FALSE);

	1086 REGEX_CHECK_STATUS;

	1087

	1088 status = U_ZERO_ERROR;

	1089 RegexMatcher m3(".*$", testString, 0, status);

	1090 REGEX_ASSERT(m3.lookingAt(status) == TRUE);

	1091 REGEX_ASSERT(m3.hitEnd() == TRUE);

	1092 REGEX_ASSERT(m3.requireEnd() == TRUE);

	1093 REGEX_CHECK_STATUS;

	1094 }

	1095

	1096

	1097 //

	1098 // Compilation error on reset with UChar *

	1099 // These were a hazard that people were stumbling over with runtime errors .

	1100 // Changed them to compiler errors by adding private methods that more closely

	1101 // matched the incorrect use of the functions.

	1102 //

	1103 #if 0

	1104 {

	1105 UErrorCode status = U_ZERO_ERROR;

	1106 UChar ucharString[20];

	1107 RegexMatcher m(".", 0, status);

	1108 m.reset(ucharString); // should not compile.

	1109

	1110 RegexPattern *p = RegexPattern::compile(".", 0, status);

	1111 RegexMatcher *m2 = p->matcher(ucharString, status); // should not co mpile.

	1112

	1113 RegexMatcher m3(".", ucharString, 0, status); // Should not compile

	1114 }

	1115 #endif

	1116

	1117 //

	1118 // Time Outs.

	1119 // Note: These tests will need to be changed when the regexp engine is

	1120 // able to detect and cut short the exponential time behavior on

	1121 // this type of match.

	1122 //

	1123 {

	1124 UErrorCode status = U_ZERO_ERROR;

	1125 // Enough 'a's in the string to cause the match to time out.

	1126 // (Each additional 'a' doubles the time)

	1127 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");

	1128 RegexMatcher matcher("(a+)+b", testString, 0, status);

	1129 REGEX_CHECK_STATUS;

	1130 REGEX_ASSERT(matcher.getTimeLimit() == 0);

	1131 matcher.setTimeLimit(100, status);

	1132 REGEX_ASSERT(matcher.getTimeLimit() == 100);

	1133 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

	1134 REGEX_ASSERT(status == U_REGEX_TIME_OUT);

	1135 }

	1136 {

	1137 UErrorCode status = U_ZERO_ERROR;

	1138 // Few enough 'a's to slip in under the time limit.

	1139 UnicodeString testString("aaaaaaaaaaaaaaaaaa");

	1140 RegexMatcher matcher("(a+)+b", testString, 0, status);

	1141 REGEX_CHECK_STATUS;

	1142 matcher.setTimeLimit(100, status);

	1143 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

	1144 REGEX_CHECK_STATUS;

	1145 }

	1146

	1147 //

	1148 // Stack Limits

	1149 //

	1150 {

	1151 UErrorCode status = U_ZERO_ERROR;

	1152 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'

	1153

	1154 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits opt imizations

	1155 // of the '+', and makes the stack frames larger.

	1156 RegexMatcher matcher("(A)+A$", testString, 0, status);

	1157

	1158 // With the default stack, this match should fail to run

	1159 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

	1160 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);

	1161

	1162 // With unlimited stack, it should run

	1163 status = U_ZERO_ERROR;

	1164 matcher.setStackLimit(0, status);

	1165 REGEX_CHECK_STATUS;

	1166 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);

	1167 REGEX_CHECK_STATUS;

	1168 REGEX_ASSERT(matcher.getStackLimit() == 0);

	1169

	1170 // With a limited stack, the match should fail

	1171 status = U_ZERO_ERROR;

	1172 matcher.setStackLimit(10000, status);

	1173 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);

	1174 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);

	1175 REGEX_ASSERT(matcher.getStackLimit() == 10000);

	1176 }

	1177

	1178 // A pattern that doesn't save state should work with

	1179 // a minimal sized stack

	1180 {

	1181 UErrorCode status = U_ZERO_ERROR;

	1182 UnicodeString testString = "abc";

	1183 RegexMatcher matcher("abc", testString, 0, status);

	1184 REGEX_CHECK_STATUS;

	1185 matcher.setStackLimit(30, status);

	1186 REGEX_CHECK_STATUS;

	1187 REGEX_ASSERT(matcher.matches(status) == TRUE);

	1188 REGEX_CHECK_STATUS;

	1189 REGEX_ASSERT(matcher.getStackLimit() == 30);

	1190

	1191 // Negative stack sizes should fail

	1192 status = U_ZERO_ERROR;

	1193 matcher.setStackLimit(1000, status);

	1194 REGEX_CHECK_STATUS;

	1195 matcher.setStackLimit(-1, status);

	1196 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

	1197 REGEX_ASSERT(matcher.getStackLimit() == 1000);

	1198 }

	1199

	1200

	1201 }

	1202

	1203

	1204

	1205

	1206

	1207

	1208 //---------------------------------------------------------------------------

	1209 //

	1210 // API_Replace API test for class RegexMatcher, testing the

	1211 // Replace family of functions.

	1212 //

	1213 //---------------------------------------------------------------------------

	1214 void RegexTest::API_Replace() {

	1215 //

	1216 // Replace

	1217 //

	1218 int32_t flags=0;

	1219 UParseError pe;

	1220 UErrorCode status=U_ZERO_ERROR;

	1221

	1222 UnicodeString re("abc");

	1223 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);

	1224 REGEX_CHECK_STATUS;

	1225 UnicodeString data = ".abc..abc...abc..";

	1226 // 012345678901234567

	1227 RegexMatcher *matcher = pat->matcher(data, status);

	1228

	1229 //

	1230 // Plain vanilla matches.

	1231 //

	1232 UnicodeString dest;

	1233 dest = matcher->replaceFirst("yz", status);

	1234 REGEX_CHECK_STATUS;

	1235 REGEX_ASSERT(dest == ".yz..abc...abc..");

	1236

	1237 dest = matcher->replaceAll("yz", status);

	1238 REGEX_CHECK_STATUS;

	1239 REGEX_ASSERT(dest == ".yz..yz...yz..");

	1240

	1241 //

	1242 // Plain vanilla non-matches.

	1243 //

	1244 UnicodeString d2 = ".abx..abx...abx..";

	1245 matcher->reset(d2);

	1246 dest = matcher->replaceFirst("yz", status);

	1247 REGEX_CHECK_STATUS;

	1248 REGEX_ASSERT(dest == ".abx..abx...abx..");

	1249

	1250 dest = matcher->replaceAll("yz", status);

	1251 REGEX_CHECK_STATUS;

	1252 REGEX_ASSERT(dest == ".abx..abx...abx..");

	1253

	1254 //

	1255 // Empty source string

	1256 //

	1257 UnicodeString d3 = "";

	1258 matcher->reset(d3);

	1259 dest = matcher->replaceFirst("yz", status);

	1260 REGEX_CHECK_STATUS;

	1261 REGEX_ASSERT(dest == "");

	1262

	1263 dest = matcher->replaceAll("yz", status);

	1264 REGEX_CHECK_STATUS;

	1265 REGEX_ASSERT(dest == "");

	1266

	1267 //

	1268 // Empty substitution string

	1269 //

	1270 matcher->reset(data); // ".abc..abc...abc.."

	1271 dest = matcher->replaceFirst("", status);

	1272 REGEX_CHECK_STATUS;

	1273 REGEX_ASSERT(dest == "...abc...abc..");

	1274

	1275 dest = matcher->replaceAll("", status);

	1276 REGEX_CHECK_STATUS;

	1277 REGEX_ASSERT(dest == "........");

	1278

	1279 //

	1280 // match whole string

	1281 //

	1282 UnicodeString d4 = "abc";

	1283 matcher->reset(d4);

	1284 dest = matcher->replaceFirst("xyz", status);

	1285 REGEX_CHECK_STATUS;

	1286 REGEX_ASSERT(dest == "xyz");

	1287

	1288 dest = matcher->replaceAll("xyz", status);

	1289 REGEX_CHECK_STATUS;

	1290 REGEX_ASSERT(dest == "xyz");

	1291

	1292 //

	1293 // Capture Group, simple case

	1294 //

	1295 UnicodeString re2("a(..)");

	1296 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);

	1297 REGEX_CHECK_STATUS;

	1298 UnicodeString d5 = "abcdefg";

	1299 RegexMatcher *matcher2 = pat2->matcher(d5, status);

	1300 REGEX_CHECK_STATUS;

	1301 dest = matcher2->replaceFirst("$1$1", status);

	1302 REGEX_CHECK_STATUS;

	1303 REGEX_ASSERT(dest == "bcbcdefg");

	1304

	1305 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1 ."), status);

	1306 REGEX_CHECK_STATUS;

	1307 REGEX_ASSERT(dest == "The value of $1 is bc.defg");

	1308

	1309 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);

	1310 REGEX_CHECK_STATUS;

	1311 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");

	1312

	1313 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U 0001D7CF.");

	1314 replacement = replacement.unescape();

	1315 dest = matcher2->replaceFirst(replacement, status);

	1316 REGEX_CHECK_STATUS;

	1317 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");

	1318

	1319 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st atus), U_INDEX_OUTOFBOUNDS_ERROR);

	1320

	1321

	1322 //

	1323 // Replacement String with \u hex escapes

	1324 //

	1325 {

	1326 UnicodeString src = "abc 1 abc 2 abc 3";

	1327 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");

	1328 matcher->reset(src);

	1329 UnicodeString result = matcher->replaceAll(substitute, status);

	1330 REGEX_CHECK_STATUS;

	1331 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");

	1332 }

	1333 {

	1334 UnicodeString src = "abc !";

	1335 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");

	1336 matcher->reset(src);

	1337 UnicodeString result = matcher->replaceAll(substitute, status);

	1338 REGEX_CHECK_STATUS;

	1339 UnicodeString expected = UnicodeString("--");

	1340 expected.append((UChar32)0x10000);

	1341 expected.append("-- !");

	1342 REGEX_ASSERT(result == expected);

	1343 }

	1344 // TODO: need more through testing of capture substitutions.

	1345

	1346 // Bug 4057

	1347 //

	1348 {

	1349 status = U_ZERO_ERROR;

	1350 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";

	1351 RegexMatcher m("ss(.*?)ee", 0, status);

	1352 REGEX_CHECK_STATUS;

	1353 UnicodeString result;

	1354

	1355 // Multiple finds do NOT bump up the previous appendReplacement postion.

	1356 m.reset(s);

	1357 m.find();

	1358 m.find();

	1359 m.appendReplacement(result, "ooh", status);

	1360 REGEX_CHECK_STATUS;

	1361 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

	1362

	1363 // After a reset into the interior of a string, appendReplacemnt still s tarts at beginning.

	1364 status = U_ZERO_ERROR;

	1365 result.truncate(0);

	1366 m.reset(10, status);

	1367 m.find();

	1368 m.find();

	1369 m.appendReplacement(result, "ooh", status);

	1370 REGEX_CHECK_STATUS;

	1371 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

	1372

	1373 // find() at interior of string, appendReplacemnt still starts at beginn ing.

	1374 status = U_ZERO_ERROR;

	1375 result.truncate(0);

	1376 m.reset();

	1377 m.find(10, status);

	1378 m.find();

	1379 m.appendReplacement(result, "ooh", status);

	1380 REGEX_CHECK_STATUS;

	1381 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");

	1382

	1383 m.appendTail(result);

	1384 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fi n");

	1385

	1386 }

	1387

	1388 delete matcher2;

	1389 delete pat2;

	1390 delete matcher;

	1391 delete pat;

	1392 }

	1393

	1394

	1395 //---------------------------------------------------------------------------

	1396 //

	1397 // API_Pattern Test that the API for class RegexPattern is

	1398 // present and nominally working.

	1399 //

	1400 //---------------------------------------------------------------------------

	1401 void RegexTest::API_Pattern() {

	1402 RegexPattern pata; // Test default constructor to not crash.

	1403 RegexPattern patb;

	1404

	1405 REGEX_ASSERT(pata == patb);

	1406 REGEX_ASSERT(pata == pata);

	1407

	1408 UnicodeString re1("abc[a-l][m-z]");

	1409 UnicodeString re2("def");

	1410 UErrorCode status = U_ZERO_ERROR;

	1411 UParseError pe;

	1412

	1413 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);

	1414 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);

	1415 REGEX_CHECK_STATUS;

	1416 REGEX_ASSERT(pat1 == pat1);

	1417 REGEX_ASSERT(*pat1 != pata);

	1418

	1419 // Assign

	1420 patb = *pat1;

	1421 REGEX_ASSERT(patb == *pat1);

	1422

	1423 // Copy Construct

	1424 RegexPattern patc(*pat1);

	1425 REGEX_ASSERT(patc == *pat1);

	1426 REGEX_ASSERT(patb == patc);

	1427 REGEX_ASSERT(pat1 != pat2);

	1428 patb = *pat2;

	1429 REGEX_ASSERT(patb != patc);

	1430 REGEX_ASSERT(patb == *pat2);

	1431

	1432 // Compile with no flags.

	1433 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);

	1434 REGEX_ASSERT(pat1a == pat1);

	1435

	1436 REGEX_ASSERT(pat1a->flags() == 0);

	1437

	1438 // Compile with different flags should be not equal

	1439 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSIT IVE, pe, status);

	1440 REGEX_CHECK_STATUS;

	1441

	1442 REGEX_ASSERT(pat1b != pat1a);

	1443 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);

	1444 REGEX_ASSERT(pat1a->flags() == 0);

	1445 delete pat1b;

	1446

	1447 // clone

	1448 RegexPattern *pat1c = pat1->clone();

	1449 REGEX_ASSERT(pat1c == pat1);

	1450 REGEX_ASSERT(pat1c != pat2);

	1451

	1452 delete pat1c;

	1453 delete pat1a;

	1454 delete pat1;

	1455 delete pat2;

	1456

	1457

	1458 //

	1459 // Verify that a matcher created from a cloned pattern works.

	1460 // (Jitterbug 3423)

	1461 //

	1462 {

	1463 UErrorCode status = U_ZERO_ERROR;

	1464 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE( "\\p{L}+"), 0, status);

	1465 RegexPattern *pClone = pSource->clone();

	1466 delete pSource;

	1467 RegexMatcher *mFromClone = pClone->matcher(status);

	1468 REGEX_CHECK_STATUS;

	1469 UnicodeString s = "Hello World";

	1470 mFromClone->reset(s);

	1471 REGEX_ASSERT(mFromClone->find() == TRUE);

	1472 REGEX_ASSERT(mFromClone->group(status) == "Hello");

	1473 REGEX_ASSERT(mFromClone->find() == TRUE);

	1474 REGEX_ASSERT(mFromClone->group(status) == "World");

	1475 REGEX_ASSERT(mFromClone->find() == FALSE);

	1476 delete mFromClone;

	1477 delete pClone;

	1478 }

	1479

	1480 //

	1481 // matches convenience API

	1482 //

	1483 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE );

	1484 REGEX_CHECK_STATUS;

	1485 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FAL SE);

	1486 REGEX_CHECK_STATUS;

	1487 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);

	1488 REGEX_CHECK_STATUS;

	1489 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, statu s) == TRUE);

	1490 REGEX_CHECK_STATUS;

	1491 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FAL SE);

	1492 REGEX_CHECK_STATUS;

	1493 status = U_INDEX_OUTOFBOUNDS_ERROR;

	1494 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);

	1495 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1496

	1497

	1498 //

	1499 // Split()

	1500 //

	1501 status = U_ZERO_ERROR;

	1502 pat1 = RegexPattern::compile(" +", pe, status);

	1503 REGEX_CHECK_STATUS;

	1504 UnicodeString fields[10];

	1505

	1506 int32_t n;

	1507 n = pat1->split("Now is the time", fields, 10, status);

	1508 REGEX_CHECK_STATUS;

	1509 REGEX_ASSERT(n==4);

	1510 REGEX_ASSERT(fields[0]=="Now");

	1511 REGEX_ASSERT(fields[1]=="is");

	1512 REGEX_ASSERT(fields[2]=="the");

	1513 REGEX_ASSERT(fields[3]=="time");

	1514 REGEX_ASSERT(fields[4]=="");

	1515

	1516 n = pat1->split("Now is the time", fields, 2, status);

	1517 REGEX_CHECK_STATUS;

	1518 REGEX_ASSERT(n==2);

	1519 REGEX_ASSERT(fields[0]=="Now");

	1520 REGEX_ASSERT(fields[1]=="is the time");

	1521 REGEX_ASSERT(fields[2]=="the"); // left over from previous test

	1522

	1523 fields[1] = "*";

	1524 status = U_ZERO_ERROR;

	1525 n = pat1->split("Now is the time", fields, 1, status);

	1526 REGEX_CHECK_STATUS;

	1527 REGEX_ASSERT(n==1);

	1528 REGEX_ASSERT(fields[0]=="Now is the time");

	1529 REGEX_ASSERT(fields[1]=="*");

	1530 status = U_ZERO_ERROR;

	1531

	1532 n = pat1->split(" Now is the time ", fields, 10, status);

	1533 REGEX_CHECK_STATUS;

	1534 REGEX_ASSERT(n==5);

	1535 REGEX_ASSERT(fields[0]=="");

	1536 REGEX_ASSERT(fields[1]=="Now");

	1537 REGEX_ASSERT(fields[2]=="is");

	1538 REGEX_ASSERT(fields[3]=="the");

	1539 REGEX_ASSERT(fields[4]=="time");

	1540 REGEX_ASSERT(fields[5]=="");

	1541

	1542 n = pat1->split(" ", fields, 10, status);

	1543 REGEX_CHECK_STATUS;

	1544 REGEX_ASSERT(n==1);

	1545 REGEX_ASSERT(fields[0]=="");

	1546

	1547 fields[0] = "foo";

	1548 n = pat1->split("", fields, 10, status);

	1549 REGEX_CHECK_STATUS;

	1550 REGEX_ASSERT(n==0);

	1551 REGEX_ASSERT(fields[0]=="foo");

	1552

	1553 delete pat1;

	1554

	1555 // split, with a pattern with (capture)

	1556 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status) ;

	1557 REGEX_CHECK_STATUS;

	1558

	1559 status = U_ZERO_ERROR;

	1560 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);

	1561 REGEX_CHECK_STATUS;

	1562 REGEX_ASSERT(n==6);

	1563 REGEX_ASSERT(fields[0]=="");

	1564 REGEX_ASSERT(fields[1]=="a");

	1565 REGEX_ASSERT(fields[2]=="Now is ");

	1566 REGEX_ASSERT(fields[3]=="b");

	1567 REGEX_ASSERT(fields[4]=="the time");

	1568 REGEX_ASSERT(fields[5]=="c");

	1569 REGEX_ASSERT(fields[6]=="");

	1570 REGEX_ASSERT(status==U_ZERO_ERROR);

	1571

	1572 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);

	1573 REGEX_CHECK_STATUS;

	1574 REGEX_ASSERT(n==6);

	1575 REGEX_ASSERT(fields[0]==" ");

	1576 REGEX_ASSERT(fields[1]=="a");

	1577 REGEX_ASSERT(fields[2]=="Now is ");

	1578 REGEX_ASSERT(fields[3]=="b");

	1579 REGEX_ASSERT(fields[4]=="the time");

	1580 REGEX_ASSERT(fields[5]=="c");

	1581 REGEX_ASSERT(fields[6]=="");

	1582

	1583 status = U_ZERO_ERROR;

	1584 fields[6] = "foo";

	1585 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);

	1586 REGEX_CHECK_STATUS;

	1587 REGEX_ASSERT(n==6);

	1588 REGEX_ASSERT(fields[0]==" ");

	1589 REGEX_ASSERT(fields[1]=="a");

	1590 REGEX_ASSERT(fields[2]=="Now is ");

	1591 REGEX_ASSERT(fields[3]=="b");

	1592 REGEX_ASSERT(fields[4]=="the time");

	1593 REGEX_ASSERT(fields[5]=="c");

	1594 REGEX_ASSERT(fields[6]=="foo");

	1595

	1596 status = U_ZERO_ERROR;

	1597 fields[5] = "foo";

	1598 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);

	1599 REGEX_CHECK_STATUS;

	1600 REGEX_ASSERT(n==5);

	1601 REGEX_ASSERT(fields[0]==" ");

	1602 REGEX_ASSERT(fields[1]=="a");

	1603 REGEX_ASSERT(fields[2]=="Now is ");

	1604 REGEX_ASSERT(fields[3]=="b");

	1605 REGEX_ASSERT(fields[4]=="the time<c>");

	1606 REGEX_ASSERT(fields[5]=="foo");

	1607

	1608 status = U_ZERO_ERROR;

	1609 fields[5] = "foo";

	1610 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);

	1611 REGEX_CHECK_STATUS;

	1612 REGEX_ASSERT(n==5);

	1613 REGEX_ASSERT(fields[0]==" ");

	1614 REGEX_ASSERT(fields[1]=="a");

	1615 REGEX_ASSERT(fields[2]=="Now is ");

	1616 REGEX_ASSERT(fields[3]=="b");

	1617 REGEX_ASSERT(fields[4]=="the time");

	1618 REGEX_ASSERT(fields[5]=="foo");

	1619

	1620 status = U_ZERO_ERROR;

	1621 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);

	1622 REGEX_CHECK_STATUS;

	1623 REGEX_ASSERT(n==4);

	1624 REGEX_ASSERT(fields[0]==" ");

	1625 REGEX_ASSERT(fields[1]=="a");

	1626 REGEX_ASSERT(fields[2]=="Now is ");

	1627 REGEX_ASSERT(fields[3]=="the time<c>");

	1628 status = U_ZERO_ERROR;

	1629 delete pat1;

	1630

	1631 pat1 = RegexPattern::compile("([-,])", pe, status);

	1632 REGEX_CHECK_STATUS;

	1633 n = pat1->split("1-10,20", fields, 10, status);

	1634 REGEX_CHECK_STATUS;

	1635 REGEX_ASSERT(n==5);

	1636 REGEX_ASSERT(fields[0]=="1");

	1637 REGEX_ASSERT(fields[1]=="-");

	1638 REGEX_ASSERT(fields[2]=="10");

	1639 REGEX_ASSERT(fields[3]==",");

	1640 REGEX_ASSERT(fields[4]=="20");

	1641 delete pat1;

	1642

	1643

	1644 //

	1645 // RegexPattern::pattern()

	1646 //

	1647 pat1 = new RegexPattern();

	1648 REGEX_ASSERT(pat1->pattern() == "");

	1649 delete pat1;

	1650

	1651 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);

	1652 REGEX_CHECK_STATUS;

	1653 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");

	1654 delete pat1;

	1655

	1656

	1657 //

	1658 // classID functions

	1659 //

	1660 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);

	1661 REGEX_CHECK_STATUS;

	1662 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());

	1663 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);

	1664 UnicodeString Hello("Hello, world.");

	1665 RegexMatcher *m = pat1->matcher(Hello, status);

	1666 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());

	1667 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());

	1668 REGEX_ASSERT(m->getDynamicClassID() != NULL);

	1669 delete m;

	1670 delete pat1;

	1671

	1672 }

	1673

	1674 //---------------------------------------------------------------------------

	1675 //

	1676 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher

	1677 // is present and working, but excluding functions

	1678 // implementing replace operations.

	1679 //

	1680 //---------------------------------------------------------------------------

	1681 void RegexTest::API_Match_UTF8() {

	1682 UParseError pe;

	1683 UErrorCode status=U_ZERO_ERROR;

	1684 int32_t flags = 0;

	1685

	1686 //

	1687 // Debug - slide failing test cases early

	1688 //

	1689 #if 0

	1690 {

	1691 }

	1692 return;

	1693 #endif

	1694

	1695 //

	1696 // Simple pattern compilation

	1697 //

	1698 {

	1699 UText re = UTEXT_INITIALIZER;

	1700 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);

	1701 RegexPattern *pat2;

	1702 pat2 = RegexPattern::compile(&re, flags, pe, status);

	1703 REGEX_CHECK_STATUS;

	1704

	1705 UText input1 = UTEXT_INITIALIZER;

	1706 UText input2 = UTEXT_INITIALIZER;

	1707 UText empty = UTEXT_INITIALIZER;

	1708 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &st atus);

	1709 REGEX_VERBOSE_TEXT(&input1);

	1710 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);

	1711 REGEX_VERBOSE_TEXT(&input2);

	1712 utext_openUChars(&empty, NULL, 0, &status);

	1713

	1714 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not na tivelen (input1) ? */

	1715 int32_t input2Len = strlen("not abc");

	1716

	1717

	1718 //

	1719 // Matcher creation and reset.

	1720 //

	1721 RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT , status);

	1722 REGEX_CHECK_STATUS;

	1723 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	1724 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x6 6, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */

	1725 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

	1726 m1->reset(&input2);

	1727 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

	1728 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x 00 }; /* not abc */

	1729 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());

	1730 m1->reset(&input1);

	1731 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

	1732 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	1733 m1->reset(&empty);

	1734 REGEX_ASSERT(m1->lookingAt(status) == FALSE);

	1735 REGEX_ASSERT(utext_nativeLength(&empty) == 0);

	1736

	1737 //

	1738 // reset(pos, status)

	1739 //

	1740 m1->reset(&input1);

	1741 m1->reset(4, status);

	1742 REGEX_CHECK_STATUS;

	1743 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());

	1744 REGEX_ASSERT(m1->lookingAt(status) == TRUE);

	1745

	1746 m1->reset(-1, status);

	1747 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1748 status = U_ZERO_ERROR;

	1749

	1750 m1->reset(0, status);

	1751 REGEX_CHECK_STATUS;

	1752 status = U_ZERO_ERROR;

	1753

	1754 m1->reset(input1Len-1, status);

	1755 REGEX_CHECK_STATUS;

	1756 status = U_ZERO_ERROR;

	1757

	1758 m1->reset(input1Len, status);

	1759 REGEX_CHECK_STATUS;

	1760 status = U_ZERO_ERROR;

	1761

	1762 m1->reset(input1Len+1, status);

	1763 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1764 status = U_ZERO_ERROR;

	1765

	1766 //

	1767 // match(pos, status)

	1768 //

	1769 m1->reset(&input2);

	1770 REGEX_ASSERT(m1->matches(4, status) == TRUE);

	1771 m1->reset();

	1772 REGEX_ASSERT(m1->matches(3, status) == FALSE);

	1773 m1->reset();

	1774 REGEX_ASSERT(m1->matches(5, status) == FALSE);

	1775 REGEX_ASSERT(m1->matches(4, status) == TRUE);

	1776 REGEX_ASSERT(m1->matches(-1, status) == FALSE);

	1777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1778

	1779 // Match() at end of string should fail, but should not

	1780 // be an error.

	1781 status = U_ZERO_ERROR;

	1782 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);

	1783 REGEX_CHECK_STATUS;

	1784

	1785 // Match beyond end of string should fail with an error.

	1786 status = U_ZERO_ERROR;

	1787 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);

	1788 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1789

	1790 // Successful match at end of string.

	1791 {

	1792 status = U_ZERO_ERROR;

	1793 RegexMatcher m("A?", 0, status); // will match zero length string.

	1794 REGEX_CHECK_STATUS;

	1795 m.reset(&input1);

	1796 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);

	1797 REGEX_CHECK_STATUS;

	1798 m.reset(&empty);

	1799 REGEX_ASSERT(m.matches(0, status) == TRUE);

	1800 REGEX_CHECK_STATUS;

	1801 }

	1802

	1803

	1804 //

	1805 // lookingAt(pos, status)

	1806 //

	1807 status = U_ZERO_ERROR;

	1808 m1->reset(&input2); // "not abc"

	1809 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

	1810 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);

	1811 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);

	1812 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);

	1813 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);

	1814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1815 status = U_ZERO_ERROR;

	1816 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);

	1817 REGEX_CHECK_STATUS;

	1818 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);

	1819 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	1820

	1821 delete m1;

	1822 delete pat2;

	1823

	1824 utext_close(&re);

	1825 utext_close(&input1);

	1826 utext_close(&input2);

	1827 utext_close(&empty);

	1828 }

	1829

	1830

	1831 //

	1832 // Capture Group.

	1833 // RegexMatcher::start();

	1834 // RegexMatcher::end();

	1835 // RegexMatcher::groupCount();

	1836 //

	1837 {

	1838 int32_t flags=0;

	1839 UParseError pe;

	1840 UErrorCode status=U_ZERO_ERROR;

	1841 UText re=UTEXT_INITIALIZER;

	1842 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x 34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67 )(.) /

	1843 utext_openUTF8(&re, str_01234567_pat, -1, &status);

	1844

	1845 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

	1846 REGEX_CHECK_STATUS;

	1847

	1848 UText input = UTEXT_INITIALIZER;

	1849 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36 , 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */

	1850 utext_openUTF8(&input, str_0123456789, -1, &status);

	1851

	1852 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UT EXT, status);

	1853 REGEX_CHECK_STATUS;

	1854 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);

	1855 static const int32_t matchStarts[] = {0, 2, 4, 8};

	1856 static const int32_t matchEnds[] = {10, 8, 6, 10};

	1857 int32_t i;

	1858 for (i=0; i<4; i++) {

	1859 int32_t actualStart = matcher->start(i, status);

	1860 REGEX_CHECK_STATUS;

	1861 if (actualStart != matchStarts[i]) {

	1862 errln("RegexTest failure at %s:%d, index %d. Expected %d, got % d\n",

	1863 __FILE__, __LINE__, i, matchStarts[i], actualStart);

	1864 }

	1865 int32_t actualEnd = matcher->end(i, status);

	1866 REGEX_CHECK_STATUS;

	1867 if (actualEnd != matchEnds[i]) {

	1868 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d \n",

	1869 __FILE__, __LINE__, i, matchEnds[i], actualEnd);

	1870 }

	1871 }

	1872

	1873 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));

	1874 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));

	1875

	1876 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	1877 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	1878 matcher->reset();

	1879 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);

	1880

	1881 matcher->lookingAt(status);

	1882

	1883 UnicodeString dest;

	1884 UText destText = UTEXT_INITIALIZER;

	1885 utext_openUnicodeString(&destText, &dest, &status);

	1886 UText *result;

	1887 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x 36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */

	1888 // Test shallow-clone API

	1889 int64_t group_len;

	1890 result = matcher->group((UText *)NULL, group_len, status);

	1891 REGEX_CHECK_STATUS;

	1892 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

	1893 utext_close(result);

	1894 result = matcher->group(0, &destText, group_len, status);

	1895 REGEX_CHECK_STATUS;

	1896 REGEX_ASSERT(result == &destText);

	1897 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

	1898 // destText is now immutable, reopen it

	1899 utext_close(&destText);

	1900 utext_openUnicodeString(&destText, &dest, &status);

	1901

	1902 result = matcher->group(0, NULL, status);

	1903 REGEX_CHECK_STATUS;

	1904 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

	1905 utext_close(result);

	1906 result = matcher->group(0, &destText, status);

	1907 REGEX_CHECK_STATUS;

	1908 REGEX_ASSERT(result == &destText);

	1909 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);

	1910

	1911 result = matcher->group(1, NULL, status);

	1912 REGEX_CHECK_STATUS;

	1913 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */

	1914 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);

	1915 utext_close(result);

	1916 result = matcher->group(1, &destText, status);

	1917 REGEX_CHECK_STATUS;

	1918 REGEX_ASSERT(result == &destText);

	1919 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);

	1920

	1921 result = matcher->group(2, NULL, status);

	1922 REGEX_CHECK_STATUS;

	1923 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */

	1924 REGEX_ASSERT_UTEXT_UTF8(str_45, result);

	1925 utext_close(result);

	1926 result = matcher->group(2, &destText, status);

	1927 REGEX_CHECK_STATUS;

	1928 REGEX_ASSERT(result == &destText);

	1929 REGEX_ASSERT_UTEXT_UTF8(str_45, result);

	1930

	1931 result = matcher->group(3, NULL, status);

	1932 REGEX_CHECK_STATUS;

	1933 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */

	1934 REGEX_ASSERT_UTEXT_UTF8(str_89, result);

	1935 utext_close(result);

	1936 result = matcher->group(3, &destText, status);

	1937 REGEX_CHECK_STATUS;

	1938 REGEX_ASSERT(result == &destText);

	1939 REGEX_ASSERT_UTEXT_UTF8(str_89, result);

	1940

	1941 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	1942 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;

	1943 matcher->reset();

	1944 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);

	1945

	1946 delete matcher;

	1947 delete pat;

	1948

	1949 utext_close(&destText);

	1950 utext_close(&input);

	1951 utext_close(&re);

	1952 }

	1953

	1954 //

	1955 // find

	1956 //

	1957 {

	1958 int32_t flags=0;

	1959 UParseError pe;

	1960 UErrorCode status=U_ZERO_ERROR;

	1961 UText re=UTEXT_INITIALIZER;

	1962 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

	1963 utext_openUTF8(&re, str_abc, -1, &status);

	1964

	1965 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

	1966 REGEX_CHECK_STATUS;

	1967 UText input = UTEXT_INITIALIZER;

	1968 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..ab c...abc.. */

	1969 utext_openUTF8(&input, str_abcabcabc, -1, &status);

	1970 // 012345678901234567

	1971

	1972 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UT EXT, status);

	1973 REGEX_CHECK_STATUS;

	1974 REGEX_ASSERT(matcher->find());

	1975 REGEX_ASSERT(matcher->start(status) == 1);

	1976 REGEX_ASSERT(matcher->find());

	1977 REGEX_ASSERT(matcher->start(status) == 6);

	1978 REGEX_ASSERT(matcher->find());

	1979 REGEX_ASSERT(matcher->start(status) == 12);

	1980 REGEX_ASSERT(matcher->find() == FALSE);

	1981 REGEX_ASSERT(matcher->find() == FALSE);

	1982

	1983 matcher->reset();

	1984 REGEX_ASSERT(matcher->find());

	1985 REGEX_ASSERT(matcher->start(status) == 1);

	1986

	1987 REGEX_ASSERT(matcher->find(0, status));

	1988 REGEX_ASSERT(matcher->start(status) == 1);

	1989 REGEX_ASSERT(matcher->find(1, status));

	1990 REGEX_ASSERT(matcher->start(status) == 1);

	1991 REGEX_ASSERT(matcher->find(2, status));

	1992 REGEX_ASSERT(matcher->start(status) == 6);

	1993 REGEX_ASSERT(matcher->find(12, status));

	1994 REGEX_ASSERT(matcher->start(status) == 12);

	1995 REGEX_ASSERT(matcher->find(13, status) == FALSE);

	1996 REGEX_ASSERT(matcher->find(16, status) == FALSE);

	1997 REGEX_ASSERT(matcher->find(17, status) == FALSE);

	1998 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);

	1999

	2000 status = U_ZERO_ERROR;

	2001 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);

	2002 status = U_ZERO_ERROR;

	2003 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);

	2004

	2005 REGEX_ASSERT(matcher->groupCount() == 0);

	2006

	2007 delete matcher;

	2008 delete pat;

	2009

	2010 utext_close(&input);

	2011 utext_close(&re);

	2012 }

	2013

	2014

	2015 //

	2016 // find, with \G in pattern (true if at the end of a previous match).

	2017 //

	2018 {

	2019 int32_t flags=0;

	2020 UParseError pe;

	2021 UErrorCode status=U_ZERO_ERROR;

	2022 UText re=UTEXT_INITIALIZER;

	2023 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0 x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x0 0 }; /* .?(?:(\\Gabc)\|(abc)) /

	2024 utext_openUTF8(&re, str_Gabcabc, -1, &status);

	2025

	2026 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

	2027

	2028 REGEX_CHECK_STATUS;

	2029 UText input = UTEXT_INITIALIZER;

	2030 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */

	2031 utext_openUTF8(&input, str_abcabcabc, -1, &status);

	2032 // 012345678901234567

	2033

	2034 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UT EXT, status);

	2035 REGEX_CHECK_STATUS;

	2036 REGEX_ASSERT(matcher->find());

	2037 REGEX_ASSERT(matcher->start(status) == 0);

	2038 REGEX_ASSERT(matcher->start(1, status) == -1);

	2039 REGEX_ASSERT(matcher->start(2, status) == 1);

	2040

	2041 REGEX_ASSERT(matcher->find());

	2042 REGEX_ASSERT(matcher->start(status) == 4);

	2043 REGEX_ASSERT(matcher->start(1, status) == 4);

	2044 REGEX_ASSERT(matcher->start(2, status) == -1);

	2045 REGEX_CHECK_STATUS;

	2046

	2047 delete matcher;

	2048 delete pat;

	2049

	2050 utext_close(&input);

	2051 utext_close(&re);

	2052 }

	2053

	2054 //

	2055 // find with zero length matches, match position should bump ahead

	2056 // to prevent loops.

	2057 //

	2058 {

	2059 int32_t i;

	2060 UErrorCode status=U_ZERO_ERROR;

	2061 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero- length matches anywhere,

	2062 // using an always-true look-ahead.

	2063 REGEX_CHECK_STATUS;

	2064 UText s = UTEXT_INITIALIZER;

	2065 utext_openUTF8(&s, " ", -1, &status);

	2066 m.reset(&s);

	2067 for (i=0; ; i++) {

	2068 if (m.find() == FALSE) {

	2069 break;

	2070 }

	2071 REGEX_ASSERT(m.start(status) == i);

	2072 REGEX_ASSERT(m.end(status) == i);

	2073 }

	2074 REGEX_ASSERT(i==5);

	2075

	2076 // Check that the bump goes over characters outside the BMP OK

	2077 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8

	2078 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x 82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};

	2079 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);

	2080 m.reset(&s);

	2081 for (i=0; ; i+=4) {

	2082 if (m.find() == FALSE) {

	2083 break;

	2084 }

	2085 REGEX_ASSERT(m.start(status) == i);

	2086 REGEX_ASSERT(m.end(status) == i);

	2087 }

	2088 REGEX_ASSERT(i==20);

	2089

	2090 utext_close(&s);

	2091 }

	2092 {

	2093 // find() loop breaking test.

	2094 // with pattern of /.?/, should see a series of one char matches, then a single

	2095 // match of zero length at the end of the input string.

	2096 int32_t i;

	2097 UErrorCode status=U_ZERO_ERROR;

	2098 RegexMatcher m(".?", 0, status);

	2099 REGEX_CHECK_STATUS;

	2100 UText s = UTEXT_INITIALIZER;

	2101 utext_openUTF8(&s, " ", -1, &status);

	2102 m.reset(&s);

	2103 for (i=0; ; i++) {

	2104 if (m.find() == FALSE) {

	2105 break;

	2106 }

	2107 REGEX_ASSERT(m.start(status) == i);

	2108 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));

	2109 }

	2110 REGEX_ASSERT(i==5);

	2111

	2112 utext_close(&s);

	2113 }

	2114

	2115

	2116 //

	2117 // Matchers with no input string behave as if they had an empty input string.

	2118 //

	2119

	2120 {

	2121 UErrorCode status = U_ZERO_ERROR;

	2122 RegexMatcher m(".?", 0, status);

	2123 REGEX_CHECK_STATUS;

	2124 REGEX_ASSERT(m.find());

	2125 REGEX_ASSERT(m.start(status) == 0);

	2126 REGEX_ASSERT(m.input() == "");

	2127 }

	2128 {

	2129 UErrorCode status = U_ZERO_ERROR;

	2130 RegexPattern *p = RegexPattern::compile(".", 0, status);

	2131 RegexMatcher *m = p->matcher(status);

	2132 REGEX_CHECK_STATUS;

	2133

	2134 REGEX_ASSERT(m->find() == FALSE);

	2135 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);

	2136 delete m;

	2137 delete p;

	2138 }

	2139

	2140 //

	2141 // Regions

	2142 //

	2143 {

	2144 UErrorCode status = U_ZERO_ERROR;

	2145 UText testPattern = UTEXT_INITIALIZER;

	2146 UText testText = UTEXT_INITIALIZER;

	2147 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);

	2148 REGEX_VERBOSE_TEXT(&testPattern);

	2149 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &stat us);

	2150 REGEX_VERBOSE_TEXT(&testText);

	2151

	2152 RegexMatcher m(&testPattern, &testText, 0, status);

	2153 REGEX_CHECK_STATUS;

	2154 REGEX_ASSERT(m.regionStart() == 0);

	2155 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));

	2156 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	2157 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	2158

	2159 m.region(2,4, status);

	2160 REGEX_CHECK_STATUS;

	2161 REGEX_ASSERT(m.matches(status));

	2162 REGEX_ASSERT(m.start(status)==2);

	2163 REGEX_ASSERT(m.end(status)==4);

	2164 REGEX_CHECK_STATUS;

	2165

	2166 m.reset();

	2167 REGEX_ASSERT(m.regionStart() == 0);

	2168 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));

	2169

	2170 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);

	2171 REGEX_VERBOSE_TEXT(&testText);

	2172 m.reset(&testText);

	2173 REGEX_ASSERT(m.regionStart() == 0);

	2174 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));

	2175

	2176 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	2177 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));

	2178 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

	2179 REGEX_ASSERT(&m == &m.reset());

	2180 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);

	2181

	2182 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));

	2183 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	2184 REGEX_ASSERT(&m == &m.reset());

	2185 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);

	2186

	2187 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	2188 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));

	2189 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

	2190 REGEX_ASSERT(&m == &m.reset());

	2191 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);

	2192

	2193 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));

	2194 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	2195 REGEX_ASSERT(&m == &m.reset());

	2196 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);

	2197

	2198 utext_close(&testText);

	2199 utext_close(&testPattern);

	2200 }

	2201

	2202 //

	2203 // hitEnd() and requireEnd()

	2204 //

	2205 {

	2206 UErrorCode status = U_ZERO_ERROR;

	2207 UText testPattern = UTEXT_INITIALIZER;

	2208 UText testText = UTEXT_INITIALIZER;

	2209 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */

	2210 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */

	2211 utext_openUTF8(&testPattern, str_, -1, &status);

	2212 utext_openUTF8(&testText, str_aabb, -1, &status);

	2213

	2214 RegexMatcher m1(&testPattern, &testText, 0, status);

	2215 REGEX_ASSERT(m1.lookingAt(status) == TRUE);

	2216 REGEX_ASSERT(m1.hitEnd() == TRUE);

	2217 REGEX_ASSERT(m1.requireEnd() == FALSE);

	2218 REGEX_CHECK_STATUS;

	2219

	2220 status = U_ZERO_ERROR;

	2221 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */

	2222 utext_openUTF8(&testPattern, str_a, -1, &status);

	2223 RegexMatcher m2(&testPattern, &testText, 0, status);

	2224 REGEX_ASSERT(m2.lookingAt(status) == TRUE);

	2225 REGEX_ASSERT(m2.hitEnd() == FALSE);

	2226 REGEX_ASSERT(m2.requireEnd() == FALSE);

	2227 REGEX_CHECK_STATUS;

	2228

	2229 status = U_ZERO_ERROR;

	2230 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .$ /

	2231 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);

	2232 RegexMatcher m3(&testPattern, &testText, 0, status);

	2233 REGEX_ASSERT(m3.lookingAt(status) == TRUE);

	2234 REGEX_ASSERT(m3.hitEnd() == TRUE);

	2235 REGEX_ASSERT(m3.requireEnd() == TRUE);

	2236 REGEX_CHECK_STATUS;

	2237

	2238 utext_close(&testText);

	2239 utext_close(&testPattern);

	2240 }

	2241 }

	2242

	2243

	2244 //---------------------------------------------------------------------------

	2245 //

	2246 // API_Replace_UTF8 API test for class RegexMatcher, testing the

	2247 // Replace family of functions.

	2248 //

	2249 //---------------------------------------------------------------------------

	2250 void RegexTest::API_Replace_UTF8() {

	2251 //

	2252 // Replace

	2253 //

	2254 int32_t flags=0;

	2255 UParseError pe;

	2256 UErrorCode status=U_ZERO_ERROR;

	2257

	2258 UText re=UTEXT_INITIALIZER;

	2259 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);

	2260 REGEX_VERBOSE_TEXT(&re);

	2261 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);

	2262 REGEX_CHECK_STATUS;

	2263

	2264 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */

	2265 // 012345678901234567

	2266 UText dataText = UTEXT_INITIALIZER;

	2267 utext_openUTF8(&dataText, data, -1, &status);

	2268 REGEX_CHECK_STATUS;

	2269 REGEX_VERBOSE_TEXT(&dataText);

	2270 RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTE XT, status);

	2271

	2272 //

	2273 // Plain vanilla matches.

	2274 //

	2275 UnicodeString dest;

	2276 UText destText = UTEXT_INITIALIZER;

	2277 utext_openUnicodeString(&destText, &dest, &status);

	2278 UText *result;

	2279

	2280 UText replText = UTEXT_INITIALIZER;

	2281

	2282 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */

	2283 utext_openUTF8(&replText, str_yz, -1, &status);

	2284 REGEX_VERBOSE_TEXT(&replText);

	2285 result = matcher->replaceFirst(&replText, NULL, status);

	2286 REGEX_CHECK_STATUS;

	2287 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63 , 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */

	2288 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

	2289 utext_close(result);

	2290 result = matcher->replaceFirst(&replText, &destText, status);

	2291 REGEX_CHECK_STATUS;

	2292 REGEX_ASSERT(result == &destText);

	2293 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

	2294

	2295 result = matcher->replaceAll(&replText, NULL, status);

	2296 REGEX_CHECK_STATUS;

	2297 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */

	2298 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

	2299 utext_close(result);

	2300

	2301 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2302 result = matcher->replaceAll(&replText, &destText, status);

	2303 REGEX_CHECK_STATUS;

	2304 REGEX_ASSERT(result == &destText);

	2305 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

	2306

	2307 //

	2308 // Plain vanilla non-matches.

	2309 //

	2310 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x6 2, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx... abx.. */

	2311 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);

	2312 matcher->reset(&dataText);

	2313

	2314 result = matcher->replaceFirst(&replText, NULL, status);

	2315 REGEX_CHECK_STATUS;

	2316 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

	2317 utext_close(result);

	2318 result = matcher->replaceFirst(&replText, &destText, status);

	2319 REGEX_CHECK_STATUS;

	2320 REGEX_ASSERT(result == &destText);

	2321 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

	2322

	2323 result = matcher->replaceAll(&replText, NULL, status);

	2324 REGEX_CHECK_STATUS;

	2325 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

	2326 utext_close(result);

	2327 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2328 result = matcher->replaceAll(&replText, &destText, status);

	2329 REGEX_CHECK_STATUS;

	2330 REGEX_ASSERT(result == &destText);

	2331 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

	2332

	2333 //

	2334 // Empty source string

	2335 //

	2336 utext_openUTF8(&dataText, NULL, 0, &status);

	2337 matcher->reset(&dataText);

	2338

	2339 result = matcher->replaceFirst(&replText, NULL, status);

	2340 REGEX_CHECK_STATUS;

	2341 REGEX_ASSERT_UTEXT_UTF8("", result);

	2342 utext_close(result);

	2343 result = matcher->replaceFirst(&replText, &destText, status);

	2344 REGEX_CHECK_STATUS;

	2345 REGEX_ASSERT(result == &destText);

	2346 REGEX_ASSERT_UTEXT_UTF8("", result);

	2347

	2348 result = matcher->replaceAll(&replText, NULL, status);

	2349 REGEX_CHECK_STATUS;

	2350 REGEX_ASSERT_UTEXT_UTF8("", result);

	2351 utext_close(result);

	2352 result = matcher->replaceAll(&replText, &destText, status);

	2353 REGEX_CHECK_STATUS;

	2354 REGEX_ASSERT(result == &destText);

	2355 REGEX_ASSERT_UTEXT_UTF8("", result);

	2356

	2357 //

	2358 // Empty substitution string

	2359 //

	2360 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."

	2361 matcher->reset(&dataText);

	2362

	2363 utext_openUTF8(&replText, NULL, 0, &status);

	2364 result = matcher->replaceFirst(&replText, NULL, status);

	2365 REGEX_CHECK_STATUS;

	2366 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */

	2367 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

	2368 utext_close(result);

	2369 result = matcher->replaceFirst(&replText, &destText, status);

	2370 REGEX_CHECK_STATUS;

	2371 REGEX_ASSERT(result == &destText);

	2372 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

	2373

	2374 result = matcher->replaceAll(&replText, NULL, status);

	2375 REGEX_CHECK_STATUS;

	2376 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x 00 }; /* ........ */

	2377 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

	2378 utext_close(result);

	2379 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2380 result = matcher->replaceAll(&replText, &destText, status);

	2381 REGEX_CHECK_STATUS;

	2382 REGEX_ASSERT(result == &destText);

	2383 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

	2384

	2385 //

	2386 // match whole string

	2387 //

	2388 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

	2389 utext_openUTF8(&dataText, str_abc, -1, &status);

	2390 matcher->reset(&dataText);

	2391

	2392 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */

	2393 utext_openUTF8(&replText, str_xyz, -1, &status);

	2394 result = matcher->replaceFirst(&replText, NULL, status);

	2395 REGEX_CHECK_STATUS;

	2396 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

	2397 utext_close(result);

	2398 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2399 result = matcher->replaceFirst(&replText, &destText, status);

	2400 REGEX_CHECK_STATUS;

	2401 REGEX_ASSERT(result == &destText);

	2402 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

	2403

	2404 result = matcher->replaceAll(&replText, NULL, status);

	2405 REGEX_CHECK_STATUS;

	2406 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

	2407 utext_close(result);

	2408 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2409 result = matcher->replaceAll(&replText, &destText, status);

	2410 REGEX_CHECK_STATUS;

	2411 REGEX_ASSERT(result == &destText);

	2412 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

	2413

	2414 //

	2415 // Capture Group, simple case

	2416 //

	2417 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */

	2418 utext_openUTF8(&re, str_add, -1, &status);

	2419 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);

	2420 REGEX_CHECK_STATUS;

	2421

	2422 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */

	2423 utext_openUTF8(&dataText, str_abcdefg, -1, &status);

	2424 RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_U TEXT, status);

	2425 REGEX_CHECK_STATUS;

	2426

	2427 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */

	2428 utext_openUTF8(&replText, str_11, -1, &status);

	2429 result = matcher2->replaceFirst(&replText, NULL, status);

	2430 REGEX_CHECK_STATUS;

	2431 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67 , 0x00 }; /* bcbcdefg */

	2432 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

	2433 utext_close(result);

	2434 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2435 result = matcher2->replaceFirst(&replText, &destText, status);

	2436 REGEX_CHECK_STATUS;

	2437 REGEX_ASSERT(result == &destText);

	2438 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

	2439

	2440 regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &s tatus);

	2441 result = matcher2->replaceFirst(&replText, NULL, status);

	2442 REGEX_CHECK_STATUS;

	2443 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0 x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg * /

	2444 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

	2445 utext_close(result);

	2446 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2447 result = matcher2->replaceFirst(&replText, &destText, status);

	2448 REGEX_CHECK_STATUS;

	2449 REGEX_ASSERT(result == &destText);

	2450 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

	2451

	2452 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x6 9, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0 x00 }; /* $ by itself, no group number $$$ */

	2453 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);

	2454 result = matcher2->replaceFirst(&replText, NULL, status);

	2455 REGEX_CHECK_STATUS;

	2456 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0 x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2 4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */

	2457 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

	2458 utext_close(result);

	2459 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2460 result = matcher2->replaceFirst(&replText, &destText, status);

	2461 REGEX_CHECK_STATUS;

	2462 REGEX_ASSERT(result == &destText);

	2463 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

	2464

	2465 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d , 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */

	2466 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE

	2467 // 012345678901234567890123456

	2468 supplDigitChars[22] = 0xF0;

	2469 supplDigitChars[23] = 0x9D;

	2470 supplDigitChars[24] = 0x9F;

	2471 supplDigitChars[25] = 0x8F;

	2472 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);

	2473

	2474 result = matcher2->replaceFirst(&replText, NULL, status);

	2475 REGEX_CHECK_STATUS;

	2476 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x 20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplementa l Digit 1 bc.defg */

	2477 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

	2478 utext_close(result);

	2479 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2480 result = matcher2->replaceFirst(&replText, &destText, status);

	2481 REGEX_CHECK_STATUS;

	2482 REGEX_ASSERT(result == &destText);

	2483 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

	2484 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x 61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e , 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */

	2485 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);

	2486 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)) , U_INDEX_OUTOFBOUNDS_ERROR);

	2487 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

	2488 utext_close(result);

	2489 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;

	2490 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, sta tus)), U_INDEX_OUTOFBOUNDS_ERROR);

	2491 REGEX_ASSERT(result == &destText);

	2492 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

	2493

	2494 //

	2495 // Replacement String with \u hex escapes

	2496 //

	2497 {

	2498 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61 , 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 a bc 2 abc 3 */

	2499 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */

	2500 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);

	2501 utext_openUTF8(&replText, str_u0043, -1, &status);

	2502 matcher->reset(&dataText);

	2503

	2504 result = matcher->replaceAll(&replText, NULL, status);

	2505 REGEX_CHECK_STATUS;

	2506 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x 20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d , 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */

	2507 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);

	2508 utext_close(result);

	2509 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &sta tus);

	2510 result = matcher->replaceAll(&replText, &destText, status);

	2511 REGEX_CHECK_STATUS;

	2512 REGEX_ASSERT(result == &destText);

	2513 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);

	2514 }

	2515 {

	2516 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */

	2517 utext_openUTF8(&dataText, str_abc, -1, &status);

	2518 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */

	2519 utext_openUTF8(&replText, str_U00010000, -1, &status);

	2520 matcher->reset(&dataText);

	2521

	2522 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0 x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"

	2523 // 0123456789

	2524 expected[2] = 0xF0;

	2525 expected[3] = 0x90;

	2526 expected[4] = 0x80;

	2527 expected[5] = 0x80;

	2528

	2529 result = matcher->replaceAll(&replText, NULL, status);

	2530 REGEX_CHECK_STATUS;

	2531 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);

	2532 utext_close(result);

	2533 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &sta tus);

	2534 result = matcher->replaceAll(&replText, &destText, status);

	2535 REGEX_CHECK_STATUS;

	2536 REGEX_ASSERT(result == &destText);

	2537 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);

	2538 }

	2539 // TODO: need more thorough testing of capture substitutions.

	2540

	2541 // Bug 4057

	2542 //

	2543 {

	2544 status = U_ZERO_ERROR;

	2545 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.?)ee /

	2546 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x 20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69 , 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start wit h ss and end with ee ss stuff ee fin */

	2547 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */

	2548 utext_openUTF8(&re, str_ssee, -1, &status);

	2549 utext_openUTF8(&dataText, str_blah, -1, &status);

	2550 utext_openUTF8(&replText, str_ooh, -1, &status);

	2551

	2552 RegexMatcher m(&re, 0, status);

	2553 REGEX_CHECK_STATUS;

	2554

	2555 UnicodeString result;

	2556 UText resultText = UTEXT_INITIALIZER;

	2557 utext_openUnicodeString(&resultText, &result, &status);

	2558

	2559 // Multiple finds do NOT bump up the previous appendReplacement position.

	2560 m.reset(&dataText);

	2561 m.find();

	2562 m.find();

	2563 m.appendReplacement(&resultText, &replText, status);

	2564 REGEX_CHECK_STATUS;

	2565 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

	2566 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);

	2567

	2568 // After a reset into the interior of a string, appendReplacement still starts at beginning.

	2569 status = U_ZERO_ERROR;

	2570 result.truncate(0);

	2571 utext_openUnicodeString(&resultText, &result, &status);

	2572 m.reset(10, status);

	2573 m.find();

	2574 m.find();

	2575 m.appendReplacement(&resultText, &replText, status);

	2576 REGEX_CHECK_STATUS;

	2577 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

	2578 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);

	2579

	2580 // find() at interior of string, appendReplacement still starts at beginning.

	2581 status = U_ZERO_ERROR;

	2582 result.truncate(0);

	2583 utext_openUnicodeString(&resultText, &result, &status);

	2584 m.reset();

	2585 m.find(10, status);

	2586 m.find();

	2587 m.appendReplacement(&resultText, &replText, status);

	2588 REGEX_CHECK_STATUS;

	2589 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */

	2590 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);

	2591

	2592 m.appendTail(&resultText, status);

	2593 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x6 3, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0 x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x6 9, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */

	2594 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);

	2595

	2596 utext_close(&resultText);

	2597 }

	2598

	2599 delete matcher2;

	2600 delete pat2;

	2601 delete matcher;

	2602 delete pat;

	2603

	2604 utext_close(&dataText);

	2605 utext_close(&replText);

	2606 utext_close(&destText);

	2607 utext_close(&re);

	2608 }

	2609

	2610

	2611 //---------------------------------------------------------------------------

	2612 //

	2613 // API_Pattern_UTF8 Test that the API for class RegexPattern is

	2614 // present and nominally working.

	2615 //

	2616 //---------------------------------------------------------------------------

	2617 void RegexTest::API_Pattern_UTF8() {

	2618 RegexPattern pata; // Test default constructor to not crash.

	2619 RegexPattern patb;

	2620

	2621 REGEX_ASSERT(pata == patb);

	2622 REGEX_ASSERT(pata == pata);

	2623

	2624 UText re1 = UTEXT_INITIALIZER;

	2625 UText re2 = UTEXT_INITIALIZER;

	2626 UErrorCode status = U_ZERO_ERROR;

	2627 UParseError pe;

	2628

	2629 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */

	2630 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */

	2631 utext_openUTF8(&re1, str_abcalmz, -1, &status);

	2632 utext_openUTF8(&re2, str_def, -1, &status);

	2633

	2634 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);

	2635 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);

	2636 REGEX_CHECK_STATUS;

	2637 REGEX_ASSERT(pat1 == pat1);

	2638 REGEX_ASSERT(*pat1 != pata);

	2639

	2640 // Assign

	2641 patb = *pat1;

	2642 REGEX_ASSERT(patb == *pat1);

	2643

	2644 // Copy Construct

	2645 RegexPattern patc(*pat1);

	2646 REGEX_ASSERT(patc == *pat1);

	2647 REGEX_ASSERT(patb == patc);

	2648 REGEX_ASSERT(pat1 != pat2);

	2649 patb = *pat2;

	2650 REGEX_ASSERT(patb != patc);

	2651 REGEX_ASSERT(patb == *pat2);

	2652

	2653 // Compile with no flags.

	2654 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);

	2655 REGEX_ASSERT(pat1a == pat1);

	2656

	2657 REGEX_ASSERT(pat1a->flags() == 0);

	2658

	2659 // Compile with different flags should be not equal

	2660 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSI TIVE, pe, status);

	2661 REGEX_CHECK_STATUS;

	2662

	2663 REGEX_ASSERT(pat1b != pat1a);

	2664 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);

	2665 REGEX_ASSERT(pat1a->flags() == 0);

	2666 delete pat1b;

	2667

	2668 // clone

	2669 RegexPattern *pat1c = pat1->clone();

	2670 REGEX_ASSERT(pat1c == pat1);

	2671 REGEX_ASSERT(pat1c != pat2);

	2672

	2673 delete pat1c;

	2674 delete pat1a;

	2675 delete pat1;

	2676 delete pat2;

	2677

	2678 utext_close(&re1);

	2679 utext_close(&re2);

	2680

	2681

	2682 //

	2683 // Verify that a matcher created from a cloned pattern works.

	2684 // (Jitterbug 3423)

	2685 //

	2686 {

	2687 UErrorCode status = U_ZERO_ERROR;

	2688 UText pattern = UTEXT_INITIALIZER;

	2689 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \ p{L}+ */

	2690 utext_openUTF8(&pattern, str_pL, -1, &status);

	2691

	2692 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);

	2693 RegexPattern *pClone = pSource->clone();

	2694 delete pSource;

	2695 RegexMatcher *mFromClone = pClone->matcher(status);

	2696 REGEX_CHECK_STATUS;

	2697

	2698 UText input = UTEXT_INITIALIZER;

	2699 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57 , 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */

	2700 utext_openUTF8(&input, str_HelloWorld, -1, &status);

	2701 mFromClone->reset(&input);

	2702 REGEX_ASSERT(mFromClone->find() == TRUE);

	2703 REGEX_ASSERT(mFromClone->group(status) == "Hello");

	2704 REGEX_ASSERT(mFromClone->find() == TRUE);

	2705 REGEX_ASSERT(mFromClone->group(status) == "World");

	2706 REGEX_ASSERT(mFromClone->find() == FALSE);

	2707 delete mFromClone;

	2708 delete pClone;

	2709

	2710 utext_close(&input);

	2711 utext_close(&pattern);

	2712 }

	2713

	2714 //

	2715 // matches convenience API

	2716 //

	2717 {

	2718 UErrorCode status = U_ZERO_ERROR;

	2719 UText pattern = UTEXT_INITIALIZER;

	2720 UText input = UTEXT_INITIALIZER;

	2721

	2722 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x2 0, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */

	2723 utext_openUTF8(&input, str_randominput, -1, &status);

	2724

	2725 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */

	2726 utext_openUTF8(&pattern, str_dotstar, -1, &status);

	2727 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE );

	2728 REGEX_CHECK_STATUS;

	2729

	2730 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */

	2731 utext_openUTF8(&pattern, str_abc, -1, &status);

	2732 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);

	2733 REGEX_CHECK_STATUS;

	2734

	2735 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .nput /

	2736 utext_openUTF8(&pattern, str_nput, -1, &status);

	2737 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);

	2738 REGEX_CHECK_STATUS;

	2739

	2740 utext_openUTF8(&pattern, str_randominput, -1, &status);

	2741 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, s tatus) == TRUE);

	2742 REGEX_CHECK_STATUS;

	2743

	2744 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .u /

	2745 utext_openUTF8(&pattern, str_u, -1, &status);

	2746 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);

	2747 REGEX_CHECK_STATUS;

	2748

	2749 utext_openUTF8(&input, str_abc, -1, &status);

	2750 utext_openUTF8(&pattern, str_abc, -1, &status);

	2751 status = U_INDEX_OUTOFBOUNDS_ERROR;

	2752 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);

	2753 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	2754

	2755 utext_close(&input);

	2756 utext_close(&pattern);

	2757 }

	2758

	2759

	2760 //

	2761 // Split()

	2762 //

	2763 status = U_ZERO_ERROR;

	2764 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */

	2765 utext_openUTF8(&re1, str_spaceplus, -1, &status);

	2766 pat1 = RegexPattern::compile(&re1, pe, status);

	2767 REGEX_CHECK_STATUS;

	2768 UnicodeString fields[10];

	2769

	2770 int32_t n;

	2771 n = pat1->split("Now is the time", fields, 10, status);

	2772 REGEX_CHECK_STATUS;

	2773 REGEX_ASSERT(n==4);

	2774 REGEX_ASSERT(fields[0]=="Now");

	2775 REGEX_ASSERT(fields[1]=="is");

	2776 REGEX_ASSERT(fields[2]=="the");

	2777 REGEX_ASSERT(fields[3]=="time");

	2778 REGEX_ASSERT(fields[4]=="");

	2779

	2780 n = pat1->split("Now is the time", fields, 2, status);

	2781 REGEX_CHECK_STATUS;

	2782 REGEX_ASSERT(n==2);

	2783 REGEX_ASSERT(fields[0]=="Now");

	2784 REGEX_ASSERT(fields[1]=="is the time");

	2785 REGEX_ASSERT(fields[2]=="the"); // left over from previous test

	2786

	2787 fields[1] = "*";

	2788 status = U_ZERO_ERROR;

	2789 n = pat1->split("Now is the time", fields, 1, status);

	2790 REGEX_CHECK_STATUS;

	2791 REGEX_ASSERT(n==1);

	2792 REGEX_ASSERT(fields[0]=="Now is the time");

	2793 REGEX_ASSERT(fields[1]=="*");

	2794 status = U_ZERO_ERROR;

	2795

	2796 n = pat1->split(" Now is the time ", fields, 10, status);

	2797 REGEX_CHECK_STATUS;

	2798 REGEX_ASSERT(n==5);

	2799 REGEX_ASSERT(fields[0]=="");

	2800 REGEX_ASSERT(fields[1]=="Now");

	2801 REGEX_ASSERT(fields[2]=="is");

	2802 REGEX_ASSERT(fields[3]=="the");

	2803 REGEX_ASSERT(fields[4]=="time");

	2804 REGEX_ASSERT(fields[5]=="");

	2805

	2806 n = pat1->split(" ", fields, 10, status);

	2807 REGEX_CHECK_STATUS;

	2808 REGEX_ASSERT(n==1);

	2809 REGEX_ASSERT(fields[0]=="");

	2810

	2811 fields[0] = "foo";

	2812 n = pat1->split("", fields, 10, status);

	2813 REGEX_CHECK_STATUS;

	2814 REGEX_ASSERT(n==0);

	2815 REGEX_ASSERT(fields[0]=="foo");

	2816

	2817 delete pat1;

	2818

	2819 // split, with a pattern with (capture)

	2820 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);

	2821 pat1 = RegexPattern::compile(&re1, pe, status);

	2822 REGEX_CHECK_STATUS;

	2823

	2824 status = U_ZERO_ERROR;

	2825 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);

	2826 REGEX_CHECK_STATUS;

	2827 REGEX_ASSERT(n==6);

	2828 REGEX_ASSERT(fields[0]=="");

	2829 REGEX_ASSERT(fields[1]=="a");

	2830 REGEX_ASSERT(fields[2]=="Now is ");

	2831 REGEX_ASSERT(fields[3]=="b");

	2832 REGEX_ASSERT(fields[4]=="the time");

	2833 REGEX_ASSERT(fields[5]=="c");

	2834 REGEX_ASSERT(fields[6]=="");

	2835 REGEX_ASSERT(status==U_ZERO_ERROR);

	2836

	2837 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);

	2838 REGEX_CHECK_STATUS;

	2839 REGEX_ASSERT(n==6);

	2840 REGEX_ASSERT(fields[0]==" ");

	2841 REGEX_ASSERT(fields[1]=="a");

	2842 REGEX_ASSERT(fields[2]=="Now is ");

	2843 REGEX_ASSERT(fields[3]=="b");

	2844 REGEX_ASSERT(fields[4]=="the time");

	2845 REGEX_ASSERT(fields[5]=="c");

	2846 REGEX_ASSERT(fields[6]=="");

	2847

	2848 status = U_ZERO_ERROR;

	2849 fields[6] = "foo";

	2850 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);

	2851 REGEX_CHECK_STATUS;

	2852 REGEX_ASSERT(n==6);

	2853 REGEX_ASSERT(fields[0]==" ");

	2854 REGEX_ASSERT(fields[1]=="a");

	2855 REGEX_ASSERT(fields[2]=="Now is ");

	2856 REGEX_ASSERT(fields[3]=="b");

	2857 REGEX_ASSERT(fields[4]=="the time");

	2858 REGEX_ASSERT(fields[5]=="c");

	2859 REGEX_ASSERT(fields[6]=="foo");

	2860

	2861 status = U_ZERO_ERROR;

	2862 fields[5] = "foo";

	2863 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);

	2864 REGEX_CHECK_STATUS;

	2865 REGEX_ASSERT(n==5);

	2866 REGEX_ASSERT(fields[0]==" ");

	2867 REGEX_ASSERT(fields[1]=="a");

	2868 REGEX_ASSERT(fields[2]=="Now is ");

	2869 REGEX_ASSERT(fields[3]=="b");

	2870 REGEX_ASSERT(fields[4]=="the time<c>");

	2871 REGEX_ASSERT(fields[5]=="foo");

	2872

	2873 status = U_ZERO_ERROR;

	2874 fields[5] = "foo";

	2875 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);

	2876 REGEX_CHECK_STATUS;

	2877 REGEX_ASSERT(n==5);

	2878 REGEX_ASSERT(fields[0]==" ");

	2879 REGEX_ASSERT(fields[1]=="a");

	2880 REGEX_ASSERT(fields[2]=="Now is ");

	2881 REGEX_ASSERT(fields[3]=="b");

	2882 REGEX_ASSERT(fields[4]=="the time");

	2883 REGEX_ASSERT(fields[5]=="foo");

	2884

	2885 status = U_ZERO_ERROR;

	2886 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);

	2887 REGEX_CHECK_STATUS;

	2888 REGEX_ASSERT(n==4);

	2889 REGEX_ASSERT(fields[0]==" ");

	2890 REGEX_ASSERT(fields[1]=="a");

	2891 REGEX_ASSERT(fields[2]=="Now is ");

	2892 REGEX_ASSERT(fields[3]=="the time<c>");

	2893 status = U_ZERO_ERROR;

	2894 delete pat1;

	2895

	2896 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);

	2897 pat1 = RegexPattern::compile(&re1, pe, status);

	2898 REGEX_CHECK_STATUS;

	2899 n = pat1->split("1-10,20", fields, 10, status);

	2900 REGEX_CHECK_STATUS;

	2901 REGEX_ASSERT(n==5);

	2902 REGEX_ASSERT(fields[0]=="1");

	2903 REGEX_ASSERT(fields[1]=="-");

	2904 REGEX_ASSERT(fields[2]=="10");

	2905 REGEX_ASSERT(fields[3]==",");

	2906 REGEX_ASSERT(fields[4]=="20");

	2907 delete pat1;

	2908

	2909

	2910 //

	2911 // RegexPattern::pattern() and patternText()

	2912 //

	2913 pat1 = new RegexPattern();

	2914 REGEX_ASSERT(pat1->pattern() == "");

	2915 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));

	2916 delete pat1;

	2917

	2918 regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);

	2919 pat1 = RegexPattern::compile(&re1, pe, status);

	2920 REGEX_CHECK_STATUS;

	2921 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");

	2922 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));

	2923 delete pat1;

	2924

	2925 utext_close(&re1);

	2926 }

	2927

	2928

	2929 //---------------------------------------------------------------------------

	2930 //

	2931 // Extended A more thorough check for features of regex patterns

	2932 // The test cases are in a separate data file,

	2933 // source/test/testdata/regextst.txt

	2934 // A description of the test data format is included in that file.

	2935 //

	2936 //---------------------------------------------------------------------------

	2937

	2938 const char *

	2939 RegexTest::getPath(char buffer[2048], const char *filename) {

	2940 UErrorCode status=U_ZERO_ERROR;

	2941 const char *testDataDirectory = IntlTest::getSourceTestData(status);

	2942 if (U_FAILURE(status)) {

	2943 errln("ERROR: loadTestData() failed - %s", u_errorName(status));

	2944 return NULL;

	2945 }

	2946

	2947 strcpy(buffer, testDataDirectory);

	2948 strcat(buffer, filename);

	2949 return buffer;

	2950 }

	2951

	2952 void RegexTest::Extended() {

	2953 char tdd[2048];

	2954 const char *srcPath;

	2955 UErrorCode status = U_ZERO_ERROR;

	2956 int32_t lineNum = 0;

	2957

	2958 //

	2959 // Open and read the test data file.

	2960 //

	2961 srcPath=getPath(tdd, "regextst.txt");

	2962 if(srcPath==NULL) {

	2963 return; /* something went wrong, error already output */

	2964 }

	2965

	2966 int32_t len;

	2967 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);

	2968 if (U_FAILURE(status)) {

	2969 return; /* something went wrong, error already output */

	2970 }

	2971

	2972 //

	2973 // Put the test data into a UnicodeString

	2974 //

	2975 UnicodeString testString(FALSE, testData, len);

	2976

	2977 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s([\\'\\\"/])(.?)\ \1"), 0, status);

	2978 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s(#.)?$"), 0, stat us);

	2979 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s([ixsmdteDEGLMvabt yYzZ2-9])([:letter:]*)"), 0, status);

	2980

	2981 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0 , status);

	2982 UnicodeString testPattern; // The pattern for test from the test file.

	2983 UnicodeString testFlags; // the flags for a test.

	2984 UnicodeString matchString; // The marked up string to be used as input

	2985

	2986 if (U_FAILURE(status)){

	2987 dataerrln("Construct RegexMatcher() error.");

	2988 delete [] testData;

	2989 return;

	2990 }

	2991

	2992 //

	2993 // Loop over the test data file, once per line.

	2994 //

	2995 while (lineMat.find()) {

	2996 lineNum++;

	2997 if (U_FAILURE(status)) {

	2998 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status) );

	2999 }

	3000

	3001 status = U_ZERO_ERROR;

	3002 UnicodeString testLine = lineMat.group(1, status);

	3003 if (testLine.length() == 0) {

	3004 continue;

	3005 }

	3006

	3007 //

	3008 // Parse the test line. Skip blank and comment only lines.

	3009 // Separate out the three main fields - pattern, flags, target.

	3010 //

	3011

	3012 commentMat.reset(testLine);

	3013 if (commentMat.lookingAt(status)) {

	3014 // This line is a comment, or blank.

	3015 continue;

	3016 }

	3017

	3018 //

	3019 // Pull out the pattern field, remove it from the test file line.

	3020 //

	3021 quotedStuffMat.reset(testLine);

	3022 if (quotedStuffMat.lookingAt(status)) {

	3023 testPattern = quotedStuffMat.group(2, status);

	3024 testLine.remove(0, quotedStuffMat.end(0, status));

	3025 } else {

	3026 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);

	3027 continue;

	3028 }

	3029

	3030

	3031 //

	3032 // Pull out the flags from the test file line.

	3033 //

	3034 flagsMat.reset(testLine);

	3035 flagsMat.lookingAt(status); // Will always match, possi bly an empty string.

	3036 testFlags = flagsMat.group(1, status);

	3037 if (flagsMat.group(2, status).length() > 0) {

	3038 errln("Bad Match flag at line %d. Scanning %c\n",

	3039 lineNum, flagsMat.group(2, status).charAt(0));

	3040 continue;

	3041 }

	3042 testLine.remove(0, flagsMat.end(0, status));

	3043

	3044 //

	3045 // Pull out the match string, as a whole.

	3046 // We'll process the <tags> later.

	3047 //

	3048 quotedStuffMat.reset(testLine);

	3049 if (quotedStuffMat.lookingAt(status)) {

	3050 matchString = quotedStuffMat.group(2, status);

	3051 testLine.remove(0, quotedStuffMat.end(0, status));

	3052 } else {

	3053 errln("Bad match string at test file line %d", lineNum);

	3054 continue;

	3055 }

	3056

	3057 //

	3058 // The only thing left from the input line should be an optional traili ng comment.

	3059 //

	3060 commentMat.reset(testLine);

	3061 if (commentMat.lookingAt(status) == FALSE) {

	3062 errln("Line %d: unexpected characters at end of test line.", lineNum );

	3063 continue;

	3064 }

	3065

	3066 //

	3067 // Run the test

	3068 //

	3069 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);

	3070 }

	3071

	3072 delete [] testData;

	3073

	3074 }

	3075

	3076

	3077

	3078 //---------------------------------------------------------------------------

	3079 //

	3080 // regex_find(pattern, flags, inputString, lineNumber)

	3081 //

	3082 // Function to run a single test from the Extended (data driven) tests.

	3083 // See file test/testdata/regextst.txt for a description of the

	3084 // pattern and inputString fields, and the allowed flags.

	3085 // lineNumber is the source line in regextst.txt of the test.

	3086 //

	3087 //---------------------------------------------------------------------------

	3088

	3089

	3090 // Set a value into a UVector at position specified by a decimal number in

	3091 // a UnicodeString. This is a utility function needed by the actual test function,

	3092 // which follows.

	3093 static void set(UVector &vec, int32_t val, UnicodeString index) {

	3094 UErrorCode status=U_ZERO_ERROR;

	3095 int32_t idx = 0;

	3096 for (int32_t i=0; i<index.length(); i++) {

	3097 int32_t d=u_charDigitValue(index.charAt(i));

	3098 if (d<0) {return;}

	3099 idx = idx*10 + d;

	3100 }

	3101 while (vec.size()<idx+1) {vec.addElement(-1, status);}

	3102 vec.setElementAt(val, idx);

	3103 }

	3104

	3105 static void setInt(UVector &vec, int32_t val, int32_t idx) {

	3106 UErrorCode status=U_ZERO_ERROR;

	3107 while (vec.size()<idx+1) {vec.addElement(-1, status);}

	3108 vec.setElementAt(val, idx);

	3109 }

	3110

	3111 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& na tiveIndex)

	3112 {

	3113 UBool couldFind = TRUE;

	3114 UTEXT_SETNATIVEINDEX(utext, 0);

	3115 int32_t i = 0;

	3116 while (i < unistrOffset) {

	3117 UChar32 c = UTEXT_NEXT32(utext);

	3118 if (c != U_SENTINEL) {

	3119 i += U16_LENGTH(c);

	3120 } else {

	3121 couldFind = FALSE;

	3122 break;

	3123 }

	3124 }

	3125 nativeIndex = UTEXT_GETNATIVEINDEX(utext);

	3126 return couldFind;

	3127 }

	3128

	3129

	3130 void RegexTest::regex_find(const UnicodeString &pattern,

	3131 const UnicodeString &flags,

	3132 const UnicodeString &inputString,

	3133 const char *srcPath,

	3134 int32_t line) {

	3135 UnicodeString unEscapedInput;

	3136 UnicodeString deTaggedInput;

	3137

	3138 int32_t patternUTF8Length, inputUTF8Length;

	3139 char patternChars = NULL, inputChars = NULL;

	3140 UText patternText = UTEXT_INITIALIZER;

	3141 UText inputText = UTEXT_INITIALIZER;

	3142 UConverter *UTF8Converter = NULL;

	3143

	3144 UErrorCode status = U_ZERO_ERROR;

	3145 UParseError pe;

	3146 RegexPattern *parsePat = NULL;

	3147 RegexMatcher *parseMatcher = NULL;

	3148 RegexPattern callerPattern = NULL, UTF8Pattern = NULL;

	3149 RegexMatcher matcher = NULL, UTF8Matcher = NULL;

	3150 UVector groupStarts(status);

	3151 UVector groupEnds(status);

	3152 UVector groupStartsUTF8(status);

	3153 UVector groupEndsUTF8(status);

	3154 UBool isMatch = FALSE, isUTF8Match = FALSE;

	3155 UBool failed = FALSE;

	3156 int32_t numFinds;

	3157 int32_t i;

	3158 UBool useMatchesFunc = FALSE;

	3159 UBool useLookingAtFunc = FALSE;

	3160 int32_t regionStart = -1;

	3161 int32_t regionEnd = -1;

	3162 int32_t regionStartUTF8 = -1;

	3163 int32_t regionEndUTF8 = -1;

	3164

	3165

	3166 //

	3167 // Compile the caller's pattern

	3168 //

	3169 uint32_t bflags = 0;

	3170 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag

	3171 bflags \|= UREGEX_CASE_INSENSITIVE;

	3172 }

	3173 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag

	3174 bflags \|= UREGEX_COMMENTS;

	3175 }

	3176 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag

	3177 bflags \|= UREGEX_DOTALL;

	3178 }

	3179 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag

	3180 bflags \|= UREGEX_MULTILINE;

	3181 }

	3182

	3183 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag

	3184 bflags \|= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;

	3185 }

	3186 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag

	3187 bflags \|= UREGEX_UNIX_LINES;

	3188 }

	3189

	3190

	3191 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);

	3192 if (status != U_ZERO_ERROR) {

	3193 #if UCONFIG_NO_BREAK_ITERATION==1

	3194 // 'v' test flag means that the test pattern should not compile if ICU w as configured

	3195 // to not include break iteration. RBBI is needed for Unicode word boundaries.

	3196 if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORTED_E RROR) {

	3197 goto cleanupAndReturn;

	3198 }

	3199 #endif

	3200 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'

	3201 // Expected pattern compilation error.

	3202 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'

	3203 logln("Pattern Compile returns \"%s\"", u_errorName(status));

	3204 }

	3205 goto cleanupAndReturn;

	3206 } else {

	3207 // Unexpected pattern compilation error.

	3208 errln("Line %d: error %s compiling pattern.", line, u_errorName(stat us));

	3209 goto cleanupAndReturn;

	3210 }

	3211 }

	3212

	3213 UTF8Converter = ucnv_open("UTF8", &status);

	3214 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

	3215

	3216 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);

	3217 status = U_ZERO_ERROR; // buffer overflow

	3218 patternChars = new char[patternUTF8Length+1];

	3219 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);

	3220 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);

	3221

	3222 if (status == U_ZERO_ERROR) {

	3223 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);

	3224

	3225 if (status != U_ZERO_ERROR) {

	3226 #if UCONFIG_NO_BREAK_ITERATION==1

	3227 // 'v' test flag means that the test pattern should not compile if I CU was configured

	3228 // to not include break iteration. RBBI is needed for Unicode w ord boundaries.

	3229 if (flags.indexOf((UChar)0x76) >= 0 /'v'/ && status == U_UNSUPPORT ED_ERROR) {

	3230 goto cleanupAndReturn;

	3231 }

	3232 #endif

	3233 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'

	3234 // Expected pattern compilation error.

	3235 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'

	3236 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(s tatus));

	3237 }

	3238 goto cleanupAndReturn;

	3239 } else {

	3240 // Unexpected pattern compilation error.

	3241 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_err orName(status));

	3242 goto cleanupAndReturn;

	3243 }

	3244 }

	3245 }

	3246

	3247 if (UTF8Pattern == NULL) {

	3248 // UTF-8 does not allow unpaired surrogates, so this could actually happ en without being a failure of the engine

	3249 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);

	3250 status = U_ZERO_ERROR;

	3251 }

	3252

	3253 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag

	3254 RegexPatternDump(callerPattern);

	3255 }

	3256

	3257 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag

	3258 errln("%s, Line %d: Expected, but did not get, a pattern compilation err or.", srcPath, line);

	3259 goto cleanupAndReturn;

	3260 }

	3261

	3262

	3263 //

	3264 // Number of times find() should be called on the test string, default to 1

	3265 //

	3266 numFinds = 1;

	3267 for (i=2; i<=9; i++) {

	3268 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag

	3269 if (numFinds != 1) {

	3270 errln("Line %d: more than one digit flag. Scanning %d.", line, i);

	3271 goto cleanupAndReturn;

	3272 }

	3273 numFinds = i;

	3274 }

	3275 }

	3276

	3277 // 'M' flag. Use matches() instead of find()

	3278 if (flags.indexOf((UChar)0x4d) >= 0) {

	3279 useMatchesFunc = TRUE;

	3280 }

	3281 if (flags.indexOf((UChar)0x4c) >= 0) {

	3282 useLookingAtFunc = TRUE;

	3283 }

	3284

	3285 //

	3286 // Find the tags in the input data, remove them, and record the group bound ary

	3287 // positions.

	3288 //

	3289 parsePat = RegexPattern::compile("<(/?)(r\|[0-9]+)>", 0, pe, status);

	3290 REGEX_CHECK_STATUS_L(line);

	3291

	3292 unEscapedInput = inputString.unescape();

	3293 parseMatcher = parsePat->matcher(unEscapedInput, status);

	3294 REGEX_CHECK_STATUS_L(line);

	3295 while(parseMatcher->find()) {

	3296 parseMatcher->appendReplacement(deTaggedInput, "", status);

	3297 REGEX_CHECK_STATUS;

	3298 UnicodeString groupNum = parseMatcher->group(2, status);

	3299 if (groupNum == "r") {

	3300 // <r> or </r>, a region specification within the string

	3301 if (parseMatcher->group(1, status) == "/") {

	3302 regionEnd = deTaggedInput.length();

	3303 } else {

	3304 regionStart = deTaggedInput.length();

	3305 }

	3306 } else {

	3307 // <digits> or </digits>, a group match boundary tag.

	3308 if (parseMatcher->group(1, status) == "/") {

	3309 set(groupEnds, deTaggedInput.length(), groupNum);

	3310 } else {

	3311 set(groupStarts, deTaggedInput.length(), groupNum);

	3312 }

	3313 }

	3314 }

	3315 parseMatcher->appendTail(deTaggedInput);

	3316 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);

	3317 if ((regionStart>=0 \|\| regionEnd>=0) && (regionStart<0 \|\| regionStart>region End)) {

	3318 errln("mismatched <r> tags");

	3319 failed = TRUE;

	3320 goto cleanupAndReturn;

	3321 }

	3322

	3323 //

	3324 // Configure the matcher according to the flags specified with this test.

	3325 //

	3326 matcher = callerPattern->matcher(deTaggedInput, status);

	3327 REGEX_CHECK_STATUS_L(line);

	3328 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag

	3329 matcher->setTrace(TRUE);

	3330 }

	3331

	3332 if (UTF8Pattern != NULL) {

	3333 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);

	3334 status = U_ZERO_ERROR; // buffer overflow

	3335 inputChars = new char[inputUTF8Length+1];

	3336 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, stat us);

	3337 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);

	3338

	3339 if (status == U_ZERO_ERROR) {

	3340 UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN _IS_UTEXT, status);

	3341 REGEX_CHECK_STATUS_L(line);

	3342 }

	3343

	3344 if (UTF8Matcher == NULL) {

	3345 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine

	3346 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d" , srcPath, line);

	3347 status = U_ZERO_ERROR;

	3348 }

	3349 }

	3350

	3351 //

	3352 // Generate native indices for UTF8 versions of region and capture group in fo

	3353 //

	3354 if (UTF8Matcher != NULL) {

	3355 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStar t, regionStartUTF8);

	3356 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);

	3357

	3358 // Fill out the native index UVector info.

	3359 // Only need 1 loop, from above we know groupStarts.size() = groupEnds. size()

	3360 for (i=0; i<groupStarts.size(); i++) {

	3361 int32_t start = groupStarts.elementAti(i);

	3362 // -1 means there was no UVector slot and we won't be requesting th at capture group for this test, don't bother inserting

	3363 if (start >= 0) {

	3364 int32_t startUTF8;

	3365 if (!utextOffsetToNative(&inputText, start, startUTF8)) {

	3366 errln("Error at line %d: could not find native index for gro up start %d. UTF16 index %d", line, i, start);

	3367 failed = TRUE;

	3368 goto cleanupAndReturn; // Good chance of subsequent bogus e rrors. Stop now.

	3369 }

	3370 setInt(groupStartsUTF8, startUTF8, i);

	3371 }

	3372

	3373 int32_t end = groupEnds.elementAti(i);

	3374 // -1 means there was no UVector slot and we won't be requesting th at capture group for this test, don't bother inserting

	3375 if (end >= 0) {

	3376 int32_t endUTF8;

	3377 if (!utextOffsetToNative(&inputText, end, endUTF8)) {

	3378 errln("Error at line %d: could not find native index for gro up end %d. UTF16 index %d", line, i, end);

	3379 failed = TRUE;

	3380 goto cleanupAndReturn; // Good chance of subsequent bogus e rrors. Stop now.

	3381 }

	3382 setInt(groupEndsUTF8, endUTF8, i);

	3383 }

	3384 }

	3385 }

	3386

	3387 if (regionStart>=0) {

	3388 matcher->region(regionStart, regionEnd, status);

	3389 REGEX_CHECK_STATUS_L(line);

	3390 if (UTF8Matcher != NULL) {

	3391 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);

	3392 REGEX_CHECK_STATUS_L(line);

	3393 }

	3394 }

	3395 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag

	3396 matcher->useAnchoringBounds(FALSE);

	3397 if (UTF8Matcher != NULL) {

	3398 UTF8Matcher->useAnchoringBounds(FALSE);

	3399 }

	3400 }

	3401 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag

	3402 matcher->useTransparentBounds(TRUE);

	3403 if (UTF8Matcher != NULL) {

	3404 UTF8Matcher->useTransparentBounds(TRUE);

	3405 }

	3406 }

	3407

	3408

	3409

	3410 //

	3411 // Do a find on the de-tagged input using the caller's pattern

	3412 // TODO: error on count>1 and not find().

	3413 // error on both matches() and lookingAt().

	3414 //

	3415 for (i=0; i<numFinds; i++) {

	3416 if (useMatchesFunc) {

	3417 isMatch = matcher->matches(status);

	3418 if (UTF8Matcher != NULL) {

	3419 isUTF8Match = UTF8Matcher->matches(status);

	3420 }

	3421 } else if (useLookingAtFunc) {

	3422 isMatch = matcher->lookingAt(status);

	3423 if (UTF8Matcher != NULL) {

	3424 isUTF8Match = UTF8Matcher->lookingAt(status);

	3425 }

	3426 } else {

	3427 isMatch = matcher->find();

	3428 if (UTF8Matcher != NULL) {

	3429 isUTF8Match = UTF8Matcher->find();

	3430 }

	3431 }

	3432 }

	3433 matcher->setTrace(FALSE);

	3434

	3435 //

	3436 // Match up the groups from the find() with the groups from the tags

	3437 //

	3438

	3439 // number of tags should match number of groups from find operation.

	3440 // matcher->groupCount does not include group 0, the entire match, hence the +1.

	3441 // G option in test means that capture group data is not available in the

	3442 // expected results, so the check needs to be suppressed.

	3443 if (isMatch == FALSE && groupStarts.size() != 0) {

	3444 errln("Error at line %d: Match expected, but none found.", line);

	3445 failed = TRUE;

	3446 goto cleanupAndReturn;

	3447 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {

	3448 errln("Error at line %d: Match expected, but none found. (UTF8)", line) ;

	3449 failed = TRUE;

	3450 goto cleanupAndReturn;

	3451 }

	3452

	3453 if (flags.indexOf((UChar)0x47 /G/) >= 0) {

	3454 // Only check for match / no match. Don't check capture groups.

	3455 if (isMatch && groupStarts.size() == 0) {

	3456 errln("Error at line %d: No match expected, but one found.", line);

	3457 failed = TRUE;

	3458 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0 ) {

	3459 errln("Error at line %d: No match expected, but one found. (UTF8)", line);

	3460 failed = TRUE;

	3461 }

	3462 goto cleanupAndReturn;

	3463 }

	3464

	3465 REGEX_CHECK_STATUS_L(line);

	3466 for (i=0; i<=matcher->groupCount(); i++) {

	3467 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elem entAti(i));

	3468 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupSta rtsUTF8.elementAti(i));

	3469 if (matcher->start(i, status) != expectedStart) {

	3470 errln("Error at line %d: incorrect start position for group %d. Exp ected %d, got %d",

	3471 line, i, expectedStart, matcher->start(i, status));

	3472 failed = TRUE;

	3473 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.

	3474 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expec tedStartUTF8) {

	3475 errln("Error at line %d: incorrect start position for group %d. Exp ected %d, got %d (UTF8)",

	3476 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));

	3477 failed = TRUE;

	3478 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.

	3479 }

	3480

	3481 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti (i));

	3482 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF 8.elementAti(i));

	3483 if (matcher->end(i, status) != expectedEnd) {

	3484 errln("Error at line %d: incorrect end position for group %d. Expec ted %d, got %d",

	3485 line, i, expectedEnd, matcher->end(i, status));

	3486 failed = TRUE;

	3487 // Error on end position; keep going; real error is probably yet to come as group

	3488 // end positions work from end of the input data towards the front .

	3489 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expecte dEndUTF8) {

	3490 errln("Error at line %d: incorrect end position for group %d. Expec ted %d, got %d (UTF8)",

	3491 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));

	3492 failed = TRUE;

	3493 // Error on end position; keep going; real error is probably yet to come as group

	3494 // end positions work from end of the input data towards the front .

	3495 }

	3496 }

	3497 if ( matcher->groupCount()+1 < groupStarts.size()) {

	3498 errln("Error at line %d: Expected %d capture groups, found %d.",

	3499 line, groupStarts.size()-1, matcher->groupCount());

	3500 failed = TRUE;

	3501 }

	3502 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.si ze()) {

	3503 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",

	3504 line, groupStarts.size()-1, UTF8Matcher->groupCount());

	3505 failed = TRUE;

	3506 }

	3507

	3508 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == fa lse

	3509 matcher->requireEnd() == TRUE) {

	3510 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", l ine);

	3511 failed = TRUE;

	3512 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false

	3513 UTF8Matcher->requireEnd() == TRUE) {

	3514 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UT F8)", line);

	3515 failed = TRUE;

	3516 }

	3517

	3518 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == tr ue

	3519 matcher->requireEnd() == FALSE) {

	3520 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", l ine);

	3521 failed = TRUE;

	3522 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false

	3523 UTF8Matcher->requireEnd() == FALSE) {

	3524 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UT F8)", line);

	3525 failed = TRUE;

	3526 }

	3527

	3528 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false

	3529 matcher->hitEnd() == TRUE) {

	3530 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line) ;

	3531 failed = TRUE;

	3532 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false

	3533 UTF8Matcher->hitEnd() == TRUE) {

	3534 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)" , line);

	3535 failed = TRUE;

	3536 }

	3537

	3538 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true

	3539 matcher->hitEnd() == FALSE) {

	3540 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line) ;

	3541 failed = TRUE;

	3542 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true

	3543 UTF8Matcher->hitEnd() == FALSE) {

	3544 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)" , line);

	3545 failed = TRUE;

	3546 }

	3547

	3548

	3549 cleanupAndReturn:

	3550 if (failed) {

	3551 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "

	3552 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");

	3553 // callerPattern->dump();

	3554 }

	3555 delete parseMatcher;

	3556 delete parsePat;

	3557 delete UTF8Matcher;

	3558 delete UTF8Pattern;

	3559 delete matcher;

	3560 delete callerPattern;

	3561

	3562 utext_close(&inputText);

	3563 delete[] inputChars;

	3564 utext_close(&patternText);

	3565 delete[] patternChars;

	3566 ucnv_close(UTF8Converter);

	3567 }

	3568

	3569

	3570

	3571

	3572 //---------------------------------------------------------------------------

	3573 //

	3574 // Errors Check for error handling in patterns.

	3575 //

	3576 //---------------------------------------------------------------------------

	3577 void RegexTest::Errors() {

	3578 // \escape sequences that aren't implemented yet.

	3579 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEME NTED);

	3580

	3581 // Missing close parentheses

	3582 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);

	3583 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);

	3584 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PA REN);

	3585

	3586 // Extra close paren

	3587 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_ PAREN);

	3588 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);

	3589 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

	3590

	3591 // Look-ahead, Look-behind

	3592 // TODO: add tests for unbounded length look-behinds.

	3593 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal cons truct

	3594

	3595 // Attempt to use non-default flags

	3596 {

	3597 UParseError pe;

	3598 UErrorCode status = U_ZERO_ERROR;

	3599 int32_t flags = UREGEX_CANON_EQ \|

	3600 UREGEX_COMMENTS \| UREGEX_DOTALL \|

	3601 UREGEX_MULTILINE;

	3602 RegexPattern pat1= RegexPattern::compile(".", flags, pe, status);

	3603 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);

	3604 delete pat1;

	3605 }

	3606

	3607

	3608 // Quantifiers are allowed only after something that can be quantified.

	3609 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);

	3610 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);

	3611 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

	3612

	3613 // Mal-formed {min,max} quantifiers

	3614 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);

	3615 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);

	3616 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);

	3617 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);

	3618 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);

	3619 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);

	3620 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Ov erflows int during scan

	3621 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov erflows regex binary format

	3622 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

	3623

	3624 // Ticket 5389

	3625 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

	3626

	3627 // Invalid Back Reference \0

	3628 // For ICU 3.8 and earlier

	3629 // For ICU versions newer than 3.8, \0 introduces an octal escape.

	3630 //

	3631 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

	3632

	3633 }

	3634

	3635

	3636 //------------------------------------------------------------------------------ -

	3637 //

	3638 // Read a text data file, convert it to UChars, and return the data

	3639 // in one big UChar * buffer, which the caller must delete.

	3640 //

	3641 //------------------------------------------------------------------------------ --

	3642 UChar RegexTest::ReadAndConvertFile(const char fileName, int32_t &ulen,

	3643 const char *defEncoding, UErrorCode &status ) {

	3644 UChar *retPtr = NULL;

	3645 char *fileBuf = NULL;

	3646 UConverter* conv = NULL;

	3647 FILE *f = NULL;

	3648

	3649 ulen = 0;

	3650 if (U_FAILURE(status)) {

	3651 return retPtr;

	3652 }

	3653

	3654 //

	3655 // Open the file.

	3656 //

	3657 f = fopen(fileName, "rb");

	3658 if (f == 0) {

	3659 dataerrln("Error opening test data file %s\n", fileName);

	3660 status = U_FILE_ACCESS_ERROR;

	3661 return NULL;

	3662 }

	3663 //

	3664 // Read it in

	3665 //

	3666 int32_t fileSize;

	3667 int32_t amt_read;

	3668

	3669 fseek( f, 0, SEEK_END);

	3670 fileSize = ftell(f);

	3671 fileBuf = new char[fileSize];

	3672 fseek(f, 0, SEEK_SET);

	3673 amt_read = fread(fileBuf, 1, fileSize, f);

	3674 if (amt_read != fileSize \|\| fileSize <= 0) {

	3675 errln("Error reading test data file.");

	3676 goto cleanUpAndReturn;

	3677 }

	3678

	3679 //

	3680 // Look for a Unicode Signature (BOM) on the data just read

	3681 //

	3682 int32_t signatureLength;

	3683 const char * fileBufC;

	3684 const char* encoding;

	3685

	3686 fileBufC = fileBuf;

	3687 encoding = ucnv_detectUnicodeSignature(

	3688 fileBuf, fileSize, &signatureLength, &status);

	3689 if(encoding!=NULL ){

	3690 fileBufC += signatureLength;

	3691 fileSize -= signatureLength;

	3692 } else {

	3693 encoding = defEncoding;

	3694 if (strcmp(encoding, "utf-8") == 0) {

	3695 errln("file %s is missing its BOM", fileName);

	3696 }

	3697 }

	3698

	3699 //

	3700 // Open a converter to take the rule file to UTF-16

	3701 //

	3702 conv = ucnv_open(encoding, &status);

	3703 if (U_FAILURE(status)) {

	3704 goto cleanUpAndReturn;

	3705 }

	3706

	3707 //

	3708 // Convert the rules to UChar.

	3709 // Preflight first to determine required buffer size.

	3710 //

	3711 ulen = ucnv_toUChars(conv,

	3712 NULL, // dest,

	3713 0, // destCapacity,

	3714 fileBufC,

	3715 fileSize,

	3716 &status);

	3717 if (status == U_BUFFER_OVERFLOW_ERROR) {

	3718 // Buffer Overflow is expected from the preflight operation.

	3719 status = U_ZERO_ERROR;

	3720

	3721 retPtr = new UChar[ulen+1];

	3722 ucnv_toUChars(conv,

	3723 retPtr, // dest,

	3724 ulen+1,

	3725 fileBufC,

	3726 fileSize,

	3727 &status);

	3728 }

	3729

	3730 cleanUpAndReturn:

	3731 fclose(f);

	3732 delete[] fileBuf;

	3733 ucnv_close(conv);

	3734 if (U_FAILURE(status)) {

	3735 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

	3736 delete retPtr;

	3737 retPtr = 0;

	3738 ulen = 0;

	3739 };

	3740 return retPtr;

	3741 }

	3742

	3743

	3744 //------------------------------------------------------------------------------ -

	3745 //

	3746 // PerlTests - Run Perl's regular expression tests

	3747 // The input file for this test is re_tests, the standard regular

	3748 // expression test data distributed with the Perl source code.

	3749 //

	3750 // Here is Perl's description of the test data file:

	3751 //

	3752 // # The tests are in a separate file 't/op/re_tests'.

	3753 // # Each line in that file is a separate test.

	3754 // # There are five columns, separated by tabs.

	3755 // #

	3756 // # Column 1 contains the pattern, optionally enclosed in C<''>.

	3757 // # Modifiers can be put after the closing C<'>.

	3758 // #

	3759 // # Column 2 contains the string to be matched.

	3760 // #

	3761 // # Column 3 contains the expected result:

	3762 // # y expect a match

	3763 // # n expect no match

	3764 // # c expect an error

	3765 // # B test exposes a known bug in Perl, should be skipped

	3766 // # b test exposes a known bug in Perl, should be skipped if noamp

	3767 // #

	3768 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.

	3769 // #

	3770 // # Column 4 contains a string, usually C<$&>.

	3771 // #

	3772 // # Column 5 contains the expected result of double-quote

	3773 // # interpolating that string after the match, or start of error message .

	3774 // #

	3775 // # Column 6, if present, contains a reason why the test is skipped.

	3776 // # This is printed with "skipped", for harness to pick up.

	3777 // #

	3778 // # \n in the tests are interpolated, as are variables of the form ${\w+ }.

	3779 // #

	3780 // # If you want to add a regular expression test that can't be expressed

	3781 // # in this format, don't add it here: put it in op/pat.t instead.

	3782 //

	3783 // For ICU, if field 3 contains an 'i', the test will be skipped.

	3784 // The test exposes some known incompatibility between ICU and Perl regexps.

	3785 // (The i is in addition to whatever was there before.)

	3786 //

	3787 //------------------------------------------------------------------------------ -

	3788 void RegexTest::PerlTests() {

	3789 char tdd[2048];

	3790 const char *srcPath;

	3791 UErrorCode status = U_ZERO_ERROR;

	3792 UParseError pe;

	3793

	3794 //

	3795 // Open and read the test data file.

	3796 //

	3797 srcPath=getPath(tdd, "re_tests.txt");

	3798 if(srcPath==NULL) {

	3799 return; /* something went wrong, error already output */

	3800 }

	3801

	3802 int32_t len;

	3803 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);

	3804 if (U_FAILURE(status)) {

	3805 return; /* something went wrong, error already output */

	3806 }

	3807

	3808 //

	3809 // Put the test data into a UnicodeString

	3810 //

	3811 UnicodeString testDataString(FALSE, testData, len);

	3812

	3813 //

	3814 // Regex to break the input file into lines, and strip the new lines.

	3815 // One line per match, capture group one is the desired data.

	3816 //

	3817 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\ \r\\n]+"), 0, pe, status);

	3818 if (U_FAILURE(status)) {

	3819 dataerrln("RegexPattern::compile() error");

	3820 return;

	3821 }

	3822 RegexMatcher* lineMat = linePat->matcher(testDataString, status);

	3823

	3824 //

	3825 // Regex to split a test file line into fields.

	3826 // There are six fields, separated by tabs.

	3827 //

	3828 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);

	3829

	3830 //

	3831 // Regex to identify test patterns with flag settings, and to separate them .

	3832 // Test patterns with flags look like 'pattern'i

	3833 // Test patterns without flags are not quoted: pattern

	3834 // Coming out, capture group 2 is the pattern, capture group 3 is the flag s.

	3835 //

	3836 RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(. )\\1(.*)"), 0, pe, status);

	3837 RegexMatcher* flagMat = flagPat->matcher(status);

	3838

	3839 //

	3840 // The Perl tests reference several perl-isms, which are evaluated/substitut ed

	3841 // in the test data. Not being perl, this must be done explicitly. Here

	3842 // are string constants and REs for these constructs.

	3843 //

	3844 UnicodeString nulnulSrc("${nulnul}");

	3845 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);

	3846 nulnul = nulnul.unescape();

	3847

	3848 UnicodeString ffffSrc("${ffff}");

	3849 UnicodeString ffff("\\uffff", -1, US_INV);

	3850 ffff = ffff.unescape();

	3851

	3852 // regexp for $-[0], $+[2], etc.

	3853 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([ +\\-])\\[(\\d+)\\]"), 0, pe, status);

	3854 RegexMatcher *groupsMat = groupsPat->matcher(status);

	3855

	3856 // regexp for $0, $1, $2, etc.

	3857 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+) "), 0, pe, status);

	3858 RegexMatcher *cgMat = cgPat->matcher(status);

	3859

	3860

	3861 //

	3862 // Main Loop for the Perl Tests, runs once per line from the

	3863 // test data file.

	3864 //

	3865 int32_t lineNum = 0;

	3866 int32_t skippedUnimplementedCount = 0;

	3867 while (lineMat->find()) {

	3868 lineNum++;

	3869

	3870 //

	3871 // Get a line, break it into its fields, do the Perl

	3872 // variable substitutions.

	3873 //

	3874 UnicodeString line = lineMat->group(1, status);

	3875 UnicodeString fields[7];

	3876 fieldPat->split(line, fields, 7, status);

	3877

	3878 flagMat->reset(fields[0]);

	3879 flagMat->matches(status);

	3880 UnicodeString pattern = flagMat->group(2, status);

	3881 pattern.findAndReplace("${bang}", "!");

	3882 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000" ));

	3883 pattern.findAndReplace(ffffSrc, ffff);

	3884

	3885 //

	3886 // Identify patterns that include match flag settings,

	3887 // split off the flags, remove the extra quotes.

	3888 //

	3889 UnicodeString flagStr = flagMat->group(3, status);

	3890 if (U_FAILURE(status)) {

	3891 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

	3892 return;

	3893 }

	3894 int32_t flags = 0;

	3895 const UChar UChar_c = 0x63; // Char constants for the flag letters.

	3896 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C )

	3897 const UChar UChar_m = 0x6d;

	3898 const UChar UChar_x = 0x78;

	3899 const UChar UChar_y = 0x79;

	3900 if (flagStr.indexOf(UChar_i) != -1) {

	3901 flags \|= UREGEX_CASE_INSENSITIVE;

	3902 }

	3903 if (flagStr.indexOf(UChar_m) != -1) {

	3904 flags \|= UREGEX_MULTILINE;

	3905 }

	3906 if (flagStr.indexOf(UChar_x) != -1) {

	3907 flags \|= UREGEX_COMMENTS;

	3908 }

	3909

	3910 //

	3911 // Compile the test pattern.

	3912 //

	3913 status = U_ZERO_ERROR;

	3914 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status );

	3915 if (status == U_REGEX_UNIMPLEMENTED) {

	3916 //

	3917 // Test of a feature that is planned for ICU, but not yet implemente d.

	3918 // skip the test.

	3919 skippedUnimplementedCount++;

	3920 delete testPat;

	3921 status = U_ZERO_ERROR;

	3922 continue;

	3923 }

	3924

	3925 if (U_FAILURE(status)) {

	3926 // Some tests are supposed to generate errors.

	3927 // Only report an error for tests that are supposed to succeed.

	3928 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supp osed to fail AND

	3929 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility

	3930 {

	3931 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status ));

	3932 }

	3933 status = U_ZERO_ERROR;

	3934 delete testPat;

	3935 continue;

	3936 }

	3937

	3938 if (fields[2].indexOf(UChar_i) >= 0) {

	3939 // ICU should skip this test.

	3940 delete testPat;

	3941 continue;

	3942 }

	3943

	3944 if (fields[2].indexOf(UChar_c) >= 0) {

	3945 // This pattern should have caused a compilation error, but didn't/

	3946 errln("line %d: Expected a pattern compile error, got success.", lin eNum);

	3947 delete testPat;

	3948 continue;

	3949 }

	3950

	3951 //

	3952 // replace the Perl variables that appear in some of the

	3953 // match data strings.

	3954 //

	3955 UnicodeString matchString = fields[1];

	3956 matchString.findAndReplace(nulnulSrc, nulnul);

	3957 matchString.findAndReplace(ffffSrc, ffff);

	3958

	3959 // Replace any \n in the match string with an actual new-line char.

	3960 // Don't do full unescape, as this unescapes more than Perl does, which

	3961 // causes other spurious failures in the tests.

	3962 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

	3963

	3964

	3965

	3966 //

	3967 // Run the test, check for expected match/don't match result.

	3968 //

	3969 RegexMatcher *testMat = testPat->matcher(matchString, status);

	3970 UBool found = testMat->find();

	3971 UBool expected = FALSE;

	3972 if (fields[2].indexOf(UChar_y) >=0) {

	3973 expected = TRUE;

	3974 }

	3975 if (expected != found) {

	3976 errln("line %d: Expected %smatch, got %smatch",

	3977 lineNum, expected?"":"no ", found?"":"no " );

	3978 continue;

	3979 }

	3980

	3981 // Don't try to check expected results if there is no match.

	3982 // (Some have stuff in the expected fields)

	3983 if (!found) {

	3984 delete testMat;

	3985 delete testPat;

	3986 continue;

	3987 }

	3988

	3989 //

	3990 // Interpret the Perl expression from the fourth field of the data file,

	3991 // building up an ICU string from the results of the ICU match.

	3992 // The Perl expression will contain references to the results of

	3993 // a regex match, including the matched string, capture group string s,

	3994 // group starting and ending indicies, etc.

	3995 //

	3996 UnicodeString resultString;

	3997 UnicodeString perlExpr = fields[3];

	3998 #if SUPPORT_MUTATING_INPUT_STRING

	3999 groupsMat->reset(perlExpr);

	4000 cgMat->reset(perlExpr);

	4001 #endif

	4002

	4003 while (perlExpr.length() > 0) {

	4004 #if !SUPPORT_MUTATING_INPUT_STRING

	4005 // Perferred usage. Reset after any modification to input string.

	4006 groupsMat->reset(perlExpr);

	4007 cgMat->reset(perlExpr);

	4008 #endif

	4009

	4010 if (perlExpr.startsWith("$&")) {

	4011 resultString.append(testMat->group(status));

	4012 perlExpr.remove(0, 2);

	4013 }

	4014

	4015 else if (groupsMat->lookingAt(status)) {

	4016 // $-[0] $+[2] etc.

	4017 UnicodeString digitString = groupsMat->group(2, status);

	4018 int32_t t = 0;

	4019 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

	4020 UnicodeString plusOrMinus = groupsMat->group(1, status);

	4021 int32_t matchPosition;

	4022 if (plusOrMinus.compare("+") == 0) {

	4023 matchPosition = testMat->end(groupNum, status);

	4024 } else {

	4025 matchPosition = testMat->start(groupNum, status);

	4026 }

	4027 if (matchPosition != -1) {

	4028 ICU_Utility::appendNumber(resultString, matchPosition);

	4029 }

	4030 perlExpr.remove(0, groupsMat->end(status));

	4031 }

	4032

	4033 else if (cgMat->lookingAt(status)) {

	4034 // $1, $2, $3, etc.

	4035 UnicodeString digitString = cgMat->group(1, status);

	4036 int32_t t = 0;

	4037 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

	4038 if (U_SUCCESS(status)) {

	4039 resultString.append(testMat->group(groupNum, status));

	4040 status = U_ZERO_ERROR;

	4041 }

	4042 perlExpr.remove(0, cgMat->end(status));

	4043 }

	4044

	4045 else if (perlExpr.startsWith("@-")) {

	4046 int32_t i;

	4047 for (i=0; i<=testMat->groupCount(); i++) {

	4048 if (i>0) {

	4049 resultString.append(" ");

	4050 }

	4051 ICU_Utility::appendNumber(resultString, testMat->start(i, st atus));

	4052 }

	4053 perlExpr.remove(0, 2);

	4054 }

	4055

	4056 else if (perlExpr.startsWith("@+")) {

	4057 int32_t i;

	4058 for (i=0; i<=testMat->groupCount(); i++) {

	4059 if (i>0) {

	4060 resultString.append(" ");

	4061 }

	4062 ICU_Utility::appendNumber(resultString, testMat->end(i, stat us));

	4063 }

	4064 perlExpr.remove(0, 2);

	4065 }

	4066

	4067 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \ Escape. Take following char as a literal.

	4068 // or as an escap ed sequence (e.g. \n)

	4069 if (perlExpr.length() > 1) {

	4070 perlExpr.remove(0, 1); // Remove the '\', but only if not l ast char.

	4071 }

	4072 UChar c = perlExpr.charAt(0);

	4073 switch (c) {

	4074 case 'n': c = '\n'; break;

	4075 // add any other escape sequences that show up in the test expec ted results.

	4076 }

	4077 resultString.append(c);

	4078 perlExpr.remove(0, 1);

	4079 }

	4080

	4081 else {

	4082 // Any characters from the perl expression that we don't explici tly

	4083 // recognize before here are assumed to be literals and copied

	4084 // as-is to the expected results.

	4085 resultString.append(perlExpr.charAt(0));

	4086 perlExpr.remove(0, 1);

	4087 }

	4088

	4089 if (U_FAILURE(status)) {

	4090 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)) ;

	4091 break;

	4092 }

	4093 }

	4094

	4095 //

	4096 // Expected Results Compare

	4097 //

	4098 UnicodeString expectedS(fields[4]);

	4099 expectedS.findAndReplace(nulnulSrc, nulnul);

	4100 expectedS.findAndReplace(ffffSrc, ffff);

	4101 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

	4102

	4103

	4104 if (expectedS.compare(resultString) != 0) {

	4105 err("Line %d: Incorrect perl expression results.", lineNum);

	4106 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; go t \""+resultString+(UnicodeString)"\"");

	4107 }

	4108

	4109 delete testMat;

	4110 delete testPat;

	4111 }

	4112

	4113 //

	4114 // All done. Clean up allocated stuff.

	4115 //

	4116 delete cgMat;

	4117 delete cgPat;

	4118

	4119 delete groupsMat;

	4120 delete groupsPat;

	4121

	4122 delete flagMat;

	4123 delete flagPat;

	4124

	4125 delete lineMat;

	4126 delete linePat;

	4127

	4128 delete fieldPat;

	4129 delete [] testData;

	4130

	4131

	4132 logln("%d tests skipped because of unimplemented regexp features.", skippedU nimplementedCount);

	4133

	4134 }

	4135

	4136

	4137 //------------------------------------------------------------------------------ -

	4138 //

	4139 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts

	4140 // (instead of using UnicodeStrings) to test the alternate engi ne.

	4141 // The input file for this test is re_tests, the standard regul ar

	4142 // expression test data distributed with the Perl source code.

	4143 // See PerlTests() for more information.

	4144 //

	4145 //------------------------------------------------------------------------------ -

	4146 void RegexTest::PerlTestsUTF8() {

	4147 char tdd[2048];

	4148 const char *srcPath;

	4149 UErrorCode status = U_ZERO_ERROR;

	4150 UParseError pe;

	4151 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));

	4152 UText patternText = UTEXT_INITIALIZER;

	4153 char *patternChars = NULL;

	4154 int32_t patternLength;

	4155 int32_t patternCapacity = 0;

	4156 UText inputText = UTEXT_INITIALIZER;

	4157 char *inputChars = NULL;

	4158 int32_t inputLength;

	4159 int32_t inputCapacity = 0;

	4160

	4161 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, N ULL, NULL, NULL, &status);

	4162

	4163 //

	4164 // Open and read the test data file.

	4165 //

	4166 srcPath=getPath(tdd, "re_tests.txt");

	4167 if(srcPath==NULL) {

	4168 return; /* something went wrong, error already output */

	4169 }

	4170

	4171 int32_t len;

	4172 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);

	4173 if (U_FAILURE(status)) {

	4174 return; /* something went wrong, error already output */

	4175 }

	4176

	4177 //

	4178 // Put the test data into a UnicodeString

	4179 //

	4180 UnicodeString testDataString(FALSE, testData, len);

	4181

	4182 //

	4183 // Regex to break the input file into lines, and strip the new lines.

	4184 // One line per match, capture group one is the desired data.

	4185 //

	4186 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\ \r\\n]+"), 0, pe, status);

	4187 if (U_FAILURE(status)) {

	4188 dataerrln("RegexPattern::compile() error");

	4189 return;

	4190 }

	4191 RegexMatcher* lineMat = linePat->matcher(testDataString, status);

	4192

	4193 //

	4194 // Regex to split a test file line into fields.

	4195 // There are six fields, separated by tabs.

	4196 //

	4197 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);

	4198

	4199 //

	4200 // Regex to identify test patterns with flag settings, and to separate them .

	4201 // Test patterns with flags look like 'pattern'i

	4202 // Test patterns without flags are not quoted: pattern

	4203 // Coming out, capture group 2 is the pattern, capture group 3 is the flag s.

	4204 //

	4205 RegexPattern flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(. )\\1(.*)"), 0, pe, status);

	4206 RegexMatcher* flagMat = flagPat->matcher(status);

	4207

	4208 //

	4209 // The Perl tests reference several perl-isms, which are evaluated/substitut ed

	4210 // in the test data. Not being perl, this must be done explicitly. Here

	4211 // are string constants and REs for these constructs.

	4212 //

	4213 UnicodeString nulnulSrc("${nulnul}");

	4214 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);

	4215 nulnul = nulnul.unescape();

	4216

	4217 UnicodeString ffffSrc("${ffff}");

	4218 UnicodeString ffff("\\uffff", -1, US_INV);

	4219 ffff = ffff.unescape();

	4220

	4221 // regexp for $-[0], $+[2], etc.

	4222 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([ +\\-])\\[(\\d+)\\]"), 0, pe, status);

	4223 RegexMatcher *groupsMat = groupsPat->matcher(status);

	4224

	4225 // regexp for $0, $1, $2, etc.

	4226 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+) "), 0, pe, status);

	4227 RegexMatcher *cgMat = cgPat->matcher(status);

	4228

	4229

	4230 //

	4231 // Main Loop for the Perl Tests, runs once per line from the

	4232 // test data file.

	4233 //

	4234 int32_t lineNum = 0;

	4235 int32_t skippedUnimplementedCount = 0;

	4236 while (lineMat->find()) {

	4237 lineNum++;

	4238

	4239 //

	4240 // Get a line, break it into its fields, do the Perl

	4241 // variable substitutions.

	4242 //

	4243 UnicodeString line = lineMat->group(1, status);

	4244 UnicodeString fields[7];

	4245 fieldPat->split(line, fields, 7, status);

	4246

	4247 flagMat->reset(fields[0]);

	4248 flagMat->matches(status);

	4249 UnicodeString pattern = flagMat->group(2, status);

	4250 pattern.findAndReplace("${bang}", "!");

	4251 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000" ));

	4252 pattern.findAndReplace(ffffSrc, ffff);

	4253

	4254 //

	4255 // Identify patterns that include match flag settings,

	4256 // split off the flags, remove the extra quotes.

	4257 //

	4258 UnicodeString flagStr = flagMat->group(3, status);

	4259 if (U_FAILURE(status)) {

	4260 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

	4261 return;

	4262 }

	4263 int32_t flags = 0;

	4264 const UChar UChar_c = 0x63; // Char constants for the flag letters.

	4265 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C )

	4266 const UChar UChar_m = 0x6d;

	4267 const UChar UChar_x = 0x78;

	4268 const UChar UChar_y = 0x79;

	4269 if (flagStr.indexOf(UChar_i) != -1) {

	4270 flags \|= UREGEX_CASE_INSENSITIVE;

	4271 }

	4272 if (flagStr.indexOf(UChar_m) != -1) {

	4273 flags \|= UREGEX_MULTILINE;

	4274 }

	4275 if (flagStr.indexOf(UChar_x) != -1) {

	4276 flags \|= UREGEX_COMMENTS;

	4277 }

	4278

	4279 //

	4280 // Put the pattern in a UTF-8 UText

	4281 //

	4282 status = U_ZERO_ERROR;

	4283 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Conve rter.getAlias(), status);

	4284 if (status == U_BUFFER_OVERFLOW_ERROR) {

	4285 status = U_ZERO_ERROR;

	4286 delete[] patternChars;

	4287 patternCapacity = patternLength + 1;

	4288 patternChars = new char[patternCapacity];

	4289 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlia s(), status);

	4290 }

	4291 utext_openUTF8(&patternText, patternChars, patternLength, &status);

	4292

	4293 //

	4294 // Compile the test pattern.

	4295 //

	4296 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, s tatus);

	4297 if (status == U_REGEX_UNIMPLEMENTED) {

	4298 //

	4299 // Test of a feature that is planned for ICU, but not yet implemente d.

	4300 // skip the test.

	4301 skippedUnimplementedCount++;

	4302 delete testPat;

	4303 status = U_ZERO_ERROR;

	4304 continue;

	4305 }

	4306

	4307 if (U_FAILURE(status)) {

	4308 // Some tests are supposed to generate errors.

	4309 // Only report an error for tests that are supposed to succeed.

	4310 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supp osed to fail AND

	4311 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility

	4312 {

	4313 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status ));

	4314 }

	4315 status = U_ZERO_ERROR;

	4316 delete testPat;

	4317 continue;

	4318 }

	4319

	4320 if (fields[2].indexOf(UChar_i) >= 0) {

	4321 // ICU should skip this test.

	4322 delete testPat;

	4323 continue;

	4324 }

	4325

	4326 if (fields[2].indexOf(UChar_c) >= 0) {

	4327 // This pattern should have caused a compilation error, but didn't/

	4328 errln("line %d: Expected a pattern compile error, got success.", lin eNum);

	4329 delete testPat;

	4330 continue;

	4331 }

	4332

	4333

	4334 //

	4335 // replace the Perl variables that appear in some of the

	4336 // match data strings.

	4337 //

	4338 UnicodeString matchString = fields[1];

	4339 matchString.findAndReplace(nulnulSrc, nulnul);

	4340 matchString.findAndReplace(ffffSrc, ffff);

	4341

	4342 // Replace any \n in the match string with an actual new-line char.

	4343 // Don't do full unescape, as this unescapes more than Perl does, which

	4344 // causes other spurious failures in the tests.

	4345 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

	4346

	4347 //

	4348 // Put the input in a UTF-8 UText

	4349 //

	4350 status = U_ZERO_ERROR;

	4351 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Convert er.getAlias(), status);

	4352 if (status == U_BUFFER_OVERFLOW_ERROR) {

	4353 status = U_ZERO_ERROR;

	4354 delete[] inputChars;

	4355 inputCapacity = inputLength + 1;

	4356 inputChars = new char[inputCapacity];

	4357 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlia s(), status);

	4358 }

	4359 utext_openUTF8(&inputText, inputChars, inputLength, &status);

	4360

	4361 //

	4362 // Run the test, check for expected match/don't match result.

	4363 //

	4364 RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTE RN_IS_UTEXT, status);

	4365 UBool found = testMat->find();

	4366 UBool expected = FALSE;

	4367 if (fields[2].indexOf(UChar_y) >=0) {

	4368 expected = TRUE;

	4369 }

	4370 if (expected != found) {

	4371 errln("line %d: Expected %smatch, got %smatch",

	4372 lineNum, expected?"":"no ", found?"":"no " );

	4373 continue;

	4374 }

	4375

	4376 // Don't try to check expected results if there is no match.

	4377 // (Some have stuff in the expected fields)

	4378 if (!found) {

	4379 delete testMat;

	4380 delete testPat;

	4381 continue;

	4382 }

	4383

	4384 //

	4385 // Interpret the Perl expression from the fourth field of the data file,

	4386 // building up an ICU string from the results of the ICU match.

	4387 // The Perl expression will contain references to the results of

	4388 // a regex match, including the matched string, capture group string s,

	4389 // group starting and ending indicies, etc.

	4390 //

	4391 UnicodeString resultString;

	4392 UnicodeString perlExpr = fields[3];

	4393

	4394 while (perlExpr.length() > 0) {

	4395 groupsMat->reset(perlExpr);

	4396 cgMat->reset(perlExpr);

	4397

	4398 if (perlExpr.startsWith("$&")) {

	4399 resultString.append(testMat->group(status));

	4400 perlExpr.remove(0, 2);

	4401 }

	4402

	4403 else if (groupsMat->lookingAt(status)) {

	4404 // $-[0] $+[2] etc.

	4405 UnicodeString digitString = groupsMat->group(2, status);

	4406 int32_t t = 0;

	4407 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

	4408 UnicodeString plusOrMinus = groupsMat->group(1, status);

	4409 int32_t matchPosition;

	4410 if (plusOrMinus.compare("+") == 0) {

	4411 matchPosition = testMat->end(groupNum, status);

	4412 } else {

	4413 matchPosition = testMat->start(groupNum, status);

	4414 }

	4415 if (matchPosition != -1) {

	4416 ICU_Utility::appendNumber(resultString, matchPosition);

	4417 }

	4418 perlExpr.remove(0, groupsMat->end(status));

	4419 }

	4420

	4421 else if (cgMat->lookingAt(status)) {

	4422 // $1, $2, $3, etc.

	4423 UnicodeString digitString = cgMat->group(1, status);

	4424 int32_t t = 0;

	4425 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);

	4426 if (U_SUCCESS(status)) {

	4427 resultString.append(testMat->group(groupNum, status));

	4428 status = U_ZERO_ERROR;

	4429 }

	4430 perlExpr.remove(0, cgMat->end(status));

	4431 }

	4432

	4433 else if (perlExpr.startsWith("@-")) {

	4434 int32_t i;

	4435 for (i=0; i<=testMat->groupCount(); i++) {

	4436 if (i>0) {

	4437 resultString.append(" ");

	4438 }

	4439 ICU_Utility::appendNumber(resultString, testMat->start(i, st atus));

	4440 }

	4441 perlExpr.remove(0, 2);

	4442 }

	4443

	4444 else if (perlExpr.startsWith("@+")) {

	4445 int32_t i;

	4446 for (i=0; i<=testMat->groupCount(); i++) {

	4447 if (i>0) {

	4448 resultString.append(" ");

	4449 }

	4450 ICU_Utility::appendNumber(resultString, testMat->end(i, stat us));

	4451 }

	4452 perlExpr.remove(0, 2);

	4453 }

	4454

	4455 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \ Escape. Take following char as a literal.

	4456 // or as an escap ed sequence (e.g. \n)

	4457 if (perlExpr.length() > 1) {

	4458 perlExpr.remove(0, 1); // Remove the '\', but only if not l ast char.

	4459 }

	4460 UChar c = perlExpr.charAt(0);

	4461 switch (c) {

	4462 case 'n': c = '\n'; break;

	4463 // add any other escape sequences that show up in the test expec ted results.

	4464 }

	4465 resultString.append(c);

	4466 perlExpr.remove(0, 1);

	4467 }

	4468

	4469 else {

	4470 // Any characters from the perl expression that we don't explici tly

	4471 // recognize before here are assumed to be literals and copied

	4472 // as-is to the expected results.

	4473 resultString.append(perlExpr.charAt(0));

	4474 perlExpr.remove(0, 1);

	4475 }

	4476

	4477 if (U_FAILURE(status)) {

	4478 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)) ;

	4479 break;

	4480 }

	4481 }

	4482

	4483 //

	4484 // Expected Results Compare

	4485 //

	4486 UnicodeString expectedS(fields[4]);

	4487 expectedS.findAndReplace(nulnulSrc, nulnul);

	4488 expectedS.findAndReplace(ffffSrc, ffff);

	4489 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");

	4490

	4491

	4492 if (expectedS.compare(resultString) != 0) {

	4493 err("Line %d: Incorrect perl expression results.", lineNum);

	4494 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; go t \""+resultString+(UnicodeString)"\"");

	4495 }

	4496

	4497 delete testMat;

	4498 delete testPat;

	4499 }

	4500

	4501 //

	4502 // All done. Clean up allocated stuff.

	4503 //

	4504 delete cgMat;

	4505 delete cgPat;

	4506

	4507 delete groupsMat;

	4508 delete groupsPat;

	4509

	4510 delete flagMat;

	4511 delete flagPat;

	4512

	4513 delete lineMat;

	4514 delete linePat;

	4515

	4516 delete fieldPat;

	4517 delete [] testData;

	4518

	4519 utext_close(&patternText);

	4520 utext_close(&inputText);

	4521

	4522 delete [] patternChars;

	4523 delete [] inputChars;

	4524

	4525

	4526 logln("%d tests skipped because of unimplemented regexp features.", skippedU nimplementedCount);

	4527

	4528 }

	4529

	4530

	4531 //--------------------------------------------------------------

	4532 //

	4533 // Bug6149 Verify limits to heap expansion for backtrack stack.

	4534 // Use this pattern,

	4535 // "(a?){1,}"

	4536 // The zero-length match will repeat forever.

	4537 // (That this goes into a loop is another bug)

	4538 //

	4539 //---------------------------------------------------------------

	4540 void RegexTest::Bug6149() {

	4541 UnicodeString pattern("(a?){1,}");

	4542 UnicodeString s("xyz");

	4543 uint32_t flags = 0;

	4544 UErrorCode status = U_ZERO_ERROR;

	4545

	4546 RegexMatcher matcher(pattern, s, flags, status);

	4547 UBool result = false;

	4548 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);

	4549 REGEX_ASSERT(result == FALSE);

	4550 }

	4551

	4552

	4553 //

	4554 // Callbacks() Test the callback function.

	4555 // When set, callbacks occur periodically during matching operations,

	4556 // giving the application code the ability to abort the operation

	4557 // before its normal completion.

	4558 //

	4559

	4560 struct callBackContext {

	4561 RegexTest *test;

	4562 int32_t maxCalls;

	4563 int32_t numCalls;

	4564 int32_t lastSteps;

	4565 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};

	4566 };

	4567

	4568 U_CDECL_BEGIN

	4569 static UBool U_CALLCONV

	4570 testCallBackFn(const void *context, int32_t steps) {

	4571 callBackContext info = (callBackContext )context;

	4572 if (info->lastSteps+1 != steps) {

	4573 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);

	4574 }

	4575 info->lastSteps = steps;

	4576 info->numCalls++;

	4577 return (info->numCalls < info->maxCalls);

	4578 }

	4579 U_CDECL_END

	4580

	4581 void RegexTest::Callbacks() {

	4582 {

	4583 // Getter returns NULLs if no callback has been set

	4584

	4585 // The variables that the getter will fill in.

	4586 // Init to non-null values so that the action of the getter can be see n.

	4587 const void *returnedContext = &returnedContext;

	4588 URegexMatchCallback *returnedFn = &testCallBackFn;

	4589

	4590 UErrorCode status = U_ZERO_ERROR;

	4591 RegexMatcher matcher("x", 0, status);

	4592 REGEX_CHECK_STATUS;

	4593 matcher.getMatchCallback(returnedFn, returnedContext, status);

	4594 REGEX_CHECK_STATUS;

	4595 REGEX_ASSERT(returnedFn == NULL);

	4596 REGEX_ASSERT(returnedContext == NULL);

	4597 }

	4598

	4599 {

	4600 // Set and Get work

	4601 callBackContext cbInfo = {this, 0, 0, 0};

	4602 const void *returnedContext;

	4603 URegexMatchCallback *returnedFn;

	4604 UErrorCode status = U_ZERO_ERROR;

	4605 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.

	4606 REGEX_CHECK_STATUS;

	4607 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);

	4608 REGEX_CHECK_STATUS;

	4609 matcher.getMatchCallback(returnedFn, returnedContext, status);

	4610 REGEX_CHECK_STATUS;

	4611 REGEX_ASSERT(returnedFn == testCallBackFn);

	4612 REGEX_ASSERT(returnedContext == &cbInfo);

	4613

	4614 // A short-running match shouldn't invoke the callback

	4615 status = U_ZERO_ERROR;

	4616 cbInfo.reset(1);

	4617 UnicodeString s = "xxx";

	4618 matcher.reset(s);

	4619 REGEX_ASSERT(matcher.matches(status));

	4620 REGEX_CHECK_STATUS;

	4621 REGEX_ASSERT(cbInfo.numCalls == 0);

	4622

	4623 // A medium-length match that runs long enough to invoke the

	4624 // callback, but not so long that the callback aborts it.

	4625 status = U_ZERO_ERROR;

	4626 cbInfo.reset(4);

	4627 s = "aaaaaaaaaaaaaaaaaaab";

	4628 matcher.reset(s);

	4629 REGEX_ASSERT(matcher.matches(status)==FALSE);

	4630 REGEX_CHECK_STATUS;

	4631 REGEX_ASSERT(cbInfo.numCalls > 0);

	4632

	4633 // A longer running match that the callback function will abort.

	4634 status = U_ZERO_ERROR;

	4635 cbInfo.reset(4);

	4636 s = "aaaaaaaaaaaaaaaaaaaaaaab";

	4637 matcher.reset(s);

	4638 REGEX_ASSERT(matcher.matches(status)==FALSE);

	4639 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);

	4640 REGEX_ASSERT(cbInfo.numCalls == 4);

	4641 }

	4642

	4643

	4644 }

	4645

	4646

	4647 //

	4648 // FindProgressCallbacks() Test the find "progress" callback function.

	4649 // When set, the find progress callback will be invoked during find operations

	4650 // after each return from a match attempt, giving the application the opportunity

	4651 // to terminate a long-running find operation before its normal completion.

	4652 //

	4653

	4654 struct progressCallBackContext {

	4655 RegexTest *test;

	4656 int64_t lastIndex;

	4657 int32_t maxCalls;

	4658 int32_t numCalls;

	4659 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};

	4660 };

	4661

	4662 U_CDECL_BEGIN

	4663 static UBool U_CALLCONV

	4664 testProgressCallBackFn(const void *context, int64_t matchIndex) {

	4665 progressCallBackContext info = (progressCallBackContext )context;

	4666 info->numCalls++;

	4667 info->lastIndex = matchIndex;

	4668 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);

	4669 return (info->numCalls < info->maxCalls);

	4670 }

	4671 U_CDECL_END

	4672

	4673 void RegexTest::FindProgressCallbacks() {

	4674 {

	4675 // Getter returns NULLs if no callback has been set

	4676

	4677 // The variables that the getter will fill in.

	4678 // Init to non-null values so that the action of the getter can be see n.

	4679 const void *returnedContext = &returnedContext;

	4680 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;

	4681

	4682 UErrorCode status = U_ZERO_ERROR;

	4683 RegexMatcher matcher("x", 0, status);

	4684 REGEX_CHECK_STATUS;

	4685 matcher.getFindProgressCallback(returnedFn, returnedContext, status);

	4686 REGEX_CHECK_STATUS;

	4687 REGEX_ASSERT(returnedFn == NULL);

	4688 REGEX_ASSERT(returnedContext == NULL);

	4689 }

	4690

	4691 {

	4692 // Set and Get work

	4693 progressCallBackContext cbInfo = {this, 0, 0, 0};

	4694 const void *returnedContext;

	4695 URegexFindProgressCallback *returnedFn;

	4696 UErrorCode status = U_ZERO_ERROR;

	4697 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.

	4698 REGEX_CHECK_STATUS;

	4699 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status) ;

	4700 REGEX_CHECK_STATUS;

	4701 matcher.getFindProgressCallback(returnedFn, returnedContext, status);

	4702 REGEX_CHECK_STATUS;

	4703 REGEX_ASSERT(returnedFn == testProgressCallBackFn);

	4704 REGEX_ASSERT(returnedContext == &cbInfo);

	4705

	4706 // A short-running match should NOT invoke the callback.

	4707 status = U_ZERO_ERROR;

	4708 cbInfo.reset(100);

	4709 UnicodeString s = "abxxx";

	4710 matcher.reset(s);

	4711 #if 0

	4712 matcher.setTrace(TRUE);

	4713 #endif

	4714 REGEX_ASSERT(matcher.find(0, status));

	4715 REGEX_CHECK_STATUS;

	4716 REGEX_ASSERT(cbInfo.numCalls == 0);

	4717

	4718 // A medium running match that causes matcher.find() to invoke our callb ack for each index.

	4719 status = U_ZERO_ERROR;

	4720 s = "aaaaaaaaaaaaaaaaaaab";

	4721 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string

	4722 matcher.reset(s);

	4723 REGEX_ASSERT(matcher.find(0, status)==FALSE);

	4724 REGEX_CHECK_STATUS;

	4725 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

	4726

	4727 // A longer running match that causes matcher.find() to invoke our callb ack which we cancel/interrupt at some point.

	4728 status = U_ZERO_ERROR;

	4729 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";

	4730 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string

	4731 matcher.reset(s1);

	4732 REGEX_ASSERT(matcher.find(0, status)==FALSE);

	4733 REGEX_CHECK_STATUS;

	4734 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

	4735

	4736 #if 0

	4737 // Now a match that will succeed, but after an interruption

	4738 status = U_ZERO_ERROR;

	4739 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";

	4740 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string

	4741 matcher.reset(s2);

	4742 REGEX_ASSERT(matcher.find(0, status)==FALSE);

	4743 REGEX_CHECK_STATUS;

	4744 // Now retry the match from where left off

	4745 cbInfo.maxCalls = 100; // No callback limit

	4746 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));

	4747 REGEX_CHECK_STATUS;

	4748 #endif

	4749 }

	4750

	4751

	4752 }

	4753

	4754

	4755 //---------------------------------------------------------------------------

	4756 //

	4757 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable

	4758 // UTexts. The pure-C implementation of UText

	4759 // has no mutable backing stores, but we can

	4760 // use UnicodeString here to test the functionality.

	4761 //

	4762 //---------------------------------------------------------------------------

	4763 void RegexTest::PreAllocatedUTextCAPI () {

	4764 UErrorCode status = U_ZERO_ERROR;

	4765 URegularExpression *re;

	4766 UText patternText = UTEXT_INITIALIZER;

	4767 UnicodeString buffer;

	4768 UText bufferText = UTEXT_INITIALIZER;

	4769

	4770 utext_openUnicodeString(&bufferText, &buffer, &status);

	4771

	4772 /*

	4773 * getText() and getUText()

	4774 */

	4775 {

	4776 UText text1 = UTEXT_INITIALIZER;

	4777 UText text2 = UTEXT_INITIALIZER;

	4778 UChar text2Chars[20];

	4779 UText *resultText;

	4780

	4781 status = U_ZERO_ERROR;

	4782 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);

	4783 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);

	4784 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);

	4785 utext_openUChars(&text2, text2Chars, -1, &status);

	4786

	4787 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);

	4788 re = uregex_openUText(&patternText, 0, NULL, &status);

	4789

	4790 /* First set a UText */

	4791 uregex_setUText(re, &text1, &status);

	4792 resultText = uregex_getUText(re, &bufferText, &status);

	4793 REGEX_CHECK_STATUS;

	4794 REGEX_ASSERT(resultText == &bufferText);

	4795 utext_setNativeIndex(resultText, 0);

	4796 utext_setNativeIndex(&text1, 0);

	4797 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);

	4798

	4799 resultText = uregex_getUText(re, &bufferText, &status);

	4800 REGEX_CHECK_STATUS;

	4801 REGEX_ASSERT(resultText == &bufferText);

	4802 utext_setNativeIndex(resultText, 0);

	4803 utext_setNativeIndex(&text1, 0);

	4804 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);

	4805

	4806 /* Then set a UChar * */

	4807 uregex_setText(re, text2Chars, 7, &status);

	4808 resultText = uregex_getUText(re, &bufferText, &status);

	4809 REGEX_CHECK_STATUS;

	4810 REGEX_ASSERT(resultText == &bufferText);

	4811 utext_setNativeIndex(resultText, 0);

	4812 utext_setNativeIndex(&text2, 0);

	4813 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);

	4814

	4815 uregex_close(re);

	4816 utext_close(&text1);

	4817 utext_close(&text2);

	4818 }

	4819

	4820 /*

	4821 * group()

	4822 */

	4823 {

	4824 UChar text1[80];

	4825 UText *actual;

	4826 UBool result;

	4827 u_uastrncpy(text1, "noise abc interior def, and this is off the end", s izeof(text1)/2);

	4828

	4829 status = U_ZERO_ERROR;

	4830 re = uregex_openC("abc(.*?)def", 0, NULL, &status);

	4831 REGEX_CHECK_STATUS;

	4832

	4833 uregex_setText(re, text1, -1, &status);

	4834 result = uregex_find(re, 0, &status);

	4835 REGEX_ASSERT(result==TRUE);

	4836

	4837 /* Capture Group 0, the full match. Should succeed. */

	4838 status = U_ZERO_ERROR;

	4839 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);

	4840 REGEX_CHECK_STATUS;

	4841 REGEX_ASSERT(actual == &bufferText);

	4842 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);

	4843

	4844 /* Capture group #1. Should succeed. */

	4845 status = U_ZERO_ERROR;

	4846 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);

	4847 REGEX_CHECK_STATUS;

	4848 REGEX_ASSERT(actual == &bufferText);

	4849 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);

	4850

	4851 /* Capture group out of range. Error. */

	4852 status = U_ZERO_ERROR;

	4853 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);

	4854 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);

	4855 REGEX_ASSERT(actual == &bufferText);

	4856

	4857 uregex_close(re);

	4858

	4859 }

	4860

	4861 /*

	4862 * replaceFirst()

	4863 */

	4864 {

	4865 UChar text1[80];

	4866 UChar text2[80];

	4867 UText replText = UTEXT_INITIALIZER;

	4868 UText *result;

	4869

	4870 status = U_ZERO_ERROR;

	4871 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);

	4872 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);

	4873 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);

	4874

	4875 re = uregex_openC("x(.*?)x", 0, NULL, &status);

	4876 REGEX_CHECK_STATUS;

	4877

	4878 /* Normal case, with match */

	4879 uregex_setText(re, text1, -1, &status);

	4880 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

	4881 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

	4882 REGEX_CHECK_STATUS;

	4883 REGEX_ASSERT(result == &bufferText);

	4884 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);

	4885

	4886 /* No match. Text should copy to output with no changes. */

	4887 uregex_setText(re, text2, -1, &status);

	4888 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

	4889 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

	4890 REGEX_CHECK_STATUS;

	4891 REGEX_ASSERT(result == &bufferText);

	4892 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);

	4893

	4894 /* Unicode escapes */

	4895 uregex_setText(re, text1, -1, &status);

	4896 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a" , -1, &status);

	4897 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

	4898 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);

	4899 REGEX_CHECK_STATUS;

	4900 REGEX_ASSERT(result == &bufferText);

	4901 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);

	4902

	4903 uregex_close(re);

	4904 utext_close(&replText);

	4905 }

	4906

	4907

	4908 /*

	4909 * replaceAll()

	4910 */

	4911 {

	4912 UChar text1[80];

	4913 UChar text2[80];

	4914 UText replText = UTEXT_INITIALIZER;

	4915 UText *result;

	4916

	4917 status = U_ZERO_ERROR;

	4918 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);

	4919 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);

	4920 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);

	4921

	4922 re = uregex_openC("x(.*?)x", 0, NULL, &status);

	4923 REGEX_CHECK_STATUS;

	4924

	4925 /* Normal case, with match */

	4926 uregex_setText(re, text1, -1, &status);

	4927 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

	4928 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);

	4929 REGEX_CHECK_STATUS;

	4930 REGEX_ASSERT(result == &bufferText);

	4931 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);

	4932

	4933 /* No match. Text should copy to output with no changes. */

	4934 uregex_setText(re, text2, -1, &status);

	4935 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);

	4936 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);

	4937 REGEX_CHECK_STATUS;

	4938 REGEX_ASSERT(result == &bufferText);

	4939 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);

	4940

	4941 uregex_close(re);

	4942 utext_close(&replText);

	4943 }

	4944

	4945

	4946 /*

	4947 * splitUText() uses the C++ API directly, and the UnicodeString version us es mutable UTexts,

	4948 * so we don't need to test it here.

	4949 */

	4950

	4951 utext_close(&bufferText);

	4952 utext_close(&patternText);

	4953 }

	4954

	4955 //--------------------------------------------------------------

	4956 //

	4957 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.

	4958 //

	4959 //---------------------------------------------------------------

	4960 void RegexTest::Bug7651() {

	4961 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\ u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*\|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z 0-9_]+(?:\\/[\\w-]+)?\|(https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\ uFFFF])\|\\$[A-Za-z]+)");

	4962 // The following should exceed the default operator stack depth in the matc her, i.e. force the matcher to malloc instead of using fSmallData.

	4963 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allo cation.

	4964 UnicodeString pattern2("((https?\\:\\/\\/\|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u 0080-\\uFFFF])\|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?\|(?<![ A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0 0f8-\\u00ff]*\|\\$[A-Za-z]+)");

	4965 UnicodeString s("#ff @abcd This is test");

	4966 RegexPattern *REPattern = NULL;

	4967 RegexMatcher *REMatcher = NULL;

	4968 UErrorCode status = U_ZERO_ERROR;

	4969 UParseError pe;

	4970

	4971 REPattern = RegexPattern::compile(pattern1, 0, pe, status);

	4972 REGEX_CHECK_STATUS;

	4973 REMatcher = REPattern->matcher(s, status);

	4974 REGEX_CHECK_STATUS;

	4975 REGEX_ASSERT(REMatcher->find());

	4976 REGEX_ASSERT(REMatcher->start(status) == 0);

	4977 delete REPattern;

	4978 delete REMatcher;

	4979 status = U_ZERO_ERROR;

	4980

	4981 REPattern = RegexPattern::compile(pattern2, 0, pe, status);

	4982 REGEX_CHECK_STATUS;

	4983 REMatcher = REPattern->matcher(s, status);

	4984 REGEX_CHECK_STATUS;

	4985 REGEX_ASSERT(REMatcher->find());

	4986 REGEX_ASSERT(REMatcher->start(status) == 0);

	4987 delete REPattern;

	4988 delete REMatcher;

	4989 status = U_ZERO_ERROR;

	4990 }

	4991

	4992 void RegexTest::Bug7740() {

	4993 UErrorCode status = U_ZERO_ERROR;

	4994 UnicodeString pattern = "(a)";

	4995 UnicodeString text = "abcdef";

	4996 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);

	4997 REGEX_CHECK_STATUS;

	4998 REGEX_ASSERT(m->lookingAt(status));

	4999 REGEX_CHECK_STATUS;

	5000 status = U_ILLEGAL_ARGUMENT_ERROR;

	5001 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.

	5002 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);

	5003 REGEX_ASSERT(s == "");

	5004 delete m;

	5005 }

	5006

	5007

	5008

	5009

	5010 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

	5011

OLD	NEW

« no previous file with comments | « icu46/source/test/intltest/regextst.h ('k') | icu46/source/test/intltest/reptest.h » ('j') | no next file with comments »