source/i18n/rematch.cpp - Issue 1621843002: ICU 56 update step 1

Side by Side Diff: source/i18n/rematch.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 **************************************************************************	2 **************************************************************************

3 * Copyright (C) 2002-2014 International Business Machines Corporation *	3 * Copyright (C) 2002-2015 International Business Machines Corporation *

4 * and others. All rights reserved. *	4 * and others. All rights reserved. *

5 **************************************************************************	5 **************************************************************************

6 */	6 */

7 //	7 //

8 // file: rematch.cpp	8 // file: rematch.cpp

9 //	9 //

10 // Contains the implementation of class RegexMatcher,	10 // Contains the implementation of class RegexMatcher,

11 // which is one of the main API classes for the ICU regular expression package.	11 // which is one of the main API classes for the ICU regular expression package.

12 //	12 //

13	13

(...skipping 28 matching lines...) Expand all Loading...
42 // backtrack point.	42 // backtrack point.

43 //	43 //

44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;	44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;

45	45

46 // Time limit counter constant.	46 // Time limit counter constant.

47 // Time limits for expression evaluation are in terms of quanta of work by	47 // Time limits for expression evaluation are in terms of quanta of work by

48 // the engine, each of which is 10,000 state saves.	48 // the engine, each of which is 10,000 state saves.

49 // This constant determines the number of state saves per tick.	49 // This constant determines the number of state saves per tick.

50 static const int32_t TIMER_INITIAL_VALUE = 10000;	50 static const int32_t TIMER_INITIAL_VALUE = 10000;

51	51

	52

	53 // Test for any of the Unicode line terminating characters.

	54 static inline UBool isLineTerminator(UChar32 c) {

	55 if (c & ~(0x0a \| 0x0b \| 0x0c \| 0x0d \| 0x85 \| 0x2028 \| 0x2029)) {

	56 return false;

	57 }

	58 return (c<=0x0d && c>=0x0a) \|\| c==0x85 \|\| c==0x2028 \|\| c==0x2029;

	59 }

	60

52 //-----------------------------------------------------------------------------	61 //-----------------------------------------------------------------------------

53 //	62 //

54 // Constructor and Destructor	63 // Constructor and Destructor

55 //	64 //

56 //-----------------------------------------------------------------------------	65 //-----------------------------------------------------------------------------

57 RegexMatcher::RegexMatcher(const RegexPattern *pat) {	66 RegexMatcher::RegexMatcher(const RegexPattern *pat) {

58 fDeferredStatus = U_ZERO_ERROR;	67 fDeferredStatus = U_ZERO_ERROR;

59 init(fDeferredStatus);	68 init(fDeferredStatus);

60 if (U_FAILURE(fDeferredStatus)) {	69 if (U_FAILURE(fDeferredStatus)) {

61 return;	70 return;

(...skipping 147 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
209 fDeferredStatus = status;	218 fDeferredStatus = status;

210 fData = fSmallData;	219 fData = fSmallData;

211 fWordBreakItr = NULL;	220 fWordBreakItr = NULL;

212	221

213 fStack = NULL;	222 fStack = NULL;

214 fInputText = NULL;	223 fInputText = NULL;

215 fAltInputText = NULL;	224 fAltInputText = NULL;

216 fInput = NULL;	225 fInput = NULL;

217 fInputLength = 0;	226 fInputLength = 0;

218 fInputUniStrMaybeMutable = FALSE;	227 fInputUniStrMaybeMutable = FALSE;

219

220 if (U_FAILURE(status)) {

221 fDeferredStatus = status;

222 }

223 }	228 }

224	229

225 //	230 //

226 // init2() Common initialization for use by RegexMatcher constructors, part 2.	231 // init2() Common initialization for use by RegexMatcher constructors, part 2.

227 // This handles the common setup to be done after the Pattern is available.	232 // This handles the common setup to be done after the Pattern is available.

228 //	233 //

229 void RegexMatcher::init2(UText *input, UErrorCode &status) {	234 void RegexMatcher::init2(UText *input, UErrorCode &status) {

230 if (U_FAILURE(status)) {	235 if (U_FAILURE(status)) {

231 fDeferredStatus = status;	236 fDeferredStatus = status;

232 return;	237 return;

(...skipping 17 matching lines...) Expand all Loading...
250 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);	255 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);

251 if (U_FAILURE(status)) {	256 if (U_FAILURE(status)) {

252 fDeferredStatus = status;	257 fDeferredStatus = status;

253 return;	258 return;

254 }	259 }

255 }	260 }

256	261

257	262

258 static const UChar BACKSLASH = 0x5c;	263 static const UChar BACKSLASH = 0x5c;

259 static const UChar DOLLARSIGN = 0x24;	264 static const UChar DOLLARSIGN = 0x24;

	265 static const UChar LEFTBRACKET = 0x7b;

	266 static const UChar RIGHTBRACKET = 0x7d;

	267

260 //------------------------------------------------------------------------------ --	268 //------------------------------------------------------------------------------ --

261 //	269 //

262 // appendReplacement	270 // appendReplacement

263 //	271 //

264 //------------------------------------------------------------------------------ --	272 //------------------------------------------------------------------------------ --

265 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,	273 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,

266 const UnicodeString &replacement,	274 const UnicodeString &replacement,

267 UErrorCode &status) {	275 UErrorCode &status) {

268 UText replacementText = UTEXT_INITIALIZER;	276 UText replacementText = UTEXT_INITIALIZER;

269	277

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
324 uprv_free(inputChars);	332 uprv_free(inputChars);

325 }	333 }

326 }	334 }

327 fAppendPosition = fMatchEnd;	335 fAppendPosition = fMatchEnd;

328	336

329	337

330 // scan the replacement text, looking for substitutions ($n) and \escapes.	338 // scan the replacement text, looking for substitutions ($n) and \escapes.

331 // TODO: optimize this loop by efficiently scanning for '$' or '\',	339 // TODO: optimize this loop by efficiently scanning for '$' or '\',

332 // move entire ranges not containing substitutions.	340 // move entire ranges not containing substitutions.

333 UTEXT_SETNATIVEINDEX(replacement, 0);	341 UTEXT_SETNATIVEINDEX(replacement, 0);

334 UChar32 c = UTEXT_NEXT32(replacement);	342 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {

335 while (c != U_SENTINEL) {

336 if (c == BACKSLASH) {	343 if (c == BACKSLASH) {

337 // Backslash Escape. Copy the following char out without further checks.	344 // Backslash Escape. Copy the following char out without further checks.

338 // Note: Surrogate pairs don't need any special handling	345 // Note: Surrogate pairs don't need any special handling

339 // The second half won't be a '$' or a '\', and	346 // The second half won't be a '$' or a '\', and

340 // will move to the dest normally on the next	347 // will move to the dest normally on the next

341 // loop iteration.	348 // loop iteration.

342 c = UTEXT_CURRENT32(replacement);	349 c = UTEXT_CURRENT32(replacement);

343 if (c == U_SENTINEL) {	350 if (c == U_SENTINEL) {

344 break;	351 break;

345 }	352 }

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);	398 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);

392 } else {	399 } else {

393 UChar surrogate[2];	400 UChar surrogate[2];

394 surrogate[0] = U16_LEAD(c);	401 surrogate[0] = U16_LEAD(c);

395 surrogate[1] = U16_TRAIL(c);	402 surrogate[1] = U16_TRAIL(c);

396 if (U_SUCCESS(status)) {	403 if (U_SUCCESS(status)) {

397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);	404 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);

398 }	405 }

399 }	406 }

400 } else {	407 } else {

401 // We've got a $. Pick up a capture group number if one follows.	408 // We've got a $. Pick up a capture group name or number if one follows.

402 // Consume at most the number of digits necessary for the largest capture	409 // Consume digits so long as the resulting group number <= the

403 // number that is valid for this pattern.	410 // number of capture groups in the pattern.

404	411

	412 int32_t groupNum = 0;

405 int32_t numDigits = 0;	413 int32_t numDigits = 0;

406 int32_t groupNum = 0;	414 UChar32 nextChar = utext_current32(replacement);

407 UChar32 digitC;	415 if (nextChar == LEFTBRACKET) {

408 for (;;) {	416 // Scan for a Named Capture Group, ${name}.

409 digitC = UTEXT_CURRENT32(replacement);	417 UnicodeString groupName;

410 if (digitC == U_SENTINEL) {	418 utext_next32(replacement);

411 break;	419 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {

	420 nextChar = utext_next32(replacement);

	421 if (nextChar == U_SENTINEL) {

	422 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;

	423 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) \|\| // A..Z

	424 (nextChar >= 0x61 && nextChar <= 0x7a) \|\| // a..z

	425 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9

	426 groupName.append(nextChar);

	427 } else if (nextChar == RIGHTBRACKET) {

	428 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &group Name);

	429 if (groupNum == 0) {

	430 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;

	431 }

	432 } else {

	433 // Character was something other than a name char or a c losing '}'

	434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;

	435 }

412 }	436 }

413 if (u_isdigit(digitC) == FALSE) {	437

414 break;	438 } else if (u_isdigit(nextChar)) {

	439 // $n Scan for a capture group number

	440 int32_t numCaptureGroups = fPattern->fGroupMap->size();

	441 for (;;) {

	442 nextChar = UTEXT_CURRENT32(replacement);

	443 if (nextChar == U_SENTINEL) {

	444 break;

	445 }

	446 if (u_isdigit(nextChar) == FALSE) {

	447 break;

	448 }

	449 int32_t nextDigitVal = u_charDigitValue(nextChar);

	450 if (groupNum*10 + nextDigitVal > numCaptureGroups) {

	451 // Don't consume the next digit if it makes the capture group number too big.

	452 if (numDigits == 0) {

	453 status = U_INDEX_OUTOFBOUNDS_ERROR;

	454 }

	455 break;

	456 }

	457 (void)UTEXT_NEXT32(replacement);

	458 groupNum=groupNum*10 + nextDigitVal;

	459 ++numDigits;

415 }	460 }

416 (void)UTEXT_NEXT32(replacement);	461 } else {

417 groupNum=groupNum*10 + u_charDigitValue(digitC);	462 // $ not followed by capture group name or number.

418 numDigits++;	463 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;

419 if (numDigits >= fPattern->fMaxCaptureDigits) {

420 break;

421 }

422 }	464 }

423	465

424	466 if (U_SUCCESS(status)) {

425 if (numDigits == 0) {

426 // The $ didn't introduce a group number at all.

427 // Treat it as just part of the substitution text.

428 UChar c16 = DOLLARSIGN;

429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);

430 } else {

431 // Finally, append the capture group data to the destination.

432 destLen += appendGroup(groupNum, dest, status);	467 destLen += appendGroup(groupNum, dest, status);

433 if (U_FAILURE(status)) {

434 // Can fail if group number is out of range.

435 break;

436 }

437 }	468 }

438 }	469 } // End of $ capture group handling

439	470 } // End of per-character loop through the replacement string.

440 if (U_FAILURE(status)) {

441 break;

442 } else {

443 c = UTEXT_NEXT32(replacement);

444 }

445 }

446	471

447 return *this;	472 return *this;

448 }	473 }

449	474

450	475

451	476

452 //------------------------------------------------------------------------------ --	477 //------------------------------------------------------------------------------ --

453 //	478 //

454 // appendTail Intended to be used in conjunction with appendReplacement()	479 // appendTail Intended to be used in conjunction with appendReplacement()

455 // To the destination string, append everything following	480 // To the destination string, append everything following

(...skipping 354 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
810 c = UTEXT_NEXT32(fInputText);	835 c = UTEXT_NEXT32(fInputText);

811 startPos = UTEXT_GETNATIVEINDEX(fInputText);	836 startPos = UTEXT_GETNATIVEINDEX(fInputText);

812 // Note that it's perfectly OK for a pattern to have a zero-length	837 // Note that it's perfectly OK for a pattern to have a zero-length

813 // match at the end of a string, so we must make sure that the loop	838 // match at the end of a string, so we must make sure that the loop

814 // runs with startPos == testStartLimit the last time through.	839 // runs with startPos == testStartLimit the last time through.

815 if (findProgressInterrupt(startPos, status))	840 if (findProgressInterrupt(startPos, status))

816 return FALSE;	841 return FALSE;

817 }	842 }

818 } else {	843 } else {

819 for (;;) {	844 for (;;) {

820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m any chars as possible	845 if (isLineTerminator(c)) {

821 ((c<=0x0d && c>=0x0a) \|\| c==0x85 \|\|c==0x2028 \|\| c==0x202 9 )) {	846 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURREN T32(fInputText) == 0x0a) {

822 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU RRENT32(fInputText) == 0x0a) {	847 (void)UTEXT_NEXT32(fInputText);

823 (void)UTEXT_NEXT32(fInputText);	848 startPos = UTEXT_GETNATIVEINDEX(fInputText);

824 startPos = UTEXT_GETNATIVEINDEX(fInputText);	849 }

825 }	850 MatchAt(startPos, FALSE, status);

826 MatchAt(startPos, FALSE, status);	851 if (U_FAILURE(status)) {

827 if (U_FAILURE(status)) {	852 return FALSE;

828 return FALSE;	853 }

829 }	854 if (fMatch) {

830 if (fMatch) {	855 return TRUE;

831 return TRUE;	856 }

832 }	857 UTEXT_SETNATIVEINDEX(fInputText, startPos);

833 UTEXT_SETNATIVEINDEX(fInputText, startPos);

834 }	858 }

835 if (startPos >= testStartLimit) {	859 if (startPos >= testStartLimit) {

836 fMatch = FALSE;	860 fMatch = FALSE;

837 fHitEnd = TRUE;	861 fHitEnd = TRUE;

838 return FALSE;	862 return FALSE;

839 }	863 }

840 c = UTEXT_NEXT32(fInputText);	864 c = UTEXT_NEXT32(fInputText);

841 startPos = UTEXT_GETNATIVEINDEX(fInputText);	865 startPos = UTEXT_GETNATIVEINDEX(fInputText);

842 // Note that it's perfectly OK for a pattern to have a zero- length	866 // Note that it's perfectly OK for a pattern to have a zero- length

843 // match at the end of a string, so we must make sure that the loop	867 // match at the end of a string, so we must make sure that the loop

(...skipping 227 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1071 U16_FWD_1(inputBuf, startPos, fActiveLimit);	1095 U16_FWD_1(inputBuf, startPos, fActiveLimit);

1072 // Note that it's perfectly OK for a pattern to have a zero-leng th	1096 // Note that it's perfectly OK for a pattern to have a zero-leng th

1073 // match at the end of a string, so we must make sure that the loop	1097 // match at the end of a string, so we must make sure that the loop

1074 // runs with startPos == testLen the last time through.	1098 // runs with startPos == testLen the last time through.

1075 if (findProgressInterrupt(startPos, status))	1099 if (findProgressInterrupt(startPos, status))

1076 return FALSE;	1100 return FALSE;

1077 }	1101 }

1078 } else {	1102 } else {

1079 for (;;) {	1103 for (;;) {

1080 c = inputBuf[startPos-1];	1104 c = inputBuf[startPos-1];

1081 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible	1105 if (isLineTerminator(c)) {

1082 ((c<=0x0d && c>=0x0a) \|\| c==0x85 \|\|c==0x2028 \|\| c==0x2029 )) {

1083 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) {	1106 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) {

1084 startPos++;	1107 startPos++;

1085 }	1108 }

1086 MatchChunkAt(startPos, FALSE, status);	1109 MatchChunkAt(startPos, FALSE, status);

1087 if (U_FAILURE(status)) {	1110 if (U_FAILURE(status)) {

1088 return FALSE;	1111 return FALSE;

1089 }	1112 }

1090 if (fMatch) {	1113 if (fMatch) {

1091 return TRUE;	1114 return TRUE;

1092 }	1115 }

(...skipping 75 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1168 group_len = e - s;	1191 group_len = e - s;

1169	1192

1170 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);	1193 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);

1171 if (dest)	1194 if (dest)

1172 UTEXT_SETNATIVEINDEX(dest, s);	1195 UTEXT_SETNATIVEINDEX(dest, s);

1173 return dest;	1196 return dest;

1174 }	1197 }

1175	1198

1176 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {	1199 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {

1177 UnicodeString result;	1200 UnicodeString result;

1178 if (U_FAILURE(status)) {	1201 int64_t groupStart = start64(groupNum, status);

	1202 int64_t groupEnd = end64(groupNum, status);

	1203 if (U_FAILURE(status) \|\| groupStart == -1 \|\| groupStart == groupEnd) {

1179 return result;	1204 return result;

1180 }	1205 }

1181 UText resultText = UTEXT_INITIALIZER;	1206

1182 utext_openUnicodeString(&resultText, &result, &status);	1207 // Get the group length using a utext_extract preflight.

1183 group(groupNum, &resultText, status);	1208 // UText is actually pretty efficient at this when underlying encoding is UTF-16.

1184 utext_close(&resultText);	1209 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &s tatus);

	1210 if (status != U_BUFFER_OVERFLOW_ERROR) {

	1211 return result;

	1212 }

	1213

	1214 status = U_ZERO_ERROR;

	1215 UChar *buf = result.getBuffer(length);

	1216 if (buf == NULL) {

	1217 status = U_MEMORY_ALLOCATION_ERROR;

	1218 } else {

	1219 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);

	1220 result.releaseBuffer(extractLength);

	1221 U_ASSERT(length == extractLength);

	1222 }

1185 return result;	1223 return result;

1186 }	1224 }

1187	1225

1188	1226

1189 // Return deep (mutable) clone

1190 // Technology Preview (as an API), but note that the UnicodeString API is i mplemented

1191 // using this function.

1192 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {

1193 if (U_FAILURE(status)) {

1194 return dest;

1195 }

1196

1197 if (U_FAILURE(fDeferredStatus)) {

1198 status = fDeferredStatus;

1199 } else if (fMatch == FALSE) {

1200 status = U_REGEX_INVALID_STATE;

1201 } else if (groupNum < 0 \|\| groupNum > fPattern->fGroupMap->size()) {

1202 status = U_INDEX_OUTOFBOUNDS_ERROR;

1203 }

1204 if (U_FAILURE(status)) {

1205 return dest;

1206 }

1207

1208 int64_t s, e;

1209 if (groupNum == 0) {

1210 s = fMatchStart;

1211 e = fMatchEnd;

1212 } else {

1213 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);

1214 U_ASSERT(groupOffset < fPattern->fFrameSize);

1215 U_ASSERT(groupOffset >= 0);

1216 s = fFrame->fExtra[groupOffset];

1217 e = fFrame->fExtra[groupOffset+1];

1218 }

1219

1220 if (s < 0) {

1221 // A capture group wasn't part of the match

1222 if (dest) {

1223 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);

1224 return dest;

1225 } else {

1226 return utext_openUChars(NULL, NULL, 0, &status);

1227 }

1228 }

1229 U_ASSERT(s <= e);

1230

1231 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {

1232 U_ASSERT(e <= fInputLength);

1233 if (dest) {

1234 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents+s, (int32_t)(e-s), &status);

1235 } else {

1236 UText groupText = UTEXT_INITIALIZER;

1237 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat us);

1238 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);

1239 utext_close(&groupText);

1240 }

1241 } else {

1242 int32_t len16;

1243 if (UTEXT_USES_U16(fInputText)) {

1244 len16 = (int32_t)(e-s);

1245 } else {

1246 UErrorCode lengthStatus = U_ZERO_ERROR;

1247 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);

1248 }

1249 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));

1250 if (groupChars == NULL) {

1251 status = U_MEMORY_ALLOCATION_ERROR;

1252 return dest;

1253 }

1254 utext_extract(fInputText, s, e, groupChars, len16+1, &status);

1255

1256 if (dest) {

1257 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);

1258 } else {

1259 UText groupText = UTEXT_INITIALIZER;

1260 utext_openUChars(&groupText, groupChars, len16, &status);

1261 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);

1262 utext_close(&groupText);

1263 }

1264

1265 uprv_free(groupChars);

1266 }

1267 return dest;

1268 }

1269

1270 //------------------------------------------------------------------------------ --	1227 //------------------------------------------------------------------------------ --

1271 //	1228 //

1272 // appendGroup() -- currently internal only, appends a group to a UText rather	1229 // appendGroup() -- currently internal only, appends a group to a UText rather

1273 // than replacing its contents	1230 // than replacing its contents

1274 //	1231 //

1275 //------------------------------------------------------------------------------ --	1232 //------------------------------------------------------------------------------ --

1276	1233

1277 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const {	1234 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const {

1278 if (U_FAILURE(status)) {	1235 if (U_FAILURE(status)) {

1279 return 0;	1236 return 0;

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1340	1297

1341 //------------------------------------------------------------------------------ --	1298 //------------------------------------------------------------------------------ --

1342 //	1299 //

1343 // groupCount()	1300 // groupCount()

1344 //	1301 //

1345 //------------------------------------------------------------------------------ --	1302 //------------------------------------------------------------------------------ --

1346 int32_t RegexMatcher::groupCount() const {	1303 int32_t RegexMatcher::groupCount() const {

1347 return fPattern->fGroupMap->size();	1304 return fPattern->fGroupMap->size();

1348 }	1305 }

1349	1306

1350

1351

1352 //------------------------------------------------------------------------------ --	1307 //------------------------------------------------------------------------------ --

1353 //	1308 //

1354 // hasAnchoringBounds()	1309 // hasAnchoringBounds()

1355 //	1310 //

1356 //------------------------------------------------------------------------------ --	1311 //------------------------------------------------------------------------------ --

1357 UBool RegexMatcher::hasAnchoringBounds() const {	1312 UBool RegexMatcher::hasAnchoringBounds() const {

1358 return fAnchoringBounds;	1313 return fAnchoringBounds;

1359 }	1314 }

1360	1315

1361	1316

(...skipping 515 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1877 fTickCounter = TIMER_INITIAL_VALUE;	1832 fTickCounter = TIMER_INITIAL_VALUE;

1878 //resetStack(); // more expensive than it looks...	1833 //resetStack(); // more expensive than it looks...

1879 }	1834 }

1880	1835

1881	1836

1882 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {	1837 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {

1883 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us);	1838 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us);

1884 if (fPattern->fNeedsAltInput) {	1839 if (fPattern->fNeedsAltInput) {

1885 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus);	1840 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus);

1886 }	1841 }

	1842 if (U_FAILURE(fDeferredStatus)) {

	1843 return *this;

	1844 }

1887 fInputLength = utext_nativeLength(fInputText);	1845 fInputLength = utext_nativeLength(fInputText);

1888	1846

1889 reset();	1847 reset();

1890 delete fInput;	1848 delete fInput;

1891 fInput = NULL;	1849 fInput = NULL;

1892	1850

1893 // Do the following for any UnicodeString.	1851 // Do the following for any UnicodeString.

1894 // This is for compatibility for those clients who modify the input string "live" during regex operations.	1852 // This is for compatibility for those clients who modify the input string "live" during regex operations.

1895 fInputUniStrMaybeMutable = TRUE;	1853 fInputUniStrMaybeMutable = TRUE;

1896	1854

1897 if (fWordBreakItr != NULL) {	1855 if (fWordBreakItr != NULL) {

1898 #if UCONFIG_NO_BREAK_ITERATION==0	1856 #if UCONFIG_NO_BREAK_ITERATION==0

1899 UErrorCode status = U_ZERO_ERROR;	1857 UErrorCode status = U_ZERO_ERROR;

1900 fWordBreakItr->setText(fInputText, status);	1858 fWordBreakItr->setText(fInputText, status);

1901 #endif	1859 #endif

1902 }	1860 }

1903 return *this;	1861 return *this;

1904 }	1862 }

1905	1863

1906	1864

1907 RegexMatcher &RegexMatcher::reset(UText *input) {	1865 RegexMatcher &RegexMatcher::reset(UText *input) {

1908 if (fInputText != input) {	1866 if (fInputText != input) {

1909 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s);	1867 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s);

1910 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);	1868 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);

	1869 if (U_FAILURE(fDeferredStatus)) {

	1870 return *this;

	1871 }

1911 fInputLength = utext_nativeLength(fInputText);	1872 fInputLength = utext_nativeLength(fInputText);

1912	1873

1913 delete fInput;	1874 delete fInput;

1914 fInput = NULL;	1875 fInput = NULL;

1915	1876

1916 if (fWordBreakItr != NULL) {	1877 if (fWordBreakItr != NULL) {

1917 #if UCONFIG_NO_BREAK_ITERATION==0	1878 #if UCONFIG_NO_BREAK_ITERATION==0

1918 UErrorCode status = U_ZERO_ERROR;	1879 UErrorCode status = U_ZERO_ERROR;

1919 fWordBreakItr->setText(input, status);	1880 fWordBreakItr->setText(input, status);

1920 #endif	1881 #endif

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1988 //	1949 //

1989 // setTrace	1950 // setTrace

1990 //	1951 //

1991 //------------------------------------------------------------------------------ --	1952 //------------------------------------------------------------------------------ --

1992 void RegexMatcher::setTrace(UBool state) {	1953 void RegexMatcher::setTrace(UBool state) {

1993 fTraceDebug = state;	1954 fTraceDebug = state;

1994 }	1955 }

1995	1956

1996	1957

1997	1958

	1959 /**

	1960 * UText, replace entire contents of the destination UText with a substring of the source UText.

	1961 *

	1962 * @param src The source UText

	1963 * @param dest The destination UText. Must be writable.

	1964 * May be NULL, in which case a new UText will be allocated.

	1965 * @param start Start index of source substring.

	1966 * @param limit Limit index of source substring.

	1967 * @param status An error code.

	1968 */

	1969 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {

	1970 if (U_FAILURE(*status)) {

	1971 return dest;

	1972 }

	1973 if (start == limit) {

	1974 if (dest) {

	1975 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);

	1976 return dest;

	1977 } else {

	1978 return utext_openUChars(NULL, NULL, 0, status);

	1979 }

	1980 }

	1981 int32_t length = utext_extract(src, start, limit, NULL, 0, status);

	1982 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {

	1983 return dest;

	1984 }

	1985 *status = U_ZERO_ERROR;

	1986 MaybeStackArray<UChar, 40> buffer;

	1987 if (length >= buffer.getCapacity()) {

	1988 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.

	1989 if (newBuf == NULL) {

	1990 *status = U_MEMORY_ALLOCATION_ERROR;

	1991 }

	1992 }

	1993 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);

	1994 if (dest) {

	1995 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), leng th, status);

	1996 return dest;

	1997 }

	1998

	1999 // Caller did not provide a preexisting UText.

	2000 // Open a new one, and have it adopt the text buffer storage.

	2001 if (U_FAILURE(*status)) {

	2002 return NULL;

	2003 }

	2004 int32_t ownedLength = 0;

	2005 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);

	2006 if (ownedBuf == NULL) {

	2007 *status = U_MEMORY_ALLOCATION_ERROR;

	2008 return NULL;

	2009 }

	2010 UText *result = utext_openUChars(NULL, ownedBuf, length, status);

	2011 if (U_FAILURE(*status)) {

	2012 uprv_free(ownedBuf);

	2013 return NULL;

	2014 }

	2015 result->providerProperties \|= (1 << UTEXT_PROVIDER_OWNS_TEXT);

	2016 return result;

	2017 }

	2018

	2019

1998 //---------------------------------------------------------------------	2020 //---------------------------------------------------------------------

1999 //	2021 //

2000 // split	2022 // split

2001 //	2023 //

2002 //---------------------------------------------------------------------	2024 //---------------------------------------------------------------------

2003 int32_t RegexMatcher::split(const UnicodeString &input,	2025 int32_t RegexMatcher::split(const UnicodeString &input,

2004 UnicodeString dest[],	2026 UnicodeString dest[],

2005 int32_t destCapacity,	2027 int32_t destCapacity,

2006 UErrorCode &status)	2028 UErrorCode &status)

2007 {	2029 {

(...skipping 146 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2154 // text goes out into the next n destination strings.	2176 // text goes out into the next n destination strings.

2155 int32_t groupNum;	2177 int32_t groupNum;

2156 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {	2178 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {

2157 if (i >= destCapacity-2) {	2179 if (i >= destCapacity-2) {

2158 // Never fill the last available output string with capture group text.	2180 // Never fill the last available output string with capture group text.

2159 // It will be filled with the last field, the remainder of the	2181 // It will be filled with the last field, the remainder of the

2160 // unsplit input text.	2182 // unsplit input text.

2161 break;	2183 break;

2162 }	2184 }

2163 i++;	2185 i++;

2164 dest[i] = group(groupNum, dest[i], status);	2186 dest[i] = utext_extract_replace(fInputText, dest[i],

	2187 start64(groupNum, status), end64( groupNum, status), &status);

2165 }	2188 }

2166	2189

2167 if (nextOutputStringStart == fActiveLimit) {	2190 if (nextOutputStringStart == fActiveLimit) {

2168 // The delimiter was at the end of the string. We're done, but first	2191 // The delimiter was at the end of the string. We're done, but first

2169 // we output one last empty string, for the empty field followin g	2192 // we output one last empty string, for the empty field followin g

2170 // the delimiter at the end of input.	2193 // the delimiter at the end of input.

2171 if (i+1 < destCapacity) {	2194 if (i+1 < destCapacity) {

2172 ++i;	2195 ++i;

2173 if (dest[i] == NULL) {	2196 if (dest[i] == NULL) {

2174 dest[i] = utext_openUChars(NULL, NULL, 0, &status);	2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);

(...skipping 291 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2466 // new stack frame to all -1. The -1s are needed for capture group limits,	2489 // new stack frame to all -1. The -1s are needed for capture group limits,

2467 // where they indicate that a group has not yet matched anything.	2490 // where they indicate that a group has not yet matched anything.

2468 //------------------------------------------------------------------------------ --	2491 //------------------------------------------------------------------------------ --

2469 REStackFrame *RegexMatcher::resetStack() {	2492 REStackFrame *RegexMatcher::resetStack() {

2470 // Discard any previous contents of the state save stack, and initialize a	2493 // Discard any previous contents of the state save stack, and initialize a

2471 // new stack frame with all -1 data. The -1s are needed for capture group limits,	2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,

2472 // where they indicate that a group has not yet matched anything.	2495 // where they indicate that a group has not yet matched anything.

2473 fStack->removeAllElements();	2496 fStack->removeAllElements();

2474	2497

2475 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);	2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);

	2499 if(U_FAILURE(fDeferredStatus)) {

	2500 return NULL;

	2501 }

	2502

2476 int32_t i;	2503 int32_t i;

2477 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {	2504 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {

2478 iFrame->fExtra[i] = -1;	2505 iFrame->fExtra[i] = -1;

2479 }	2506 }

2480 return iFrame;	2507 return iFrame;

2481 }	2508 }

2482	2509

2483	2510

2484	2511

2485 //------------------------------------------------------------------------------ --	2512 //------------------------------------------------------------------------------ --

(...skipping 167 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2653 // fp The top frame pointer when called. At return, a new	2680 // fp The top frame pointer when called. At return, a new

2654 // frame will be present	2681 // frame will be present

2655 // savePatIdx An index into the compiled pattern. Goes into the original	2682 // savePatIdx An index into the compiled pattern. Goes into the original

2656 // (not new) frame. If execution ever back-tracks out of the	2683 // (not new) frame. If execution ever back-tracks out of the

2657 // new frame, this will be where we continue from in the pattern.	2684 // new frame, this will be where we continue from in the pattern.

2658 // Return	2685 // Return

2659 // The new frame pointer.	2686 // The new frame pointer.

2660 //	2687 //

2661 //------------------------------------------------------------------------------ --	2688 //------------------------------------------------------------------------------ --

2662 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {	2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {

	2690 if (U_FAILURE(status)) {

	2691 return fp;

	2692 }

2663 // push storage for a new frame.	2693 // push storage for a new frame.

2664 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);	2694 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);

2665 if (newFP == NULL) {	2695 if (U_FAILURE(status)) {

2666 // Failure on attempted stack expansion.	2696 // Failure on attempted stack expansion.

2667 // Stack function set some other error code, change it to a more	2697 // Stack function set some other error code, change it to a more

2668 // specific one for regular expressions.	2698 // specific one for regular expressions.

2669 status = U_REGEX_STACK_OVERFLOW;	2699 status = U_REGEX_STACK_OVERFLOW;

2670 // We need to return a writable stack frame, so just return the	2700 // We need to return a writable stack frame, so just return the

2671 // previous frame. The match operation will stop quickly	2701 // previous frame. The match operation will stop quickly

2672 // because of the error status, after which the frame will never	2702 // because of the error status, after which the frame will never

2673 // be looked at again.	2703 // be looked at again.

2674 return fp;	2704 return fp;

2675 }	2705 }

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2747	2777

2748 // Cache frequently referenced items from the compiled pattern	2778 // Cache frequently referenced items from the compiled pattern

2749 //	2779 //

2750 int64_t *pat = fPattern->fCompiledPat->getBuffer();	2780 int64_t *pat = fPattern->fCompiledPat->getBuffer();

2751	2781

2752 const UChar *litText = fPattern->fLiteralText.getBuffer();	2782 const UChar *litText = fPattern->fLiteralText.getBuffer();

2753 UVector *sets = fPattern->fSets;	2783 UVector *sets = fPattern->fSets;

2754	2784

2755 fFrameSize = fPattern->fFrameSize;	2785 fFrameSize = fPattern->fFrameSize;

2756 REStackFrame *fp = resetStack();	2786 REStackFrame *fp = resetStack();

	2787 if (U_FAILURE(fDeferredStatus)) {

	2788 status = fDeferredStatus;

	2789 return;

	2790 }

2757	2791

2758 fp->fPatIdx = 0;	2792 fp->fPatIdx = 0;

2759 fp->fInputIdx = startIdx;	2793 fp->fInputIdx = startIdx;

2760	2794

2761 // Zero out the pattern's static data	2795 // Zero out the pattern's static data

2762 int32_t i;	2796 int32_t i;

2763 for (i = 0; i<fPattern->fDataSize; i++) {	2797 for (i = 0; i<fPattern->fDataSize; i++) {

2764 fData[i] = 0;	2798 fData[i] = 0;

2765 }	2799 }

2766	2800

(...skipping 133 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2900 fRequireEnd = TRUE;	2934 fRequireEnd = TRUE;

2901 break;	2935 break;

2902 }	2936 }

2903	2937

2904 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);	2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

2905	2939

2906 // If we are positioned just before a new-line that is located at the	2940 // If we are positioned just before a new-line that is located at the

2907 // end of input, succeed.	2941 // end of input, succeed.

2908 UChar32 c = UTEXT_NEXT32(fInputText);	2942 UChar32 c = UTEXT_NEXT32(fInputText);

2909 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {	2943 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {

2910 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {	2944 if (isLineTerminator(c)) {

2911 // If not in the middle of a CR/LF sequence	2945 // If not in the middle of a CR/LF sequence

2912 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {	2946 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {

2913 // At new-line at end of input. Success	2947 // At new-line at end of input. Success

2914 fHitEnd = TRUE;	2948 fHitEnd = TRUE;

2915 fRequireEnd = TRUE;	2949 fRequireEnd = TRUE;

2916	2950

2917 break;	2951 break;

2918 }	2952 }

2919 }	2953 }

2920 } else {	2954 } else {

2921 UChar32 nextC = UTEXT_NEXT32(fInputText);	2955 UChar32 nextC = UTEXT_NEXT32(fInputText);

2922 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {	2956 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2958 if (fp->fInputIdx >= fAnchorLimit) {	2992 if (fp->fInputIdx >= fAnchorLimit) {

2959 // We really are at the end of input. Success.	2993 // We really are at the end of input. Success.

2960 fHitEnd = TRUE;	2994 fHitEnd = TRUE;

2961 fRequireEnd = TRUE;	2995 fRequireEnd = TRUE;

2962 break;	2996 break;

2963 }	2997 }

2964 // If we are positioned just before a new-line, succeed.	2998 // If we are positioned just before a new-line, succeed.

2965 // It makes no difference where the new-line is within the input.	2999 // It makes no difference where the new-line is within the input.

2966 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);	3000 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

2967 UChar32 c = UTEXT_CURRENT32(fInputText);	3001 UChar32 c = UTEXT_CURRENT32(fInputText);

2968 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {	3002 if (isLineTerminator(c)) {

2969 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence	3003 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence

2970 // In multi-line mode, hitting a new-line just before the end of input does not	3004 // In multi-line mode, hitting a new-line just before the end of input does not

2971 // set the hitEnd or requireEnd flags	3005 // set the hitEnd or requireEnd flags

2972 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {	3006 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {

2973 break;	3007 break;

2974 }	3008 }

2975 }	3009 }

2976 // not at a new line. Fail.	3010 // not at a new line. Fail.

2977 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

2978 }	3012 }

(...skipping 28 matching lines...) Expand all Loading...
3007 case URX_CARET_M: // ^, test for start of line in multi-line mode	3041 case URX_CARET_M: // ^, test for start of line in multi-line mode

3008 {	3042 {

3009 if (fp->fInputIdx == fAnchorStart) {	3043 if (fp->fInputIdx == fAnchorStart) {

3010 // We are at the start input. Success.	3044 // We are at the start input. Success.

3011 break;	3045 break;

3012 }	3046 }

3013 // Check whether character just before the current pos is a new-line	3047 // Check whether character just before the current pos is a new-line

3014 // unless we are at the end of input	3048 // unless we are at the end of input

3015 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);	3049 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

3016 UChar32 c = UTEXT_PREVIOUS32(fInputText);	3050 UChar32 c = UTEXT_PREVIOUS32(fInputText);

3017 if ((fp->fInputIdx < fAnchorLimit) &&	3051 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {

3018 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {

3019 // It's a new-line. ^ is true. Success.	3052 // It's a new-line. ^ is true. Success.

3020 // TODO: what should be done with positions between a CR and LF?	3053 // TODO: what should be done with positions between a CR and LF?

3021 break;	3054 break;

3022 }	3055 }

3023 // Not at the start of a line. Fail.	3056 // Not at the start of a line. Fail.

3024 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3057 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

3025 }	3058 }

3026 break;	3059 break;

3027	3060

3028	3061

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3089 break;	3122 break;

3090	3123

3091	3124

3092 case URX_BACKSLASH_G: // Test for position at end of previous match	3125 case URX_BACKSLASH_G: // Test for position at end of previous match

3093 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {	3126 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {

3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3127 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

3095 }	3128 }

3096 break;	3129 break;

3097	3130

3098	3131

	3132 case URX_BACKSLASH_H: // Test for \h, horizontal white space.

	3133 {

	3134 if (fp->fInputIdx >= fActiveLimit) {

	3135 fHitEnd = TRUE;

	3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3137 break;

	3138 }

	3139 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

	3140 UChar32 c = UTEXT_NEXT32(fInputText);

	3141 int8_t ctype = u_charType(c);

	3142 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB

	3143 success ^= (UBool)(opValue != 0); // flip sense for \H

	3144 if (success) {

	3145 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);

	3146 } else {

	3147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3148 }

	3149 }

	3150 break;

	3151

	3152

	3153 case URX_BACKSLASH_R: // Test for \R, any line break sequence.

	3154 {

	3155 if (fp->fInputIdx >= fActiveLimit) {

	3156 fHitEnd = TRUE;

	3157 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3158 break;

	3159 }

	3160 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

	3161 UChar32 c = UTEXT_NEXT32(fInputText);

	3162 if (isLineTerminator(c)) {

	3163 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {

	3164 utext_next32(fInputText);

	3165 }

	3166 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);

	3167 } else {

	3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3169 }

	3170 }

	3171 break;

	3172

	3173

	3174 case URX_BACKSLASH_V: // \v, any single line ending character.

	3175 {

	3176 if (fp->fInputIdx >= fActiveLimit) {

	3177 fHitEnd = TRUE;

	3178 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3179 break;

	3180 }

	3181 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

	3182 UChar32 c = UTEXT_NEXT32(fInputText);

	3183 UBool success = isLineTerminator(c);

	3184 success ^= (UBool)(opValue != 0); // flip sense for \V

	3185 if (success) {

	3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);

	3187 } else {

	3188 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	3189 }

	3190 }

	3191 break;

	3192

	3193

3099 case URX_BACKSLASH_X:	3194 case URX_BACKSLASH_X:

3100 // Match a Grapheme, as defined by Unicode TR 29.	3195 // Match a Grapheme, as defined by Unicode TR 29.

3101 // Differs slightly from Perl, which consumes combining marks independently	3196 // Differs slightly from Perl, which consumes combining marks independently

3102 // of context.	3197 // of context.

3103 {	3198 {

3104	3199

3105 // Fail if at end of input	3200 // Fail if at end of input

3106 if (fp->fInputIdx >= fActiveLimit) {	3201 if (fp->fInputIdx >= fActiveLimit) {

3107 fHitEnd = TRUE;	3202 fHitEnd = TRUE;

3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3203 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

(...skipping 207 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3316 // At end of input. Match failed. Backtrack out.	3411 // At end of input. Match failed. Backtrack out.

3317 fHitEnd = TRUE;	3412 fHitEnd = TRUE;

3318 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3413 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

3319 break;	3414 break;

3320 }	3415 }

3321	3416

3322 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);	3417 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);

3323	3418

3324 // There is input left. Advance over one char, unless we've hit end-of-line	3419 // There is input left. Advance over one char, unless we've hit end-of-line

3325 UChar32 c = UTEXT_NEXT32(fInputText);	3420 UChar32 c = UTEXT_NEXT32(fInputText);

3326 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible	3421 if (isLineTerminator(c)) {

3327 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {

3328 // End of line in normal mode. . does not match.	3422 // End of line in normal mode. . does not match.

3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	3423 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

3330 break;	3424 break;

3331 }	3425 }

3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);	3426 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);

3333 }	3427 }

3334 break;	3428 break;

3335	3429

3336	3430

3337 case URX_DOTANY_ALL:	3431 case URX_DOTANY_ALL:

(...skipping 736 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4074 UTEXT_SETNATIVEINDEX(fInputText, ix);	4168 UTEXT_SETNATIVEINDEX(fInputText, ix);

4075 for (;;) {	4169 for (;;) {

4076 if (ix >= fActiveLimit) {	4170 if (ix >= fActiveLimit) {

4077 fHitEnd = TRUE;	4171 fHitEnd = TRUE;

4078 break;	4172 break;

4079 }	4173 }

4080 UChar32 c = UTEXT_NEXT32(fInputText);	4174 UChar32 c = UTEXT_NEXT32(fInputText);

4081 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s	4175 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s

4082 if ((c == 0x0a) || // 0x0a is newline in both modes.	4176 if ((c == 0x0a) || // 0x0a is newline in both modes.

4083 (((opValue & 2) == 0) && // IF not UNIX_LINES mode	4177 (((opValue & 2) == 0) && // IF not UNIX_LINES mode

4084 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {	4178 isLineTerminator(c))) {

4085 // char is a line ending. Exit the scanning loop.	4179 // char is a line ending. Exit the scanning loop.

4086 break;	4180 break;

4087 }	4181 }

4088 }	4182 }

4089 ix = UTEXT_GETNATIVEINDEX(fInputText);	4183 ix = UTEXT_GETNATIVEINDEX(fInputText);

4090 }	4184 }

4091 }	4185 }

4092	4186

4093 // If there were no matching characters, skip over the loop altogether.	4187 // If there were no matching characters, skip over the loop altogether.

4094 // The loop doesn't run at all, a * op always succeeds.	4188 // The loop doesn't run at all, a * op always succeeds.

(...skipping 155 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4250 //	4344 //

4251 int64_t *pat = fPattern->fCompiledPat->getBuffer();	4345 int64_t *pat = fPattern->fCompiledPat->getBuffer();

4252	4346

4253 const UChar *litText = fPattern->fLiteralText.getBuffer();	4347 const UChar *litText = fPattern->fLiteralText.getBuffer();

4254 UVector *sets = fPattern->fSets;	4348 UVector *sets = fPattern->fSets;

4255	4349

4256 const UChar *inputBuf = fInputText->chunkContents;	4350 const UChar *inputBuf = fInputText->chunkContents;

4257	4351

4258 fFrameSize = fPattern->fFrameSize;	4352 fFrameSize = fPattern->fFrameSize;

4259 REStackFrame *fp = resetStack();	4353 REStackFrame *fp = resetStack();

	4354 if (U_FAILURE(fDeferredStatus)) {

	4355 status = fDeferredStatus;

	4356 return;

	4357 }

4260	4358

4261 fp->fPatIdx = 0;	4359 fp->fPatIdx = 0;

4262 fp->fInputIdx = startIdx;	4360 fp->fInputIdx = startIdx;

4263	4361

4264 // Zero out the pattern's static data	4362 // Zero out the pattern's static data

4265 int32_t i;	4363 int32_t i;

4266 for (i = 0; i<fPattern->fDataSize; i++) {	4364 for (i = 0; i<fPattern->fDataSize; i++) {

4267 fData[i] = 0;	4365 fData[i] = 0;

4268 }	4366 }

4269	4367

(...skipping 135 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4405 fRequireEnd = TRUE;	4503 fRequireEnd = TRUE;

4406 break;	4504 break;

4407 }	4505 }

4408	4506

4409 // If we are positioned just before a new-line that is located at the	4507 // If we are positioned just before a new-line that is located at the

4410 // end of input, succeed.	4508 // end of input, succeed.

4411 if (fp->fInputIdx == fAnchorLimit-1) {	4509 if (fp->fInputIdx == fAnchorLimit-1) {

4412 UChar32 c;	4510 UChar32 c;

4413 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);	4511 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);

4414	4512

4415 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {	4513 if (isLineTerminator(c)) {

4416 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {	4514 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {

4417 // At new-line at end of input. Success	4515 // At new-line at end of input. Success

4418 fHitEnd = TRUE;	4516 fHitEnd = TRUE;

4419 fRequireEnd = TRUE;	4517 fRequireEnd = TRUE;

4420 break;	4518 break;

4421 }	4519 }

4422 }	4520 }

4423 } else if (fp->fInputIdx == fAnchorLimit-2 &&	4521 } else if (fp->fInputIdx == fAnchorLimit-2 &&

4424 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {	4522 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {

4425 fHitEnd = TRUE;	4523 fHitEnd = TRUE;

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4459 {	4557 {

4460 if (fp->fInputIdx >= fAnchorLimit) {	4558 if (fp->fInputIdx >= fAnchorLimit) {

4461 // We really are at the end of input. Success.	4559 // We really are at the end of input. Success.

4462 fHitEnd = TRUE;	4560 fHitEnd = TRUE;

4463 fRequireEnd = TRUE;	4561 fRequireEnd = TRUE;

4464 break;	4562 break;

4465 }	4563 }

4466 // If we are positioned just before a new-line, succeed.	4564 // If we are positioned just before a new-line, succeed.

4467 // It makes no difference where the new-line is within the input.	4565 // It makes no difference where the new-line is within the input.

4468 UChar32 c = inputBuf[fp->fInputIdx];	4566 UChar32 c = inputBuf[fp->fInputIdx];

4469 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {	4567 if (isLineTerminator(c)) {

4470 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence	4568 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence

4471 // In multi-line mode, hitting a new-line just before the end of input does not	4569 // In multi-line mode, hitting a new-line just before the end of input does not

4472 // set the hitEnd or requireEnd flags	4570 // set the hitEnd or requireEnd flags

4473 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {	4571 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {

4474 break;	4572 break;

4475 }	4573 }

4476 }	4574 }

4477 // not at a new line. Fail.	4575 // not at a new line. Fail.

4478 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

4479 }	4577 }

(...skipping 27 matching lines...) Expand all Loading...
4507 case URX_CARET_M: // ^, test for start of line in multi-line mode	4605 case URX_CARET_M: // ^, test for start of line in multi-line mode

4508 {	4606 {

4509 if (fp->fInputIdx == fAnchorStart) {	4607 if (fp->fInputIdx == fAnchorStart) {

4510 // We are at the start input. Success.	4608 // We are at the start input. Success.

4511 break;	4609 break;

4512 }	4610 }

4513 // Check whether character just before the current pos is a new-line	4611 // Check whether character just before the current pos is a new-line

4514 // unless we are at the end of input	4612 // unless we are at the end of input

4515 UChar c = inputBuf[fp->fInputIdx - 1];	4613 UChar c = inputBuf[fp->fInputIdx - 1];

4516 if ((fp->fInputIdx < fAnchorLimit) &&	4614 if ((fp->fInputIdx < fAnchorLimit) &&

4517 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {	4615 isLineTerminator(c)) {

4518 // It's a new-line. ^ is true. Success.	4616 // It's a new-line. ^ is true. Success.

4519 // TODO: what should be done with positions between a CR and LF?	4617 // TODO: what should be done with positions between a CR and LF?

4520 break;	4618 break;

4521 }	4619 }

4522 // Not at the start of a line. Fail.	4620 // Not at the start of a line. Fail.

4523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4621 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

4524 }	4622 }

4525 break;	4623 break;

4526	4624

4527	4625

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4584 break;	4682 break;

4585	4683

4586	4684

4587 case URX_BACKSLASH_G: // Test for position at end of previous match	4685 case URX_BACKSLASH_G: // Test for position at end of previous match

4588 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {	4686 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {

4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

4590 }	4688 }

4591 break;	4689 break;

4592	4690

4593	4691

	4692 case URX_BACKSLASH_H: // Test for \h, horizontal white space.

	4693 {

	4694 if (fp->fInputIdx >= fActiveLimit) {

	4695 fHitEnd = TRUE;

	4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4697 break;

	4698 }

	4699 UChar32 c;

	4700 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);

	4701 int8_t ctype = u_charType(c);

	4702 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB

	4703 success ^= (UBool)(opValue != 0); // flip sense for \H

	4704 if (!success) {

	4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4706 }

	4707 }

	4708 break;

	4709

	4710

	4711 case URX_BACKSLASH_R: // Test for \R, any line break sequence.

	4712 {

	4713 if (fp->fInputIdx >= fActiveLimit) {

	4714 fHitEnd = TRUE;

	4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4716 break;

	4717 }

	4718 UChar32 c;

	4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);

	4720 if (isLineTerminator(c)) {

	4721 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {

	4722 // Check for CR/LF sequence. Consume both together when found.

	4723 UChar c2;

	4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);

	4725 if (c2 != 0x0a) {

	4726 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);

	4727 }

	4728 }

	4729 } else {

	4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4731 }

	4732 }

	4733 break;

	4734

	4735

	4736 case URX_BACKSLASH_V: // Any single code point line ending.

	4737 {

	4738 if (fp->fInputIdx >= fActiveLimit) {

	4739 fHitEnd = TRUE;

	4740 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4741 break;

	4742 }

	4743 UChar32 c;

	4744 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);

	4745 UBool success = isLineTerminator(c);

	4746 success ^= (UBool)(opValue != 0); // flip sense for \V

	4747 if (!success) {

	4748 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

	4749 }

	4750 }

	4751 break;

	4752

	4753

	4754

4594 case URX_BACKSLASH_X:	4755 case URX_BACKSLASH_X:

4595 // Match a Grapheme, as defined by Unicode TR 29.	4756 // Match a Grapheme, as defined by Unicode TR 29.

4596 // Differs slightly from Perl, which consumes combining marks independently	4757 // Differs slightly from Perl, which consumes combining marks independently

4597 // of context.	4758 // of context.

4598 {	4759 {

4599	4760

4600 // Fail if at end of input	4761 // Fail if at end of input

4601 if (fp->fInputIdx >= fActiveLimit) {	4762 if (fp->fInputIdx >= fActiveLimit) {

4602 fHitEnd = TRUE;	4763 fHitEnd = TRUE;

4603 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

(...skipping 189 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4793 if (fp->fInputIdx >= fActiveLimit) {	4954 if (fp->fInputIdx >= fActiveLimit) {

4794 // At end of input. Match failed. Backtrack out.	4955 // At end of input. Match failed. Backtrack out.

4795 fHitEnd = TRUE;	4956 fHitEnd = TRUE;

4796 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

4797 break;	4958 break;

4798 }	4959 }

4799	4960

4800 // There is input left. Advance over one char, unless we've hit end-of-line	4961 // There is input left. Advance over one char, unless we've hit end-of-line

4801 UChar32 c;	4962 UChar32 c;

4802 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);	4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);

4803 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible	4964 if (isLineTerminator(c)) {

4804 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {

4805 // End of line in normal mode. . does not match.	4965 // End of line in normal mode. . does not match.

4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize);	4966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);

4807 break;	4967 break;

4808 }	4968 }

4809 }	4969 }

4810 break;	4970 break;

4811	4971

4812	4972

4813 case URX_DOTANY_ALL:	4973 case URX_DOTANY_ALL:

4814 {	4974 {

(...skipping 693 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5508 for (;;) {	5668 for (;;) {

5509 if (ix >= fActiveLimit) {	5669 if (ix >= fActiveLimit) {

5510 fHitEnd = TRUE;	5670 fHitEnd = TRUE;

5511 break;	5671 break;

5512 }	5672 }

5513 UChar32 c;	5673 UChar32 c;

5514 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]	5674 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]

5515 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s	5675 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s

5516 if ((c == 0x0a) || // 0x0a is newline in both modes.	5676 if ((c == 0x0a) || // 0x0a is newline in both modes.

5517 (((opValue & 2) == 0) && // IF not UNIX_LINES mode	5677 (((opValue & 2) == 0) && // IF not UNIX_LINES mode

5518 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {	5678 isLineTerminator(c))) {

5519 // char is a line ending. Put the input pos back to the	5679 // char is a line ending. Put the input pos back to the

5520 // line ending char, and exit the scanning loop.	5680 // line ending char, and exit the scanning loop.

5521 U16_BACK_1(inputBuf, 0, ix);	5681 U16_BACK_1(inputBuf, 0, ix);

5522 break;	5682 break;

5523 }	5683 }

5524 }	5684 }

5525 }	5685 }

5526 }	5686 }

5527	5687

5528 // If there were no matching characters, skip over the loop altogether.	5688 // If there were no matching characters, skip over the loop altogether.

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5624	5784

5625 return;	5785 return;

5626 }	5786 }

5627	5787

5628	5788

5629 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)	5789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)

5630	5790

5631 U_NAMESPACE_END	5791 U_NAMESPACE_END

5632	5792

5633 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS	5793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

OLD	NEW

« no previous file with comments | « source/i18n/region.cpp ('k') | source/i18n/repattrn.cpp » ('j') | no next file with comments »