Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(275)

Side by Side Diff: source/i18n/rematch.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/region.cpp ('k') | source/i18n/repattrn.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ************************************************************************** 2 **************************************************************************
3 * Copyright (C) 2002-2014 International Business Machines Corporation * 3 * Copyright (C) 2002-2015 International Business Machines Corporation *
4 * and others. All rights reserved. * 4 * and others. All rights reserved. *
5 ************************************************************************** 5 **************************************************************************
6 */ 6 */
7 // 7 //
8 // file: rematch.cpp 8 // file: rematch.cpp
9 // 9 //
10 // Contains the implementation of class RegexMatcher, 10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package. 11 // which is one of the main API classes for the ICU regular expression package.
12 // 12 //
13 13
(...skipping 28 matching lines...) Expand all
42 // backtrack point. 42 // backtrack point.
43 // 43 //
44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
45 45
46 // Time limit counter constant. 46 // Time limit counter constant.
47 // Time limits for expression evaluation are in terms of quanta of work by 47 // Time limits for expression evaluation are in terms of quanta of work by
48 // the engine, each of which is 10,000 state saves. 48 // the engine, each of which is 10,000 state saves.
49 // This constant determines the number of state saves per tick. 49 // This constant determines the number of state saves per tick.
50 static const int32_t TIMER_INITIAL_VALUE = 10000; 50 static const int32_t TIMER_INITIAL_VALUE = 10000;
51 51
52
53 // Test for any of the Unicode line terminating characters.
54 static inline UBool isLineTerminator(UChar32 c) {
55 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
56 return false;
57 }
58 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
59 }
60
52 //----------------------------------------------------------------------------- 61 //-----------------------------------------------------------------------------
53 // 62 //
54 // Constructor and Destructor 63 // Constructor and Destructor
55 // 64 //
56 //----------------------------------------------------------------------------- 65 //-----------------------------------------------------------------------------
57 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 66 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
58 fDeferredStatus = U_ZERO_ERROR; 67 fDeferredStatus = U_ZERO_ERROR;
59 init(fDeferredStatus); 68 init(fDeferredStatus);
60 if (U_FAILURE(fDeferredStatus)) { 69 if (U_FAILURE(fDeferredStatus)) {
61 return; 70 return;
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after
209 fDeferredStatus = status; 218 fDeferredStatus = status;
210 fData = fSmallData; 219 fData = fSmallData;
211 fWordBreakItr = NULL; 220 fWordBreakItr = NULL;
212 221
213 fStack = NULL; 222 fStack = NULL;
214 fInputText = NULL; 223 fInputText = NULL;
215 fAltInputText = NULL; 224 fAltInputText = NULL;
216 fInput = NULL; 225 fInput = NULL;
217 fInputLength = 0; 226 fInputLength = 0;
218 fInputUniStrMaybeMutable = FALSE; 227 fInputUniStrMaybeMutable = FALSE;
219
220 if (U_FAILURE(status)) {
221 fDeferredStatus = status;
222 }
223 } 228 }
224 229
225 // 230 //
226 // init2() Common initialization for use by RegexMatcher constructors, part 2. 231 // init2() Common initialization for use by RegexMatcher constructors, part 2.
227 // This handles the common setup to be done after the Pattern is available. 232 // This handles the common setup to be done after the Pattern is available.
228 // 233 //
229 void RegexMatcher::init2(UText *input, UErrorCode &status) { 234 void RegexMatcher::init2(UText *input, UErrorCode &status) {
230 if (U_FAILURE(status)) { 235 if (U_FAILURE(status)) {
231 fDeferredStatus = status; 236 fDeferredStatus = status;
232 return; 237 return;
(...skipping 17 matching lines...) Expand all
250 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 255 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
251 if (U_FAILURE(status)) { 256 if (U_FAILURE(status)) {
252 fDeferredStatus = status; 257 fDeferredStatus = status;
253 return; 258 return;
254 } 259 }
255 } 260 }
256 261
257 262
258 static const UChar BACKSLASH = 0x5c; 263 static const UChar BACKSLASH = 0x5c;
259 static const UChar DOLLARSIGN = 0x24; 264 static const UChar DOLLARSIGN = 0x24;
265 static const UChar LEFTBRACKET = 0x7b;
266 static const UChar RIGHTBRACKET = 0x7d;
267
260 //------------------------------------------------------------------------------ -- 268 //------------------------------------------------------------------------------ --
261 // 269 //
262 // appendReplacement 270 // appendReplacement
263 // 271 //
264 //------------------------------------------------------------------------------ -- 272 //------------------------------------------------------------------------------ --
265 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 273 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
266 const UnicodeString &replacement, 274 const UnicodeString &replacement,
267 UErrorCode &status) { 275 UErrorCode &status) {
268 UText replacementText = UTEXT_INITIALIZER; 276 UText replacementText = UTEXT_INITIALIZER;
269 277
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
324 uprv_free(inputChars); 332 uprv_free(inputChars);
325 } 333 }
326 } 334 }
327 fAppendPosition = fMatchEnd; 335 fAppendPosition = fMatchEnd;
328 336
329 337
330 // scan the replacement text, looking for substitutions ($n) and \escapes. 338 // scan the replacement text, looking for substitutions ($n) and \escapes.
331 // TODO: optimize this loop by efficiently scanning for '$' or '\', 339 // TODO: optimize this loop by efficiently scanning for '$' or '\',
332 // move entire ranges not containing substitutions. 340 // move entire ranges not containing substitutions.
333 UTEXT_SETNATIVEINDEX(replacement, 0); 341 UTEXT_SETNATIVEINDEX(replacement, 0);
334 UChar32 c = UTEXT_NEXT32(replacement); 342 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENT INEL; c = UTEXT_NEXT32(replacement)) {
335 while (c != U_SENTINEL) {
336 if (c == BACKSLASH) { 343 if (c == BACKSLASH) {
337 // Backslash Escape. Copy the following char out without further ch ecks. 344 // Backslash Escape. Copy the following char out without further ch ecks.
338 // Note: Surrogate pairs don't need any special handling 345 // Note: Surrogate pairs don't need any special handling
339 // The second half won't be a '$' or a '\', and 346 // The second half won't be a '$' or a '\', and
340 // will move to the dest normally on the next 347 // will move to the dest normally on the next
341 // loop iteration. 348 // loop iteration.
342 c = UTEXT_CURRENT32(replacement); 349 c = UTEXT_CURRENT32(replacement);
343 if (c == U_SENTINEL) { 350 if (c == U_SENTINEL) {
344 break; 351 break;
345 } 352 }
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s); 398 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);
392 } else { 399 } else {
393 UChar surrogate[2]; 400 UChar surrogate[2];
394 surrogate[0] = U16_LEAD(c); 401 surrogate[0] = U16_LEAD(c);
395 surrogate[1] = U16_TRAIL(c); 402 surrogate[1] = U16_TRAIL(c);
396 if (U_SUCCESS(status)) { 403 if (U_SUCCESS(status)) {
397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 404 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398 } 405 }
399 } 406 }
400 } else { 407 } else {
401 // We've got a $. Pick up a capture group number if one follows. 408 // We've got a $. Pick up a capture group name or number if one follows.
402 // Consume at most the number of digits necessary for the largest capture 409 // Consume digits so long as the resulting group number <= the number of
403 // number that is valid for this pattern. 410 // capture groups in the pattern.
404 411
412 int32_t groupNum = 0;
405 int32_t numDigits = 0; 413 int32_t numDigits = 0;
406 int32_t groupNum = 0; 414 UChar32 nextChar = utext_current32(replacement);
407 UChar32 digitC; 415 if (nextChar == LEFTBRACKET) {
408 for (;;) { 416 // Scan for a Named Capture Group, ${name}.
409 digitC = UTEXT_CURRENT32(replacement); 417 UnicodeString groupName;
410 if (digitC == U_SENTINEL) { 418 utext_next32(replacement);
411 break; 419 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
420 nextChar = utext_next32(replacement);
421 if (nextChar == U_SENTINEL) {
422 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
423 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || / / A..Z
424 (nextChar >= 0x61 && nextChar <= 0x7a) || / / a..z
425 (nextChar >= 0x31 && nextChar <= 0x39)) { / / 0..9
426 groupName.append(nextChar);
427 } else if (nextChar == RIGHTBRACKET) {
428 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &group Name);
429 if (groupNum == 0) {
430 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
431 }
432 } else {
433 // Character was something other than a name char or a c losing '}'
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
412 } 436 }
413 if (u_isdigit(digitC) == FALSE) { 437
414 break; 438 } else if (u_isdigit(nextChar)) {
439 // $n Scan for a capture group number
440 int32_t numCaptureGroups = fPattern->fGroupMap->size();
441 for (;;) {
442 nextChar = UTEXT_CURRENT32(replacement);
443 if (nextChar == U_SENTINEL) {
444 break;
445 }
446 if (u_isdigit(nextChar) == FALSE) {
447 break;
448 }
449 int32_t nextDigitVal = u_charDigitValue(nextChar);
450 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
451 // Don't consume the next digit if it makes the capture group number too big.
452 if (numDigits == 0) {
453 status = U_INDEX_OUTOFBOUNDS_ERROR;
454 }
455 break;
456 }
457 (void)UTEXT_NEXT32(replacement);
458 groupNum=groupNum*10 + nextDigitVal;
459 ++numDigits;
415 } 460 }
416 (void)UTEXT_NEXT32(replacement); 461 } else {
417 groupNum=groupNum*10 + u_charDigitValue(digitC); 462 // $ not followed by capture group name or number.
418 numDigits++; 463 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
419 if (numDigits >= fPattern->fMaxCaptureDigits) {
420 break;
421 }
422 } 464 }
423 465
424 466 if (U_SUCCESS(status)) {
425 if (numDigits == 0) {
426 // The $ didn't introduce a group number at all.
427 // Treat it as just part of the substitution text.
428 UChar c16 = DOLLARSIGN;
429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);
430 } else {
431 // Finally, append the capture group data to the destination.
432 destLen += appendGroup(groupNum, dest, status); 467 destLen += appendGroup(groupNum, dest, status);
433 if (U_FAILURE(status)) {
434 // Can fail if group number is out of range.
435 break;
436 }
437 } 468 }
438 } 469 } // End of $ capture group handling
439 470 } // End of per-character loop through the replacement string.
440 if (U_FAILURE(status)) {
441 break;
442 } else {
443 c = UTEXT_NEXT32(replacement);
444 }
445 }
446 471
447 return *this; 472 return *this;
448 } 473 }
449 474
450 475
451 476
452 //------------------------------------------------------------------------------ -- 477 //------------------------------------------------------------------------------ --
453 // 478 //
454 // appendTail Intended to be used in conjunction with appendReplacement() 479 // appendTail Intended to be used in conjunction with appendReplacement()
455 // To the destination string, append everything following 480 // To the destination string, append everything following
(...skipping 354 matching lines...) Expand 10 before | Expand all | Expand 10 after
810 c = UTEXT_NEXT32(fInputText); 835 c = UTEXT_NEXT32(fInputText);
811 startPos = UTEXT_GETNATIVEINDEX(fInputText); 836 startPos = UTEXT_GETNATIVEINDEX(fInputText);
812 // Note that it's perfectly OK for a pattern to have a zero-length 837 // Note that it's perfectly OK for a pattern to have a zero-length
813 // match at the end of a string, so we must make sure that the loop 838 // match at the end of a string, so we must make sure that the loop
814 // runs with startPos == testStartLimit the last time through. 839 // runs with startPos == testStartLimit the last time through.
815 if (findProgressInterrupt(startPos, status)) 840 if (findProgressInterrupt(startPos, status))
816 return FALSE; 841 return FALSE;
817 } 842 }
818 } else { 843 } else {
819 for (;;) { 844 for (;;) {
820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m any chars as possible 845 if (isLineTerminator(c)) {
821 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202 9 )) { 846 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURREN T32(fInputText) == 0x0a) {
822 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU RRENT32(fInputText) == 0x0a) { 847 (void)UTEXT_NEXT32(fInputText);
823 (void)UTEXT_NEXT32(fInputText); 848 startPos = UTEXT_GETNATIVEINDEX(fInputText);
824 startPos = UTEXT_GETNATIVEINDEX(fInputText); 849 }
825 } 850 MatchAt(startPos, FALSE, status);
826 MatchAt(startPos, FALSE, status); 851 if (U_FAILURE(status)) {
827 if (U_FAILURE(status)) { 852 return FALSE;
828 return FALSE; 853 }
829 } 854 if (fMatch) {
830 if (fMatch) { 855 return TRUE;
831 return TRUE; 856 }
832 } 857 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 UTEXT_SETNATIVEINDEX(fInputText, startPos);
834 } 858 }
835 if (startPos >= testStartLimit) { 859 if (startPos >= testStartLimit) {
836 fMatch = FALSE; 860 fMatch = FALSE;
837 fHitEnd = TRUE; 861 fHitEnd = TRUE;
838 return FALSE; 862 return FALSE;
839 } 863 }
840 c = UTEXT_NEXT32(fInputText); 864 c = UTEXT_NEXT32(fInputText);
841 startPos = UTEXT_GETNATIVEINDEX(fInputText); 865 startPos = UTEXT_GETNATIVEINDEX(fInputText);
842 // Note that it's perfectly OK for a pattern to have a zero- length 866 // Note that it's perfectly OK for a pattern to have a zero- length
843 // match at the end of a string, so we must make sure that the loop 867 // match at the end of a string, so we must make sure that the loop
(...skipping 227 matching lines...) Expand 10 before | Expand all | Expand 10 after
1071 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1095 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1072 // Note that it's perfectly OK for a pattern to have a zero-leng th 1096 // Note that it's perfectly OK for a pattern to have a zero-leng th
1073 // match at the end of a string, so we must make sure that the loop 1097 // match at the end of a string, so we must make sure that the loop
1074 // runs with startPos == testLen the last time through. 1098 // runs with startPos == testLen the last time through.
1075 if (findProgressInterrupt(startPos, status)) 1099 if (findProgressInterrupt(startPos, status))
1076 return FALSE; 1100 return FALSE;
1077 } 1101 }
1078 } else { 1102 } else {
1079 for (;;) { 1103 for (;;) {
1080 c = inputBuf[startPos-1]; 1104 c = inputBuf[startPos-1];
1081 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1105 if (isLineTerminator(c)) {
1082 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1083 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) { 1106 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) {
1084 startPos++; 1107 startPos++;
1085 } 1108 }
1086 MatchChunkAt(startPos, FALSE, status); 1109 MatchChunkAt(startPos, FALSE, status);
1087 if (U_FAILURE(status)) { 1110 if (U_FAILURE(status)) {
1088 return FALSE; 1111 return FALSE;
1089 } 1112 }
1090 if (fMatch) { 1113 if (fMatch) {
1091 return TRUE; 1114 return TRUE;
1092 } 1115 }
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
1168 group_len = e - s; 1191 group_len = e - s;
1169 1192
1170 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1193 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1171 if (dest) 1194 if (dest)
1172 UTEXT_SETNATIVEINDEX(dest, s); 1195 UTEXT_SETNATIVEINDEX(dest, s);
1173 return dest; 1196 return dest;
1174 } 1197 }
1175 1198
1176 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1199 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1177 UnicodeString result; 1200 UnicodeString result;
1178 if (U_FAILURE(status)) { 1201 int64_t groupStart = start64(groupNum, status);
1202 int64_t groupEnd = end64(groupNum, status);
1203 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1179 return result; 1204 return result;
1180 } 1205 }
1181 UText resultText = UTEXT_INITIALIZER; 1206
1182 utext_openUnicodeString(&resultText, &result, &status); 1207 // Get the group length using a utext_extract preflight.
1183 group(groupNum, &resultText, status); 1208 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1184 utext_close(&resultText); 1209 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &s tatus);
1210 if (status != U_BUFFER_OVERFLOW_ERROR) {
1211 return result;
1212 }
1213
1214 status = U_ZERO_ERROR;
1215 UChar *buf = result.getBuffer(length);
1216 if (buf == NULL) {
1217 status = U_MEMORY_ALLOCATION_ERROR;
1218 } else {
1219 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1220 result.releaseBuffer(extractLength);
1221 U_ASSERT(length == extractLength);
1222 }
1185 return result; 1223 return result;
1186 } 1224 }
1187 1225
1188 1226
1189 // Return deep (mutable) clone
1190 // Technology Preview (as an API), but note that the UnicodeString API is i mplemented
1191 // using this function.
1192 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co nst {
1193 if (U_FAILURE(status)) {
1194 return dest;
1195 }
1196
1197 if (U_FAILURE(fDeferredStatus)) {
1198 status = fDeferredStatus;
1199 } else if (fMatch == FALSE) {
1200 status = U_REGEX_INVALID_STATE;
1201 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202 status = U_INDEX_OUTOFBOUNDS_ERROR;
1203 }
1204 if (U_FAILURE(status)) {
1205 return dest;
1206 }
1207
1208 int64_t s, e;
1209 if (groupNum == 0) {
1210 s = fMatchStart;
1211 e = fMatchEnd;
1212 } else {
1213 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1214 U_ASSERT(groupOffset < fPattern->fFrameSize);
1215 U_ASSERT(groupOffset >= 0);
1216 s = fFrame->fExtra[groupOffset];
1217 e = fFrame->fExtra[groupOffset+1];
1218 }
1219
1220 if (s < 0) {
1221 // A capture group wasn't part of the match
1222 if (dest) {
1223 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1224 return dest;
1225 } else {
1226 return utext_openUChars(NULL, NULL, 0, &status);
1227 }
1228 }
1229 U_ASSERT(s <= e);
1230
1231 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1232 U_ASSERT(e <= fInputLength);
1233 if (dest) {
1234 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents+s, (int32_t)(e-s), &status);
1235 } else {
1236 UText groupText = UTEXT_INITIALIZER;
1237 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat us);
1238 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1239 utext_close(&groupText);
1240 }
1241 } else {
1242 int32_t len16;
1243 if (UTEXT_USES_U16(fInputText)) {
1244 len16 = (int32_t)(e-s);
1245 } else {
1246 UErrorCode lengthStatus = U_ZERO_ERROR;
1247 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1248 }
1249 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1250 if (groupChars == NULL) {
1251 status = U_MEMORY_ALLOCATION_ERROR;
1252 return dest;
1253 }
1254 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1255
1256 if (dest) {
1257 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1258 } else {
1259 UText groupText = UTEXT_INITIALIZER;
1260 utext_openUChars(&groupText, groupChars, len16, &status);
1261 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1262 utext_close(&groupText);
1263 }
1264
1265 uprv_free(groupChars);
1266 }
1267 return dest;
1268 }
1269
1270 //------------------------------------------------------------------------------ -- 1227 //------------------------------------------------------------------------------ --
1271 // 1228 //
1272 // appendGroup() -- currently internal only, appends a group to a UText rather 1229 // appendGroup() -- currently internal only, appends a group to a UText rather
1273 // than replacing its contents 1230 // than replacing its contents
1274 // 1231 //
1275 //------------------------------------------------------------------------------ -- 1232 //------------------------------------------------------------------------------ --
1276 1233
1277 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const { 1234 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const {
1278 if (U_FAILURE(status)) { 1235 if (U_FAILURE(status)) {
1279 return 0; 1236 return 0;
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
1340 1297
1341 //------------------------------------------------------------------------------ -- 1298 //------------------------------------------------------------------------------ --
1342 // 1299 //
1343 // groupCount() 1300 // groupCount()
1344 // 1301 //
1345 //------------------------------------------------------------------------------ -- 1302 //------------------------------------------------------------------------------ --
1346 int32_t RegexMatcher::groupCount() const { 1303 int32_t RegexMatcher::groupCount() const {
1347 return fPattern->fGroupMap->size(); 1304 return fPattern->fGroupMap->size();
1348 } 1305 }
1349 1306
1350
1351
1352 //------------------------------------------------------------------------------ -- 1307 //------------------------------------------------------------------------------ --
1353 // 1308 //
1354 // hasAnchoringBounds() 1309 // hasAnchoringBounds()
1355 // 1310 //
1356 //------------------------------------------------------------------------------ -- 1311 //------------------------------------------------------------------------------ --
1357 UBool RegexMatcher::hasAnchoringBounds() const { 1312 UBool RegexMatcher::hasAnchoringBounds() const {
1358 return fAnchoringBounds; 1313 return fAnchoringBounds;
1359 } 1314 }
1360 1315
1361 1316
(...skipping 515 matching lines...) Expand 10 before | Expand all | Expand 10 after
1877 fTickCounter = TIMER_INITIAL_VALUE; 1832 fTickCounter = TIMER_INITIAL_VALUE;
1878 //resetStack(); // more expensive than it looks... 1833 //resetStack(); // more expensive than it looks...
1879 } 1834 }
1880 1835
1881 1836
1882 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1837 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1883 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us); 1838 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us);
1884 if (fPattern->fNeedsAltInput) { 1839 if (fPattern->fNeedsAltInput) {
1885 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus); 1840 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus);
1886 } 1841 }
1842 if (U_FAILURE(fDeferredStatus)) {
1843 return *this;
1844 }
1887 fInputLength = utext_nativeLength(fInputText); 1845 fInputLength = utext_nativeLength(fInputText);
1888 1846
1889 reset(); 1847 reset();
1890 delete fInput; 1848 delete fInput;
1891 fInput = NULL; 1849 fInput = NULL;
1892 1850
1893 // Do the following for any UnicodeString. 1851 // Do the following for any UnicodeString.
1894 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1852 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1895 fInputUniStrMaybeMutable = TRUE; 1853 fInputUniStrMaybeMutable = TRUE;
1896 1854
1897 if (fWordBreakItr != NULL) { 1855 if (fWordBreakItr != NULL) {
1898 #if UCONFIG_NO_BREAK_ITERATION==0 1856 #if UCONFIG_NO_BREAK_ITERATION==0
1899 UErrorCode status = U_ZERO_ERROR; 1857 UErrorCode status = U_ZERO_ERROR;
1900 fWordBreakItr->setText(fInputText, status); 1858 fWordBreakItr->setText(fInputText, status);
1901 #endif 1859 #endif
1902 } 1860 }
1903 return *this; 1861 return *this;
1904 } 1862 }
1905 1863
1906 1864
1907 RegexMatcher &RegexMatcher::reset(UText *input) { 1865 RegexMatcher &RegexMatcher::reset(UText *input) {
1908 if (fInputText != input) { 1866 if (fInputText != input) {
1909 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s); 1867 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s);
1910 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1868 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1869 if (U_FAILURE(fDeferredStatus)) {
1870 return *this;
1871 }
1911 fInputLength = utext_nativeLength(fInputText); 1872 fInputLength = utext_nativeLength(fInputText);
1912 1873
1913 delete fInput; 1874 delete fInput;
1914 fInput = NULL; 1875 fInput = NULL;
1915 1876
1916 if (fWordBreakItr != NULL) { 1877 if (fWordBreakItr != NULL) {
1917 #if UCONFIG_NO_BREAK_ITERATION==0 1878 #if UCONFIG_NO_BREAK_ITERATION==0
1918 UErrorCode status = U_ZERO_ERROR; 1879 UErrorCode status = U_ZERO_ERROR;
1919 fWordBreakItr->setText(input, status); 1880 fWordBreakItr->setText(input, status);
1920 #endif 1881 #endif
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1988 // 1949 //
1989 // setTrace 1950 // setTrace
1990 // 1951 //
1991 //------------------------------------------------------------------------------ -- 1952 //------------------------------------------------------------------------------ --
1992 void RegexMatcher::setTrace(UBool state) { 1953 void RegexMatcher::setTrace(UBool state) {
1993 fTraceDebug = state; 1954 fTraceDebug = state;
1994 } 1955 }
1995 1956
1996 1957
1997 1958
1959 /**
1960 * UText, replace entire contents of the destination UText with a substring of the source UText.
1961 *
1962 * @param src The source UText
1963 * @param dest The destination UText. Must be writable.
1964 * May be NULL, in which case a new UText will be allocated.
1965 * @param start Start index of source substring.
1966 * @param limit Limit index of source substring.
1967 * @param status An error code.
1968 */
1969 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int6 4_t limit, UErrorCode *status) {
1970 if (U_FAILURE(*status)) {
1971 return dest;
1972 }
1973 if (start == limit) {
1974 if (dest) {
1975 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1976 return dest;
1977 } else {
1978 return utext_openUChars(NULL, NULL, 0, status);
1979 }
1980 }
1981 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1982 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1983 return dest;
1984 }
1985 *status = U_ZERO_ERROR;
1986 MaybeStackArray<UChar, 40> buffer;
1987 if (length >= buffer.getCapacity()) {
1988 UChar *newBuf = buffer.resize(length+1); // Leave space for terminatin g Nul.
1989 if (newBuf == NULL) {
1990 *status = U_MEMORY_ALLOCATION_ERROR;
1991 }
1992 }
1993 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1994 if (dest) {
1995 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), leng th, status);
1996 return dest;
1997 }
1998
1999 // Caller did not provide a preexisting UText.
2000 // Open a new one, and have it adopt the text buffer storage.
2001 if (U_FAILURE(*status)) {
2002 return NULL;
2003 }
2004 int32_t ownedLength = 0;
2005 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2006 if (ownedBuf == NULL) {
2007 *status = U_MEMORY_ALLOCATION_ERROR;
2008 return NULL;
2009 }
2010 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2011 if (U_FAILURE(*status)) {
2012 uprv_free(ownedBuf);
2013 return NULL;
2014 }
2015 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2016 return result;
2017 }
2018
2019
1998 //--------------------------------------------------------------------- 2020 //---------------------------------------------------------------------
1999 // 2021 //
2000 // split 2022 // split
2001 // 2023 //
2002 //--------------------------------------------------------------------- 2024 //---------------------------------------------------------------------
2003 int32_t RegexMatcher::split(const UnicodeString &input, 2025 int32_t RegexMatcher::split(const UnicodeString &input,
2004 UnicodeString dest[], 2026 UnicodeString dest[],
2005 int32_t destCapacity, 2027 int32_t destCapacity,
2006 UErrorCode &status) 2028 UErrorCode &status)
2007 { 2029 {
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
2154 // text goes out into the next n destination strings. 2176 // text goes out into the next n destination strings.
2155 int32_t groupNum; 2177 int32_t groupNum;
2156 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2178 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2157 if (i >= destCapacity-2) { 2179 if (i >= destCapacity-2) {
2158 // Never fill the last available output string with capture group text. 2180 // Never fill the last available output string with capture group text.
2159 // It will be filled with the last field, the remainder of the 2181 // It will be filled with the last field, the remainder of the
2160 // unsplit input text. 2182 // unsplit input text.
2161 break; 2183 break;
2162 } 2184 }
2163 i++; 2185 i++;
2164 dest[i] = group(groupNum, dest[i], status); 2186 dest[i] = utext_extract_replace(fInputText, dest[i],
2187 start64(groupNum, status), end64( groupNum, status), &status);
2165 } 2188 }
2166 2189
2167 if (nextOutputStringStart == fActiveLimit) { 2190 if (nextOutputStringStart == fActiveLimit) {
2168 // The delimiter was at the end of the string. We're done, but first 2191 // The delimiter was at the end of the string. We're done, but first
2169 // we output one last empty string, for the empty field followin g 2192 // we output one last empty string, for the empty field followin g
2170 // the delimiter at the end of input. 2193 // the delimiter at the end of input.
2171 if (i+1 < destCapacity) { 2194 if (i+1 < destCapacity) {
2172 ++i; 2195 ++i;
2173 if (dest[i] == NULL) { 2196 if (dest[i] == NULL) {
2174 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
(...skipping 291 matching lines...) Expand 10 before | Expand all | Expand 10 after
2466 // new stack frame to all -1. The -1s are needed for capture group limits, 2489 // new stack frame to all -1. The -1s are needed for capture group limits,
2467 // where they indicate that a group has not yet matched anything. 2490 // where they indicate that a group has not yet matched anything.
2468 //------------------------------------------------------------------------------ -- 2491 //------------------------------------------------------------------------------ --
2469 REStackFrame *RegexMatcher::resetStack() { 2492 REStackFrame *RegexMatcher::resetStack() {
2470 // Discard any previous contents of the state save stack, and initialize a 2493 // Discard any previous contents of the state save stack, and initialize a
2471 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2472 // where they indicate that a group has not yet matched anything. 2495 // where they indicate that a group has not yet matched anything.
2473 fStack->removeAllElements(); 2496 fStack->removeAllElements();
2474 2497
2475 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame Size, fDeferredStatus); 2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame Size, fDeferredStatus);
2499 if(U_FAILURE(fDeferredStatus)) {
2500 return NULL;
2501 }
2502
2476 int32_t i; 2503 int32_t i;
2477 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2504 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2478 iFrame->fExtra[i] = -1; 2505 iFrame->fExtra[i] = -1;
2479 } 2506 }
2480 return iFrame; 2507 return iFrame;
2481 } 2508 }
2482 2509
2483 2510
2484 2511
2485 //------------------------------------------------------------------------------ -- 2512 //------------------------------------------------------------------------------ --
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after
2653 // fp The top frame pointer when called. At return, a new 2680 // fp The top frame pointer when called. At return, a new
2654 // frame will be present 2681 // frame will be present
2655 // savePatIdx An index into the compiled pattern. Goes into the original 2682 // savePatIdx An index into the compiled pattern. Goes into the original
2656 // (not new) frame. If execution ever back-tracks out of the 2683 // (not new) frame. If execution ever back-tracks out of the
2657 // new frame, this will be where we continue from in the pattern. 2684 // new frame, this will be where we continue from in the pattern.
2658 // Return 2685 // Return
2659 // The new frame pointer. 2686 // The new frame pointer.
2660 // 2687 //
2661 //------------------------------------------------------------------------------ -- 2688 //------------------------------------------------------------------------------ --
2662 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId x, UErrorCode &status) { 2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId x, UErrorCode &status) {
2690 if (U_FAILURE(status)) {
2691 return fp;
2692 }
2663 // push storage for a new frame. 2693 // push storage for a new frame.
2664 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2694 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2665 if (newFP == NULL) { 2695 if (U_FAILURE(status)) {
2666 // Failure on attempted stack expansion. 2696 // Failure on attempted stack expansion.
2667 // Stack function set some other error code, change it to a more 2697 // Stack function set some other error code, change it to a more
2668 // specific one for regular expressions. 2698 // specific one for regular expressions.
2669 status = U_REGEX_STACK_OVERFLOW; 2699 status = U_REGEX_STACK_OVERFLOW;
2670 // We need to return a writable stack frame, so just return the 2700 // We need to return a writable stack frame, so just return the
2671 // previous frame. The match operation will stop quickly 2701 // previous frame. The match operation will stop quickly
2672 // because of the error status, after which the frame will never 2702 // because of the error status, after which the frame will never
2673 // be looked at again. 2703 // be looked at again.
2674 return fp; 2704 return fp;
2675 } 2705 }
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
2747 2777
2748 // Cache frequently referenced items from the compiled pattern 2778 // Cache frequently referenced items from the compiled pattern
2749 // 2779 //
2750 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2780 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2751 2781
2752 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2782 const UChar *litText = fPattern->fLiteralText.getBuffer();
2753 UVector *sets = fPattern->fSets; 2783 UVector *sets = fPattern->fSets;
2754 2784
2755 fFrameSize = fPattern->fFrameSize; 2785 fFrameSize = fPattern->fFrameSize;
2756 REStackFrame *fp = resetStack(); 2786 REStackFrame *fp = resetStack();
2787 if (U_FAILURE(fDeferredStatus)) {
2788 status = fDeferredStatus;
2789 return;
2790 }
2757 2791
2758 fp->fPatIdx = 0; 2792 fp->fPatIdx = 0;
2759 fp->fInputIdx = startIdx; 2793 fp->fInputIdx = startIdx;
2760 2794
2761 // Zero out the pattern's static data 2795 // Zero out the pattern's static data
2762 int32_t i; 2796 int32_t i;
2763 for (i = 0; i<fPattern->fDataSize; i++) { 2797 for (i = 0; i<fPattern->fDataSize; i++) {
2764 fData[i] = 0; 2798 fData[i] = 0;
2765 } 2799 }
2766 2800
(...skipping 133 matching lines...) Expand 10 before | Expand all | Expand 10 after
2900 fRequireEnd = TRUE; 2934 fRequireEnd = TRUE;
2901 break; 2935 break;
2902 } 2936 }
2903 2937
2904 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2905 2939
2906 // If we are positioned just before a new-line that is located at the 2940 // If we are positioned just before a new-line that is located at the
2907 // end of input, succeed. 2941 // end of input, succeed.
2908 UChar32 c = UTEXT_NEXT32(fInputText); 2942 UChar32 c = UTEXT_NEXT32(fInputText);
2909 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2943 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2910 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202 9) { 2944 if (isLineTerminator(c)) {
2911 // If not in the middle of a CR/LF sequence 2945 // If not in the middle of a CR/LF sequence
2912 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2946 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)U TEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2913 // At new-line at end of input. Success 2947 // At new-line at end of input. Success
2914 fHitEnd = TRUE; 2948 fHitEnd = TRUE;
2915 fRequireEnd = TRUE; 2949 fRequireEnd = TRUE;
2916 2950
2917 break; 2951 break;
2918 } 2952 }
2919 } 2953 }
2920 } else { 2954 } else {
2921 UChar32 nextC = UTEXT_NEXT32(fInputText); 2955 UChar32 nextC = UTEXT_NEXT32(fInputText);
2922 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu tText) >= fAnchorLimit) { 2956 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu tText) >= fAnchorLimit) {
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
2958 if (fp->fInputIdx >= fAnchorLimit) { 2992 if (fp->fInputIdx >= fAnchorLimit) {
2959 // We really are at the end of input. Success. 2993 // We really are at the end of input. Success.
2960 fHitEnd = TRUE; 2994 fHitEnd = TRUE;
2961 fRequireEnd = TRUE; 2995 fRequireEnd = TRUE;
2962 break; 2996 break;
2963 } 2997 }
2964 // If we are positioned just before a new-line, succeed. 2998 // If we are positioned just before a new-line, succeed.
2965 // It makes no difference where the new-line is within the input. 2999 // It makes no difference where the new-line is within the input.
2966 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3000 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2967 UChar32 c = UTEXT_CURRENT32(fInputText); 3001 UChar32 c = UTEXT_CURRENT32(fInputText);
2968 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 3002 if (isLineTerminator(c)) {
2969 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3003 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2970 // In multi-line mode, hitting a new-line just before the end of input does not 3004 // In multi-line mode, hitting a new-line just before the end of input does not
2971 // set the hitEnd or requireEnd flags 3005 // set the hitEnd or requireEnd flags
2972 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVI OUS32(fInputText)==0x0d)) { 3006 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVI OUS32(fInputText)==0x0d)) {
2973 break; 3007 break;
2974 } 3008 }
2975 } 3009 }
2976 // not at a new line. Fail. 3010 // not at a new line. Fail.
2977 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2978 } 3012 }
(...skipping 28 matching lines...) Expand all
3007 case URX_CARET_M: // ^, test for start of line in muli t-line mode 3041 case URX_CARET_M: // ^, test for start of line in muli t-line mode
3008 { 3042 {
3009 if (fp->fInputIdx == fAnchorStart) { 3043 if (fp->fInputIdx == fAnchorStart) {
3010 // We are at the start input. Success. 3044 // We are at the start input. Success.
3011 break; 3045 break;
3012 } 3046 }
3013 // Check whether character just before the current pos is a new-line 3047 // Check whether character just before the current pos is a new-line
3014 // unless we are at the end of input 3048 // unless we are at the end of input
3015 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3049 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3016 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3050 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3017 if ((fp->fInputIdx < fAnchorLimit) && 3051 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3018 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3019 // It's a new-line. ^ is true. Success. 3052 // It's a new-line. ^ is true. Success.
3020 // TODO: what should be done with positions between a CR and LF? 3053 // TODO: what should be done with positions between a CR and LF?
3021 break; 3054 break;
3022 } 3055 }
3023 // Not at the start of a line. Fail. 3056 // Not at the start of a line. Fail.
3024 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3057 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3025 } 3058 }
3026 break; 3059 break;
3027 3060
3028 3061
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
3089 break; 3122 break;
3090 3123
3091 3124
3092 case URX_BACKSLASH_G: // Test for position at end of previous m atch 3125 case URX_BACKSLASH_G: // Test for position at end of previous m atch
3093 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) { 3126 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) {
3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3127 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3095 } 3128 }
3096 break; 3129 break;
3097 3130
3098 3131
3132 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3133 {
3134 if (fp->fInputIdx >= fActiveLimit) {
3135 fHitEnd = TRUE;
3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137 break;
3138 }
3139 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3140 UChar32 c = UTEXT_NEXT32(fInputText);
3141 int8_t ctype = u_charType(c);
3142 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPAC E_SEPARATOR || TAB
3143 success ^= (UBool)(opValue != 0); // flip sense for \H
3144 if (success) {
3145 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3146 } else {
3147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3148 }
3149 }
3150 break;
3151
3152
3153 case URX_BACKSLASH_R: // Test for \R, any line break sequence .
3154 {
3155 if (fp->fInputIdx >= fActiveLimit) {
3156 fHitEnd = TRUE;
3157 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3158 break;
3159 }
3160 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3161 UChar32 c = UTEXT_NEXT32(fInputText);
3162 if (isLineTerminator(c)) {
3163 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3164 utext_next32(fInputText);
3165 }
3166 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3167 } else {
3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169 }
3170 }
3171 break;
3172
3173
3174 case URX_BACKSLASH_V: // \v, any single line ending character .
3175 {
3176 if (fp->fInputIdx >= fActiveLimit) {
3177 fHitEnd = TRUE;
3178 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3179 break;
3180 }
3181 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3182 UChar32 c = UTEXT_NEXT32(fInputText);
3183 UBool success = isLineTerminator(c);
3184 success ^= (UBool)(opValue != 0); // flip sense for \V
3185 if (success) {
3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187 } else {
3188 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3189 }
3190 }
3191 break;
3192
3193
3099 case URX_BACKSLASH_X: 3194 case URX_BACKSLASH_X:
3100 // Match a Grapheme, as defined by Unicode TR 29. 3195 // Match a Grapheme, as defined by Unicode TR 29.
3101 // Differs slightly from Perl, which consumes combining marks independently 3196 // Differs slightly from Perl, which consumes combining marks independently
3102 // of context. 3197 // of context.
3103 { 3198 {
3104 3199
3105 // Fail if at end of input 3200 // Fail if at end of input
3106 if (fp->fInputIdx >= fActiveLimit) { 3201 if (fp->fInputIdx >= fActiveLimit) {
3107 fHitEnd = TRUE; 3202 fHitEnd = TRUE;
3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3203 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
(...skipping 207 matching lines...) Expand 10 before | Expand all | Expand 10 after
3316 // At end of input. Match failed. Backtrack out. 3411 // At end of input. Match failed. Backtrack out.
3317 fHitEnd = TRUE; 3412 fHitEnd = TRUE;
3318 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3413 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3319 break; 3414 break;
3320 } 3415 }
3321 3416
3322 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3417 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3323 3418
3324 // There is input left. Advance over one char, unless we've hit end-of-line 3419 // There is input left. Advance over one char, unless we've hit end-of-line
3325 UChar32 c = UTEXT_NEXT32(fInputText); 3420 UChar32 c = UTEXT_NEXT32(fInputText);
3326 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3421 if (isLineTerminator(c)) {
3327 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3328 // End of line in normal mode. . does not match. 3422 // End of line in normal mode. . does not match.
3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3423 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330 break; 3424 break;
3331 } 3425 }
3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3426 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3333 } 3427 }
3334 break; 3428 break;
3335 3429
3336 3430
3337 case URX_DOTANY_ALL: 3431 case URX_DOTANY_ALL:
(...skipping 736 matching lines...) Expand 10 before | Expand all | Expand 10 after
4074 UTEXT_SETNATIVEINDEX(fInputText, ix); 4168 UTEXT_SETNATIVEINDEX(fInputText, ix);
4075 for (;;) { 4169 for (;;) {
4076 if (ix >= fActiveLimit) { 4170 if (ix >= fActiveLimit) {
4077 fHitEnd = TRUE; 4171 fHitEnd = TRUE;
4078 break; 4172 break;
4079 } 4173 }
4080 UChar32 c = UTEXT_NEXT32(fInputText); 4174 UChar32 c = UTEXT_NEXT32(fInputText);
4081 if ((c & 0x7f) <= 0x29) { // Fast filter of non -new-line-s 4175 if ((c & 0x7f) <= 0x29) { // Fast filter of non -new-line-s
4082 if ((c == 0x0a) || // 0x0a is newline i n both modes. 4176 if ((c == 0x0a) || // 0x0a is newline i n both modes.
4083 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4177 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4084 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4178 isLineTerminator(c))) {
4085 // char is a line ending. Exit the scanning loop. 4179 // char is a line ending. Exit the scanning loop.
4086 break; 4180 break;
4087 } 4181 }
4088 } 4182 }
4089 ix = UTEXT_GETNATIVEINDEX(fInputText); 4183 ix = UTEXT_GETNATIVEINDEX(fInputText);
4090 } 4184 }
4091 } 4185 }
4092 4186
4093 // If there were no matching characters, skip over the loop altogether. 4187 // If there were no matching characters, skip over the loop altogether.
4094 // The loop doesn't run at all, a * op always succeeds. 4188 // The loop doesn't run at all, a * op always succeeds.
(...skipping 155 matching lines...) Expand 10 before | Expand all | Expand 10 after
4250 // 4344 //
4251 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4345 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4252 4346
4253 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4347 const UChar *litText = fPattern->fLiteralText.getBuffer();
4254 UVector *sets = fPattern->fSets; 4348 UVector *sets = fPattern->fSets;
4255 4349
4256 const UChar *inputBuf = fInputText->chunkContents; 4350 const UChar *inputBuf = fInputText->chunkContents;
4257 4351
4258 fFrameSize = fPattern->fFrameSize; 4352 fFrameSize = fPattern->fFrameSize;
4259 REStackFrame *fp = resetStack(); 4353 REStackFrame *fp = resetStack();
4354 if (U_FAILURE(fDeferredStatus)) {
4355 status = fDeferredStatus;
4356 return;
4357 }
4260 4358
4261 fp->fPatIdx = 0; 4359 fp->fPatIdx = 0;
4262 fp->fInputIdx = startIdx; 4360 fp->fInputIdx = startIdx;
4263 4361
4264 // Zero out the pattern's static data 4362 // Zero out the pattern's static data
4265 int32_t i; 4363 int32_t i;
4266 for (i = 0; i<fPattern->fDataSize; i++) { 4364 for (i = 0; i<fPattern->fDataSize; i++) {
4267 fData[i] = 0; 4365 fData[i] = 0;
4268 } 4366 }
4269 4367
(...skipping 135 matching lines...) Expand 10 before | Expand all | Expand 10 after
4405 fRequireEnd = TRUE; 4503 fRequireEnd = TRUE;
4406 break; 4504 break;
4407 } 4505 }
4408 4506
4409 // If we are positioned just before a new-line that is located at the 4507 // If we are positioned just before a new-line that is located at the
4410 // end of input, succeed. 4508 // end of input, succeed.
4411 if (fp->fInputIdx == fAnchorLimit-1) { 4509 if (fp->fInputIdx == fAnchorLimit-1) {
4412 UChar32 c; 4510 UChar32 c;
4413 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4511 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4414 4512
4415 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4513 if (isLineTerminator(c)) {
4416 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) { 4514 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) {
4417 // At new-line at end of input. Success 4515 // At new-line at end of input. Success
4418 fHitEnd = TRUE; 4516 fHitEnd = TRUE;
4419 fRequireEnd = TRUE; 4517 fRequireEnd = TRUE;
4420 break; 4518 break;
4421 } 4519 }
4422 } 4520 }
4423 } else if (fp->fInputIdx == fAnchorLimit-2 && 4521 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4424 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a ) { 4522 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a ) {
4425 fHitEnd = TRUE; 4523 fHitEnd = TRUE;
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
4459 { 4557 {
4460 if (fp->fInputIdx >= fAnchorLimit) { 4558 if (fp->fInputIdx >= fAnchorLimit) {
4461 // We really are at the end of input. Success. 4559 // We really are at the end of input. Success.
4462 fHitEnd = TRUE; 4560 fHitEnd = TRUE;
4463 fRequireEnd = TRUE; 4561 fRequireEnd = TRUE;
4464 break; 4562 break;
4465 } 4563 }
4466 // If we are positioned just before a new-line, succeed. 4564 // If we are positioned just before a new-line, succeed.
4467 // It makes no difference where the new-line is within the input. 4565 // It makes no difference where the new-line is within the input.
4468 UChar32 c = inputBuf[fp->fInputIdx]; 4566 UChar32 c = inputBuf[fp->fInputIdx];
4469 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4567 if (isLineTerminator(c)) {
4470 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4568 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4471 // In multi-line mode, hitting a new-line just before the end of input does not 4569 // In multi-line mode, hitting a new-line just before the end of input does not
4472 // set the hitEnd or requireEnd flags 4570 // set the hitEnd or requireEnd flags
4473 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) { 4571 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) {
4474 break; 4572 break;
4475 } 4573 }
4476 } 4574 }
4477 // not at a new line. Fail. 4575 // not at a new line. Fail.
4478 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4479 } 4577 }
(...skipping 27 matching lines...) Expand all
4507 case URX_CARET_M: // ^, test for start of line in mul it-line mode 4605 case URX_CARET_M: // ^, test for start of line in mul it-line mode
4508 { 4606 {
4509 if (fp->fInputIdx == fAnchorStart) { 4607 if (fp->fInputIdx == fAnchorStart) {
4510 // We are at the start input. Success. 4608 // We are at the start input. Success.
4511 break; 4609 break;
4512 } 4610 }
4513 // Check whether character just before the current pos is a new-line 4611 // Check whether character just before the current pos is a new-line
4514 // unless we are at the end of input 4612 // unless we are at the end of input
4515 UChar c = inputBuf[fp->fInputIdx - 1]; 4613 UChar c = inputBuf[fp->fInputIdx - 1];
4516 if ((fp->fInputIdx < fAnchorLimit) && 4614 if ((fp->fInputIdx < fAnchorLimit) &&
4517 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4615 isLineTerminator(c)) {
4518 // It's a new-line. ^ is true. Success. 4616 // It's a new-line. ^ is true. Success.
4519 // TODO: what should be done with positions between a CR and LF? 4617 // TODO: what should be done with positions between a CR and LF?
4520 break; 4618 break;
4521 } 4619 }
4522 // Not at the start of a line. Fail. 4620 // Not at the start of a line. Fail.
4523 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4621 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4524 } 4622 }
4525 break; 4623 break;
4526 4624
4527 4625
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
4584 break; 4682 break;
4585 4683
4586 4684
4587 case URX_BACKSLASH_G: // Test for position at end of previous m atch 4685 case URX_BACKSLASH_G: // Test for position at end of previous m atch
4588 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) { 4686 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) {
4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4590 } 4688 }
4591 break; 4689 break;
4592 4690
4593 4691
4692 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4693 {
4694 if (fp->fInputIdx >= fActiveLimit) {
4695 fHitEnd = TRUE;
4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697 break;
4698 }
4699 UChar32 c;
4700 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4701 int8_t ctype = u_charType(c);
4702 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPAC E_SEPARATOR || TAB
4703 success ^= (UBool)(opValue != 0); // flip sense for \H
4704 if (!success) {
4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706 }
4707 }
4708 break;
4709
4710
4711 case URX_BACKSLASH_R: // Test for \R, any line break sequence .
4712 {
4713 if (fp->fInputIdx >= fActiveLimit) {
4714 fHitEnd = TRUE;
4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716 break;
4717 }
4718 UChar32 c;
4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720 if (isLineTerminator(c)) {
4721 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4722 // Check for CR/LF sequence. Consume both together when found.
4723 UChar c2;
4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4725 if (c2 != 0x0a) {
4726 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4727 }
4728 }
4729 } else {
4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4731 }
4732 }
4733 break;
4734
4735
4736 case URX_BACKSLASH_V: // Any single code point line ending.
4737 {
4738 if (fp->fInputIdx >= fActiveLimit) {
4739 fHitEnd = TRUE;
4740 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4741 break;
4742 }
4743 UChar32 c;
4744 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4745 UBool success = isLineTerminator(c);
4746 success ^= (UBool)(opValue != 0); // flip sense for \V
4747 if (!success) {
4748 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4749 }
4750 }
4751 break;
4752
4753
4754
4594 case URX_BACKSLASH_X: 4755 case URX_BACKSLASH_X:
4595 // Match a Grapheme, as defined by Unicode TR 29. 4756 // Match a Grapheme, as defined by Unicode TR 29.
4596 // Differs slightly from Perl, which consumes combining marks independently 4757 // Differs slightly from Perl, which consumes combining marks independently
4597 // of context. 4758 // of context.
4598 { 4759 {
4599 4760
4600 // Fail if at end of input 4761 // Fail if at end of input
4601 if (fp->fInputIdx >= fActiveLimit) { 4762 if (fp->fInputIdx >= fActiveLimit) {
4602 fHitEnd = TRUE; 4763 fHitEnd = TRUE;
4603 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
(...skipping 189 matching lines...) Expand 10 before | Expand all | Expand 10 after
4793 if (fp->fInputIdx >= fActiveLimit) { 4954 if (fp->fInputIdx >= fActiveLimit) {
4794 // At end of input. Match failed. Backtrack out. 4955 // At end of input. Match failed. Backtrack out.
4795 fHitEnd = TRUE; 4956 fHitEnd = TRUE;
4796 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4797 break; 4958 break;
4798 } 4959 }
4799 4960
4800 // There is input left. Advance over one char, unless we've hit end-of-line 4961 // There is input left. Advance over one char, unless we've hit end-of-line
4801 UChar32 c; 4962 UChar32 c;
4802 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4803 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 4964 if (isLineTerminator(c)) {
4804 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4805 // End of line in normal mode. . does not match. 4965 // End of line in normal mode. . does not match.
4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4807 break; 4967 break;
4808 } 4968 }
4809 } 4969 }
4810 break; 4970 break;
4811 4971
4812 4972
4813 case URX_DOTANY_ALL: 4973 case URX_DOTANY_ALL:
4814 { 4974 {
(...skipping 693 matching lines...) Expand 10 before | Expand all | Expand 10 after
5508 for (;;) { 5668 for (;;) {
5509 if (ix >= fActiveLimit) { 5669 if (ix >= fActiveLimit) {
5510 fHitEnd = TRUE; 5670 fHitEnd = TRUE;
5511 break; 5671 break;
5512 } 5672 }
5513 UChar32 c; 5673 UChar32 c;
5514 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputB uf[ix++] 5674 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputB uf[ix++]
5515 if ((c & 0x7f) <= 0x29) { // Fast filter of non -new-line-s 5675 if ((c & 0x7f) <= 0x29) { // Fast filter of non -new-line-s
5516 if ((c == 0x0a) || // 0x0a is newline i n both modes. 5676 if ((c == 0x0a) || // 0x0a is newline i n both modes.
5517 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5677 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5518 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 5678 isLineTerminator(c))) {
5519 // char is a line ending. Put the input pos back to the 5679 // char is a line ending. Put the input pos back to the
5520 // line ending char, and exit the scanning loop. 5680 // line ending char, and exit the scanning loop.
5521 U16_BACK_1(inputBuf, 0, ix); 5681 U16_BACK_1(inputBuf, 0, ix);
5522 break; 5682 break;
5523 } 5683 }
5524 } 5684 }
5525 } 5685 }
5526 } 5686 }
5527 5687
5528 // If there were no matching characters, skip over the loop alto gether. 5688 // If there were no matching characters, skip over the loop alto gether.
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
5624 5784
5625 return; 5785 return;
5626 } 5786 }
5627 5787
5628 5788
5629 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5630 5790
5631 U_NAMESPACE_END 5791 U_NAMESPACE_END
5632 5792
5633 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
OLDNEW
« no previous file with comments | « source/i18n/region.cpp ('k') | source/i18n/repattrn.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698