Index: source/i18n/rematch.cpp |
diff --git a/source/i18n/rematch.cpp b/source/i18n/rematch.cpp |
index 4389985e9ed7d0e0a4033801f8ea39af3f800626..c7aeac015ff3e99e9496f3a247f0e44ccbf419da 100644 |
--- a/source/i18n/rematch.cpp |
+++ b/source/i18n/rematch.cpp |
@@ -1,6 +1,6 @@ |
/* |
************************************************************************** |
-* Copyright (C) 2002-2014 International Business Machines Corporation * |
+* Copyright (C) 2002-2015 International Business Machines Corporation * |
* and others. All rights reserved. * |
************************************************************************** |
*/ |
@@ -49,6 +49,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; |
// This constant determines that state saves per tick number. |
static const int32_t TIMER_INITIAL_VALUE = 10000; |
+ |
+// Test for any of the Unicode line terminating characters: 0x0a through 0x0d, 0x85, 0x2028 or 0x2029. |
+static inline UBool isLineTerminator(UChar32 c) { |
+ if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { /* Fast reject: c has a bit set that none of the terminators has. */ |
+ return false; |
+ } |
+ return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; /* Exact test for values that pass the bit filter. */ |
+} |
+ |
//----------------------------------------------------------------------------- |
// |
// Constructor and Destructor |
@@ -216,10 +225,6 @@ void RegexMatcher::init(UErrorCode &status) { |
fInput = NULL; |
fInputLength = 0; |
fInputUniStrMaybeMutable = FALSE; |
- |
- if (U_FAILURE(status)) { |
- fDeferredStatus = status; |
- } |
} |
// |
@@ -257,6 +262,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) { |
static const UChar BACKSLASH = 0x5c; |
static const UChar DOLLARSIGN = 0x24; |
+static const UChar LEFTBRACKET = 0x7b; |
+static const UChar RIGHTBRACKET = 0x7d; |
+ |
//-------------------------------------------------------------------------------- |
// |
// appendReplacement |
@@ -331,8 +339,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
// TODO: optimize this loop by efficiently scanning for '$' or '\', |
// move entire ranges not containing substitutions. |
UTEXT_SETNATIVEINDEX(replacement, 0); |
- UChar32 c = UTEXT_NEXT32(replacement); |
- while (c != U_SENTINEL) { |
+ for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { |
if (c == BACKSLASH) { |
// Backslash Escape. Copy the following char out without further checks. |
// Note: Surrogate pairs don't need any special handling |
@@ -398,51 +405,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
} |
} |
} else { |
- // We've got a $. Pick up a capture group number if one follows. |
- // Consume at most the number of digits necessary for the largest capture |
- // number that is valid for this pattern. |
+ // We've got a $. Pick up a capture group name or number if one follows. |
+ // Consume digits so long as the resulting group number <= the number of |
+ // capture groups in the pattern. |
- int32_t numDigits = 0; |
int32_t groupNum = 0; |
- UChar32 digitC; |
- for (;;) { |
- digitC = UTEXT_CURRENT32(replacement); |
- if (digitC == U_SENTINEL) { |
- break; |
- } |
- if (u_isdigit(digitC) == FALSE) { |
- break; |
+ int32_t numDigits = 0; |
+ UChar32 nextChar = utext_current32(replacement); |
+ if (nextChar == LEFTBRACKET) { |
+ // Scan for a Named Capture Group, ${name}. |
+ UnicodeString groupName; |
+ utext_next32(replacement); |
+ while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { /* Accumulate name characters until the closing '}' or an error. */ |
+ nextChar = utext_next32(replacement); |
+ if (nextChar == U_SENTINEL) { /* Replacement text ended before the closing '}'. */ |
+ status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
+ } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z |
+ (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z |
+ (nextChar >= 0x30 && nextChar <= 0x39)) { // 0..9 (0x30 is '0'; starting at 0x31 would reject names containing '0') |
+ groupName.append(nextChar); |
+ } else if (nextChar == RIGHTBRACKET) { |
+ groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName); |
+ if (groupNum == 0) { /* A lookup result of 0 is treated as "no group with this name". */ |
+ status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
+ } |
+ } else { |
+ // Character was something other than a name char or a closing '}' |
+ status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
+ } |
} |
- (void)UTEXT_NEXT32(replacement); |
- groupNum=groupNum*10 + u_charDigitValue(digitC); |
- numDigits++; |
- if (numDigits >= fPattern->fMaxCaptureDigits) { |
- break; |
+ |
+ } else if (u_isdigit(nextChar)) { |
+ // $n Scan for a capture group number |
+ int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
+ for (;;) { |
+ nextChar = UTEXT_CURRENT32(replacement); |
+ if (nextChar == U_SENTINEL) { |
+ break; |
+ } |
+ if (u_isdigit(nextChar) == FALSE) { |
+ break; |
+ } |
+ int32_t nextDigitVal = u_charDigitValue(nextChar); |
+ if (groupNum*10 + nextDigitVal > numCaptureGroups) { |
+ // Don't consume the next digit if it makes the capture group number too big. |
+ if (numDigits == 0) { |
+ status = U_INDEX_OUTOFBOUNDS_ERROR; |
+ } |
+ break; |
+ } |
+ (void)UTEXT_NEXT32(replacement); |
+ groupNum=groupNum*10 + nextDigitVal; |
+ ++numDigits; |
} |
+ } else { |
+ // $ not followed by capture group name or number. |
+ status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
} |
- |
- if (numDigits == 0) { |
- // The $ didn't introduce a group number at all. |
- // Treat it as just part of the substitution text. |
- UChar c16 = DOLLARSIGN; |
- destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
- } else { |
- // Finally, append the capture group data to the destination. |
+ if (U_SUCCESS(status)) { |
destLen += appendGroup(groupNum, dest, status); |
- if (U_FAILURE(status)) { |
- // Can fail if group number is out of range. |
- break; |
- } |
} |
- } |
- |
- if (U_FAILURE(status)) { |
- break; |
- } else { |
- c = UTEXT_NEXT32(replacement); |
- } |
- } |
+ } // End of $ capture group handling |
+ } // End of per-character loop through the replacement string. |
return *this; |
} |
@@ -817,20 +842,19 @@ UBool RegexMatcher::find(UErrorCode &status) { |
} |
} else { |
for (;;) { |
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { |
- if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { |
- (void)UTEXT_NEXT32(fInputText); |
- startPos = UTEXT_GETNATIVEINDEX(fInputText); |
- } |
- MatchAt(startPos, FALSE, status); |
- if (U_FAILURE(status)) { |
- return FALSE; |
- } |
- if (fMatch) { |
- return TRUE; |
- } |
- UTEXT_SETNATIVEINDEX(fInputText, startPos); |
+ if (isLineTerminator(c)) { |
+ if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { |
+ (void)UTEXT_NEXT32(fInputText); |
+ startPos = UTEXT_GETNATIVEINDEX(fInputText); |
+ } |
+ MatchAt(startPos, FALSE, status); |
+ if (U_FAILURE(status)) { |
+ return FALSE; |
+ } |
+ if (fMatch) { |
+ return TRUE; |
+ } |
+ UTEXT_SETNATIVEINDEX(fInputText, startPos); |
} |
if (startPos >= testStartLimit) { |
fMatch = FALSE; |
@@ -1078,8 +1102,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) { |
} else { |
for (;;) { |
c = inputBuf[startPos-1]; |
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { |
+ if (isLineTerminator(c)) { |
if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { |
startPos++; |
} |
@@ -1175,98 +1198,32 @@ UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE |
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
UnicodeString result; |
- if (U_FAILURE(status)) { |
+ int64_t groupStart = start64(groupNum, status); |
+ int64_t groupEnd = end64(groupNum, status); |
+ if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { /* Error, a start of -1 (presumably a group that did not take part in the match; confirm against start64), or an empty range: return the empty string. */ |
return result; |
} |
- UText resultText = UTEXT_INITIALIZER; |
- utext_openUnicodeString(&resultText, &result, &status); |
- group(groupNum, &resultText, status); |
- utext_close(&resultText); |
- return result; |
-} |
- |
- |
-// Return deep (mutable) clone |
-// Technology Preview (as an API), but note that the UnicodeString API is implemented |
-// using this function. |
-UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { |
- if (U_FAILURE(status)) { |
- return dest; |
- } |
- if (U_FAILURE(fDeferredStatus)) { |
- status = fDeferredStatus; |
- } else if (fMatch == FALSE) { |
- status = U_REGEX_INVALID_STATE; |
- } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
- status = U_INDEX_OUTOFBOUNDS_ERROR; |
- } |
- if (U_FAILURE(status)) { |
- return dest; |
- } |
- |
- int64_t s, e; |
- if (groupNum == 0) { |
- s = fMatchStart; |
- e = fMatchEnd; |
- } else { |
- int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
- U_ASSERT(groupOffset < fPattern->fFrameSize); |
- U_ASSERT(groupOffset >= 0); |
- s = fFrame->fExtra[groupOffset]; |
- e = fFrame->fExtra[groupOffset+1]; |
- } |
- |
- if (s < 0) { |
- // A capture group wasn't part of the match |
- if (dest) { |
- utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
- return dest; |
- } else { |
- return utext_openUChars(NULL, NULL, 0, &status); |
- } |
+ // Get the group length using a utext_extract preflight. |
+ // UText is actually pretty efficient at this when underlying encoding is UTF-16. |
+ int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); |
+ if (status != U_BUFFER_OVERFLOW_ERROR) { /* The zero-capacity preflight did not report overflow, so there is nothing to copy or a real error occurred. */ |
+ return result; |
} |
- U_ASSERT(s <= e); |
- if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
- U_ASSERT(e <= fInputLength); |
- if (dest) { |
- utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); |
- } else { |
- UText groupText = UTEXT_INITIALIZER; |
- utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); |
- dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
- utext_close(&groupText); |
- } |
+ status = U_ZERO_ERROR; /* Clear the overflow status left by the preflight before extracting for real. */ |
+ UChar *buf = result.getBuffer(length); |
+ if (buf == NULL) { |
+ status = U_MEMORY_ALLOCATION_ERROR; |
} else { |
- int32_t len16; |
- if (UTEXT_USES_U16(fInputText)) { |
- len16 = (int32_t)(e-s); |
- } else { |
- UErrorCode lengthStatus = U_ZERO_ERROR; |
- len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
- } |
- UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
- if (groupChars == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- return dest; |
- } |
- utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
- |
- if (dest) { |
- utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); |
- } else { |
- UText groupText = UTEXT_INITIALIZER; |
- utext_openUChars(&groupText, groupChars, len16, &status); |
- dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
- utext_close(&groupText); |
- } |
- |
- uprv_free(groupChars); |
+ int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); |
+ result.releaseBuffer(extractLength); |
+ U_ASSERT(length == extractLength); |
} |
- return dest; |
+ return result; |
} |
+ |
//-------------------------------------------------------------------------------- |
// |
// appendGroup() -- currently internal only, appends a group to a UText rather |
@@ -1347,8 +1304,6 @@ int32_t RegexMatcher::groupCount() const { |
return fPattern->fGroupMap->size(); |
} |
- |
- |
//-------------------------------------------------------------------------------- |
// |
// hasAnchoringBounds() |
@@ -1884,6 +1839,9 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
if (fPattern->fNeedsAltInput) { |
fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
} |
+ if (U_FAILURE(fDeferredStatus)) { |
+ return *this; |
+ } |
fInputLength = utext_nativeLength(fInputText); |
reset(); |
@@ -1908,6 +1866,9 @@ RegexMatcher &RegexMatcher::reset(UText *input) { |
if (fInputText != input) { |
fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); |
if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
+ if (U_FAILURE(fDeferredStatus)) { |
+ return *this; |
+ } |
fInputLength = utext_nativeLength(fInputText); |
delete fInput; |
@@ -1995,6 +1956,67 @@ void RegexMatcher::setTrace(UBool state) { |
+/** |
+ * Replace the entire contents of the destination UText with a substring of the source UText. |
+ * |
+ * @param src The source UText |
+ * @param dest The destination UText. Must be writable. Returned to the caller when not NULL. |
+ * May be NULL, in which case a new UText will be allocated and returned. |
+ * @param start Start index of source substring. |
+ * @param limit Limit index of source substring. |
+ * @param status An error code. |
+ */ |
+static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { |
+ if (U_FAILURE(*status)) { |
+ return dest; |
+ } |
+ if (start == limit) { /* Empty substring: empty the destination, or open a new empty UText. */ |
+ if (dest) { |
+ utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); |
+ return dest; |
+ } else { |
+ return utext_openUChars(NULL, NULL, 0, status); |
+ } |
+ } |
+ int32_t length = utext_extract(src, start, limit, NULL, 0, status); /* Preflight with no buffer to obtain the length. */ |
+ if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { /* Overflow is the expected preflight outcome; any other failure is returned. */ |
+ return dest; |
+ } |
+ *status = U_ZERO_ERROR; |
+ MaybeStackArray<UChar, 40> buffer; |
+ if (length >= buffer.getCapacity()) { |
+ UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. |
+ if (newBuf == NULL) { |
+ *status = U_MEMORY_ALLOCATION_ERROR; |
+ } |
+ } |
+ utext_extract(src, start, limit, buffer.getAlias(), length+1, status); |
+ if (dest) { |
+ utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); |
+ return dest; |
+ } |
+ |
+ // Caller did not provide a preexisting UText. |
+ // Open a new one, and have it adopt the text buffer storage. |
+ if (U_FAILURE(*status)) { |
+ return NULL; |
+ } |
+ int32_t ownedLength = 0; |
+ UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); |
+ if (ownedBuf == NULL) { |
+ *status = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ UText *result = utext_openUChars(NULL, ownedBuf, length, status); |
+ if (U_FAILURE(*status)) { |
+ uprv_free(ownedBuf); /* The open failed, so no UText took the buffer; free it here. */ |
+ return NULL; |
+ } |
+ result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); /* Flag the buffer as owned by the new UText. */ |
+ return result; |
+} |
+ |
+ |
//--------------------------------------------------------------------- |
// |
// split |
@@ -2161,7 +2183,8 @@ int32_t RegexMatcher::split(UText *input, |
break; |
} |
i++; |
- dest[i] = group(groupNum, dest[i], status); |
+ dest[i] = utext_extract_replace(fInputText, dest[i], |
+ start64(groupNum, status), end64(groupNum, status), &status); |
} |
if (nextOutputStringStart == fActiveLimit) { |
@@ -2473,6 +2496,10 @@ REStackFrame *RegexMatcher::resetStack() { |
fStack->removeAllElements(); |
REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); |
+ if(U_FAILURE(fDeferredStatus)) { |
+ return NULL; |
+ } |
+ |
int32_t i; |
for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
iFrame->fExtra[i] = -1; |
@@ -2660,9 +2687,12 @@ void RegexMatcher::IncrementTime(UErrorCode &status) { |
// |
//-------------------------------------------------------------------------------- |
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { |
+ if (U_FAILURE(status)) { |
+ return fp; |
+ } |
// push storage for a new frame. |
int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
- if (newFP == NULL) { |
+ if (U_FAILURE(status)) { |
// Failure on attempted stack expansion. |
// Stack function set some other error code, change it to a more |
// specific one for regular expressions. |
@@ -2754,6 +2784,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
fFrameSize = fPattern->fFrameSize; |
REStackFrame *fp = resetStack(); |
+ if (U_FAILURE(fDeferredStatus)) { |
+ status = fDeferredStatus; |
+ return; |
+ } |
fp->fPatIdx = 0; |
fp->fInputIdx = startIdx; |
@@ -2907,9 +2941,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
// end of input, succeed. |
UChar32 c = UTEXT_NEXT32(fInputText); |
if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
- if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { |
+ if (isLineTerminator(c)) { |
// If not in the middle of a CR/LF sequence |
- if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
+ if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
// At new-line at end of input. Success |
fHitEnd = TRUE; |
fRequireEnd = TRUE; |
@@ -2965,7 +2999,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
// It makes no difference where the new-line is within the input. |
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
UChar32 c = UTEXT_CURRENT32(fInputText); |
- if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
+ if (isLineTerminator(c)) { |
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence |
// In multi-line mode, hitting a new-line just before the end of input does not |
// set the hitEnd or requireEnd flags |
@@ -3014,8 +3048,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
// unless we are at the end of input |
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
UChar32 c = UTEXT_PREVIOUS32(fInputText); |
- if ((fp->fInputIdx < fAnchorLimit) && |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
+ if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { |
// It's a new-line. ^ is true. Success. |
// TODO: what should be done with positions between a CR and LF? |
break; |
@@ -3096,6 +3129,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
break; |
+ case URX_BACKSLASH_H: // Test for \h, horizontal white space. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
+ UChar32 c = UTEXT_NEXT32(fInputText); |
+ int8_t ctype = u_charType(c); |
+ UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB |
+ success ^= (UBool)(opValue != 0); // flip sense for \H |
+ if (success) { |
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ } else { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
+ case URX_BACKSLASH_R: // Test for \R, any line break sequence. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { /* No input left in the active region: record hitEnd and fail this path. */ |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
+ UChar32 c = UTEXT_NEXT32(fInputText); |
+ if (isLineTerminator(c)) { |
+ if (c == 0x0d && UTEXT_GETNATIVEINDEX(fInputText) < fActiveLimit && utext_current32(fInputText) == 0x0a) { /* CR/LF: take the LF too, but only if it lies before fActiveLimit. */ |
+ utext_next32(fInputText); |
+ } |
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ } else { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
+ case URX_BACKSLASH_V: // \v, any single line ending character. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
+ UChar32 c = UTEXT_NEXT32(fInputText); |
+ UBool success = isLineTerminator(c); |
+ success ^= (UBool)(opValue != 0); // flip sense for \V |
+ if (success) { |
+ fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ } else { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
case URX_BACKSLASH_X: |
// Match a Grapheme, as defined by Unicode TR 29. |
// Differs slightly from Perl, which consumes combining marks independently |
@@ -3323,8 +3418,7 @@ GC_Done: |
// There is input left. Advance over one char, unless we've hit end-of-line |
UChar32 c = UTEXT_NEXT32(fInputText); |
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
+ if (isLineTerminator(c)) { |
// End of line in normal mode. . does not match. |
fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
break; |
@@ -4081,7 +4175,7 @@ GC_Done: |
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s |
if ((c == 0x0a) || // 0x0a is newline in both modes. |
(((opValue & 2) == 0) && // IF not UNIX_LINES mode |
- (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { |
+ isLineTerminator(c))) { |
// char is a line ending. Exit the scanning loop. |
break; |
} |
@@ -4257,6 +4351,10 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
fFrameSize = fPattern->fFrameSize; |
REStackFrame *fp = resetStack(); |
+ if (U_FAILURE(fDeferredStatus)) { |
+ status = fDeferredStatus; |
+ return; |
+ } |
fp->fPatIdx = 0; |
fp->fInputIdx = startIdx; |
@@ -4412,7 +4510,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
UChar32 c; |
U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); |
- if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { |
+ if (isLineTerminator(c)) { |
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { |
// At new-line at end of input. Success |
fHitEnd = TRUE; |
@@ -4466,7 +4564,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
// If we are positioned just before a new-line, succeed. |
// It makes no difference where the new-line is within the input. |
UChar32 c = inputBuf[fp->fInputIdx]; |
- if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
+ if (isLineTerminator(c)) { |
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence |
// In multi-line mode, hitting a new-line just before the end of input does not |
// set the hitEnd or requireEnd flags |
@@ -4514,7 +4612,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
// unless we are at the end of input |
UChar c = inputBuf[fp->fInputIdx - 1]; |
if ((fp->fInputIdx < fAnchorLimit) && |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
+ isLineTerminator(c)) { |
// It's a new-line. ^ is true. Success. |
// TODO: what should be done with positions between a CR and LF? |
break; |
@@ -4591,6 +4689,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
break; |
+ case URX_BACKSLASH_H: // Test for \h, horizontal white space. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UChar32 c; |
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
+ int8_t ctype = u_charType(c); |
+ UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB |
+ success ^= (UBool)(opValue != 0); // flip sense for \H |
+ if (!success) { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
+ case URX_BACKSLASH_R: // Test for \R, any line break sequence. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { /* No input left in the active region: record hitEnd and fail this path. */ |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UChar32 c; |
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
+ if (isLineTerminator(c)) { |
+ if (c == 0x0d && fp->fInputIdx < fActiveLimit) { |
+ // Check for CR/LF sequence. Consume both together when found. |
+ UChar32 c2; /* Must be 32 bits: U16_NEXT can yield a supplementary code point, and truncating e.g. U+1000A to 16 bits gives 0x0a. */ |
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); |
+ if (c2 != 0x0a) { |
+ U16_PREV(inputBuf, 0, fp->fInputIdx, c2); /* Not an LF: step back over the code point just read. */ |
+ } |
+ } |
+ } else { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
+ case URX_BACKSLASH_V: // Any single code point line ending. |
+ { |
+ if (fp->fInputIdx >= fActiveLimit) { |
+ fHitEnd = TRUE; |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ break; |
+ } |
+ UChar32 c; |
+ U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
+ UBool success = isLineTerminator(c); |
+ success ^= (UBool)(opValue != 0); // flip sense for \V |
+ if (!success) { |
+ fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
+ } |
+ } |
+ break; |
+ |
+ |
+ |
case URX_BACKSLASH_X: |
// Match a Grapheme, as defined by Unicode TR 29. |
// Differs slightly from Perl, which consumes combining marks independently |
@@ -4800,8 +4961,7 @@ GC_Done: |
// There is input left. Advance over one char, unless we've hit end-of-line |
UChar32 c; |
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
- if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible |
- ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
+ if (isLineTerminator(c)) { |
// End of line in normal mode. . does not match. |
fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
break; |
@@ -5515,7 +5675,7 @@ GC_Done: |
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s |
if ((c == 0x0a) || // 0x0a is newline in both modes. |
(((opValue & 2) == 0) && // IF not UNIX_LINES mode |
- ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { |
+ isLineTerminator(c))) { |
// char is a line ending. Put the input pos back to the |
// line ending char, and exit the scanning loop. |
U16_BACK_1(inputBuf, 0, ix); |