| Index: source/i18n/rematch.cpp
|
| diff --git a/source/i18n/rematch.cpp b/source/i18n/rematch.cpp
|
| index 4389985e9ed7d0e0a4033801f8ea39af3f800626..c7aeac015ff3e99e9496f3a247f0e44ccbf419da 100644
|
| --- a/source/i18n/rematch.cpp
|
| +++ b/source/i18n/rematch.cpp
|
| @@ -1,6 +1,6 @@
|
| /*
|
| **************************************************************************
|
| -* Copyright (C) 2002-2014 International Business Machines Corporation *
|
| +* Copyright (C) 2002-2015 International Business Machines Corporation *
|
| * and others. All rights reserved. *
|
| **************************************************************************
|
| */
|
| @@ -49,6 +49,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
|
| // This constant determines that state saves per tick number.
|
| static const int32_t TIMER_INITIAL_VALUE = 10000;
|
|
|
| +
|
| +// Test for any of the Unicode line terminating characters.
|
| +static inline UBool isLineTerminator(UChar32 c) {
|
| + if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
|
| + return false;
|
| + }
|
| + return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
|
| +}
|
| +
|
| //-----------------------------------------------------------------------------
|
| //
|
| // Constructor and Destructor
|
| @@ -216,10 +225,6 @@ void RegexMatcher::init(UErrorCode &status) {
|
| fInput = NULL;
|
| fInputLength = 0;
|
| fInputUniStrMaybeMutable = FALSE;
|
| -
|
| - if (U_FAILURE(status)) {
|
| - fDeferredStatus = status;
|
| - }
|
| }
|
|
|
| //
|
| @@ -257,6 +262,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
|
|
|
| static const UChar BACKSLASH = 0x5c;
|
| static const UChar DOLLARSIGN = 0x24;
|
| +static const UChar LEFTBRACKET = 0x7b;
|
| +static const UChar RIGHTBRACKET = 0x7d;
|
| +
|
| //--------------------------------------------------------------------------------
|
| //
|
| // appendReplacement
|
| @@ -331,8 +339,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
|
| // TODO: optimize this loop by efficiently scanning for '$' or '\',
|
| // move entire ranges not containing substitutions.
|
| UTEXT_SETNATIVEINDEX(replacement, 0);
|
| - UChar32 c = UTEXT_NEXT32(replacement);
|
| - while (c != U_SENTINEL) {
|
| + for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
|
| if (c == BACKSLASH) {
|
| // Backslash Escape. Copy the following char out without further checks.
|
| // Note: Surrogate pairs don't need any special handling
|
| @@ -398,51 +405,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
|
| }
|
| }
|
| } else {
|
| - // We've got a $. Pick up a capture group number if one follows.
|
| - // Consume at most the number of digits necessary for the largest capture
|
| - // number that is valid for this pattern.
|
| + // We've got a $. Pick up a capture group name or number if one follows.
|
| + // Consume digits so long as the resulting group number <= the number of
|
| + // number of capture groups in the pattern.
|
|
|
| - int32_t numDigits = 0;
|
| int32_t groupNum = 0;
|
| - UChar32 digitC;
|
| - for (;;) {
|
| - digitC = UTEXT_CURRENT32(replacement);
|
| - if (digitC == U_SENTINEL) {
|
| - break;
|
| - }
|
| - if (u_isdigit(digitC) == FALSE) {
|
| - break;
|
| + int32_t numDigits = 0;
|
| + UChar32 nextChar = utext_current32(replacement);
|
| + if (nextChar == LEFTBRACKET) {
|
| + // Scan for a Named Capture Group, ${name}.
|
| + UnicodeString groupName;
|
| + utext_next32(replacement);
|
| + while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
|
| + nextChar = utext_next32(replacement);
|
| + if (nextChar == U_SENTINEL) {
|
| + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
|
| + (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
|
| + (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
|
| + groupName.append(nextChar);
|
| + } else if (nextChar == RIGHTBRACKET) {
|
| + groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
|
| + if (groupNum == 0) {
|
| + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + }
|
| + } else {
|
| + // Character was something other than a name char or a closing '}'
|
| + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + }
|
| }
|
| - (void)UTEXT_NEXT32(replacement);
|
| - groupNum=groupNum*10 + u_charDigitValue(digitC);
|
| - numDigits++;
|
| - if (numDigits >= fPattern->fMaxCaptureDigits) {
|
| - break;
|
| +
|
| + } else if (u_isdigit(nextChar)) {
|
| + // $n Scan for a capture group number
|
| + int32_t numCaptureGroups = fPattern->fGroupMap->size();
|
| + for (;;) {
|
| + nextChar = UTEXT_CURRENT32(replacement);
|
| + if (nextChar == U_SENTINEL) {
|
| + break;
|
| + }
|
| + if (u_isdigit(nextChar) == FALSE) {
|
| + break;
|
| + }
|
| + int32_t nextDigitVal = u_charDigitValue(nextChar);
|
| + if (groupNum*10 + nextDigitVal > numCaptureGroups) {
|
| + // Don't consume the next digit if it makes the capture group number too big.
|
| + if (numDigits == 0) {
|
| + status = U_INDEX_OUTOFBOUNDS_ERROR;
|
| + }
|
| + break;
|
| + }
|
| + (void)UTEXT_NEXT32(replacement);
|
| + groupNum=groupNum*10 + nextDigitVal;
|
| + ++numDigits;
|
| }
|
| + } else {
|
| + // $ not followed by capture group name or number.
|
| + status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| }
|
|
|
| -
|
| - if (numDigits == 0) {
|
| - // The $ didn't introduce a group number at all.
|
| - // Treat it as just part of the substitution text.
|
| - UChar c16 = DOLLARSIGN;
|
| - destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
|
| - } else {
|
| - // Finally, append the capture group data to the destination.
|
| + if (U_SUCCESS(status)) {
|
| destLen += appendGroup(groupNum, dest, status);
|
| - if (U_FAILURE(status)) {
|
| - // Can fail if group number is out of range.
|
| - break;
|
| - }
|
| }
|
| - }
|
| -
|
| - if (U_FAILURE(status)) {
|
| - break;
|
| - } else {
|
| - c = UTEXT_NEXT32(replacement);
|
| - }
|
| - }
|
| + } // End of $ capture group handling
|
| + } // End of per-character loop through the replacement string.
|
|
|
| return *this;
|
| }
|
| @@ -817,20 +842,19 @@ UBool RegexMatcher::find(UErrorCode &status) {
|
| }
|
| } else {
|
| for (;;) {
|
| - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
|
| - if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
|
| - (void)UTEXT_NEXT32(fInputText);
|
| - startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
| - }
|
| - MatchAt(startPos, FALSE, status);
|
| - if (U_FAILURE(status)) {
|
| - return FALSE;
|
| - }
|
| - if (fMatch) {
|
| - return TRUE;
|
| - }
|
| - UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
| + if (isLineTerminator(c)) {
|
| + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
|
| + (void)UTEXT_NEXT32(fInputText);
|
| + startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
| + }
|
| + MatchAt(startPos, FALSE, status);
|
| + if (U_FAILURE(status)) {
|
| + return FALSE;
|
| + }
|
| + if (fMatch) {
|
| + return TRUE;
|
| + }
|
| + UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
| }
|
| if (startPos >= testStartLimit) {
|
| fMatch = FALSE;
|
| @@ -1078,8 +1102,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
|
| } else {
|
| for (;;) {
|
| c = inputBuf[startPos-1];
|
| - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
|
| + if (isLineTerminator(c)) {
|
| if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
|
| startPos++;
|
| }
|
| @@ -1175,98 +1198,32 @@ UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
|
|
|
| UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
|
| UnicodeString result;
|
| - if (U_FAILURE(status)) {
|
| + int64_t groupStart = start64(groupNum, status);
|
| + int64_t groupEnd = end64(groupNum, status);
|
| + if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
|
| return result;
|
| }
|
| - UText resultText = UTEXT_INITIALIZER;
|
| - utext_openUnicodeString(&resultText, &result, &status);
|
| - group(groupNum, &resultText, status);
|
| - utext_close(&resultText);
|
| - return result;
|
| -}
|
| -
|
| -
|
| -// Return deep (mutable) clone
|
| -// Technology Preview (as an API), but note that the UnicodeString API is implemented
|
| -// using this function.
|
| -UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
|
| - if (U_FAILURE(status)) {
|
| - return dest;
|
| - }
|
|
|
| - if (U_FAILURE(fDeferredStatus)) {
|
| - status = fDeferredStatus;
|
| - } else if (fMatch == FALSE) {
|
| - status = U_REGEX_INVALID_STATE;
|
| - } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
|
| - status = U_INDEX_OUTOFBOUNDS_ERROR;
|
| - }
|
| - if (U_FAILURE(status)) {
|
| - return dest;
|
| - }
|
| -
|
| - int64_t s, e;
|
| - if (groupNum == 0) {
|
| - s = fMatchStart;
|
| - e = fMatchEnd;
|
| - } else {
|
| - int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
|
| - U_ASSERT(groupOffset < fPattern->fFrameSize);
|
| - U_ASSERT(groupOffset >= 0);
|
| - s = fFrame->fExtra[groupOffset];
|
| - e = fFrame->fExtra[groupOffset+1];
|
| - }
|
| -
|
| - if (s < 0) {
|
| - // A capture group wasn't part of the match
|
| - if (dest) {
|
| - utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
|
| - return dest;
|
| - } else {
|
| - return utext_openUChars(NULL, NULL, 0, &status);
|
| - }
|
| + // Get the group length using a utext_extract preflight.
|
| + // UText is actually pretty efficient at this when underlying encoding is UTF-16.
|
| + int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
|
| + if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| + return result;
|
| }
|
| - U_ASSERT(s <= e);
|
|
|
| - if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
|
| - U_ASSERT(e <= fInputLength);
|
| - if (dest) {
|
| - utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
|
| - } else {
|
| - UText groupText = UTEXT_INITIALIZER;
|
| - utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
|
| - dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
|
| - utext_close(&groupText);
|
| - }
|
| + status = U_ZERO_ERROR;
|
| + UChar *buf = result.getBuffer(length);
|
| + if (buf == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| } else {
|
| - int32_t len16;
|
| - if (UTEXT_USES_U16(fInputText)) {
|
| - len16 = (int32_t)(e-s);
|
| - } else {
|
| - UErrorCode lengthStatus = U_ZERO_ERROR;
|
| - len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
|
| - }
|
| - UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
|
| - if (groupChars == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - return dest;
|
| - }
|
| - utext_extract(fInputText, s, e, groupChars, len16+1, &status);
|
| -
|
| - if (dest) {
|
| - utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
|
| - } else {
|
| - UText groupText = UTEXT_INITIALIZER;
|
| - utext_openUChars(&groupText, groupChars, len16, &status);
|
| - dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
|
| - utext_close(&groupText);
|
| - }
|
| -
|
| - uprv_free(groupChars);
|
| + int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
|
| + result.releaseBuffer(extractLength);
|
| + U_ASSERT(length == extractLength);
|
| }
|
| - return dest;
|
| + return result;
|
| }
|
|
|
| +
|
| //--------------------------------------------------------------------------------
|
| //
|
| // appendGroup() -- currently internal only, appends a group to a UText rather
|
| @@ -1347,8 +1304,6 @@ int32_t RegexMatcher::groupCount() const {
|
| return fPattern->fGroupMap->size();
|
| }
|
|
|
| -
|
| -
|
| //--------------------------------------------------------------------------------
|
| //
|
| // hasAnchoringBounds()
|
| @@ -1884,6 +1839,9 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
|
| if (fPattern->fNeedsAltInput) {
|
| fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
|
| }
|
| + if (U_FAILURE(fDeferredStatus)) {
|
| + return *this;
|
| + }
|
| fInputLength = utext_nativeLength(fInputText);
|
|
|
| reset();
|
| @@ -1908,6 +1866,9 @@ RegexMatcher &RegexMatcher::reset(UText *input) {
|
| if (fInputText != input) {
|
| fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
|
| if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
|
| + if (U_FAILURE(fDeferredStatus)) {
|
| + return *this;
|
| + }
|
| fInputLength = utext_nativeLength(fInputText);
|
|
|
| delete fInput;
|
| @@ -1995,6 +1956,67 @@ void RegexMatcher::setTrace(UBool state) {
|
|
|
|
|
|
|
| +/**
|
| + * UText, replace entire contents of the destination UText with a substring of the source UText.
|
| + *
|
| + * @param src The source UText
|
| + * @param dest The destination UText. Must be writable.
|
| + * May be NULL, in which case a new UText will be allocated.
|
| + * @param start Start index of source substring.
|
| + * @param limit Limit index of source substring.
|
| + * @param status An error code.
|
| + */
|
| +static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
|
| + if (U_FAILURE(*status)) {
|
| + return dest;
|
| + }
|
| + if (start == limit) {
|
| + if (dest) {
|
| + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
|
| + return dest;
|
| + } else {
|
| + return utext_openUChars(NULL, NULL, 0, status);
|
| + }
|
| + }
|
| + int32_t length = utext_extract(src, start, limit, NULL, 0, status);
|
| + if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
|
| + return dest;
|
| + }
|
| + *status = U_ZERO_ERROR;
|
| + MaybeStackArray<UChar, 40> buffer;
|
| + if (length >= buffer.getCapacity()) {
|
| + UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
|
| + if (newBuf == NULL) {
|
| + *status = U_MEMORY_ALLOCATION_ERROR;
|
| + }
|
| + }
|
| + utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
|
| + if (dest) {
|
| + utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
|
| + return dest;
|
| + }
|
| +
|
| + // Caller did not provide a prexisting UText.
|
| + // Open a new one, and have it adopt the text buffer storage.
|
| + if (U_FAILURE(*status)) {
|
| + return NULL;
|
| + }
|
| + int32_t ownedLength = 0;
|
| + UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
|
| + if (ownedBuf == NULL) {
|
| + *status = U_MEMORY_ALLOCATION_ERROR;
|
| + return NULL;
|
| + }
|
| + UText *result = utext_openUChars(NULL, ownedBuf, length, status);
|
| + if (U_FAILURE(*status)) {
|
| + uprv_free(ownedBuf);
|
| + return NULL;
|
| + }
|
| + result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
|
| + return result;
|
| +}
|
| +
|
| +
|
| //---------------------------------------------------------------------
|
| //
|
| // split
|
| @@ -2161,7 +2183,8 @@ int32_t RegexMatcher::split(UText *input,
|
| break;
|
| }
|
| i++;
|
| - dest[i] = group(groupNum, dest[i], status);
|
| + dest[i] = utext_extract_replace(fInputText, dest[i],
|
| + start64(groupNum, status), end64(groupNum, status), &status);
|
| }
|
|
|
| if (nextOutputStringStart == fActiveLimit) {
|
| @@ -2473,6 +2496,10 @@ REStackFrame *RegexMatcher::resetStack() {
|
| fStack->removeAllElements();
|
|
|
| REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
|
| + if(U_FAILURE(fDeferredStatus)) {
|
| + return NULL;
|
| + }
|
| +
|
| int32_t i;
|
| for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
|
| iFrame->fExtra[i] = -1;
|
| @@ -2660,9 +2687,12 @@ void RegexMatcher::IncrementTime(UErrorCode &status) {
|
| //
|
| //--------------------------------------------------------------------------------
|
| inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
|
| + if (U_FAILURE(status)) {
|
| + return fp;
|
| + }
|
| // push storage for a new frame.
|
| int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
|
| - if (newFP == NULL) {
|
| + if (U_FAILURE(status)) {
|
| // Failure on attempted stack expansion.
|
| // Stack function set some other error code, change it to a more
|
| // specific one for regular expressions.
|
| @@ -2754,6 +2784,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|
|
| fFrameSize = fPattern->fFrameSize;
|
| REStackFrame *fp = resetStack();
|
| + if (U_FAILURE(fDeferredStatus)) {
|
| + status = fDeferredStatus;
|
| + return;
|
| + }
|
|
|
| fp->fPatIdx = 0;
|
| fp->fInputIdx = startIdx;
|
| @@ -2907,9 +2941,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
| // end of input, succeed.
|
| UChar32 c = UTEXT_NEXT32(fInputText);
|
| if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
|
| - if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
|
| + if (isLineTerminator(c)) {
|
| // If not in the middle of a CR/LF sequence
|
| - if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
|
| + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
|
| // At new-line at end of input. Success
|
| fHitEnd = TRUE;
|
| fRequireEnd = TRUE;
|
| @@ -2965,7 +2999,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
| // It makes no difference where the new-line is within the input.
|
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
| UChar32 c = UTEXT_CURRENT32(fInputText);
|
| - if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
|
| + if (isLineTerminator(c)) {
|
| // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
|
| // In multi-line mode, hitting a new-line just before the end of input does not
|
| // set the hitEnd or requireEnd flags
|
| @@ -3014,8 +3048,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
| // unless we are at the end of input
|
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
| UChar32 c = UTEXT_PREVIOUS32(fInputText);
|
| - if ((fp->fInputIdx < fAnchorLimit) &&
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
| + if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
|
| // It's a new-line. ^ is true. Success.
|
| // TODO: what should be done with positions between a CR and LF?
|
| break;
|
| @@ -3096,6 +3129,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
| break;
|
|
|
|
|
| + case URX_BACKSLASH_H: // Test for \h, horizontal white space.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
| + UChar32 c = UTEXT_NEXT32(fInputText);
|
| + int8_t ctype = u_charType(c);
|
| + UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
|
| + success ^= (UBool)(opValue != 0); // flip sense for \H
|
| + if (success) {
|
| + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + } else {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| + case URX_BACKSLASH_R: // Test for \R, any line break sequence.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
| + UChar32 c = UTEXT_NEXT32(fInputText);
|
| + if (isLineTerminator(c)) {
|
| + if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
|
| + utext_next32(fInputText);
|
| + }
|
| + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + } else {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| + case URX_BACKSLASH_V: // \v, any single line ending character.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
| + UChar32 c = UTEXT_NEXT32(fInputText);
|
| + UBool success = isLineTerminator(c);
|
| + success ^= (UBool)(opValue != 0); // flip sense for \V
|
| + if (success) {
|
| + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + } else {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| case URX_BACKSLASH_X:
|
| // Match a Grapheme, as defined by Unicode TR 29.
|
| // Differs slightly from Perl, which consumes combining marks independently
|
| @@ -3323,8 +3418,7 @@ GC_Done:
|
|
|
| // There is input left. Advance over one char, unless we've hit end-of-line
|
| UChar32 c = UTEXT_NEXT32(fInputText);
|
| - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
| + if (isLineTerminator(c)) {
|
| // End of line in normal mode. . does not match.
|
| fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| break;
|
| @@ -4081,7 +4175,7 @@ GC_Done:
|
| if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
|
| if ((c == 0x0a) || // 0x0a is newline in both modes.
|
| (((opValue & 2) == 0) && // IF not UNIX_LINES mode
|
| - (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
|
| + isLineTerminator(c))) {
|
| // char is a line ending. Exit the scanning loop.
|
| break;
|
| }
|
| @@ -4257,6 +4351,10 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|
|
| fFrameSize = fPattern->fFrameSize;
|
| REStackFrame *fp = resetStack();
|
| + if (U_FAILURE(fDeferredStatus)) {
|
| + status = fDeferredStatus;
|
| + return;
|
| + }
|
|
|
| fp->fPatIdx = 0;
|
| fp->fInputIdx = startIdx;
|
| @@ -4412,7 +4510,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
| UChar32 c;
|
| U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
|
|
|
| - if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
|
| + if (isLineTerminator(c)) {
|
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
|
| // At new-line at end of input. Success
|
| fHitEnd = TRUE;
|
| @@ -4466,7 +4564,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
| // If we are positioned just before a new-line, succeed.
|
| // It makes no difference where the new-line is within the input.
|
| UChar32 c = inputBuf[fp->fInputIdx];
|
| - if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
|
| + if (isLineTerminator(c)) {
|
| // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
|
| // In multi-line mode, hitting a new-line just before the end of input does not
|
| // set the hitEnd or requireEnd flags
|
| @@ -4514,7 +4612,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
| // unless we are at the end of input
|
| UChar c = inputBuf[fp->fInputIdx - 1];
|
| if ((fp->fInputIdx < fAnchorLimit) &&
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
| + isLineTerminator(c)) {
|
| // It's a new-line. ^ is true. Success.
|
| // TODO: what should be done with positions between a CR and LF?
|
| break;
|
| @@ -4591,6 +4689,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
| break;
|
|
|
|
|
| + case URX_BACKSLASH_H: // Test for \h, horizontal white space.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UChar32 c;
|
| + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
| + int8_t ctype = u_charType(c);
|
| + UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
|
| + success ^= (UBool)(opValue != 0); // flip sense for \H
|
| + if (!success) {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| + case URX_BACKSLASH_R: // Test for \R, any line break sequence.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UChar32 c;
|
| + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
| + if (isLineTerminator(c)) {
|
| + if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
|
| + // Check for CR/LF sequence. Consume both together when found.
|
| + UChar c2;
|
| + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
|
| + if (c2 != 0x0a) {
|
| + U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
|
| + }
|
| + }
|
| + } else {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| + case URX_BACKSLASH_V: // Any single code point line ending.
|
| + {
|
| + if (fp->fInputIdx >= fActiveLimit) {
|
| + fHitEnd = TRUE;
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + break;
|
| + }
|
| + UChar32 c;
|
| + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
| + UBool success = isLineTerminator(c);
|
| + success ^= (UBool)(opValue != 0); // flip sense for \V
|
| + if (!success) {
|
| + fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| + }
|
| + }
|
| + break;
|
| +
|
| +
|
| +
|
| case URX_BACKSLASH_X:
|
| // Match a Grapheme, as defined by Unicode TR 29.
|
| // Differs slightly from Perl, which consumes combining marks independently
|
| @@ -4800,8 +4961,7 @@ GC_Done:
|
| // There is input left. Advance over one char, unless we've hit end-of-line
|
| UChar32 c;
|
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
| - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
| + if (isLineTerminator(c)) {
|
| // End of line in normal mode. . does not match.
|
| fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
| break;
|
| @@ -5515,7 +5675,7 @@ GC_Done:
|
| if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
|
| if ((c == 0x0a) || // 0x0a is newline in both modes.
|
| (((opValue & 2) == 0) && // IF not UNIX_LINES mode
|
| - ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
|
| + isLineTerminator(c))) {
|
| // char is a line ending. Put the input pos back to the
|
| // line ending char, and exit the scanning loop.
|
| U16_BACK_1(inputBuf, 0, ix);
|
|
|