| Index: source/i18n/uregex.cpp
|
| diff --git a/source/i18n/uregex.cpp b/source/i18n/uregex.cpp
|
| index 01951234b9c0fead09307ee16038722ff28517bf..99e94283816cd2f6eec8848dea657fa4198ec52d 100644
|
| --- a/source/i18n/uregex.cpp
|
| +++ b/source/i18n/uregex.cpp
|
| @@ -1,6 +1,6 @@
|
| /*
|
| *******************************************************************************
|
| -* Copyright (C) 2004-2014, International Business Machines
|
| +* Copyright (C) 2004-2015, International Business Machines
|
| * Corporation and others. All Rights Reserved.
|
| *******************************************************************************
|
| * file name: uregex.cpp
|
| @@ -17,14 +17,14 @@
|
| #include "unicode/uchar.h"
|
| #include "unicode/uobject.h"
|
| #include "unicode/utf16.h"
|
| -#include "umutex.h"
|
| -#include "uassert.h"
|
| #include "cmemory.h"
|
| +#include "uassert.h"
|
| +#include "uhash.h"
|
| +#include "umutex.h"
|
| +#include "uvectr32.h"
|
|
|
| #include "regextxt.h"
|
|
|
| -#include <stdio.h>
|
| -
|
| U_NAMESPACE_BEGIN
|
|
|
| #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
|
| @@ -627,6 +627,36 @@ uregex_groupCount(URegularExpression *regexp2,
|
|
|
| //------------------------------------------------------------------------------
|
| //
|
| +// uregex_groupNumberFromName
|
| +//
|
| +//------------------------------------------------------------------------------
|
| +// Look up the capture group number for a group name supplied as UChars.
|
| +// Returns 0 when validateRE() fails; status is passed on to the pattern's lookup.
|
| +int32_t
|
| +uregex_groupNumberFromName(URegularExpression *regexp2, const UChar *groupName,
|
| + int32_t nameLength, UErrorCode *status) {
|
| + RegularExpression *re = (RegularExpression*)regexp2;
|
| + if (!validateRE(re, FALSE, status)) {
|
| + return 0;
|
| + }
|
| + const UnicodeString name(groupName, nameLength);
|
| + return re->fPat->groupNumberFromName(name, *status);
|
| +}
|
| +
|
| +// Look up the capture group number for a group name supplied as a char string.
|
| +// Returns 0 when validateRE() fails; status is passed on to the pattern's lookup.
|
| +int32_t
|
| +uregex_groupNumberFromCName(URegularExpression *regexp2, const char *groupName,
|
| + int32_t nameLength, UErrorCode *status) {
|
| + RegularExpression *re = (RegularExpression*)regexp2;
|
| + if (!validateRE(re, FALSE, status)) {
|
| + return 0;
|
| + }
|
| + return re->fPat->groupNumberFromName(groupName, nameLength, *status);
|
| +}
|
| +
|
| +//------------------------------------------------------------------------------
|
| +//
|
| // uregex_group
|
| //
|
| //------------------------------------------------------------------------------
|
| @@ -647,7 +677,7 @@ uregex_group(URegularExpression *regexp2,
|
|
|
| if (destCapacity == 0 || regexp->fText != NULL) {
|
| // If preflighting or if we already have the text as UChars,
|
| - // this is a little cheaper than going through uregex_groupUTextDeep()
|
| + // this is a little cheaper than extracting from the UText
|
|
|
| //
|
| // Pick up the range of characters from the matcher
|
| @@ -680,14 +710,18 @@ uregex_group(URegularExpression *regexp2,
|
| }
|
| return fullLength;
|
| } else {
|
| - int32_t result = 0;
|
| - UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
|
| - if (U_SUCCESS(*status)) {
|
| - result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
|
| + int64_t start = regexp->fMatcher->start64(groupNum, *status);
|
| + int64_t limit = regexp->fMatcher->end64(groupNum, *status);
|
| + if (U_FAILURE(*status)) {
|
| + return 0;
|
| }
|
| - utext_close(groupText);
|
| - return result;
|
| + // Note edge cases:
|
| + // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
|
| + // Zero Length Match: start == end.
|
| + int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
|
| + return length;
|
| }
|
| +
|
| }
|
|
|
|
|
| @@ -713,49 +747,6 @@ uregex_groupUText(URegularExpression *regexp2,
|
|
|
| //------------------------------------------------------------------------------
|
| //
|
| -// uregex_groupUTextDeep
|
| -//
|
| -//------------------------------------------------------------------------------
|
| -U_CAPI UText * U_EXPORT2
|
| -uregex_groupUTextDeep(URegularExpression *regexp2,
|
| - int32_t groupNum,
|
| - UText *dest,
|
| - UErrorCode *status) {
|
| - RegularExpression *regexp = (RegularExpression*)regexp2;
|
| - if (validateRE(regexp, TRUE, status) == FALSE) {
|
| - UErrorCode emptyTextStatus = U_ZERO_ERROR;
|
| - return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
|
| - }
|
| -
|
| - if (regexp->fText != NULL) {
|
| - //
|
| - // Pick up the range of characters from the matcher
|
| - // and use our already-extracted characters
|
| - //
|
| - int32_t startIx = regexp->fMatcher->start(groupNum, *status);
|
| - int32_t endIx = regexp->fMatcher->end (groupNum, *status);
|
| - if (U_FAILURE(*status)) {
|
| - UErrorCode emptyTextStatus = U_ZERO_ERROR;
|
| - return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
|
| - }
|
| -
|
| - if (dest) {
|
| - utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
|
| - } else {
|
| - UText groupText = UTEXT_INITIALIZER;
|
| - utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
|
| - dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
|
| - utext_close(&groupText);
|
| - }
|
| -
|
| - return dest;
|
| - } else {
|
| - return regexp->fMatcher->group(groupNum, dest, *status);
|
| - }
|
| -}
|
| -
|
| -//------------------------------------------------------------------------------
|
| -//
|
| // uregex_start
|
| //
|
| //------------------------------------------------------------------------------
|
| @@ -1324,6 +1315,8 @@ U_NAMESPACE_END
|
|
|
| static const UChar BACKSLASH = 0x5c;
|
| static const UChar DOLLARSIGN = 0x24;
|
| +static const UChar LEFTBRACKET = 0x7b;
|
| +static const UChar RIGHTBRACKET = 0x7d;
|
|
|
| //
|
| // Move a character to an output buffer, with bounds checking on the index.
|
| @@ -1398,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
| matchStart = (int32_t)m->fMatchStart;
|
| } else {
|
| // !!!: Would like a better way to do this!
|
| - UErrorCode status = U_ZERO_ERROR;
|
| - lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
|
| - status = U_ZERO_ERROR;
|
| - matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
|
| + UErrorCode tempStatus = U_ZERO_ERROR;
|
| + lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
|
| + tempStatus = U_ZERO_ERROR;
|
| + matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
|
| }
|
| for (i=lastMatchEnd; i<matchStart; i++) {
|
| appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
|
| @@ -1416,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|
|
| // scan the replacement text, looking for substitutions ($n) and \escapes.
|
| int32_t replIdx = 0;
|
| - while (replIdx < replacementLength) {
|
| + while (replIdx < replacementLength && U_SUCCESS(*status)) {
|
| UChar c = replacementText[replIdx];
|
| replIdx++;
|
| if (c != DOLLARSIGN && c != BACKSLASH) {
|
| @@ -1465,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
| continue;
|
| }
|
|
|
| + // We've got a $. Pick up the following capture group name or number.
|
| + // For numbers, consume only digits that produce a valid capture group for the pattern.
|
|
|
| -
|
| - // We've got a $. Pick up a capture group number if one follows.
|
| - // Consume at most the number of digits necessary for the largest capture
|
| - // number that is valid for this pattern.
|
| -
|
| - int32_t numDigits = 0;
|
| int32_t groupNum = 0;
|
| - UChar32 digitC;
|
| - for (;;) {
|
| - if (replIdx >= replacementLength) {
|
| - break;
|
| - }
|
| - U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
|
| - if (u_isdigit(digitC) == FALSE) {
|
| - break;
|
| - }
|
| + U_ASSERT(c == DOLLARSIGN);
|
| + UChar32 c32 = -1; // sentinel: the '$' may be the last code unit of the replacement text
|
| + if (replIdx < replacementLength) { U16_GET(replacementText, 0, replIdx, replacementLength, c32); }
|
| + if (u_isdigit(c32)) {
|
| + int32_t numDigits = 0;
|
| + int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
|
| + for (;;) {
|
| + if (replIdx >= replacementLength) {
|
| + break;
|
| + }
|
| + U16_GET(replacementText, 0, replIdx, replacementLength, c32);
|
| + if (u_isdigit(c32) == FALSE) {
|
| + break;
|
| + }
|
|
|
| + int32_t digitVal = u_charDigitValue(c32);
|
| + if (groupNum * 10 + digitVal <= numCaptureGroups) {
|
| + groupNum = groupNum * 10 + digitVal;
|
| + U16_FWD_1(replacementText, replIdx, replacementLength);
|
| + numDigits++;
|
| + } else {
|
| + if (numDigits == 0) {
|
| + *status = U_INDEX_OUTOFBOUNDS_ERROR;
|
| + }
|
| + break;
|
| + }
|
| + }
|
| + } else if (c32 == LEFTBRACKET) {
|
| + // Scan for Named Capture Group, ${name}.
|
| + UnicodeString groupName;
|
| U16_FWD_1(replacementText, replIdx, replacementLength);
|
| - groupNum=groupNum*10 + u_charDigitValue(digitC);
|
| - numDigits++;
|
| - if (numDigits >= m->fPattern->fMaxCaptureDigits) {
|
| - break;
|
| + while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
|
| + if (replIdx >= replacementLength) {
|
| + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + break;
|
| + }
|
| + U16_NEXT(replacementText, replIdx, replacementLength, c32);
|
| + if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
|
| + (c32 >= 0x61 && c32 <= 0x7a) || // a..z
|
| + (c32 >= 0x30 && c32 <= 0x39)) { // 0..9 (0x30 is '0')
|
| + groupName.append(c32);
|
| + } else if (c32 == RIGHTBRACKET) {
|
| + groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
|
| + if (groupNum == 0) {
|
| + // Name not defined by pattern.
|
| + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + }
|
| + } else {
|
| + // Character was something other than a name char or a closing '}'
|
| + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| + }
|
| }
|
| + } else {
|
| + // $ not followed by {name} or digits.
|
| + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
| }
|
|
|
|
|
| - if (numDigits == 0) {
|
| - // The $ didn't introduce a group number at all.
|
| - // Treat it as just part of the substitution text.
|
| - appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
|
| - continue;
|
| - }
|
| -
|
| // Finally, append the capture group data to the destination.
|
| - destIdx += uregex_group((URegularExpression*)regexp, groupNum,
|
| - dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
|
| - if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
| - // Ignore buffer overflow when extracting the group. We need to
|
| - // continue on to get full size of the untruncated result. We will
|
| - // raise our own buffer overflow error at the end.
|
| - *status = U_ZERO_ERROR;
|
| + if (U_SUCCESS(*status)) {
|
| + destIdx += uregex_group((URegularExpression*)regexp, groupNum,
|
| + dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
|
| + if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
| + // Ignore buffer overflow when extracting the group. We need to
|
| + // continue on to get full size of the untruncated result. We will
|
| + // raise our own buffer overflow error at the end.
|
| + *status = U_ZERO_ERROR;
|
| + }
|
| }
|
|
|
| if (U_FAILURE(*status)) {
|
| - // Can fail if group number is out of range.
|
| + // bad group number or name.
|
| break;
|
| }
|
| -
|
| }
|
|
|
| //
|
| @@ -1522,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
| //
|
| if (destIdx < capacity) {
|
| dest[destIdx] = 0;
|
| - } else if (destIdx == *destCapacity) {
|
| - *status = U_STRING_NOT_TERMINATED_WARNING;
|
| - } else {
|
| - *status = U_BUFFER_OVERFLOW_ERROR;
|
| + } else if (U_SUCCESS(*status)) {
|
| + if (destIdx == *destCapacity) {
|
| + *status = U_STRING_NOT_TERMINATED_WARNING;
|
| + } else {
|
| + *status = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| }
|
|
|
| //
|
|
|