| Index: source/i18n/regexcmp.cpp
|
| diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp
|
| index ea01d5ab60bd28dce80a60b82a656c032a675c19..e518e84cd3520972ca2326f7fd5deb0405de9ac1 100644
|
| --- a/source/i18n/regexcmp.cpp
|
| +++ b/source/i18n/regexcmp.cpp
|
| @@ -1,7 +1,7 @@
|
| //
|
| // file: regexcmp.cpp
|
| //
|
| -// Copyright (C) 2002-2014 International Business Machines Corporation and others.
|
| +// Copyright (C) 2002-2015 International Business Machines Corporation and others.
|
| // All Rights Reserved.
|
| //
|
| // This file contains the ICU regular expression compiler, which is responsible
|
| @@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
|
|
|
| fMatchOpenParen = -1;
|
| fMatchCloseParen = -1;
|
| + fCaptureName = NULL;
|
|
|
| if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
|
| status = rxp->fDeferredStatus;
|
| @@ -86,6 +87,8 @@ static const UChar chDash = 0x2d; // '-'
|
| //
|
| //------------------------------------------------------------------------------
|
| RegexCompile::~RegexCompile() {
|
| + delete fCaptureName; // Normally NULL: the name is handed to the named capture map or deleted
|
| + // after a named back reference. Non-NULL only if compilation stops with a syntax error.
|
| }
|
|
|
| static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
|
| @@ -138,6 +141,9 @@ void RegexCompile::compile(
|
|
|
| // Prepare the RegexPattern object to receive the compiled pattern.
|
| fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
|
| + if (U_FAILURE(*fStatus)) {
|
| + return;
|
| + }
|
| fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
|
| fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
|
|
|
| @@ -284,17 +290,6 @@ void RegexCompile::compile(
|
| //
|
|
|
| //
|
| - // Compute the number of digits requried for the largest capture group number.
|
| - //
|
| - fRXPat->fMaxCaptureDigits = 1;
|
| - int32_t n = 10;
|
| - int32_t groupCount = fRXPat->fGroupMap->size();
|
| - while (n <= groupCount) {
|
| - fRXPat->fMaxCaptureDigits++;
|
| - n *= 10;
|
| - }
|
| -
|
| - //
|
| // The pattern's fFrameSize so far has accumulated the requirements for
|
| // storage for capture parentheses, counters, etc. that are encountered
|
| // in the pattern. Add space for the two variables that are always
|
| @@ -435,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| break;
|
|
|
|
|
| + case doBeginNamedCapture:
|
| + // Scanning (?<letter.
|
| + // The first letter of the name will come through again under doContinueNamedCapture.
|
| + fCaptureName = new UnicodeString();
|
| + if (fCaptureName == NULL) {
|
| + error(U_MEMORY_ALLOCATION_ERROR);
|
| + }
|
| + break;
|
| +
|
| + case doContinueNamedCapture:
|
| + fCaptureName->append(fC.fChar);
|
| + break;
|
| +
|
| + case doBadNamedCapture:
|
| + error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
| + break;
|
| +
|
| case doOpenCaptureParen:
|
| - // Open Paren.
|
| + // Open Capturing Paren, possibly named.
|
| // Compile to a
|
| // - NOP, which later may be replaced by a save-state if the
|
| // parenthesized group gets a * quantifier, followed by
|
| @@ -471,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|
|
| // Save the mapping from group number to stack frame variable position.
|
| fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
|
| +
|
| + // If this is a named capture group, add the name->group number mapping.
|
| + if (fCaptureName != NULL) {
|
| + int32_t groupNumber = fRXPat->fGroupMap->size();
|
| + int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
|
| + fCaptureName = NULL; // hash table takes ownership of the name (key) string.
|
| + if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
|
| + error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
| + }
|
| + }
|
| }
|
| - break;
|
| + break;
|
|
|
| case doOpenNonCaptureParen:
|
| // Open non-caputuring (grouping only) Paren.
|
| @@ -978,9 +1000,11 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| {
|
| int32_t digitValue = u_charDigitValue(fC.fChar);
|
| U_ASSERT(digitValue >= 0);
|
| - fIntervalLow = fIntervalLow*10 + digitValue;
|
| - if (fIntervalLow < 0) {
|
| + int64_t val = (int64_t)fIntervalLow*10 + digitValue;
|
| + if (val > INT32_MAX) {
|
| error(U_REGEX_NUMBER_TOO_BIG);
|
| + } else {
|
| + fIntervalLow = (int32_t)val;
|
| }
|
| }
|
| break;
|
| @@ -993,9 +1017,11 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| }
|
| int32_t digitValue = u_charDigitValue(fC.fChar);
|
| U_ASSERT(digitValue >= 0);
|
| - fIntervalUpper = fIntervalUpper*10 + digitValue;
|
| - if (fIntervalUpper < 0) {
|
| + int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
|
| + if (val > INT32_MAX) {
|
| error(U_REGEX_NUMBER_TOO_BIG);
|
| + } else {
|
| + fIntervalUpper = (int32_t)val;
|
| }
|
| }
|
| break;
|
| @@ -1162,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| appendOp(URX_BACKSLASH_G, 0);
|
| break;
|
|
|
| + case doBackslashH:
|
| + fixLiterals(FALSE);
|
| + appendOp(URX_BACKSLASH_H, 1);
|
| + break;
|
| +
|
| + case doBackslashh:
|
| + fixLiterals(FALSE);
|
| + appendOp(URX_BACKSLASH_H, 0);
|
| + break;
|
| +
|
| + case doBackslashR:
|
| + fixLiterals(FALSE);
|
| + appendOp(URX_BACKSLASH_R, 0);
|
| + break;
|
| +
|
| case doBackslashS:
|
| fixLiterals(FALSE);
|
| appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
|
| @@ -1172,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
|
| break;
|
|
|
| + case doBackslashV:
|
| + fixLiterals(FALSE);
|
| + appendOp(URX_BACKSLASH_V, 1);
|
| + break;
|
| +
|
| + case doBackslashv:
|
| + fixLiterals(FALSE);
|
| + appendOp(URX_BACKSLASH_V, 0);
|
| + break;
|
| +
|
| case doBackslashW:
|
| fixLiterals(FALSE);
|
| appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
|
| @@ -1263,7 +1314,41 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| }
|
| break;
|
|
|
| + case doBeginNamedBackRef:
|
| + U_ASSERT(fCaptureName == NULL);
|
| + fCaptureName = new UnicodeString;
|
| + if (fCaptureName == NULL) {
|
| + error(U_MEMORY_ALLOCATION_ERROR);
|
| + }
|
| + break;
|
| +
|
| + case doContinueNamedBackRef:
|
| + fCaptureName->append(fC.fChar);
|
| + break;
|
|
|
| + case doCompleteNamedBackRef:
|
| + {
|
| + int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
|
| + if (groupNumber == 0) {
|
| + // Group name has not been defined.
|
| + // Could be a forward reference. If we choose to support them at some
|
| + // future time, extra mechanism will be required at this point.
|
| + error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
| + } else {
|
| + // Given the number, handle identically to a \n numbered back reference.
|
| + // See comments above, under doBackRef
|
| + fixLiterals(FALSE);
|
| + if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
|
| + appendOp(URX_BACKREF_I, groupNumber);
|
| + } else {
|
| + appendOp(URX_BACKREF, groupNumber);
|
| + }
|
| + }
|
| + delete fCaptureName;
|
| + fCaptureName = NULL;
|
| + break;
|
| + }
|
| +
|
| case doPossessivePlus:
|
| // Possessive ++ quantifier.
|
| // Compiles to
|
| @@ -1488,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action)
|
| break;
|
| }
|
|
|
| + case doSetBackslash_h:
|
| + {
|
| + UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
| + UnicodeSet h;
|
| + h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
| + h.add((UChar32)9); // Tab
|
| + set->addAll(h);
|
| + break;
|
| + }
|
| +
|
| + case doSetBackslash_H:
|
| + {
|
| + UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
| + UnicodeSet h;
|
| + h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
| + h.add((UChar32)9); // Tab
|
| + h.complement();
|
| + set->addAll(h);
|
| + break;
|
| + }
|
| +
|
| + case doSetBackslash_v:
|
| + {
|
| + UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
| + set->add((UChar32)0x0a, (UChar32)0x0d); // add range
|
| + set->add((UChar32)0x85);
|
| + set->add((UChar32)0x2028, (UChar32)0x2029);
|
| + break;
|
| + }
|
| +
|
| + case doSetBackslash_V:
|
| + {
|
| + UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
| + UnicodeSet v;
|
| + v.add((UChar32)0x0a, (UChar32)0x0d); // add range
|
| + v.add((UChar32)0x85);
|
| + v.add((UChar32)0x2028, (UChar32)0x2029);
|
| + v.complement();
|
| + set->addAll(v);
|
| + break;
|
| + }
|
| +
|
| case doSetBackslash_w:
|
| {
|
| UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
| @@ -2689,6 +2816,43 @@ void RegexCompile::matchStartType() {
|
| break;
|
|
|
|
|
| + case URX_BACKSLASH_H:
|
| + // Horiz white space
|
| + if (currentLen == 0) {
|
| + UnicodeSet s;
|
| + s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
| + s.add((UChar32)9); // Tab
|
| + if (URX_VAL(op) != 0) {
|
| + s.complement();
|
| + }
|
| + fRXPat->fInitialChars->addAll(s);
|
| + numInitialStrings += 2;
|
| + }
|
| + currentLen++;
|
| + atStart = FALSE;
|
| + break;
|
| +
|
| +
|
| + case URX_BACKSLASH_R: // Any line ending sequence
|
| + case URX_BACKSLASH_V: // Any line ending code point, with optional negation
|
| + if (currentLen == 0) {
|
| + UnicodeSet s;
|
| + s.add((UChar32)0x0a, (UChar32)0x0d); // add range
|
| + s.add((UChar32)0x85);
|
| + s.add((UChar32)0x2028, (UChar32)0x2029);
|
| + if (URX_VAL(op) != 0) {
|
| + // Complement option applies to URX_BACKSLASH_V only.
|
| + s.complement();
|
| + }
|
| + fRXPat->fInitialChars->addAll(s);
|
| + numInitialStrings += 2;
|
| + }
|
| + currentLen++;
|
| + atStart = FALSE;
|
| + break;
|
| +
|
| +
|
| +
|
| case URX_ONECHAR_I:
|
| // Case Insensitive Single Character.
|
| if (currentLen == 0) {
|
| @@ -3077,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
| case URX_STAT_SETREF_N:
|
| case URX_SETREF:
|
| case URX_BACKSLASH_D:
|
| + case URX_BACKSLASH_H:
|
| + case URX_BACKSLASH_R:
|
| + case URX_BACKSLASH_V:
|
| case URX_ONECHAR_I:
|
| case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
| case URX_DOTANY_ALL: // . matches one or two.
|
| @@ -3358,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
| case URX_STAT_SETREF_N:
|
| case URX_SETREF:
|
| case URX_BACKSLASH_D:
|
| + case URX_BACKSLASH_H:
|
| + case URX_BACKSLASH_R:
|
| + case URX_BACKSLASH_V:
|
| case URX_ONECHAR_I:
|
| case URX_DOTANY_ALL:
|
| case URX_DOTANY:
|
| @@ -3479,12 +3649,13 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
| }
|
|
|
| U_ASSERT(loopEndLoc >= loc+4);
|
| - int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
|
| - if (blockLen == INT32_MAX) {
|
| - currentLen = blockLen;
|
| + int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
|
| + int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
|
| + if (updatedLen >= INT32_MAX) {
|
| + currentLen = INT32_MAX;
|
| break;
|
| }
|
| - currentLen += blockLen * maxLoopCount;
|
| + currentLen = (int32_t)updatedLen;
|
| loc = loopEndLoc;
|
| break;
|
| }
|
| @@ -3685,6 +3856,9 @@ void RegexCompile::stripNOPs() {
|
| case URX_LOOP_C:
|
| case URX_DOLLAR_D:
|
| case URX_DOLLAR_MD:
|
| + case URX_BACKSLASH_H:
|
| + case URX_BACKSLASH_R:
|
| + case URX_BACKSLASH_V:
|
| // These instructions are unaltered by the relocation.
|
| fRXPat->fCompiledPat->setElementAt(op, dst);
|
| dst++;
|
|
|