Index: source/i18n/regexcmp.cpp |
diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp |
index ea01d5ab60bd28dce80a60b82a656c032a675c19..e518e84cd3520972ca2326f7fd5deb0405de9ac1 100644 |
--- a/source/i18n/regexcmp.cpp |
+++ b/source/i18n/regexcmp.cpp |
@@ -1,7 +1,7 @@ |
// |
// file: regexcmp.cpp |
// |
-// Copyright (C) 2002-2014 International Business Machines Corporation and others. |
+// Copyright (C) 2002-2015 International Business Machines Corporation and others. |
// All Rights Reserved. |
// |
// This file contains the ICU regular expression compiler, which is responsible |
@@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : |
fMatchOpenParen = -1; |
fMatchCloseParen = -1; |
+ fCaptureName = NULL; |
if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { |
status = rxp->fDeferredStatus; |
@@ -86,6 +87,8 @@ static const UChar chDash = 0x2d; // '-' |
// |
//------------------------------------------------------------------------------ |
RegexCompile::~RegexCompile() { |
+ delete fCaptureName; // Normally will be NULL, but can exist if pattern |
+ // compilation stops with a syntax error. |
} |
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { |
@@ -138,6 +141,9 @@ void RegexCompile::compile( |
// Prepare the RegexPattern object to receive the compiled pattern. |
fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); |
+ if (U_FAILURE(*fStatus)) { |
+ return; |
+ } |
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; |
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; |
@@ -284,17 +290,6 @@ void RegexCompile::compile( |
// |
// |
- // Compute the number of digits requried for the largest capture group number. |
- // |
- fRXPat->fMaxCaptureDigits = 1; |
- int32_t n = 10; |
- int32_t groupCount = fRXPat->fGroupMap->size(); |
- while (n <= groupCount) { |
- fRXPat->fMaxCaptureDigits++; |
- n *= 10; |
- } |
- |
- // |
// The pattern's fFrameSize so far has accumulated the requirements for |
// storage for capture parentheses, counters, etc. that are encountered |
// in the pattern. Add space for the two variables that are always |
@@ -435,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action) |
break; |
+ case doBeginNamedCapture: |
+ // Scanning (?<letter. |
+ // The first letter of the name will come through again under doConinueNamedCapture. |
+ fCaptureName = new UnicodeString(); |
+ if (fCaptureName == NULL) { |
+ error(U_MEMORY_ALLOCATION_ERROR); |
+ } |
+ break; |
+ |
+ case doContinueNamedCapture: |
+ fCaptureName->append(fC.fChar); |
+ break; |
+ |
+ case doBadNamedCapture: |
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
+ break; |
+ |
case doOpenCaptureParen: |
- // Open Paren. |
+ // Open Capturing Paren, possibly named. |
// Compile to a |
// - NOP, which later may be replaced by a save-state if the |
// parenthesized group gets a * quantifier, followed by |
@@ -471,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action) |
// Save the mapping from group number to stack frame variable position. |
fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
+ |
+ // If this is a named capture group, add the name->group number mapping. |
+ if (fCaptureName != NULL) { |
+ int32_t groupNumber = fRXPat->fGroupMap->size(); |
+ int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); |
+ fCaptureName = NULL; // hash table takes ownership of the name (key) string. |
+ if (previousMapping > 0 && U_SUCCESS(*fStatus)) { |
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
+ } |
+ } |
} |
- break; |
+ break; |
case doOpenNonCaptureParen: |
// Open non-caputuring (grouping only) Paren. |
@@ -978,9 +1000,11 @@ UBool RegexCompile::doParseActions(int32_t action) |
{ |
int32_t digitValue = u_charDigitValue(fC.fChar); |
U_ASSERT(digitValue >= 0); |
- fIntervalLow = fIntervalLow*10 + digitValue; |
- if (fIntervalLow < 0) { |
+ int64_t val = (int64_t)fIntervalLow*10 + digitValue; |
+ if (val > INT32_MAX) { |
error(U_REGEX_NUMBER_TOO_BIG); |
+ } else { |
+ fIntervalLow = (int32_t)val; |
} |
} |
break; |
@@ -993,9 +1017,11 @@ UBool RegexCompile::doParseActions(int32_t action) |
} |
int32_t digitValue = u_charDigitValue(fC.fChar); |
U_ASSERT(digitValue >= 0); |
- fIntervalUpper = fIntervalUpper*10 + digitValue; |
- if (fIntervalUpper < 0) { |
+ int64_t val = (int64_t)fIntervalUpper*10 + digitValue; |
+ if (val > INT32_MAX) { |
error(U_REGEX_NUMBER_TOO_BIG); |
+ } else { |
+ fIntervalUpper = (int32_t)val; |
} |
} |
break; |
@@ -1162,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action) |
appendOp(URX_BACKSLASH_G, 0); |
break; |
+ case doBackslashH: |
+ fixLiterals(FALSE); |
+ appendOp(URX_BACKSLASH_H, 1); |
+ break; |
+ |
+ case doBackslashh: |
+ fixLiterals(FALSE); |
+ appendOp(URX_BACKSLASH_H, 0); |
+ break; |
+ |
+ case doBackslashR: |
+ fixLiterals(FALSE); |
+ appendOp(URX_BACKSLASH_R, 0); |
+ break; |
+ |
case doBackslashS: |
fixLiterals(FALSE); |
appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
@@ -1172,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action) |
appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
break; |
+ case doBackslashV: |
+ fixLiterals(FALSE); |
+ appendOp(URX_BACKSLASH_V, 1); |
+ break; |
+ |
+ case doBackslashv: |
+ fixLiterals(FALSE); |
+ appendOp(URX_BACKSLASH_V, 0); |
+ break; |
+ |
case doBackslashW: |
fixLiterals(FALSE); |
appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
@@ -1263,7 +1314,41 @@ UBool RegexCompile::doParseActions(int32_t action) |
} |
break; |
+ case doBeginNamedBackRef: |
+ U_ASSERT(fCaptureName == NULL); |
+ fCaptureName = new UnicodeString; |
+ if (fCaptureName == NULL) { |
+ error(U_MEMORY_ALLOCATION_ERROR); |
+ } |
+ break; |
+ |
+ case doContinueNamedBackRef: |
+ fCaptureName->append(fC.fChar); |
+ break; |
+ case doCompleteNamedBackRef: |
+ { |
+ int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName); |
+ if (groupNumber == 0) { |
+ // Group name has not been defined. |
+ // Could be a forward reference. If we choose to support them at some |
+ // future time, extra mechanism will be required at this point. |
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
+ } else { |
+ // Given the number, handle identically to a \n numbered back reference. |
+ // See comments above, under doBackRef |
+ fixLiterals(FALSE); |
+ if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
+ appendOp(URX_BACKREF_I, groupNumber); |
+ } else { |
+ appendOp(URX_BACKREF, groupNumber); |
+ } |
+ } |
+ delete fCaptureName; |
+ fCaptureName = NULL; |
+ break; |
+ } |
+ |
case doPossessivePlus: |
// Possessive ++ quantifier. |
// Compiles to |
@@ -1488,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action) |
break; |
} |
+ case doSetBackslash_h: |
+ { |
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
+ UnicodeSet h; |
+ h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); |
+ h.add((UChar32)9); // Tab |
+ set->addAll(h); |
+ break; |
+ } |
+ |
+ case doSetBackslash_H: |
+ { |
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
+ UnicodeSet h; |
+ h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); |
+ h.add((UChar32)9); // Tab |
+ h.complement(); |
+ set->addAll(h); |
+ break; |
+ } |
+ |
+ case doSetBackslash_v: |
+ { |
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
+ set->add((UChar32)0x0a, (UChar32)0x0d); // add range |
+ set->add((UChar32)0x85); |
+ set->add((UChar32)0x2028, (UChar32)0x2029); |
+ break; |
+ } |
+ |
+ case doSetBackslash_V: |
+ { |
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
+ UnicodeSet v; |
+ v.add((UChar32)0x0a, (UChar32)0x0d); // add range |
+ v.add((UChar32)0x85); |
+ v.add((UChar32)0x2028, (UChar32)0x2029); |
+ v.complement(); |
+ set->addAll(v); |
+ break; |
+ } |
+ |
case doSetBackslash_w: |
{ |
UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
@@ -2689,6 +2816,43 @@ void RegexCompile::matchStartType() { |
break; |
+ case URX_BACKSLASH_H: |
+ // Horiz white space |
+ if (currentLen == 0) { |
+ UnicodeSet s; |
+ s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); |
+ s.add((UChar32)9); // Tab |
+ if (URX_VAL(op) != 0) { |
+ s.complement(); |
+ } |
+ fRXPat->fInitialChars->addAll(s); |
+ numInitialStrings += 2; |
+ } |
+ currentLen++; |
+ atStart = FALSE; |
+ break; |
+ |
+ |
+ case URX_BACKSLASH_R: // Any line ending sequence |
+ case URX_BACKSLASH_V: // Any line ending code point, with optional negation |
+ if (currentLen == 0) { |
+ UnicodeSet s; |
+ s.add((UChar32)0x0a, (UChar32)0x0d); // add range |
+ s.add((UChar32)0x85); |
+ s.add((UChar32)0x2028, (UChar32)0x2029); |
+ if (URX_VAL(op) != 0) { |
+ // Complement option applies to URX_BACKSLASH_V only. |
+ s.complement(); |
+ } |
+ fRXPat->fInitialChars->addAll(s); |
+ numInitialStrings += 2; |
+ } |
+ currentLen++; |
+ atStart = FALSE; |
+ break; |
+ |
+ |
+ |
case URX_ONECHAR_I: |
// Case Insensitive Single Character. |
if (currentLen == 0) { |
@@ -3077,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { |
case URX_STAT_SETREF_N: |
case URX_SETREF: |
case URX_BACKSLASH_D: |
+ case URX_BACKSLASH_H: |
+ case URX_BACKSLASH_R: |
+ case URX_BACKSLASH_V: |
case URX_ONECHAR_I: |
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. |
case URX_DOTANY_ALL: // . matches one or two. |
@@ -3358,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { |
case URX_STAT_SETREF_N: |
case URX_SETREF: |
case URX_BACKSLASH_D: |
+ case URX_BACKSLASH_H: |
+ case URX_BACKSLASH_R: |
+ case URX_BACKSLASH_V: |
case URX_ONECHAR_I: |
case URX_DOTANY_ALL: |
case URX_DOTANY: |
@@ -3479,12 +3649,13 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { |
} |
U_ASSERT(loopEndLoc >= loc+4); |
- int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. |
- if (blockLen == INT32_MAX) { |
- currentLen = blockLen; |
+ int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. |
+ int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount; |
+ if (updatedLen >= INT32_MAX) { |
+ currentLen = INT32_MAX; |
break; |
} |
- currentLen += blockLen * maxLoopCount; |
+ currentLen = (int32_t)updatedLen; |
loc = loopEndLoc; |
break; |
} |
@@ -3685,6 +3856,9 @@ void RegexCompile::stripNOPs() { |
case URX_LOOP_C: |
case URX_DOLLAR_D: |
case URX_DOLLAR_MD: |
+ case URX_BACKSLASH_H: |
+ case URX_BACKSLASH_R: |
+ case URX_BACKSLASH_V: |
// These instructions are unaltered by the relocation. |
fRXPat->fCompiledPat->setElementAt(op, dst); |
dst++; |