Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(466)

Unified Diff: source/i18n/regexcmp.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regexcst.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/i18n/regexcmp.cpp
diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp
index ea01d5ab60bd28dce80a60b82a656c032a675c19..e518e84cd3520972ca2326f7fd5deb0405de9ac1 100644
--- a/source/i18n/regexcmp.cpp
+++ b/source/i18n/regexcmp.cpp
@@ -1,7 +1,7 @@
//
// file: regexcmp.cpp
//
-// Copyright (C) 2002-2014 International Business Machines Corporation and others.
+// Copyright (C) 2002-2015 International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression compiler, which is responsible
@@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
fMatchOpenParen = -1;
fMatchCloseParen = -1;
+ fCaptureName = NULL;
if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
status = rxp->fDeferredStatus;
@@ -86,6 +87,8 @@ static const UChar chDash = 0x2d; // '-'
//
//------------------------------------------------------------------------------
RegexCompile::~RegexCompile() {
+ delete fCaptureName; // Normally will be NULL, but can exist if pattern
+ // compilation stops with a syntax error.
}
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
@@ -138,6 +141,9 @@ void RegexCompile::compile(
// Prepare the RegexPattern object to receive the compiled pattern.
fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
@@ -284,17 +290,6 @@ void RegexCompile::compile(
//
//
- // Compute the number of digits requried for the largest capture group number.
- //
- fRXPat->fMaxCaptureDigits = 1;
- int32_t n = 10;
- int32_t groupCount = fRXPat->fGroupMap->size();
- while (n <= groupCount) {
- fRXPat->fMaxCaptureDigits++;
- n *= 10;
- }
-
- //
// The pattern's fFrameSize so far has accumulated the requirements for
// storage for capture parentheses, counters, etc. that are encountered
// in the pattern. Add space for the two variables that are always
@@ -435,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
+ case doBeginNamedCapture:
+ // Scanning (?<letter.
+ // The first letter of the name will come through again under doContinueNamedCapture.
+ fCaptureName = new UnicodeString();
+ if (fCaptureName == NULL) {
+ error(U_MEMORY_ALLOCATION_ERROR);
+ }
+ break;
+
+ case doContinueNamedCapture:
+ fCaptureName->append(fC.fChar);
+ break;
+
+ case doBadNamedCapture:
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+ break;
+
case doOpenCaptureParen:
- // Open Paren.
+ // Open Capturing Paren, possibly named.
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
@@ -471,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action)
// Save the mapping from group number to stack frame variable position.
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
+
+ // If this is a named capture group, add the name->group number mapping.
+ if (fCaptureName != NULL) {
+ int32_t groupNumber = fRXPat->fGroupMap->size();
+ int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
+ fCaptureName = NULL; // hash table takes ownership of the name (key) string.
+ if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+ }
+ }
}
- break;
+ break;
case doOpenNonCaptureParen:
// Open non-caputuring (grouping only) Paren.
@@ -978,9 +1000,11 @@ UBool RegexCompile::doParseActions(int32_t action)
{
int32_t digitValue = u_charDigitValue(fC.fChar);
U_ASSERT(digitValue >= 0);
- fIntervalLow = fIntervalLow*10 + digitValue;
- if (fIntervalLow < 0) {
+ int64_t val = (int64_t)fIntervalLow*10 + digitValue;
+ if (val > INT32_MAX) {
error(U_REGEX_NUMBER_TOO_BIG);
+ } else {
+ fIntervalLow = (int32_t)val;
}
}
break;
@@ -993,9 +1017,11 @@ UBool RegexCompile::doParseActions(int32_t action)
}
int32_t digitValue = u_charDigitValue(fC.fChar);
U_ASSERT(digitValue >= 0);
- fIntervalUpper = fIntervalUpper*10 + digitValue;
- if (fIntervalUpper < 0) {
+ int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
+ if (val > INT32_MAX) {
error(U_REGEX_NUMBER_TOO_BIG);
+ } else {
+ fIntervalUpper = (int32_t)val;
}
}
break;
@@ -1162,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action)
appendOp(URX_BACKSLASH_G, 0);
break;
+ case doBackslashH:
+ fixLiterals(FALSE);
+ appendOp(URX_BACKSLASH_H, 1);
+ break;
+
+ case doBackslashh:
+ fixLiterals(FALSE);
+ appendOp(URX_BACKSLASH_H, 0);
+ break;
+
+ case doBackslashR:
+ fixLiterals(FALSE);
+ appendOp(URX_BACKSLASH_R, 0);
+ break;
+
case doBackslashS:
fixLiterals(FALSE);
appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
@@ -1172,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action)
appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
break;
+ case doBackslashV:
+ fixLiterals(FALSE);
+ appendOp(URX_BACKSLASH_V, 1);
+ break;
+
+ case doBackslashv:
+ fixLiterals(FALSE);
+ appendOp(URX_BACKSLASH_V, 0);
+ break;
+
case doBackslashW:
fixLiterals(FALSE);
appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
@@ -1263,7 +1314,41 @@ UBool RegexCompile::doParseActions(int32_t action)
}
break;
+ case doBeginNamedBackRef:
+ U_ASSERT(fCaptureName == NULL);
+ fCaptureName = new UnicodeString;
+ if (fCaptureName == NULL) {
+ error(U_MEMORY_ALLOCATION_ERROR);
+ }
+ break;
+
+ case doContinueNamedBackRef:
+ fCaptureName->append(fC.fChar);
+ break;
+ case doCompleteNamedBackRef:
+ {
+ int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
+ if (groupNumber == 0) {
+ // Group name has not been defined.
+ // Could be a forward reference. If we choose to support them at some
+ // future time, extra mechanism will be required at this point.
+ error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+ } else {
+ // Given the number, handle identically to a \n numbered back reference.
+ // See comments above, under doBackRef
+ fixLiterals(FALSE);
+ if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+ appendOp(URX_BACKREF_I, groupNumber);
+ } else {
+ appendOp(URX_BACKREF, groupNumber);
+ }
+ }
+ delete fCaptureName;
+ fCaptureName = NULL;
+ break;
+ }
+
case doPossessivePlus:
// Possessive ++ quantifier.
// Compiles to
@@ -1488,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
}
+ case doSetBackslash_h:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet h;
+ h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
+ h.add((UChar32)9); // Tab
+ set->addAll(h);
+ break;
+ }
+
+ case doSetBackslash_H:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet h;
+ h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
+ h.add((UChar32)9); // Tab
+ h.complement();
+ set->addAll(h);
+ break;
+ }
+
+ case doSetBackslash_v:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ set->add((UChar32)0x0a, (UChar32)0x0d); // add range
+ set->add((UChar32)0x85);
+ set->add((UChar32)0x2028, (UChar32)0x2029);
+ break;
+ }
+
+ case doSetBackslash_V:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet v;
+ v.add((UChar32)0x0a, (UChar32)0x0d); // add range
+ v.add((UChar32)0x85);
+ v.add((UChar32)0x2028, (UChar32)0x2029);
+ v.complement();
+ set->addAll(v);
+ break;
+ }
+
case doSetBackslash_w:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
@@ -2689,6 +2816,43 @@ void RegexCompile::matchStartType() {
break;
+ case URX_BACKSLASH_H:
+ // Horiz white space
+ if (currentLen == 0) {
+ UnicodeSet s;
+ s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
+ s.add((UChar32)9); // Tab
+ if (URX_VAL(op) != 0) {
+ s.complement();
+ }
+ fRXPat->fInitialChars->addAll(s);
+ numInitialStrings += 2;
+ }
+ currentLen++;
+ atStart = FALSE;
+ break;
+
+
+ case URX_BACKSLASH_R: // Any line ending sequence
+ case URX_BACKSLASH_V: // Any line ending code point, with optional negation
+ if (currentLen == 0) {
+ UnicodeSet s;
+ s.add((UChar32)0x0a, (UChar32)0x0d); // add range
+ s.add((UChar32)0x85);
+ s.add((UChar32)0x2028, (UChar32)0x2029);
+ if (URX_VAL(op) != 0) {
+ // Complement option applies to URX_BACKSLASH_V only.
+ s.complement();
+ }
+ fRXPat->fInitialChars->addAll(s);
+ numInitialStrings += 2;
+ }
+ currentLen++;
+ atStart = FALSE;
+ break;
+
+
+
case URX_ONECHAR_I:
// Case Insensitive Single Character.
if (currentLen == 0) {
@@ -3077,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
+ case URX_BACKSLASH_H:
+ case URX_BACKSLASH_R:
+ case URX_BACKSLASH_V:
case URX_ONECHAR_I:
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
@@ -3358,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
+ case URX_BACKSLASH_H:
+ case URX_BACKSLASH_R:
+ case URX_BACKSLASH_V:
case URX_ONECHAR_I:
case URX_DOTANY_ALL:
case URX_DOTANY:
@@ -3479,12 +3649,13 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
}
U_ASSERT(loopEndLoc >= loc+4);
- int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
- if (blockLen == INT32_MAX) {
- currentLen = blockLen;
+ int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
+ int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
+ if (updatedLen >= INT32_MAX) {
+ currentLen = INT32_MAX;
break;
}
- currentLen += blockLen * maxLoopCount;
+ currentLen = (int32_t)updatedLen;
loc = loopEndLoc;
break;
}
@@ -3685,6 +3856,9 @@ void RegexCompile::stripNOPs() {
case URX_LOOP_C:
case URX_DOLLAR_D:
case URX_DOLLAR_MD:
+ case URX_BACKSLASH_H:
+ case URX_BACKSLASH_R:
+ case URX_BACKSLASH_V:
// These instructions are unaltered by the relocation.
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regexcst.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698