Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(112)

Unified Diff: source/i18n/uregex.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/i18n/unum.cpp ('k') | source/i18n/uregion.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/i18n/uregex.cpp
diff --git a/source/i18n/uregex.cpp b/source/i18n/uregex.cpp
index 01951234b9c0fead09307ee16038722ff28517bf..99e94283816cd2f6eec8848dea657fa4198ec52d 100644
--- a/source/i18n/uregex.cpp
+++ b/source/i18n/uregex.cpp
@@ -1,6 +1,6 @@
/*
*******************************************************************************
-* Copyright (C) 2004-2014, International Business Machines
+* Copyright (C) 2004-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uregex.cpp
@@ -17,14 +17,14 @@
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "unicode/utf16.h"
-#include "umutex.h"
-#include "uassert.h"
#include "cmemory.h"
+#include "uassert.h"
+#include "uhash.h"
+#include "umutex.h"
+#include "uvectr32.h"
#include "regextxt.h"
-#include <stdio.h>
-
U_NAMESPACE_BEGIN
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) // (len - idx) clamped to a minimum of 0; note that each argument is expanded more than once
@@ -627,6 +627,36 @@ uregex_groupCount(URegularExpression *regexp2,
//------------------------------------------------------------------------------
//
+// uregex_groupNumberFromName
+// Looks up a capture group by a name given as UChar text, via fPat->groupNumberFromName(); returns 0 if the handle fails validateRE().
+//------------------------------------------------------------------------------
+int32_t
+uregex_groupNumberFromName(URegularExpression *regexp2, // opaque handle; cast to RegularExpression below
+ const UChar *groupName, // group name text
+ int32_t nameLength, // passed with groupName to the UnicodeString constructor
+ UErrorCode *status) { // error code pointer, forwarded to validateRE() and to the lookup
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status) == FALSE) { // NOTE(review): FALSE presumably means "no input text required" -- confirm against validateRE()
+ return 0;
+ }
+ int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); // wrap (groupName, nameLength) in a UnicodeString for the lookup
+ return result;
+}
+
+int32_t // result of fPat->groupNumberFromName(), or 0 if the handle fails validateRE()
+uregex_groupNumberFromCName(URegularExpression *regexp2, // opaque handle; cast to RegularExpression below
+ const char *groupName, // group name as char text (contrast with the UChar variant uregex_groupNumberFromName)
+ int32_t nameLength, // forwarded unchanged together with groupName
+ UErrorCode *status) { // error code pointer, forwarded to validateRE() and to the lookup
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status) == FALSE) { // NOTE(review): FALSE presumably means "no input text required" -- confirm against validateRE()
+ return 0;
+ }
+ return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); // char* form of the lookup; no UnicodeString is built here
+}
+
+//------------------------------------------------------------------------------
+//
// uregex_group
//
//------------------------------------------------------------------------------
@@ -647,7 +677,7 @@ uregex_group(URegularExpression *regexp2,
if (destCapacity == 0 || regexp->fText != NULL) {
// If preflighting or if we already have the text as UChars,
- // this is a little cheaper than going through uregex_groupUTextDeep()
+ // this is a little cheaper than extracting from the UText
//
// Pick up the range of characters from the matcher
@@ -680,14 +710,18 @@ uregex_group(URegularExpression *regexp2,
}
return fullLength;
} else {
- int32_t result = 0;
- UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
- if (U_SUCCESS(*status)) {
- result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
+ int64_t start = regexp->fMatcher->start64(groupNum, *status);
+ int64_t limit = regexp->fMatcher->end64(groupNum, *status);
+ if (U_FAILURE(*status)) {
+ return 0;
}
- utext_close(groupText);
- return result;
+ // Note edge cases:
+ // Group didn't match: start == end == -1. utext_extract() pins both to 0 and gives a zero-length result.
+ // Zero Length Match: start == end.
+ int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
+ return length;
}
+
}
@@ -713,49 +747,6 @@ uregex_groupUText(URegularExpression *regexp2,
//------------------------------------------------------------------------------
//
-// uregex_groupUTextDeep
-// (Removed by this patch.) Returned capture group groupNum as a UText: written into dest when dest is non-NULL, otherwise into a newly opened or cloned UText.
-//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
-uregex_groupUTextDeep(URegularExpression *regexp2, // opaque handle; cast to RegularExpression below
- int32_t groupNum, // capture group index handed to the matcher
- UText *dest, // optional destination; NULL makes the function create the result
- UErrorCode *status) { // error code pointer, forwarded to validateRE(), the matcher and the utext_* calls
- RegularExpression *regexp = (RegularExpression*)regexp2;
- if (validateRE(regexp, TRUE, status) == FALSE) {
- UErrorCode emptyTextStatus = U_ZERO_ERROR; // separate status so opening the fallback does not touch *status
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); // on failure: caller's dest as-is, else a UText over zero UChars
- }
-
- if (regexp->fText != NULL) {
- //
- // Pick up the range of characters from the matcher
- // and use our already-extracted characters
- //
- int32_t startIx = regexp->fMatcher->start(groupNum, *status);
- int32_t endIx = regexp->fMatcher->end (groupNum, *status);
- if (U_FAILURE(*status)) {
- UErrorCode emptyTextStatus = U_ZERO_ERROR;
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); // same fallback as the validation failure above
- }
-
- if (dest) {
- utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status); // overwrite the whole of dest with the group's UChars
- } else {
- UText groupText = UTEXT_INITIALIZER; // temporary UText over the group's slice of fText
- utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
- dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); // NOTE(review): TRUE/FALSE should be utext_clone's deep/readOnly arguments -- confirm against utext.h
- utext_close(&groupText); // the temporary is closed once cloned
- }
-
- return dest;
- } else {
- return regexp->fMatcher->group(groupNum, dest, *status); // no cached UChar text: let the matcher produce the group
- }
-}
-
-//------------------------------------------------------------------------------
-//
// uregex_start
//
//------------------------------------------------------------------------------
@@ -1324,6 +1315,8 @@ U_NAMESPACE_END
static const UChar BACKSLASH = 0x5c; // U+005C REVERSE SOLIDUS
static const UChar DOLLARSIGN = 0x24; // U+0024 '$'
+static const UChar LEFTBRACKET = 0x7b; // U+007B '{' -- a curly brace despite the name; opens ${name} in replacement text
+static const UChar RIGHTBRACKET = 0x7d; // U+007D '}' -- a curly brace despite the name; closes ${name}
//
// Move a character to an output buffer, with bounds checking on the index.
@@ -1398,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
matchStart = (int32_t)m->fMatchStart;
} else {
// !!!: Would like a better way to do this!
- UErrorCode status = U_ZERO_ERROR;
- lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
- status = U_ZERO_ERROR;
- matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
+ UErrorCode tempStatus = U_ZERO_ERROR;
+ lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
+ tempStatus = U_ZERO_ERROR;
+ matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
}
for (i=lastMatchEnd; i<matchStart; i++) {
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
@@ -1416,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
// scan the replacement text, looking for substitutions ($n) and \escapes.
int32_t replIdx = 0;
- while (replIdx < replacementLength) {
+ while (replIdx < replacementLength && U_SUCCESS(*status)) {
UChar c = replacementText[replIdx];
replIdx++;
if (c != DOLLARSIGN && c != BACKSLASH) {
@@ -1465,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
continue;
}
+ // We've got a $. Pick up the following capture group name or number.
+ // For numbers, consume only digits that produce a valid capture group for the pattern.
-
- // We've got a $. Pick up a capture group number if one follows.
- // Consume at most the number of digits necessary for the largest capture
- // number that is valid for this pattern.
-
- int32_t numDigits = 0;
int32_t groupNum = 0;
- UChar32 digitC;
- for (;;) {
- if (replIdx >= replacementLength) {
- break;
- }
- U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
- if (u_isdigit(digitC) == FALSE) {
- break;
- }
+ U_ASSERT(c == DOLLARSIGN);
+ UChar32 c32;
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ if (u_isdigit(c32)) {
+ int32_t numDigits = 0;
+ int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
+ for (;;) {
+ if (replIdx >= replacementLength) {
+ break;
+ }
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ if (u_isdigit(c32) == FALSE) {
+ break;
+ }
+ int32_t digitVal = u_charDigitValue(c32);
+ if (groupNum * 10 + digitVal <= numCaptureGroups) {
+ groupNum = groupNum * 10 + digitVal;
+ U16_FWD_1(replacementText, replIdx, replacementLength);
+ numDigits++;
+ } else {
+ if (numDigits == 0) {
+ *status = U_INDEX_OUTOFBOUNDS_ERROR;
+ }
+ break;
+ }
+ }
+ } else if (c32 == LEFTBRACKET) {
+ // Scan for Named Capture Group, ${name}.
+ UnicodeString groupName;
U16_FWD_1(replacementText, replIdx, replacementLength);
- groupNum=groupNum*10 + u_charDigitValue(digitC);
- numDigits++;
- if (numDigits >= m->fPattern->fMaxCaptureDigits) {
- break;
+ while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
+ if (replIdx >= replacementLength) {
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ break;
+ }
+ U16_NEXT(replacementText, replIdx, replacementLength, c32);
+ if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
+ (c32 >= 0x61 && c32 <= 0x7a) || // a..z
+ (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 intended -- NOTE(review): 0x31 is '1', so '0' (0x30) is rejected; lower bound should be 0x30
+ groupName.append(c32);
+ } else if (c32 == RIGHTBRACKET) {
+ groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
+ if (groupNum == 0) {
+ // Name not defined by pattern.
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ }
+ } else {
+ // Character was something other than a name char or a closing '}'
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ }
}
+ } else {
+ // $ not followed by {name} or digits.
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
- if (numDigits == 0) {
- // The $ didn't introduce a group number at all.
- // Treat it as just part of the substitution text.
- appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
- continue;
- }
-
// Finally, append the capture group data to the destination.
- destIdx += uregex_group((URegularExpression*)regexp, groupNum,
- dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- // Ignore buffer overflow when extracting the group. We need to
- // continue on to get full size of the untruncated result. We will
- // raise our own buffer overflow error at the end.
- *status = U_ZERO_ERROR;
+ if (U_SUCCESS(*status)) {
+ destIdx += uregex_group((URegularExpression*)regexp, groupNum,
+ dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
+ if (*status == U_BUFFER_OVERFLOW_ERROR) {
+ // Ignore buffer overflow when extracting the group. We need to
+ // continue on to get full size of the untruncated result. We will
+ // raise our own buffer overflow error at the end.
+ *status = U_ZERO_ERROR;
+ }
}
if (U_FAILURE(*status)) {
- // Can fail if group number is out of range.
+ // bad group number or name.
break;
}
-
}
//
@@ -1522,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
//
if (destIdx < capacity) {
dest[destIdx] = 0;
- } else if (destIdx == *destCapacity) {
- *status = U_STRING_NOT_TERMINATED_WARNING;
- } else {
- *status = U_BUFFER_OVERFLOW_ERROR;
+ } else if (U_SUCCESS(*status)) {
+ if (destIdx == *destCapacity) {
+ *status = U_STRING_NOT_TERMINATED_WARNING;
+ } else {
+ *status = U_BUFFER_OVERFLOW_ERROR;
+ }
}
//
« no previous file with comments | « source/i18n/unum.cpp ('k') | source/i18n/uregion.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698