Index: source/i18n/uregex.cpp |
diff --git a/source/i18n/uregex.cpp b/source/i18n/uregex.cpp |
index 01951234b9c0fead09307ee16038722ff28517bf..99e94283816cd2f6eec8848dea657fa4198ec52d 100644 |
--- a/source/i18n/uregex.cpp |
+++ b/source/i18n/uregex.cpp |
@@ -1,6 +1,6 @@ |
/* |
******************************************************************************* |
-* Copyright (C) 2004-2014, International Business Machines |
+* Copyright (C) 2004-2015, International Business Machines |
* Corporation and others. All Rights Reserved. |
******************************************************************************* |
* file name: uregex.cpp |
@@ -17,14 +17,14 @@ |
#include "unicode/uchar.h" |
#include "unicode/uobject.h" |
#include "unicode/utf16.h" |
-#include "umutex.h" |
-#include "uassert.h" |
#include "cmemory.h" |
+#include "uassert.h" |
+#include "uhash.h" |
+#include "umutex.h" |
+#include "uvectr32.h" |
#include "regextxt.h" |
-#include <stdio.h> |
- |
U_NAMESPACE_BEGIN |
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) |
@@ -627,6 +627,36 @@ uregex_groupCount(URegularExpression *regexp2, |
//------------------------------------------------------------------------------ |
// |
+// uregex_groupNumberFromName |
+// |
+//------------------------------------------------------------------------------ |
+int32_t |
+uregex_groupNumberFromName(URegularExpression *regexp2, |
+ const UChar *groupName, |
+ int32_t nameLength, |
+ UErrorCode *status) { |
+ RegularExpression *re = (RegularExpression*)regexp2; |
+ if (!validateRE(re, FALSE, status)) { |
+ return 0; // Validation failed: report 0 without consulting the pattern. |
+ } |
+ // Wrap the UChar name in a UnicodeString and let the compiled pattern map it to a group number. |
+ return re->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); |
+} |
+ |
+int32_t |
+uregex_groupNumberFromCName(URegularExpression *regexp2, |
+ const char *groupName, |
+ int32_t nameLength, |
+ UErrorCode *status) { |
+ RegularExpression *re = (RegularExpression*)regexp2; |
+ const UBool usable = validateRE(re, FALSE, status); |
+ // Forward the char* name to the compiled pattern only when validation passed; |
+ // otherwise report 0 without looking the name up. |
+ return usable ? re->fPat->groupNumberFromName(groupName, nameLength, *status) : 0; |
+} |
+ |
+//------------------------------------------------------------------------------ |
+// |
// uregex_group |
// |
//------------------------------------------------------------------------------ |
@@ -647,7 +677,7 @@ uregex_group(URegularExpression *regexp2, |
if (destCapacity == 0 || regexp->fText != NULL) { |
// If preflighting or if we already have the text as UChars, |
- // this is a little cheaper than going through uregex_groupUTextDeep() |
+ // this is a little cheaper than extracting from the UText |
// |
// Pick up the range of characters from the matcher |
@@ -680,14 +710,18 @@ uregex_group(URegularExpression *regexp2, |
} |
return fullLength; |
} else { |
- int32_t result = 0; |
- UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); |
- if (U_SUCCESS(*status)) { |
- result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); |
+ int64_t start = regexp->fMatcher->start64(groupNum, *status); |
+ int64_t limit = regexp->fMatcher->end64(groupNum, *status); |
+ if (U_FAILURE(*status)) { |
+ return 0; |
} |
- utext_close(groupText); |
- return result; |
+ // Note edge cases: |
+ // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. |
+ // Zero Length Match: start == end. |
+ int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); |
+ return length; |
} |
+ |
} |
@@ -713,49 +747,6 @@ uregex_groupUText(URegularExpression *regexp2, |
//------------------------------------------------------------------------------ |
// |
-// uregex_groupUTextDeep |
-// |
-//------------------------------------------------------------------------------ |
-U_CAPI UText * U_EXPORT2 |
-uregex_groupUTextDeep(URegularExpression *regexp2, |
- int32_t groupNum, |
- UText *dest, |
- UErrorCode *status) { |
- RegularExpression *regexp = (RegularExpression*)regexp2; |
- if (validateRE(regexp, TRUE, status) == FALSE) { |
- UErrorCode emptyTextStatus = U_ZERO_ERROR; |
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); |
- } |
- |
- if (regexp->fText != NULL) { |
- // |
- // Pick up the range of characters from the matcher |
- // and use our already-extracted characters |
- // |
- int32_t startIx = regexp->fMatcher->start(groupNum, *status); |
- int32_t endIx = regexp->fMatcher->end (groupNum, *status); |
- if (U_FAILURE(*status)) { |
- UErrorCode emptyTextStatus = U_ZERO_ERROR; |
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); |
- } |
- |
- if (dest) { |
- utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); |
- } else { |
- UText groupText = UTEXT_INITIALIZER; |
- utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); |
- dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); |
- utext_close(&groupText); |
- } |
- |
- return dest; |
- } else { |
- return regexp->fMatcher->group(groupNum, dest, *status); |
- } |
-} |
- |
-//------------------------------------------------------------------------------ |
-// |
// uregex_start |
// |
//------------------------------------------------------------------------------ |
@@ -1324,6 +1315,8 @@ U_NAMESPACE_END |
static const UChar BACKSLASH = 0x5c; |
static const UChar DOLLARSIGN = 0x24; |
+static const UChar LEFTBRACKET = 0x7b; // U+007B '{' (curly brace): opens a ${name} group reference in replacement text |
+static const UChar RIGHTBRACKET = 0x7d; // U+007D '}' (curly brace): closes a ${name} group reference |
// |
// Move a character to an output buffer, with bounds checking on the index. |
@@ -1398,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
matchStart = (int32_t)m->fMatchStart; |
} else { |
// !!!: Would like a better way to do this! |
- UErrorCode status = U_ZERO_ERROR; |
- lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); |
- status = U_ZERO_ERROR; |
- matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); |
+ UErrorCode tempStatus = U_ZERO_ERROR; |
+ lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus); |
+ tempStatus = U_ZERO_ERROR; |
+ matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus); |
} |
for (i=lastMatchEnd; i<matchStart; i++) { |
appendToBuf(regexp->fText[i], &destIdx, dest, capacity); |
@@ -1416,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
// scan the replacement text, looking for substitutions ($n) and \escapes. |
int32_t replIdx = 0; |
- while (replIdx < replacementLength) { |
+ while (replIdx < replacementLength && U_SUCCESS(*status)) { |
UChar c = replacementText[replIdx]; |
replIdx++; |
if (c != DOLLARSIGN && c != BACKSLASH) { |
@@ -1465,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
continue; |
} |
+ // We've got a $. Pick up the following capture group name (${name}) or number ($n). |
+ // For numbers, consume only digits that produce a valid capture group for the pattern. |
- |
- // We've got a $. Pick up a capture group number if one follows. |
- // Consume at most the number of digits necessary for the largest capture |
- // number that is valid for this pattern. |
- |
- int32_t numDigits = 0; |
int32_t groupNum = 0; |
- UChar32 digitC; |
- for (;;) { |
- if (replIdx >= replacementLength) { |
- break; |
- } |
- U16_GET(replacementText, 0, replIdx, replacementLength, digitC); |
- if (u_isdigit(digitC) == FALSE) { |
- break; |
- } |
+ U_ASSERT(c == DOLLARSIGN); |
+ UChar32 c32 = -1; // Stays -1 (not a code point) when the '$' is the last unit of the replacement text. |
+ if (replIdx < replacementLength) { U16_GET(replacementText, 0, replIdx, replacementLength, c32); } // never read past the end |
+ if (u_isdigit(c32)) { |
+ int32_t numDigits = 0; |
+ int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); |
+ for (;;) { |
+ if (replIdx >= replacementLength) { |
+ break; |
+ } |
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32); |
+ if (u_isdigit(c32) == FALSE) { |
+ break; |
+ } |
+ int32_t digitVal = u_charDigitValue(c32); |
+ if (groupNum * 10 + digitVal <= numCaptureGroups) { |
+ groupNum = groupNum * 10 + digitVal; |
+ U16_FWD_1(replacementText, replIdx, replacementLength); |
+ numDigits++; |
+ } else { |
+ if (numDigits == 0) { |
+ *status = U_INDEX_OUTOFBOUNDS_ERROR; // Even the first digit exceeds the pattern's group count. |
+ } |
+ break; // This digit is not consumed as part of the group number. |
+ } |
+ } |
+ } else if (c32 == LEFTBRACKET) { |
+ // Scan for Named Capture Group, ${name}. Name characters are limited to A-Z, a-z and 0-9. |
+ UnicodeString groupName; |
U16_FWD_1(replacementText, replIdx, replacementLength); |
- groupNum=groupNum*10 + u_charDigitValue(digitC); |
- numDigits++; |
- if (numDigits >= m->fPattern->fMaxCaptureDigits) { |
- break; |
+ while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { |
+ if (replIdx >= replacementLength) { |
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; // Text ended before the closing '}'. |
+ break; |
+ } |
+ U16_NEXT(replacementText, replIdx, replacementLength, c32); |
+ if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z |
+ (c32 >= 0x61 && c32 <= 0x7a) || // a..z |
+ (c32 >= 0x30 && c32 <= 0x39)) { // 0..9 (lower bound is 0x30 so that '0' is accepted) |
+ groupName.append(c32); |
+ } else if (c32 == RIGHTBRACKET) { |
+ groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); |
+ if (groupNum == 0) { |
+ // Name not defined by pattern. |
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
+ } |
+ } else { |
+ // Character was something other than a name char or a closing '}' |
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
+ } |
} |
+ } else { |
+ // $ not followed by {name} or digits (this includes a '$' that ends the replacement text). |
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
} |
- if (numDigits == 0) { |
- // The $ didn't introduce a group number at all. |
- // Treat it as just part of the substitution text. |
- appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); |
- continue; |
- } |
- |
// Finally, append the capture group data to the destination. |
- destIdx += uregex_group((URegularExpression*)regexp, groupNum, |
- dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); |
- if (*status == U_BUFFER_OVERFLOW_ERROR) { |
- // Ignore buffer overflow when extracting the group. We need to |
- // continue on to get full size of the untruncated result. We will |
- // raise our own buffer overflow error at the end. |
- *status = U_ZERO_ERROR; |
+ if (U_SUCCESS(*status)) { |
+ destIdx += uregex_group((URegularExpression*)regexp, groupNum, |
+ dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); |
+ if (*status == U_BUFFER_OVERFLOW_ERROR) { |
+ // Ignore buffer overflow when extracting the group. We need to |
+ // continue on to get full size of the untruncated result. We will |
+ // raise our own buffer overflow error at the end. |
+ *status = U_ZERO_ERROR; |
+ } |
} |
if (U_FAILURE(*status)) { |
- // Can fail if group number is out of range. |
+ // bad group number or name. |
break; |
} |
- |
} |
// |
@@ -1522,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
// |
if (destIdx < capacity) { |
dest[destIdx] = 0; |
- } else if (destIdx == *destCapacity) { |
- *status = U_STRING_NOT_TERMINATED_WARNING; |
- } else { |
- *status = U_BUFFER_OVERFLOW_ERROR; |
+ } else if (U_SUCCESS(*status)) { |
+ if (destIdx == *destCapacity) { |
+ *status = U_STRING_NOT_TERMINATED_WARNING; |
+ } else { |
+ *status = U_BUFFER_OVERFLOW_ERROR; |
+ } |
} |
// |