| OLD | NEW |
| 1 /* | 1 /* |
| 2 ************************************************************************** | 2 ************************************************************************** |
| 3 * Copyright (C) 2002-2007 International Business Machines Corporation * | 3 * Copyright (C) 2002-2008 International Business Machines Corporation * |
| 4 * and others. All rights reserved. * | 4 * and others. All rights reserved. * |
| 5 ************************************************************************** | 5 ************************************************************************** |
| 6 */ | 6 */ |
| 7 // | 7 // |
| 8 // file: rematch.cpp | 8 // file: rematch.cpp |
| 9 // | 9 // |
| 10 // Contains the implementation of class RegexMatcher, | 10 // Contains the implementation of class RegexMatcher, |
| 11 // which is one of the main API classes for the ICU regular expression p
ackage. | 11 // which is one of the main API classes for the ICU regular expression p
ackage. |
| 12 // | 12 // |
| 13 | 13 |
| 14 #include "unicode/utypes.h" | 14 #include "unicode/utypes.h" |
| 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 16 | 16 |
| 17 #include "unicode/regex.h" | 17 #include "unicode/regex.h" |
| 18 #include "unicode/uniset.h" | 18 #include "unicode/uniset.h" |
| 19 #include "unicode/uchar.h" | 19 #include "unicode/uchar.h" |
| 20 #include "unicode/ustring.h" | 20 #include "unicode/ustring.h" |
| 21 #include "unicode/rbbi.h" | 21 #include "unicode/rbbi.h" |
| 22 #include "uassert.h" | 22 #include "uassert.h" |
| 23 #include "cmemory.h" | 23 #include "cmemory.h" |
| 24 #include "uvector.h" | 24 #include "uvector.h" |
| 25 #include "uvectr32.h" | 25 #include "uvectr32.h" |
| 26 #include "regeximp.h" | 26 #include "regeximp.h" |
| 27 #include "regexst.h" | 27 #include "regexst.h" |
| 28 | 28 |
| 29 // #include <malloc.h> // Needed for heapcheck testing | 29 // #include <malloc.h> // Needed for heapcheck testing |
| 30 | 30 |
| 31 U_NAMESPACE_BEGIN | 31 U_NAMESPACE_BEGIN |
| 32 | 32 |
| 33 // Limit the size of the back track stack, to avoid system failures caused |
| 34 // by heap exhaustion. Units are in 32 bit words, not bytes. |
| 35 // This value puts ICU's limits higher than most other regexp implementations, |
| 36 // which use recursion rather than the heap, and take more storage per |
| 37 // backtrack point. |
| 38 // This constant is _temporary_. Proper API to control the value will added. |
| 39 // |
| 40 static const int32_t BACKTRACK_STACK_CAPACITY = 8000000; |
| 41 |
| 33 //----------------------------------------------------------------------------- | 42 //----------------------------------------------------------------------------- |
| 34 // | 43 // |
| 35 // Constructor and Destructor | 44 // Constructor and Destructor |
| 36 // | 45 // |
| 37 //----------------------------------------------------------------------------- | 46 //----------------------------------------------------------------------------- |
| 38 RegexMatcher::RegexMatcher(const RegexPattern *pat) { | 47 RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
| 39 fPattern = pat; | 48 fPattern = pat; |
| 40 fPatternOwned = NULL; | 49 fPatternOwned = NULL; |
| 41 fInput = NULL; | 50 fInput = NULL; |
| 42 fTraceDebug = FALSE; | 51 fTraceDebug = FALSE; |
| 43 fDeferredStatus = U_ZERO_ERROR; | 52 fDeferredStatus = U_ZERO_ERROR; |
| 44 fStack = new UVector32(fDeferredStatus); | 53 fStack = new UVector32(fDeferredStatus); |
| 45 fData = fSmallData; | 54 fData = fSmallData; |
| 46 fWordBreakItr = NULL; | 55 fWordBreakItr = NULL; |
| 47 if (pat==NULL) { | 56 if (pat==NULL) { |
| 48 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; | 57 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
| 49 return; | 58 return; |
| 50 } | 59 } |
| 51 if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { | 60 if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { |
| 52 fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); | 61 fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); |
| 53 } | 62 } |
| 54 if (fStack == NULL || fData == NULL) { | 63 if (fStack == NULL || fData == NULL) { |
| 55 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 64 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 65 } else { |
| 66 fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); |
| 56 } | 67 } |
| 57 | |
| 58 reset(RegexStaticSets::gStaticSets->fEmptyString); | 68 reset(RegexStaticSets::gStaticSets->fEmptyString); |
| 59 } | 69 } |
| 60 | 70 |
| 61 | 71 |
| 62 | 72 |
| 63 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp
ut, | 73 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp
ut, |
| 64 uint32_t flags, UErrorCode &status) { | 74 uint32_t flags, UErrorCode &status) { |
| 65 UParseError pe; | 75 UParseError pe; |
| 66 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 76 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 67 fPattern = fPatternOwned; | 77 fPattern = fPatternOwned; |
| 68 fTraceDebug = FALSE; | 78 fTraceDebug = FALSE; |
| 69 fDeferredStatus = U_ZERO_ERROR; | 79 fDeferredStatus = U_ZERO_ERROR; |
| 70 fStack = new UVector32(status); | 80 fStack = new UVector32(status); |
| 71 fData = fSmallData; | 81 fData = fSmallData; |
| 72 fWordBreakItr = NULL; | 82 fWordBreakItr = NULL; |
| 73 if (U_FAILURE(status)) { | 83 if (U_FAILURE(status)) { |
| 74 return; | 84 return; |
| 75 } | 85 } |
| 76 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { | 86 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { |
| 77 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); | 87 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); |
| 78 } | 88 } |
| 79 if (fStack == NULL || fData == NULL) { | 89 if (fStack == NULL || fData == NULL) { |
| 80 status = U_MEMORY_ALLOCATION_ERROR; | 90 status = U_MEMORY_ALLOCATION_ERROR; |
| 91 } else { |
| 92 fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); |
| 81 } | 93 } |
| 82 reset(input); | 94 reset(input); |
| 83 } | 95 } |
| 84 | 96 |
| 85 | 97 |
| 86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, | 98 RegexMatcher::RegexMatcher(const UnicodeString ®exp, |
| 87 uint32_t flags, UErrorCode &status) { | 99 uint32_t flags, UErrorCode &status) { |
| 88 UParseError pe; | 100 UParseError pe; |
| 89 fTraceDebug = FALSE; | 101 fTraceDebug = FALSE; |
| 90 fDeferredStatus = U_ZERO_ERROR; | 102 fDeferredStatus = U_ZERO_ERROR; |
| 91 fStack = new UVector32(status); | 103 fStack = new UVector32(status); |
| 92 fData = fSmallData; | 104 fData = fSmallData; |
| 93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 105 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 94 fPattern = fPatternOwned; | 106 fPattern = fPatternOwned; |
| 95 fWordBreakItr = NULL; | 107 fWordBreakItr = NULL; |
| 96 if (U_FAILURE(status)) { | 108 if (U_FAILURE(status)) { |
| 97 return; | 109 return; |
| 98 } | 110 } |
| 99 | 111 |
| 100 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { | 112 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { |
| 101 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); | 113 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); |
| 102 } | 114 } |
| 103 if (fStack == NULL || fData == NULL) { | 115 if (fStack == NULL || fData == NULL) { |
| 104 status = U_MEMORY_ALLOCATION_ERROR; | 116 status = U_MEMORY_ALLOCATION_ERROR; |
| 117 } else { |
| 118 fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); |
| 105 } | 119 } |
| 106 reset(RegexStaticSets::gStaticSets->fEmptyString); | 120 reset(RegexStaticSets::gStaticSets->fEmptyString); |
| 107 } | 121 } |
| 108 | 122 |
| 109 | 123 |
| 110 | 124 |
| 111 RegexMatcher::~RegexMatcher() { | 125 RegexMatcher::~RegexMatcher() { |
| 112 delete fStack; | 126 delete fStack; |
| 113 if (fData != fSmallData) { | 127 if (fData != fSmallData) { |
| 114 uprv_free(fData); | 128 uprv_free(fData); |
| (...skipping 892 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1007 // in the opcode. Execution of the engine continues with the state in | 1021 // in the opcode. Execution of the engine continues with the state in |
| 1008 // the newly created stack frame | 1022 // the newly created stack frame |
| 1009 // | 1023 // |
| 1010 // Note that reserveBlock() may grow the stack, resulting in the | 1024 // Note that reserveBlock() may grow the stack, resulting in the |
| 1011 // whole thing being relocated in memory. | 1025 // whole thing being relocated in memory. |
| 1012 // | 1026 // |
| 1013 //------------------------------------------------------------------------------
-- | 1027 //------------------------------------------------------------------------------
-- |
| 1014 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatId
x, int32_t frameSize, UErrorCode &status) { | 1028 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatId
x, int32_t frameSize, UErrorCode &status) { |
| 1015 // push storage for a new frame. | 1029 // push storage for a new frame. |
| 1016 int32_t *newFP = fStack->reserveBlock(frameSize, status); | 1030 int32_t *newFP = fStack->reserveBlock(frameSize, status); |
| 1031 if (newFP == NULL) { |
| 1032 // Heap allocation error on attempted stack expansion. |
| 1033 // We need to return a writable stack frame, so just return the |
| 1034 // previous frame. The match operation will stop quickly |
| 1035 // becuase of the error status, after which the frame will never |
| 1036 // be looked at again. |
| 1037 return fp; |
| 1038 } |
| 1017 fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. | 1039 fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. |
| 1018 | 1040 |
| 1019 // New stack frame = copy of old top frame. | 1041 // New stack frame = copy of old top frame. |
| 1020 int32_t *source = (int32_t *)fp; | 1042 int32_t *source = (int32_t *)fp; |
| 1021 int32_t *dest = newFP; | 1043 int32_t *dest = newFP; |
| 1022 for (;;) { | 1044 for (;;) { |
| 1023 *dest++ = *source++; | 1045 *dest++ = *source++; |
| 1024 if (source == newFP) { | 1046 if (source == newFP) { |
| 1025 break; | 1047 break; |
| 1026 } | 1048 } |
| 1027 } | 1049 } |
| 1028 | 1050 |
| 1029 fp->fPatIdx = savePatIdx; | 1051 fp->fPatIdx = savePatIdx; |
| 1030 return (REStackFrame *)newFP; | 1052 return (REStackFrame *)newFP; |
| 1031 } | 1053 } |
| 1032 | 1054 |
| 1033 | 1055 |
| 1034 //------------------------------------------------------------------------------
-- | 1056 //------------------------------------------------------------------------------
-- |
| 1035 // | 1057 // |
| 1036 // MatchAt This is the actual matching engine. | 1058 // MatchAt This is the actual matching engine. |
| 1037 // | 1059 // |
| 1038 //------------------------------------------------------------------------------
-- | 1060 //------------------------------------------------------------------------------
-- |
| 1039 void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { | 1061 void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { |
| 1040 UBool isMatch = FALSE; // True if the we have a match. | 1062 UBool isMatch = FALSE; // True if the we have a match. |
| 1041 | 1063 |
| 1042 int32_t op; // Operation from the compiled pattern, s
plit into | 1064 int32_t op; // Operation from the compiled pattern, s
plit into |
| 1043 int32_t opType; // the opcode | 1065 int32_t opType; // the opcode |
| (...skipping 1210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2254 | 2276 |
| 2255 | 2277 |
| 2256 | 2278 |
| 2257 default: | 2279 default: |
| 2258 // Trouble. The compiled pattern contains an entry with an | 2280 // Trouble. The compiled pattern contains an entry with an |
| 2259 // unrecognized type tag. | 2281 // unrecognized type tag. |
| 2260 U_ASSERT(FALSE); | 2282 U_ASSERT(FALSE); |
| 2261 } | 2283 } |
| 2262 | 2284 |
| 2263 if (U_FAILURE(status)) { | 2285 if (U_FAILURE(status)) { |
| 2286 isMatch = FALSE; |
| 2264 break; | 2287 break; |
| 2265 } | 2288 } |
| 2266 } | 2289 } |
| 2267 | 2290 |
| 2268 breakFromLoop: | 2291 breakFromLoop: |
| 2269 fMatch = isMatch; | 2292 fMatch = isMatch; |
| 2270 if (isMatch) { | 2293 if (isMatch) { |
| 2271 fLastMatchEnd = fMatchEnd; | 2294 fLastMatchEnd = fMatchEnd; |
| 2272 fMatchStart = startIdx; | 2295 fMatchStart = startIdx; |
| 2273 fMatchEnd = fp->fInputIdx; | 2296 fMatchEnd = fp->fInputIdx; |
| (...skipping 16 matching lines...) Expand all Loading... |
| 2290 } | 2313 } |
| 2291 | 2314 |
| 2292 | 2315 |
| 2293 | 2316 |
| 2294 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) | 2317 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) |
| 2295 | 2318 |
| 2296 U_NAMESPACE_END | 2319 U_NAMESPACE_END |
| 2297 | 2320 |
| 2298 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 2321 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 2299 | 2322 |
| OLD | NEW |