| OLD | NEW |
| 1 // | 1 // |
| 2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
| 3 // | 3 // |
| 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2015 International Business Machines Corporation and othe
rs. |
| 5 // All Rights Reserved. | 5 // All Rights Reserved. |
| 6 // | 6 // |
| 7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
| 8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
| 9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
| 10 // | 10 // |
| 11 | 11 |
| 12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
| 13 | 13 |
| 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 63 fPeekChar = -1; | 63 fPeekChar = -1; |
| 64 fLineNum = 1; | 64 fLineNum = 1; |
| 65 fCharNum = 0; | 65 fCharNum = 0; |
| 66 fQuoteMode = FALSE; | 66 fQuoteMode = FALSE; |
| 67 fInBackslashQuote = FALSE; | 67 fInBackslashQuote = FALSE; |
| 68 fModeFlags = fRXPat->fFlags | 0x80000000; | 68 fModeFlags = fRXPat->fFlags | 0x80000000; |
| 69 fEOLComments = TRUE; | 69 fEOLComments = TRUE; |
| 70 | 70 |
| 71 fMatchOpenParen = -1; | 71 fMatchOpenParen = -1; |
| 72 fMatchCloseParen = -1; | 72 fMatchCloseParen = -1; |
| 73 fCaptureName = NULL; |
| 73 | 74 |
| 74 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { | 75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { |
| 75 status = rxp->fDeferredStatus; | 76 status = rxp->fDeferredStatus; |
| 76 } | 77 } |
| 77 } | 78 } |
| 78 | 79 |
| 79 static const UChar chAmp = 0x26; // '&' | 80 static const UChar chAmp = 0x26; // '&' |
| 80 static const UChar chDash = 0x2d; // '-' | 81 static const UChar chDash = 0x2d; // '-' |
| 81 | 82 |
| 82 | 83 |
| 83 //------------------------------------------------------------------------------ | 84 //------------------------------------------------------------------------------ |
| 84 // | 85 // |
| 85 // Destructor | 86 // Destructor |
| 86 // | 87 // |
| 87 //------------------------------------------------------------------------------ | 88 //------------------------------------------------------------------------------ |
| 88 RegexCompile::~RegexCompile() { | 89 RegexCompile::~RegexCompile() { |
| 90 delete fCaptureName; // Normally will be NULL, but can exist if patt
ern |
| 91 // compilation stops with a syntax error. |
| 89 } | 92 } |
| 90 | 93 |
| 91 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { | 94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { |
| 92 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK,
value, ec)); | 95 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK,
value, ec)); |
| 93 } | 96 } |
| 94 | 97 |
| 95 //------------------------------------------------------------------------------ | 98 //------------------------------------------------------------------------------ |
| 96 // | 99 // |
| 97 // Compile regex pattern. The state machine for regexp pattern parsing is her
e. | 100 // Compile regex pattern. The state machine for regexp pattern parsing is her
e. |
| 98 // The state tables are hand-written in the file regex
cst.txt, | 101 // The state tables are hand-written in the file regex
cst.txt, |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 131 | 134 |
| 132 if (U_FAILURE(*fStatus)) { | 135 if (U_FAILURE(*fStatus)) { |
| 133 return; | 136 return; |
| 134 } | 137 } |
| 135 | 138 |
| 136 // There should be no pattern stuff in the RegexPattern object. They can no
t be reused. | 139 // There should be no pattern stuff in the RegexPattern object. They can no
t be reused. |
| 137 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) ==
0); | 140 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) ==
0); |
| 138 | 141 |
| 139 // Prepare the RegexPattern object to receive the compiled pattern. | 142 // Prepare the RegexPattern object to receive the compiled pattern. |
| 140 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS
tatus); | 143 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS
tatus); |
| 144 if (U_FAILURE(*fStatus)) { |
| 145 return; |
| 146 } |
| 141 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; | 147 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; |
| 142 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; | 148 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; |
| 143 | 149 |
| 144 | 150 |
| 145 // Initialize the pattern scanning state machine | 151 // Initialize the pattern scanning state machine |
| 146 fPatternLength = utext_nativeLength(pat); | 152 fPatternLength = utext_nativeLength(pat); |
| 147 uint16_t state = 1; | 153 uint16_t state = 1; |
| 148 const RegexTableEl *tableEl; | 154 const RegexTableEl *tableEl; |
| 149 | 155 |
| 150 // UREGEX_LITERAL forces the entire pattern to be treated as a literal string. | 156 // UREGEX_LITERAL forces the entire pattern to be treated as a literal string. |
| (...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 277 delete (UnicodeSet *)fSetStack.pop(); | 283 delete (UnicodeSet *)fSetStack.pop(); |
| 278 } | 284 } |
| 279 return; | 285 return; |
| 280 } | 286 } |
| 281 | 287 |
| 282 // | 288 // |
| 283 // The pattern has now been read and processed, and the compiled code genera
ted. | 289 // The pattern has now been read and processed, and the compiled code genera
ted. |
| 284 // | 290 // |
| 285 | 291 |
| 286 // | 292 // |
| 287 // Compute the number of digits requried for the largest capture group numbe
r. | |
| 288 // | |
| 289 fRXPat->fMaxCaptureDigits = 1; | |
| 290 int32_t n = 10; | |
| 291 int32_t groupCount = fRXPat->fGroupMap->size(); | |
| 292 while (n <= groupCount) { | |
| 293 fRXPat->fMaxCaptureDigits++; | |
| 294 n *= 10; | |
| 295 } | |
| 296 | |
| 297 // | |
| 298 // The pattern's fFrameSize so far has accumulated the requirements for | 293 // The pattern's fFrameSize so far has accumulated the requirements for |
| 299 // storage for capture parentheses, counters, etc. that are encountered | 294 // storage for capture parentheses, counters, etc. that are encountered |
| 300 // in the pattern. Add space for the two variables that are always | 295 // in the pattern. Add space for the two variables that are always |
| 301 // present in the saved state: the input string position (int64_t) and | 296 // present in the saved state: the input string position (int64_t) and |
| 302 // the position in the compiled pattern. | 297 // the position in the compiled pattern. |
| 303 // | 298 // |
| 304 allocateStackData(RESTACKFRAME_HDRCOUNT); | 299 allocateStackData(RESTACKFRAME_HDRCOUNT); |
| 305 | 300 |
| 306 // | 301 // |
| 307 // Optimization pass 1: NOPs, back-references, and case-folding | 302 // Optimization pass 1: NOPs, back-references, and case-folding |
| (...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 428 | 423 |
| 429 // Append a NOP to the compiled pattern. This is the slot reserved | 424 // Append a NOP to the compiled pattern. This is the slot reserved |
| 430 // for a SAVE in the event that there is yet another '|' following | 425 // for a SAVE in the event that there is yet another '|' following |
| 431 // this one. | 426 // this one. |
| 432 appendOp(URX_NOP, 0); | 427 appendOp(URX_NOP, 0); |
| 433 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 434 } | 429 } |
| 435 break; | 430 break; |
| 436 | 431 |
| 437 | 432 |
| 433 case doBeginNamedCapture: |
| 434 // Scanning (?<letter. |
| 435 // The first letter of the name will come through again under doContinu
eNamedCapture. |
| 436 fCaptureName = new UnicodeString(); |
| 437 if (fCaptureName == NULL) { |
| 438 error(U_MEMORY_ALLOCATION_ERROR); |
| 439 } |
| 440 break; |
| 441 |
| 442 case doContinueNamedCapture: |
| 443 fCaptureName->append(fC.fChar); |
| 444 break; |
| 445 |
| 446 case doBadNamedCapture: |
| 447 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 448 break; |
| 449 |
| 438 case doOpenCaptureParen: | 450 case doOpenCaptureParen: |
| 439 // Open Paren. | 451 // Open Capturing Paren, possibly named. |
| 440 // Compile to a | 452 // Compile to a |
| 441 // - NOP, which later may be replaced by a save-state if the | 453 // - NOP, which later may be replaced by a save-state if the |
| 442 // parenthesized group gets a * quantifier, followed by | 454 // parenthesized group gets a * quantifier, followed by |
| 443 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 455 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
| 444 // - NOP, which may later be replaced by a save-state if there | 456 // - NOP, which may later be replaced by a save-state if there |
| 445 // is an '|' alternation within the parens. | 457 // is an '|' alternation within the parens. |
| 446 // | 458 // |
| 447 // Each capture group gets three slots in the save stack frame: | 459 // Each capture group gets three slots in the save stack frame: |
| 448 // 0: Capture Group start position (in input string being matche
d.) | 460 // 0: Capture Group start position (in input string being matche
d.) |
| 449 // 1: Capture Group end position. | 461 // 1: Capture Group end position. |
| (...skipping 14 matching lines...) Expand all Loading... |
| 464 // of the two NOPs. Depending on what follows in the pattern, the | 476 // of the two NOPs. Depending on what follows in the pattern, the |
| 465 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 477 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 466 // address of the end of the parenthesized group. | 478 // address of the end of the parenthesized group. |
| 467 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 479 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 468 fParenStack.push(capturing, *fStatus); // Fra
me type. | 480 fParenStack.push(capturing, *fStatus); // Fra
me type. |
| 469 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 481 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
| 470 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 482 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 471 | 483 |
| 472 // Save the mapping from group number to stack frame variable positi
on. | 484 // Save the mapping from group number to stack frame variable positi
on. |
| 473 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 485 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
| 486 |
| 487 // If this is a named capture group, add the name->group number mapp
ing. |
| 488 if (fCaptureName != NULL) { |
| 489 int32_t groupNumber = fRXPat->fGroupMap->size(); |
| 490 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, f
CaptureName, groupNumber, fStatus); |
| 491 fCaptureName = NULL; // hash table takes ownership of the nam
e (key) string. |
| 492 if (previousMapping > 0 && U_SUCCESS(*fStatus)) { |
| 493 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 494 } |
| 495 } |
| 474 } | 496 } |
| 475 break; | 497 break; |
| 476 | 498 |
| 477 case doOpenNonCaptureParen: | 499 case doOpenNonCaptureParen: |
| 478 // Open non-capturing (grouping only) Paren. | 500 // Open non-capturing (grouping only) Paren. |
| 479 // Compile to a | 501 // Compile to a |
| 480 // - NOP, which later may be replaced by a save-state if the | 502 // - NOP, which later may be replaced by a save-state if the |
| 481 // parenthesized group gets a * quantifier, followed by | 503 // parenthesized group gets a * quantifier, followed by |
| 482 // - NOP, which may later be replaced by a save-state if there | 504 // - NOP, which may later be replaced by a save-state if there |
| 483 // is an '|' alternation within the parens. | 505 // is an '|' alternation within the parens. |
| 484 { | 506 { |
| 485 fixLiterals(); | 507 fixLiterals(); |
| (...skipping 485 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 971 // are scanned. | 993 // are scanned. |
| 972 fIntervalLow = 0; | 994 fIntervalLow = 0; |
| 973 fIntervalUpper = -1; | 995 fIntervalUpper = -1; |
| 974 break; | 996 break; |
| 975 | 997 |
| 976 case doIntevalLowerDigit: | 998 case doIntevalLowerDigit: |
| 977 // Scanned a digit from the lower value of an {lower,upper} interval | 999 // Scanned a digit from the lower value of an {lower,upper} interval |
| 978 { | 1000 { |
| 979 int32_t digitValue = u_charDigitValue(fC.fChar); | 1001 int32_t digitValue = u_charDigitValue(fC.fChar); |
| 980 U_ASSERT(digitValue >= 0); | 1002 U_ASSERT(digitValue >= 0); |
| 981 fIntervalLow = fIntervalLow*10 + digitValue; | 1003 int64_t val = (int64_t)fIntervalLow*10 + digitValue; |
| 982 if (fIntervalLow < 0) { | 1004 if (val > INT32_MAX) { |
| 983 error(U_REGEX_NUMBER_TOO_BIG); | 1005 error(U_REGEX_NUMBER_TOO_BIG); |
| 1006 } else { |
| 1007 fIntervalLow = (int32_t)val; |
| 984 } | 1008 } |
| 985 } | 1009 } |
| 986 break; | 1010 break; |
| 987 | 1011 |
| 988 case doIntervalUpperDigit: | 1012 case doIntervalUpperDigit: |
| 989 // Scanned a digit from the upper value of an {lower,upper} interval | 1013 // Scanned a digit from the upper value of an {lower,upper} interval |
| 990 { | 1014 { |
| 991 if (fIntervalUpper < 0) { | 1015 if (fIntervalUpper < 0) { |
| 992 fIntervalUpper = 0; | 1016 fIntervalUpper = 0; |
| 993 } | 1017 } |
| 994 int32_t digitValue = u_charDigitValue(fC.fChar); | 1018 int32_t digitValue = u_charDigitValue(fC.fChar); |
| 995 U_ASSERT(digitValue >= 0); | 1019 U_ASSERT(digitValue >= 0); |
| 996 fIntervalUpper = fIntervalUpper*10 + digitValue; | 1020 int64_t val = (int64_t)fIntervalUpper*10 + digitValue; |
| 997 if (fIntervalUpper < 0) { | 1021 if (val > INT32_MAX) { |
| 998 error(U_REGEX_NUMBER_TOO_BIG); | 1022 error(U_REGEX_NUMBER_TOO_BIG); |
| 1023 } else { |
| 1024 fIntervalUpper = (int32_t)val; |
| 999 } | 1025 } |
| 1000 } | 1026 } |
| 1001 break; | 1027 break; |
| 1002 | 1028 |
| 1003 case doIntervalSame: | 1029 case doIntervalSame: |
| 1004 // Scanned a single value interval like {27}. Upper = Lower. | 1030 // Scanned a single value interval like {27}. Upper = Lower. |
| 1005 fIntervalUpper = fIntervalLow; | 1031 fIntervalUpper = fIntervalLow; |
| 1006 break; | 1032 break; |
| 1007 | 1033 |
| 1008 case doInterval: | 1034 case doInterval: |
| (...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1155 case doBackslashd: | 1181 case doBackslashd: |
| 1156 fixLiterals(FALSE); | 1182 fixLiterals(FALSE); |
| 1157 appendOp(URX_BACKSLASH_D, 0); | 1183 appendOp(URX_BACKSLASH_D, 0); |
| 1158 break; | 1184 break; |
| 1159 | 1185 |
| 1160 case doBackslashG: | 1186 case doBackslashG: |
| 1161 fixLiterals(FALSE); | 1187 fixLiterals(FALSE); |
| 1162 appendOp(URX_BACKSLASH_G, 0); | 1188 appendOp(URX_BACKSLASH_G, 0); |
| 1163 break; | 1189 break; |
| 1164 | 1190 |
| 1191 case doBackslashH: |
| 1192 fixLiterals(FALSE); |
| 1193 appendOp(URX_BACKSLASH_H, 1); |
| 1194 break; |
| 1195 |
| 1196 case doBackslashh: |
| 1197 fixLiterals(FALSE); |
| 1198 appendOp(URX_BACKSLASH_H, 0); |
| 1199 break; |
| 1200 |
| 1201 case doBackslashR: |
| 1202 fixLiterals(FALSE); |
| 1203 appendOp(URX_BACKSLASH_R, 0); |
| 1204 break; |
| 1205 |
| 1165 case doBackslashS: | 1206 case doBackslashS: |
| 1166 fixLiterals(FALSE); | 1207 fixLiterals(FALSE); |
| 1167 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); | 1208 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
| 1168 break; | 1209 break; |
| 1169 | 1210 |
| 1170 case doBackslashs: | 1211 case doBackslashs: |
| 1171 fixLiterals(FALSE); | 1212 fixLiterals(FALSE); |
| 1172 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); | 1213 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
| 1173 break; | 1214 break; |
| 1174 | 1215 |
| 1216 case doBackslashV: |
| 1217 fixLiterals(FALSE); |
| 1218 appendOp(URX_BACKSLASH_V, 1); |
| 1219 break; |
| 1220 |
| 1221 case doBackslashv: |
| 1222 fixLiterals(FALSE); |
| 1223 appendOp(URX_BACKSLASH_V, 0); |
| 1224 break; |
| 1225 |
| 1175 case doBackslashW: | 1226 case doBackslashW: |
| 1176 fixLiterals(FALSE); | 1227 fixLiterals(FALSE); |
| 1177 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); | 1228 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
| 1178 break; | 1229 break; |
| 1179 | 1230 |
| 1180 case doBackslashw: | 1231 case doBackslashw: |
| 1181 fixLiterals(FALSE); | 1232 fixLiterals(FALSE); |
| 1182 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); | 1233 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); |
| 1183 break; | 1234 break; |
| 1184 | 1235 |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1256 // and shouldn't enter this code path at
all. | 1307 // and shouldn't enter this code path at
all. |
| 1257 fixLiterals(FALSE); | 1308 fixLiterals(FALSE); |
| 1258 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1309 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1259 appendOp(URX_BACKREF_I, groupNum); | 1310 appendOp(URX_BACKREF_I, groupNum); |
| 1260 } else { | 1311 } else { |
| 1261 appendOp(URX_BACKREF, groupNum); | 1312 appendOp(URX_BACKREF, groupNum); |
| 1262 } | 1313 } |
| 1263 } | 1314 } |
| 1264 break; | 1315 break; |
| 1265 | 1316 |
| 1317 case doBeginNamedBackRef: |
| 1318 U_ASSERT(fCaptureName == NULL); |
| 1319 fCaptureName = new UnicodeString; |
| 1320 if (fCaptureName == NULL) { |
| 1321 error(U_MEMORY_ALLOCATION_ERROR); |
| 1322 } |
| 1323 break; |
| 1324 |
| 1325 case doContinueNamedBackRef: |
| 1326 fCaptureName->append(fC.fChar); |
| 1327 break; |
| 1266 | 1328 |
| 1329 case doCompleteNamedBackRef: |
| 1330 { |
| 1331 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName)
; |
| 1332 if (groupNumber == 0) { |
| 1333 // Group name has not been defined. |
| 1334 // Could be a forward reference. If we choose to support them at s
ome |
| 1335 // future time, extra mechanism will be required at this point. |
| 1336 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 1337 } else { |
| 1338 // Given the number, handle identically to a \n numbered back refere
nce. |
| 1339 // See comments above, under doBackRef |
| 1340 fixLiterals(FALSE); |
| 1341 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1342 appendOp(URX_BACKREF_I, groupNumber); |
| 1343 } else { |
| 1344 appendOp(URX_BACKREF, groupNumber); |
| 1345 } |
| 1346 } |
| 1347 delete fCaptureName; |
| 1348 fCaptureName = NULL; |
| 1349 break; |
| 1350 } |
| 1351 |
| 1267 case doPossessivePlus: | 1352 case doPossessivePlus: |
| 1268 // Possessive ++ quantifier. | 1353 // Possessive ++ quantifier. |
| 1269 // Compiles to | 1354 // Compiles to |
| 1270 // 1. STO_SP | 1355 // 1. STO_SP |
| 1271 // 2. body of stuff being iterated over | 1356 // 2. body of stuff being iterated over |
| 1272 // 3. STATE_SAVE 5 | 1357 // 3. STATE_SAVE 5 |
| 1273 // 4. JMP 2 | 1358 // 4. JMP 2 |
| 1274 // 5. LD_SP | 1359 // 5. LD_SP |
| 1275 // 6. ... | 1360 // 6. ... |
| 1276 // | 1361 // |
| (...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1481 { | 1566 { |
| 1482 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1483 UnicodeSet digits; | 1568 UnicodeSet digits; |
| 1484 // TODO - make a static set, ticket 6058. | 1569 // TODO - make a static set, ticket 6058. |
| 1485 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); | 1570 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); |
| 1486 digits.complement(); | 1571 digits.complement(); |
| 1487 set->addAll(digits); | 1572 set->addAll(digits); |
| 1488 break; | 1573 break; |
| 1489 } | 1574 } |
| 1490 | 1575 |
| 1576 case doSetBackslash_h: |
| 1577 { |
| 1578 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1579 UnicodeSet h; |
| 1580 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *
fStatus); |
| 1581 h.add((UChar32)9); // Tab |
| 1582 set->addAll(h); |
| 1583 break; |
| 1584 } |
| 1585 |
| 1586 case doSetBackslash_H: |
| 1587 { |
| 1588 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1589 UnicodeSet h; |
| 1590 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *
fStatus); |
| 1591 h.add((UChar32)9); // Tab |
| 1592 h.complement(); |
| 1593 set->addAll(h); |
| 1594 break; |
| 1595 } |
| 1596 |
| 1597 case doSetBackslash_v: |
| 1598 { |
| 1599 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1600 set->add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 1601 set->add((UChar32)0x85); |
| 1602 set->add((UChar32)0x2028, (UChar32)0x2029); |
| 1603 break; |
| 1604 } |
| 1605 |
| 1606 case doSetBackslash_V: |
| 1607 { |
| 1608 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1609 UnicodeSet v; |
| 1610 v.add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 1611 v.add((UChar32)0x85); |
| 1612 v.add((UChar32)0x2028, (UChar32)0x2029); |
| 1613 v.complement(); |
| 1614 set->addAll(v); |
| 1615 break; |
| 1616 } |
| 1617 |
| 1491 case doSetBackslash_w: | 1618 case doSetBackslash_w: |
| 1492 { | 1619 { |
| 1493 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1620 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1494 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]
); | 1621 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]
); |
| 1495 break; | 1622 break; |
| 1496 } | 1623 } |
| 1497 | 1624 |
| 1498 case doSetBackslash_W: | 1625 case doSetBackslash_W: |
| 1499 { | 1626 { |
| 1500 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1627 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| (...skipping 1181 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2682 s.complement(); | 2809 s.complement(); |
| 2683 } | 2810 } |
| 2684 fRXPat->fInitialChars->addAll(s); | 2811 fRXPat->fInitialChars->addAll(s); |
| 2685 numInitialStrings += 2; | 2812 numInitialStrings += 2; |
| 2686 } | 2813 } |
| 2687 currentLen++; | 2814 currentLen++; |
| 2688 atStart = FALSE; | 2815 atStart = FALSE; |
| 2689 break; | 2816 break; |
| 2690 | 2817 |
| 2691 | 2818 |
| 2819 case URX_BACKSLASH_H: |
| 2820 // Horiz white space |
| 2821 if (currentLen == 0) { |
| 2822 UnicodeSet s; |
| 2823 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MAS
K, *fStatus); |
| 2824 s.add((UChar32)9); // Tab |
| 2825 if (URX_VAL(op) != 0) { |
| 2826 s.complement(); |
| 2827 } |
| 2828 fRXPat->fInitialChars->addAll(s); |
| 2829 numInitialStrings += 2; |
| 2830 } |
| 2831 currentLen++; |
| 2832 atStart = FALSE; |
| 2833 break; |
| 2834 |
| 2835 |
| 2836 case URX_BACKSLASH_R: // Any line ending sequence |
| 2837 case URX_BACKSLASH_V: // Any line ending code point, with optional
negation |
| 2838 if (currentLen == 0) { |
| 2839 UnicodeSet s; |
| 2840 s.add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 2841 s.add((UChar32)0x85); |
| 2842 s.add((UChar32)0x2028, (UChar32)0x2029); |
| 2843 if (URX_VAL(op) != 0) { |
| 2844 // Complement option applies to URX_BACKSLASH_V only. |
| 2845 s.complement(); |
| 2846 } |
| 2847 fRXPat->fInitialChars->addAll(s); |
| 2848 numInitialStrings += 2; |
| 2849 } |
| 2850 currentLen++; |
| 2851 atStart = FALSE; |
| 2852 break; |
| 2853 |
| 2854 |
| 2855 |
| 2692 case URX_ONECHAR_I: | 2856 case URX_ONECHAR_I: |
| 2693 // Case Insensitive Single Character. | 2857 // Case Insensitive Single Character. |
| 2694 if (currentLen == 0) { | 2858 if (currentLen == 0) { |
| 2695 UChar32 c = URX_VAL(op); | 2859 UChar32 c = URX_VAL(op); |
| 2696 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 2860 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 2697 UnicodeSet starters(c, c); | 2861 UnicodeSet starters(c, c); |
| 2698 starters.closeOver(USET_CASE_INSENSITIVE); | 2862 starters.closeOver(USET_CASE_INSENSITIVE); |
| 2699 // findCaseInsensitiveStarters(c, &starters); | 2863 // findCaseInsensitiveStarters(c, &starters); |
| 2700 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. | 2864 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. |
| 2701 // The expanded folding can't match the pattern. | 2865 // The expanded folding can't match the pattern. |
| (...skipping 368 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3070 break; | 3234 break; |
| 3071 | 3235 |
| 3072 | 3236 |
| 3073 // Ops that match a minimum of one character (one or two 16 bit code
units.) | 3237 // Ops that match a minimum of one character (one or two 16 bit code
units.) |
| 3074 // | 3238 // |
| 3075 case URX_ONECHAR: | 3239 case URX_ONECHAR: |
| 3076 case URX_STATIC_SETREF: | 3240 case URX_STATIC_SETREF: |
| 3077 case URX_STAT_SETREF_N: | 3241 case URX_STAT_SETREF_N: |
| 3078 case URX_SETREF: | 3242 case URX_SETREF: |
| 3079 case URX_BACKSLASH_D: | 3243 case URX_BACKSLASH_D: |
| 3244 case URX_BACKSLASH_H: |
| 3245 case URX_BACKSLASH_R: |
| 3246 case URX_BACKSLASH_V: |
| 3080 case URX_ONECHAR_I: | 3247 case URX_ONECHAR_I: |
| 3081 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde
d. | 3248 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde
d. |
| 3082 case URX_DOTANY_ALL: // . matches one or two. | 3249 case URX_DOTANY_ALL: // . matches one or two. |
| 3083 case URX_DOTANY: | 3250 case URX_DOTANY: |
| 3084 case URX_DOTANY_UNIX: | 3251 case URX_DOTANY_UNIX: |
| 3085 currentLen++; | 3252 currentLen++; |
| 3086 break; | 3253 break; |
| 3087 | 3254 |
| 3088 | 3255 |
| 3089 case URX_JMPX: | 3256 case URX_JMPX: |
| (...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3351 currentLen = INT32_MAX; | 3518 currentLen = INT32_MAX; |
| 3352 break; | 3519 break; |
| 3353 | 3520 |
| 3354 | 3521 |
| 3355 // Ops that match a max of one character (possibly two 16 bit code u
nits.) | 3522 // Ops that match a max of one character (possibly two 16 bit code u
nits.) |
| 3356 // | 3523 // |
| 3357 case URX_STATIC_SETREF: | 3524 case URX_STATIC_SETREF: |
| 3358 case URX_STAT_SETREF_N: | 3525 case URX_STAT_SETREF_N: |
| 3359 case URX_SETREF: | 3526 case URX_SETREF: |
| 3360 case URX_BACKSLASH_D: | 3527 case URX_BACKSLASH_D: |
| 3528 case URX_BACKSLASH_H: |
| 3529 case URX_BACKSLASH_R: |
| 3530 case URX_BACKSLASH_V: |
| 3361 case URX_ONECHAR_I: | 3531 case URX_ONECHAR_I: |
| 3362 case URX_DOTANY_ALL: | 3532 case URX_DOTANY_ALL: |
| 3363 case URX_DOTANY: | 3533 case URX_DOTANY: |
| 3364 case URX_DOTANY_UNIX: | 3534 case URX_DOTANY_UNIX: |
| 3365 currentLen = safeIncrement(currentLen, 2); | 3535 currentLen = safeIncrement(currentLen, 2); |
| 3366 break; | 3536 break; |
| 3367 | 3537 |
| 3368 // Single literal character. Increase current max length by one or
two, | 3538 // Single literal character. Increase current max length by one or
two, |
| 3369 // depending on whether the char is in the supplementary range
. | 3539 // depending on whether the char is in the supplementary range
. |
| 3370 case URX_ONECHAR: | 3540 case URX_ONECHAR: |
| (...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3472 } | 3642 } |
| 3473 | 3643 |
| 3474 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat
->elementAti(loc+3)); | 3644 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat
->elementAti(loc+3)); |
| 3475 if (maxLoopCount == -1) { | 3645 if (maxLoopCount == -1) { |
| 3476 // Unbounded Loop. No upper bound on match length. | 3646 // Unbounded Loop. No upper bound on match length. |
| 3477 currentLen = INT32_MAX; | 3647 currentLen = INT32_MAX; |
| 3478 break; | 3648 break; |
| 3479 } | 3649 } |
| 3480 | 3650 |
| 3481 U_ASSERT(loopEndLoc >= loc+4); | 3651 U_ASSERT(loopEndLoc >= loc+4); |
| 3482 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. | 3652 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recu
rsive call. |
| 3483 if (blockLen == INT32_MAX) { | 3653 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCou
nt; |
| 3484 currentLen = blockLen; | 3654 if (updatedLen >= INT32_MAX) { |
| 3655 currentLen = INT32_MAX; |
| 3485 break; | 3656 break; |
| 3486 } | 3657 } |
| 3487 currentLen += blockLen * maxLoopCount; | 3658 currentLen = (int32_t)updatedLen; |
| 3488 loc = loopEndLoc; | 3659 loc = loopEndLoc; |
| 3489 break; | 3660 break; |
| 3490 } | 3661 } |
| 3491 | 3662 |
| 3492 case URX_CTR_LOOP: | 3663 case URX_CTR_LOOP: |
| 3493 case URX_CTR_LOOP_NG: | 3664 case URX_CTR_LOOP_NG: |
| 3494 // These opcodes will be skipped over by code for URX_CTR_INIT. | 3665 // These opcodes will be skipped over by code for URX_CTR_INIT. |
| 3495 // We shouldn't encounter them here. | 3666 // We shouldn't encounter them here. |
| 3496 U_ASSERT(FALSE); | 3667 U_ASSERT(FALSE); |
| 3497 break; | 3668 break; |
| (...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3678 case URX_LB_START: | 3849 case URX_LB_START: |
| 3679 case URX_LB_CONT: | 3850 case URX_LB_CONT: |
| 3680 case URX_LB_END: | 3851 case URX_LB_END: |
| 3681 case URX_LBN_CONT: | 3852 case URX_LBN_CONT: |
| 3682 case URX_LBN_END: | 3853 case URX_LBN_END: |
| 3683 case URX_LOOP_SR_I: | 3854 case URX_LOOP_SR_I: |
| 3684 case URX_LOOP_DOT_I: | 3855 case URX_LOOP_DOT_I: |
| 3685 case URX_LOOP_C: | 3856 case URX_LOOP_C: |
| 3686 case URX_DOLLAR_D: | 3857 case URX_DOLLAR_D: |
| 3687 case URX_DOLLAR_MD: | 3858 case URX_DOLLAR_MD: |
| 3859 case URX_BACKSLASH_H: |
| 3860 case URX_BACKSLASH_R: |
| 3861 case URX_BACKSLASH_V: |
| 3688 // These instructions are unaltered by the relocation. | 3862 // These instructions are unaltered by the relocation. |
| 3689 fRXPat->fCompiledPat->setElementAt(op, dst); | 3863 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3690 dst++; | 3864 dst++; |
| 3691 break; | 3865 break; |
| 3692 | 3866 |
| 3693 default: | 3867 default: |
| 3694 // Some op is unaccounted for. | 3868 // Some op is unaccounted for. |
| 3695 U_ASSERT(FALSE); | 3869 U_ASSERT(FALSE); |
| 3696 error(U_REGEX_INTERNAL_ERROR); | 3870 error(U_REGEX_INTERNAL_ERROR); |
| 3697 } | 3871 } |
| (...skipping 751 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4449 | 4623 |
| 4450 void RegexCompile::setPushOp(int32_t op) { | 4624 void RegexCompile::setPushOp(int32_t op) { |
| 4451 setEval(op); | 4625 setEval(op); |
| 4452 fSetOpStack.push(op, *fStatus); | 4626 fSetOpStack.push(op, *fStatus); |
| 4453 fSetStack.push(new UnicodeSet(), *fStatus); | 4627 fSetStack.push(new UnicodeSet(), *fStatus); |
| 4454 } | 4628 } |
| 4455 | 4629 |
| 4456 U_NAMESPACE_END | 4630 U_NAMESPACE_END |
| 4457 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4631 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 4458 | 4632 |
| OLD | NEW |