OLD | NEW |
1 // | 1 // |
2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
3 // | 3 // |
4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2015 International Business Machines Corporation and othe
rs. |
5 // All Rights Reserved. | 5 // All Rights Reserved. |
6 // | 6 // |
7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
10 // | 10 // |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
63 fPeekChar = -1; | 63 fPeekChar = -1; |
64 fLineNum = 1; | 64 fLineNum = 1; |
65 fCharNum = 0; | 65 fCharNum = 0; |
66 fQuoteMode = FALSE; | 66 fQuoteMode = FALSE; |
67 fInBackslashQuote = FALSE; | 67 fInBackslashQuote = FALSE; |
68 fModeFlags = fRXPat->fFlags | 0x80000000; | 68 fModeFlags = fRXPat->fFlags | 0x80000000; |
69 fEOLComments = TRUE; | 69 fEOLComments = TRUE; |
70 | 70 |
71 fMatchOpenParen = -1; | 71 fMatchOpenParen = -1; |
72 fMatchCloseParen = -1; | 72 fMatchCloseParen = -1; |
| 73 fCaptureName = NULL; |
73 | 74 |
74 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { | 75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { |
75 status = rxp->fDeferredStatus; | 76 status = rxp->fDeferredStatus; |
76 } | 77 } |
77 } | 78 } |
78 | 79 |
79 static const UChar chAmp = 0x26; // '&' | 80 static const UChar chAmp = 0x26; // '&' |
80 static const UChar chDash = 0x2d; // '-' | 81 static const UChar chDash = 0x2d; // '-' |
81 | 82 |
82 | 83 |
83 //------------------------------------------------------------------------------ | 84 //------------------------------------------------------------------------------ |
84 // | 85 // |
85 // Destructor | 86 // Destructor |
86 // | 87 // |
87 //------------------------------------------------------------------------------ | 88 //------------------------------------------------------------------------------ |
88 RegexCompile::~RegexCompile() { | 89 RegexCompile::~RegexCompile() { |
| 90 delete fCaptureName; // Normally will be NULL, but can exist if patt
ern |
| 91 // compilation stops with a syntax error. |
89 } | 92 } |
90 | 93 |
91 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { | 94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { |
92 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK,
value, ec)); | 95 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK,
value, ec)); |
93 } | 96 } |
94 | 97 |
95 //------------------------------------------------------------------------------ | 98 //------------------------------------------------------------------------------ |
96 // | 99 // |
97 // Compile regex pattern. The state machine for regexp pattern parsing is her
e. | 100 // Compile regex pattern. The state machine for regexp pattern parsing is her
e. |
98 // The state tables are hand-written in the file regex
cst.txt, | 101 // The state tables are hand-written in the file regex
cst.txt, |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
131 | 134 |
132 if (U_FAILURE(*fStatus)) { | 135 if (U_FAILURE(*fStatus)) { |
133 return; | 136 return; |
134 } | 137 } |
135 | 138 |
136 // There should be no pattern stuff in the RegexPattern object. They can no
t be reused. | 139 // There should be no pattern stuff in the RegexPattern object. They can no
t be reused. |
137 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) ==
0); | 140 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) ==
0); |
138 | 141 |
139 // Prepare the RegexPattern object to receive the compiled pattern. | 142 // Prepare the RegexPattern object to receive the compiled pattern. |
140 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS
tatus); | 143 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS
tatus); |
| 144 if (U_FAILURE(*fStatus)) { |
| 145 return; |
| 146 } |
141 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; | 147 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; |
142 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; | 148 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; |
143 | 149 |
144 | 150 |
145 // Initialize the pattern scanning state machine | 151 // Initialize the pattern scanning state machine |
146 fPatternLength = utext_nativeLength(pat); | 152 fPatternLength = utext_nativeLength(pat); |
147 uint16_t state = 1; | 153 uint16_t state = 1; |
148 const RegexTableEl *tableEl; | 154 const RegexTableEl *tableEl; |
149 | 155 |
150 // UREGEX_LITERAL force entire pattern to be treated as a literal string. | 156 // UREGEX_LITERAL force entire pattern to be treated as a literal string. |
(...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
277 delete (UnicodeSet *)fSetStack.pop(); | 283 delete (UnicodeSet *)fSetStack.pop(); |
278 } | 284 } |
279 return; | 285 return; |
280 } | 286 } |
281 | 287 |
282 // | 288 // |
283 // The pattern has now been read and processed, and the compiled code genera
ted. | 289 // The pattern has now been read and processed, and the compiled code genera
ted. |
284 // | 290 // |
285 | 291 |
286 // | 292 // |
287 // Compute the number of digits requried for the largest capture group numbe
r. | |
288 // | |
289 fRXPat->fMaxCaptureDigits = 1; | |
290 int32_t n = 10; | |
291 int32_t groupCount = fRXPat->fGroupMap->size(); | |
292 while (n <= groupCount) { | |
293 fRXPat->fMaxCaptureDigits++; | |
294 n *= 10; | |
295 } | |
296 | |
297 // | |
298 // The pattern's fFrameSize so far has accumulated the requirements for | 293 // The pattern's fFrameSize so far has accumulated the requirements for |
299 // storage for capture parentheses, counters, etc. that are encountered | 294 // storage for capture parentheses, counters, etc. that are encountered |
300 // in the pattern. Add space for the two variables that are always | 295 // in the pattern. Add space for the two variables that are always |
301 // present in the saved state: the input string position (int64_t) and | 296 // present in the saved state: the input string position (int64_t) and |
302 // the position in the compiled pattern. | 297 // the position in the compiled pattern. |
303 // | 298 // |
304 allocateStackData(RESTACKFRAME_HDRCOUNT); | 299 allocateStackData(RESTACKFRAME_HDRCOUNT); |
305 | 300 |
306 // | 301 // |
307 // Optimization pass 1: NOPs, back-references, and case-folding | 302 // Optimization pass 1: NOPs, back-references, and case-folding |
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
428 | 423 |
429 // Append a NOP to the compiled pattern. This is the slot reserved | 424 // Append a NOP to the compiled pattern. This is the slot reserved |
430 // for a SAVE in the event that there is yet another '|' following | 425 // for a SAVE in the event that there is yet another '|' following |
431 // this one. | 426 // this one. |
432 appendOp(URX_NOP, 0); | 427 appendOp(URX_NOP, 0); |
433 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
434 } | 429 } |
435 break; | 430 break; |
436 | 431 |
437 | 432 |
| 433 case doBeginNamedCapture: |
| 434 // Scanning (?<letter. |
| 435 // The first letter of the name will come through again under doContinu
eNamedCapture. |
| 436 fCaptureName = new UnicodeString(); |
| 437 if (fCaptureName == NULL) { |
| 438 error(U_MEMORY_ALLOCATION_ERROR); |
| 439 } |
| 440 break; |
| 441 |
| 442 case doContinueNamedCapture: |
| 443 fCaptureName->append(fC.fChar); |
| 444 break; |
| 445 |
| 446 case doBadNamedCapture: |
| 447 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 448 break; |
| 449 |
438 case doOpenCaptureParen: | 450 case doOpenCaptureParen: |
439 // Open Paren. | 451 // Open Capturing Paren, possibly named. |
440 // Compile to a | 452 // Compile to a |
441 // - NOP, which later may be replaced by a save-state if the | 453 // - NOP, which later may be replaced by a save-state if the |
442 // parenthesized group gets a * quantifier, followed by | 454 // parenthesized group gets a * quantifier, followed by |
443 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 455 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
444 // - NOP, which may later be replaced by a save-state if there | 456 // - NOP, which may later be replaced by a save-state if there |
445 // is an '|' alternation within the parens. | 457 // is an '|' alternation within the parens. |
446 // | 458 // |
447 // Each capture group gets three slots in the save stack frame: | 459 // Each capture group gets three slots in the save stack frame: |
448 // 0: Capture Group start position (in input string being matche
d.) | 460 // 0: Capture Group start position (in input string being matche
d.) |
449 // 1: Capture Group end position. | 461 // 1: Capture Group end position. |
(...skipping 14 matching lines...) Expand all Loading... |
464 // of the two NOPs. Depending on what follows in the pattern, the | 476 // of the two NOPs. Depending on what follows in the pattern, the |
465 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 477 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
466 // address of the end of the parenthesized group. | 478 // address of the end of the parenthesized group. |
467 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 479 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
468 fParenStack.push(capturing, *fStatus); // Fra
me type. | 480 fParenStack.push(capturing, *fStatus); // Fra
me type. |
469 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 481 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
470 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 482 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
471 | 483 |
472 // Save the mapping from group number to stack frame variable positi
on. | 484 // Save the mapping from group number to stack frame variable positi
on. |
473 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 485 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
| 486 |
| 487 // If this is a named capture group, add the name->group number mapp
ing. |
| 488 if (fCaptureName != NULL) { |
| 489 int32_t groupNumber = fRXPat->fGroupMap->size(); |
| 490 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, f
CaptureName, groupNumber, fStatus); |
| 491 fCaptureName = NULL; // hash table takes ownership of the nam
e (key) string. |
| 492 if (previousMapping > 0 && U_SUCCESS(*fStatus)) { |
| 493 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 494 } |
| 495 } |
474 } | 496 } |
475 break; | 497 break; |
476 | 498 |
477 case doOpenNonCaptureParen: | 499 case doOpenNonCaptureParen: |
477 // Open non-capturing (grouping only) Paren. | 499 // Open non-capturing (grouping only) Paren. |
479 // Compile to a | 501 // Compile to a |
480 // - NOP, which later may be replaced by a save-state if the | 502 // - NOP, which later may be replaced by a save-state if the |
481 // parenthesized group gets a * quantifier, followed by | 503 // parenthesized group gets a * quantifier, followed by |
482 // - NOP, which may later be replaced by a save-state if there | 504 // - NOP, which may later be replaced by a save-state if there |
483 // is an '|' alternation within the parens. | 505 // is an '|' alternation within the parens. |
484 { | 506 { |
485 fixLiterals(); | 507 fixLiterals(); |
(...skipping 485 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
971 // are scanned. | 993 // are scanned. |
972 fIntervalLow = 0; | 994 fIntervalLow = 0; |
973 fIntervalUpper = -1; | 995 fIntervalUpper = -1; |
974 break; | 996 break; |
975 | 997 |
976 case doIntevalLowerDigit: | 998 case doIntevalLowerDigit: |
977 // Scanned a digit from the lower value of an {lower,upper} interval | 999 // Scanned a digit from the lower value of an {lower,upper} interval |
978 { | 1000 { |
979 int32_t digitValue = u_charDigitValue(fC.fChar); | 1001 int32_t digitValue = u_charDigitValue(fC.fChar); |
980 U_ASSERT(digitValue >= 0); | 1002 U_ASSERT(digitValue >= 0); |
981 fIntervalLow = fIntervalLow*10 + digitValue; | 1003 int64_t val = (int64_t)fIntervalLow*10 + digitValue; |
982 if (fIntervalLow < 0) { | 1004 if (val > INT32_MAX) { |
983 error(U_REGEX_NUMBER_TOO_BIG); | 1005 error(U_REGEX_NUMBER_TOO_BIG); |
| 1006 } else { |
| 1007 fIntervalLow = (int32_t)val; |
984 } | 1008 } |
985 } | 1009 } |
986 break; | 1010 break; |
987 | 1011 |
988 case doIntervalUpperDigit: | 1012 case doIntervalUpperDigit: |
989 // Scanned a digit from the upper value of an {lower,upper} interval | 1013 // Scanned a digit from the upper value of an {lower,upper} interval |
990 { | 1014 { |
991 if (fIntervalUpper < 0) { | 1015 if (fIntervalUpper < 0) { |
992 fIntervalUpper = 0; | 1016 fIntervalUpper = 0; |
993 } | 1017 } |
994 int32_t digitValue = u_charDigitValue(fC.fChar); | 1018 int32_t digitValue = u_charDigitValue(fC.fChar); |
995 U_ASSERT(digitValue >= 0); | 1019 U_ASSERT(digitValue >= 0); |
996 fIntervalUpper = fIntervalUpper*10 + digitValue; | 1020 int64_t val = (int64_t)fIntervalUpper*10 + digitValue; |
997 if (fIntervalUpper < 0) { | 1021 if (val > INT32_MAX) { |
998 error(U_REGEX_NUMBER_TOO_BIG); | 1022 error(U_REGEX_NUMBER_TOO_BIG); |
| 1023 } else { |
| 1024 fIntervalUpper = (int32_t)val; |
999 } | 1025 } |
1000 } | 1026 } |
1001 break; | 1027 break; |
1002 | 1028 |
1003 case doIntervalSame: | 1029 case doIntervalSame: |
1004 // Scanned a single value interval like {27}. Upper = Lower. | 1030 // Scanned a single value interval like {27}. Upper = Lower. |
1005 fIntervalUpper = fIntervalLow; | 1031 fIntervalUpper = fIntervalLow; |
1006 break; | 1032 break; |
1007 | 1033 |
1008 case doInterval: | 1034 case doInterval: |
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1155 case doBackslashd: | 1181 case doBackslashd: |
1156 fixLiterals(FALSE); | 1182 fixLiterals(FALSE); |
1157 appendOp(URX_BACKSLASH_D, 0); | 1183 appendOp(URX_BACKSLASH_D, 0); |
1158 break; | 1184 break; |
1159 | 1185 |
1160 case doBackslashG: | 1186 case doBackslashG: |
1161 fixLiterals(FALSE); | 1187 fixLiterals(FALSE); |
1162 appendOp(URX_BACKSLASH_G, 0); | 1188 appendOp(URX_BACKSLASH_G, 0); |
1163 break; | 1189 break; |
1164 | 1190 |
| 1191 case doBackslashH: |
| 1192 fixLiterals(FALSE); |
| 1193 appendOp(URX_BACKSLASH_H, 1); |
| 1194 break; |
| 1195 |
| 1196 case doBackslashh: |
| 1197 fixLiterals(FALSE); |
| 1198 appendOp(URX_BACKSLASH_H, 0); |
| 1199 break; |
| 1200 |
| 1201 case doBackslashR: |
| 1202 fixLiterals(FALSE); |
| 1203 appendOp(URX_BACKSLASH_R, 0); |
| 1204 break; |
| 1205 |
1165 case doBackslashS: | 1206 case doBackslashS: |
1166 fixLiterals(FALSE); | 1207 fixLiterals(FALSE); |
1167 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); | 1208 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
1168 break; | 1209 break; |
1169 | 1210 |
1170 case doBackslashs: | 1211 case doBackslashs: |
1171 fixLiterals(FALSE); | 1212 fixLiterals(FALSE); |
1172 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); | 1213 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
1173 break; | 1214 break; |
1174 | 1215 |
| 1216 case doBackslashV: |
| 1217 fixLiterals(FALSE); |
| 1218 appendOp(URX_BACKSLASH_V, 1); |
| 1219 break; |
| 1220 |
| 1221 case doBackslashv: |
| 1222 fixLiterals(FALSE); |
| 1223 appendOp(URX_BACKSLASH_V, 0); |
| 1224 break; |
| 1225 |
1175 case doBackslashW: | 1226 case doBackslashW: |
1176 fixLiterals(FALSE); | 1227 fixLiterals(FALSE); |
1177 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); | 1228 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
1178 break; | 1229 break; |
1179 | 1230 |
1180 case doBackslashw: | 1231 case doBackslashw: |
1181 fixLiterals(FALSE); | 1232 fixLiterals(FALSE); |
1182 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); | 1233 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); |
1183 break; | 1234 break; |
1184 | 1235 |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1256 // and shouldn't enter this code path at
all. | 1307 // and shouldn't enter this code path at
all. |
1257 fixLiterals(FALSE); | 1308 fixLiterals(FALSE); |
1258 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1309 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1259 appendOp(URX_BACKREF_I, groupNum); | 1310 appendOp(URX_BACKREF_I, groupNum); |
1260 } else { | 1311 } else { |
1261 appendOp(URX_BACKREF, groupNum); | 1312 appendOp(URX_BACKREF, groupNum); |
1262 } | 1313 } |
1263 } | 1314 } |
1264 break; | 1315 break; |
1265 | 1316 |
| 1317 case doBeginNamedBackRef: |
| 1318 U_ASSERT(fCaptureName == NULL); |
| 1319 fCaptureName = new UnicodeString; |
| 1320 if (fCaptureName == NULL) { |
| 1321 error(U_MEMORY_ALLOCATION_ERROR); |
| 1322 } |
| 1323 break; |
| 1324 |
| 1325 case doContinueNamedBackRef: |
| 1326 fCaptureName->append(fC.fChar); |
| 1327 break; |
1266 | 1328 |
| 1329 case doCompleteNamedBackRef: |
| 1330 { |
| 1331 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName)
; |
| 1332 if (groupNumber == 0) { |
| 1333 // Group name has not been defined. |
| 1334 // Could be a forward reference. If we choose to support them at s
ome |
| 1335 // future time, extra mechanism will be required at this point. |
| 1336 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 1337 } else { |
| 1338 // Given the number, handle identically to a \n numbered back refere
nce. |
| 1339 // See comments above, under doBackRef |
| 1340 fixLiterals(FALSE); |
| 1341 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1342 appendOp(URX_BACKREF_I, groupNumber); |
| 1343 } else { |
| 1344 appendOp(URX_BACKREF, groupNumber); |
| 1345 } |
| 1346 } |
| 1347 delete fCaptureName; |
| 1348 fCaptureName = NULL; |
| 1349 break; |
| 1350 } |
| 1351 |
1267 case doPossessivePlus: | 1352 case doPossessivePlus: |
1268 // Possessive ++ quantifier. | 1353 // Possessive ++ quantifier. |
1269 // Compiles to | 1354 // Compiles to |
1270 // 1. STO_SP | 1355 // 1. STO_SP |
1271 // 2. body of stuff being iterated over | 1356 // 2. body of stuff being iterated over |
1272 // 3. STATE_SAVE 5 | 1357 // 3. STATE_SAVE 5 |
1273 // 4. JMP 2 | 1358 // 4. JMP 2 |
1274 // 5. LD_SP | 1359 // 5. LD_SP |
1275 // 6. ... | 1360 // 6. ... |
1276 // | 1361 // |
(...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1481 { | 1566 { |
1482 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
1483 UnicodeSet digits; | 1568 UnicodeSet digits; |
1484 // TODO - make a static set, ticket 6058. | 1569 // TODO - make a static set, ticket 6058. |
1485 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); | 1570 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); |
1486 digits.complement(); | 1571 digits.complement(); |
1487 set->addAll(digits); | 1572 set->addAll(digits); |
1488 break; | 1573 break; |
1489 } | 1574 } |
1490 | 1575 |
| 1576 case doSetBackslash_h: |
| 1577 { |
| 1578 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1579 UnicodeSet h; |
| 1580 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *
fStatus); |
| 1581 h.add((UChar32)9); // Tab |
| 1582 set->addAll(h); |
| 1583 break; |
| 1584 } |
| 1585 |
| 1586 case doSetBackslash_H: |
| 1587 { |
| 1588 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1589 UnicodeSet h; |
| 1590 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *
fStatus); |
| 1591 h.add((UChar32)9); // Tab |
| 1592 h.complement(); |
| 1593 set->addAll(h); |
| 1594 break; |
| 1595 } |
| 1596 |
| 1597 case doSetBackslash_v: |
| 1598 { |
| 1599 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1600 set->add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 1601 set->add((UChar32)0x85); |
| 1602 set->add((UChar32)0x2028, (UChar32)0x2029); |
| 1603 break; |
| 1604 } |
| 1605 |
| 1606 case doSetBackslash_V: |
| 1607 { |
| 1608 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1609 UnicodeSet v; |
| 1610 v.add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 1611 v.add((UChar32)0x85); |
| 1612 v.add((UChar32)0x2028, (UChar32)0x2029); |
| 1613 v.complement(); |
| 1614 set->addAll(v); |
| 1615 break; |
| 1616 } |
| 1617 |
1491 case doSetBackslash_w: | 1618 case doSetBackslash_w: |
1492 { | 1619 { |
1493 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1620 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
1494 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]
); | 1621 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]
); |
1495 break; | 1622 break; |
1496 } | 1623 } |
1497 | 1624 |
1498 case doSetBackslash_W: | 1625 case doSetBackslash_W: |
1499 { | 1626 { |
1500 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); | 1627 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
(...skipping 1181 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2682 s.complement(); | 2809 s.complement(); |
2683 } | 2810 } |
2684 fRXPat->fInitialChars->addAll(s); | 2811 fRXPat->fInitialChars->addAll(s); |
2685 numInitialStrings += 2; | 2812 numInitialStrings += 2; |
2686 } | 2813 } |
2687 currentLen++; | 2814 currentLen++; |
2688 atStart = FALSE; | 2815 atStart = FALSE; |
2689 break; | 2816 break; |
2690 | 2817 |
2691 | 2818 |
| 2819 case URX_BACKSLASH_H: |
| 2820 // Horiz white space |
| 2821 if (currentLen == 0) { |
| 2822 UnicodeSet s; |
| 2823 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MAS
K, *fStatus); |
| 2824 s.add((UChar32)9); // Tab |
| 2825 if (URX_VAL(op) != 0) { |
| 2826 s.complement(); |
| 2827 } |
| 2828 fRXPat->fInitialChars->addAll(s); |
| 2829 numInitialStrings += 2; |
| 2830 } |
| 2831 currentLen++; |
| 2832 atStart = FALSE; |
| 2833 break; |
| 2834 |
| 2835 |
| 2836 case URX_BACKSLASH_R: // Any line ending sequence |
| 2837 case URX_BACKSLASH_V: // Any line ending code point, with optional
negation |
| 2838 if (currentLen == 0) { |
| 2839 UnicodeSet s; |
| 2840 s.add((UChar32)0x0a, (UChar32)0x0d); // add range |
| 2841 s.add((UChar32)0x85); |
| 2842 s.add((UChar32)0x2028, (UChar32)0x2029); |
| 2843 if (URX_VAL(op) != 0) { |
| 2844 // Complement option applies to URX_BACKSLASH_V only. |
| 2845 s.complement(); |
| 2846 } |
| 2847 fRXPat->fInitialChars->addAll(s); |
| 2848 numInitialStrings += 2; |
| 2849 } |
| 2850 currentLen++; |
| 2851 atStart = FALSE; |
| 2852 break; |
| 2853 |
| 2854 |
| 2855 |
2692 case URX_ONECHAR_I: | 2856 case URX_ONECHAR_I: |
2693 // Case Insensitive Single Character. | 2857 // Case Insensitive Single Character. |
2694 if (currentLen == 0) { | 2858 if (currentLen == 0) { |
2695 UChar32 c = URX_VAL(op); | 2859 UChar32 c = URX_VAL(op); |
2696 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 2860 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
2697 UnicodeSet starters(c, c); | 2861 UnicodeSet starters(c, c); |
2698 starters.closeOver(USET_CASE_INSENSITIVE); | 2862 starters.closeOver(USET_CASE_INSENSITIVE); |
2699 // findCaseInsensitiveStarters(c, &starters); | 2863 // findCaseInsensitiveStarters(c, &starters); |
2700 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. | 2864 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. |
2701 // The expanded folding can't match the pattern. | 2865 // The expanded folding can't match the pattern. |
(...skipping 368 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3070 break; | 3234 break; |
3071 | 3235 |
3072 | 3236 |
3073 // Ops that match a minimum of one character (one or two 16 bit code
units.) | 3237 // Ops that match a minimum of one character (one or two 16 bit code
units.) |
3074 // | 3238 // |
3075 case URX_ONECHAR: | 3239 case URX_ONECHAR: |
3076 case URX_STATIC_SETREF: | 3240 case URX_STATIC_SETREF: |
3077 case URX_STAT_SETREF_N: | 3241 case URX_STAT_SETREF_N: |
3078 case URX_SETREF: | 3242 case URX_SETREF: |
3079 case URX_BACKSLASH_D: | 3243 case URX_BACKSLASH_D: |
| 3244 case URX_BACKSLASH_H: |
| 3245 case URX_BACKSLASH_R: |
| 3246 case URX_BACKSLASH_V: |
3080 case URX_ONECHAR_I: | 3247 case URX_ONECHAR_I: |
3081 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounde
d. | 3248 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounde
d. |
3082 case URX_DOTANY_ALL: // . matches one or two. | 3249 case URX_DOTANY_ALL: // . matches one or two. |
3083 case URX_DOTANY: | 3250 case URX_DOTANY: |
3084 case URX_DOTANY_UNIX: | 3251 case URX_DOTANY_UNIX: |
3085 currentLen++; | 3252 currentLen++; |
3086 break; | 3253 break; |
3087 | 3254 |
3088 | 3255 |
3089 case URX_JMPX: | 3256 case URX_JMPX: |
(...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3351 currentLen = INT32_MAX; | 3518 currentLen = INT32_MAX; |
3352 break; | 3519 break; |
3353 | 3520 |
3354 | 3521 |
3355 // Ops that match a max of one character (possibly two 16 bit code u
nits.) | 3522 // Ops that match a max of one character (possibly two 16 bit code u
nits.) |
3356 // | 3523 // |
3357 case URX_STATIC_SETREF: | 3524 case URX_STATIC_SETREF: |
3358 case URX_STAT_SETREF_N: | 3525 case URX_STAT_SETREF_N: |
3359 case URX_SETREF: | 3526 case URX_SETREF: |
3360 case URX_BACKSLASH_D: | 3527 case URX_BACKSLASH_D: |
| 3528 case URX_BACKSLASH_H: |
| 3529 case URX_BACKSLASH_R: |
| 3530 case URX_BACKSLASH_V: |
3361 case URX_ONECHAR_I: | 3531 case URX_ONECHAR_I: |
3362 case URX_DOTANY_ALL: | 3532 case URX_DOTANY_ALL: |
3363 case URX_DOTANY: | 3533 case URX_DOTANY: |
3364 case URX_DOTANY_UNIX: | 3534 case URX_DOTANY_UNIX: |
3365 currentLen = safeIncrement(currentLen, 2); | 3535 currentLen = safeIncrement(currentLen, 2); |
3366 break; | 3536 break; |
3367 | 3537 |
3368 // Single literal character. Increase current max length by one or
two, | 3538 // Single literal character. Increase current max length by one or
two, |
3369 // depending on whether the char is in the supplementary range
. | 3539 // depending on whether the char is in the supplementary range
. |
3370 case URX_ONECHAR: | 3540 case URX_ONECHAR: |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3472 } | 3642 } |
3473 | 3643 |
3474 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat
->elementAti(loc+3)); | 3644 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat
->elementAti(loc+3)); |
3475 if (maxLoopCount == -1) { | 3645 if (maxLoopCount == -1) { |
3476 // Unbounded Loop. No upper bound on match length. | 3646 // Unbounded Loop. No upper bound on match length. |
3477 currentLen = INT32_MAX; | 3647 currentLen = INT32_MAX; |
3478 break; | 3648 break; |
3479 } | 3649 } |
3480 | 3650 |
3481 U_ASSERT(loopEndLoc >= loc+4); | 3651 U_ASSERT(loopEndLoc >= loc+4); |
3482 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. | 3652 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recu
rsive call. |
3483 if (blockLen == INT32_MAX) { | 3653 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCou
nt; |
3484 currentLen = blockLen; | 3654 if (updatedLen >= INT32_MAX) { |
| 3655 currentLen = INT32_MAX; |
3485 break; | 3656 break; |
3486 } | 3657 } |
3487 currentLen += blockLen * maxLoopCount; | 3658 currentLen = (int32_t)updatedLen; |
3488 loc = loopEndLoc; | 3659 loc = loopEndLoc; |
3489 break; | 3660 break; |
3490 } | 3661 } |
3491 | 3662 |
3492 case URX_CTR_LOOP: | 3663 case URX_CTR_LOOP: |
3493 case URX_CTR_LOOP_NG: | 3664 case URX_CTR_LOOP_NG: |
3494 // These opcodes will be skipped over by code for URX_CTR_INIT. | 3665 // These opcodes will be skipped over by code for URX_CTR_INIT. |
3495 // We shouldn't encounter them here. | 3666 // We shouldn't encounter them here. |
3496 U_ASSERT(FALSE); | 3667 U_ASSERT(FALSE); |
3497 break; | 3668 break; |
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3678 case URX_LB_START: | 3849 case URX_LB_START: |
3679 case URX_LB_CONT: | 3850 case URX_LB_CONT: |
3680 case URX_LB_END: | 3851 case URX_LB_END: |
3681 case URX_LBN_CONT: | 3852 case URX_LBN_CONT: |
3682 case URX_LBN_END: | 3853 case URX_LBN_END: |
3683 case URX_LOOP_SR_I: | 3854 case URX_LOOP_SR_I: |
3684 case URX_LOOP_DOT_I: | 3855 case URX_LOOP_DOT_I: |
3685 case URX_LOOP_C: | 3856 case URX_LOOP_C: |
3686 case URX_DOLLAR_D: | 3857 case URX_DOLLAR_D: |
3687 case URX_DOLLAR_MD: | 3858 case URX_DOLLAR_MD: |
| 3859 case URX_BACKSLASH_H: |
| 3860 case URX_BACKSLASH_R: |
| 3861 case URX_BACKSLASH_V: |
3688 // These instructions are unaltered by the relocation. | 3862 // These instructions are unaltered by the relocation. |
3689 fRXPat->fCompiledPat->setElementAt(op, dst); | 3863 fRXPat->fCompiledPat->setElementAt(op, dst); |
3690 dst++; | 3864 dst++; |
3691 break; | 3865 break; |
3692 | 3866 |
3693 default: | 3867 default: |
3694 // Some op is unaccounted for. | 3868 // Some op is unaccounted for. |
3695 U_ASSERT(FALSE); | 3869 U_ASSERT(FALSE); |
3696 error(U_REGEX_INTERNAL_ERROR); | 3870 error(U_REGEX_INTERNAL_ERROR); |
3697 } | 3871 } |
(...skipping 751 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4449 | 4623 |
4450 void RegexCompile::setPushOp(int32_t op) { | 4624 void RegexCompile::setPushOp(int32_t op) { |
4451 setEval(op); | 4625 setEval(op); |
4452 fSetOpStack.push(op, *fStatus); | 4626 fSetOpStack.push(op, *fStatus); |
4453 fSetStack.push(new UnicodeSet(), *fStatus); | 4627 fSetStack.push(new UnicodeSet(), *fStatus); |
4454 } | 4628 } |
4455 | 4629 |
4456 U_NAMESPACE_END | 4630 U_NAMESPACE_END |
4457 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4631 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
4458 | 4632 |
OLD | NEW |