Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(20)

Side by Side Diff: source/i18n/regexcmp.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regexcst.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // 1 //
2 // file: regexcmp.cpp 2 // file: regexcmp.cpp
3 // 3 //
4 // Copyright (C) 2002-2014 International Business Machines Corporation and others. 4 // Copyright (C) 2002-2015 International Business Machines Corporation and others.
5 // All Rights Reserved. 5 // All Rights Reserved.
6 // 6 //
7 // This file contains the ICU regular expression compiler, which is responsible 7 // This file contains the ICU regular expression compiler, which is responsible
8 // for processing a regular expression pattern into the compiled form that 8 // for processing a regular expression pattern into the compiled form that
9 // is used by the match finding engine. 9 // is used by the match finding engine.
10 // 10 //
11 11
12 #include "unicode/utypes.h" 12 #include "unicode/utypes.h"
13 13
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
63 fPeekChar = -1; 63 fPeekChar = -1;
64 fLineNum = 1; 64 fLineNum = 1;
65 fCharNum = 0; 65 fCharNum = 0;
66 fQuoteMode = FALSE; 66 fQuoteMode = FALSE;
67 fInBackslashQuote = FALSE; 67 fInBackslashQuote = FALSE;
68 fModeFlags = fRXPat->fFlags | 0x80000000; 68 fModeFlags = fRXPat->fFlags | 0x80000000;
69 fEOLComments = TRUE; 69 fEOLComments = TRUE;
70 70
71 fMatchOpenParen = -1; 71 fMatchOpenParen = -1;
72 fMatchCloseParen = -1; 72 fMatchCloseParen = -1;
73 fCaptureName = NULL;
73 74
74 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
75 status = rxp->fDeferredStatus; 76 status = rxp->fDeferredStatus;
76 } 77 }
77 } 78 }
78 79
79 static const UChar chAmp = 0x26; // '&' 80 static const UChar chAmp = 0x26; // '&'
80 static const UChar chDash = 0x2d; // '-' 81 static const UChar chDash = 0x2d; // '-'
81 82
82 83
83 //------------------------------------------------------------------------------ 84 //------------------------------------------------------------------------------
84 // 85 //
85 // Destructor 86 // Destructor
86 // 87 //
87 //------------------------------------------------------------------------------ 88 //------------------------------------------------------------------------------
88 RegexCompile::~RegexCompile() { 89 RegexCompile::~RegexCompile() {
90 delete fCaptureName; // Normally will be NULL, but can exist if patt ern
91 // compilation stops with a syntax error.
89 } 92 }
90 93
91 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
92 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 95 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
93 } 96 }
94 97
95 //------------------------------------------------------------------------------ 98 //------------------------------------------------------------------------------
96 // 99 //
97 // Compile regex pattern. The state machine for regexp pattern parsing is here. 100 // Compile regex pattern. The state machine for regexp pattern parsing is here.
98 // The state tables are hand-written in the file regexcst.txt, 101 // The state tables are hand-written in the file regexcst.txt,
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
131 134
132 if (U_FAILURE(*fStatus)) { 135 if (U_FAILURE(*fStatus)) {
133 return; 136 return;
134 } 137 }
135 138
136 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 139 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
137 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 140 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
138 141
139 // Prepare the RegexPattern object to receive the compiled pattern. 142 // Prepare the RegexPattern object to receive the compiled pattern.
140 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS tatus); 143 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS tatus);
144 if (U_FAILURE(*fStatus)) {
145 return;
146 }
141 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 147 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
142 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 148 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
143 149
144 150
145 // Initialize the pattern scanning state machine 151 // Initialize the pattern scanning state machine
146 fPatternLength = utext_nativeLength(pat); 152 fPatternLength = utext_nativeLength(pat);
147 uint16_t state = 1; 153 uint16_t state = 1;
148 const RegexTableEl *tableEl; 154 const RegexTableEl *tableEl;
149 155
150 // UREGEX_LITERAL force entire pattern to be treated as a literal string. 156 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
(...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after
277 delete (UnicodeSet *)fSetStack.pop(); 283 delete (UnicodeSet *)fSetStack.pop();
278 } 284 }
279 return; 285 return;
280 } 286 }
281 287
282 // 288 //
283 // The pattern has now been read and processed, and the compiled code generated. 289 // The pattern has now been read and processed, and the compiled code generated.
284 // 290 //
285 291
286 // 292 //
287 // Compute the number of digits required for the largest capture group number.
288 //
289 fRXPat->fMaxCaptureDigits = 1;
290 int32_t n = 10;
291 int32_t groupCount = fRXPat->fGroupMap->size();
292 while (n <= groupCount) {
293 fRXPat->fMaxCaptureDigits++;
294 n *= 10;
295 }
296
297 //
298 // The pattern's fFrameSize so far has accumulated the requirements for 293 // The pattern's fFrameSize so far has accumulated the requirements for
299 // storage for capture parentheses, counters, etc. that are encountered 294 // storage for capture parentheses, counters, etc. that are encountered
300 // in the pattern. Add space for the two variables that are always 295 // in the pattern. Add space for the two variables that are always
301 // present in the saved state: the input string position (int64_t) and 296 // present in the saved state: the input string position (int64_t) and
302 // the position in the compiled pattern. 297 // the position in the compiled pattern.
303 // 298 //
304 allocateStackData(RESTACKFRAME_HDRCOUNT); 299 allocateStackData(RESTACKFRAME_HDRCOUNT);
305 300
306 // 301 //
307 // Optimization pass 1: NOPs, back-references, and case-folding 302 // Optimization pass 1: NOPs, back-references, and case-folding
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
428 423
429 // Append a NOP to the compiled pattern. This is the slot reserved 424 // Append a NOP to the compiled pattern. This is the slot reserved
430 // for a SAVE in the event that there is yet another '|' following 425 // for a SAVE in the event that there is yet another '|' following
431 // this one. 426 // this one.
432 appendOp(URX_NOP, 0); 427 appendOp(URX_NOP, 0);
433 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
434 } 429 }
435 break; 430 break;
436 431
437 432
433 case doBeginNamedCapture:
434 // Scanning (?<letter.
435 // The first letter of the name will come through again under doContinueNamedCapture.
436 fCaptureName = new UnicodeString();
437 if (fCaptureName == NULL) {
438 error(U_MEMORY_ALLOCATION_ERROR);
439 }
440 break;
441
442 case doContinueNamedCapture:
443 fCaptureName->append(fC.fChar);
444 break;
445
446 case doBadNamedCapture:
447 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
448 break;
449
438 case doOpenCaptureParen: 450 case doOpenCaptureParen:
439 // Open Paren. 451 // Open Capturing Paren, possibly named.
440 // Compile to a 452 // Compile to a
441 // - NOP, which later may be replaced by a save-state if the 453 // - NOP, which later may be replaced by a save-state if the
442 // parenthesized group gets a * quantifier, followed by 454 // parenthesized group gets a * quantifier, followed by
443 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 455 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
444 // - NOP, which may later be replaced by a save-state if there 456 // - NOP, which may later be replaced by a save-state if there
445 // is an '|' alternation within the parens. 457 // is an '|' alternation within the parens.
446 // 458 //
447 // Each capture group gets three slots in the save stack frame: 459 // Each capture group gets three slots in the save stack frame:
448 // 0: Capture Group start position (in input string being matched.) 460 // 0: Capture Group start position (in input string being matched.)
449 // 1: Capture Group end position. 461 // 1: Capture Group end position.
(...skipping 14 matching lines...) Expand all
464 // of the two NOPs. Depending on what follows in the pattern, the 476 // of the two NOPs. Depending on what follows in the pattern, the
465 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 477 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
466 // address of the end of the parenthesized group. 478 // address of the end of the parenthesized group.
467 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 479 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
468 fParenStack.push(capturing, *fStatus); // Fra me type. 480 fParenStack.push(capturing, *fStatus); // Fra me type.
469 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 481 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
470 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 482 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
471 483
472 // Save the mapping from group number to stack frame variable position. 484 // Save the mapping from group number to stack frame variable position.
473 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 485 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
486
487 // If this is a named capture group, add the name->group number mapping.
488 if (fCaptureName != NULL) {
489 int32_t groupNumber = fRXPat->fGroupMap->size();
490 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, f CaptureName, groupNumber, fStatus);
491 fCaptureName = NULL; // hash table takes ownership of the nam e (key) string.
492 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
493 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
494 }
495 }
474 } 496 }
475 break; 497 break;
476 498
477 case doOpenNonCaptureParen: 499 case doOpenNonCaptureParen:
478 // Open non-capturing (grouping only) Paren. 500 // Open non-capturing (grouping only) Paren.
479 // Compile to a 501 // Compile to a
480 // - NOP, which later may be replaced by a save-state if the 502 // - NOP, which later may be replaced by a save-state if the
481 // parenthesized group gets a * quantifier, followed by 503 // parenthesized group gets a * quantifier, followed by
482 // - NOP, which may later be replaced by a save-state if there 504 // - NOP, which may later be replaced by a save-state if there
483 // is an '|' alternation within the parens. 505 // is an '|' alternation within the parens.
484 { 506 {
485 fixLiterals(); 507 fixLiterals();
(...skipping 485 matching lines...) Expand 10 before | Expand all | Expand 10 after
971 // are scanned. 993 // are scanned.
972 fIntervalLow = 0; 994 fIntervalLow = 0;
973 fIntervalUpper = -1; 995 fIntervalUpper = -1;
974 break; 996 break;
975 997
976 case doIntevalLowerDigit: 998 case doIntevalLowerDigit:
977 // Scanned a digit from the lower value of an {lower,upper} interval 999 // Scanned a digit from the lower value of an {lower,upper} interval
978 { 1000 {
979 int32_t digitValue = u_charDigitValue(fC.fChar); 1001 int32_t digitValue = u_charDigitValue(fC.fChar);
980 U_ASSERT(digitValue >= 0); 1002 U_ASSERT(digitValue >= 0);
981 fIntervalLow = fIntervalLow*10 + digitValue; 1003 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
982 if (fIntervalLow < 0) { 1004 if (val > INT32_MAX) {
983 error(U_REGEX_NUMBER_TOO_BIG); 1005 error(U_REGEX_NUMBER_TOO_BIG);
1006 } else {
1007 fIntervalLow = (int32_t)val;
984 } 1008 }
985 } 1009 }
986 break; 1010 break;
987 1011
988 case doIntervalUpperDigit: 1012 case doIntervalUpperDigit:
989 // Scanned a digit from the upper value of an {lower,upper} interval 1013 // Scanned a digit from the upper value of an {lower,upper} interval
990 { 1014 {
991 if (fIntervalUpper < 0) { 1015 if (fIntervalUpper < 0) {
992 fIntervalUpper = 0; 1016 fIntervalUpper = 0;
993 } 1017 }
994 int32_t digitValue = u_charDigitValue(fC.fChar); 1018 int32_t digitValue = u_charDigitValue(fC.fChar);
995 U_ASSERT(digitValue >= 0); 1019 U_ASSERT(digitValue >= 0);
996 fIntervalUpper = fIntervalUpper*10 + digitValue; 1020 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
997 if (fIntervalUpper < 0) { 1021 if (val > INT32_MAX) {
998 error(U_REGEX_NUMBER_TOO_BIG); 1022 error(U_REGEX_NUMBER_TOO_BIG);
1023 } else {
1024 fIntervalUpper = (int32_t)val;
999 } 1025 }
1000 } 1026 }
1001 break; 1027 break;
1002 1028
1003 case doIntervalSame: 1029 case doIntervalSame:
1004 // Scanned a single value interval like {27}. Upper = Lower. 1030 // Scanned a single value interval like {27}. Upper = Lower.
1005 fIntervalUpper = fIntervalLow; 1031 fIntervalUpper = fIntervalLow;
1006 break; 1032 break;
1007 1033
1008 case doInterval: 1034 case doInterval:
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
1155 case doBackslashd: 1181 case doBackslashd:
1156 fixLiterals(FALSE); 1182 fixLiterals(FALSE);
1157 appendOp(URX_BACKSLASH_D, 0); 1183 appendOp(URX_BACKSLASH_D, 0);
1158 break; 1184 break;
1159 1185
1160 case doBackslashG: 1186 case doBackslashG:
1161 fixLiterals(FALSE); 1187 fixLiterals(FALSE);
1162 appendOp(URX_BACKSLASH_G, 0); 1188 appendOp(URX_BACKSLASH_G, 0);
1163 break; 1189 break;
1164 1190
1191 case doBackslashH:
1192 fixLiterals(FALSE);
1193 appendOp(URX_BACKSLASH_H, 1);
1194 break;
1195
1196 case doBackslashh:
1197 fixLiterals(FALSE);
1198 appendOp(URX_BACKSLASH_H, 0);
1199 break;
1200
1201 case doBackslashR:
1202 fixLiterals(FALSE);
1203 appendOp(URX_BACKSLASH_R, 0);
1204 break;
1205
1165 case doBackslashS: 1206 case doBackslashS:
1166 fixLiterals(FALSE); 1207 fixLiterals(FALSE);
1167 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); 1208 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1168 break; 1209 break;
1169 1210
1170 case doBackslashs: 1211 case doBackslashs:
1171 fixLiterals(FALSE); 1212 fixLiterals(FALSE);
1172 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); 1213 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1173 break; 1214 break;
1174 1215
1216 case doBackslashV:
1217 fixLiterals(FALSE);
1218 appendOp(URX_BACKSLASH_V, 1);
1219 break;
1220
1221 case doBackslashv:
1222 fixLiterals(FALSE);
1223 appendOp(URX_BACKSLASH_V, 0);
1224 break;
1225
1175 case doBackslashW: 1226 case doBackslashW:
1176 fixLiterals(FALSE); 1227 fixLiterals(FALSE);
1177 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); 1228 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1178 break; 1229 break;
1179 1230
1180 case doBackslashw: 1231 case doBackslashw:
1181 fixLiterals(FALSE); 1232 fixLiterals(FALSE);
1182 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); 1233 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1183 break; 1234 break;
1184 1235
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
1256 // and shouldn't enter this code path at all. 1307 // and shouldn't enter this code path at all.
1257 fixLiterals(FALSE); 1308 fixLiterals(FALSE);
1258 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1309 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1259 appendOp(URX_BACKREF_I, groupNum); 1310 appendOp(URX_BACKREF_I, groupNum);
1260 } else { 1311 } else {
1261 appendOp(URX_BACKREF, groupNum); 1312 appendOp(URX_BACKREF, groupNum);
1262 } 1313 }
1263 } 1314 }
1264 break; 1315 break;
1265 1316
1317 case doBeginNamedBackRef:
1318 U_ASSERT(fCaptureName == NULL);
1319 fCaptureName = new UnicodeString;
1320 if (fCaptureName == NULL) {
1321 error(U_MEMORY_ALLOCATION_ERROR);
1322 }
1323 break;
1324
1325 case doContinueNamedBackRef:
1326 fCaptureName->append(fC.fChar);
1327 break;
1266 1328
1329 case doCompleteNamedBackRef:
1330 {
1331 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) ;
1332 if (groupNumber == 0) {
1333 // Group name has not been defined.
1334 // Could be a forward reference. If we choose to support them at some
1335 // future time, extra mechanism will be required at this point.
1336 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1337 } else {
1338 // Given the number, handle identically to a \n numbered back reference.
1339 // See comments above, under doBackRef
1340 fixLiterals(FALSE);
1341 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1342 appendOp(URX_BACKREF_I, groupNumber);
1343 } else {
1344 appendOp(URX_BACKREF, groupNumber);
1345 }
1346 }
1347 delete fCaptureName;
1348 fCaptureName = NULL;
1349 break;
1350 }
1351
1267 case doPossessivePlus: 1352 case doPossessivePlus:
1268 // Possessive ++ quantifier. 1353 // Possessive ++ quantifier.
1269 // Compiles to 1354 // Compiles to
1270 // 1. STO_SP 1355 // 1. STO_SP
1271 // 2. body of stuff being iterated over 1356 // 2. body of stuff being iterated over
1272 // 3. STATE_SAVE 5 1357 // 3. STATE_SAVE 5
1273 // 4. JMP 2 1358 // 4. JMP 2
1274 // 5. LD_SP 1359 // 5. LD_SP
1275 // 6. ... 1360 // 6. ...
1276 // 1361 //
(...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after
1481 { 1566 {
1482 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1483 UnicodeSet digits; 1568 UnicodeSet digits;
1484 // TODO - make a static set, ticket 6058. 1569 // TODO - make a static set, ticket 6058.
1485 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA SK, *fStatus); 1570 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA SK, *fStatus);
1486 digits.complement(); 1571 digits.complement();
1487 set->addAll(digits); 1572 set->addAll(digits);
1488 break; 1573 break;
1489 } 1574 }
1490 1575
1576 case doSetBackslash_h:
1577 {
1578 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579 UnicodeSet h;
1580 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, * fStatus);
1581 h.add((UChar32)9); // Tab
1582 set->addAll(h);
1583 break;
1584 }
1585
1586 case doSetBackslash_H:
1587 {
1588 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1589 UnicodeSet h;
1590 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, * fStatus);
1591 h.add((UChar32)9); // Tab
1592 h.complement();
1593 set->addAll(h);
1594 break;
1595 }
1596
1597 case doSetBackslash_v:
1598 {
1599 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1600 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1601 set->add((UChar32)0x85);
1602 set->add((UChar32)0x2028, (UChar32)0x2029);
1603 break;
1604 }
1605
1606 case doSetBackslash_V:
1607 {
1608 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1609 UnicodeSet v;
1610 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1611 v.add((UChar32)0x85);
1612 v.add((UChar32)0x2028, (UChar32)0x2029);
1613 v.complement();
1614 set->addAll(v);
1615 break;
1616 }
1617
1491 case doSetBackslash_w: 1618 case doSetBackslash_w:
1492 { 1619 {
1493 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1620 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1494 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET] ); 1621 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET] );
1495 break; 1622 break;
1496 } 1623 }
1497 1624
1498 case doSetBackslash_W: 1625 case doSetBackslash_W:
1499 { 1626 {
1500 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1627 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
(...skipping 1181 matching lines...) Expand 10 before | Expand all | Expand 10 after
2682 s.complement(); 2809 s.complement();
2683 } 2810 }
2684 fRXPat->fInitialChars->addAll(s); 2811 fRXPat->fInitialChars->addAll(s);
2685 numInitialStrings += 2; 2812 numInitialStrings += 2;
2686 } 2813 }
2687 currentLen++; 2814 currentLen++;
2688 atStart = FALSE; 2815 atStart = FALSE;
2689 break; 2816 break;
2690 2817
2691 2818
2819 case URX_BACKSLASH_H:
2820 // Horiz white space
2821 if (currentLen == 0) {
2822 UnicodeSet s;
2823 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MAS K, *fStatus);
2824 s.add((UChar32)9); // Tab
2825 if (URX_VAL(op) != 0) {
2826 s.complement();
2827 }
2828 fRXPat->fInitialChars->addAll(s);
2829 numInitialStrings += 2;
2830 }
2831 currentLen++;
2832 atStart = FALSE;
2833 break;
2834
2835
2836 case URX_BACKSLASH_R: // Any line ending sequence
2837 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2838 if (currentLen == 0) {
2839 UnicodeSet s;
2840 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2841 s.add((UChar32)0x85);
2842 s.add((UChar32)0x2028, (UChar32)0x2029);
2843 if (URX_VAL(op) != 0) {
2844 // Complement option applies to URX_BACKSLASH_V only.
2845 s.complement();
2846 }
2847 fRXPat->fInitialChars->addAll(s);
2848 numInitialStrings += 2;
2849 }
2850 currentLen++;
2851 atStart = FALSE;
2852 break;
2853
2854
2855
2692 case URX_ONECHAR_I: 2856 case URX_ONECHAR_I:
2693 // Case Insensitive Single Character. 2857 // Case Insensitive Single Character.
2694 if (currentLen == 0) { 2858 if (currentLen == 0) {
2695 UChar32 c = URX_VAL(op); 2859 UChar32 c = URX_VAL(op);
2696 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2860 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2697 UnicodeSet starters(c, c); 2861 UnicodeSet starters(c, c);
2698 starters.closeOver(USET_CASE_INSENSITIVE); 2862 starters.closeOver(USET_CASE_INSENSITIVE);
2699 // findCaseInsensitiveStarters(c, &starters); 2863 // findCaseInsensitiveStarters(c, &starters);
2700 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings. 2864 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2701 // The expanded folding can't match the pattern. 2865 // The expanded folding can't match the pattern.
(...skipping 368 matching lines...) Expand 10 before | Expand all | Expand 10 after
3070 break; 3234 break;
3071 3235
3072 3236
3073 // Ops that match a minimum of one character (one or two 16 bit code units.) 3237 // Ops that match a minimum of one character (one or two 16 bit code units.)
3074 // 3238 //
3075 case URX_ONECHAR: 3239 case URX_ONECHAR:
3076 case URX_STATIC_SETREF: 3240 case URX_STATIC_SETREF:
3077 case URX_STAT_SETREF_N: 3241 case URX_STAT_SETREF_N:
3078 case URX_SETREF: 3242 case URX_SETREF:
3079 case URX_BACKSLASH_D: 3243 case URX_BACKSLASH_D:
3244 case URX_BACKSLASH_H:
3245 case URX_BACKSLASH_R:
3246 case URX_BACKSLASH_V:
3080 case URX_ONECHAR_I: 3247 case URX_ONECHAR_I:
3081 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde d. 3248 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde d.
3082 case URX_DOTANY_ALL: // . matches one or two. 3249 case URX_DOTANY_ALL: // . matches one or two.
3083 case URX_DOTANY: 3250 case URX_DOTANY:
3084 case URX_DOTANY_UNIX: 3251 case URX_DOTANY_UNIX:
3085 currentLen++; 3252 currentLen++;
3086 break; 3253 break;
3087 3254
3088 3255
3089 case URX_JMPX: 3256 case URX_JMPX:
(...skipping 261 matching lines...) Expand 10 before | Expand all | Expand 10 after
3351 currentLen = INT32_MAX; 3518 currentLen = INT32_MAX;
3352 break; 3519 break;
3353 3520
3354 3521
3355 // Ops that match a max of one character (possibly two 16 bit code units.) 3522 // Ops that match a max of one character (possibly two 16 bit code units.)
3356 // 3523 //
3357 case URX_STATIC_SETREF: 3524 case URX_STATIC_SETREF:
3358 case URX_STAT_SETREF_N: 3525 case URX_STAT_SETREF_N:
3359 case URX_SETREF: 3526 case URX_SETREF:
3360 case URX_BACKSLASH_D: 3527 case URX_BACKSLASH_D:
3528 case URX_BACKSLASH_H:
3529 case URX_BACKSLASH_R:
3530 case URX_BACKSLASH_V:
3361 case URX_ONECHAR_I: 3531 case URX_ONECHAR_I:
3362 case URX_DOTANY_ALL: 3532 case URX_DOTANY_ALL:
3363 case URX_DOTANY: 3533 case URX_DOTANY:
3364 case URX_DOTANY_UNIX: 3534 case URX_DOTANY_UNIX:
3365 currentLen = safeIncrement(currentLen, 2); 3535 currentLen = safeIncrement(currentLen, 2);
3366 break; 3536 break;
3367 3537
3368 // Single literal character. Increase current max length by one or two, 3538 // Single literal character. Increase current max length by one or two,
3369 // depending on whether the char is in the supplementary range. 3539 // depending on whether the char is in the supplementary range.
3370 case URX_ONECHAR: 3540 case URX_ONECHAR:
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
3472 } 3642 }
3473 3643
3474 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat ->elementAti(loc+3)); 3644 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat ->elementAti(loc+3));
3475 if (maxLoopCount == -1) { 3645 if (maxLoopCount == -1) {
3476 // Unbounded Loop. No upper bound on match length. 3646 // Unbounded Loop. No upper bound on match length.
3477 currentLen = INT32_MAX; 3647 currentLen = INT32_MAX;
3478 break; 3648 break;
3479 } 3649 }
3480 3650
3481 U_ASSERT(loopEndLoc >= loc+4); 3651 U_ASSERT(loopEndLoc >= loc+4);
3482 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec ursive call. 3652 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recu rsive call.
3483 if (blockLen == INT32_MAX) { 3653 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCou nt;
3484 currentLen = blockLen; 3654 if (updatedLen >= INT32_MAX) {
3655 currentLen = INT32_MAX;
3485 break; 3656 break;
3486 } 3657 }
3487 currentLen += blockLen * maxLoopCount; 3658 currentLen = (int32_t)updatedLen;
3488 loc = loopEndLoc; 3659 loc = loopEndLoc;
3489 break; 3660 break;
3490 } 3661 }
3491 3662
3492 case URX_CTR_LOOP: 3663 case URX_CTR_LOOP:
3493 case URX_CTR_LOOP_NG: 3664 case URX_CTR_LOOP_NG:
3494 // These opcodes will be skipped over by code for URX_CTR_INIT. 3665 // These opcodes will be skipped over by code for URX_CTR_INIT.
3495 // We shouldn't encounter them here. 3666 // We shouldn't encounter them here.
3496 U_ASSERT(FALSE); 3667 U_ASSERT(FALSE);
3497 break; 3668 break;
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after
3678 case URX_LB_START: 3849 case URX_LB_START:
3679 case URX_LB_CONT: 3850 case URX_LB_CONT:
3680 case URX_LB_END: 3851 case URX_LB_END:
3681 case URX_LBN_CONT: 3852 case URX_LBN_CONT:
3682 case URX_LBN_END: 3853 case URX_LBN_END:
3683 case URX_LOOP_SR_I: 3854 case URX_LOOP_SR_I:
3684 case URX_LOOP_DOT_I: 3855 case URX_LOOP_DOT_I:
3685 case URX_LOOP_C: 3856 case URX_LOOP_C:
3686 case URX_DOLLAR_D: 3857 case URX_DOLLAR_D:
3687 case URX_DOLLAR_MD: 3858 case URX_DOLLAR_MD:
3859 case URX_BACKSLASH_H:
3860 case URX_BACKSLASH_R:
3861 case URX_BACKSLASH_V:
3688 // These instructions are unaltered by the relocation. 3862 // These instructions are unaltered by the relocation.
3689 fRXPat->fCompiledPat->setElementAt(op, dst); 3863 fRXPat->fCompiledPat->setElementAt(op, dst);
3690 dst++; 3864 dst++;
3691 break; 3865 break;
3692 3866
3693 default: 3867 default:
3694 // Some op is unaccounted for. 3868 // Some op is unaccounted for.
3695 U_ASSERT(FALSE); 3869 U_ASSERT(FALSE);
3696 error(U_REGEX_INTERNAL_ERROR); 3870 error(U_REGEX_INTERNAL_ERROR);
3697 } 3871 }
(...skipping 751 matching lines...) Expand 10 before | Expand all | Expand 10 after
4449 4623
4450 void RegexCompile::setPushOp(int32_t op) { 4624 void RegexCompile::setPushOp(int32_t op) {
4451 setEval(op); 4625 setEval(op);
4452 fSetOpStack.push(op, *fStatus); 4626 fSetOpStack.push(op, *fStatus);
4453 fSetStack.push(new UnicodeSet(), *fStatus); 4627 fSetStack.push(new UnicodeSet(), *fStatus);
4454 } 4628 }
4455 4629
4456 U_NAMESPACE_END 4630 U_NAMESPACE_END
4457 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4631 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4458 4632
OLDNEW
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regexcst.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698